def k_means():
    """Cluster the corpus with k-means and keep the most label-consistent run.

    Trains a baseline SVM classifier, measures the label consistency of the
    clustering previously stored in ``Data/logs/cluster.txt``, then runs
    k-means 50 times on the corpus and writes the cluster assignment of the
    most consistent run back to the same log file.
    """
    k = 8
    data = get_corpus(20000)

    # Baseline classifier used to score cluster/label consistency.
    baseline_clf = svm_pipeline()
    train, dev, test = get_train_data()
    baseline_clf.fit(train[0], train[1])
    max_consistency = label_counter([(0, data)], baseline_clf)
    logging.info('baseline consistency: %f' % max_consistency)

    # Load the previously stored clustering. NOTE: the mode string is 'r'
    # (the original had a stray comma, 'r,'); files are closed via ``with``.
    with codecs.open(root + 'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8') as tweet_file, \
            codecs.open(root + 'Data/logs/cluster.txt', 'r', 'utf-8') as label_file:
        tmp = [(tweet.strip(), int(label.strip()))
               for tweet, label in zip(tweet_file, label_file)]

    # Group tweets by their stored cluster label.
    clf = {}
    for tweet, label in tmp:
        clf.setdefault(label, []).append(tweet)

    max_consistency = label_counter(clf.items(), baseline_clf)
    logging.info('max consistency: %f' % max_consistency)

    # Log the distribution of baseline predictions inside each cluster.
    for label in clf.keys():
        logging.info(label)
        frq = {}
        for tw in clf[label]:
            try:
                est = baseline_clf.predict([tw])[0]
                frq[est] = frq.get(est, 0) + 1
            except ValueError:
                # logging.warn is deprecated; warning() is the supported name
                logging.warning('could not parse tweet %s' % tw)
        logging.info(frq)

    # Re-cluster repeatedly, keeping the assignment with the best consistency.
    max_clusters = ()
    km = k_means_pipeline(k)
    for run in range(50):
        logging.debug('run number %i' % run)
        km.fit(data)
        clusters = {idx: [] for idx in range(k)}
        for entry, cluster in zip(data, km.named_steps['k_means'].labels_):
            clusters[cluster].append(entry)
        local_consistency = label_counter(clusters.items(), baseline_clf)
        if local_consistency > max_consistency:
            max_consistency = local_consistency
            max_clusters = km.named_steps['k_means'].labels_
            logging.debug('max consistency updated to %f' % max_consistency)

    logging.info('most consistent score: %f' % max_consistency)
    # Persist the winning assignment (empty if no run beat the stored score).
    with open(root + 'Data/logs/cluster.txt', 'w') as log:
        for line in max_clusters:
            log.write('%s\n' % line)
def __init__(self, clusters_loc, sentiments_loc):
    """
    A mutator using automatically created word clusters as basis for re-weighting.
    Filter parameters are not needed, as class membership is binary.
    :param clusters_loc: location of the tweet-cluster association data
    :param sentiments_loc: location of the clusters-label association data
    """
    super(AutoCluster, self).__init__()
    # init classifier
    self.clusters = k_means_pipeline(8)
    # load tweets; use codecs.open with mode 'r' — the original called the
    # builtin open(path, 'r,', 'utf-8'), which passes the encoding as the
    # buffering argument (a TypeError) and a typo'd mode string
    with codecs.open(root + 'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8') as tweet_file, \
            codecs.open(clusters_loc, 'r', 'utf-8') as label_file:
        pairs = [(tweet.strip(), int(label.strip()))
                 for tweet, label in zip(tweet_file, label_file)]
    # transpose to ([tweets], [labels]) and cap the training size at 10000;
    # materialize as lists so the columns stay subscriptable on Python 3,
    # where zip() returns a non-subscriptable iterator
    tmp = [list(column)[:10000] for column in zip(*pairs)]
    # load cluster class sentiment and size info
    self.cluster_sentiment = {}
    self.cluster_sizes = []
    with codecs.open(sentiments_loc, 'r', 'utf-8') as sentiment_file:
        for idx, line in enumerate(sentiment_file):
            sent, size = line.strip().split(' ')
            if sent == 'positive':
                self.cluster_sentiment[idx] = POS
            elif sent == 'negative':
                self.cluster_sentiment[idx] = NEG
            elif sent == 'neutral':
                self.cluster_sentiment[idx] = NEU
            else:
                # logging.warn is deprecated; warning() is the supported name
                logging.warning('Invalid sentiment specified: %s' % sent)
            self.cluster_sizes.append(int(size))
    # sanity check: every cluster id seen in the data should have a sentiment
    if set(self.cluster_sentiment.keys()) != set(tmp[1]):
        logging.warning('Inconsistency between amount of clusters and associated sentiments, '
                        'clusters: %s sentiments: %s' % (self.cluster_sentiment.keys(), set(tmp[1])))
    # normalise raw counts to relative cluster weights
    self.cluster_sizes = [float(size) / sum(self.cluster_sizes) for size in self.cluster_sizes]
    # train classifier
    logging.info('Start training for k-means classifier..')
    self.clusters.fit(*tmp)
    logging.info('Finished!')