def k_means(k=8, runs=50):
    """Search for the k-means clustering that scores best under ``label_counter``.

    The routine proceeds in four stages:

    1. Fit the baseline classifier returned by ``svm_pipeline()`` on the
       training split and log the score of the whole corpus treated as a
       single cluster.
    2. Load the previously saved clustering (one integer label per line in
       ``Data/logs/cluster.txt``, paired line by line with
       ``Data/Corpora/batches/tokenized.tsv``), score it, and log for every
       saved cluster how often the baseline classifier predicts each class.
    3. Fit ``k_means_pipeline(k)`` on the corpus ``runs`` times and remember
       the label assignment of the run whose score exceeds the best score
       seen so far (starting from the score of the saved clustering).
    4. Write the winning labels back to ``Data/logs/cluster.txt``.  When no
       run beats the saved clustering, the file is left untouched instead of
       being truncated to an empty file.

    ``label_counter`` is defined elsewhere; it is used here as a numeric
    score where larger means more consistent (runs are compared with ``>``).

    :param k: number of clusters passed to ``k_means_pipeline``.
    :param runs: how many times the clustering pipeline is re-fitted.
    """
    data = get_corpus(20000)

    # Baseline: a supervised classifier fitted on the labelled training split.
    baseline_clf = svm_pipeline()
    train, dev, test = get_train_data()
    baseline_clf.fit(train[0], train[1])
    max_consistency = label_counter([(0, data)], baseline_clf)
    logging.info('baseline consistency: %f', max_consistency)

    # Group the tokenized tweets by the cluster label stored by an earlier
    # run.  The files are opened with a valid mode ('r', not 'r,') and are
    # closed as soon as they have been consumed.
    tweets_path = root+'Data/Corpora/batches/tokenized.tsv'
    labels_path = root+'Data/logs/cluster.txt'
    clf = {}
    with codecs.open(tweets_path, 'r', 'utf-8') as tweets, \
            codecs.open(labels_path, 'r', 'utf-8') as labels:
        for tweet, label in zip(tweets, labels):
            clf.setdefault(int(label.strip()), []).append(tweet.strip())

    # The saved clustering becomes the score that new runs have to beat.
    max_consistency = label_counter(clf.items(), baseline_clf)
    logging.info('max consistency: %f', max_consistency)

    # Per saved cluster: frequency of each class predicted by the baseline.
    for label, members in clf.items():
        logging.info(label)
        frq = {}
        for tw in members:
            try:
                est = baseline_clf.predict([tw])[0]
            except ValueError:
                logging.warning('could not parse tweet %s', tw)
                continue
            frq[est] = frq.get(est, 0) + 1
        logging.info(frq)

    # None marks "no run has improved on the saved clustering yet".
    max_clusters = None
    km = k_means_pipeline(k)
    for run in range(runs):
        logging.debug('run number %i', run)
        km.fit(data)
        assignment = km.named_steps['k_means'].labels_
        clusters = {idx: [] for idx in range(k)}
        for entry, cluster in zip(data, assignment):
            clusters[cluster].append(entry)
        local_consistency = label_counter(clusters.items(), baseline_clf)
        if local_consistency > max_consistency:
            max_consistency = local_consistency
            max_clusters = assignment
            logging.debug('max consistency updated to %f', max_consistency)
    logging.info('most consistent score: %f', max_consistency)

    if max_clusters is None:
        # Overwriting here would replace the best known clustering with an
        # empty file, so keep what is on disk.
        logging.info('no run improved on the saved clustering; '
                     'cluster file left unchanged')
        return

    with open(labels_path, 'w') as log:
        for line in max_clusters:
            log.write('%s\n' % line)
def main():
    """Load the labelled splits, build the model and run every token variant.

    The train/dev/test splits are read from the ``*_alternative.tsv`` files
    under ``Data/twitterData`` for the combined ``POS | NEU | NEG`` classes.
    ``run`` is then called once per token (``'km'``, ``'af'``, ``'cl'``) with
    the same model, the training and development splits, ``mode=['filter']``
    and ``retrain=5``.  The test split is loaded but not used here.
    """
    # labelled data: each split is indexed as [0] and [1] below
    train_split, dev_split, _test_split = get_final_semeval_data(
        POS | NEU | NEG,
        root+'Data/twitterData/train_alternative.tsv',
        root+'Data/twitterData/dev_alternative.tsv',
        root+'Data/twitterData/test_alternative.tsv',
    )

    # model shared by all runs
    classifier = svm_pipeline()

    train_first, train_second = train_split[0], train_split[1]
    dev_first, dev_second = dev_split[0], dev_split[1]

    # main routine: one pass per token; a fresh mode list is built each time
    for token_name in ('km', 'af', 'cl'):
        run(
            classifier,
            train_first,
            train_second,
            dev_first,
            dev_second,
            mode=['filter'],
            retrain=5,
            token=token_name,
        )