Example #1
0
def k_means(k=8, n_runs=50):
    """Cluster the tweet corpus with k-means, keeping the most consistent run.

    Fits a k-means pipeline ``n_runs`` times on the corpus, scores every run's
    clusters against an SVM baseline classifier via ``label_counter``, and
    writes the cluster assignment of the best run to the cluster log file.

    Args:
        k: number of clusters (default 8, the original hard-coded value).
        n_runs: number of random k-means restarts (default 50, as before).
    """
    data = get_corpus(20000)

    # Train the SVM baseline used to score cluster consistency.
    baseline_clf = svm_pipeline()
    train, dev, test = get_train_data()
    baseline_clf.fit(train[0], train[1])

    # Consistency of the whole corpus as one cluster — the score to beat.
    max_consistency = label_counter([(0, data)], baseline_clf)
    logging.info('baseline consistency: %f', max_consistency)

    # Pair each tokenized tweet with its previously logged cluster label.
    # Bug fix: the original passed the invalid mode 'r,' to codecs.open
    # (codecs appends 'b', yielding 'r,b' -> ValueError); 'r' is intended.
    # Context managers also ensure both files are closed.
    with codecs.open(root+'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8') as tweets, \
            codecs.open(root+'Data/logs/cluster.txt', 'r', 'utf-8') as labels:
        tmp = [
            (tweet.strip(), int(label.strip()))
            for tweet, label in zip(tweets, labels)
        ]

    # Group tweets by their logged cluster label.
    clf = {}
    for tweet, label in tmp:
        clf.setdefault(label, []).append(tweet)
    max_consistency = label_counter(clf.items(), baseline_clf)
    logging.info('max consistency: %f', max_consistency)

    # Log the baseline's prediction distribution within each logged cluster.
    for label in clf.keys():
        logging.info(label)
        frq = {}
        for tw in clf[label]:
            try:
                est = baseline_clf.predict([tw])[0]
                frq[est] = frq.get(est, 0) + 1
            except ValueError:
                # predict can reject unparseable tweets; log and move on.
                # (logging.warn is a deprecated alias of logging.warning.)
                logging.warning('could not parse tweet %s', tw)
        logging.info(frq)

    # Repeatedly re-fit k-means, remembering the most consistent labelling.
    max_clusters = ()
    km = k_means_pipeline(k)
    for run in range(n_runs):
        logging.debug('run number %i', run)
        km.fit(data)

        clusters = {idx: [] for idx in range(k)}
        for entry, cluster in zip(data, km.named_steps['k_means'].labels_):
            clusters[cluster].append(entry)
        local_consistency = label_counter(clusters.items(), baseline_clf)

        if local_consistency > max_consistency:
            max_consistency = local_consistency
            max_clusters = km.named_steps['k_means'].labels_
            logging.debug('max consistency updated to %f', max_consistency)

    logging.info('most consistent score: %f', max_consistency)
    # Persist the best assignment, one cluster id per line.
    with open(root+'Data/logs/cluster.txt', 'w') as log:
        for line in max_clusters:
            log.write('%s\n' % line)
Example #2
0
def main():
    """Fit an SVM pipeline on the alternative SemEval splits, then invoke
    the filter routine once per clustering token ('km', 'af', 'cl')."""
    # Labelled data covering all three sentiment classes.
    classes = POS | NEU | NEG
    train_loc = root+'Data/twitterData/train_alternative.tsv'
    dev_loc = root+'Data/twitterData/dev_alternative.tsv'
    test_loc = root+'Data/twitterData/test_alternative.tsv'
    train, dev, _test = get_final_semeval_data(classes, train_loc, dev_loc, test_loc)

    # A single shared SVM model is reused across all three runs.
    clf = svm_pipeline()
    train_x, train_y = train[0], train[1]
    dev_x, dev_y = dev[0], dev[1]

    # Main routine: one pass per clustering token.
    for cluster_token in ('km', 'af', 'cl'):
        run(clf, train_x, train_y, dev_x, dev_y,
            mode=['filter'], retrain=5, token=cluster_token)