Example #1
def k_means():
    k = 8
    data = get_corpus(20000)
    baseline_clf = svm_pipeline()
    train, dev, test = get_train_data()
    baseline_clf.fit(train[0], train[1])

    max_consistency = label_counter([(0, data)], baseline_clf)
    logging.info('baseline consistency: %f' % max_consistency)

    # pair each tokenized tweet with the cluster label written by a previous run
    tmp = [
        (tweet.strip(), int(label.strip())) for tweet, label in zip(
            codecs.open(root+'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8'),
            codecs.open(root+'Data/logs/cluster.txt', 'r', 'utf-8'))
    ]
    clf = {}
    for tweet, label in tmp:
        if label not in clf:
            clf[label] = [tweet]
        else:
            clf[label].append(tweet)
    max_consistency = label_counter(clf.items(), baseline_clf)
    logging.info('max consistency: %f' % max_consistency)

    # for each loaded cluster, log how the baseline classifier's predictions are distributed
    for label in clf.keys():
        logging.info(label)
        frq = {}
        for tw in clf[label]:
            try:
                est = baseline_clf.predict([tw])[0]
                if est in frq:
                    frq[est] += 1
                else:
                    frq[est] = 1
            except ValueError:
                logging.warning('could not parse tweet %s' % tw)
        logging.info(frq)

    # restart k-means several times and keep the clustering that is most
    # consistent with the baseline classifier's predictions
    max_clusters = ()
    km = k_means_pipeline(k)
    for run in range(50):
        logging.debug('run number %i' % run)
        km.fit(data)

        clusters = {idx: [] for idx in range(k)}
        for entry, cluster in zip(data, km.named_steps['k_means'].labels_):
            clusters[cluster].append(entry)
        local_consistency = label_counter(clusters.items(), baseline_clf)

        if local_consistency > max_consistency:
            max_consistency = local_consistency
            max_clusters = km.named_steps['k_means'].labels_
            logging.debug('max consistency updated to %f' % max_consistency)

    logging.info('most consistent score: %f' % max_consistency)
    with open(root+'Data/logs/cluster.txt', 'w') as log:
        for line in max_clusters:
            log.write('%s\n' % line)
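
Example #1 leans on a label_counter helper whose definition is not part of the excerpt. The sketch below is a minimal, hypothetical reading of it, assuming the score is the size-weighted share of each cluster's majority baseline prediction; the real helper may well differ.

from collections import Counter

def label_counter(clusters, clf):
    # Hypothetical reconstruction: clusters is an iterable of (label, tweets) pairs,
    # clf is the baseline classification pipeline used above.
    hits, total = 0, 0
    for _, tweets in clusters:
        preds = []
        for tw in tweets:
            try:
                preds.append(clf.predict([tw])[0])
            except ValueError:
                continue  # mirror the skip-on-parse-failure behaviour in k_means()
        if preds:
            hits += Counter(preds).most_common(1)[0][1]  # size of the majority prediction
            total += len(preds)
    return float(hits) / total if total else 0.0
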
Example #2
 def __init__(self, limit=5000):
     """
     Constructor for a Feeder instance
     :param limit: Cutoff for testing, lower value improves speed
     """
     self.current_idx = 0
     self.corpus = {idx: tweet for idx, tweet in enumerate(get_corpus(limit))}
     self.mutators = []
     logging.debug('Feeder instance successfully initialized with corpus size %i' % limit)
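
A short usage sketch for the constructor above; it only touches the attributes set in __init__ and assumes get_corpus(limit) yields an iterable of tweet strings, as in the other examples.

feeder = Feeder(limit=100)        # small cutoff keeps a test run fast
print(feeder.corpus[0])           # corpus maps a running index to each tweet
assert feeder.current_idx == 0    # feeding has not started yet
assert feeder.mutators == []      # no mutators registered at construction time
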
Example #3
 def __init__(self):
     # load and prepare the corpus before anything else
     self.corpus = get_corpus()
     self.corpus.initialize()
     # read the run-time configuration; the model is a placeholder until one is assigned
     self.config = get_config()
     self.model = None
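
The excerpt does not show the name of the enclosing class, so the usage sketch below uses a placeholder name (Classifier) purely for illustration and only exercises the attributes set in __init__.

obj = Classifier()           # placeholder class name; the real name is not shown above
print(obj.config)            # settings returned by get_config()
assert obj.model is None     # no model has been trained or loaded yet
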