Example 1
import codecs
import logging

def k_means():
    k = 8
    data = get_corpus(20000)           # unlabelled tweets to cluster (module helper)
    baseline_clf = svm_pipeline()      # supervised SVM baseline (module helper)
    train, dev, test = get_train_data()
    baseline_clf.fit(train[0], train[1])

    max_consistency = label_counter([(0, data)], baseline_clf)
    logging.info('baseline consistency: %f' % max_consistency)

    # load tweets and the cluster labels written by a previous run
    tmp = [
        (tweet.strip(), int(label.strip())) for tweet, label in zip(
            codecs.open(root+'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8'),
            codecs.open(root+'Data/logs/cluster.txt', 'r', 'utf-8'))
    ]
    # group tweets by the cluster label read from the log
    clusters_from_log = {}
    for tweet, label in tmp:
        clusters_from_log.setdefault(label, []).append(tweet)
    max_consistency = label_counter(clusters_from_log.items(), baseline_clf)
    logging.info('max consistency: %f' % max_consistency)

    # per cluster, count how the baseline classifier distributes its predictions
    for label in clusters_from_log.keys():
        logging.info(label)
        frq = {}
        for tw in clusters_from_log[label]:
            try:
                est = baseline_clf.predict([tw])[0]
                frq[est] = frq.get(est, 0) + 1
            except ValueError:
                logging.warning('could not parse tweet %s' % tw)
        logging.info(frq)

    # restart k-means several times and keep the labelling that agrees
    # best with the supervised baseline
    max_clusters = ()
    km = k_means_pipeline(k)
    for run in range(50):
        logging.debug('run number %i' % run)
        km.fit(data)

        # group the corpus by the cluster index assigned in this run
        clusters = {idx: [] for idx in range(k)}
        for entry, cluster in zip(data, km.named_steps['k_means'].labels_):
            clusters[cluster].append(entry)
        local_consistency = label_counter(clusters.items(), baseline_clf)

        if local_consistency > max_consistency:
            max_consistency = local_consistency
            max_clusters = km.named_steps['k_means'].labels_
            logging.debug('max consistency updated to %f' % max_consistency)

    logging.info('most consistent score: %f' % max_consistency)
    # persist the best cluster assignment for later runs
    with open(root+'Data/logs/cluster.txt', 'w') as log:
        for line in max_clusters:
            log.write('%s\n' % line)
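
Both listings depend on module helpers this excerpt does not show (get_corpus, svm_pipeline, get_train_data, label_counter, k_means_pipeline, and the root path). Below is a minimal sketch of the two most load-bearing helpers, inferred from how the code above calls them: a scikit-learn Pipeline whose final step is named 'k_means', and a consistency score over (label, tweets) pairs. The bodies are assumptions, not the project's actual implementation.

from collections import Counter

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def k_means_pipeline(k):
    # hypothetical: text vectorizer followed by a step named 'k_means',
    # matching the km.named_steps['k_means'] access above
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('k_means', KMeans(n_clusters=k)),
    ])

def label_counter(clusters, clf):
    # hypothetical: consistency as the size-weighted share of the most
    # frequent baseline prediction inside each cluster
    total, agreeing = 0, 0
    for _, tweets in clusters:
        if not tweets:
            continue
        predictions = clf.predict(list(tweets))
        total += len(predictions)
        agreeing += Counter(predictions).most_common(1)[0][1]
    return float(agreeing) / total if total else 0.0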
Example 2
    def __init__(self, clusters_loc, sentiments_loc):
        """
        A mutator using automatically created word clusters as basis for
        re-weighting. Filter parameters are not needed, as class membership
        is binary.
        :param clusters_loc: location of the tweet-cluster association data
        :param sentiments_loc: location of the clusters-label association data
        """
        super(AutoCluster, self).__init__()

        # init the k-means clustering pipeline
        self.clusters = k_means_pipeline(8)

        # load tweets and their cluster labels; codecs.open (not the built-in
        # open) is needed here to decode the files as UTF-8
        tmp = zip(*[(tweet.strip(), int(label.strip())) for tweet, label in zip(
                codecs.open(root+'Data/Corpora/batches/tokenized.tsv', 'r', 'utf-8'),
                codecs.open(clusters_loc, 'r', 'utf-8'))])
        # cap the training data at the first 10000 tweet/label pairs
        tmp[0] = tmp[0][:10000]
        tmp[1] = tmp[1][:10000]

        # load cluster sentiment and size info, one "<sentiment> <size>" line
        # per cluster
        self.cluster_sentiment = {}
        self.cluster_sizes = []
        for idx, line in enumerate(codecs.open(sentiments_loc, 'r', 'utf-8')):
            sent, size = line.strip().split(' ')
            if sent == 'positive':
                self.cluster_sentiment[idx] = POS
            elif sent == 'negative':
                self.cluster_sentiment[idx] = NEG
            elif sent == 'neutral':
                self.cluster_sentiment[idx] = NEU
            else:
                logging.warning('Invalid sentiment specified: %s' % sent)
            self.cluster_sizes.append(int(size))
        if set(self.cluster_sentiment.keys()) != set(tmp[1]):
            logging.warning('Inconsistency between the number of clusters and associated sentiments, '
                            'clusters: %s sentiments: %s' % (self.cluster_sentiment.keys(), set(tmp[1])))
        # normalise the cluster sizes to fractions of the corpus
        total_size = sum(self.cluster_sizes)
        self.cluster_sizes = [float(size) / total_size for size in self.cluster_sizes]

        # fit the pipeline on the tweets (k-means ignores the labels)
        logging.info('Starting training of the k-means classifier...')
        self.clusters.fit(*tmp)
        logging.info('Finished!')
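
A hypothetical usage sketch: the clusters file matches the one written out in Example 1, while the sentiments path and its contents are assumptions illustrating the "<sentiment> <size>" line format the parser above expects.

# hypothetical usage; 'cluster.txt' is the file Example 1 writes, while the
# sentiments path is an assumed name for illustration
mutator = AutoCluster(
    clusters_loc=root + 'Data/logs/cluster.txt',
    sentiments_loc=root + 'Data/logs/cluster_sentiments.txt',
)
# each line of the sentiments file pairs a sentiment with a cluster size:
#   positive 1200
#   negative 800
#   neutral 3400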