Example #1
0
def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    """Cluster documents by dominant LDA topic and score the clustering.

    Every document in ``mm_corpus`` is assigned the topic that carries the
    largest weight in ``lda_model[doc]``. The resulting system labels are
    compared with ``gold_labels`` using NMI, purity and Rand index, and the
    three scores are printed and returned.

    Args:
        lda_model: Object that is indexed with a document and yields tuples
            whose item 0 is used as the topic id and item 1 as its weight
            (presumably a gensim ``LdaModel`` -- confirm against the caller).
        mm_corpus: Iterable of documents in the form ``lda_model`` accepts.
        gold_labels: Reference labels, handed unchanged to the scorers.

    Returns:
        Tuple ``(nmi_score, purity_score, ri_score)``.
    """
    sys_labels = []
    for doc in mm_corpus:
        # A document with no topic weight above 0 falls back to cluster 0.
        cluster_idx = 0
        max_weight = 0
        for tup in lda_model[doc]:
            if tup[1] > max_weight:
                cluster_idx = tup[0]
                max_weight = tup[1]
        sys_labels.append(cluster_idx)

        # Progress indicator for large corpora.
        if len(sys_labels) % 5000 == 0:
            print(len(sys_labels))

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = rand_index(gold_labels, sys_labels)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score,
                                                 ri_score))
    return nmi_score, purity_score, ri_score
Example #2
0
def _dominant_topic(topic_dist):
    """Return item 0 of the tuple with the largest item 1 (0 if none exceeds 0)."""
    best_topic = 0
    best_weight = 0
    for entry in topic_dist:
        # Strict comparison keeps the first of several equally weighted topics.
        if entry[1] > best_weight:
            best_topic = entry[0]
            best_weight = entry[1]
    return best_topic


def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    """Evaluate the clustering induced by each document's dominant topic.

    ``lda_model[doc]`` is queried for every document in ``mm_corpus`` and
    the highest-weighted topic becomes that document's cluster label. The
    labels are scored against ``gold_labels`` with NMI, purity and Rand
    index; the scores are printed and returned.

    Args:
        lda_model: Object indexable by a document, yielding tuples of
            ``(topic id, weight, ...)`` (presumably a gensim ``LdaModel``
            -- confirm against the caller).
        mm_corpus: Iterable of documents understood by ``lda_model``.
        gold_labels: Reference labels, passed as-is to the scorers.

    Returns:
        Tuple ``(nmi_score, purity_score, ri_score)``.
    """
    sys_labels = []
    for doc in mm_corpus:
        sys_labels.append(_dominant_topic(lda_model[doc]))
        # Report progress every 5000 documents.
        if len(sys_labels) % 5000 == 0:
            print(len(sys_labels))

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = rand_index(gold_labels, sys_labels)

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print('NMI: %f Purity: %f Rand index: %f'
          % (nmi_score, purity_score, ri_score))
    return nmi_score, purity_score, ri_score
Example #3
0
def bow_kmeans(bow_vecs, gold_labels, num_clusters):
    """Cluster ``bow_vecs`` with k-means and score the result.

    Fits ``KMeans`` with ``num_clusters`` clusters and 20 initialisations,
    then compares the fitted labels with ``gold_labels`` using NMI, purity
    and Rand index. The scores are printed and returned.

    Args:
        bow_vecs: Feature vectors handed directly to ``KMeans.fit``.
        gold_labels: Reference labels, passed unchanged to the scorers.
        num_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(nmi_score, purity_score, ri_score)``.
    """
    print('performing kmeans ...')
    # NOTE(review): if KMeans is scikit-learn's, ``n_jobs`` was removed in
    # scikit-learn 1.0 -- confirm the pinned version before upgrading.
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)
    sys_labels = model.labels_

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = rand_index(gold_labels, sys_labels)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('NMI: %f Purity: %f Rand index: %f'
          % (nmi_score, purity_score, ri_score))
    return nmi_score, purity_score, ri_score
Example #4
0
def bow_kmeans(bow_vecs, gold_labels, num_clusters):
    """Cluster ``bow_vecs`` with k-means and report NMI and purity.

    Fits ``KMeans`` with ``num_clusters`` clusters and 20 initialisations
    and scores the fitted labels against ``gold_labels``. The scores are
    printed twice: once in a labelled line and once tab-separated.

    Args:
        bow_vecs: Feature vectors handed directly to ``KMeans.fit``.
        gold_labels: Reference labels, passed unchanged to the scorers.
        num_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(nmi_score, purity_score, ri_score)``. ``ri_score`` is
        always 0 because the Rand index is not computed here.
    """
    print('performing kmeans ...')
    # NOTE(review): if KMeans is scikit-learn's, ``n_jobs`` was removed in
    # scikit-learn 1.0 -- confirm the pinned version before upgrading.
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)
    sys_labels = model.labels_

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    # The Rand index is not computed in this variant; 0 is a placeholder
    # that keeps the output columns and the return shape stable.
    ri_score = 0

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score,
                                                 ri_score))
    print('%f\t%f\t%f' % (nmi_score, purity_score, ri_score))
    return nmi_score, purity_score, ri_score
Example #5
0
def __lda_clustering(num_topics=20, min_occurrence=30,
                     datadir='e:/data/emadr/nyt-less-docs/world'):
    """Score the clustering given by the largest component of stored vectors.

    Loads the vectors from ``<datadir>/lda/test-vecs-<num_topics>-
    <min_occurrence>.bin`` and the gold labels from
    ``<datadir>/bindata/test-labels.bin``. Each vector is labelled with the
    index of its largest component, and NMI and purity against the gold
    labels are printed. Nothing is returned.

    Args:
        num_topics: First number in the vectors file name.
        min_occurrence: Second number in the vectors file name.
        datadir: Directory containing the ``bindata`` and ``lda`` folders.

    The defaults reproduce the values that were previously hard-coded, so
    calling the function without arguments behaves as before.
    """
    labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    topic_vecs_file = os.path.join(
        datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))

    topic_vecs = ioutils.load_vec_list_file(topic_vecs_file)
    gold_labels = ioutils.load_labels_file(labels_file)

    sys_labels = []
    for topic_vec in topic_vecs:
        # A vector with no component above 0 falls back to cluster 0.
        cluster_idx = 0
        max_weight = 0
        for j, v in enumerate(topic_vec):
            if v > max_weight:
                cluster_idx = j
                max_weight = v
        sys_labels.append(cluster_idx)

        # Progress indicator for large inputs.
        if len(sys_labels) % 5000 == 0:
            print(len(sys_labels))

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    # The Rand index is not computed here; 0 is a placeholder that keeps
    # the output columns stable.
    ri_score = 0

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score,
                                                 ri_score))
    print('%f\t%f\t%f' % (nmi_score, purity_score, ri_score))