Example #1
0
def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    """Evaluate an LDA model as a hard clustering of ``mm_corpus``.

    Each document is assigned the topic that carries the largest weight in
    ``lda_model[doc]``. The resulting label sequence is then compared with
    ``gold_labels`` using normalized mutual information, purity and the
    Rand index, and the three scores are printed on one line.

    Args:
        lda_model: Object that can be indexed by a document. Presumably a
            gensim ``LdaModel`` returning ``(topic_id, weight)`` pairs --
            confirm against the caller.
        mm_corpus: Iterable of documents in the form ``lda_model`` accepts.
        gold_labels: Reference labels handed unchanged to the metric
            functions (presumably one per document, in corpus order).

    Returns:
        The tuple ``(nmi_score, purity_score, ri_score)``.
    """
    sys_labels = []
    # Counting from 1 keeps ``seen`` equal to the number of labels collected.
    for seen, doc in enumerate(mm_corpus, 1):
        # A document with no weight above zero falls back to topic 0.
        best_topic, best_weight = 0, 0
        for entry in lda_model[doc]:
            if entry[1] > best_weight:
                best_topic, best_weight = entry[0], entry[1]
        sys_labels.append(best_topic)

        # Progress report every 5000 documents.
        if seen % 5000 == 0:
            print(seen)

    scores = (
        normalized_mutual_info_score(gold_labels, sys_labels),
        purity(gold_labels, sys_labels),
        rand_index(gold_labels, sys_labels),
    )
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    return scores
Example #2
0
def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    """Evaluate an LDA model as a hard clustering of ``mm_corpus``.

    Each document is assigned the topic that carries the largest weight in
    ``lda_model[doc]``. The resulting label sequence is then compared with
    ``gold_labels`` using normalized mutual information, purity and the
    Rand index, and the three scores are printed on one line.

    Args:
        lda_model: Object that can be indexed by a document. Presumably a
            gensim ``LdaModel`` returning ``(topic_id, weight)`` pairs --
            confirm against the caller.
        mm_corpus: Iterable of documents in the form ``lda_model`` accepts.
        gold_labels: Reference labels handed unchanged to the metric
            functions (presumably one per document, in corpus order).

    Returns:
        The tuple ``(nmi_score, purity_score, ri_score)``.
    """
    sys_labels = []
    # Counting from 1 keeps ``seen`` equal to the number of labels collected.
    for seen, doc in enumerate(mm_corpus, 1):
        # A document with no weight above zero falls back to topic 0.
        best_topic, best_weight = 0, 0
        for entry in lda_model[doc]:
            if entry[1] > best_weight:
                best_topic, best_weight = entry[0], entry[1]
        sys_labels.append(best_topic)

        # Progress report every 5000 documents.
        if seen % 5000 == 0:
            print(seen)

    scores = (
        normalized_mutual_info_score(gold_labels, sys_labels),
        purity(gold_labels, sys_labels),
        rand_index(gold_labels, sys_labels),
    )
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    return scores
Example #3
0
def bow_kmeans(bow_vecs, gold_labels, num_clusters):
    """Cluster ``bow_vecs`` with k-means and score the result.

    A ``KMeans`` model with ``num_clusters`` clusters is fitted on
    ``bow_vecs``. Its ``labels_`` are then compared with ``gold_labels``
    using normalized mutual information, purity and the Rand index, and the
    three scores are printed on one line.

    Args:
        bow_vecs: Data passed to ``KMeans.fit`` (presumably one
            bag-of-words vector per sample -- confirm against the caller).
        gold_labels: Reference labels handed unchanged to the metric
            functions.
        num_clusters: Number of clusters requested from ``KMeans``.

    Returns:
        The tuple ``(nmi_score, purity_score, ri_score)``.
    """
    print('performing kmeans ...')
    # NOTE(review): newer scikit-learn releases appear to have dropped the
    # ``n_jobs`` argument of KMeans -- verify before upgrading the library.
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)
    assigned = model.labels_

    scores = (
        normalized_mutual_info_score(gold_labels, assigned),
        purity(gold_labels, assigned),
        rand_index(gold_labels, assigned),
    )
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    return scores