コード例 #1
0
def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    """Split each LDA topic's terms into clusters and rank them by cohesion.

    For every topic returned by ``LDA.get_topics_terms`` the last element
    (the topic's words) is embedded with ``get_words_matrix`` and clustered
    with ``cluster``.  Each cluster is scored with ``utilities.cohesion``
    against its own center, and the clusters are ordered by ascending score.

    Args:
        lda_model_filename: Path handed to ``LDA.get_topics_terms``.
        word2vec_model_filename: Path handed to ``models.Word2Vec.load``.

    Returns:
        A ``(new_topics, useless)`` pair of lists:

        * ``new_topics`` has one entry per topic: the topic's leading fields
          followed by the lowest cohesion score and that cluster's members.
        * ``useless`` has one entry per remaining cluster: the topic's
          leading fields followed by the cluster label, its cohesion score
          and its members.

    Raises:
        IndexError: If ``cluster`` yields no clusters for a topic.
    """
    topics = LDA.get_topics_terms(lda_model_filename)
    word2vec = models.Word2Vec.load(word2vec_model_filename)
    kept, discarded = [], []
    for topic in topics:
        dictionary, matrix = get_words_matrix(topic[-1], word2vec)
        clusters, centers = cluster(matrix, dictionary, 2, 10)

        # Score every cluster against its own center.
        scored = []
        for label, members in clusters.items():
            _, member_matrix = get_words_matrix(members, word2vec)
            center = centers[label]
            scored.append((label, utilities.cohesion(member_matrix, center)))

        # Ascending order: the smallest cohesion value comes first.
        ranked = sorted(scored, key=lambda pair: pair[1])
        prefix = list(topic[:-1])

        best_label, best_score = ranked[0]
        kept.append(prefix + [best_score, clusters[best_label]])

        # Every other cluster is reported separately, label included.
        for label, score in ranked[1:]:
            discarded.append(prefix + [label, score, clusters[label]])
    return kept, discarded
コード例 #2
0
def hierarchical_topic_analyse_with_silhouette(corpus_filename, word2vec_model_filename, lda_filter=False, k=1):
    """Expand each topic into sub-topics via silhouette-based clustering.

    Args:
        corpus_filename: When ``lda_filter`` is true, path of a pickle file
            holding the topic entries; otherwise a path handed to
            ``LDA.get_topics_terms``.
        word2vec_model_filename: Path handed to ``models.Word2Vec.load``.
            It is not touched when ``k == 0``.
        lda_filter: Selects the pickle file as the topic source.
        k: Forwarded to ``cluster_analyse_with_silhouette``.  ``0`` skips
            clustering and returns the loaded topics as lists.

    Returns:
        A list of lists.  With ``k == 0`` each entry is a loaded topic
        converted to a list.  Otherwise each entry is a topic's leading
        fields (everything but its last element) extended with the items of
        one cluster produced from the topic's last element.
    """
    if lda_filter:
        # Binary mode plus a context manager: pickle data is bytes, and the
        # handle must be closed even if unpickling fails.
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at files from a trusted source.
        with open(corpus_filename, 'rb') as handle:
            topic2terms = pickle.load(handle)
    else:
        topic2terms = LDA.get_topics_terms(corpus_filename)

    topics = [list(entry) for entry in topic2terms]
    if k == 0:
        return topics

    word2vec_model = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    for topic in topics:
        words = topic[-1]
        clusters = cluster_analyse_with_silhouette(words, word2vec_model, k)
        for c in clusters:
            # Fresh list per cluster so entries never share the prefix.
            new_topic = topic[:-1]
            new_topic.extend(c)
            new_topics.append(new_topic)
    return new_topics
コード例 #3
0
def hierarchical_topic_analyse(lda_model_file, word2vec_model_file, k=1):
    """Refine LDA topics by re-clustering their last element ``k`` times.

    Args:
        lda_model_file: Path handed to ``LDA.get_topics_terms``.
        word2vec_model_file: Path handed to ``models.Word2Vec.load``.
        k: Number of refinement passes.  With ``k == 0`` the loaded topics
            are returned unchanged (as lists).

    Returns:
        A list of lists.  After each pass every entry consists of a parent
        topic's leading fields (everything but its last element) extended
        with the items of one element yielded by ``cluster``.
    """
    topics = [list(entry) for entry in LDA.get_topics_terms(lda_model_file)]
    word2vec_model = models.Word2Vec.load(word2vec_model_file)
    for _ in range(k):
        refined = []
        for parent in topics:
            dictionary, matrix = get_words_matrix(parent[-1], word2vec_model)
            # NOTE(review): lda_terms_analysis unpacks this same call as a
            # (clusters, centers) pair; here the result is iterated
            # directly -- confirm which shape cluster() actually returns.
            clusters = cluster(matrix, dictionary, 2, 10)
            prefix = parent[:-1]
            refined.extend(prefix + list(item) for item in clusters)
            # Drop the per-topic intermediates before the next iteration.
            del dictionary, matrix, clusters
        topics = refined
    return topics
コード例 #4
0
ファイル: test.py プロジェクト: hxiaofeng/HTopicModel
#!/usr/bin/env python
# encoding: utf-8

import LDA


if __name__ == '__main__':
    # Dump every topic id and its terms to the file ``topics``: one line
    # with the id, one line with the tab-separated terms, then a blank line.
    ts = LDA.get_topics_terms('../data/models/sougou_lda_50_model.md')
    # The context manager closes the output file even if a write raises.
    with open('topics', 'wb') as out:
        for t, terms in ts.items():
            # The file is opened in binary mode, so each chunk is encoded to
            # bytes as a whole; mixing encoded bytes with a text '\n' fails
            # under Python 3.
            out.write((str(t) + '\n').encode('utf8'))
            out.write(('\t'.join(terms) + '\n').encode('utf8'))
            out.write(b'\n')