def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    """Cluster each topic's terms and keep the cluster with the smallest cohesion.

    For every row returned by ``LDA.get_topics_terms`` the last element is
    taken as the term collection. The terms are clustered with
    ``cluster(matrix, dictionary, 2, 10)`` and every resulting cluster is
    scored with ``utilities.cohesion`` against its own center.

    Args:
        lda_model_filename: Passed to ``LDA.get_topics_terms``.
        word2vec_model_filename: Passed to ``models.Word2Vec.load``.

    Returns:
        A pair ``(new_topics, useless)``:

        * ``new_topics`` -- one row per topic:
          ``topic[:-1] + [cohesion, cluster_terms]`` for the cluster whose
          cohesion value sorts first (ascending).
        * ``useless`` -- one row per remaining cluster:
          ``topic[:-1] + [label, cohesion, cluster_terms]``.

    Raises:
        IndexError: If clustering yields no clusters for a topic.
    """
    topics = LDA.get_topics_terms(lda_model_filename)
    word2vec = models.Word2Vec.load(word2vec_model_filename)

    kept_rows = []
    rejected_rows = []
    for topic in topics:
        dictionary, matrix = get_words_matrix(topic[-1], word2vec)
        clusters, centers = cluster(matrix, dictionary, 2, 10)

        # Score every cluster against its own center.
        scored = []
        for label, members in clusters.items():
            _, member_matrix = get_words_matrix(members, word2vec)
            score = utilities.cohesion(member_matrix, centers[label])
            scored.append((label, score))
        ranked = sorted(scored, key=lambda pair: pair[1])

        prefix = list(topic[:-1])

        # Kept rows carry no label, only the score and the cluster terms.
        best_label, best_score = ranked[0]
        kept_rows.append(prefix + [best_score, clusters[best_label]])

        # Every other cluster is reported separately, label included.
        for label, score in ranked[1:]:
            rejected_rows.append(prefix + [label, score, clusters[label]])

    return kept_rows, rejected_rows
def hierarchical_topic_analyse_with_silhouette(corpus_filename,
                                               word2vec_model_filename,
                                               lda_filter=False, k=1):
    """Split every topic's terms into clusters and return one row per cluster.

    Args:
        corpus_filename: When ``lda_filter`` is true, path of a pickle file
            holding the topic rows; otherwise passed to
            ``LDA.get_topics_terms``.
        word2vec_model_filename: Passed to ``models.Word2Vec.load``. Not
            read when ``k == 0``.
        lda_filter: Selects how ``corpus_filename`` is interpreted (see
            above).
        k: Forwarded to ``cluster_analyse_with_silhouette``. ``0`` skips
            clustering entirely.

    Returns:
        A list of lists. With ``k == 0`` these are the loaded topic rows
        converted to lists. Otherwise each row is ``topic[:-1]`` followed
        by the elements of one cluster produced for that topic's last
        element.
    """
    if lda_filter:
        # NOTE(security): pickle executes arbitrary code while loading;
        # only point this at files produced by a trusted source.
        # Binary mode is required by pickle on Python 3, and the context
        # manager guarantees the handle is closed.
        with open(corpus_filename, 'rb') as pickled:
            topic2terms = pickle.load(pickled)
    else:
        topic2terms = LDA.get_topics_terms(corpus_filename)

    topics = [list(t) for t in topic2terms]
    if k == 0:
        return topics

    word2vec_model = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    for topic in topics:
        words = topic[-1]
        clusters = cluster_analyse_with_silhouette(words, word2vec_model, k)
        for c in clusters:
            new_topic = topic[:-1]
            new_topic.extend(c)
            new_topics.append(new_topic)
    return new_topics
def hierarchical_topic_analyse(lda_model_file, word2vec_model_file, k=1):
    """Refine the topic rows ``k`` times by clustering each row's last element.

    Args:
        lda_model_file: Passed to ``LDA.get_topics_terms``.
        word2vec_model_file: Passed to ``models.Word2Vec.load``.
        k: Number of refinement rounds; ``0`` returns the rows untouched
            (converted to lists).

    Returns:
        A list of lists. After each round every row is the previous row
        without its last element, followed by the elements of one item
        yielded by ``cluster(matrix, dictionary, 2, 10)``.
    """
    topics = [list(row) for row in LDA.get_topics_terms(lda_model_file)]
    word2vec_model = models.Word2Vec.load(word2vec_model_file)

    for _ in range(k):
        refined = []
        for row in topics:
            prefix = row[:-1]
            dictionary, matrix = get_words_matrix(row[-1], word2vec_model)
            # NOTE(review): lda_terms_analysis unpacks cluster()'s return as
            # a (clusters, centers) pair, whereas here the return value is
            # iterated directly -- confirm which shape is intended.
            clusters = cluster(matrix, dictionary, 2, 10)
            refined.extend(prefix + list(item) for item in clusters)
            # Drop the per-row intermediates before handling the next row.
            del dictionary, matrix, clusters
        topics = refined

    return topics
#!/usr/bin/env python
# encoding: utf-8
# Script: dump every topic returned by LDA.get_topics_terms to the file
# 'topics'. Each topic is written as its key on one line, its terms joined
# by tabs on the next line, and then an empty line.
import LDA


def _to_bytes(text):
    """Return *text* as UTF-8 bytes, leaving byte strings untouched."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf8')


if __name__ == '__main__':
    ts = LDA.get_topics_terms('../data/models/sougou_lda_50_model.md')
    # The context manager closes the output file even if a write fails.
    # The file is opened in binary mode, so only bytes are ever written.
    with open('topics', 'wb') as out:
        for t, terms in ts.items():
            out.write(_to_bytes(str(t)) + b'\n')
            out.write(_to_bytes('\t'.join(terms)) + b'\n')
            out.write(b'\n')