def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    """Cluster the terms of every LDA topic and rank the clusters by cohesion.

    The topics are read with ``LDA.get_topics_terms`` and the embedding with
    ``models.Word2Vec.load``. The last element of each topic is treated as its
    word collection; those words are clustered and every cluster is scored
    with ``utilities.cohesion`` against its own center.

    Args:
        lda_model_filename: Passed unchanged to ``LDA.get_topics_terms``.
        word2vec_model_filename: Passed unchanged to ``models.Word2Vec.load``.

    Returns:
        A pair ``(new_topics, useless)``:

        * ``new_topics`` holds one row per topic,
          ``topic[:-1] + [cohesion, cluster_words]``, for the cluster with the
          smallest cohesion value.
        * ``useless`` holds one row per remaining cluster,
          ``topic[:-1] + [label, cohesion, cluster_words]``.

    Raises:
        IndexError: If clustering a topic yields no clusters.
    """
    topic_rows = LDA.get_topics_terms(lda_model_filename)
    embedding = models.Word2Vec.load(word2vec_model_filename)

    kept = []
    discarded = []
    for row in topic_rows:
        vocabulary, vectors = get_words_matrix(row[-1], embedding)
        # NOTE(review): 2 and 10 presumably bound the number of clusters
        # tried by `cluster` -- confirm against its definition.
        groups, centroids = cluster(vectors, vocabulary, 2, 10)

        scored = []
        for label, members in groups.items():
            _, member_vectors = get_words_matrix(members, embedding)
            centroid = centroids[label]
            scored.append((label, utilities.cohesion(member_vectors, centroid)))

        # Ascending order: the smallest cohesion value comes first.
        ranked = sorted(scored, key=lambda pair: pair[1])

        head = list(row[:-1])
        best_label, best_score = ranked[0]
        kept.append(head + [best_score, groups[best_label]])

        for label, score in ranked[1:]:
            discarded.append(list(row[:-1]) + [label, score, groups[label]])

    return kept, discarded
def _cohesion_scored_topics(prefix, clusters, centers, word2vec_model):
    """Pair every cluster with its cohesion as ``(prefix + [label, words], cohesion)``."""
    scored = []
    for label, cluster_words in clusters.items():
        # get_words_matrix returns (dictionary, matrix); only the matrix is scored.
        _, sub_matrix = get_words_matrix(cluster_words, word2vec_model)
        cohesion = utilities.cohesion(sub_matrix, centers[label])
        scored.append((prefix + [label, cluster_words], cohesion))
    return scored


def cluster_analyse_with_cohesion(words, word2vec_model, k=1):
    """Cluster *words*, then keep re-clustering the lowest-cohesion topic.

    A topic is a list ``[label, ..., label, words]``: the chain of cluster
    labels that leads to it, followed by its member words. Its depth is the
    number of labels, i.e. ``len(topic) - 1``. The initial clustering yields
    topics of depth 1, so with the default ``k=1`` no re-clustering happens.

    While the deepest topic is shallower than *k*, the topic with the lowest
    cohesion value (the first one on ties) is replaced by the clusters of its
    own words; those children are placed at the front of the result, followed
    by the untouched topics in their previous order.

    NOTE(review): "lowest value first" follows the ascending sort used when
    ranking cohesions elsewhere in this file; confirm it matches the
    orientation of ``utilities.cohesion`` for choosing which topic to split.

    Args:
        words: Words to cluster; passed to ``get_words_matrix``.
        word2vec_model: Embedding model passed to ``get_words_matrix``.
        k: Depth at which refinement stops.

    Returns:
        The list of topics, each ``[label, ..., label, words]``.
    """
    dictionary, matrix = get_words_matrix(words, word2vec_model)
    # NOTE(review): 2 and 10 presumably bound the number of clusters tried by
    # `cluster` -- confirm against its definition.
    clusters, centers = cluster(matrix, dictionary, 2, 10)

    scored = _cohesion_scored_topics([], clusters, centers, word2vec_model)
    depth = max((len(topic) - 1 for topic, _ in scored), default=0)

    while scored and depth < k:
        # Position (not cluster label) of the lowest-cohesion topic, so it can
        # be removed from the list regardless of what type the labels are.
        position = min(range(len(scored)), key=lambda i: scored[i][1])
        topic, _ = scored[position]

        sub_dictionary, sub_matrix = get_words_matrix(topic[-1], word2vec_model)
        sub_clusters, sub_centers = cluster(sub_matrix, sub_dictionary, 2, 10)
        children = _cohesion_scored_topics(
            topic[:-1], sub_clusters, sub_centers, word2vec_model
        )

        # Children replace their parent and go first; the rest keep their order.
        scored = children + scored[:position] + scored[position + 1:]
        depth = max([depth] + [len(child) - 1 for child, _ in children])

    return [topic for topic, _ in scored]