# Assumed module-level imports for the functions below. The project-local
# helpers (preprocess, coherence, distributions, topic_distance,
# doc_vector_generator) come from this repository and are not shown here.
import numpy as np
import plotly.graph_objects as go
from gensim.models import CoherenceModel, HdpModel, LdaModel, LsiModel
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF as sk_NMF
from sklearn.feature_extraction.text import TfidfVectorizer


def nmf_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    # The tf-idf matrix does not depend on the topic count, so build it once
    # instead of refitting it on every sweep iteration.
    vectorizer = TfidfVectorizer()
    A = vectorizer.fit_transform(cleaned_data)
    feature_names = vectorizer.get_feature_names_out()  # terms; get_feature_names() was removed in newer scikit-learn
    topic_numbers = []
    coherence_values = []
    for num_topics in range(start, end + 1, step):
        nmf_model = sk_NMF(n_components=num_topics, init='nndsvd')
        W = nmf_model.fit_transform(A)  # document-topic distribution
        H = nmf_model.components_       # topic-word distribution
        word_count = 20
        word_distributions = []
        for topic in range(len(H)):
            top_indices = np.argsort(H[topic, :])[::-1]
            doc_list = []
            for term_index in top_indices[:word_count]:
                doc_list.append([feature_names[term_index], H[topic, term_index]])
            word_distributions.append(doc_list)
        nmf_topics = [[word[0] for word in topic] for topic in word_distributions]
        coherence_v = CoherenceModel(topics=nmf_topics,
                                     texts=data_tokens,
                                     dictionary=id2word).get_coherence()
        topic_numbers.append(num_topics)
        coherence_values.append(coherence_v)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig

def LSA(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    doc_number = len(data_tokens)
    lsi_model = LsiModel(corpus=corpus, num_topics=n_topic, id2word=id2word)
    coherence_v = coherence.coherence_value(model=lsi_model,
                                            tokens=data_tokens,
                                            dictionary=id2word)
    word_distributions = distributions.word_distribution(model=lsi_model,
                                                         n_topic=n_topic)
    topic_distributions = distributions.lsi_topic_distribution(doc_number=doc_number,
                                                               model=lsi_model,
                                                               corpus=corpus)
    doc_dist = distributions.lsi_doc_distribution(n_topic=n_topic,
                                                  doc_number=doc_number,
                                                  model=lsi_model,
                                                  corpus=corpus)
    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": data_tokens,
    }
    return output

def LDA(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    doc_number = len(data_tokens)
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=n_topic,
                         random_state=100,
                         update_every=1,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=1e-8)
    coherence_v = coherence.coherence_value(model=lda_model,
                                            tokens=data_tokens,
                                            dictionary=id2word)
    word_distributions = distributions.word_distribution(model=lda_model,
                                                         n_topic=n_topic)
    topic_distributions = distributions.lda_topic_distribution(doc_number=doc_number,
                                                               model=lda_model,
                                                               corpus=corpus)
    doc_dist = distributions.lda_doc_distribution(n_topic=n_topic,
                                                  doc_number=doc_number,
                                                  model=lda_model,
                                                  corpus=corpus)
    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": data_tokens,
    }
    return output

def w2v_kmeans(corpus, n_clusters):
    """Cluster word2vec document vectors with k-means, then fit a one-topic LDA
    per cluster to extract each cluster's top words."""
    doc_vectors, cleaned_data_tokens = doc_vector_generator(corpus)
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=40)
    kmeans_model.fit(doc_vectors)
    labels = kmeans_model.labels_.tolist()
    doc_number = len(labels)
    # Group the raw documents by their assigned cluster.
    nested_corpus = [[] for _ in range(n_clusters)]
    for i in range(doc_number):
        nested_corpus[labels[i]].append(corpus[i])
    doc_dist = {}
    document_dists = np.array(labels)
    for cluster in range(n_clusters):
        doc_dist[cluster] = np.where(document_dists == cluster)[0].tolist()
    # Hard clustering: each document belongs to its cluster with weight 1.0.
    topic_distributions = [[[labels[i], 1.0]] for i in range(doc_number)]
    word_distributions = []
    for cluster_number in range(len(nested_corpus)):
        cleaned_data, data_tokens, id2word, corpus3 = preprocess.preprocess(
            corpus=nested_corpus[cluster_number])
        n_topic = 1
        lda_model = LdaModel(corpus=corpus3,
                             id2word=id2word,
                             num_topics=n_topic,
                             random_state=100,
                             update_every=1,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True,
                             minimum_probability=1e-8)
        word_distributions.append(
            distributions.word_distribution(model=lda_model, n_topic=n_topic)[0])
    silhouette_score = metrics.silhouette_score(doc_vectors, labels, metric='cosine')
    output = {
        "filecount": doc_number,
        "silhouette_score": float(silhouette_score),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": cleaned_data_tokens,
        "labels": labels,
        "doc_vectors": [doc_vec.tolist() for doc_vec in doc_vectors],
    }
    return output

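# The doc_vector_generator helper that w2v_kmeans relies on is defined elsewhere
# in the project and is not shown in this module. The sketch below is a minimal,
# hypothetical stand-in that matches how its return values are consumed above
# (a list of fixed-length document vectors plus the cleaned token lists). It
# assumes token vectors are averaged per document with Word2Vec, which is one
# common choice, not necessarily the project's actual implementation.
from gensim.models import Word2Vec


def _doc_vector_generator_sketch(corpus, vector_size=100):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    w2v = Word2Vec(sentences=data_tokens, vector_size=vector_size,
                   min_count=1, epochs=20)
    # Average each document's token vectors; empty documents fall back to the
    # zero vector so KMeans still receives a fixed-length input.
    doc_vectors = [
        np.mean([w2v.wv[token] for token in tokens], axis=0)
        if tokens else np.zeros(vector_size)
        for tokens in data_tokens
    ]
    return doc_vectors, data_tokens
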
def NMF(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    vectorizer = TfidfVectorizer()
    A = vectorizer.fit_transform(cleaned_data)
    nmf_model = sk_NMF(n_components=n_topic, init='nndsvd')
    W = nmf_model.fit_transform(A)  # document-topic distribution
    H = nmf_model.components_       # topic-word distribution
    feature_names = vectorizer.get_feature_names_out()  # terms; get_feature_names() was removed in newer scikit-learn
    doc_number = len(W)
    topic_number = len(H)
    word_count = 20
    word_distributions = []
    for topic in range(topic_number):
        top_indices = np.argsort(H[topic, :])[::-1]
        doc_list = []
        for term_index in top_indices[:word_count]:
            doc_list.append([feature_names[term_index], H[topic, term_index]])
        word_distributions.append(doc_list)
    topic_distributions = []
    for document in range(doc_number):
        topic_distributions.append(
            [[topic, W[document][topic]] for topic in range(len(W[document]))])
    # Assign each document to the topic carrying its highest weight.
    doc_dist = {i: [] for i in range(topic_number)}
    for i in range(doc_number):
        doc_dist[topic_distance.get_topic_dist_max(topic_distributions[i])[0]].append(i)
    nmf_topics = [[word[0] for word in topic] for topic in word_distributions]
    coherence_v = CoherenceModel(topics=nmf_topics,
                                 texts=data_tokens,
                                 dictionary=id2word).get_coherence()
    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": data_tokens,
    }
    return output

def lsa_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    topic_numbers = []
    coherence_values = []
    for num_topics in range(start, end + 1, step):
        lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)
        coh = coherence.coherence_value(model=lsi_model,
                                        tokens=data_tokens,
                                        dictionary=id2word)
        topic_numbers.append(num_topics)
        coherence_values.append(coh)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig

def lda_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    topic_numbers = []
    coherence_values = []
    for num_topics in range(start, end + 1, step):
        lda = LdaModel(corpus=corpus,
                       id2word=id2word,
                       num_topics=num_topics,
                       random_state=100,
                       update_every=1,
                       passes=10,
                       alpha='auto',
                       per_word_topics=True,
                       minimum_probability=1e-8)
        coh = coherence.coherence_value(model=lda,
                                        tokens=data_tokens,
                                        dictionary=id2word)
        topic_numbers.append(num_topics)
        coherence_values.append(coh)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig

def HDP(corpus):
    n_topic = 150  # matches HdpModel's default truncation level for the topic count
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    doc_number = len(data_tokens)
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    coherence_v = coherence.coherence_value(model=hdp_model,
                                            tokens=data_tokens,
                                            dictionary=id2word)
    # The LSI-style distribution helper is reused for HDP's per-document output.
    topic_distributions = distributions.lsi_topic_distribution(
        doc_number=doc_number, model=hdp_model, corpus=corpus)
    # Remap the sparse HDP topic ids onto a dense 0..n range, keeping the
    # original ids in topics_n. enumerate avoids list.index(), which returns
    # the wrong position when two documents share an identical distribution.
    topics_n = []
    for doc_index, document in enumerate(topic_distributions):
        for pair_index, topic_dist in enumerate(document):
            if topic_dist[0] not in topics_n:
                topics_n.append(topic_dist[0])
            topic_distributions[doc_index][pair_index][0] = topics_n.index(topic_dist[0])
    word_distributions = distributions.hdp_word_distribution(model=hdp_model,
                                                             topics_n=topics_n)
    doc_dist = distributions.hdp_doc_distribution(n_topic=n_topic,
                                                  topics_n=topics_n,
                                                  doc_number=doc_number,
                                                  model=hdp_model,
                                                  corpus=corpus)
    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "topics_n": topics_n,
        "data_tokens": data_tokens,
    }
    return output
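
# A minimal, hypothetical usage sketch for the module above. `docs` stands in
# for whatever list of raw documents preprocess.preprocess expects in this
# project; the topic counts and sweep range are illustrative, not recommended
# settings, and a real corpus is needed for meaningful coherence scores.
if __name__ == "__main__":
    # Toy stand-in corpus; replace with a real document collection.
    docs = ["replace these strings", "with your own documents"]

    # Sweep the topic count and inspect the coherence curve before fixing it.
    fig = lda_optimum_coherence(corpus=docs, start=2, end=20, step=2)
    fig.show()

    # Fit one model at the chosen topic count and read off the summary dict.
    result = LDA(corpus=docs, n_topic=8)
    print(result["filecount"], result["coherence_value"])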