def compute_coherence_values(_dictionary, _corpus, _texts, _limit, _start, _step, _path, _seed):
    """
    Compute c_v coherence for various number of topics

    One Mallet LDA model is trained for every topic count in
    ``range(_start, _limit, _step)`` and scored with a ``c_v``
    ``CoherenceModel``.

    Parameters:
    -----------
    _dictionary : Gensim dictionary
    _corpus : Gensim corpus
    _texts : List of input texts
    _limit : Upper bound on the number of topics (exclusive, as in ``range``)
    _start : First number of topics to try
    _step : Increment between successive numbers of topics
    _path : Path to the Mallet executable (passed as ``mallet_path``)
    _seed : Random seed handed to Mallet (passed as ``random_seed``)

    Returns:
    --------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # containers
    coherence_values = []
    model_list = []

    for num_topics in range(_start, _limit, _step):
        # The Mallet wrapper class is ``LdaMallet``; ``wrappers.Lda`` does not exist.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path=_path,
            corpus=_corpus,
            num_topics=num_topics,
            id2word=_dictionary,
            random_seed=_seed,
        )
        model_list.append(model)

        coherencemodel = CoherenceModel(
            model=model,
            texts=_texts,
            dictionary=_dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherencemodel.get_coherence())

    # return output
    return model_list, coherence_values
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    One ``LdaModel`` is trained for every topic count in
    ``range(start, limit, step)`` and scored with a ``c_v``
    ``CoherenceModel`` built from ``texts``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)

        # Score against the caller-supplied ``texts`` rather than a global,
        # so the function honours its own parameter.
        coherencemodel = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
def compute_coherence_s(num_topic):
    """Train an LDA model with ``num_topic`` topics and return its c_v coherence.

    Uses ``corpus_s``, ``dict_corp_s`` and the ``'stylist_processed_msg'``
    column of ``order_lag_one`` from the enclosing scope. A progress line is
    printed before training starts.
    """
    print(f"Computing for num topic {num_topic}")

    topic_model = LdaModel(corpus_s, num_topics=num_topic, id2word=dict_corp_s, passes=15)

    stylist_texts = order_lag_one['stylist_processed_msg'].tolist()
    scorer = CoherenceModel(
        model=topic_model,
        texts=stylist_texts,
        dictionary=dict_corp_s,
        coherence='c_v',
    )
    return scorer.get_coherence()
def search_lda(self, data_train, data_val, min_topics=5, max_topics=20,
               mallet_path="../../mallet-2.0.8/bin/mallet"):
    """Fit one Mallet LDA model per topic count and keep the most coherent one.

    For every ``t`` in ``min_topics..max_topics`` (both inclusive) a model is
    trained on the training corpus and scored with ``c_v`` coherence against
    the cleaned validation documents. The model with the highest coherence is
    stored on ``self.lda``; ``self.id2word`` is set to the dictionary built
    from the cleaned training documents.

    Parameters
    ----------
    data_train : object indexable by ``"opinion"``; also passed to
        ``self.get_corpus`` to build the training corpus.
    data_val : object indexable by ``"opinion"``; its cleaned documents are
        the texts used for coherence scoring.
    min_topics : smallest number of topics to try.
    max_topics : largest number of topics to try (inclusive).
    mallet_path : path to the Mallet executable. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    lda_search : list of the trained models, in topic-count order.
    coherences : list of the matching coherence values.

    Raises
    ------
    ValueError
        If ``min_topics > max_topics`` (there would be no model to select).
    """
    # Fail early with a readable message; otherwise the empty search only
    # surfaced as a ValueError from np.argmax after the preprocessing work.
    if min_topics > max_topics:
        raise ValueError(
            f"min_topics ({min_topics}) must not exceed max_topics ({max_topics})"
        )

    doc_train = self.clean(data_train["opinion"])
    doc_val = self.clean(data_val["opinion"])
    self.id2word = corpora.Dictionary(doc_train)
    train_corpus = self.get_corpus(data_train)

    lda_search = []
    coherences = []
    for t in range(min_topics, max_topics + 1):
        lda = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=train_corpus,
            id2word=self.id2word,
            num_topics=t,
            random_seed=utils.RANDOM_SEED,
        )
        lda_search.append(lda)

        coherence_model = CoherenceModel(
            lda, texts=doc_val, dictionary=self.id2word, coherence='c_v'
        )
        coherences.append(coherence_model.get_coherence())

    # np.argmax returns the first maximum, so ties go to the smaller topic count.
    self.lda = lda_search[np.argmax(coherences)]
    return lda_search, coherences
def compute_coherence_c(num_topic):
    """Train a seeded LDA model with ``num_topic`` topics and return its c_v coherence.

    Uses ``corpus``, ``dict_corp_c`` and the ``'customer_processed_msg'``
    column of ``order_lag_one`` from the enclosing scope. A progress line is
    printed before training starts.
    """
    print(f"Computing for num topic {num_topic}")

    topic_model = LdaModel(
        corpus,
        num_topics=num_topic,
        id2word=dict_corp_c,
        passes=15,
        random_state=60616,
    )

    customer_texts = order_lag_one['customer_processed_msg'].tolist()
    scorer = CoherenceModel(
        model=topic_model,
        texts=customer_texts,
        dictionary=dict_corp_c,
        coherence='c_v',
    )
    return scorer.get_coherence()
import sys

import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.models import word2vec

# Command line: argv[1] = path to a one-sentence-per-line text file,
#               argv[2] = LDA alpha prior (float).
data_file = sys.argv[1]
alpha = float(sys.argv[2])

# Drop sentences with fewer than two tokens before building the dictionary.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]

# Sweep the number of topics and report u_mass coherence and perplexity.
for i in range(1, 31):
    # Use the alpha given on the command line; it was parsed above but a
    # hard-coded 0.01 used to be passed here instead.
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)

    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()

    # log_perplexity returns the per-word likelihood bound;
    # perplexity is 2 ** (-bound).
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    print(f"num_topics = {i}, coherence = {coherence}, perplexity = {perplexity}")
# LDA alpha prior, taken from the third command-line argument.
alpha = float(sys.argv[3])

# Tokenised sentences with at least two tokens, plus their bag-of-words form.
sentences = [tokens for tokens in word2vec.LineSentence(data_file) if len(tokens) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(tokens) for tokens in sentences]

# Coherence measures to report, in CSV column order.
coh_list = ['u_mass', 'c_v', 'c_w2v', 'c_uci', 'c_npmi']
print(','.join(['topic_num', *coh_list]))

# One CSV row per topic count: the count followed by each coherence score.
for i in range(1, max_topic_num + 1):
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)
    coh = list(
        map(
            str,
            (
                CoherenceModel(
                    model=lda,
                    corpus=corpus,
                    texts=sentences,
                    coherence=measure,
                    processes=1,
                ).get_coherence()
                for measure in coh_list
            ),
        )
    )
    print(','.join([str(i), *coh]))
#Also use TDIF topic modelling in future lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=32, id2word=dictionary, passes=2, workers=2) for idx, topic in lda_model_tfidf.print_topics(-1): print('Topic: {} Word: {}'.format(idx, topic)) #sample doc test score # for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]): # print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 30))) from gensim.models.ldamodel import CoherenceModel from gensim.models.ldamodel import LdaModel coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() def compute_coherence_values(dictionary, corpus, texts, limit, start, step): """ Compute c_v coherence for various number of topics Parameters: ---------- dictionary : Gensim dictionary corpus : Gensim corpus texts : List of input texts limit : Max num of topics Returns: -------
# Tokenised sentences with at least two tokens, plus their bag-of-words form.
sentences = [tokens for tokens in word2vec.LineSentence(data_file) if len(tokens) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(tokens) for tokens in sentences]

print('topic_num,avg,bound,perplexity,coherence')

# One CSV row per topic count. The statements below stay in this order
# because each inference call draws from the model's random state.
for i in range(1, max_topic_num + 1):
    lda = LdaModel(
        corpus=corpus,
        id2word=dic,
        num_topics=i,
        alpha=alpha,
        random_state=1,
    )

    # Mean number of topics the model assigns to a document.
    avg_topics = mean([len(lda[doc]) for doc in corpus])

    bound = lda.bound(corpus)

    # Perplexity is 2 ** (-per-word bound).
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    cm = CoherenceModel(
        model=lda,
        corpus=corpus,
        coherence='u_mass',
        processes=1,
    )
    coherence = cm.get_coherence()

    print(f"{i},{avg_topics},{bound},{perplexity},{coherence}")