class LMDL_LDA():
    """LDA topic model trained on the texts supplied by ``LMDL_Corpus``.

    Construction loads the corpus, builds a gensim ``Dictionary`` and
    bag-of-words corpus from it, and trains an ``LdaModel`` with
    ``LDA_NUM_TOPICS`` topics.
    """

    def __init__(self):
        """Load the corpus texts and train the LDA model on them."""
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        # One bag-of-words vector per text, in the same order as self.texts.
        self.corpus = [self.dictionary.doc2bow(words) for words in self.texts]
        self.lda = LdaModel(
            self.corpus,
            num_topics=LDA_NUM_TOPICS,
            id2word=self.dictionary,
        )

    def print_topics(self):
        """Return the model's ``print_topics`` output for all ``LDA_NUM_TOPICS`` topics."""
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        """Return ``show_topic`` output for every topic assigned to a document.

        Args:
            document_name: Identifier passed to
                ``self.lmdl.token_list_processed`` to obtain the document's
                tokens.

        Returns:
            A list with one entry per topic reported by the model for the
            document; each entry is the result of
            ``self.lda.show_topic(topic_id, topn=10)``.
        """
        tokens = self.lmdl.token_list_processed(document_name)
        bag_of_words = self.dictionary.doc2bow(tokens)
        assignments = self.lda.get_document_topics(
            bag_of_words,
            minimum_probability=None,
            minimum_phi_value=None,
            per_word_topics=False,
        )
        # The first element of each assignment is the topic id.
        return [
            self.lda.show_topic(assignment[0], topn=10)
            for assignment in assignments
        ]

    def top_topics(self):
        """Return the model's ``top_topics`` result computed over the training corpus.

        Uses ``u_mass`` coherence with the 20 top words per topic.
        """
        options = {
            "corpus": self.corpus,
            "texts": self.texts,
            "dictionary": self.dictionary,
            "window_size": None,
            "coherence": 'u_mass',
            "topn": 20,
            "processes": -1,
        }
        return self.lda.top_topics(**options)
def find_topic(self, num_topics, num_words=2, passes=20):
    """Train a fresh LDA model on ``self.texts`` and return its top topics.

    A new dictionary, bag-of-words corpus and model are built on every
    call; nothing is stored on the instance.

    Args:
        num_topics: Number of topics for the new ``LdaModel``.
        num_words: Number of top words per topic passed to ``top_topics``
            as ``topn``. Defaults to 2.
        passes: Number of training passes for the new ``LdaModel``.

    Returns:
        The result of ``LdaModel.top_topics`` for the newly trained model.
    """
    dic = Dictionary(self.texts)
    corpus = [dic.doc2bow(text) for text in self.texts]
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dic, passes=passes)
    # Honour num_words: topn used to be hard-coded to 2, so the argument
    # was silently ignored.
    return lda.top_topics(topn=num_words, dictionary=dic, corpus=corpus)
# Train an LDA model on a line-per-sentence text file and print the
# coherence of each topic.
#
# Command-line arguments (positional):
#   1. path to the data file
#   2. number of topics (int)
#   3. alpha passed to LdaModel (float)
import sys

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models.ldamodel import LdaModel

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
alpha = float(sys.argv[3])

# Keep only sentences that contain at least two tokens.
sentences = list(
    filter(lambda tokens: len(tokens) >= 2, word2vec.LineSentence(data_file))
)

dic = Dictionary(sentences)
corpus = list(map(dic.doc2bow, sentences))

# random_state is fixed at 1, as in every run of this script.
lda = LdaModel(
    corpus=corpus,
    num_topics=topic_num,
    id2word=dic,
    alpha=alpha,
    random_state=1,
)

# Each top_topics entry is indexed as (topic, coherence).
for topic, coherence in lda.top_topics(corpus=corpus):
    print(f"coherence = {coherence}, topic = {topic}")
iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every) # Pickle the model for later use pickle.dump( lda_model, open(os.path.join('./results/lda_save_' + str(num_topics) + '.pk'), 'wb')) print('The top 10 keywords in each topic') pprint(lda_model.print_topics(num_words=10)) # Topic coherence https://rare-technologies.com/what-is-topic-coherence/ top_topics = lda_model.top_topics(corpus) # , num_words=20) avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics print('Average topic coherence: %.4f.' % avg_topic_coherence) print('Top topics and their coherence:') pprint(top_topics) # Comparing LDA models # https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html # LDA Results Visual Analysis if visualize: # pyLDAvis.enable_notebook() lda_res_path = os.path.join('./results/lda_pyldavis_' + str(num_topics)) prepped_results = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)