def train(data, mallet_path: str, random_seed: int, num_topics: int, *args, **kwargs):
    """Fit a MALLET LDA model on ``data`` and return its prepared visualisation.

    ``data`` is used both to build the gensim dictionary and, document by
    document, to build the bag-of-words corpus. Extra positional and keyword
    arguments are only written to the debug log.

    Returns:
        Whatever ``prepare`` returns for the converted model.
    """
    logger.debug(f"start training, args:{args}, kwargs:{kwargs}")

    # Vocabulary plus term/document frequencies.
    vocabulary = corpora.Dictionary(data)
    bow_corpus = [vocabulary.doc2bow(document) for document in data]

    wrappers = gensim.models.wrappers
    mallet_lda = wrappers.LdaMallet(
        mallet_path,
        corpus=bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        random_seed=random_seed,
    )

    # The MALLET wrapper has to be converted to a native gensim model
    # before it is handed to ``prepare``.
    converted = wrappers.ldamallet.malletmodel2ldamodel(mallet_lda)
    return prepare(converted, bow_corpus, vocabulary, n_jobs=1)
def visualize_lda_model():
    """Train a 20-topic LDA model on the preprocessed data and display it.

    Pipeline:
      1. flatten the lemmatised sentences of each document into one token list;
      2. lower-case tokens and drop stop words, non-alphabetic tokens and
         single-character tokens;
      3. merge frequent bigrams/trigrams with ``Phrases``;
      4. build the dictionary (dropping tokens seen in fewer than 3
         documents) and the bag-of-words corpus;
      5. train ``LdaModel`` and prepare it with pyLDAvis.

    Returns:
        The result of ``pyLDAvis.display`` for the prepared model.
    """
    data = preprocess_to_lemmatization()

    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    # A set keeps the per-token membership test below O(1); the original
    # concatenated list was scanned once for every token in the corpus.
    my_stopwords = set(
        stopwords.words('english') + stopwords_verbs + stopwords_other)

    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens
        if token.isalpha() and token.lower() not in my_stopwords
        and len(token) > 1
    ])

    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

    # Seeds NumPy's global RNG before training (presumably for repeatable
    # topics - confirm that LdaModel draws from the global state here).
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary_LDA,
        passes=4,
        alpha=[0.01] * num_topics,
        eta=[0.01] * len(dictionary_LDA),
    )

    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
def test_hdp():
    """Train an HDP model and check that its prepared data can be saved as HTML."""
    bow_corpus, vocab = get_corpus_dictionary()
    model = HdpModel(bow_corpus, vocab.id2token)
    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # Write the page, then remove it again so the test leaves nothing behind.
    html_path = 'index_hdp.html'
    pyLDAvis.save_html(prepared, html_path)
    os.remove(html_path)
def test_lda():
    """Train a two-topic LDA model and check that its prepared data can be saved as HTML."""
    bow_corpus, vocab = get_corpus_dictionary()
    model = LdaModel(corpus=bow_corpus, num_topics=2)
    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # Write the page, then remove it again so the test leaves nothing behind.
    html_path = 'index_lda.html'
    pyLDAvis.save_html(prepared, html_path)
    os.remove(html_path)
def lda_model(data, corpus, dictionary, num_topics):
    """Fit an LDA model and render both of its visualisations.

    The pyLDAvis view is converted to an HTML string and embedded through
    ``components.v1.html``; the document-level view is delegated to
    ``visualize_topics``. Nothing is returned.
    """
    fitted = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # Topic view: pyLDAvis page embedded as raw HTML.
    prepared = gensimvis.prepare(fitted, corpus, dictionary)
    components.v1.html(
        pyLDAvis.prepared_data_to_html(prepared),
        width=1280,
        height=1024,
    )

    # Document view (t-SNE, per the original comment).
    visualize_topics(data, corpus, fitted, num_topics)
def lda_report(lda_model, corpus, data_lemmatized, id2word):
    """Write the topic, perplexity, coherence and pyLDAvis reports for a model.

    All files go to ``../results/output/lda/Best_lda_model`` (relative to the
    current working directory):

    * ``lda_bm_n15.txt``        - output of ``lda_model.print_topics()``
    * ``lda_output_bm_n15.txt`` - log perplexity, then the c_v coherence score
    * ``ldavis_prepared_n15``   - pickled pyLDAvis prepared data
    * ``ldavis_n15.html``       - standalone pyLDAvis page

    :param lda_model: trained model; must provide ``print_topics`` and
        ``log_perplexity`` and be accepted by ``CoherenceModel`` and
        ``gensimvis.prepare``.
    :param corpus: corpus passed to ``log_perplexity`` and ``gensimvis.prepare``.
    :param data_lemmatized: texts passed to ``CoherenceModel`` for the c_v score.
    :param id2word: dictionary passed to ``CoherenceModel`` and ``gensimvis.prepare``.
    """
    output_dir = r'../results/output/lda/Best_lda_model'
    topics_path = f'{output_dir}/lda_bm_n15.txt'
    scores_path = f'{output_dir}/lda_output_bm_n15.txt'
    prepared_path = f'{output_dir}/ldavis_prepared_n15'
    html_path = f'{output_dir}/ldavis_n15.html'

    with open(topics_path, 'w') as f:
        print(lda_model.print_topics(), file=f)

    # Perplexity is written first (truncating the file) so it is kept even if
    # the slower coherence computation below fails.
    with open(scores_path, 'w') as f:
        print('\nPerplexity: ', lda_model.log_perplexity(corpus), file=f)

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    with open(scores_path, 'a') as f:
        print('\nCoherence Score: ', coherence_lda, file=f)

    # Prepare the visualisation once, keep a pickled copy on disk for later
    # reuse, and render the HTML directly from the in-memory object (reading
    # the pickle straight back added nothing).
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(prepared_path, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    pyLDAvis.save_html(LDAvis_prepared, html_path)
def test_sorted_terms():
    """Check that ``sorted_terms`` ranks by the column implied by lambda.

    After exploring the visualisation a user typically settles on one lambda
    value and wants the terms of a topic in that relevance order; this test
    pins the two extreme settings.
    """
    bow_corpus, vocab = get_corpus_dictionary()
    model = LdaModel(corpus=bow_corpus, num_topics=2)
    prepared = gensim_models.prepare(model, bow_corpus, vocab)

    # https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
    # lambda = 1 -> relevance equals logprob; lambda = 0 -> relevance equals loglift.
    for lambda_value, column in ((1, 'logprob'), (0, 'loglift')):
        ranked = prepared.sorted_terms(topic=1, _lambda=lambda_value).to_dict()
        assert ranked[column] == ranked['relevance']
def train(data, *args, **kwargs):
    """Fit a gensim LDA model on ``data`` and return its prepared visualisation.

    The number of topics is read from ``kwargs["num_topics"]`` (a missing key
    raises ``KeyError``); every other training option is fixed below. All
    positional and keyword extras are written to the debug log.

    Returns:
        Whatever ``prepare`` returns for the trained model.
    """
    logger.debug(f"start training, args:{args}, kwargs:{kwargs}")

    # Vocabulary plus term/document frequencies.
    vocabulary = corpora.Dictionary(data)
    bow_corpus = [vocabulary.doc2bow(document) for document in data]

    lda_options = dict(
        corpus=bow_corpus,
        id2word=vocabulary,
        num_topics=kwargs["num_topics"],
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    fitted = gensim.models.ldamodel.LdaModel(**lda_options)

    return prepare(fitted, bow_corpus, vocabulary, n_jobs=1)
# plt.ylabel("Topic proportion")
plt.xlabel("")
plt.savefig("img/nmfPropStacked.png")
plt.clf()

# The script stops here; everything below is unreachable and kept only for
# reference.
exit()
# Stuff in the following is not used anymore

#################################################################
# LDAvis
#################################################################
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

ldaDisp = gensimvis.prepare(lda, corpus, dct, sort_topics=False)
pyLDAvis.save_html(ldaDisp, "ldavistest.html")
# Raw string: "\l" is not a valid escape sequence, so the plain literal
# triggers a warning at compile time. The path value itself is unchanged.
os.startfile(r".\ldavistest.html")

if False:
    for topic in range(0, NUM_TOPICS):
        termslda = lda.show_topic(
            topic, topn=50
        )  # get_topic_terms would return words as dict IDs, not strings
        # Model returns list of tuples, wordcloud wants a dictionary instead
        wordcloudlda = WordCloud(
            background_color="white").generate_from_frequencies(dict(termslda))
        plt.subplot(3, 3, topic + 1)
        plt.imshow(wordcloudlda)
        plt.axis("off")
        plt.title("Topic" + str(topic + 1))
# ## LDA

# In[133]:

# Fit once; the same model, corpus and dictionary are reused by every cell below.
lda_allbow, bow_corpus, dictionary = lda.fit_lda(sentences, num_topics=5)

# In[134]:

lda.lda_topics(lda_allbow)

# In[135]:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# In[137]:

# Visualise the model fitted in In[133]. Re-fitting here with identical
# arguments trained a second model, so the picture no longer showed the
# model whose topics were listed in In[134].
vis = gensimvis.prepare(lda_allbow, bow_corpus, dictionary)
vis
def hdp_model(corpus, dictionary):
    """Fit an HDP model and embed its pyLDAvis page through ``components.v1.html``.

    Nothing is returned; the only output is the embedded HTML component.
    """
    fitted = models.HdpModel(corpus, id2word=dictionary)
    prepared = gensimvis.prepare(fitted, corpus, dictionary)
    components.v1.html(
        pyLDAvis.prepared_data_to_html(prepared),
        width=1280,
        height=1024,
    )
chunksize = 4000  # size of the doc looked at every pass
passes = 20  # number of passes through documents
iterations = 100
# None switches the per-update perplexity estimate off, which is what the
# comment asks for; the previous value of 1 evaluated it on every update.
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# %time
model = gensim.models.LdaModel(corpus=gensim_corpus, id2word=id2word, chunksize=chunksize,
                               alpha='auto', eta='auto',
                               iterations=iterations, num_topics=num_topics,
                               passes=passes, eval_every=eval_every)

# pyLDAvis.gensim.prepare(model, gensim_corpus, dictionary)
gensimvis.prepare(model, gensim_corpus, dictionary)

model.show_topic(3)

# Infer the topics of each review exactly once. Inference is randomly
# initialised, so three separate calls per review (one per column) were both
# slow and able to disagree with each other.
# NOTE(review): [0] picks the first (topic_id, probability) pair returned,
# which looks like the lowest topic id above the probability cut-off rather
# than the most probable topic - confirm this is intended.
first_topic = df['Review_tokenized'].apply(
    lambda x: model.get_document_topics(dictionary.doc2bow(x.split(',')))[0])
df['topics'] = first_topic.apply(lambda pair: pair[0])
df['score'] = first_topic.apply(lambda pair: pair[1])
df['topic_name'] = first_topic.apply(lambda pair: model.show_topic(pair[0]))
df['topic_name2'] = df['topic_name'].apply(lambda x: convert_to_topic(x))

df[['Restaurant', 'Review', 'topics', 'topic_name2', 'score']]

# Markdown cell (Thai): interpretation of the topics found, starting with the
# terms of topic 0. The text is cut off in the reviewed excerpt; the closing
# quotes below were added so that this block parses on its own.
"""Interpretation
พบว่า Topic Name สามารถแบ่งย่อยได้ 5 เรื่อง
Topic 0 พูดถึง ดี น้ำ ราคา เนื้อ อาหาร ทาน หวาน อร่อย กิน คุ้ม
"""
"""## Clear visualization of Lda model with number of topics taken as 12""" import pyLDAvis import os import pickle import pyLDAvis.gensim_models as gensimvis # Visualize the topics num_topics = 12 pyLDAvis.enable_notebook() LDAvis_data_filepath = os.path.join('/content/sample_data/ldavis_prepared_' + str(num_topics)) # # this is a bit time consuming - make the if statement True # # if you want to execute visualization prep yourself if 1 == 1: LDAvis_prepared = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary) with open(LDAvis_data_filepath, 'wb') as f: pickle.dump(LDAvis_prepared, f) # load the pre-prepared pyLDAvis data from disk with open(LDAvis_data_filepath, 'rb') as f: LDAvis_prepared = pickle.load(f) pyLDAvis.save_html( LDAvis_prepared, '/content/sample_data/ldavis_prepared_' + str(num_topics) + '.html') LDAvis_prepared """The 12 clusters mainly talk about the battery/heating/charging issues and phone's camera quality issues # coherence score (c_v metric) for number of topics = 12 """ # Compute Coherence Score using c_v
#######################################
#
# 20190207 Simple LDA demo build
#
# Step 04 - visualise the LDA model
#
#######################################

ldamodel.show_topics()

# or a formatted version
# The full option key is used: abbreviated keys are resolved by pattern
# matching and fail once they match more than one option.
pd.set_option('display.max_colwidth', 700)
num_words = 15
topic_list = ldamodel.show_topics(num_words=num_words, formatted=True)
df = pd.DataFrame(topic_list)
df

# Silences every warning from here on.
warnings.simplefilter('ignore')
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# sort_topics=False is passed through to pyLDAvis (presumably so topic
# numbers in the view match the model's own topic ids - confirm).
vis_data = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(vis_data)

#######################################
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Four-topic LDA trained with two worker processes and ten passes.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=4,
    id2word=dic,
    passes=10,
    workers=2,
)
lda_model.show_topics()

# %% [markdown]
# ### VISUALIZING RESULTS OF LDA

# %% [code] {"jupyter":{"outputs_hidden":false}}
import pyLDAvis
from pyLDAvis import gensim_models

# %% [code] {"jupyter":{"outputs_hidden":false}}
# fig = plt.figure(figsize = (20, 20))
pyLDAvis.enable_notebook()
vis = gensim_models.prepare(lda_model, bow_corpus, dic)
vis

# %% [code] {"jupyter":{"outputs_hidden":false}}
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)