def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    mds : str, optional (default='pcoa')
        2D Decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
    """
    if not isinstance(self.comp, LatentDirichletAllocation):
        raise ValueError('only support lda_topic_modelling()')

    import pyLDAvis
    import pyLDAvis.sklearn

    if notebook_mode:
        pyLDAvis.enable_notebook()

    prepared_vis_data = pyLDAvis.sklearn.prepare(
        self.comp, self._vectors, self.vectorizer, mds=mds
    )
    if notebook_mode:
        return prepared_vis_data
    else:
        pyLDAvis.show(prepared_vis_data)
def LDAvisualization(lda, X_train, vectorizer):
    # Creates an HTML document that graphically shows
    # the performance of an LDA model.
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda, X_train, vectorizer, mds='tsne')
    pyLDAvis.save_html(panel, "./ldavis_prepared.html")
def files_10():
    # Change number to select file
    count = 7
    data = pd.read_csv('file' + str(count) + '.csv')
    df = pd.DataFrame(data)
    for index, c in df.iterrows():
        preprocess(c['"QUESTION' + str(count) + '"'])
    print(result)

    text = 'Question ' + str(count)
    with open(text, "w") as result_file:
        result_file.write('')

    dictionary = gensim.corpora.Dictionary(result)
    bow_corpus = [dictionary.doc2bow(doc) for doc in result]
    bow_doc_x = bow_corpus[0]
    for i in range(len(bow_doc_x)):
        print("Word {} (\"{}\") appears {} time.".format(
            bow_doc_x[i][0], dictionary[bow_doc_x[i][0]], bow_doc_x[i][1]))

    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=6,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2,
                                           per_word_topics=True)
    for idx, topic in lda_model.print_topics(-1):
        with open(text, "a") as result_file:
            result_file.write("Topic: {} \nWords: {}".format(idx, topic) + "\n")

    vis = pyLDAvis.gensim.prepare(topic_model=lda_model,
                                  corpus=bow_corpus,
                                  dictionary=dictionary)
    pyLDAvis.enable_notebook()
    pyLDAvis.show(vis)
def visualize_LDA_model(self):
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(self.LDA_model, self.corpus, self.id2word)
    self.plot = vis
    return vis
def topic_visual(best_lda_model, data_vectorized, vectorizer):
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
    pyLDAvis.show(panel)
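A minimal usage sketch for topic_visual above, assuming scikit-learn and a pyLDAvis version that still ships pyLDAvis.sklearn (as the snippet uses); the toy documents and parameter values are illustrative placeholders, not part of the original code:

# Illustrative only: fit a small sklearn LDA model and hand it to topic_visual().
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["the cat sat on the mat", "dogs and cats are pets", "stocks fell on monday"]  # placeholder corpus
vectorizer = CountVectorizer(stop_words='english')
data_vectorized = vectorizer.fit_transform(docs)
best_lda_model = LatentDirichletAllocation(n_components=2, random_state=0).fit(data_vectorized)

topic_visual(best_lda_model, data_vectorized, vectorizer)  # serves the pyLDAvis panel in a browser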
def view_clusters(self):
    '''Build an LDA model from self.texts and display the topic clusters with pyLDAvis.'''
    if self.number_of_topics is None:
        print('Error: Number of topics not set.')
        print('Set number of topics with [object].set_number_of_topics(X)')
        return

    self.id2word = hf.create_id2word(self.texts)
    self.corpus = hf.create_corpus(self.id2word, self.texts)
    clusters = self.number_of_topics

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=self.corpus,
                                                id2word=self.id2word,
                                                num_topics=clusters,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Display clusters
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, self.corpus, self.id2word)
    pyLDAvis.display(vis)
    return vis
def visualize_lda_model():
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    my_stopwords = stopwords.words('english') + stopwords_verbs + stopwords_other
    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens
        if token.isalpha() and token.lower() not in my_stopwords and len(token) > 1
    ])
    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])
    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus, num_topics=num_topics,
                                id2word=dictionary_LDA,
                                passes=4, alpha=[0.01] * num_topics,
                                eta=[0.01] * len(dictionary_LDA.keys()))
    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
def visualize():
    # just for later
    import pyLDAvis
    import pyLDAvis.gensim

    vis = pyLDAvis.gensim.prepare(topic_model=lda_model,
                                  corpus=corpus,
                                  dictionary=dictionary_LDA)
    pyLDAvis.enable_notebook()
    pyLDAvis.display(vis)
def plot_pyLDAvis(self):
    import pyLDAvis.gensim
    import pickle
    import pyLDAvis

    # Visualize the topics
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_nounAdj,
                                              self.corpus_NounAdj,
                                              self.id2word_nounAdj)
    return LDAvis_prepared
def visLDAIPython(model, data, vectorizer, ip, port):
    pyLDAvis.enable_notebook()
    # https://github.com/bmabey/pyLDAvis/issues/69
    visData = pyLDAvis.sklearn.prepare(model, data, vectorizer, mds='mmds', sort_topics=False)
    pyLDAvis.show(visData, ip=ip, port=port)
def visualize_lda(model, corpus, dictionary):
    """Return the pyLDAvis PreparedData given model, corpus, dictionary.

    The result could be pickled to save it, and
    pyLDAvis.save_html(vis, "filename") also works to export it as HTML.
    """
    pyLDAvis.enable_notebook()
    t0 = time.time()
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    print('{} seconds'.format(time.time() - t0))
    return vis
def visualize_topics(model, corpus, id2word, cv):
    # Build a gensim Dictionary that mirrors the sklearn CountVectorizer vocabulary
    d = corpora.Dictionary()
    word2id = dict((k, v) for k, v in cv.vocabulary_.items())
    d.id2word = id2word
    d.token2id = word2id

    pyLDAvis.enable_notebook()
    visualization = pyLDAvis.gensim.prepare(model, corpus, d)
    return visualization
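A hypothetical end-to-end call of the visualize_topics variant above; it assumes the gensim LDA model was trained on an sklearn CountVectorizer matrix converted with gensim.matutils.Sparse2Corpus, and the toy documents are placeholders only:

from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models, matutils
import pyLDAvis
import pyLDAvis.gensim

docs = ["apples and oranges", "oranges are citrus fruit", "trains run on rails"]  # placeholder corpus
cv = CountVectorizer()
X = cv.fit_transform(docs)

# Convert the sklearn document-term matrix into a gensim corpus and id->word mapping
corpus = matutils.Sparse2Corpus(X, documents_columns=False)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
model = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, random_state=0)

vis = visualize_topics(model, corpus, id2word, cv)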
def visualise_lda_topics(lda_model, corpus, id2word):
    '''
    Visualizes the topics for Gensim's LDA implementation
    :param lda_model: trained gensim LDA model
    :param corpus: bag-of-words corpus the model was trained on
    :param id2word: gensim dictionary mapping ids to words
    :return: visualisation
    '''
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return vis
def visualize(ldamodel, doc_term_matrix, dictionary):
    import pyLDAvis
    import pyLDAvis.gensim
    try:
        pyLDAvis.enable_notebook()
    except:
        print('not in jupyter notebook')
    viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    pyLDAvis.save_html(viz, 'TM_viz50Com.html')
    return viz
def get_visualization(top_dir, nr_samples, nr_topics):
    saved_model = SavedLdaModel(top_dir, nr_samples, nr_topics)
    lda_model = saved_model.get_model()
    corpus = saved_model.get_corpus()
    id2word = saved_model.get_dict()

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return vis
def showPyLDAvisNB(allDict, numTopics=30):
    # TODO: see if we can get ngrams into pyLDAvis
    dataTuple = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    data = pyLDAvis.gensim.prepare(dataTuple[0], dataTuple[1], dataTuple[2])
    output_notebook()
    pyLDAvis.enable_notebook(True)
    p = pyLDAvis.display(data, template_type='general')
    plt.tight_layout()
    display(p)
    return
def LDA(doc_term_matrix):
    # Creating the object for LDA model using gensim library
    Lda = gensim.models.LdaMulticore

    # Running and training LDA model on the document term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=25, id2word=dictionary, passes=50, workers=4)
    ldamodel.save("ldamodel_sample")

    # Load a potentially pretrained model from disk.
    ldamodel = gensim.models.LdaMulticore.load("ldamodel_sample")
    pprint(ldamodel.print_topics(num_topics=15, num_words=5))

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis, fileobj='visuals.html')
def visualise_ldamallet_topics(dataset, alpha, num_topic):
    '''
    Extracts relevant information from ldamallet's LDA model and visualizes
    the topics with Gensim's LDA visualisation
    :return: visualisation
    '''
    ldamallet_dir = 'data/topic_models/basic/{}_alpha{}_{}/ldamallet'.format(
        dataset, alpha, num_topic)  # e.g. Semeval_alpha50_20
    convertedLDAmallet = convertLDAmallet(dataDir=ldamallet_dir,
                                          filename='state.mallet.gz')
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.prepare(**convertedLDAmallet)
    # pyLDAvis.display(vis)
    return vis
def visualize_model(model, corpus, id2word):
    """
    Parameters:
    - `model` a gensim LDA model
    - `corpus` the corpus on which the model was trained
    - `id2word` the dictionary on which the model was trained

    Returns: a pyLDAvis visualization
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim.prepare(model, corpus, id2word, mds='mmds')
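A minimal, illustrative call of visualize_model, assuming gensim is installed; the toy tokenized texts and parameter values are placeholders:

from gensim import corpora, models

texts = [["human", "computer", "interface"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]  # placeholder tokenized documents
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
model = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=5, random_state=0)

vis = visualize_model(model, corpus, id2word)  # PreparedData; renders inline in a notebook cell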
def plot_date_model(lda_model, date="2018-12-25"):
    df = result[result["created_at"] == date]
    df = df.dropna()
    xs = [df["cleaned_text"].iloc[i].split() for i in range(df.shape[0])]
    id2word = corpora.Dictionary(xs)
    texts = xs
    corpus = [id2word.doc2bow(text) for text in texts]
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return vis
def visualize():
    lda_model, corpus, data_lemmatized, dictionary = train()

    # Perplexity: a measure of how good the model is; lower is better.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Coherence score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    return vis
def showPyLDAvis(allDict, notebook=True, numTopics=30):
    # TODO: see if we can get ngrams into pyLDAvis
    dataTuple = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    data = pyLDAvis.gensim.prepare(dataTuple[0], dataTuple[1], dataTuple[2])
    if notebook:
        output_notebook()
        pyLDAvis.enable_notebook(True)
        p = pyLDAvis.display(data, template_type='general')
        display(p)
    else:
        output_file("pyDAVis.html")
        p = pyLDAvis.show(data)  # displays in own window combined with output_file
        show(p)
    return
def lda(df, n_topics=5, lda_str='all'):
    all_words = []
    for text in df['text']:
        all_words.append(text)

    # Create dictionary and corpus
    word2num = cp.Dictionary(all_words)
    texts = all_words

    # Get term frequency
    corpus = [word2num.doc2bow(text) for text in texts]

    lda_model = gs.models.LdaMulticore(corpus=corpus,
                                       id2word=word2num,
                                       num_topics=n_topics)
    doc_lda = lda_model[corpus]

    print('\nTopics')
    print(lda_model.print_topics())

    print('\nScores')
    for i in range(0, len(corpus), 500):
        for index, score in sorted(lda_model[corpus[i]], key=lambda tup: -1 * tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(
                score, lda_model.print_topic(index, 10)))

    # Prepare the pyLDAvis data and export it as HTML
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, word2num)
    pyLDAvis.save_html(LDAvis_prepared,
                       './html/{}_lda_n{}.html'.format(lda_str, n_topics))
def lda_vis(lda_model, corpus, dictionary):
    # visualize the topics and words
    import pyLDAvis
    import pyLDAvis.gensim  # don't skip this
    import matplotlib.pyplot as plt
    # %matplotlib inline

    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    return vis
def topicModeling(corpus, dictionary, texts):
    ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary, passes=5)
    x = ldamodel.show_topics()  # show generated topics

    # ----------------------------------------------------------
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
                break
    sent_topics_df.columns = [
        'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'
    ]

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    # -------Generate Visualization------------------------------
    pyLDAvis.enable_notebook()
    topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(
        topicModel,
        '/Users/[email protected]/Documents/projects/PEM/elon.html')
    pyLDAvis.show(topicModel)
    return x, sent_topics_df
def visualize(self):
    import pyLDAvis
    import pyLDAvis.gensim
    try:
        pyLDAvis.enable_notebook()
    except:
        print('not in jupyter notebook')
    start = time()
    self.viz = pyLDAvis.gensim.prepare(self.ldamodel, self.doc_term_matrix, self.dictionary)
    print('used: {:.2f}s'.format(time() - start))
    print('saving viz to ' + self.name + '_viz.html')
    pyLDAvis.save_html(self.viz, self.name + '_viz.html')
    return self.viz
def visualize_topics(id2word, corpus, lda_model, path='./', num_topics=10):
    print('Creating visualization html at {}'.format(path))

    # Visualizing topics
    pyLDAvis.enable_notebook()
    LDAvis_data_filepath = os.path.join('{}ldavis_prepared_{}topics'.format(
        path, str(num_topics)))

    # This is a bit time consuming
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    # Load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(
        LDAvis_prepared,
        '{}ldavis_prepared_{}topics{}'.format(path, str(num_topics), '.html'))
    return LDAvis_prepared
def main():
    # Training data preprocessing
    docs, asin_list, test_docs = read_content('../data/product_description_complete.tsv')
    docs = tokenize(docs)
    docs = lemmatize(docs)
    docs = compute_bigrams(docs)
    dictionary = remove_rare_common_words(docs)
    corpus = vectorize(dictionary, docs)

    # Train model
    (model, id2word) = train(dictionary, corpus, 17, docs)

    # Print topics
    for i in range(17):
        topics = model.show_topic(i)
        print(i, [topic[0] for topic in topics])

    # Testing data preprocessing
    test_docs = tokenize(test_docs)
    test_docs = lemmatize(test_docs)
    test_docs = compute_bigrams(test_docs)
    test_dictionary = remove_rare_common_words(test_docs)
    test_corpus = vectorize(test_dictionary, test_docs)

    # Write predicted results
    i = 0
    with open('../results/product_description_complete.tsv', 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(["asin", "topic_distribution"])
        for c in test_corpus:
            writer.writerow([asin_list[i], model[c]])
            i += 1

    # Visualize the topics (the following code can only be run in a notebook)
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    return LDAvis_prepared


if __name__ == '__main__':
    main()
def build_topic_model_dict(text_array):
    global dictionary
    dictionary = corpora.Dictionary(text_array)

    global doc_term_matrix
    doc_term_matrix = [dictionary.doc2bow(rev) for rev in text_array]

    # Creating the object for LDA model using gensim library
    LDA = gensim.models.ldamodel.LdaModel

    # Build the model
    global lda_model
    lda_model = LDA(corpus=doc_term_matrix,
                    id2word=dictionary,
                    num_topics=7,
                    random_state=100,
                    chunksize=1000,
                    passes=50)

    # Print topics
    # lda_model.print_topics()

    pyLDAvis.enable_notebook()
def visualize_topics(self, notebook_mode=False, mds='pcoa'):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    mds : str, optional (default='pcoa')
        2D Decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
    """
    if not isinstance(mds, str):
        raise ValueError('mds must be a string')
    if not isinstance(notebook_mode, bool):
        raise ValueError('notebook_mode must be a boolean')

    try:
        import pyLDAvis
        import pyLDAvis.sklearn
    except:
        raise Exception(
            'pyldavis not installed. Please install it and try again.')

    if notebook_mode:
        pyLDAvis.enable_notebook()

    vis_data = _prepare_topics(
        self._doc_embed,
        self._topic_embed,
        self._word_embed,
        np.array(self._features),
        doc_lengths=self._doc_len,
        term_frequency=self._freqs,
        normalize=True,
    )
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    if notebook_mode:
        return prepared_vis_data
    else:
        pyLDAvis.show(prepared_vis_data)
ntvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95,
                       ngram_range=(1, 1), sublinear_tf=True)
ntvf_features = ntvf.fit_transform(negative_reviews)

# view feature set dimensions
print(ptvf_features.shape, ntvf_features.shape)


# # Topic Modeling on Reviews

# In[4]:

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

pyLDAvis.enable_notebook()
total_topics = 10


# ## Display and visualize topics for positive reviews

# In[5]:

# build topic model on positive sentiment review features
pos_nmf = NMF(n_components=total_topics, random_state=42,
              alpha=0.1, l1_ratio=0.2)
pos_nmf.fit(ptvf_features)

# extract features and component weights
pos_feature_names = ptvf.get_feature_names()
pos_weights = pos_nmf.components_

# extract and display topics and their components