def visualize():
    """Prepare and return the pyLDAvis view of the module-level LDA model.

    Reads the module-level names ``lda_model``, ``corpus`` and
    ``dictionary_LDA``.

    Returns
    -------
    The object produced by ``pyLDAvis.display``. It is returned instead of
    being discarded so that a notebook cell ending in ``visualize()``
    actually renders the visualisation.
    """
    # Imported lazily: this helper is only meant for later, optional use.
    import pyLDAvis
    import pyLDAvis.gensim

    vis = pyLDAvis.gensim.prepare(
        topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA
    )
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(vis)
def view_clusters(self):
    """Fit an LDA model on ``self.texts`` and prepare its pyLDAvis view.

    When ``self.number_of_topics`` is ``None`` two hint messages are printed
    and ``None`` is returned. Otherwise ``self.id2word`` and ``self.corpus``
    are rebuilt through the ``hf`` helpers, an LDA model with
    ``self.number_of_topics`` topics is trained, and the prepared pyLDAvis
    data is returned.
    """
    if self.number_of_topics is None:
        for hint in (
            'Error: Number of topics not set.',
            'Set number of topics with [object].set_number_of_topics(X)',
        ):
            print(hint)
        return

    self.id2word = hf.create_id2word(self.texts)
    self.corpus = hf.create_corpus(self.id2word, self.texts)

    # Build the LDA model.
    lda_settings = dict(
        corpus=self.corpus,
        id2word=self.id2word,
        num_topics=self.number_of_topics,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    topic_model = gensim.models.ldamodel.LdaModel(**lda_settings)

    # Prepare the cluster view.
    pyLDAvis.enable_notebook()
    prepared = pyLDAvis.gensim.prepare(topic_model, self.corpus, self.id2word)
    pyLDAvis.display(prepared)
    return prepared
def visualize(self, mds='pcoa'):
    """Prepare the pyLDAvis view of this object's LDA model.

    Overview notebook:
    https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb
    Paper: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

    Parameters
    ----------
    mds : str
        Scaling method, forwarded unchanged to ``pyLDAvis.gensim.prepare``.
        NOTE(review): pyLDAvis documents 'pcoa', 'mmds' and 'tsne' as the
        accepted values; confirm against the installed version.

    Returns
    -------
    The prepared data returned by ``pyLDAvis.gensim.prepare`` for
    ``self.model``, ``self.corpus`` and ``self.dictionary``.
    """
    import pyLDAvis
    import pyLDAvis.gensim

    reminder = (
        "Make sure you have pyLDAviz imported in the notebook:\n\n"
        "import pyLDAvis\n"
        "pyLDAvis.enable_notebook()\n"
    )
    print(reminder)

    prepared = pyLDAvis.gensim.prepare(
        self.model, self.corpus, self.dictionary, mds=mds
    )
    pyLDAvis.display(prepared)
    return prepared
def pylda_visualize(csv_chemin, ecriture_chemin, tfidf_visualization = False, num_topic=3, filter_by_cluster=None):
    """Fit an LDA model on clustered texts and save its pyLDAvis view.

    Parameters
    ----------
    csv_chemin : str
        Path of a CSV file with a ``pred_cluster`` column and a ``text``
        column.
    ecriture_chemin : str
        Output path prefix. The files ``<prefix>.mm``, ``<prefix>.dict``,
        ``<prefix>.model`` and ``<prefix>.html`` are written.
    tfidf_visualization : bool
        When true, the model is trained on TF-IDF weights
        ``(word_id, weight)`` instead of raw counts ``(word_id, count)``.
    num_topic : int
        Number of topics to extract. It is now honoured in the TF-IDF branch
        as well, which previously hard-coded 15.
    filter_by_cluster : optional
        Cluster index; when given, only rows of that cluster are used.

    Returns
    -------
    The object returned by ``pyLDAvis.display`` for the prepared data.
    """
    clustering_result_df = pd.read_csv(csv_chemin)
    # `is not None` so that cluster 0 can be selected, and the filtered frame
    # is assigned back (the original computed it and threw it away).
    if filter_by_cluster is not None:
        clustering_result_df = clustering_result_df[
            clustering_result_df['pred_cluster'] == filter_by_cluster
        ]

    cleaned_texts = list(clustering_result_df['text'].apply(clean))
    docs = pd.DataFrame(list(map(load_doc, enumerate(cleaned_texts))))

    # dictionary: word_id -> word; corpus[i]: list of (word_id, count) pairs.
    dictionary, corpus = prep_corpus(docs['tokens'])

    if tfidf_visualization:
        # The TF-IDF model already yields (word_id, weight) pairs per
        # document, so no positional re-indexing of the vector is needed.
        tfidf = TfidfModel(corpus)
        model_corpus = [tfidf[document] for document in corpus]
    else:
        model_corpus = corpus

    # The serialized corpus is always the raw bag-of-words corpus.
    MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
    dictionary.save(ecriture_chemin + '.dict')

    lda = models.ldamodel.LdaModel(corpus=model_corpus, id2word=dictionary,
                                   num_topics=num_topic, passes=10)
    lda.save(ecriture_chemin + '.model')

    vis_data = gensimvis.prepare(lda, model_corpus, dictionary)
    pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
    return pyLDAvis.display(vis_data)
def display_data(self):
    """Load the saved LDA artefacts, dump the prepared data and display it.

    Loads the model, the bag-of-words corpus and the dictionary from the
    paths stored on ``self``, prepares the pyLDAvis data, and writes its
    string form to ``self.LDAvis_data_filepath``.

    Returns
    -------
    The object returned by ``pyLDAvis.display`` for the prepared data.
    """
    lda = LdaMulticore.load(self.lda_model_filepath)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(self.LDAvis_data_filepath, 'w') as f:
        f.write(str(LDAvis_prepared))

    # The prepared data is displayed directly. The original re-opened the
    # dump and handed the file handle itself to pyLDAvis.display, which is
    # not a prepared-data object.
    return pyLDAvis.display(LDAvis_prepared)
def ldavis_create(lda, corpus, gensim_dict, LDAvis_data_filepath=fpathroot + fpathappend + '_lda_vis', return_ldavis=False):
    """Prepare pyLDAvis data for a gensim LDA model and pickle it to disk.

    Parameters
    ----------
    lda, corpus, gensim_dict
        Model, corpus and dictionary passed to ``pyLDAvis.gensim.prepare``.
    LDAvis_data_filepath : str
        Destination of the pickled prepared data.
    return_ldavis : bool
        When true the prepared data is returned; otherwise the result of
        ``pyLDAvis.display`` is returned.
    """
    # A (model, corpus, dictionary) triple needs the gensim adapter; the
    # generic pyLDAvis.prepare expects topic/term distributions instead.
    import pyLDAvis.gensim

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, corpus, gensim_dict)

    # pickle writes bytes, so the file must be opened in binary mode.
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    if return_ldavis:
        return LDAvis_prepared
    return pyLDAvis.display(LDAvis_prepared)
def visualise(model_file, corpus_file, dictionary_file):
    """Load a saved corpus, dictionary and LDA model and display them.

    Parameters
    ----------
    model_file, corpus_file, dictionary_file : str
        Paths of the saved LdaMulticore model, Matrix Market corpus and
        gensim dictionary.

    Returns
    -------
    The object returned by ``pyLDAvis.display``. It is returned so that a
    notebook caller can render it; the original discarded it.
    """
    # Use the notebook version if this does not work.
    print('Loading corpus from ' + corpus_file)
    corpus = MmCorpus(corpus_file)
    print('Loading dictionary from ' + dictionary_file)
    dictionary = Dictionary.load(dictionary_file)
    print('Loading model from ' + model_file)
    model = models.ldamulticore.LdaMulticore.load(model_file)

    vis_data = gensimvis.prepare(model, corpus, dictionary)
    rendered = pyLDAvis.display(vis_data)
    print('Please use Jupyter notebook visualise.ipynb if not working')
    return rendered
def topicmodel_forproyect(id_proyect):
    """Train a 5-topic LDA model on one project's comments.

    Parameters
    ----------
    id_proyect
        Project identifier passed to ``get_data``.

    Returns
    -------
    str
        The pyLDAvis HTML for the model, or a Spanish error message when
        building the model raises ``ValueError``.
    """
    num_topics = 5
    df_comments = get_data(id_proyect)

    bodies = df_comments.body.str.lower()
    # regex=True is required: recent pandas treats the pattern as a literal
    # string by default, which would leave the @mentions in place.
    bodies = bodies.str.replace(r"@([A-Za-z0-9_]+)", '', regex=True)
    elements = np.array(bodies.tolist())

    tokenizer = RegexpTokenizer(r'\w+')
    es_stop = set(get_stop_words('es'))  # set: constant-time membership

    texts = []
    print(str(id_proyect))
    for comment in elements:
        tokens = tokenizer.tokenize(comment.lower())
        # Drop stop words; stemming is intentionally not applied.
        texts.append([token for token in tokens if token not in es_stop])
        print(comment)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    except ValueError:
        return "Coleccion Vacia. Aparentemente parametros faltantes o mal ingresados."

    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    # The HTML string is the result; no notebook display object is built.
    return pyLDAvis.prepared_data_to_html(vis_data)
def visualize_lda_model():
    """Train a 20-topic LDA model on the lemmatized data and display it.

    Tokens are flattened from ``tokens_sentences_lemmatized``, lower-cased,
    stripped of stop words, non-alphabetic tokens and one-character tokens,
    then merged into bigrams/trigrams before the dictionary, corpus and
    model are built.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    data = preprocess_to_lemmatization()

    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    # A set keeps the per-token membership test constant-time.
    my_stopwords = set(
        stopwords.words('english') + stopwords_verbs + stopwords_other)

    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens if token.isalpha()
        and token.lower() not in my_stopwords and len(token) > 1
    ])

    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus, num_topics=num_topics,
                                id2word=dictionary_LDA,
                                passes=4, alpha=[0.01] * num_topics,
                                eta=[0.01] * len(dictionary_LDA.keys()))

    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
def visual_lda():
    """Display the saved LDA model against the ad-issue review corpus.

    Loads the model from ``../model/lda.model`` and the reviews from
    ``../result/ad_issue_reviews``, drops English stop words and tokens that
    occur only once, and rebuilds the dictionary and bag-of-words corpus.

    Returns
    -------
    The object returned by ``pyLDAvis.display``. It is returned so that a
    notebook caller can render it; the original discarded it.
    """
    from collections import Counter

    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    lda = LdaMulticore.load("../model/lda.model")
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)

    # Build the stop-word set once. The original re-read the stop-word list
    # for every single word of every review.
    english_stopwords = set(stopwords.words('english'))
    reviews = [[word for word in review if word not in english_stopwords]
               for review in reviews]

    # Keep only tokens that occur more than once across all reviews.
    freq = Counter(token for review in reviews for token in review)
    reviews = [[token for token in review if freq[token] > 1]
               for review in reviews]

    dictionary = corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(review) for review in reviews]

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    return pyLDAvis.display(vis_data)
def topicmodel_allcoments():
    """Train a 5-topic LDA model on all comments and return it as JSON.

    URLs and @mentions are stripped from the comment bodies, which are then
    lower-cased, tokenized and filtered against Spanish stop words.

    Returns
    -------
    str
        JSON produced by the prepared data's own ``to_json()``.
    """
    df_comments = get_data()

    # regex=True is required: recent pandas treats the pattern as a literal
    # string by default, which would leave URLs and mentions untouched.
    df_comments['body'] = df_comments['body'].str.replace(
        r"http\S+", '', regex=True)
    bodies = df_comments.body.str.lower()
    bodies = bodies.str.replace(r"@([A-Za-z0-9_]+)", '', regex=True)
    elements = np.array(bodies.tolist())

    tokenizer = RegexpTokenizer(r'\w+')
    es_stop = set(get_stop_words('es'))  # set: constant-time membership

    texts = []
    for comment in elements:
        tokens = tokenizer.tokenize(comment.lower())
        # Drop stop words; stemming is intentionally not applied.
        texts.append([token for token in tokens if token not in es_stop])

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=5,
                                               id2word=dictionary,
                                               passes=20)

    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    # Serialise through the prepared data itself; a plain json.dumps cannot
    # encode the prepared-data object.
    return vis_data.to_json()
def display(self):
    """Return the pyLDAvis topic view of the TF-IDF LDA model as HTML text.

    The view is prepared from ``self.lda_model_tfidf``,
    ``self.tf_idf_corpus`` and ``self.dictionary``.

    :return: the ``data`` attribute of the HTML object produced by
        ``pyLDAvis.display``.
    """
    prepared = pyLDAvis.gensim.prepare(
        topic_model=self.lda_model_tfidf,
        corpus=self.tf_idf_corpus,
        dictionary=self.dictionary,
    )
    from IPython.core.display import HTML

    rendered: HTML = pyLDAvis.display(prepared)
    return rendered.data
def vectorize(self):
    """Build a sentence-level LDA model of the document and visualise it.

    Each sentence of ``self.raw`` is one document. Stop words and tokens
    that occur only once are removed before the gensim dictionary, corpus
    and 10-topic LDA model are built. The pyLDAvis data is written to
    ``topic_models/<name>.json`` and ``topic_models/<name>.html`` and then
    served with ``pyLDAvis.show``.

    Takes no arguments and returns ``None``.
    """
    # Tokenize into sentences and remove stop words.
    sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip())
    texts = [[word for word in sentence.lower().split()
              if word not in self.stopwords]
             for sentence in sentences]

    # Compute the frequency of each token.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Remove words that appear only once.
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    # Construct a gensim dictionary and bag-of-words corpus.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=10,  # TODO: the right number of topics is still open
        update_every=1,
        chunksize=10000,
        passes=1,
    )

    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

    # Write mode, not append: appending a second run to an existing file
    # produces an invalid JSON/HTML document.
    with open('topic_models/' + self.name + '.json', 'w') as topic_json:
        pyLDAvis.save_json(vis_data, topic_json)
    with open('topic_models/' + self.name + '.html', 'w') as topic_html:
        pyLDAvis.save_html(vis_data, topic_html)

    # Serve last: show() blocks, so the files are saved before it starts.
    pyLDAvis.show(vis_data)
def show_topics(corpus):
    """Train a 4-topic LDA model on ``corpus`` and return its pyLDAvis view.

    Parameters
    ----------
    corpus : list
        Documents as accepted by ``gensim.corpora.Dictionary`` and
        ``doc2bow``.

    Returns
    -------
    The object returned by ``pyLDAvis.display``. It is returned so that a
    notebook cell ending in ``show_topics(...)`` renders it; the original
    discarded it.
    """
    dic = gensim.corpora.Dictionary(corpus)
    bow_corpus = [dic.doc2bow(doc) for doc in corpus]
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=4,
                                           id2word=dic,
                                           passes=10,
                                           workers=2)
    lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_vis)
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """Prepare and display the pyLDAvis view of an lda2vec model.

    Parameters
    ----------
    data_path : str
        Directory that contains ``doc_lengths.npy``.
    model : Lda2Vec
        Loaded model; its session is used to evaluate the document, topic
        and word embeddings.
    idx_to_word : dict
        Index-to-word mapping. Its values are used as the vocabulary in the
        dictionary's iteration order.
        NOTE(review): presumably keys run 0..vocab_size-1 in insertion
        order; confirm against the caller.
    freqs : list
        Frequency of each token.
    vocab_size : int
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    The object returned by ``pyLDAvis.display``. It is returned so that a
    notebook caller can render it; the original discarded it.
    """
    doc_embed = model.sesh.run(model.mixture.doc_embedding)
    topic_embed = model.sesh.run(model.mixture.topic_embedding)
    word_embed = model.sesh.run(model.w_embed.embedding)

    vocabulary = list(idx_to_word.values())

    # Read in document lengths.
    doc_lengths = np.load(data_path + "/doc_lengths.npy")

    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    return pyLDAvis.display(prepared_vis_data)
def showPyLDAvisNB(allDict, numTopics=30):
    """Render the pyLDAvis view for ``allDict`` inside a notebook.

    The first three items returned by ``preparePyLDAvisData`` are passed, in
    order, to ``pyLDAvis.gensim.prepare``. Returns ``None``.
    """
    # TODO: see if we can get ngrams into pyLDAvis
    prepared_inputs = preparePyLDAvisData(allDict, limit=None,
                                          numTopics=numTopics)
    first_arg = prepared_inputs[0]
    second_arg = prepared_inputs[1]
    third_arg = prepared_inputs[2]
    vis_data = pyLDAvis.gensim.prepare(first_arg, second_arg, third_arg)

    output_notebook()
    pyLDAvis.enable_notebook(True)
    rendered = pyLDAvis.display(vis_data, template_type='general')
    plt.tight_layout()
    display(rendered)
def textTopicmodel(n_topics=2):
    """Fit a scikit-learn LDA model on the segmented text and serve it.

    Parameters
    ----------
    n_topics : int
        Number of LDA components.

    The document-topic matrix and the topic-term matrix are printed, then
    the pyLDAvis view is served with ``pyLDAvis.show``. Returns ``None``.
    """
    # Keep only segments whose string form has at least two characters.
    # Every entry is already a str, so it is used as a document as-is.
    corpus = [str(w) for w in segWord() if len(str(w)) >= 2]

    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=1,
                                    max_features=1500,
                                    stop_words=None)
    tf = tf_vectorizer.fit_transform(corpus)
    # The vocabulary list was fetched with get_feature_names() but never
    # used; the call is dropped because recent scikit-learn removed it.

    lda = LatentDirichletAllocation(n_components=n_topics,
                                    learning_offset=50,
                                    random_state=0)
    docres = lda.fit_transform(tf)
    print('============================')
    print(docres)
    print('==========================')
    print(lda.components_)

    visualisation = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.show(visualisation)
def evaluate_pyldavis(self, model=None, use_jupyter=None):
    """Open the pyLDAvis panel for a visual check of an LDA topic model.

    :param model: LDA model to evaluate. When ``None``, ``self.lda_model``
        is used. An ``LdaMallet`` model is converted with
        ``malletmodel2ldamodel`` first.
    :param use_jupyter: how the panel is shown. ``None`` inspects the ``_``
        environment variable to detect ``jupyter-notebook``; a truthy value
        forces notebook display; a falsy value uses ``pyLDAvis.show``.
    :raises Exception: when no model is given and ``self.lda_model`` is
        ``None``.
    :return: None
    """
    if model is None:
        model = self.lda_model
        if model is None:
            raise Exception(
                "Please create a LDA model for evaluation before running this method."
            )

    if isinstance(model, LdaMallet):
        model = malletmodel2ldamodel(model)

    panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)

    if use_jupyter is None:
        # Guess from the name of the program that launched the interpreter.
        try:
            launcher = os.environ['_'].rsplit("/", 1)[-1]
        except KeyError:
            in_notebook = False
        else:
            in_notebook = launcher == "jupyter-notebook"
    else:
        in_notebook = bool(use_jupyter)

    if in_notebook:
        pyLDAvis.enable_notebook()
        pyLDAvis.display(panel)
    else:
        pyLDAvis.show(panel)
def visuzalization(ldamodel, corpus, dictionary, num_words):
    """Plot one word cloud per topic and return the pyLDAvis display object.

    The topic keys come from ``topic_items(ldamodel, 15)``. For each key a
    word cloud of the topic's ``num_words`` top terms is drawn in its own
    figure, titled with the key plus one.
    """
    prepared = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    legend = topic_items(ldamodel, 15)

    for topic_id, _ in legend.items():
        topic_terms = ldamodel.show_topic(topic_id, num_words)
        cloud = WordCloud(background_color="white").fit_words(topic_terms)

        plt.figure()
        plt.imshow(cloud)
        plt.axis("off")
        plt.title("Topic #" + str(topic_id + 1))
        plt.show()

    return pyLDAvis.display(prepared)
def showPyLDAvis(allDict, notebook=True, numTopics=30):
    """Show the pyLDAvis view for ``allDict`` in a notebook or via a server.

    With ``notebook == True`` the view is displayed inline. Otherwise the
    output file ``pyDAVis.html`` is registered, the view is served with
    ``pyLDAvis.show`` and that call's result is passed to ``show``.
    Returns ``None``.
    """
    # TODO: see if we can get ngrams into pyLDAvis
    inputs = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    vis_data = pyLDAvis.gensim.prepare(inputs[0], inputs[1], inputs[2])

    if notebook == True:  # equality test kept as in the original
        output_notebook()
        pyLDAvis.enable_notebook(True)
        rendered = pyLDAvis.display(vis_data, template_type='general')
        display(rendered)
        return

    output_file("pyDAVis.html")
    # Displays in its own window, combined with output_file.
    # NOTE(review): confirm pyLDAvis.show returns something show() accepts.
    served = pyLDAvis.show(vis_data)
    show(served)
def py_lda_vis(column, lib, lda_models, dtm=None, vectorizer=None, corpus=None, dictionary=None):
    """Display the pyLDAvis view of the LDA model stored under ``column``.

    ``lib == 'sklearn'`` uses ``dtm[column]`` and ``vectorizer[column]``;
    any other value uses the gensim adapter with ``corpus[column]`` and
    ``dictionary[column]``. Topics are not re-sorted in either case.
    """
    topic_model = lda_models[column]

    if lib != 'sklearn':
        vis_data = pyLDAvis.gensim.prepare(
            topic_model, corpus[column], dictionary[column],
            sort_topics=False)
    else:
        vis_data = pyLDAvis.sklearn.prepare(
            topic_model, np.asmatrix(dtm[column]), vectorizer[column],
            sort_topics=False)

    rendered = pyLDAvis.display(vis_data)
    display(rendered)
def fit(
    self,
    num_topics,
    alpha="symmetric",
    beta=None,
    passes=2,
    random_state=9,
    tuning=False,
    predict_training_samples=False,
):
    """Train the LDA model on ``self.bow_corpus`` and score it.

    Parameters
    ----------
    num_topics, alpha, passes, random_state
        Forwarded to the gensim ``LdaModel`` constructor.
    beta
        Forwarded to ``LdaModel`` as ``eta``.
    tuning : bool
        When false, the pyLDAvis view is built and stored in
        ``self.visualize_topics_``; when true, a hint string is stored
        instead.
    predict_training_samples : bool
        When true, ``self.predict(self.texts, True)`` is run and its two
        results are stored on ``self``.

    Returns
    -------
    The coherence score, also stored in ``self.coherence_score_``.
    """
    self.model = models.ldamodel.LdaModel(
        self.bow_corpus,
        num_topics=num_topics,
        alpha=alpha,
        eta=beta,
        id2word=self.dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Perplexity score (the lower the better).
    self.perplexity_score_ = self.model.log_perplexity(self.bow_corpus)

    # Coherence score (the higher the better).
    self.coherence_score_, self.coherence_score_per_topic_ = self.score(
        self.docs, return_per_topic=True)

    if not tuning:
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(self.model, self.bow_corpus,
                                      self.dictionary)
        self.visualize_topics_ = pyLDAvis.display(vis)
    else:
        self.visualize_topics_ = 'Set tuning parameter in fit function to "False" to visualize LDA result!'

    # This must run before the return. The original placed it after the
    # return statement, so it could never execute.
    if predict_training_samples:
        (
            self.training_samples_predict_proba_,
            self.training_samples_prediction_,
        ) = self.predict(self.texts, True)

    return self.coherence_score_
def visualize_lda_to_html(
        self,
        target_topic_num,
        top_n=10,
        r_normalized=False,
        relevence_lambda_val=.6,
        workers_n=2,
        random_seed=1,
        savepath='./',
        filename_affix='lda',
        # save_type='html',  # {'html', 'json'}
        save_relevent_terms_ok=True,
        save_html_ok=True,
        display_ok=False,
        ):
    """
    Select a fitted LDA model and export its `pyLDAvis` view and term table.

    The model stored under `target_topic_num` in `self.lda_model_dict` is
    passed to `self._get_terminfo_table`, and the five results are kept on
    the instance (`vis_prepared`, `total_terms_df`, `top_relevant_terms_df`,
    `r_adj_score_df`, `bow_score_list`).

    Parameters
    ----------
    target_topic_num: int
        Key of the LDA model in `self.lda_model_dict`.

    top_n: int (default: `10`)
        Number of most relevant terms per topic.

    r_normalized: bool (default: `False`)
        Forwarded to `_get_terminfo_table`; use normalized probabilities
        when `True` (not recommended in most cases).

    relevence_lambda_val: float (default: `.6`)
        Lambda (ratio) used to compute relevance.

    workers_n: int (default: `2`)
        Number of CPU cores forwarded to `_get_terminfo_table`.

    random_seed: int (default: `1`)
        Random seed forwarded to `_get_terminfo_table`.

    savepath: str (default: `'./'`)
        Directory for the saved files; created when missing.

    filename_affix: str (default: `'lda'`)
        Prefix of the saved file names.

    save_relevent_terms_ok: bool (default: `True`)
        Save `top_relevant_terms_df` as CSV.

    save_html_ok: bool (default: `True`)
        Save the `pyLDAvis` view as HTML.

    display_ok: bool (default: `False`)
        Call `pyLDAvis.display` on the prepared data.

    Raises
    ------
    KeyError
        When `target_topic_num` is not a key of `self.lda_model_dict`.

    References
    ----------
    Saliency:
    `Chuang, J., 2012. Termite: Visualization techniques for assessing textual topic models`

    Relevance:
    `Sievert, C., 2014. LDAvis: A method for visualizing and interpreting topics`

    Example
    -------
    >>> import unipy_nlp.analyze.topic_modeling as utpm
    >>> tpm = utpm.TopicModeler(sentence_list, tokenized)
    >>> tpm.pick_best_lda_topics(
    ...     num_topic=5,
    ...     workers_n=8,
    ...     random_seed=1,
    ... )
    >>> tpm.visualize_lda_to_html(
    ...     7,
    ...     top_n=10,
    ...     savepath='data/_tmp_dump/topic_modeling',
    ...     filename_affix='lda',
    ... )

    """
    if target_topic_num not in self.lda_model_dict.keys():
        raise KeyError("Model doesn't exist. Select a proper number.")

    self.selected_topic_num = target_topic_num
    self.selected_model = self.lda_model_dict[target_topic_num]['model']

    terminfo = self._get_terminfo_table(
        self.selected_model,
        corpus=self.bow_corpus_doc,
        dictionary=self.corpora_dict,
        doc_topic_dists=None,
        use_gensim_prepared=True,
        top_n=top_n,
        r_normalized=r_normalized,
        relevence_lambda_val=relevence_lambda_val,
        workers_n=workers_n,
        random_seed=random_seed,
    )
    (self.vis_prepared,
     self.total_terms_df,
     self.top_relevant_terms_df,
     self.r_adj_score_df,
     self.bow_score_list) = terminfo

    # Both exports write below `savepath`; make sure it exists first.
    if save_html_ok or save_relevent_terms_ok:
        os.makedirs(savepath, exist_ok=True)

    if save_html_ok:
        html_name = f'{filename_affix}_topics-{target_topic_num}.html'
        ldavis_filename_html_str = os.path.join(savepath, html_name)
        pyLDAvis.save_html(self.vis_prepared, ldavis_filename_html_str)
        print(f"LDAVIS HTML Saved: '{ldavis_filename_html_str}'")

    if save_relevent_terms_ok:
        csv_name = (
            f'{filename_affix}_topics-{target_topic_num}'
            f'_top{top_n}_relevent_terms_df.csv'
        )
        ldavis_filename_rdf_str = os.path.join(savepath, csv_name)
        self.top_relevant_terms_df.to_csv(
            ldavis_filename_rdf_str,
            index=True,
            header=True,
            encoding='utf-8',
        )
        print(f"LDAVIS DF Saved: '{ldavis_filename_rdf_str}'")

    if display_ok:
        pyLDAvis.display(self.vis_prepared, local=False)
def pyLDAvisData(lda, num_topics, len_vocab, corpus, text, dictionary_tokens): data = {'topic_term_dists':topic_term_dists(lda,num_topics,len_vocab), 'doc_topic_dists': doc_topic_dists(corpus, lda), 'doc_lengths': doc_lengths(text), 'vocab': get_vocabularyAlpha(dictionary_tokens), 'term_frequency':get_term_frequency(corpus) } return data # 1 - PyLDAvis import pyLDAvis data = pyLDAvisData(lda, 5, len(dictionary.token2id), corpus, texts, dictionary.token2id) topics_model_data = data topics_vis_data = pyLDAvis.prepare(**topics_model_data) pyLDAvis.display(topics_vis_data) # 2 - Tendance des topics import matplotlib.pyplot as plt from collections import Counter %matplotlib inline def get_topic_apperences_year_month(data, info): dict_topics = {} for i in range(0,len(data)): idt = data[i].index(max(data[i])) dict_topics.setdefault(idt, []).append(info[i][0][7:14]) return dict_topics def get_topic_apperences_year(data, info):
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim

warnings.simplefilter('ignore')

# Tokenize every business's review text (bag-of-words input).
total_review_text = pd.DataFrame(
    list(business_reviews.items()),
    columns=['business_id', 'review'],
).review.apply(tokenize_text)

# Dictionary of all words seen in the reviews.
dictionary = corpora.Dictionary(total_review_text)

# Term frequencies of each document.
corpus = [dictionary.doc2bow(review) for review in total_review_text]

# LDA model with 4 topics, so that the topics can be compared with the
# 4 word clouds produced earlier.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    passes=10,
)

print('The words and scores defining each topic are:')
lda_model.print_topics(num_topics=4, num_words=8)


# In[23]:


vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, corpus=corpus, dictionary=dictionary
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)


# After using the LDA algorithm to find 4 large topics, it can be observed
# that the topics do indeed share a number of words with the wordclouds
# (which were created using Louvain for partitioning and TF-IDF for scoring).
#
# For example, topic 4 presented here clearly shows words related to food and
# dessert, such as '_salad_', '_steak_' or '_buffet_', which is very similar
# to the words shown in the wordcloud for community 2.
startrow=0) print('LDA_result_pos 成功输出!\n') # 负面主题分析 neg_dict = corpora.Dictionary(neg) neg_corpus = [neg_dict.doc2bow(i) for i in neg] neg_lda = models.LdaModel(neg_corpus, num_topics=10, id2word=neg_dict, passes=10) for i in range(10): print('neg_topic' + ' ' + str(i + 1) + ' : ') print(neg_lda.print_topic(i)) LDA_result_neg = neg_lda.print_topics(num_topics=10, num_words=10) df_neg = pd.DataFrame(data=LDA_result_neg) df_neg.to_excel('LDA_result_neg.xlsx') print('LDA_result_neg 成功输出!\n') # =================主题聚类可视化================== data2 = pyLDAvis.gensim.prepare(pos_lda, pos_corpus, pos_dict) print('以下是正面可视化参数\n') print(data2) pyLDAvis.save_html(data2, 'postopic.html') pyLDAvis.display(data2) pyLDAvis.show(data2, open_browser=True) # data1 = pyLDAvis.gensim.prepare(neg_lda, neg_corpus, neg_dict) # print('以下是负面可视化参数\n') # print(data1) # pyLDAvis.save_html(data1, 'negtopic.html') # pyLDAvis.display(data1) # pyLDAvis.show(data1, open_browser=True)
from gensim import corpora, models
import pyLDAvis.gensim
import pyLDAvis

# Load the saved dictionary, corpus and 50-topic model.
dic = corpora.Dictionary.load('data/model/newsgroups.dict')
corp = corpora.MmCorpus('data/model/newsgroups.mm')
lda = models.ldamodel.LdaModel.load('data/model/newsgroups_50.model')

# Prepare the data for the visualization.
newsgroup_data = pyLDAvis.gensim.prepare(lda, corp, dic)

# Create the visualization.
pyLDAvis.display(newsgroup_data)

# Save the visualization as an HTML file.
pyLDAvis.save_html(
    newsgroup_data,
    'data/model/newsgroup_ldavis.html',
)
def display(self) -> None:
    """Render this object's pyLDAvis view in the active IPython front end.

    ``pyLDAvis.display`` only builds and returns an HTML object. Because
    this method returns ``None``, that object is handed to IPython's
    ``display`` so that it is actually shown rather than discarded.
    """
    from IPython.display import display as ipython_display

    ipython_display(pyLDAvis.display(self.get_vis()))
def visualization(self):
    """Return the pyLDAvis display object for the fitted scikit-learn LDA.

    The view is prepared from ``self.lda``, ``self.X`` and
    ``self.vectorizer``.
    """
    prepared = pyLDAvis.sklearn.prepare(
        self.lda,
        self.X,
        self.vectorizer,
    )
    rendered = pyLDAvis.display(prepared)
    return rendered
rn = ReviewNormalizer()
normalized_reviews = [rn.tokenize(r) for r in reviews]

# Show two randomly picked normalized reviews.
# NOTE(review): if `randint` is the standard-library one, its upper bound is
# inclusive and can index one past the end; confirm which `randint` this is.
pretty_print_html([
    " ".join(normalized_reviews[randint(0, len(normalized_reviews))]),
    " ".join(normalized_reviews[randint(0, len(normalized_reviews))]),
])


# #### Training the model (this might take a while...)

# In[12]:


dictionary = corpora.Dictionary(normalized_reviews)
corpus = [dictionary.doc2bow(r) for r in normalized_reviews]
lda = LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    passes=100,
)


# #### Prepare data and visualize!

# In[14]:


prepared_data = prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared_data)


# In[ ]:
def get_vis(model, corpus, dictionary):
    """Prepare the pyLDAvis data, save it as HTML and return it.

    The HTML file is written to ``configuration.lda_dir`` +
    ``'lda_visualization_test.html'``.

    Returns
    -------
    The prepared data from ``pyLDAvis.gensim.prepare``. The original
    returned nothing, which left callers of a "get" function with ``None``.
    """
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.save_html(vis, configuration.lda_dir + 'lda_visualization_test.html')
    return vis
# Dump the cleaned token lists, one per line. The `with` block closes the
# file, which the original left open (buffered lines could stay unwritten).
with open('tokens_after_lemmas_and_rm_stopwords.txt', 'w') as tokens_after_lemmas_and_rm_stopwords:
    for item in texts:
        tokens_after_lemmas_and_rm_stopwords.write("%s\n" % item)

# Persist the dictionary, corpus and model; the suffix is the second
# command-line argument.
dictionary.save_as_text('lemmas_nostopwords_with_otherdatacleaning_dictionary_' + sys.argv[2] + '.txt')
corpora.MmCorpus.serialize('lemmas_nostopwords_corpus_' + sys.argv[2] + '.mm', corpus)
joblib.dump(lda, 'ldamodel_' + sys.argv[2] + '.pkl')


# In[6]:


print(corpus[56])


# In[4]:


# Reload the artefacts of the 1000000 run.
dictionary = gensim.corpora.Dictionary.load_from_text('lemmas_nostopwords_with_otherdatacleaning_dictionary_1000000.txt')
corpus = gensim.corpora.MmCorpus('lemmas_nostopwords_corpus_1000000.mm')
lda = joblib.load('ldamodel_1000000.pkl')
(lda.print_topics(num_topics=20, num_words=8))


# In[4]:


lda_vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_vis)
1):-1] print('Topic {}: {}'.format(i, ' '.join(topic_words))) # calculate doc lengths as the sum of each row of the dtm doc_lengths = count_data.sum(axis=1) doc_lengths = doc_lengths.flatten() doc_lengths = doc_lengths.tolist()[0] len(doc_lengths) # transpose the dtm and get a sum of the overall term frequency dtm_trans = count_data.T total = dtm_trans.sum(axis=1) total = total.flatten() total = total.tolist()[0] len(total) len(vocab) data = { 'topic_term_dists': model.topic_word_, 'doc_topic_dists': model.doc_topic_, 'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': list(total) } # prepare the data tef_vis_data = pyLDAvis.prepare(**data) # this bit needs to be run after running the earlier code for reasons pyLDAvis.display(tef_vis_data) pyLDAvis.save_html(tef_vis_data, './guidedldavis_prepared_250k' + '.html')
import json
import numpy as np
import pyLDAvis  # TODO for readme # conda install -c conda-forge pyldavis
from bokeh.io import show, output_notebook, output_file


def load_R_model(filename):
    """Read an R LDAvis JSON export and rename its keys for pyLDAvis.

    Returns a dict with the keys ``topic_term_dists``, ``doc_topic_dists``,
    ``doc_lengths``, ``vocab`` and ``term_frequency``, taken from the
    export's ``phi``, ``theta``, ``doc.length``, ``vocab`` and
    ``term.frequency`` entries.
    """
    renamed_keys = (
        ('topic_term_dists', 'phi'),
        ('doc_topic_dists', 'theta'),
        ('doc_lengths', 'doc.length'),
        ('vocab', 'vocab'),
        ('term_frequency', 'term.frequency'),
    )
    with open(filename, 'r') as handle:
        exported = json.load(handle)
    return {target: exported[source] for target, source in renamed_keys}


f = output_file("pyDAVis.html")
# output_notebook()  # TODO for use in notebook
# pyLDAvis.enable_notebook()

movies_model_data = load_R_model('data/movie_reviews_input.json')
movies_vis_data = pyLDAvis.prepare(**movies_model_data)

p = pyLDAvis.display(movies_vis_data)  # should use this in notebook
# p=pyLDAvis.show(movies_vis_data)  # displays in own window combined with output_file
show(p)
# list(map(load_doc, [glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')[0]]))
# docs = pd.DataFrame(list(map(load_doc, glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')))).set_index(['group', 'id'])
docs.head()

# %%

# %%
docs = docs[docs.astype(str)["tokens"] != '[]']  # remove empty letters
dictionary, corpus = prep_corpus(docs['tokens'])

MmCorpus.serialize('courrier.mm', corpus)
dictionary.save('courrier.dict')

# %%
num_topics = 5
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

lda.save(f'courrier_{num_topics}_lda.model')

import pyLDAvis.gensim as gensimvis
import pyLDAvis

#%%
# load() returns a new model rather than updating `lda` in place, so its
# result is bound back to `lda`; the original discarded it.
lda = models.ldamodel.LdaModel.load(f'courrier_{num_topics}_lda.model')
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the stems of the wordy ones.

    The text is tokenized by sentence and then by word, so punctuation ends
    up as its own token. Tokens without any ASCII letter (numbers, raw
    punctuation) are dropped, and the rest are stemmed with ``p_stemmer``.
    """
    word_tokens = []
    for sentence in nltk.sent_tokenize(text):
        word_tokens.extend(nltk.word_tokenize(sentence))

    wordy_tokens = [tok for tok in word_tokens if re.search('[a-zA-Z]', tok)]
    return [p_stemmer.stem(tok) for tok in wordy_tokens]


from gensim import corpora, models, similarities

# Tokenize and stem every cleaned email.
token_emails = [tokenize_and_stem(text) for text in clean_emails]

# Turn the tokenized documents into an id <-> term dictionary.
dictionary = corpora.Dictionary(token_emails)

# Remove the extremes, then close the gaps left in the id sequence.
dictionary.filter_extremes(no_below=1, no_above=0.8)
dictionary.compactify()

# Convert the tokenized documents into a document-term matrix.
corpus = [dictionary.doc2bow(text) for text in token_emails]

final = models.ldamodel.LdaModel.load('output/final_topic10.model')

import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(final, corpus, dictionary)
pyLDAvis.display(vis_data)