def files_10():
    # Change `count` to select which file to process.
    count = 7
    data = pd.read_csv('file' + str(count) + '.csv')
    df = pd.DataFrame(data)
    # `preprocess` is assumed to append each processed question to a
    # module-level `result` list; the CSV header includes literal quotes.
    for index, c in df.iterrows():
        preprocess(c['"QUESTION' + str(count) + '"'])
    print(result)
    # Truncate (or create) the output file before topics are appended below.
    text = 'Question ' + str(count)
    with open(text, "w") as result_file:
        result_file.write('')
    dictionary = gensim.corpora.Dictionary(result)
    bow_corpus = [dictionary.doc2bow(doc) for doc in result]
    bow_doc_x = bow_corpus[0]
    for i in range(len(bow_doc_x)):
        print("Word {} (\"{}\") appears {} time.".format(
            bow_doc_x[i][0], dictionary[bow_doc_x[i][0]], bow_doc_x[i][1]))
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=6,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2,
                                           per_word_topics=True)
    for idx, topic in lda_model.print_topics(-1):
        with open(text, "a") as result_file:
            result_file.write("Topic: {} \nWords: {}".format(idx, topic) + "\n")
    vis = pyLDAvis.gensim.prepare(topic_model=lda_model,
                                  corpus=bow_corpus,
                                  dictionary=dictionary)
    # enable_notebook() only matters inside Jupyter; show() serves the page.
    pyLDAvis.enable_notebook()
    pyLDAvis.show(vis)
def lda(doctors, topic):
    refined_tweets = lda_user(doctors)
    refined_tweets += lda_topic(topic)
    tokenizer = RegexpTokenizer(r'\w+')
    texts = []
    for i in range(len(refined_tweets)):
        texts.append(tokenizer.tokenize(refined_tweets[i]))
    keywordArray = []
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in texts]
    m = models.LdaModel(corpus, id2word=dictionary, num_topics=3,
                        update_every=5, chunksize=10000, passes=10)
    topics_matrix = m.show_topics(formatted=True, num_words=5)
    topics_matrix = np.array(topics_matrix)
    # for i in range(0, 20, 1):
    #     print(topics_matrix[i, 1])
    # keywordArray = topics_matrix[:, :, 1]
    # keywordArrayProb = topics_matrix[:, :, 0]
    p = pyLDAvis.gensim.prepare(m, corpus, dictionary)
    pyLDAvis.show(p)
def generate_ldavis_data_v1(data_path, run_name, model, idx_to_word, freqs,
                            vocab_size):
    """Launch a locally hosted pyLDAvis session to visualize the results of
    our model."""
    doc_embed = model.sess.run(model.doc_embedding)
    topic_embed = model.sess.run(model.topic_embedding)
    word_embed = model.sess.run(model.word_embedding)

    # Extract all unique words in order of index: 0 - vocab_size
    # NOTE! Keras Tokenizer indexes from 1; 0 is reserved for the PAD token.
    vocabulary = []
    for i in range(1, vocab_size + 1):
        vocabulary.append(idx_to_word[i])

    # Read document lengths
    doc_lengths = np.load(data_path / run_name / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed, topic_embed, word_embed,
                              np.array(vocabulary), doc_lengths=doc_lengths,
                              term_frequency=freqs, normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
def generate_ldavis_data(data_path, run_name, model, idx_to_word, freqs,
                         vocab_size):
    """Launch a locally hosted pyLDAvis session that visualizes the results
    of our model."""
    doc_embed = model.sesh.run(model.doc_embedding)
    topic_embed = model.sesh.run(model.topic_embedding)
    word_embed = model.sesh.run(model.word_embedding)

    # Extract all unique words in order of index 0-vocab_size
    vocabulary = []
    for i in range(vocab_size):
        vocabulary.append(idx_to_word[i])

    # Read in document lengths
    doc_lengths = np.load(data_path + "/" + run_name + "/" + "doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed, topic_embed, word_embed,
                              np.array(vocabulary), doc_lengths=doc_lengths,
                              term_frequency=freqs, normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
def get_lda(data, n_components=5, n_features=6000):
    # LDA topic modelling with scikit-learn; `n_components` sets the number
    # of topics and `n_features` the vocabulary size.
    # Vectorize the text
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(data.fcjg)
    # Fit the LDA model with the chosen number of topics
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    n_top_words = 300  # how many words to show per topic
    tf_feature_names = tf_vectorizer.get_feature_names()
    data_list = print_top_words(lda, tf_feature_names, n_top_words)
    data_plot = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.show(data_plot)
    return tf_feature_names, data_list
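# A hedged usage sketch for get_lda above: the DataFrame must carry the text
# in a column named `fcjg` (as the function hard-codes), and a
# `print_top_words` helper must already be in scope. File name is illustrative.
docs = pd.read_csv('corpus.csv')  # hypothetical input with an `fcjg` column
feature_names, topic_words = get_lda(docs, n_components=5, n_features=6000)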
def LDAvis(self, model, save_plot=True, save_dir='results', filename='',
           ext='.html', show_plot=True, is_notebook=True, mds='mds',
           sort_topics=False, **kwargs):
    '''Use pyLDAvis to visualize the clustering.'''
    print('Rendering visualization...')
    vis = gensimvis.prepare(model, self.bow, self.gensim_dict,
                            mds=mds, sort_topics=sort_topics, **kwargs)
    if save_plot:
        if len(filename) == 0:
            filename = 'LDAvis_plot_'
        full_path = save_folder_file(save_dir, filename, ext=ext,
                                     optional_folder='LDAvis_plots')
        if ext == '.html':
            pyLDAvis.save_html(vis, full_path)
        else:
            print('File extension not supported')
    if show_plot:
        if is_notebook:
            return vis  # rendered inline by the notebook
        else:
            pyLDAvis.show(vis)
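# A minimal usage sketch for the LDAvis method above; `pipeline` stands in
# for an instance of the owning class, which must expose .bow (a gensim
# bag-of-words corpus) and .gensim_dict (the matching Dictionary).
import gensim
lda = gensim.models.LdaModel(pipeline.bow, id2word=pipeline.gensim_dict,
                             num_topics=10)
vis = pipeline.LDAvis(lda, save_plot=True, filename='run1_', show_plot=False)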
def plot_pyldavis(topic_model, document_topic_matrix, document_term_matrix,
                  file=None, **kwargs):
    """Generate a pyLDAvis visualization of the given topic model.

    For more information about the visualization read the
    `original paper <http://www.aclweb.org/anthology/W14-3110>`_ by Sievert
    and Shirley. Note that pyLDAvis only supports LDA models; passing an NMF
    model will cause an exception.

    :param document_topic_matrix: A document-topic matrix as returned by
        calling get_document_topic_matrix() on a topic model.
    :type document_topic_matrix: np.ndarray
    :param document_term_matrix: Term-count-weighted document-term matrix of
        the documents used to infer the document_topic_matrix.
    :type document_term_matrix: np.ndarray
    :param file: Path to store the HTML output. If no file is passed the plot
        is visualized in the browser.
    :type file: str
    :param kwargs: Further parameters passed directly to pyLDAvis's prepare
        function. See the `documentation
        <http://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepare>`_
        for options. Note that sort_topics=False is already set.
    """
    if topic_model.model_name != 'lda':
        raise Exception('pyLDAvis only supports LDA. {} not supported'.format(
            topic_model.model_name))
    topic_token_matrix = topic_model.get_topic_token_matrix(normalize=True)
    id2word = topic_model.id2token
    document_lengths = np.sum(document_term_matrix, axis=1).getA1()
    term_frequencies = np.sum(document_term_matrix, axis=0).getA1()
    prepared_data = pyLDAvis.prepare(topic_token_matrix,
                                     document_topic_matrix,
                                     document_lengths, id2word,
                                     term_frequencies, sort_topics=False,
                                     **kwargs)
    ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
    REPORT_DIR = os.path.join(ROOT_DIR, 'reports')
    if file:
        base_path = os.path.join(REPORT_DIR, 'figures/pyLDAvis')
        pa = os.path.join(base_path, file)
        with open(pa, 'w') as f:
            pyLDAvis.save_html(prepared_data, f)
    else:
        pyLDAvis.show(prepared_data)
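# Illustrative calls to plot_pyldavis above, assuming `model` is a fitted
# wrapper with model_name == 'lda' and `dtm` is the sparse document-term
# matrix it was trained on (both placeholder names):
doc_topic = model.get_document_topic_matrix()
plot_pyldavis(model, doc_topic, dtm)                    # open in the browser
plot_pyldavis(model, doc_topic, dtm, file='lda.html')   # save under reports/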
def LDA_model(vectorized_data, num_topics=20):
    # Create dictionary
    texts = [text.split() for text in vectorized_data]
    id2word = corpora.Dictionary(texts)
    # Remove extremes (similar to the min/max df step used when creating the
    # tf-idf matrix)
    id2word.filter_extremes(no_below=1, no_above=0.8)
    # Term-document frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    # Model
    lda_model = models.LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=id2word,
                                update_every=1,
                                chunksize=100,
                                passes=10,
                                random_state=0,
                                per_word_topics=True)
    topics_per_cluster = lda_model.show_topics(formatted=False, num_words=20)
    All_topics = []
    for idx, topic in enumerate(topics_per_cluster):
        print('Topic: {} \nWords: {}'.format(
            idx, '|'.join([w[0] for w in topic[1]])))
        All_topics.append([w[0] for w in topic[1]])
        print('----next topic-----')
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.show(vis)
def topic_visual(best_lda_model, data_vectorized, vectorizer):
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized,
                                     vectorizer, mds='tsne')
    pyLDAvis.show(panel)
def vis(self):
    """Visualize the data in the browser."""
    vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
    pyLDAvis.show(vis)
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """Launch a locally hosted pyLDAvis session to visualize the results of
    our model.

    :param data_path: (PosixPath) data location
    :param model: TensorFlow model
    :param idx_to_word: (dict) index-to-word mapping
    :param freqs: (list) frequency counts of each token
    :param vocab_size: (int) size of vocabulary
    :return: None
    """
    doc_embed = model.sess.run(model.doc_embedding)
    topic_embed = model.sess.run(model.topic_embedding)
    word_embed = model.sess.run(model.word_embedding)

    # Extract all unique words in order of index: 1 - (vocab_size + 1)
    # NOTE! Keras Tokenizer indexes from 1; 0 is reserved for the PAD token.
    vocabulary = ['<PAD>']
    for i in range(1, vocab_size):
        vocabulary.append(idx_to_word[i])

    # Read document lengths
    doc_lengths = np.load(data_path / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed, topic_embed, word_embed,
                              np.array(vocabulary), doc_lengths=doc_lengths,
                              term_frequency=freqs, normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
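# A speculative invocation of the generate_ldavis_data variants above,
# assuming a trained model exposing a TensorFlow session plus the usual
# saved artifacts; every name here is a placeholder.
from pathlib import Path
generate_ldavis_data(Path('data/run1'), model, idx_to_word, freqs,
                     vocab_size=5000)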
def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
    """Print important topics based on the decomposition.

    Parameters
    ----------
    mds : str, optional (default='pcoa')
        2D decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence &
          Principal Coordinate Analysis (aka Classical Multidimensional
          Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic
          neighbor embedding
    """
    if not isinstance(self.comp, LatentDirichletAllocation):
        raise ValueError('only support lda_topic_modelling()')

    import pyLDAvis
    import pyLDAvis.sklearn

    if notebook_mode:
        pyLDAvis.enable_notebook()
    prepared_vis_data = pyLDAvis.sklearn.prepare(self.comp, self._vectors,
                                                 self.vectorizer, mds=mds)
    if notebook_mode:
        return prepared_vis_data
    else:
        pyLDAvis.show(prepared_vis_data)
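# Hypothetical usage of visualize_topics above, where `topics` is an instance
# of the defining class after lda_topic_modelling() has populated .comp,
# ._vectors, and .vectorizer; 'tsne' is one of the documented mds options.
topics.visualize_topics(notebook_mode=False, mds='tsne')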
def compute_lda(self, file_name, n_topics, alpha, beta):
    print("STATUS: START BUILDING MODEL")
    path_base = "data/topic_modeling/"
    path_lda_model = path_base + file_name + "-lda_model.pckl"
    if os.path.exists(path_lda_model):
        with open(path_lda_model, "rb") as f:
            lda_model = pickle.load(f)
        print("STATUS: FINISHED BUILDING MODEL (USING ALREADY BUILT MODEL)")
    else:
        lda_model = gensim.models.LdaMulticore(
            corpus=self.corpus,
            id2word=self.id2word,
            num_topics=n_topics,
            random_state=100,
            chunksize=100,
            passes=10,
            alpha=alpha,
            eta=beta,
        )
        with open(path_lda_model, "wb") as f:
            pickle.dump(lda_model, f)
        print("STATUS: FINISHED BUILDING MODEL (NEW MODEL CREATED)")

    # Visualize the topics
    print("STATUS: START VISUALIZING MODEL")
    data = pyLDAvis.gensim.prepare(lda_model, self.corpus, self.id2word)
    pyLDAvis.show(data)
def plot_lda_vis(model_data, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, save_html, show
    model_vis_data = prepare(**model_data)
    if mode == 'save_html' and filename:
        save_html(model_vis_data, filename)
    else:
        show(model_vis_data)
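# plot_lda_vis expects the keyword-argument dict that a model's
# to_py_lda_vis() returns (i.e., the inputs of pyLDAvis.prepare). A sketch,
# with `topic_model` as a placeholder for one of those model classes:
plot_lda_vis(topic_model.to_py_lda_vis())                      # serve locally
plot_lda_vis(topic_model.to_py_lda_vis(), mode='save_html',
             filename='lda_vis.html')                          # write HTML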
def create_models(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                            num_topics=20, passes=10)
    print(lda.show_topics())
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(vis_data)
    pyLDAvis.show(vis_data)
def visLDA(model, data, vectorizer, ip, port):
    # https://github.com/bmabey/pyLDAvis/issues/69
    visData = pyLDAvis.sklearn.prepare(model, data, vectorizer,
                                       mds='mmds', sort_topics=False)
    pyLDAvis.show(visData, ip=ip, port=port)
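# visLDA forwards ip/port straight to pyLDAvis.show, which is useful when the
# served page must be reachable from another host. Values are examples only:
visLDA(lda_model, dtm, count_vectorizer, ip='0.0.0.0', port=8050)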
def visualize(self):
    """Start a local web server and display the fitted LDA model."""
    self.check_model()
    show(prepare(self.model, self.vectorized_data, self.vectorizer,
                 mds='tsne'))
def topic_model(clean_txt: list, num_count: int):
    """Visualize the topic model."""
    # Treat the whole token list as a single-document corpus.
    clean_txt = [clean_txt]
    common_dictionary = Dictionary(clean_txt)
    common_corpus = [common_dictionary.doc2bow(text) for text in clean_txt]
    lda = LdaModel(common_corpus, num_topics=num_count)
    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
def plot_using_ldaviz(topics=10):
    texts = [[text] for text in bow.columns]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # Note: the model is trained on the tf-idf corpus but visualized against
    # the raw bag-of-words corpus.
    lda = models.LdaModel(corpus_tfidf, num_topics=topics, id2word=dictionary)
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data)
def visualize(self, _label):
    visfile = "./models/{0}.vis".format(_label)
    if os.path.isfile(visfile):
        vis_data = pickle.load(open(visfile, "rb"))
    else:
        dictionary, corpus, ldamodel = self.build(_label)
        vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
        pickle.dump(vis_data, open(visfile, "wb"))
    pyLDAvis.show(vis_data)
def topic_model_visualize(textlist: list, num_topics: int) -> None:
    """Visualize the topic model."""
    # Treat the whole token list as a single-document corpus.
    textlist = [textlist]
    common_dictionary = Dictionary(textlist)
    common_corpus = [common_dictionary.doc2bow(text) for text in textlist]
    lda = LdaModel(common_corpus, num_topics=num_topics)
    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html
    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        logging.info("Saving pyLDAVis to {}".format(filename))
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data, ip="0.0.0.0", port=8888)
def visualize(self):
    """Start a local web server to display the fitted LDA model."""
    if not self.fitted:
        raise ValueError('LDA model is not fitted')
    show(prepare(self.lda, self.vectorized_data, self.vectorizer,
                 mds='tsne'))
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html
    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data)
def LDA_analysis(texts, nTopics, onlyCount=True, showPic=True):
    min_ngram = 2
    max_ngram = 4
    max_df = 1.0
    min_df = 0
    max_features = 500
    learning_offset = 20
    # `n_topics` was renamed `n_components` in newer scikit-learn releases.
    lda = LatentDirichletAllocation(n_components=nTopics,
                                    max_iter=50,
                                    learning_method='batch',
                                    learning_offset=learning_offset,
                                    random_state=0)
    if onlyCount:
        vectorizer = CountVectorizer(analyzer='word',
                                     ngram_range=(min_ngram, max_ngram),
                                     max_features=max_features,
                                     encoding='utf-8',
                                     strip_accents='unicode',
                                     stop_words='english',
                                     max_df=max_df, min_df=min_df)
    else:
        vectorizer = TfidfVectorizer(analyzer='word',
                                     ngram_range=(min_ngram, max_ngram),
                                     max_features=max_features,
                                     encoding='utf-8',
                                     strip_accents='unicode',
                                     stop_words='english',
                                     max_df=max_df, min_df=min_df)
    X = vectorizer.fit_transform(texts)
    X_new = lda.fit_transform(X)
    feature_names = vectorizer.get_feature_names()
    print_top_words(lda, feature_names, 10)
    print(lda.components_.shape)
    print(X_new[9])
    cosSim = cosine_similarity(X, lda.components_, False)
    print(texts[9])
    print(cosSim[9])
    if showPic:
        # pyLDAvis.enable_notebook()
        data_pyLDAvis = pyLDAvis.sklearn.prepare(lda, X, vectorizer)
        pyLDAvis.show(data_pyLDAvis)
    return cosSim
def display_page(pathname):
    print(pathname)
    if "kmean" in pathname:
        return page_kmean_layout
    elif "lda" in pathname:
        # data_prepared = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
        # optionally: d3_url="js/d3.v3.min.js", ldavis_url="js/ldavis.js", ldavis_css_url="js/ldavis.css"
        # html = pyLDAvis.prepared_data_to_html(data_prepared)
        # print(html)
        pyLDAvis.show(
            pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer))
        return html.A("LDA visualisation on other tab for now")
def visualize():
    with open('../data/bow_corpus', 'rb') as input_file:
        corpus = pickle.load(input_file)
    tweet_dictionary = gensim.corpora.Dictionary.load(
        '../data/tweet_dictionary')

    # Load model
    model = LdaModel.load('../data/model/LDA_model_v1')

    # Visualization
    lda_visualization = pyLDAvis.gensim.prepare(model, corpus,
                                                tweet_dictionary)
    pyLDAvis.show(lda_visualization)
def LDA_sklearn(self, contents, num_topics=5, num_words=10, max_df=0.95,
                min_df=2, max_features=1000, showLDA=True):
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    self.seg(contents)
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    max_features=max_features,
                                    stop_words='english')
    documents = [' '.join(content) for content in self.content_seg]
    tf = tf_vectorizer.fit_transform(documents)
    feature_names = tf_vectorizer.get_feature_names()
    model = LatentDirichletAllocation(n_components=num_topics,
                                      max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0).fit(tf)
    self.LDA_sklearn_topics = []
    for topic_idx, topic in enumerate(model.components_):
        topic_info = '(%s, ' % (topic_idx)
        # topic.argsort()[:-(num_words + 1):-1] yields the indices of the
        # num_words highest-weighted terms, largest first.
        for i in topic.argsort()[:-(num_words + 1):-1]:
            topic_info += str(np.around(topic[i], decimals=3)) + '*' + \
                '\"%s\"' % (feature_names[i]) + ' + '
        self.LDA_sklearn_topics.append(topic_info[:-3] + ')')

    # LDA visualization. In the interactive chart each circle is a topic and
    # its size reflects how many documents the topic covers. With no circle
    # hovered, the 30 most frequent keywords are shown; hovering a circle
    # shows each keyword's frequency within that topic.
    if showLDA:
        import pyLDAvis
        import pyLDAvis.sklearn
        result = pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer)
        pyLDAvis.show(result)
    return self.LDA_sklearn_topics
def visualize_lda(self, df, display=False):
    if self.lda_model == []:
        self.run_lda(df)
    max_features = self.tf_vectorizer.get_params()['max_features']
    n_topics = self.lda_model.get_params()['n_topics']
    vis_data = pyLDAvis.sklearn.prepare(self.lda_model, self.tf,
                                        self.tf_vectorizer,
                                        R=n_topics, n_jobs=-1)
    pyLDAvis.save_html(
        vis_data, 'plots/pyLDAvis_' + str(max_features) + 'feats_' +
        str(n_topics) + 'topics.html')
    if display:
        pyLDAvis.show(vis_data)
def show_model_statistics(lda_model, with_visualization=False):
    # Relies on module-level `corpus`, `id2word`, and `data_lemmatized`.
    # Compute perplexity, a measure of how good the model is (lower is better):
    # print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Compute the coherence score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Visualize the topics
    # pyLDAvis.enable_notebook()
    if with_visualization:
        vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
        pyLDAvis.show(vis)
def visualize_LDA(model, corpus):
    """Prepare a pyLDAvis visualization of an LDA model and a corpus of
    words, save it to HTML, and display it.

    input: an LDA model and a corpus of words
    returns: None
    """
    LDAvis_prepared = pyLDAvis.gensim.prepare(model, corpus,
                                              dictionary=model.id2word,
                                              mds='tsne')
    vis_filename = './LDAvis_prepared/random_users/LDAvis.html'
    pyLDAvis.save_html(LDAvis_prepared, vis_filename)
    pyLDAvis.show(LDAvis_prepared)
    return None
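# Possible invocation of visualize_LDA above, assuming a gensim LdaModel whose
# id2word was set at training time and the matching bag-of-words corpus; note
# the HTML path is hard-coded inside the function.
visualize_LDA(lda_model, bow_corpus)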
def vectorize(self):
    '''
    args: none
    output: generates an LDA topic model of the document using gensim and
        pyLDAvis
    '''
    # Tokenize into sentences and remove stopwords
    sentences = self.sent_detector.tokenize(
        self.raw.decode('utf-8').strip())  # use raw text
    # sentences = Topic(raw_input('topic: ')).text  # get text from wikipedia
    texts = [[word for word in sentence.lower().split()
              if word not in self.stopwords]
             for sentence in sentences]

    # Compute the frequency of each token
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Remove words that appear only once
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    # Construct a gensim dictionary and corpus (bag of words); currently each
    # "text" is a sentence of the document
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Define the LDA model
    lda = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=10,  # what should this be???
        update_every=1,
        chunksize=10000,
        passes=1,
    )

    # Visualize the LDA space (pyLDAvis.display is notebook-only, so only
    # show() is called here) and persist the prepared data
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data)
    with open('topic_models/' + self.name + '.json', 'a+') as topic_json:
        pyLDAvis.save_json(vis_data, topic_json)
    with open('topic_models/' + self.name + '.html', 'a+') as topic_html:
        pyLDAvis.save_html(vis_data, topic_html)