def get_topics(data, max_freq, min_occurrence, num_components):
    # LDA works on raw term counts, so build a count-based document-term matrix.
    cv = CountVectorizer(max_df=max_freq, min_df=min_occurrence, stop_words='english')
    dtm = cv.fit_transform(data)
    model = LatentDirichletAllocation(n_components=num_components, learning_method='online',
                                      random_state=0, n_jobs=-1)
    model.fit(dtm)
    # Return the prepared pyLDAvis data (t-SNE for the inter-topic distance map).
    return s_viz.prepare(model, dtm, cv, mds='tsne')
def run_lda(count_data, count_vectorizer):
    # Tweak the two parameters below
    number_topics = 35
    number_words = 20

    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    MyLDA.print_topics(lda, count_vectorizer, number_words)

    LDAvis_data_filepath = os.path.join('./ldavis_prepared_' + str(number_topics))

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    return pyLDAvis.save_html(
        LDAvis_prepared,
        './ldavis_prepared_' + str(number_topics) + '.html')
def lda_model_checking(lda, count_data, count_vectorizer, number_topics,
                       lda_html_name=None):
    """
    Draw the topic distribution using the pyLDAvis library.

    :param lda: fitted sklearn LatentDirichletAllocation model
    :param count_data: document-term count matrix the model was fitted on
    :param count_vectorizer: CountVectorizer used to build count_data
    :param number_topics: number of topics, used to name the output files
    :param lda_html_name: prefix for the pickled data and HTML output files
    :return: None; writes the prepared data and an HTML visualization to disk
    """
    if lda_html_name is None:
        lda_html_name = "./ldavis_prepared_"
    LDAvis_data_filepath = os.path.join(lda_html_name + str(number_topics))

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(LDAvis_prepared, lda_html_name + str(number_topics) + '.html')
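# A minimal usage sketch for lda_model_checking (an illustration, not taken from the
# original project): it only assumes a fitted sklearn LDA model plus the CountVectorizer
# and count matrix it was trained on. The toy corpus and topic count below are made up,
# and the surrounding module is assumed to import os, pickle, pyLDAvis and
# pyLDAvis.sklearn as sklearn_lda, as the function above expects.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

docs = ["topic models find hidden themes in documents",
        "documents share words and words define themes",
        "themes group documents by the words they share"]
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(docs)
lda = LDA(n_components=3, random_state=0)
lda.fit(count_data)
lda_model_checking(lda, count_data, count_vectorizer, number_topics=3)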
def lda_func(string, filename):
    sns.set_style('whitegrid')
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform([string])
    plot_10_most_common_words(count_data, count_vectorizer, filename)

    warnings.simplefilter("ignore", DeprecationWarning)
    number_topics = 3
    number_words = 5

    lda = LDA(n_components=number_topics, n_jobs=-1, learning_method='online')
    lda.fit(count_data)

    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis

    LDAvis_data_filepath = os.path.join('./ldavis_prepared_' + str(number_topics) + filename)

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(
        LDAvis_prepared,
        './ldavis_prepared_' + str(number_topics) + filename + '.html')
def fit_ddl_lda(words_sentences, output_dir, filename_stem, number_topics):
    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(words_sentences)

    # Tweak the parameter below
    number_words = 20

    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    LDAvis_data_filepath = os.path.join(
        output_dir, filename_stem + '_' + 'lda_vis_prepared_' + str(number_topics))

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(
        LDAvis_prepared,
        os.path.join(output_dir,
                     filename_stem + '_' + 'lda_vis_prepared_' + str(number_topics) + '.html'))
    return lda
def latent_dirichlet_allocation_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Scikit-Learn LDA model.

    :return: None.
    """
    from sklearn.decomposition import LatentDirichletAllocation

    # LDA can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA.
    lda = LatentDirichletAllocation(n_components=20, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0).fit(tf)
    time.sleep(3)

    # Display the top words for each topic.
    lda_util.display_topics(lda, tf_feature_names, 10)

    import pyLDAvis
    from pyLDAvis import sklearn
    # pyLDAvis.enable_notebook()
    visualization = sklearn.prepare(lda_model=lda, vectorizer=tf_vectorizer, dtm=tf)
    pyLDAvis.save_html(visualization, 'lda_visualization-no-company-words.html')
def save_LDA_visualization(lda_tf, dtm_tf, tf_vectorizer, html_file):
    """ Save LDA visualization as html """
    from pyLDAvis.sklearn import prepare
    data = prepare(lda_tf, dtm_tf, tf_vectorizer)
    from pyLDAvis import save_html
    save_html(data, html_file)
def generate_ldavis(self):
    params = {"mds": "pcoa"}
    try:
        LDAvis_prepared = sklearn_lda.prepare(self.nmf_model, self.vectorized_out,
                                              self.vectorizer, **params)
    except Exception:
        return "This visualization is currently not available."
    return LDAvis_prepared
def explore_topics_viz(self, save_path):
    self.LDAvis_prepared = sklearn_lda.prepare(self.lda, self.count_data,
                                               self.count_vectorizer)
    LDAvis_data_filepath = os.path.join(f"{save_path}_{self.task}_{self.num_topics}")
    with open(LDAvis_data_filepath + ".pkl", "wb") as fp:
        pickle.dump(self.LDAvis_prepared, fp)
    pyLDAvis.save_html(self.LDAvis_prepared, LDAvis_data_filepath + '.html')
def visualize(self):
    """ Start local web-server and display LDA fitted model """
    self.check_model()
    show(prepare(self.model, self.vectorized_data, self.vectorizer, mds='tsne'))
def visualize_lda(self, n):
    """Visualize the topic model using pyLDAvis.

    Args:
        n (int): number of topics

    Returns:
        None; saves the interactive visualization to ./figure/topic_modeling.html
    """
    lda, doc2vec, tfidf = self.topic_modeling(n)
    prepared = prepare(lda, doc2vec, tfidf)
    pyLDAvis.save_html(prepared, './figure/topic_modeling.html')
def count_and_lda(text):
    top_N = 20
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopwords)
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')
    counts = Counter(words).most_common(20)
    # print(counts)

    # Each token is treated as its own "document" by the vectorizers below.
    vectorizer = TfidfVectorizer()
    dtm_tfidf = vectorizer.fit_transform(words)
    # print(dtm_tfidf.shape)
    lda_tfidf = LatentDirichletAllocation(n_components=10, learning_offset=50, max_iter=10)
    lda_tfidf.fit(dtm_tfidf)
    data = prepare(lda_tfidf, dtm_tfidf, vectorizer)
    pyLDAvis.save_html(data, './static/data.html')

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=500,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(words)
    vocab = tf_vectorizer.get_feature_names()
    model = lda.LDA(n_topics=20, n_iter=2000, random_state=1)
    model.fit(tf)
    topic_word = model.topic_word_

    n = 5
    topics = []
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1]
        # print('*Topic {}\n- {}'.format(i, ', '.join(topic_words)))
        topics.append(', '.join(topic_words))

    the_counts = []
    for count in counts:
        the_counts.append({'data': count[0], 'value': count[1]})
    return topics, the_counts
def visualize(self):
    """ Start local web-server to display the LDA fitted model """
    if not self.fitted:
        raise ValueError('LDA model is not fitted')
    show(prepare(self.lda, self.vectorized_data, self.vectorizer, mds='tsne'))
def fit_ea_lda(df_app, output_dir, options):
    # Helper function
    def print_topics(model, count_vectorizer, n_top_words):
        words = count_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(model.components_):
            print("\nTopic #%d:" % topic_idx)
            print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    df_app_tmp = df_app.copy()
    df_app_tmp.drop('ANNOTATE', axis=1, inplace=True)
    if options['CLASS'] != '':
        df_app_tmp.drop('CLASS', axis=1, inplace=True)

    df_app_words = transform_to_nlp(df_app_tmp)
    makeImage(df_app_tmp.sum(), output_dir,
              ea_decode.options_filename(options) + '_' + 'WC')

    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(df_app_words['Words'])
    # plot_most_common_words(count_data, count_vectorizer, 15)

    # Load the LDA model from sk-learn
    from sklearn.decomposition import LatentDirichletAllocation as LDA

    # Tweak the two parameters below
    number_topics = 10
    number_words = 20

    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    file_out = ea_decode.options_filename(options) + '_' + 'LDA_VIS' + '_' + str(number_topics)
    LDAvis_data_filepath = os.path.join(output_dir, file_out)

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(LDAvis_prepared, os.path.join(output_dir, file_out + '.html'))
def generate_ldavis(self):
    # 0 = (nr_of_topics, total_nr_of_words)
    # 1 = (nr_of_articles, total_nr_of_words)
    # 2 = (nr_of_articles, nr_of_words_per_article)
    params = {"mds": "pcoa"}
    try:
        LDAvis_prepared = sklearn_lda.prepare(self.nmf_model, self.vectorized_out,
                                              self.vectorizer, **params)
    except Exception:
        return "This visualization is currently not available."
    return LDAvis_prepared
def lda_topics_visualization(lda, count_vectorizer, paper_words_count_matrix, number_topics):
    results_file_path = os.path.join(LDA_RESULTS_FILE_PATH,
                                     'ldavis_prepared_' + str(number_topics))

    # this is a bit time consuming - make the if statement True
    # if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, paper_words_count_matrix, count_vectorizer)
        with open(results_file_path, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(results_file_path, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    pyLDAvis.save_html(LDAvis_prepared, results_file_path + '.html')
def get_lda_summary(number_topics, number_words, data_series, output_name):
    from sklearn.decomposition import LatentDirichletAllocation as LDA
    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis

    # Tokens are whatever is separated by whitespace.
    count_vectorizer = CountVectorizer(token_pattern=r'[^\s]+')
    count_data = count_vectorizer.fit_transform(data_series)

    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    if 1 == 1:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(output_name, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    pyLDAvis.save_html(LDAvis_prepared, output_name + '.html')
def lda_vis(df_all_tweets, num_topics):
    lda_vis_path = r"C:\Users\btier\Documents\lda_vis.html"

    # start count vector with stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(df_all_tweets['PROCESSED_TEXT'])

    # Create / fit LDA
    lda = LDA(n_components=num_topics, n_jobs=-1)
    lda.fit(count_data)

    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    # pickle files must be written and read in binary mode
    with open(lda_vis_path, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    with open(lda_vis_path, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    return pyLDAvis.save_html(LDAvis_prepared,
                              './ldavis_prepared_' + str(num_topics) + '.html')
def generate_lda_visualisation(self, text):
    '''This function will create an interactive graphic using the text provided and
    the topic count of the object. Also saves an html file in the local directory.

    self = dataframe object
    text = text to analyse and group'''
    LDA_vect = CountVectorizer(stop_words='english')
    LDA_count_data = LDA_vect.fit_transform(text)

    # Create and fit the LDA model
    lda = LDA(n_components=self.topic_count, n_jobs=-1)
    lda.fit(LDA_count_data)

    # Generate the LDA visualisation and display it
    LDAvis_prepared = sklearn_lda.prepare(lda, LDA_count_data, LDA_vect)
    path = '../lda_visualisations/ldavis_prepared_' + str(self.topic_count) + '.html'
    pyLDAvis.save_html(LDAvis_prepared, path)
    # Render the saved HTML file (not the path string) in the notebook.
    display(HTML(filename=path))
def pyldavis_visualization(corpus, topics, num_docs=None, ngrams=1, weighting='tf',
                           min_df=0.1, max_df=0.7, mds='pcoa', *args, **kwargs):
    model, doc_term_matrix, vectorizer = build_model(corpus, topics, num_docs,
                                                     ngrams, weighting, min_df, max_df)
    prep_data = prepare(model.model, doc_term_matrix, vectorizer, mds=mds)
    out = StringIO()
    save_html(prep_data, out)
    out.seek(0)
    return (doc_term_matrix, out.read())
def visualize_topic_model(lda, count_data, count_vectorizer, num_topics,
                          ldavis_filename_prefix):
    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis

    ldavis_data_path = os.path.join(ldavis_filename_prefix + str(num_topics))
    ldavis_html_path = ldavis_filename_prefix + str(num_topics) + '.html'

    # preparing the visualization is a bit time consuming
    ldavis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(ldavis_data_path, 'wb') as f:
        pickle.dump(ldavis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(ldavis_data_path, 'rb') as f:
        ldavis_prepared = pickle.load(f)

    pyLDAvis.save_html(ldavis_prepared, ldavis_html_path)
def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
    """
    :param df: :class:`~pandas.DataFrame` containing text columns
    :param generate_visualization: Generate visualization of LDA results.
                                   Slows down generation notably.
    :param lang: Language for :class:`~Voikko`
    """
    if self._count_vector and self._lda:
        return True

    file_words = self.instance_path() / "word.dat"
    file_lda = self.instance_path() / "lda.dat"
    file_ldavis = self.instance_path() / "ldavis.html"

    try:
        # Try loading saved lda files.
        self._count_vector = joblib.load(file_words)
        self._lda = joblib.load(file_lda)
    except FileNotFoundError as e:
        logger.exception(e)

        texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

        # Setup word count vector
        self._count_vector = CountVectorizer(
            tokenizer=self.text_tokenize,
            stop_words=self.stop_words
        )
        count_data = self._count_vector.fit_transform(texts)

        self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
        self._lda.fit(count_data)

        if generate_visualization:
            logger.debug("Generating LDA visualization. This might take a while")
            from pyLDAvis import sklearn as sklearn_lda
            import pyLDAvis

            LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data, self._count_vector)
            pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

        joblib.dump(self._count_vector, file_words)
        joblib.dump(self._lda, file_lda)
def pyldavis_run(lda_model_path, document_term_matrix_path, vectorizer_path):
    '''
    Computes the pyLDAvis visualisation of the LDA model.

    Parameters
    ----------
    lda_model_path : str
        Path of the pickle object (serialised python object) of the LDA model.
        This is created in the lda_tsne_model2.py module.
    document_term_matrix_path : str
        Path of the pickle object (serialised python object) of the document-term matrix
        which is created using the CountVectorizer in the lda_tsne_model2.py module.
    vectorizer_path : str
        Path of the pickle object (serialised python object) of the vectorizer used to
        create the document-term matrix. This is usually the CountVectorizer in the
        lda_tsne_model2.py module.

    Returns
    ----------
    Embedded html pyldavis visualisation of the LDA model.
    '''
    t0 = time.time()

    # loading the pickle objects from the path parameters.
    lda_model = pickle.load(open(lda_model_path, "rb"))
    document_term_matrix = pickle.load(open(document_term_matrix_path, "rb"))
    cvectorizer = pickle.load(open(vectorizer_path, "rb"))

    # prepares the pyldavis visualisation. There is a choice of dimensionality reduction
    # methods here; TSNE is chosen as it is consistent with the previous analysis in the
    # lda_tsne_model2.py module and has been shown to yield better results than the other
    # available methods.
    prepared_data = prepare(lda_model, document_term_matrix, cvectorizer, mds='tsne',
                            plot_opts={'xlab': '', 'ylab': ''})
    html = pyLDAvis.prepared_data_to_html(prepared_data)

    t1 = time.time()
    print("time for pyldavis: " + str(t1 - t0), file=sys.stdout)
    return html
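# The three pickles consumed by pyldavis_run() come from lda_tsne_model2.py, which is not
# shown here. The sketch below is only an assumed illustration of what would be serialised
# to those paths: a fitted LatentDirichletAllocation model, the CountVectorizer
# document-term matrix, and the vectorizer itself. File names and the corpus are placeholders.
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

documents = ["first preprocessed document", "second preprocessed document",
             "third preprocessed document about something else"]
cvectorizer = CountVectorizer(stop_words='english')
document_term_matrix = cvectorizer.fit_transform(documents)
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=0)
lda_model.fit(document_term_matrix)

with open("lda_model.pkl", "wb") as f:
    pickle.dump(lda_model, f)
with open("document_term_matrix.pkl", "wb") as f:
    pickle.dump(document_term_matrix, f)
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(cvectorizer, f)

# html = pyldavis_run("lda_model.pkl", "document_term_matrix.pkl", "vectorizer.pkl")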
def wordcloud_visualization(corpus, topics, num_docs=None, min_df=0.1, ngrams=1,
                            weighting='tf', max_df=0.7, mds='pcoa', *args, **kwargs):
    font = pkg_resources.resource_filename(__name__, "fonts/ZillaSlab-Medium.ttf")
    print(font)
    model, doc_term_matrix, vectorizer = build_model(corpus, topics, num_docs,
                                                     ngrams, weighting, min_df, max_df)
    prep_data = prepare(model.model, doc_term_matrix, vectorizer, mds=mds)
    ti = prep_data.topic_info
    topic_labels = ti.groupby(['Category']).groups.keys()
    plt.clf()
    topics = []
    for label in topic_labels:
        out = StringIO()
        df = ti[ti.Category == label].sort_values(by='Total', ascending=False)[:20]
        tf = dict(df[['Term', 'Total']].to_dict('split')['data'])
        wc = wordcloud.WordCloud(font_path=font, width=600, height=300,
                                 background_color='white')
        wc.fit_words(tf)
        plt.imshow(wc)
        plt.axis('off')
        plt.savefig(out)
        out.seek(0)
        topics.append((label, out.read()))
    return topics
def __init__(self, corpus, numTopics=20, load=True, language='en'):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        print("Instantiating Latent Dirichlet Allocation (Topic Modeling)")
        if not load:
            # self.SP = SentenceProcessor(language=language)
            # self.countVectorizer = CountVectorizer(stop_words=self.SP.stopWords, lowercase=True, strip_accents='ascii')
            # self.countData = self.countVectorizer.fit_transform([self.SP.getProcessedSentence(doc.text) for doc in corpus.docList])
            # with open("./modelsData/LDA/countVectorizer_59k.pkl", 'wb') as f: pickle.dump(self.countVectorizer, f)
            # with open("./modelsData/LDA/countData_59k.pkl", 'wb') as f: pickle.dump(self.countData, f)

            # Read back the cached vectorizer and count data ('rb' is required for pickle.load).
            with open("./modelsData/LDA/countVectorizer_59k.pkl", 'rb') as f:
                self.countVectorizer = pickle.load(f)
            with open("./modelsData/LDA/countData_59k.pkl", 'rb') as f:
                self.countData = pickle.load(f)

            self.lda = SkLearnLDA(n_components=numTopics, n_jobs=3, max_iter=100,
                                  verbose=1, random_state=0)
            self.lda.fit(self.countData)
            with open("./modelsData/LDA/SKLearnLDAModel.pkl", 'wb') as f:
                pickle.dump(self.lda, f)

            self.ldaModel = sklearn_lda.prepare(self.lda, self.countData, self.countVectorizer)
            with open("./modelsData/LDA/SKLearnLDA.pkl", 'wb') as f:
                pickle.dump(self.ldaModel, f)
        else:
            with open("./modelsData/LDA/SKLearnLDA_59k_100it.pkl", "rb") as f:
                self.ldaModel = pickle.load(f)
            with open("./modelsData/LDA/SKLearnLDAModel_59k_100it.pkl", 'rb') as f:
                self.lda = pickle.load(f)
number_topics = 3
random_seed = 2

# Create and fit the LDA model
lda = LDA(n_components=number_topics, random_state=random_seed, verbose=1)
lda.fit(count_data)

# Define the word list
number_words = 10
words = count_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("\nTopic #%d" % topic_idx)
    print(" ".join([words[i] for i in topic.argsort()[:-number_words:-1]]))

pyLDAvis.enable_notebook()
sklearn_lda.prepare(lda, count_data, count_vectorizer)

# Calculate the topic distributions for all articles in the training and test sets
X_train = lda.transform(count_data)
X_test = lda.transform(count_test)

# Find the topic given the topic distribution in the training set
Topic_train = np.argmax(X_train, axis=1)
Topic_train_df = pd.DataFrame(Topic_train, columns=["Topic"])

# Find the topic given the topic distribution in the test set
Topic_test = np.argmax(X_test, axis=1)
Topic_test_df = pd.DataFrame(Topic_test, columns=["Topic"])

# Reset the index of the df for Topic_test
Topic_test_df.index += df_test.index[0]

# Allocate the topic to the original dataframe
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

%%time
from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis

LDAvis_data_filepath = os.path.join('./ldavis_prepared_' + str(number_topics))

# this is a bit time consuming - make the if statement True
# if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    # pickle files must be written and read in binary mode
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_' + str(number_topics) + '.html')

from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3,
def buildsklearnselectedworks(so: SearchObject, bagsofsentences: list):
    """
    see: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also: https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency strictly
        higher than the given threshold (corpus-specific stop words).
    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency strictly
        lower than the given threshold. This value is also called cut-off in the literature.

    see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as
    "corpus-specific stop words". For example:
        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".
    The default max_df is 1.0, which means "ignore terms that appear in more than 100% of
    the documents". Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:
        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".
    The default min_df is 1, which means "ignore terms that appear in less than 1 document".
    Thus, the default setting does not ignore any terms.

    notes:
        a maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores
    """

    activepoll = so.poll
    vv = so.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    activepoll.statusis('Running the LDA vectorizer')

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                    min_df=settings['minfreq'],
                                    max_features=settings['maxfeatures'])
    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                         max_iter=settings['iterations'],
                                         learning_method='online',
                                         learning_offset=50.,
                                         random_state=0)
    ldamodel.fit(ldavectorized)

    visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
    # pyLDAvis.save_html(visualisation, 'ldavis.html')
    ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
    storevectorindatabase(so, ldavishtmlandjs)

    return ldavishtmlandjs
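# A standalone illustration of the max_df / min_df behaviour described in the docstring of
# buildsklearnselectedworks (the toy corpus is made up): with four documents, max_df=0.5
# drops terms found in more than half of them and min_df=2 drops terms found in only one.
from sklearn.feature_extraction.text import CountVectorizer

toy_corpus = [
    "the senate debates the law",
    "the law and the city",
    "the city honours the poet",
    "the poet writes of war",
]
cv = CountVectorizer(max_df=0.5, min_df=2)
cv.fit(toy_corpus)
# 'the' appears in all four documents and is removed by max_df; words that occur in a
# single document are removed by min_df, leaving only ['city', 'law', 'poet'].
print(sorted(cv.vocabulary_))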
def _lda4(table, input_col, topic_name='topic', num_voca=1000, num_topic=5,
          num_topic_word=10, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join, stop_words='english',
                                        max_df=0.95, min_df=2, max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)

    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[['index', 'vocabularies_weights', 'vocabularies', 'weights']]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() + 1 for i in range(len(corpus))]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood, perplexity=perplexity, params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def show_pyLDAvis_plots(self):
    from pyLDAvis.sklearn import prepare
    # Return the prepared data so a notebook cell can render the interactive plot.
    return prepare(self.model['lda_tf'], self.model['dtm_tf'], self.model['tf_vectorizer'])
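# show_pyLDAvis_plots reads a fitted model, its document-term matrix and the vectorizer
# out of self.model. A minimal sketch of how that dictionary could be populated (the
# helper name, corpus and topic count are assumptions, not part of the original class):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def build_lda_model_dict(docs, n_topics=10):
    tf_vectorizer = CountVectorizer(stop_words='english')
    dtm_tf = tf_vectorizer.fit_transform(docs)
    lda_tf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_tf.fit(dtm_tf)
    # Keys match what show_pyLDAvis_plots expects.
    return {'lda_tf': lda_tf, 'dtm_tf': dtm_tf, 'tf_vectorizer': tf_vectorizer}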