def pretrained_doc2vec(texts, labels=[],
                       pretrained_emb="saved_models/apnews_dbow/doc2vec.bin",
                       epochs=10, workers=3, lr_reduce=0.002,
                       rm_training_data=False, save_model=True,
                       save_dir='saved_models', filename='',
                       save_as_word2vec=True, **kwargs):
    """
    Train a Doc2Vec model initialised from a pretrained embedding file.

    `texts`: documents wrapped by `LabeledLineSentence`
    `labels`: labels paired with `texts`; both are reshuffled together
        after every epoch (the default list is only rebound, never mutated)
    `pretrained_emb`: str, path forwarded to `Doc2Vec(pretrained_emb=...)`
    `epochs`: int, number of single-epoch `train` calls
    `workers`: int, forwarded to `Doc2Vec`
    `lr_reduce`: float, amount subtracted from the model's `alpha` per epoch
    `rm_training_data`: bool, call `delete_temporary_training_data`
        (keeping doctag vectors and inference) once training is done
    `save_model`: bool, save the model under `save_dir`
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, model file name; a default is generated when empty
    `save_as_word2vec`: bool, also export in word2vec format
        (only when `save_model` is set)
    `**kwargs`: extra keyword arguments for `Doc2Vec`

    Returns the trained model.
    """
    sentences = LabeledLineSentence(texts, labels)
    d2v = Doc2Vec(pretrained_emb=pretrained_emb, workers=workers, **kwargs)
    d2v.build_vocab(sentences)

    for _ in range(epochs):
        d2v.train(sentences,
                  total_examples=d2v.corpus_count,
                  epochs=1,
                  start_alpha=d2v.alpha)
        # decrease the learning rate
        d2v.alpha -= lr_reduce
        # present the documents in a new order on the next pass
        texts, labels = shuffle(texts, labels)
        sentences = LabeledLineSentence(texts, labels)

    if rm_training_data:
        print(
            'Deleting training data - keeping doctag vectors and inference...')
        d2v.delete_temporary_training_data(keep_doctags_vectors=True,
                                           keep_inference=True)

    if not save_model:
        return d2v

    if len(filename) == 0:
        model_name = 'pretrained_d2v_{}epochs_'.format(epochs)
    else:
        model_name = filename
    full_path = save_folder_file(save_dir, model_name, ext='.model',
                                 optional_folder='WordEmbeddings')
    if save_as_word2vec:
        w2v_name = 'pretrained_d2v_to_w2v_{}epochs_'.format(epochs)
        w2v_path = save_folder_file(save_dir, w2v_name, ext='.word2vec',
                                    optional_folder='WordEmbeddings')
        d2v.save_word2vec_format(w2v_path)
    d2v.save(full_path)
    return d2v
def LSI(self, num_topics=10, print_params=True, save_model=True,
        save_dir='saved_models', filename='', **kwargs):
    """
    Topic modeling with Latent Semantic Indexing.

    `num_topics`: int, number of topics passed to `models.LsiModel`
    `print_params`: bool, print the parameters used
    `save_model`: bool, save the fitted model
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; a default is generated when empty
    `**kwargs`: extra keyword arguments for `models.LsiModel`

    Reads `self.bow`, `self.gensim_dict` and `self.tfidf`.
    Returns the fitted LSI model.
    """
    fitted = models.LsiModel(self.bow,
                             id2word=self.gensim_dict,
                             num_topics=num_topics,
                             **kwargs)
    print('Running LSI model...\n')

    if print_params:
        print('Parameters used in model:')
        summary = 'Number of topics: {}\nTFIDF transformation: {}\n'
        print(summary.format(num_topics, self.tfidf))

    if not save_model:
        return fitted

    if len(filename) == 0:
        target_name = 'LSI_Params_NT{}_TFIDF{}_'.format(num_topics, self.tfidf)
    else:
        target_name = filename
    full_path = save_folder_file(save_dir, target_name, ext='.model',
                                 optional_folder='LSI')
    fitted.save(full_path)
    print('Saving LSI model to: \n{}\n'.format(full_path))
    return fitted
def top_texts_per_topic(self, df_dominant_topic, save_output=True,
                        save_dir='results', filename=''):
    """
    Most representative statement for each topic.

    Helps to make sense of each topic (for labeling): for every value of
    `Dominant_Topic` the single row with the highest
    `Percent_Contribution` is kept.

    `df_dominant_topic`: DataFrame with at least the columns
        `Dominant_Topic` and `Percent_Contribution`
    `save_output`: bool, write the table to a csv file
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'top_texts_per_topic' when empty

    Returns a DataFrame with one row per topic and a fresh 0..n-1 index.
    """
    # One best row per group; the empty seed frame keeps the result a
    # DataFrame even when there are no groups at all.
    pieces = [pd.DataFrame()]
    for _, topic_rows in df_dominant_topic.groupby('Dominant_Topic'):
        ranked = topic_rows.sort_values(by='Percent_Contribution',
                                        ascending=False)
        pieces.append(ranked.head(1))

    best_per_topic = pd.concat(pieces, axis=0)
    best_per_topic.reset_index(drop=True, inplace=True)

    if save_output:
        out_name = filename if len(filename) != 0 else 'top_texts_per_topic'
        full_path = save_folder_file(save_dir, out_name, ext='.csv')
        print('Saving the table to: {}'.format(full_path))
        best_per_topic.to_csv(full_path, index=False)

    return best_per_topic
def toolkit_cv_plot(self, varying_params, constant_params, save_plot=True,
                    save_dir='results/model_validation', filename='',
                    ext='.pdf', size=(20, 15), **kwargs):
    """
    Use tmtoolkit for parameter tuning based on a wider variety of measures.

    `varying_params`: sized collection of parameter sets, forwarded to
        `tm_gensim.evaluate_topic_models` (one model per entry)
    `constant_params`: parameters shared by every model, forwarded as-is
    `save_plot`: bool, save the evaluation figure
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'tmtoolkit_CV_' when empty
    `ext`: str, figure extension including the dot
    `size`: tuple, figure size forwarded to `plot_eval_results`
    `**kwargs`: extra keyword arguments for `evaluate_topic_models`

    Reads `self.gensim_dict`, `self.bow` and `self.text`.
    Returns the evaluation results grouped by 'num_topics'.
    """
    # NOTE: this changes the process-wide warning filters.
    warnings.filterwarnings("ignore", category=UserWarning)

    print('evaluating {} topic models'.format(len(varying_params)))
    eval_results = tm_gensim.evaluate_topic_models(
        (self.gensim_dict, self.bow),
        varying_params,
        constant_params,
        coherence_gensim_texts=self.text,
        **kwargs)

    results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
    plot_eval_results(results_by_n_topics,
                      xaxislabel='num topics',
                      title='Evaluation results',
                      figsize=size)

    if save_plot:
        # Only fall back to the default name when the caller gave none;
        # previously a caller-supplied filename was silently discarded.
        if len(filename) == 0:
            filename = 'tmtoolkit_CV_'
        full_path = save_folder_file(save_dir, filename, ext=ext,
                                     optional_folder='convergence_plots')
        plt.savefig(full_path)

    return results_by_n_topics
def LDAvis(self, model, save_plot=True, save_dir='results', filename='',
           ext='.html', show_plot=True, is_notebook=True, mds='mds',
           sort_topics=False, **kwargs):
    """
    Use pyLDAvis to visualize clustering.

    `model`: fitted topic model forwarded to `gensimvis.prepare`
    `save_plot`: bool, save the visualization
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'LDAvis_plot_' when empty
    `ext`: str, only '.html' is written; any other value prints a notice
    `show_plot`: bool, display the visualization
    `is_notebook`: bool, when showing, return the prepared object (for a
        notebook to render) instead of calling `pyLDAvis.show`
    `mds`, `sort_topics`, `**kwargs`: forwarded to `gensimvis.prepare`

    Returns the prepared visualization only when both `show_plot` and
    `is_notebook` are true; otherwise returns None.
    """
    print('Rendering visualization...')
    prepared = gensimvis.prepare(model, self.bow, self.gensim_dict,
                                 mds=mds, sort_topics=sort_topics, **kwargs)

    if save_plot:
        out_name = 'LDAvis_plot_' if len(filename) == 0 else filename
        full_path = save_folder_file(save_dir, out_name, ext=ext,
                                     optional_folder='LDAvis_plots')
        if ext != '.html':
            print('File extension not supported')
        else:
            pyLDAvis.save_html(prepared, full_path)

    if not show_plot:
        return None
    if is_notebook:
        # returning the object lets the notebook render it
        return prepared
    pyLDAvis.show(prepared)
def HDP(self, print_params=True, save_model=True, save_dir='saved_models',
        filename='', **kwargs):
    """
    Estimate a 'good' number of topics to set, based on the data.

    `print_params`: bool, print the parameters used
    `save_model`: bool, save the fitted model
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; a default is generated when empty
    `**kwargs`: extra keyword arguments for `models.HdpModel`

    Reads `self.bow`, `self.gensim_dict` and `self.tfidf`.
    Returns the fitted HDP model.
    """
    fitted = models.HdpModel(self.bow, id2word=self.gensim_dict, **kwargs)
    print('Inferring number of topics with Hierarchical Dirichlet Process...\n')

    if print_params:
        print('Parameters used in model:')
        print('TFIDF transformation: {}\n'.format(self.tfidf))

    if not save_model:
        return fitted

    if len(filename) == 0:
        target_name = 'HDP_Params_TFIDF{}_'.format(self.tfidf)
    else:
        target_name = filename
    full_path = save_folder_file(save_dir, target_name, ext='.model',
                                 optional_folder='HDP')
    fitted.save(full_path)
    print('Saving HDP model to: \n{}\n'.format(full_path))
    return fitted
def format_topics_sentences(self, save_output=True, save_dir='results',
                            filename=''):
    """
    Find the dominant topic in each statement.

    For every document in `self.corpus` the topic with the highest
    percentage contribution is recorded together with that topic's keywords.

    `save_output`: bool, write the table to a csv file
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'dominant_topic_per_text_'
        when empty

    Reads `self.model`, `self.corpus` and `self.texts`.
    NOTE(review): each item of `self.model[self.corpus]` is indexed with
    `[0]` to get the (topic, probability) pairs, which presumably matches a
    model built with `per_word_topics=True` - confirm against the caller.

    Returns a DataFrame with the columns `index`, `Dominant_Topic`
    (1-based topic number), `Percent_Contribution`, `Important_Keywords`
    followed by the original text column(s) of `self.texts`.
    """
    # Collect plain tuples and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = []
    for doc_result in self.model[self.corpus]:
        # max() returns the first of equal maxima, the same pick as a
        # stable descending sort followed by taking element 0.
        dominant = max(doc_result[0], key=lambda pair: pair[1], default=None)
        if dominant is None:
            # Keep a placeholder row so the table stays aligned with
            # self.texts when a document has no topic assigned.
            records.append((float('nan'), float('nan'), ''))
            continue
        topic_num, prop_topic = dominant
        keywords = ", ".join(
            word for word, _ in self.model.show_topic(topic_num))
        records.append((int(topic_num) + 1, round(prop_topic, 4), keywords))

    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Percent_Contribution',
                 'Important_Keywords'])

    # Add original text to the end of the output
    sent_topics = pd.concat([sent_topics_df, self.texts], axis=1)
    topics_df = sent_topics.reset_index()

    if save_output:
        if len(filename) == 0:
            filename = 'dominant_topic_per_text_'
        full_path = save_folder_file(save_dir, filename, ext='.csv')
        print('Saving the table to: {}'.format(full_path))
        topics_df.to_csv(full_path, index=False)

    return topics_df
def gensimBOW(self, gensim_dict, save_matrix=True,
              save_dir='data/corpus_data', filename=''):
    """
    Make a gensim Bag-of-Words representation matrix.

    `gensim_dict`: object exposing `doc2bow`, applied to every entry of
        `self.data`
    `save_matrix`: bool, serialize the corpus with `corpora.MmCorpus`
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'BOWmat' when empty

    Returns the list of bag-of-words documents.
    """
    bow_corpus = []
    for document in self.data:
        bow_corpus.append(gensim_dict.doc2bow(document))

    if not save_matrix:
        return bow_corpus

    matrix_name = 'BOWmat' if len(filename) == 0 else filename
    full_path = save_folder_file(save_dir, matrix_name, ext='.mm')
    # store to disk, for later use
    corpora.MmCorpus.serialize(full_path, bow_corpus)
    print('Saving .mm matrix to {}\n'.format(full_path))
    return bow_corpus
def topic_distribution(self, df_dominant_topic, top_text_topic,
                       save_output=True, save_dir='results', filename=''):
    """
    Topic distribution across statements.

    Volume and distribution of topics to see how spread out it is.

    `df_dominant_topic`: DataFrame with a `Dominant_Topic` column
        (one row per document)
    `top_text_topic`: DataFrame with the columns `Dominant_Topic` and
        `Important_Keywords`
    `save_output`: bool, write the table to a csv file
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'doc_distribution_in_topics'
        when empty

    Returns `top_text_topic`'s topic/keyword columns left-joined with the
    per-topic `Num_Documents` and `Perc_Documents`.
    """
    # Number of documents for each topic, and the share of the total
    docs_per_topic = df_dominant_topic['Dominant_Topic'].value_counts()
    share_per_topic = (docs_per_topic / docs_per_topic.sum()).round(5)

    # Topic number moves from the index into a regular column
    topic_stats = pd.DataFrame({
        'Dominant_Topic': docs_per_topic.index,
        'Num_Documents': docs_per_topic.to_numpy(),
        'Perc_Documents': share_per_topic.to_numpy(),
    })

    # Topic number and keywords, merged on topic number
    keywords_by_topic = top_text_topic[['Dominant_Topic',
                                        'Important_Keywords']]
    df_dominant_topics = keywords_by_topic.merge(topic_stats,
                                                 on='Dominant_Topic',
                                                 how='left')

    if save_output:
        if len(filename) == 0:
            out_name = 'doc_distribution_in_topics'
        else:
            out_name = filename
        full_path = save_folder_file(save_dir, out_name, ext='.csv')
        print('Saving the table to: {}'.format(full_path))
        df_dominant_topics.to_csv(full_path, index=False)

    return df_dominant_topics
def gensimDict(self, min_word_len=3, prop_docs=0.8, compact=True,
               save_dict=True, save_dir='data/corpus_data', filename='',
               keep_n=None):
    """
    Build a gensim dictionary from `self.data` and filter extreme tokens.

    `min_word_len`: int, forwarded to `filter_extremes` as `no_below`,
        i.e. the minimum number of documents a word must appear in to be
        kept. NOTE: despite the parameter name this is a document-frequency
        threshold, not a character-length filter (removing short words
        should already be done in preprocessing).
    `prop_docs`: float (0 to 1), forwarded as `no_above`: max proportion of
        docs a word can appear in before being removed
    `compact`: bool, call `compactify()` to remove gaps in the id sequence
    `save_dict`: bool, are we saving this object
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; a default is generated when empty
    `keep_n`: int, maximum number of words to keep during filtering
        (None to keep all)

    Returns the filtered dictionary.
    """
    # build gensim dictionary of corpus
    dict_words = corpora.Dictionary(self.data)

    # The message now states what the filter really does (the old text
    # spoke of character length and printed the fraction as a percentage).
    print('Removing words present in fewer than {} documents, and '
          'words present in more than {:g}% of documents\n'.format(
              min_word_len, prop_docs * 100))
    dict_words.filter_extremes(no_below=min_word_len,
                               no_above=prop_docs,
                               keep_n=keep_n)

    if compact:
        # remove gaps in id sequence after words that were removed
        dict_words.compactify()
        print('Removing gaps in indices caused by preprocessing...\n')

    if save_dict:
        if len(filename) == 0:
            filename = 'Gensim_dict_Params_MWL{}_PD{}_'.format(min_word_len,
                                                               prop_docs)
        full_path = save_folder_file(save_dir, filename, ext='.dict')
        # store the dictionary for future reference
        dict_words.save(full_path)
        print('Saving gensim dictionary to {}\n'.format(full_path))

    return dict_words
def convergence_plot(self, log_file, eval_every=5, save_plot=True,
                     save_dir='results/model_validation', filename='',
                     ext='.pdf', size=(12, 9), show_plot=True):
    """
    Plot perplexity against iteration from a training log file, to see
    whether the model converged.

    `log_file`: str, path of the log; every line matching
        '<float> per-word ... <float> perplexity' contributes one point
    `eval_every`: int, spacing between logged evaluations, used to build
        the x axis (0, eval_every, 2 * eval_every, ...)
    `save_plot`: bool, save the figure
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; defaults to 'perplex_convergence_plot_'
        when empty
    `ext`: str, figure extension including the dot
    `size`: tuple, figure size
    `show_plot`: bool, show the figure; otherwise it is closed

    Returns None.
    """
    pattern = re.compile(r'(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity')

    perplexity = []
    # The context manager closes the log; it used to be left open.
    with open(log_file) as log_handle:
        for line in log_handle:
            found = pattern.findall(line)
            if found:
                # found[0] is (per-word bound, perplexity) for this line
                perplexity.append(float(found[0][1]))

    iterations = list(range(0, len(perplexity) * eval_every, eval_every))

    plt.figure(figsize=size)
    plt.plot(iterations, perplexity)
    plt.ylabel("Perplexity", fontsize=15)
    plt.xlabel("Iteration", fontsize=15)
    plt.title("Topic Model Convergence", fontsize=20)
    plt.grid()

    if save_plot:
        # Honour a caller-supplied name; it used to be overwritten.
        if len(filename) == 0:
            filename = 'perplex_convergence_plot_'
        full_path = save_folder_file(save_dir, filename, ext=ext,
                                     optional_folder='convergence_plots')
        plt.savefig(full_path)

    if show_plot:
        plt.show()
    else:
        plt.close()
def plotTSNE(
        self,
        n_top_words=8,  # number of keywords we show
        save_dir='visualization',
        filename='',
        ext='.html'):
    """
    Dimension reduction plot using T-SNE.

    Automatically saves - the plot is not displayed automatically.
    Output is a html file with the plot.

    `n_top_words`: int, number of keywords shown per topic
    `save_dir`: str, base folder handed to `save_folder_file`
    `filename`: str, file name; a descriptive default is generated when
        empty
    `ext`: str, output extension including the dot

    Reads `self.X_topics`, `self.num_example`, `self.tsne_model`,
    `self.model.components_`, `self.tf_vectorizer`, `self.cleaned`,
    `self.n_components` and `self.threshold`.
    """
    # 20 colors
    colormap = np.array([
        "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
        "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
        "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
        "#17becf", "#9edae5"
    ])

    X_topics = self.X_topics
    sample_size = self.num_example
    tsne_model = self.tsne_model
    topic_word = self.model.components_  # all topic words
    vocab = self.tf_vectorizer.get_feature_names()
    cleaned = self.cleaned

    # index of the strongest topic for every row of X_topics
    _model_keys = [X_topics[row].argmax() for row in range(X_topics.shape[0])]

    # top keywords of each topic, strongest first, joined into one label
    vocab_array = np.array(vocab)
    topic_summaries = []
    for topic_dist in topic_word:
        strongest = vocab_array[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        topic_summaries.append(' '.join(strongest))

    df = pd.DataFrame(data={
        'content': cleaned[:sample_size],
        'topic_key': _model_keys[:sample_size]
    })
    # NOTE(review): this data source is built but never handed to the
    # scatter glyph below - confirm whether the hover fields can resolve.
    source = bp.ColumnDataSource(df)

    # from here on the number of examples is the number of rows of X_topics
    num_example = len(X_topics)

    # plot
    title_template = ("[t-SNE visualization of LDA model trained on {} statements, "
                      "{} topics, thresholding at {} topic probability, ({} data "
                      "points and top {} words)")
    title = title_template.format(X_topics.shape[0], self.n_components,
                                  self.threshold, num_example, n_top_words)

    plot_lda = bp.figure(
        plot_width=1400,
        plot_height=1100,
        title=title,
        tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    point_colors = colormap[_model_keys][:num_example]
    plot_lda.scatter(x=tsne_model[:, 0], y=tsne_model[:, 1],
                     color=point_colors)

    # use the coordinate of the first text seen for each topic as the
    # coordinate of that topic's crucial words
    n_topics = X_topics.shape[1]
    topic_coord = np.full((n_topics, 2), np.nan)
    first_position = {}
    for position, key in enumerate(_model_keys):
        first_position.setdefault(key, position)
    for key, position in first_position.items():
        topic_coord[key] = tsne_model[position]

    # plot crucial words
    for topic_idx in range(n_topics):
        plot_lda.text(topic_coord[topic_idx, 0], topic_coord[topic_idx, 1],
                      [topic_summaries[topic_idx]])

    # hover tools
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = {"content": "@content - topic: @topic_key"}

    if len(filename) == 0:
        name_template = ("{}_statements_"
                         "{}_topics_{}_topic_prob_threshold_"
                         "{}_data_pts_and_top_{}_words")
        filename = name_template.format(X_topics.shape[0], self.n_components,
                                        self.threshold, num_example,
                                        n_top_words)

    full_path = save_folder_file(save_dir, filename, ext=ext)
    print('T-SNE html output saved to `{}`.\n'.format(full_path))
    # save the plot
    save(plot_lda, full_path)
def plot_groups_w2v(w2v, size=(18, 10), n_clusters=4, max_iter=100,
                    init='k-means++', max_idx=200,
                    title='2D Rendition of Keywords, by Category',
                    random_state=0, with_adjust_text=False,
                    group_color_list=None, **kwargs):
    """
    Cluster word vectors with K-means and plot a 2D PCA projection of a
    random sample of them, coloured by cluster.

    `w2v`: word-vector model exposing `vocab` (mapping of words) and
        `w2v[word]` lookup  # NOTE(review): looks like the gensim < 4
        KeyedVectors API - confirm
    `size`: tuple, figure size
    `n_clusters`, `max_iter`, `init`: forwarded to `KMeans`
    `max_idx`: int, maximum number of words to plot (None plots all)
    `title`: str, plot title
    `random_state`: int, seed used when sampling the words to plot
    `with_adjust_text`: bool, collect the annotations and pass them to
        `adjust_text`
    `group_color_list`: sequence with one colour per cluster; when missing
        or empty, colours are drawn at random from matplotlib's Tableau
        palette (fewer than 11 clusters) or CSS4 palette
    `**kwargs`: extra keyword arguments for `KMeans`

    The figure is always saved as 'class_w2v' (pdf) under
    'results/model_validation' via `save_folder_file`, then shown.
    """
    words_label = list(w2v.vocab.keys())
    words_np = [w2v[word] for word in words_label]
    print('Added {} words. Shape {}'.format(len(words_np), np.shape(words_np)))

    # Apply K-means clustering on the model. A single fit is used: fitting
    # a second time (as before) could leave the model's cluster centres
    # out of sync with the labels taken from the first fit.
    kmeans_model = KMeans(n_clusters=n_clusters, init=init,
                          max_iter=max_iter, **kwargs)
    labels = kmeans_model.fit_predict(words_np).tolist()

    # Never ask for more samples than there are words.
    n_samples = None if max_idx is None else min(max_idx, len(words_np))
    words_np, labels, words_label = shuffle(words_np, labels, words_label,
                                            n_samples=n_samples,
                                            random_state=random_state)

    pca = PCA(n_components=2)
    pca.fit(words_np)
    datapoint = pca.transform(words_np)

    if group_color_list is None or len(group_color_list) == 0:
        # default to using Tableau colors, CSS4 colours when there are
        # more clusters than Tableau entries
        if n_clusters < 11:
            palette = mcolors.TABLEAU_COLORS
        else:
            palette = mcolors.CSS4_COLORS
        group_color_list = np.random.choice(list(palette.values()),
                                            n_clusters, replace=False)

    color = [group_color_list[lab] for lab in labels]

    plt.figure(figsize=size)
    texts = []
    for index, vec in enumerate(datapoint):
        x, y = vec[0], vec[1]
        plt.scatter(x, y, s=100, c=color[index], edgecolors='#000000')
        if with_adjust_text:
            texts.append(plt.annotate(words_label[index], xy=(x, y), size=15))
        else:
            plt.annotate(words_label[index], xy=(x, y), size=25)

    plt.tick_params(labelsize=15)
    plt.xticks(rotation=45)
    plt.title(title, fontsize=20)
    if with_adjust_text:
        adjust_text(texts)

    filename = 'class_w2v'
    full_path = save_folder_file('results/model_validation', filename,
                                 ext='.pdf', optional_folder='CV_score_plots')
    plt.savefig(full_path)
    plt.show()
def score_plot(self, tuning_df, save_plot=True,
               save_dir='results/model_validation', ext='.pdf',
               size=(12, 5), is_notebook=True,
               tune_params=['eta', 'decay'],
               score=['coherence', 'perplexity'],
               pref=['higher', 'lower']):
    """
    Plots showing coherence and perplexity measures vs. number of topics
    used in tuning process.

    `tuning_df`: DataFrame with a `topic_num` column, the two columns named
        in `tune_params` and one column per entry of `score`
    `save_plot`: bool, save the figure
    `save_dir`: str, base folder handed to `save_folder_file`
    `ext`: str, figure extension including the dot
    `size`: tuple, figure size
    `is_notebook`: bool, show the figure
    `tune_params`: two column names; one line is drawn per combination of
        their values
    `score`: column names to plot, one subplot each
    `pref`: 'higher'/'lower' hint shown in each subplot title

    The default lists are only read, never modified. When saving, the file
    name is built from `self.start`, `self.max_num_topics` and `self.step`,
    which `compare_scores` sets.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single score.
    fig, axes = plt.subplots(1, len(score), sharex=True, figsize=size,
                             squeeze=False)
    panels = list(axes.flatten())

    # One line per (param1, param2) combination on every panel.
    params = []
    for param1_name, param1_df in tuning_df.groupby(tune_params[0]):
        for param2_name, param2_df in param1_df.groupby(tune_params[1]):
            for i, ax in enumerate(panels):
                ax.plot(param2_df["topic_num"], param2_df[score[i]])
            params.append('{}: {}, {}: {}'.format(tune_params[0], param1_name,
                                                  tune_params[1], param2_name))

    # Axis decoration does not depend on the parameter combination, so it
    # is applied once per panel instead of once per plotted line.
    for i, ax in enumerate(panels):
        ax.set_xlabel('Number of Topics', fontsize=15)
        ax.set_ylabel('{}'.format(score[i]), fontsize=15)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.tick_params(axis='both', which='major', labelsize=15)
        ax.set_title('{} ({} is better)'.format(score[i], pref[i]))
        ax.grid(True)

    fig.text(0.5, -0.03, 'Note the different y axes',
             ha='center', va='center', fontsize=14)
    panels[0].legend(params, loc='upper center', bbox_to_anchor=(1.1, 1.35),
                     shadow=True, ncol=4)
    fig.suptitle('Validation score plots', fontsize=20)

    # Save before showing: once a figure has been shown, saving through
    # pyplot afterwards can produce an empty image.
    if save_plot:
        filename = 'validation_from{}_to{}_by{}_'.format(self.start,
                                                         self.max_num_topics,
                                                         self.step)
        full_path = save_folder_file(save_dir, filename, ext=ext,
                                     optional_folder='CV_score_plots')
        fig.savefig(full_path)

    if is_notebook:
        plt.show()
def compare_scores(self, max_num_topics=20, start=2, step=2, etas=['auto'],
                   decays=[0.7], random_state=919, save_output=True,
                   save_dir='results/model_validation', print_params=False,
                   eval_every=5, **kwargs):
    """
    Compute c_v coherence and perplexity for various number of topics

    `max_num_topics` : int, Max number of topics to test (exclusive)
    `start`: int, Min number of topics to test
    `step`: int, increased by stepsize
    `etas`: list, `eta` values to try (only read, never modified)
    `decays`: list, `decay` values to try (only read, never modified)
    `random_state`: int, seed to reproduce
    `save_output`: bool, save output?
    `save_dir`: str, folder to save the results, handed to
        `save_folder_file`
    `print_params`: bool, whether to output details
    `eval_every`: int, calculates perplexity every _ iterations
        (small num -> slow)
    `**kwargs`: extra keyword arguments for `self.LDA`

    Returns, in this order:
    `model_list` : list of LDA topic models used for tuning
    `score_df`: DataFrame with a column for each tuning parameter
        (`eta`, `decay`, `topic_num`), `coherence` and `perplexity`
    `score_dict`: dict with the keys 'perplexity' and 'coherence', each a
        list aligned with `model_list`

    NOTE(review): the 'perplexity' value is whatever
    `model.log_perplexity(self.bow)` returns - confirm whether that is the
    scale callers expect.
    """
    # NOTE: this changes the process-wide warning filters.
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Kept on the instance: score_plot builds its file name from
    # start / max_num_topics / step.
    self.start = start
    self.max_num_topics = max_num_topics
    self.step = step
    self.print_params = print_params
    self.eval_every = eval_every

    model_list = []
    records = []
    p_score = []  # perplexity, one entry per model
    c_score = []  # coherence, one entry per model

    print('\nTesting topics {} to {} for:\n'.format(start,
                                                    (max_num_topics - step)))

    for eta in etas:
        for decay in decays:
            print('\n {} eta and {} decay...\n'.format(eta, decay))
            for num_topics in range(start, max_num_topics, step):
                model = self.LDA(print_params=self.print_params,
                                 num_topics=num_topics,
                                 eta=eta,
                                 decay=decay,
                                 eval_every=self.eval_every,
                                 save_model=False,
                                 random_state=random_state,
                                 **kwargs)
                model_list.append(model)

                coherencemodel = CoherenceModel(model=model,
                                                corpus=self.bow,
                                                texts=self.text,
                                                coherence='c_v')
                coherent = coherencemodel.get_coherence()
                perplex = model.log_perplexity(self.bow)

                # Each score goes to its own list; they used to be
                # appended crosswise, swapping the two output columns.
                c_score.append(coherent)
                p_score.append(perplex)
                records.append((eta, decay, num_topics, coherent, perplex))

    score_df = pd.DataFrame(records,
                            columns=['eta', 'decay', 'topic_num',
                                     'coherence', 'perplexity'])
    score_df.replace(to_replace=[None], value='none', inplace=True)

    if save_output:
        filename = 'Coherence_Perplexity_from{}_to{}_by{}'.format(
            start, max_num_topics, step)
        full_path = save_folder_file(save_dir, filename, ext='.csv',
                                     optional_folder='scores')
        score_df.to_csv(full_path, index=False)

    score_dict = {'perplexity': p_score, 'coherence': c_score}
    return model_list, score_df, score_dict
def freq_plot(
        self,
        top_n=50,
        width=1.0,
        c_scale='Portland',
        title='Top word frequencies (after cleanup and lemmatization)',
        plotname='word_count_bar',
        image_format='png',
        save_plot=True,
        save_dir='visualization',
        filename='',
        is_notebook=True,
        **kwargs):
    """
    Interactive bar frequency plot

    `top_n`: int, number of most frequent words to plot
    `width`: float, width of the outline drawn around each bar
    `c_scale`: str, colour scale name applied to the bars
    `title`: str, title to display on image
    `plotname`: str, for notebook display
    `image_format`: str, image extension, of the form 'png', 'pdf', etc -
        NO dot
    `save_plot`: bool, is the plot saved
    `save_dir`: str, folder to save plot, handed to `save_folder_file`
        NOTE: orca must be installed to save a still image of the plot
    `filename`: str, filename for the still image to save; a default is
        generated when empty
    `is_notebook`: bool, is this displayed on a notebook?
    `**kwargs`: extra keyword arguments for `go.Figure`

    Word/count pairs come from `self.order_count()`.
    """
    top_counts = self.order_count()[:top_n]
    sorted_word = [pair[0] for pair in top_counts]
    sorted_freq = [pair[1] for pair in top_counts]

    bar_outline = dict(color='rgb(0,0,0)', width=width)
    bar_marker = dict(colorscale=c_scale, color=sorted_freq, line=bar_outline)
    bars = go.Bar(x=sorted_word, y=sorted_freq, marker=bar_marker,
                  text='Word count')

    fig = go.Figure(data=[bars], layout=go.Layout(title=title), **kwargs)

    if is_notebook:
        iplot(fig, filename=plotname, image=image_format)

    if not save_plot:
        return

    if len(filename) == 0:
        still_name = 'word_frequency_barplot_top{}_words_'.format(top_n)
    else:
        still_name = filename
    full_path = save_folder_file(save_dir, still_name,
                                 ext='.' + image_format)
    print('Pyplot word frequency bar chart saved to `{}`.\n'.format(
        full_path))
    pio.write_image(fig, full_path)
def cloud_plot(self, size=(9, 6), background_color="black", max_words=1000,
               max_font_size=60, min_font_size=5, collocations=False,
               colormap="coolwarm", plot_title="Most common words",
               plot_fontsize=30, interpolation='lanczos', save_plot=True,
               save_dir='visualization', filename='', image_format='.png',
               is_notebook=True, **kwargs):
    """
    Word cloud of the words counted in `self.count_dict`.

    `size`: tuple of ints, image size
    `background_color`: str, colour name
    `max_words`: int, maximum number of words to plot
    `max_font_size`: int, maximum font size
    `min_font_size`: int, minimum font size
    `collocations`: bool, * set to False * to avoid duplicates
    `colormap`: str, colour scheme for letters (see matplotlib colours)
    `plot_title`: str, title
    `plot_fontsize`: int, font size of the title
    `interpolation`: str, smoother, example of possible choices:
        'nearest', 'bilinear', 'hamming', 'quadric', 'lanczos'
    `save_plot`: bool, is the plot saved
    `save_dir`: str, folder to save plot, handed to `save_folder_file`
    `filename`: str, filename for the still image to save; defaults to
        'wordcloud_plot_' when empty
    `image_format`: str, extension, of the form '.png', '.pdf', etc
    `is_notebook`: bool, is this displayed on a notebook?
    `**kwargs`: extra keyword arguments for the `WordCloud` constructor

    Side effect: stores the joined text on `self.text_cloud`.
    """
    # every word repeated as many times as it was counted
    self.text_cloud = " ".join(self.count_dict.elements())

    plt.figure(figsize=size)
    # Extra options belong to the constructor; `generate` only takes text.
    wc = WordCloud(background_color=background_color,
                   max_words=max_words,
                   max_font_size=max_font_size,
                   min_font_size=min_font_size,
                   collocations=collocations,
                   colormap=colormap,
                   **kwargs)
    wc.generate(self.text_cloud)

    plt.title(plot_title, fontsize=plot_fontsize)
    plt.margins(x=0.5, y=0.25)
    plt.axis('off')
    plt.imshow(wc, interpolation=interpolation)

    # Save before showing: once a figure has been shown, saving through
    # pyplot afterwards can produce an empty image.
    if save_plot:
        if len(filename) == 0:
            filename = 'wordcloud_plot_'
        full_path = save_folder_file(save_dir, filename, ext=image_format)
        print('Wordcloud plot saved to `{}`.\n'.format(full_path))
        # The titled figure is what ends up on disk; a separate
        # `wc.to_file` to the same path was immediately overwritten.
        plt.savefig(full_path)

    if is_notebook:
        plt.show()
def LDA(self, num_topics=10, update_every=1, chunksize=100,
        full_data_chunk=True, iterations=10000, passes=10, eval_every=5,
        alpha='auto', eta='auto', decay=0.8, minimum_probability=0.05,
        minimum_phi_value=0.02, per_word_topics=True, print_params=True,
        save_model=True, save_dir='saved_models', filename='',
        random_state=919, **kwargs):
    """
    Fit a gensim LDA topic model on `self.bow`.

    `num_topics`: int, Number of latent topics (clusters) extracted from
        training corpus (bow)
    `update_every`: int, Number of chunks to process prior to moving onto
        the M step of EM.
    `chunksize`: int, Number of documents to load into memory at a time and
        process E step of EM
    `full_data_chunk`: bool, Overrides chunksize with `len(self.bow)`, i.e.
        load all docs into memory at once
    `iterations`: int, Maximum number of training iterations through the
        corpus.
    `passes`: int, Number of passes through the entire corpus for training
    `eval_every`: int, the smaller the number, the finer grained is the
        convergence plot
    `alpha`: str, a-priori belief for each topic's probability. Choices:
        'auto': Learns an asymmetric prior from the corpus.
        'asymmetric': Fixed normalized asymmetric prior of 1.0 / topicnum.
    `eta`: prior on word probability, can be: scalar for a symmetric prior
        over topic/word probability, vector of length num_words for user
        defined prob for each word, matrix (num_topics x num_words) to
        assign prob to word-topic combinations, or str 'auto' to learn the
        asymmetric prior from the data.
    `decay`: float, Number between (0.5, 1] how much past documents are
        forgotten when new document is seen
    `minimum_probability`: float, Topics with a prob lower than this are
        filtered out.
    `minimum_phi_value`: float, lower bound on the term probabilities
        (when `per_word_topics` = True)
    `per_word_topics`: bool, sorts topics in descending order (from most
        likely topics for each word)
    `print_params`: bool, are the parameters printed?
    `save_model`: bool, save model (plus a text file listing the
        parameters)?
    `save_dir`: str, folder to save the model, handed to `save_folder_file`
    `filename`: str, filename; a default is generated when empty
    `random_state`: int, seed to reproduce
    `**kwargs`: extra keyword arguments for `models.LdaModel`

    Reads `self.bow`, `self.gensim_dict` and `self.tfidf`.
    Returns the fitted LDA model.
    """
    # remove deprecation warnings (changes the process-wide filters)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    if full_data_chunk:
        # A misspelled variable name used to swallow this override, so the
        # model was always trained with the original chunksize.
        chunksize = len(self.bow)

    lda_model = models.LdaModel(self.bow,
                                id2word=self.gensim_dict,
                                num_topics=num_topics,
                                update_every=update_every,
                                chunksize=chunksize,
                                iterations=iterations,
                                passes=passes,
                                alpha=alpha,
                                eta=eta,
                                decay=decay,
                                minimum_probability=minimum_probability,
                                minimum_phi_value=minimum_phi_value,
                                per_word_topics=per_word_topics,
                                eval_every=eval_every,
                                random_state=random_state,
                                **kwargs)

    # Built unconditionally: the summary is needed both for printing and
    # for the parameter file written next to a saved model.
    model_pars = 'Number of topics: {}\nTFIDF transformation: {}\n'\
        'Number of iterations: {}\nBatch size: {}\n' \
        'Update every {} pass\nNumber of passes: {}\n' \
        'Topic inference per word: {}\nAlpha: {}\n'\
        'Eta: {}\nDecay: {}\nMinimum probability: {}\n' \
        'Minimum_phi_value: {}\nEvaluate every: {}\n' \
        'Random seed: {}\n'.format(num_topics, self.tfidf, iterations,
                                   chunksize, update_every, passes,
                                   per_word_topics, alpha, eta, decay,
                                   minimum_probability, minimum_phi_value,
                                   eval_every, random_state)

    if print_params:
        print('Parameters used in model: ')
        print(model_pars)

    if save_model:
        if len(filename) == 0:
            filename = 'LDA_Params_NT{}_TFIDF{}'\
                'Per_word_topic{}'.format(num_topics, self.tfidf,
                                          per_word_topics)
        full_path = save_folder_file(save_dir, filename, ext='.model',
                                     optional_folder='LDA')
        full_path_txt = save_folder_file(save_dir, filename + '_parameters',
                                         ext='.txt', optional_folder='LDA')
        print('Saving LDA model to: \n{}'.format(full_path))
        lda_model.save(full_path)
        # write down corresponding parameters
        with open(full_path_txt, 'w') as params_file:
            params_file.write(model_pars)

    return lda_model