# NOTE: imports assumed for this snippet; module paths follow the pre-0.11 tmtoolkit API
# (Corpus/TMPreproc preprocessing and the LDA-based topicmod.tm_lda evaluation).
import matplotlib.pyplot as plt

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # build the corpus from the input file
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # preprocess and obtain the document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # candidate numbers of topics
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]

    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }

    eval_results = evaluate_topic_models(
        dtm_bg,
        varying_parameters=var_params,
        constant_parameters=const_params,
        metric=['loglikelihood', 'cao_juan_2009', 'arun_2010'],
        # return_models=True
    )

    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics,
                      figsize=(8, 6),
                      metric_direction_font_size='x-small',
                      title_fontsize='small',
                      axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
    return
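# Usage sketch for the helper above; the corpus path, date tag, scope and language
# labels are hypothetical placeholders, not values taken from the original project.
evaluate_model('data/corpus_de.txt', date='20200713', n_iter=1000,
               scope='full', lang='de', n_eval=5)
# -> writes out/evaluate_model_20200713_1000iter_5eval_full_de.png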
def toolkit_cv_plot(self, varying_params, constant_params,
                    save_plot=True,
                    save_dir='results/model_validation',
                    filename='',
                    ext='.pdf',
                    size=(20, 15),
                    **kwargs):
    '''
    Using tmtoolkit for parameter tuning based on a wider variety of measures
    '''
    warnings.filterwarnings("ignore", category=UserWarning)
    print('evaluating {} topic models'.format(len(varying_params)))

    eval_results = tm_gensim.evaluate_topic_models((self.gensim_dict, self.bow),
                                                   varying_params, constant_params,
                                                   coherence_gensim_texts=self.text,
                                                   **kwargs)

    results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
    plot_eval_results(results_by_n_topics, xaxislabel='num topics',
                      title='Evaluation results', figsize=size)

    if save_plot:
        filename = 'tmtoolkit_CV_'
        full_path = save_folder_file(save_dir, filename, ext=ext,
                                     optional_folder='convergence_plots')
        plt.savefig(full_path)

    return results_by_n_topics
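# Usage sketch, assuming the method above belongs to a topic-model wrapper class whose
# instance (here called `model`) exposes `gensim_dict`, `bow` and `text` attributes;
# the instance name and the parameter grid below are illustrative only.
varying = [dict(num_topics=k) for k in range(5, 50, 5)]
constant = dict(passes=10, iterations=200)
results_by_n_topics = model.toolkit_cv_plot(varying, constant, save_plot=False)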
# Imports assumed for this test module excerpt.
import random

import matplotlib.pyplot as plt
import numpy as np
import pytest

from tmtoolkit.topicmod import evaluate, visualize


# The test arguments (n_param_sets, n_params, n_metrics, plot_specific_metric) are
# expected to be supplied by a parametrization decorator (e.g. hypothesis' @given or
# @pytest.mark.parametrize), which is not shown in this excerpt.
def test_plot_eval_results(n_param_sets, n_params, n_metrics, plot_specific_metric):
    param_names = ['param' + str(i) for i in range(n_params)]
    metric_names = ['metric' + str(i) for i in range(n_metrics)]

    # generate random parameter sets with random metric results
    res = []
    for _ in range(n_param_sets):
        param_set = dict(zip(param_names, np.random.randint(0, 100, n_params)))
        metric_results = dict(zip(metric_names, np.random.uniform(0, 1, n_metrics)))
        res.append((param_set, metric_results))

    p = random.choice(param_names)
    by_param = evaluate.results_by_parameter(res, p)

    if not by_param:
        # an empty result list cannot be plotted
        with pytest.raises(ValueError):
            visualize.plot_eval_results(by_param)
    else:
        if plot_specific_metric:
            metric = random.choice(metric_names)
        else:
            metric = None

        fig, axes = visualize.plot_eval_results(by_param, metric=metric)
        plt.close(fig)
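# A minimal driver sketch using pytest's own parametrization for the test above; the
# concrete value grid is an assumption for illustration, not the project's configuration.
@pytest.mark.parametrize('n_param_sets, n_params, n_metrics, plot_specific_metric', [
    (0, 1, 1, False),   # empty result list -> exercises the ValueError branch
    (3, 2, 2, False),   # plot all metrics
    (3, 2, 2, True),    # plot one randomly chosen metric
])
def test_plot_eval_results_examples(n_param_sets, n_params, n_metrics, plot_specific_metric):
    test_plot_eval_results(n_param_sets, n_params, n_metrics, plot_specific_metric)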
import numpy as np
from scipy.sparse import csr_matrix

# imports assumed for the tmtoolkit calls below
import matplotlib.pyplot as plt
from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

# build a sparse document-term matrix from a gensim bag-of-words corpus
# (trigram_bow_corpus and trigram_dictionary are assumed to be defined earlier)
rows = []
cols = []
data = []
for i, line in enumerate(trigram_bow_corpus):
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)

dtm = csr_matrix((data, (rows, cols)),
                 shape=(len(trigram_bow_corpus), len(trigram_dictionary)),
                 dtype=int)

const_params = dict(n_iter=20)
ks = list(range(5, 100, 5))  # + list(range(50, 200, 50)) + list(range(200, 500, 100))
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

eval_results = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                            return_models=True)  # , n_max_processes=8

results_by_n_topics = results_by_parameter(eval_results, 'n_topics')

# fig, ax = plt.subplots(figsize=(8, 6))
plot_eval_results(results_by_n_topics)
plt.tight_layout()
# plt.savefig('valid_lda.eps', format='eps', dpi=300)
plt.show()
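# Alternative sketch: gensim's matutils.corpus2csc builds the same sparse matrix without
# a manual loop, assuming trigram_bow_corpus is a gensim bag-of-words corpus and
# trigram_dictionary a gensim Dictionary as above.
from gensim.matutils import corpus2csc

# corpus2csc returns a terms x documents matrix, so transpose to get documents x terms
dtm_alt = corpus2csc(trigram_bow_corpus,
                     num_terms=len(trigram_dictionary)).T.tocsr().astype(int)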
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

# this will evaluate all models in parallel using the metrics in tm_lda.DEFAULT_METRICS
# still, this will take some time
print('evaluating %d topic models' % len(varying_params))
models = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                      return_models=True)  # retain the calculated models

# save the results as pickle
print('saving results')
pickle_data(models, 'data/lda_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
results_by_n_topics = results_by_parameter(models, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results for alpha=1/k, beta=0.1', figsize=(8, 6))
plt.savefig('data/lda_evaluation_plot.png')
plt.show()

# the peak seems to be around n_topics == 120
# print the distributions of this model
n_topics_best_model = 120
best_model = dict(results_by_n_topics)[n_topics_best_model]['model']

print('saving final model with n_topics=%d' % n_topics_best_model)
save_ldamodel_to_pickle('data/lda_evaluation_finalmodel.pickle',
                        best_model, vocab, doc_labels, dtm)

print('printing final model')
print_ldamodel_topic_words(best_model.topic_word_, vocab)
print_ldamodel_doc_topics(best_model.doc_topic_, doc_labels)
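# Later-inspection sketch: the pickled evaluation results can be reloaded and re-plotted
# without recomputing the models; unpickle_file is tmtoolkit's counterpart to pickle_data
# and the path mirrors the one used above.
from tmtoolkit.utils import unpickle_file

models = unpickle_file('data/lda_evaluation_results.pickle')
results_by_n_topics = results_by_parameter(models, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k', figsize=(8, 6))
plt.show()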
const_params = dict(
    update_every=0,
    passes=20,
    iterations=400,
    alpha='auto',
    eta='auto',
)

ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
eval_results = tm_gensim.evaluate_topic_models(
    (gnsm_dict, gnsm_corpus), varying_params, const_params,
    coherence_gensim_texts=model_lists)  # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'gensim_evaluation_results_entire.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')

results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results', figsize=(8, 6))
plt.savefig('gensim_evaluation_plot_entire.png')
plt.show()
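# Selection sketch: pick the number of topics that maximizes one of the reported metrics.
# The key 'coherence_gensim_c_v' is an assumption about the label under which the C_V
# coherence appears in the result dicts; substitute whichever key results_by_n_topics
# actually contains.
metric_key = 'coherence_gensim_c_v'
best_k, best_scores = max(results_by_n_topics, key=lambda kv: kv[1][metric_key])
print('best num_topics by %s: %d (score %.4f)' % (metric_key, best_k, best_scores[metric_key]))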
# imports assumed for this script excerpt
import sys

import matplotlib.pyplot as plt
from tmtoolkit.utils import unpickle_file
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

if len(sys.argv) != 4:   # expects three command-line arguments
    print('run script as: %s <tokens preprocessing pipeline> <eta> <alpha factor>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 0, 1 or 2')
    exit(1)

toks = int(sys.argv[1])
eta = float(sys.argv[2])
alpha_mod = float(sys.argv[3])

#%%

picklefile = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (toks, eta, alpha_mod)
print('loading pickle file with evaluation results from `%s`' % picklefile)

eval_results = unpickle_file(picklefile)
eval_results_by_n_topics = results_by_parameter(eval_results, 'n_topics')
n_metrics = len(eval_results_by_n_topics[0][1])

#%%

fig, axes = plot_eval_results(eval_results_by_n_topics,
                              title='Evaluation results for alpha=%.2f/k, beta=%.2f' % (alpha_mod, eta),
                              xaxislabel='num. topics (k)')

plot_file_eval_res = 'fig/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.png' % (toks, eta, alpha_mod)
print('saving plot to file `%s`' % plot_file_eval_res)
plt.savefig(plot_file_eval_res)
plt.show()

print('done.')
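# Invocation sketch; the script file name is a hypothetical placeholder. The three
# positional arguments select the token-preprocessing pipeline (0, 1 or 2) and the
# eta / alpha-factor values that were used when the evaluation results were pickled:
#
#   python tm_eval_plot.py 1 0.10 0.50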