import matplotlib.pyplot as plt

# imports assume tmtoolkit's pre-0.11 API, which provided Corpus/TMPreproc
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # load the raw text file(s) into a tmtoolkit corpus
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # tokenize the documents and build the document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # candidate numbers of topics to evaluate
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]

    # parameters shared by all candidate models
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }

    eval_results = evaluate_topic_models(dtm_bg,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         metric=['loglikelihood', 'cao_juan_2009', 'arun_2010']
                                         # return_models=True
                                         )

    # order the results by number of topics and plot one panel per metric
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6),
                      metric_direction_font_size='x-small',
                      title_fontsize='small',
                      axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
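# Hypothetical usage of the helper above (the file path and argument values are
# illustrative only): with n_eval=5 this evaluates nine candidate models
# (5, 10, ..., 45 topics) on one UTF-8 text file and writes the metric plot to out/.
evaluate_model('data/articles_en.txt', date='20200713', n_iter=1000,
               scope='full', lang='en', n_eval=5)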
import numpy as np
import lda

from tmtoolkit.topicmod import tm_lda

# EVALUATION_TEST_DTM, EVALUATION_TEST_VOCAB and EVALUATION_TEST_TOKENS are
# module-level test fixtures defined elsewhere in the test module (see the
# sketch after this test).


def test_evaluation_lda_all_metrics_multi_vs_singleproc():
    passed_params = {'n_topics', 'alpha', 'n_iter', 'refresh', 'random_state'}
    varying_params = [dict(n_topics=k, alpha=1/k) for k in range(2, 5)]
    const_params = dict(n_iter=10, refresh=1, random_state=1)

    evaluate_topic_models_kwargs = dict(
        metric=tm_lda.AVAILABLE_METRICS,
        held_out_documents_wallach09_n_samples=10,
        held_out_documents_wallach09_n_folds=2,
        coherence_gensim_vocab=EVALUATION_TEST_VOCAB,
        coherence_gensim_texts=EVALUATION_TEST_TOKENS,
        return_models=True
    )

    eval_res = tm_lda.evaluate_topic_models(EVALUATION_TEST_DTM, varying_params, const_params,
                                            **evaluate_topic_models_kwargs)

    assert len(eval_res) == len(varying_params)

    for param_set, metric_results in eval_res:
        assert set(param_set.keys()) == passed_params
        assert set(metric_results.keys()) == set(tm_lda.AVAILABLE_METRICS + ('model',))

        assert 0 <= metric_results['cao_juan_2009'] <= 1
        assert 0 <= metric_results['arun_2010']
        assert metric_results['coherence_mimno_2011'] < 0
        assert np.isclose(metric_results['coherence_gensim_u_mass'], metric_results['coherence_mimno_2011'])
        assert 0 <= metric_results['coherence_gensim_c_v'] <= 1
        assert metric_results['coherence_gensim_c_uci'] < 0
        assert metric_results['coherence_gensim_c_npmi'] < 0

        if 'griffiths_2004' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            assert metric_results['griffiths_2004'] < 0

        if 'loglikelihood' in tm_lda.AVAILABLE_METRICS:
            assert metric_results['loglikelihood'] < 0

        if 'held_out_documents_wallach09' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            assert metric_results['held_out_documents_wallach09'] < 0

        assert isinstance(metric_results['model'], lda.LDA)

    eval_res_singleproc = tm_lda.evaluate_topic_models(EVALUATION_TEST_DTM, varying_params, const_params,
                                                       n_max_processes=1, **evaluate_topic_models_kwargs)
    assert len(eval_res_singleproc) == len(eval_res)

    for param_set2, metric_results2 in eval_res_singleproc:
        # find the matching parameter set in the multiprocessing results
        for x, y in eval_res:
            if x == param_set2:
                param_set1, metric_results1 = x, y
                break
        else:
            assert False

        # exclude results that use metrics with random sampling
        if 'held_out_documents_wallach09' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            del metric_results1['held_out_documents_wallach09']
            del metric_results2['held_out_documents_wallach09']

        del metric_results1['model']
        del metric_results2['model']

        assert metric_results1 == metric_results2
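# Hypothetical module-level fixtures for the test above. The real test module
# defines its own data; these placeholder values only illustrate the expected
# shapes (a small document-term matrix, the matching vocabulary, and tokenized
# documents for the gensim coherence metrics). With data this small, not all of
# the numeric assertions above would necessarily hold.
EVALUATION_TEST_VOCAB = np.array(['politics', 'economy', 'sports', 'science', 'culture'])
EVALUATION_TEST_TOKENS = [
    ['politics', 'economy', 'economy'],
    ['sports', 'culture'],
    ['economy', 'science', 'science'],
    ['politics', 'sports', 'culture'],
]
# one row per document, one column per vocabulary term, consistent with the token lists
EVALUATION_TEST_DTM = np.array([
    [1, 2, 0, 0, 0],
    [0, 0, 1, 0, 1],
    [0, 1, 0, 2, 0],
    [1, 0, 1, 0, 1],
])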
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

# `trigram_bow_corpus` (a gensim BoW corpus) and `trigram_dictionary`
# (a gensim Dictionary) are assumed to have been built earlier;
# convert the BoW corpus into a sparse document-term matrix
rows = []
cols = []
data = []
for i in range(0, len(trigram_bow_corpus)):
    line = trigram_bow_corpus[i]
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)

dtm = csr_matrix((data, (rows, cols)),
                 shape=(len(trigram_bow_corpus), len(trigram_dictionary)),
                 dtype=int)

const_params = dict(n_iter=20)
ks = list(range(5, 100, 5))  # + list(range(50, 200, 50)) + list(range(200, 500, 100))
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

eval_results = tm_lda.evaluate_topic_models(
    dtm, varying_params, const_params, return_models=True)  # , n_max_processes=8

results_by_n_topics = results_by_parameter(eval_results, 'n_topics')

# fig, ax = plt.subplots(figsize=(8, 6))
plot_eval_results(results_by_n_topics)
plt.tight_layout()
# plt.savefig('valid_lda.eps', format='eps', dpi=300)
plt.show()
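# Alternative to the manual conversion loop above, shown here as a sketch under
# the same assumption that `trigram_bow_corpus` and `trigram_dictionary` are a
# gensim corpus/Dictionary pair: gensim's corpus2csc builds the sparse matrix
# directly. It returns a terms-by-documents matrix, hence the transpose.
from gensim.matutils import corpus2csc

dtm_alt = corpus2csc(trigram_bow_corpus, num_terms=len(trigram_dictionary)).T.tocsr().astype(int)
assert dtm_alt.shape == dtm.shape
assert (dtm_alt != dtm).nnz == 0  # same contents as the manually built matrix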
import lda
import lda.datasets
import matplotlib.pyplot as plt

from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
from tmtoolkit.utils import pickle_data

# load the Reuters sample data shipped with the 'lda' package
doc_labels = lda.datasets.load_reuters_titles()
vocab = lda.datasets.load_reuters_vocab()
dtm = lda.datasets.load_reuters()
print('%d documents with vocab size %d' % (len(doc_labels), len(vocab)))
assert dtm.shape[0] == len(doc_labels)
assert dtm.shape[1] == len(vocab)

# evaluate topic models with different parameters
const_params = dict(n_iter=1500, random_state=1, refresh=10, eta=0.1)  # beta is called eta in the 'lda' package
ks = list(range(10, 140, 10)) + list(range(140, 300, 20)) + [300, 325, 350, 375, 400, 450, 500]
varying_params = [dict(n_topics=k, alpha=1.0/k) for k in ks]

# this will evaluate all models in parallel using the metrics in tm_lda.DEFAULT_METRICS
# still, this will take some time
print('evaluating %d topic models' % len(varying_params))
models = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                      return_models=True)  # retain the calculated models

# save the results as pickle
print('saving results')
pickle_data(models, 'data/lda_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
results_by_n_topics = results_by_parameter(models, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results for alpha=1/k, beta=0.1', figsize=(8, 6))
plt.savefig('data/lda_evaluation_plot.png')
plt.show()

# the peak seems to be around n_topics == 120
# print the distributions of this model
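# A minimal sketch of that last step (a hypothetical follow-up, not part of the
# original script): `results_by_n_topics` is a list of (n_topics, metrics) pairs
# in which the fitted lda.LDA instance is stored under 'model' because
# return_models=True was passed above. Pick the model with 120 topics and print
# the top words of each topic with plain numpy.
import numpy as np

best_model = dict(results_by_n_topics)[120]['model']
top_n = 10
vocab_arr = np.array(vocab)
for topic_idx, word_distrib in enumerate(best_model.topic_word_):
    top_words = vocab_arr[np.argsort(word_distrib)[::-1][:top_n]]
    print('topic %d: %s' % (topic_idx + 1, ' '.join(top_words)))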
from pprint import pprint

from tmtoolkit.topicmod import tm_lda
from tmtoolkit.utils import pickle_data

# `doc_tokens`, `doc_labels`, `vocab` and `dtm` are assumed to come from an
# earlier preprocessing step; `n_iter`, `eta`, `alpha_mod` and `preproc_mode`
# from the script configuration
tokens = list(doc_tokens.values())
del doc_tokens

assert len(tokens) == len(doc_labels)

print('loaded DTM with %d documents, %d vocab size, %d tokens' % (len(doc_labels), len(vocab), dtm.sum()))

print('evaluating topic models...')

constant_params = dict(n_iter=n_iter,
                       # random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)

varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
# varying_num_topics = list(range(5, 11))
varying_alpha = [alpha_mod/k for k in varying_num_topics]
varying_params = [dict(n_topics=k, alpha=a) for k, a in zip(varying_num_topics, varying_alpha)]
print('varying parameters:')
pprint(varying_params)

eval_results = tm_lda.evaluate_topic_models(dtm, varying_params, constant_params,
                                            metric=('griffiths_2004', 'cao_juan_2009', 'arun_2010',
                                                    'coherence_mimno_2011', 'coherence_gensim_c_v'),
                                            coherence_gensim_vocab=vocab,
                                            coherence_gensim_texts=tokens)

pickle_file_eval_res = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (preproc_mode, eta, alpha_mod)
print('saving results to file `%s`' % pickle_file_eval_res)
pickle_data(eval_results, pickle_file_eval_res)

print('done.')
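# A minimal follow-up sketch (hypothetical, not part of the original script):
# load the pickled evaluation results back and plot one panel per metric.
# unpickle_file, results_by_parameter and plot_eval_results are tmtoolkit
# functions; the pickle file name is the one written above.
import matplotlib.pyplot as plt
from tmtoolkit.utils import unpickle_file
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

eval_results = unpickle_file(pickle_file_eval_res)
results_by_n_topics = results_by_parameter(eval_results, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k')
plt.tight_layout()
plt.show()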