Example #1
# imports assumed for this snippet (tmtoolkit with the TMPreproc-based preprocessing API, plus matplotlib):
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
import matplotlib.pyplot as plt


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
  # load the input file into a corpus
  corpus = Corpus()
  corpus.add_files(file_name, encoding='utf8')
  # preprocess the corpus and build the document-term matrix
  preproc = TMPreproc(corpus)
  dtm_bg = preproc.dtm
  # vary the number of topics across the candidate models
  var_params = [{'n_topics': k} for k in range(5, int(n_eval*10), n_eval)]
  # parameters held constant for every candidate model
  const_params = {
    'n_iter': n_iter,
    'random_state': 20200713  # to make results reproducible
  }
  eval_results = evaluate_topic_models(dtm_bg,
                                       varying_parameters=var_params,
                                       constant_parameters=const_params,
                                       metric=['loglikelihood', 'cao_juan_2009', 'arun_2010'],
                                       # return_models=True,
                                       )
  # collect the evaluation results by number of topics
  eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')
  # build the output file name, then plot and save the evaluation results
  name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
  plot_eval_results(eval_results_by_topics, figsize=(8, 6), metric_direction_font_size='x-small', title_fontsize='small', axes_title_fontsize='x-small')
  plt.tight_layout()
  plt.savefig('out/'+name)
  return
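
# A minimal usage sketch for the function above; the file path and the label values
# are illustrative placeholders, not taken from the original example.
if __name__ == '__main__':
  evaluate_model('corpus/articles_en.txt', date='20200101', n_iter=1000,
                 scope='full', lang='en', n_eval=5)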
Example #2
def test_evaluation_lda_all_metrics_multi_vs_singleproc():
    passed_params = {'n_topics', 'alpha', 'n_iter', 'refresh', 'random_state'}
    varying_params = [dict(n_topics=k, alpha=1/k) for k in range(2, 5)]
    const_params = dict(n_iter=10, refresh=1, random_state=1)

    evaluate_topic_models_kwargs = dict(
        metric=tm_lda.AVAILABLE_METRICS,
        held_out_documents_wallach09_n_samples=10,
        held_out_documents_wallach09_n_folds=2,
        coherence_gensim_vocab=EVALUATION_TEST_VOCAB,
        coherence_gensim_texts=EVALUATION_TEST_TOKENS,
        return_models=True
    )

    eval_res = tm_lda.evaluate_topic_models(EVALUATION_TEST_DTM, varying_params, const_params,
                                            **evaluate_topic_models_kwargs)

    assert len(eval_res) == len(varying_params)

    for param_set, metric_results in eval_res:
        assert set(param_set.keys()) == passed_params
        assert set(metric_results.keys()) == set(tm_lda.AVAILABLE_METRICS + ('model',))

        assert 0 <= metric_results['cao_juan_2009'] <= 1
        assert 0 <= metric_results['arun_2010']
        assert metric_results['coherence_mimno_2011'] < 0
        assert np.isclose(metric_results['coherence_gensim_u_mass'], metric_results['coherence_mimno_2011'])
        assert 0 <= metric_results['coherence_gensim_c_v'] <= 1
        assert metric_results['coherence_gensim_c_uci'] < 0
        assert metric_results['coherence_gensim_c_npmi'] < 0

        if 'griffiths_2004' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            assert metric_results['griffiths_2004'] < 0

        if 'loglikelihood' in tm_lda.AVAILABLE_METRICS:
            assert metric_results['loglikelihood'] < 0

        if 'held_out_documents_wallach09' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            assert metric_results['held_out_documents_wallach09'] < 0

        assert isinstance(metric_results['model'], lda.LDA)

    eval_res_singleproc = tm_lda.evaluate_topic_models(EVALUATION_TEST_DTM, varying_params, const_params,
                                                       n_max_processes=1, **evaluate_topic_models_kwargs)
    assert len(eval_res_singleproc) == len(eval_res)
    for param_set2, metric_results2 in eval_res_singleproc:
        for x, y in eval_res:
            if x == param_set2:
                param_set1, metric_results1 = x, y
                break
        else:
            assert False

        # exclude results that use metrics with random sampling
        if 'held_out_documents_wallach09' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            del metric_results1['held_out_documents_wallach09']
            del metric_results2['held_out_documents_wallach09']

        del metric_results1['model']
        del metric_results2['model']

        assert metric_results1 == metric_results2
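
# Which metrics can actually be computed depends on optional packages (gensim for the
# "coherence_gensim_*" metrics, gmpy2 for "griffiths_2004" and
# "held_out_documents_wallach09"). A quick check of what the installed tmtoolkit
# version offers (a sketch assuming the same tm_lda module used above):
from tmtoolkit.topicmod import tm_lda
print('available metrics:', tm_lda.AVAILABLE_METRICS)
print('default metrics:', tm_lda.DEFAULT_METRICS)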
Example #3
import numpy as np
from scipy.sparse import csr_matrix
# additional imports assumed for this snippet: tmtoolkit's topic modeling helpers and matplotlib
from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
import matplotlib.pyplot as plt

# convert the gensim bag-of-words corpus into a sparse document-term matrix
rows = []
cols = []
data = []
for i, doc in enumerate(trigram_bow_corpus):
    for indx, freq in doc:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
dtm = csr_matrix((data, (rows, cols)),
                 shape=(len(trigram_bow_corpus), len(trigram_dictionary)),
                 dtype=int)
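
# Sanity check (a sketch assuming gensim is installed): gensim's corpus2csc builds the
# same matrix transposed to terms x docs, so shapes and totals should match.
from gensim.matutils import corpus2csc
dtm_alt = corpus2csc(trigram_bow_corpus, num_terms=len(trigram_dictionary)).T.astype(int)
assert dtm_alt.shape == dtm.shape
assert dtm_alt.sum() == dtm.sum()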

const_params = dict(n_iter=20)
ks = list(range(5, 100, 5))  # + list(range(50, 200, 50)) + list(range(200, 500, 100))
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

eval_results = tm_lda.evaluate_topic_models(
    dtm, varying_params, const_params, return_models=True)  # optionally: n_max_processes=8

results_by_n_topics = results_by_parameter(eval_results, 'n_topics')

# fig, ax = plt.subplots(figsize=(8, 6))
plot_eval_results(results_by_n_topics)
plt.tight_layout()
# plt.savefig('valid_lda.eps', format='eps', dpi=300)
plt.show()
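
# Because return_models=True was passed, each metric dict also holds the fitted model
# under the 'model' key. A sketch of selecting the model with the lowest cao_juan_2009
# value (lower is better for this metric), assuming it is among the computed default metrics:
best_k, best_res = min(results_by_n_topics, key=lambda kr: kr[1]['cao_juan_2009'])
best_model = best_res['model']
print('best number of topics according to cao_juan_2009:', best_k)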
Example #4
    doc_labels = lda.datasets.load_reuters_titles()
    vocab = lda.datasets.load_reuters_vocab()
    dtm = lda.datasets.load_reuters()
    print('%d documents with vocab size %d' % (len(doc_labels), len(vocab)))
    assert dtm.shape[0] == len(doc_labels)
    assert dtm.shape[1] == len(vocab)

    # evaluate topic models with different parameters
    const_params = dict(n_iter=1500, random_state=1, refresh=10, eta=0.1)    # beta is called eta in the 'lda' package
    ks = list(range(10, 140, 10)) + list(range(140, 300, 20)) + [300, 325, 350, 375, 400, 450, 500]
    varying_params = [dict(n_topics=k, alpha=1.0/k) for k in ks]

    # this will evaluate all models in parallel using the metrics in tm_lda.DEFAULT_METRICS
    # still, this will take some time
    print('evaluating %d topic models' % len(varying_params))
    models = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                          return_models=True)  # retain the calculated models

    # save the results as pickle
    print('saving results')
    pickle_data(models, 'data/lda_evaluation_results.pickle')

    # plot the results
    print('plotting evaluation results')
    results_by_n_topics = results_by_parameter(models, 'n_topics')
    plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                      title='Evaluation results for alpha=1/k, beta=0.1', figsize=(8, 6))
    plt.savefig('data/lda_evaluation_plot.png')
    plt.show()

    # the peak seems to be around n_topics == 120
    # print the distributions of this model
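    # A sketch of that step, assuming tmtoolkit's model_io helpers (the original script
    # may continue differently): take the evaluated model with n_topics == 120 and print
    # its top words per topic.
    import numpy as np
    from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
    model_120 = dict(results_by_n_topics)[120]['model']
    print_ldamodel_topic_words(model_120.topic_word_, np.array(vocab))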
Example #5
tokens = list(doc_tokens.values())
del doc_tokens
assert len(tokens) == len(doc_labels)
print('loaded DTM with %d documents, %d vocab size, %d tokens' % (len(doc_labels), len(vocab), dtm.sum()))

print('evaluating topic models...')
constant_params = dict(n_iter=n_iter,
#                       random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)
varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
#varying_num_topics = list(range(5,11))
varying_alpha = [alpha_mod/k for k in varying_num_topics]
varying_params = [dict(n_topics=k, alpha=a) for k, a in zip(varying_num_topics, varying_alpha)]
print('varying parameters:')
pprint(varying_params)

eval_results = tm_lda.evaluate_topic_models(dtm, varying_params, constant_params,
                                            metric=('griffiths_2004', 'cao_juan_2009', 'arun_2010',
                                                    'coherence_mimno_2011', 'coherence_gensim_c_v'),
                                            coherence_gensim_vocab=vocab,
                                            coherence_gensim_texts=tokens)

pickle_file_eval_res = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (preproc_mode, eta, alpha_mod)
print('saving results to file `%s`' % pickle_file_eval_res)
pickle_data(eval_results, pickle_file_eval_res)

print('done.')
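
# A follow-up sketch mirroring Example #4, assuming tmtoolkit's unpickle_file helper:
# reload the saved evaluation results later and plot them by number of topics.
from tmtoolkit.utils import unpickle_file
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
import matplotlib.pyplot as plt

eval_results = unpickle_file(pickle_file_eval_res)
plot_eval_results(results_by_parameter(eval_results, 'n_topics'))
plt.show()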