Beispiel #1
0
def test_pickle_unpickle():
    """Round-trip a small tuple through ``pickle_data`` / ``unpickle_file``.

    The tuple is written to a fixed path below ``tests/data`` and read back.
    The restored sequence must have the same length as the input and every
    element must compare equal to its counterpart.
    """
    pfile = 'tests/data/test_pickle_unpickle.pickle'
    input_data = ('foo', 123, [])
    pickle_data(input_data, pfile)

    output_data = unpickle_file(pfile)

    # zip() stops at the shorter input, so without this check a truncated or
    # empty result would pass the element-wise comparison below vacuously.
    assert len(output_data) == len(input_data)

    for i, o in zip(input_data, output_data):
        assert i == o
Beispiel #2
0
def save_ldamodel_to_pickle(picklefile,
                            model,
                            vocab,
                            doc_labels,
                            dtm=None,
                            **kwargs):
    """Pickle an LDA model together with its accompanying data.

    The arguments are bundled into a dict with the keys ``'model'``,
    ``'vocab'``, ``'doc_labels'`` and ``'dtm'`` and handed to ``pickle_data``
    with `picklefile` as the target.

    :param picklefile: target passed on to ``pickle_data``
    :param model: object stored under ``'model'``
    :param vocab: object stored under ``'vocab'``
    :param doc_labels: object stored under ``'doc_labels'``
    :param dtm: object stored under ``'dtm'``; defaults to ``None``
    :param kwargs: accepted for call compatibility but not used here
    """
    payload = {
        'model': model,
        'vocab': vocab,
        'doc_labels': doc_labels,
        'dtm': dtm,
    }
    pickle_data(payload, picklefile)
Beispiel #3
0
    # NOTE(review): excerpt from the body of a larger function; `preproc`,
    # `pprint`, `pd` and `pickle_data` come from code outside this view.
    # Each step calls one preprocessing method on `preproc` and then prints
    # the token attributes so the effect of that step can be inspected.
    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)

    print('lowercase:')
    preproc.tokens_to_lowercase()
    pprint(preproc.tokens)

    print('cleaned:')
    preproc.clean_tokens()
    pprint(preproc.tokens_with_pos_tags)
    pprint(preproc.tokens)

    print('filtered:')
    # NOTE(review): presumably keeps only material containing the token
    # "einfach" (dropping the token itself) and then only tokens whose POS
    # tag matches 'N' -- confirm against the preprocessor's API.
    preproc.filter_for_token(u'einfach', remove_found_token=True)
    preproc.filter_for_pos('N')
    pprint(preproc.tokens_with_pos_tags)

    print('saving tokens as pickle...')
    pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')

    print('DTM:')
    # get_dtm() is unpacked into document labels, vocabulary and the matrix.
    doc_labels, vocab, dtm = preproc.get_dtm()

    # Show the matrix as a dense table: one row per document label, one
    # column per vocabulary entry.
    print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))
Beispiel #4
0
    # NOTE(review): excerpt from the body of a larger function; `dtm`,
    # `tm_lda`, `plt` and the helper functions come from outside this view.
    # evaluate topic models with different parameters
    # Parameters shared by every candidate model.
    const_params = dict(n_iter=1200, random_state=1, refresh=10)
    # Candidate topic counts: 10..155 in steps of 5, 160..290 in steps of 10,
    # then the explicitly listed values up to 400.
    ks = list(range(10, 160, 5)) + list(range(160, 300,
                                              10)) + [300, 325, 350, 375, 400]
    # One parameter set per candidate k, with alpha tied to k as 1/k.
    varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

    # this will evaluate all models in parallel
    # still, this will take some time
    print('evaluating %d topic models' % len(varying_params))
    models = tm_lda.evaluate_topic_models(
        dtm, varying_params, const_params,
        return_models=True)  # retain the calculated models

    # save the results as pickle
    print('saving results')
    pickle_data(models, 'data/lda_evaluation_results.pickle')

    # plot the results
    print('plotting evaluation results')
    results_by_n_topics = results_by_parameter(models, 'n_topics')
    # NOTE(review): the title mentions beta=0.01, but no beta/eta value is set
    # in const_params above -- presumably the library default; confirm.
    plot_eval_results(results_by_n_topics,
                      xaxislabel='num. topics k',
                      title='Evaluation results for alpha=1/k, beta=0.01',
                      figsize=(8, 6))
    plt.savefig('data/lda_evaluation_plot.png')
    plt.show()

    # the peak seems to be around n_topics == 140
    # print the distributions of this model
    # Hard-coded from inspecting the plot; it is not derived from `models`.
    n_topics_best_model = 140
    print('printing best model with n_topics=%d' % n_topics_best_model)
Beispiel #5
0
# Parameters shared by every candidate model.
# NOTE(review): `alpha` is set to 'auto' here and to 1.0 / k in every entry
# of `varying_params` below; which value takes effect depends on how
# evaluate_topic_models merges the two dicts -- confirm and drop the unused one.
const_params = dict(
    update_every=0,
    passes=20,
    iterations=400,
    alpha='auto',
    eta='auto',
)
# Candidate topic counts: 10..130 in steps of 10, then 140, 160, 180.
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
# One parameter set per candidate k.
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
# The data is passed as a (dictionary, corpus) pair; `gnsm_dict`,
# `gnsm_corpus` and `model_lists` are defined outside this excerpt.
eval_results = tm_gensim.evaluate_topic_models(
    (gnsm_dict, gnsm_corpus),
    varying_params,
    const_params,
    coherence_gensim_texts=model_lists)  # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'gensim_evaluation_results_entire.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')
# Arrange the results by the 'num_topics' parameter for plotting.
results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics,
                  xaxislabel='num. topics k',
                  title='Evaluation results',
                  figsize=(8, 6))
# Write the figure to disk before showing it.
plt.savefig('gensim_evaluation_plot_entire.png')
plt.show()
Beispiel #6
0
# NOTE(review): script excerpt; `doc_tokens`, `doc_labels`, `vocab`, `dtm`,
# `n_iter`, `eta`, `alpha_mod` and `preproc_mode` are defined outside this view.
# Keep only the token lists and release the mapping.
# NOTE(review): only the lengths are compared below -- this assumes that
# `doc_tokens` iterates in the same order as `doc_labels`; confirm.
tokens = list(doc_tokens.values())
del doc_tokens
assert len(tokens) == len(doc_labels)
print('loaded DTM with %d documents, %d vocab size, %d tokens' % (len(doc_labels), len(vocab), dtm.sum()))

print('evaluating topic models...')
# Parameters shared by every candidate model (random_state is left disabled).
constant_params = dict(n_iter=n_iter,
#                       random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)
# Candidate topic counts: 20..90 in steps of 10, 100..180 in steps of 20,
# 200..500 in steps of 50.
varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
#varying_num_topics = list(range(5,11))
# alpha is tied to the topic count: alpha_mod / k for each candidate k.
varying_alpha = [alpha_mod/k for k in varying_num_topics]
varying_params = [dict(n_topics=k, alpha=a) for k, a in zip(varying_num_topics, varying_alpha)]
print('varying parameters:')
pprint(varying_params)

# Evaluate every parameter set with the listed metrics; the vocabulary and
# token lists are passed via the coherence_gensim_* arguments.
eval_results = tm_lda.evaluate_topic_models(dtm, varying_params, constant_params,
                                            metric=('griffiths_2004', 'cao_juan_2009', 'arun_2010',
                                                    'coherence_mimno_2011', 'coherence_gensim_c_v'),
                                            coherence_gensim_vocab=vocab,
                                            coherence_gensim_texts=tokens)

# The output file name encodes the preprocessing mode, eta and alpha_mod.
pickle_file_eval_res = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (preproc_mode, eta, alpha_mod)
print('saving results to file `%s`' % pickle_file_eval_res)
pickle_data(eval_results, pickle_file_eval_res)

print('done.')

Beispiel #7
0
# NOTE(review): script excerpt; `texts`, `gensim`, `tm_gensim`, `plt` and the
# helper functions are defined outside this view.
print('creating gensim corpus...')
# Build the gensim dictionary from the documents, then convert each document
# with doc2bow().
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
# Candidate topic counts: 10..130 in steps of 10, then 140, 160, 180.
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
# One parameter set per candidate k, with alpha tied to k as 1/k.
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
# The data is passed as a (dictionary, corpus) pair.
eval_results = tm_gensim.evaluate_topic_models(
    (gnsm_dict, gnsm_corpus),
    varying_params,
    const_params,
    coherence_gensim_texts=texts)  # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'data/gensim_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')
# Arrange the results by the 'num_topics' parameter for plotting.
results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics,
                  xaxislabel='num. topics k',
                  title='Evaluation results',
                  figsize=(8, 6))
# Write the figure to disk before showing it.
plt.savefig('data/gensim_evaluation_plot.png')
plt.show()
        # NOTE(review): excerpt starts inside a nested block whose header is
        # not visible; `proc_time`, `preproc`, `sample`, `DTM_PICKLE` and the
        # helper functions are defined outside this view.
        print('-- processing took %f sec. so far' % proc_time)

        preproc.save_state('data/read_preproc_lda_de_state.pickle')

        print('token samples:')
        # Print 10 sampled tokens per document label.
        # NOTE(review): `sample` is presumably random.sample, which raises
        # ValueError for documents with fewer than 10 tokens -- confirm.
        for dl, tokens in preproc.tokens_with_pos_tags.items():
            print("> %s:" % dl)
            print(">>", sample(tokens, 10))

        print('generating DTM...')
        # get_dtm() is unpacked into document labels, vocabulary and the matrix.
        doc_labels, vocab, dtm = preproc.get_dtm()

        print("saving DTM data to pickle file '%s'..." % DTM_PICKLE)
        # The document labels are stored under the key 'docnames'.
        pickle_data({
            'dtm': dtm,
            'vocab': vocab,
            'docnames': doc_labels
        }, DTM_PICKLE)

    print("running LDA...")
    # note: this won't result in a good topic model. it's only here for demonstration purposes.
    # we should increase the number of iterations and also do some evaluation to get the "correct" number of topics.
    model = lda.LDA(n_topics=30, n_iter=500)
    model.fit(dtm)

    # print topic-word distributions with respective probabilities
    print_ldamodel_topic_words(model.topic_word_, vocab)

    # print document-topic distributions with respective probabilities
    print_ldamodel_doc_topics(model.doc_topic_, doc_labels)
Beispiel #9
0
# NOTE(review): script excerpt; `vocab`, `dtm`, `doc_labels`, `LDA`,
# `LDA_PARAMS`, `BURNIN` and the output path constants are defined outside
# this view.
# Sanity check: the vocabulary size must equal the DTM's second dimension.
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens' %
      (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

# Construct the model from the configured parameters and fit it to the DTM.
model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output

print('saving model to `%s`' % LDA_MODEL_PICKLE)
# The pickle holds a 4-tuple in the order (doc_labels, vocab, dtm, model).
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_,
                               model.doc_topic_,
                               doc_labels,
                               vocab,
                               dtm=dtm)

#%%
print('displaying loglikelihoods...')
# Plot the recorded log-likelihoods from index BURNIN onwards.
# NOTE(review): the x values are the indices times 10 -- presumably because
# one value is recorded every 10 iterations; confirm against the `refresh`
# setting in LDA_PARAMS.
plt.plot(
    np.arange(BURNIN, len(model.loglikelihoods_)) * 10,
    model.loglikelihoods_[BURNIN:])
plt.xlabel('iterations')
Beispiel #10
0
]
# Remove every match of `pttrn_token_w_specialchar_inv` from each token and
# de-duplicate what is left over.
# NOTE(review): the leftovers are presumably the special characters of each
# token -- confirm against the pattern's definition.
uncommon_special_chars = {
    pttrn_token_w_specialchar_inv.sub('', t) for t in tokens_w_specialchars
}
# Split the leftover strings into a set of individual characters. A nested
# set comprehension avoids the quadratic list concatenation of sum(..., []).
uncommon_special_chars = {c for cs in uncommon_special_chars for c in cs}

print('detected the following uncommon special characters:')
# Print each character's code point as (at least) four hex digits; sorting
# makes the output order reproducible between runs.
for c in sorted(uncommon_special_chars):
    print('%04x' % ord(c))

# NOTE(review): script excerpt; `preproc`, `DATA_PICKLE_DTM`, `preproc_mode`
# and `pickle_data` are defined outside this view.
print('running preprocessing pipeline...')
# The steps are chained, so each method's return value must support the next
# call. Steps in order: POS tagging, lemmatization, lowercasing, special
# character removal, token cleaning (remove_shorter_than=2), common-token
# removal (0.9) and uncommon-token removal (3, absolute=True).
# NOTE(review): the exact meaning of the thresholds is not visible here --
# confirm against the preprocessor's API.
preproc.pos_tag()\
       .lemmatize()\
       .tokens_to_lowercase()\
       .remove_special_chars_in_tokens()\
       .clean_tokens(remove_shorter_than=2)\
       .remove_common_tokens(0.9)\
       .remove_uncommon_tokens(3, absolute=True)

print('retrieving tokens...')
tokens = preproc.tokens

print('generating DTM...')
# get_dtm() is unpacked into document labels, vocabulary and the matrix.
doc_labels, vocab, dtm = preproc.get_dtm()

# DATA_PICKLE_DTM is used as a %-format template filled with preproc_mode.
output_dtm_pickle = DATA_PICKLE_DTM % preproc_mode

print('writing DTM to `%s`...' % output_dtm_pickle)
# The pickle holds a 4-tuple in the order (doc_labels, vocab, dtm, tokens).
pickle_data((doc_labels, vocab, dtm, tokens), output_dtm_pickle)
print('done.')