Ejemplo n.º 1
0
def test_pickle_unpickle():
    """Round-trip test: pickling then unpickling must reproduce the data.

    Writes a small heterogeneous tuple to a pickle file, reads it back and
    asserts every element survives the round trip unchanged.
    """
    pfile = 'tests/data/test_pickle_unpickle.pickle'
    input_data = ('foo', 123, [])
    pickle_data(input_data, pfile)

    output_data = unpickle_file(pfile)

    # zip() silently truncates to the shorter sequence, so an output that
    # lost (or gained) elements would previously still pass the loop below.
    # Check the lengths explicitly first.
    assert len(output_data) == len(input_data)

    for i, o in zip(input_data, output_data):
        assert i == o
Ejemplo n.º 2
0
def load_ldamodel_from_pickle(picklefile, **kwargs):
    """Restore a previously pickled LDA model from `picklefile`.

    Any extra keyword arguments are forwarded unchanged to `unpickle_file`.
    """
    model = unpickle_file(picklefile, **kwargs)
    return model
Ejemplo n.º 3
0
    print('call script as: %s <tokens preprocessing pipeline> <eta> <alpha factor> <num. iterations>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 0, 1 or 2')
    exit(1)

# Parse and validate the four positional command line arguments.
# NOTE(review): `assert` is stripped under `python -O`; consider raising
# SystemExit with a message instead for CLI validation.
preproc_mode = int(sys.argv[1])   # which token preprocessing pipeline to load (0, 1 or 2)
assert 0 <= preproc_mode <= 2
eta = float(sys.argv[2])          # LDA eta (beta) hyperparameter, must lie in (0, 1)
assert 0 < eta < 1
alpha_mod = float(sys.argv[3])    # factor for the alpha hyperparameter
assert alpha_mod > 0
n_iter = int(sys.argv[4])         # number of sampling iterations per model
assert n_iter > 0

# Load the document-term matrix and accompanying data produced by the
# preprocessing pipeline selected above.
dtm_pickle = DATA_PICKLE_DTM % preproc_mode
print('loading DTM from file `%s`...' % dtm_pickle)
doc_labels, vocab, dtm, doc_tokens = unpickle_file(dtm_pickle)
# sanity checks: DTM rows correspond to documents, columns to vocabulary terms
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
tokens = list(doc_tokens.values())
del doc_tokens   # free memory; only the token lists are needed from here on
assert len(tokens) == len(doc_labels)
print('loaded DTM with %d documents, %d vocab size, %d tokens' % (len(doc_labels), len(vocab), dtm.sum()))

# Parameters held constant across all evaluated models; only the number of
# topics is varied below.
print('evaluating topic models...')
constant_params = dict(n_iter=n_iter,
#                       random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)
# candidate topic counts: fine-grained steps for small k, coarser for large k
varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
#varying_num_topics = list(range(5,11))
Ejemplo n.º 4
0
ep-00-02-16.de""".split('\n')
# Corpus file IDs as expected by the NLTK europarl_raw corpus reader
# (all files live in the 'german/' subdirectory).
FILEIDS = ['german/' + f for f in FILES]

# Cache files for the document-term matrix and the fitted LDA model.
DTM_PICKLE = 'data/read_preproc_lda_de_dtm.pickle'
LDA_PICKLE = 'data/read_preproc_lda_de_lda.pickle'

# Verbose logging for tmtoolkit so preprocessing progress is visible.
logging.basicConfig(level=logging.DEBUG)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.DEBUG)
tmtoolkit_log.propagate = True

if __name__ == '__main__':  # this is necessary for multiprocessing on Windows!
    if os.path.exists(DTM_PICKLE):
        # Fast path: reuse the cached DTM instead of re-running preprocessing.
        print("loading DTM data from pickle file '%s'..." % DTM_PICKLE)

        pickled_data = unpickle_file(DTM_PICKLE)
        # sanity checks: DTM rows correspond to documents, columns to vocab
        assert pickled_data['dtm'].shape[0] == len(pickled_data['docnames'])
        assert pickled_data['dtm'].shape[1] == len(pickled_data['vocab'])

        dtm, vocab, doc_labels = pickled_data['dtm'], pickled_data[
            'vocab'], pickled_data['docnames']
    else:
        # Slow path: load the raw Europarl corpus via NLTK and build a
        # tmtoolkit Corpus from the raw document texts.
        europarl = nltk.corpus.util.LazyCorpusLoader(
            'europarl_raw', nltk.corpus.EuroparlCorpusReader, fileids=FILEIDS)

        corpus = Corpus(
            {f: europarl.raw(f_id)
             for f, f_id in zip(FILES, FILEIDS)})

        print("all loaded documents:")
        for dl, text in corpus.docs.items():
Ejemplo n.º 5
0
# Parse and validate the three positional command line arguments. The sibling
# evaluation script enforces the same bounds on these values; validate here
# too so a bad argument fails early instead of producing a wrong file name.
if len(sys.argv) != 4:
    print('run script as: %s  <tokens preprocessing pipeline> <eta> <alpha factor>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 0, 1 or 2')
    exit(1)

toks = int(sys.argv[1])     # preprocessing pipeline id (0, 1 or 2)
assert 0 <= toks <= 2
eta = float(sys.argv[2])    # LDA eta (beta) hyperparameter, in (0, 1)
assert 0 < eta < 1
alpha_mod = float(sys.argv[3])  # factor for the alpha hyperparameter
assert alpha_mod > 0

#%%

# Load the evaluation results that were computed for this parameter
# combination and group them by the number of topics.
picklefile = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (toks, eta, alpha_mod)
print('loading pickle file with evaluation results from `%s`' % picklefile)

eval_results = unpickle_file(picklefile)
eval_results_by_n_topics = results_by_parameter(eval_results, 'n_topics')

# number of evaluation metrics per result entry
n_metrics = len(eval_results_by_n_topics[0][1])

#%%

# Plot all metrics against the number of topics and save the figure.
fig, axes = plot_eval_results(eval_results_by_n_topics,
                              title='Evaluation results for alpha=%.2f/k, beta=%.2f' % (alpha_mod, eta),
                              xaxislabel='num. topics (k)')

plot_file_eval_res = 'fig/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.png' % (toks, eta, alpha_mod)
print('saving plot to file `%s`' % plot_file_eval_res)
plt.savefig(plot_file_eval_res)
plt.show()
Ejemplo n.º 6
0
# other parameters
BURNIN = 5  # with a default of refresh=10 this means 50 burnin iterations

# paths to data files
# (all file names are parameterized by the preprocessing pipeline id `toks`,
# which is defined earlier in this script)

DATA_PICKLE_DTM = 'data/speeches_tokens_%d.pickle' % toks       # input DTM
LDA_MODEL_PICKLE = 'data/model%d.pickle' % toks                 # fitted model output
LDA_MODEL_LL_PLOT = 'data/model%d_logliks.png' % toks           # log-likelihood plot
LDA_MODEL_EXCEL_OUTPUT = 'data/model%d_results.xlsx' % toks     # topic/word tables

#%% load
print('input tokens from preprocessing pipeline %d' % toks)

print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
# sanity checks: DTM rows correspond to documents, columns to vocabulary terms
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens' %
      (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

# Fit a single LDA model with the parameters assembled above (LDA_PARAMS is
# defined earlier in this script).
print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output
Ejemplo n.º 7
0
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tmtoolkit.utils import unpickle_file
from tmtoolkit.topicmod.model_stats import get_most_relevant_words_for_topic, get_topic_word_relevance, \
    get_doc_lengths, get_marginal_topic_distrib, exclude_topics


# widen pandas console output so topic/word tables are not wrapped
pd.set_option('display.width', 180)

#%% load data

# model and DTM
# The pickle stores a 4-tuple: document labels, vocabulary, document-term
# matrix and the fitted LDA model.
doc_labels, vocab, dtm, model = unpickle_file('data/model2.pickle')

n_docs, n_topics = model.doc_topic_.shape
_, n_vocab = model.topic_word_.shape

# sanity checks: model dimensions must be consistent with DTM and metadata
assert n_docs == len(doc_labels) == dtm.shape[0]
assert n_topics == model.topic_word_.shape[0]
assert n_vocab == len(vocab) == dtm.shape[1]

print('loaded model with %d documents, vocab size %d, %d tokens and %d topics'
      % (n_docs, n_vocab, dtm.sum(), n_topics))

# raw speeches
speeches_merged = unpickle_file('data/speeches_merged.pickle')

# TOPs data