Exemple #1
0
    # Demo of the remaining preprocessing stages: each stage calls a method on
    # `preproc` and then pretty-prints the resulting token state.
    # NOTE(review): the enclosing definition and the creation of `preproc` are
    # outside this view. Return values of the stage methods are discarded, so
    # they presumably mutate `preproc` in place -- keep the call order as is.

    # Stage: part-of-speech tagging.
    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    # Stage: lemmatization (runs after POS tagging in this sequence).
    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)

    # Stage: lowercase conversion; only the plain tokens are printed here.
    print('lowercase:')
    preproc.tokens_to_lowercase()
    pprint(preproc.tokens)

    # Stage: token cleaning; both the tagged and the plain views are printed.
    print('cleaned:')
    preproc.clean_tokens()
    pprint(preproc.tokens_with_pos_tags)
    pprint(preproc.tokens)

    # Stage: filtering by a specific token and then by POS tag.
    # NOTE(review): 'N' presumably selects nouns, and remove_found_token=True
    # presumably drops the matched token itself -- confirm against the
    # preprocessor's API.
    print('filtered:')
    preproc.filter_for_token(u'einfach', remove_found_token=True)
    preproc.filter_for_pos('N')
    pprint(preproc.tokens_with_pos_tags)

    # Persist the current tokens to disk via the project's pickle helper.
    print('saving tokens as pickle...')
    pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')

    # Build the document-term matrix; here get_dtm() is unpacked into
    # document labels, vocabulary and the matrix itself.
    print('DTM:')
    doc_labels, vocab, dtm = preproc.get_dtm()

    # dtm.todense() implies a sparse matrix; it is densified only for display,
    # with documents as rows and vocabulary entries as columns.
    print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))
# Timing checkpoint for the clean_tokens step (the call being timed is outside
# this view).
# NOTE(review): add_timing is defined elsewhere; presumably it appends a
# timestamp to `timings` and the label to `timing_labels` -- confirm.
add_timing('clean_tokens')

# NOTE(review): 0.9 and 0.05 look like document-frequency thresholds --
# confirm against the preprocessor's API.
preproc.remove_common_tokens(0.9)
preproc.remove_uncommon_tokens(0.05)
add_timing('remove_common_tokens / remove_uncommon_tokens')

# The accesses below are timed individually, so each add_timing() call must
# stay directly after the statement it measures.
vocab = preproc.vocabulary
add_timing('get vocab')

tokens = preproc.tokens
add_timing('get tokens')

tokens_tagged = preproc.get_tokens(with_metadata=True, as_datatables=False)
add_timing('get tagged tokens')

dtm = preproc.get_dtm()
add_timing('get dtm')

# get_dtm() may hand back a 3-tuple instead of the bare matrix; in that case
# keep only the third element (the matrix) and discard the first two.
if isinstance(dtm, tuple):
    _, _, dtm = dtm

print('final DTM shape:')
print(dtm.shape)

# Report the time spent in each step as the difference between consecutive
# checkpoints.
print('timings:')
t_sum = 0
prev_t = None
for i, (t, label) in enumerate(zip(timings, timing_labels)):
    # The first checkpoint has no predecessor, so deltas start at i == 1.
    if i > 0:
        # .total_seconds() implies the checkpoints are datetime-like values.
        t_delta = (t - prev_t).total_seconds()
        print('%s: %.2f sec' % (label, t_delta))
        # NOTE(review): within the visible lines prev_t is never reassigned and
        # t_sum is never updated; the rest of the loop body presumably does
        # both but lies beyond this view -- confirm.