# Demo pipeline: apply each preprocessing step to `preproc` in turn and print
# the resulting state right after it, so the effect of every step is visible.

# --- POS tagging ------------------------------------------------------------
print('POS tagged:')
preproc.pos_tag()
pprint(preproc.tokens_with_pos_tags)

# --- Lemmatization ----------------------------------------------------------
print('lemmatized:')
preproc.lemmatize()
pprint(preproc.tokens_with_pos_tags)

# --- Lowercasing (only the plain tokens are shown here) ---------------------
print('lowercase:')
preproc.tokens_to_lowercase()
pprint(preproc.tokens)

# --- Cleaning (show the tagged view first, then the plain tokens) -----------
print('cleaned:')
preproc.clean_tokens()
pprint(preproc.tokens_with_pos_tags)
pprint(preproc.tokens)

# --- Filtering --------------------------------------------------------------
# Two filters are applied back to back before anything is printed.
# NOTE(review): presumably the first filters on the token 'einfach' (dropping
# the matched token itself) and the second keeps a POS class 'N' -- confirm
# against the preprocessor's API.
print('filtered:')
preproc.filter_for_token(u'einfach', remove_found_token=True)
preproc.filter_for_pos('N')
pprint(preproc.tokens_with_pos_tags)

# --- Persist the processed tokens -------------------------------------------
print('saving tokens as pickle...')
pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')

# --- Document-term matrix ---------------------------------------------------
# get_dtm() is unpacked into row labels, column labels and the matrix itself;
# the matrix is densified only so it can be displayed as a labelled table.
print('DTM:')
doc_labels, vocab, dtm = preproc.get_dtm()
dtm_frame = pd.DataFrame(dtm.todense(), index=doc_labels, columns=vocab)
print(dtm_frame)
# Benchmark tail: each add_timing(label) call marks the end of the step(s)
# directly above it (add_timing is defined elsewhere in this script).
add_timing('clean_tokens')

# Both frequency-based filters are timed together as a single step.
preproc.remove_common_tokens(0.9)
preproc.remove_uncommon_tokens(0.05)
add_timing('remove_common_tokens / remove_uncommon_tokens')

# The following values are fetched mainly so that retrieving them is timed.
vocab = preproc.vocabulary
add_timing('get vocab')

tokens = preproc.tokens
add_timing('get tokens')

tokens_tagged = preproc.get_tokens(with_metadata=True, as_datatables=False)
add_timing('get tagged tokens')

dtm = preproc.get_dtm()
add_timing('get dtm')

# get_dtm() may hand back a 3-tuple; in that case keep only its last element,
# which is the matrix whose shape is reported below.
if isinstance(dtm, tuple):
    _, _, dtm = dtm

print('final DTM shape:')
print(dtm.shape)

# Report the time spent between consecutive timing marks.
print('timings:')
prev_t = None
t_sum = 0

# NOTE(review): in the visible part of this loop `prev_t` is never reassigned
# and `t_sum` is never accumulated. Presumably the statements doing so follow
# this excerpt -- confirm; otherwise `t - prev_t` fails on the second
# iteration because `prev_t` is still None.
for i, (t, label) in enumerate(zip(timings, timing_labels)):
    # The first mark has no predecessor, so there is no duration to print.
    if i > 0:
        t_delta = (t - prev_t).total_seconds()
        print('%s: %.2f sec' % (label, t_delta))