Example #1
    preproc.tokenize()
    pprint(preproc.tokens)

    # stemming is a faster but cruder alternative to lemmatization; uncomment to try it:
    # preproc.stem()
    # pprint(preproc.tokens)

    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)

    print('lowercase:')
    preproc.tokens_to_lowercase()
    pprint(preproc.tokens)

    print('cleaned:')
    preproc.clean_tokens()
    pprint(preproc.tokens_with_pos_tags)
    pprint(preproc.tokens)

    print('filtered:')
    # filter for the token 'einfach' (German for "simple"), removing the matched token itself,
    # then filter for nouns (POS tag prefix 'N')
    preproc.filter_for_token('einfach', remove_found_token=True)
    preproc.filter_for_pos('N')
    pprint(preproc.tokens_with_pos_tags)

    print('saving tokens as pickle...')
    pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')
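
    # a minimal sketch for loading the pickled tokens again later, assuming
    # `unpickle_file` from `tmtoolkit.utils` as the counterpart to `pickle_data`:
    # from tmtoolkit.utils import unpickle_file
    # tokens = unpickle_file('data/preproc_gen_dtm_de_tokens.pickle')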
#%% Saving / loading state

# at any time, you can save the current processing state to disk via `save_state(<path to file>)` and later
# restore it via `from_state(<path to file>)`
# this is extremely useful for long-running computations: you can create "save points" after expensive
# steps and later load such a state to continue experimenting with the data without having to run the
# whole processing pipeline again

# preproc.save_state('data/bt18_tagged_lemmatized_state.pickle')
# preproc = TMPreproc.from_state('data/bt18_tagged_lemmatized_state.pickle')

#%% Further token normalization

# convert all tokens to lowercase, apply several "cleaning" methods (see `clean_tokens` for details)
# and remove all tokens that start with a hyphen
print('applying further token normalization')
preproc.tokens_to_lowercase().clean_tokens().remove_tokens(r'^-.+', match_type='regex')

print('vocabulary:')
pprint(preproc.vocabulary)

print('\nvocabulary contains %d tokens' % len(preproc.vocabulary))

# there are still some stray tokens which should be removed:
preproc.remove_tokens(['#en', "''", "'s", '+++', '+40', ',50', '...', '.plädieren'])
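
# re-check the vocabulary size to verify the cleanup (re-uses `preproc.vocabulary` from above):
print('vocabulary contains %d tokens after removing stray tokens' % len(preproc.vocabulary))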

#%% Let's have a look at the most frequent tokens

print('retrieving relative document frequencies for all tokens in the vocabulary')
vocab_doc_freq = preproc.vocabulary_rel_doc_frequency
vocab_doc_freq_df = pd.DataFrame({'token': list(vocab_doc_freq.keys()),
                                  'freq': list(vocab_doc_freq.values())})
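
# a minimal sketch to inspect the most frequent tokens: sort the data frame by relative
# document frequency in descending order and display the top rows (plain pandas, no
# additional assumptions about the tmtoolkit API)
vocab_doc_freq_df = vocab_doc_freq_df.sort_values('freq', ascending=False)
print(vocab_doc_freq_df.head(20))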