pprint(preproc.vocabulary)

print('\nvocabulary contains %d tokens' % len(preproc.vocabulary))


#%% Display a keywords-in-context (KWIC) table

# the result is returned as a *datatable* Frame (because it is much faster to construct than a pandas DataFrame)
print('keywords-in-context (KWIC) table for keyword "Merkel":')
print(preproc.get_kwic_table('Merkel'))
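# optionally convert the result to a pandas DataFrame for further inspection
# (a sketch, assuming the datatable package is installed so that the result is a
#  datatable Frame with a `to_pandas()` method, and that pandas is available)
# kwic_df = preproc.get_kwic_table('Merkel').to_pandas()
# print(kwic_df.head())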

#%% Apply Part-of-Speech tagging (POS tagging) and lemmatization to normalize the vocabulary

# this is computationally expensive and hence takes a long time, even when run in parallel
# consider storing / loading the processing state as shown below
preproc.pos_tag().lemmatize()
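
# the vocabulary should now be smaller, because lemmatization maps inflected word
# forms to a common base form
print('vocabulary contains %d tokens after lemmatization' % len(preproc.vocabulary))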

#%% Saving / loading state

# at any time you can save the current processing state to disk via `save_state(<path to file>)` and later
# restore it via `from_state(<path to file>)`
# this is extremely useful for long-running computations: you can create "save points" from which
# you can restore the state later and continue experimenting with the data without having to run
# the whole processing pipeline again

# preproc.save_state('data/bt18_tagged_lemmatized_state.pickle')
# preproc = TMPreproc.from_state('data/bt18_tagged_lemmatized_state.pickle')
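
# a minimal sketch of such a "save point": only run the expensive tagging and lemmatization
# when no saved state exists yet (using the same file path as above)
#
# import os
# if os.path.exists('data/bt18_tagged_lemmatized_state.pickle'):
#     preproc = TMPreproc.from_state('data/bt18_tagged_lemmatized_state.pickle')
# else:
#     preproc.pos_tag().lemmatize()
#     preproc.save_state('data/bt18_tagged_lemmatized_state.pickle')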

#%% Further token normalization

# convert all tokens to lowercase and apply several "cleaning" methods (see `clean_tokens` for details)
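# a minimal sketch of this step, chaining the same methods that are also used in the examples below
preproc.tokens_to_lowercase().clean_tokens()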
Example #2

from pprint import pprint
from tmtoolkit.preprocess import TMPreproc   # imports assumed to make the snippet self-contained

corpus = {
    u'doc1': u'Ein einfaches Beispiel in einfachem Deutsch.',
    u'doc2': u'Es enthält nur drei sehr einfache Dokumente.',
    u'doc3': u'Die Dokumente sind sehr kurz.',
}

preproc = TMPreproc(corpus, language='german')

print('tokenized:')
preproc.tokenize()
pprint(preproc.tokens)

# preproc.stem()
# pprint(preproc.tokens)

print('POS tagged:')
preproc.pos_tag()
pprint(preproc.tokens_with_pos_tags)

print('lemmatized:')
preproc.lemmatize()
pprint(preproc.tokens_with_pos_tags)

print('lowercase:')
preproc.tokens_to_lowercase()
pprint(preproc.tokens)

print('cleaned:')
preproc.clean_tokens()
pprint(preproc.tokens_with_pos_tags)
pprint(preproc.tokens)
Example #3

# `preproc`, `vocab` and the `pttrn_token_w_specialchar*` regexes are defined earlier
# in the original script; find all vocabulary tokens containing special characters
tokens_w_specialchars = [
    t for t in vocab if pttrn_token_w_specialchar.search(t)
]

# strip the regular characters from these tokens and collect the remaining
# special characters as a set of single characters
uncommon_special_chars = set(
    [pttrn_token_w_specialchar_inv.sub('', t) for t in tokens_w_specialchars])
uncommon_special_chars = set(
    sum([[c for c in cs] for cs in uncommon_special_chars], []))

print('detected the following uncommon special characters:')
for c in uncommon_special_chars:
    print('%04x' % ord(c))

print('running preprocessing pipeline...')
# POS tag and lemmatize the tokens, normalize them to lowercase, strip special characters,
# drop tokens shorter than 2 characters, then remove very common and very rare tokens
# based on their document frequency
preproc.pos_tag()\
       .lemmatize()\
       .tokens_to_lowercase()\
       .remove_special_chars_in_tokens()\
       .clean_tokens(remove_shorter_than=2)\
       .remove_common_tokens(0.9)\
       .remove_uncommon_tokens(3, absolute=True)

print('retrieving tokens...')
tokens = preproc.tokens

print('generating DTM...')
doc_labels, vocab, dtm = preproc.get_dtm()
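# quick sanity check on the DTM dimensions (assuming the DTM is returned as a matrix
# object with a `shape` attribute, e.g. a sparse matrix)
print('DTM shape: %d documents x %d vocabulary tokens' % dtm.shape)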

# `DATA_PICKLE_DTM` and `preproc_mode` are defined earlier in the original script
output_dtm_pickle = DATA_PICKLE_DTM % preproc_mode

print('writing DTM to `%s`...' % output_dtm_pickle)
pickle_data((doc_labels, vocab, dtm, tokens), output_dtm_pickle)
print('done.')
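
# the pickled DTM can later be restored with the standard library, e.g. for model training
# (a sketch, assuming `pickle_data` from `tmtoolkit.utils` wrote an ordinary pickle file)
#
# import pickle
# with open(output_dtm_pickle, 'rb') as f:
#     doc_labels, vocab, dtm, tokens = pickle.load(f)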