pprint(preproc.vocabulary)
print('\nvocabulary contains %d tokens' % len(preproc.vocabulary))

#%% Display a keywords-in-context (KWIC) table

# the result is returned as a *datatable* (because it is much faster to construct)

print('keywords-in-context (KWIC) table for keyword "Merkel":')
print(preproc.get_kwic_table('Merkel'))

#%% Apply Part-of-Speech (POS) tagging and lemmatization to normalize the vocabulary

# this is computationally intensive and hence takes a long time, even when computed in parallel;
# consider storing / loading the processing state as shown below

preproc.pos_tag().lemmatize()

#%% Saving / loading state

# at any time you can save the current processing state to disk via `save_state(<path to file>)` and later
# restore it via `from_state(<path to file>)`
# this is extremely useful when you have computations that take a long time and after which you want to create
# "save points" in order to load the state and continue experimenting with the data without having to run the
# whole processing pipeline again

# preproc.save_state('data/bt18_tagged_lemmatized_state.pickle')
# preproc = TMPreproc.from_state('data/bt18_tagged_lemmatized_state.pickle')

#%% Further token normalization

# convert all tokens to lowercase and apply several "cleaning" methods (see `clean_tokens` for details)
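# a minimal sketch of this normalization step, reusing the `tokens_to_lowercase` and
# `clean_tokens` methods shown in the other examples; the `remove_shorter_than=2` argument
# is an assumption borrowed from the full pipeline further below
preproc.tokens_to_lowercase()
preproc.clean_tokens(remove_shorter_than=2)
pprint(preproc.vocabulary)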
    u'doc1': u'Ein einfaches Beispiel in einfachem Deutsch.',
    u'doc2': u'Es enthält nur drei sehr einfache Dokumente.',
    u'doc3': u'Die Dokumente sind sehr kurz.',
}

preproc = TMPreproc(corpus, language='german')

print('tokenized:')
preproc.tokenize()
pprint(preproc.tokens)

# preproc.stem()
# pprint(preproc.tokens)

print('POS tagged:')
preproc.pos_tag()
pprint(preproc.tokens_with_pos_tags)

print('lemmatized:')
preproc.lemmatize()
pprint(preproc.tokens_with_pos_tags)

print('lowercase:')
preproc.tokens_to_lowercase()
pprint(preproc.tokens)

print('cleaned:')
preproc.clean_tokens()
pprint(preproc.tokens_with_pos_tags)
pprint(preproc.tokens)
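# as a possible follow-up, the same small corpus can be turned into a document-term matrix;
# this is a sketch that reuses `get_dtm` exactly as it is called in the larger example below
print('document-term matrix:')
doc_labels, vocab, dtm = preproc.get_dtm()
print(doc_labels)
print(vocab)
print(dtm)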
    t for t in vocab
    if pttrn_token_w_specialchar.search(t)
]

uncommon_special_chars = set([pttrn_token_w_specialchar_inv.sub('', t) for t in tokens_w_specialchars])
uncommon_special_chars = set(sum([[c for c in cs] for cs in uncommon_special_chars], []))

print('detected the following uncommon special characters:')
for c in uncommon_special_chars:
    print('%04x' % ord(c))

print('running preprocessing pipeline...')

preproc.pos_tag() \
    .lemmatize() \
    .tokens_to_lowercase() \
    .remove_special_chars_in_tokens() \
    .clean_tokens(remove_shorter_than=2) \
    .remove_common_tokens(0.9) \
    .remove_uncommon_tokens(3, absolute=True)

print('retrieving tokens...')
tokens = preproc.tokens

print('generating DTM...')
doc_labels, vocab, dtm = preproc.get_dtm()

output_dtm_pickle = DATA_PICKLE_DTM % preproc_mode

print('writing DTM to `%s`...' % output_dtm_pickle)
pickle_data((doc_labels, vocab, dtm, tokens), output_dtm_pickle)

print('done.')
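#%% Optional: re-load the pickled DTM

# a hedged sketch, not part of the original script: `pickle_data` is assumed here to write a
# standard pickle file, so the tuple stored above, (doc_labels, vocab, dtm, tokens), can be
# restored with the standard library `pickle` module
import pickle

with open(output_dtm_pickle, 'rb') as f:
    doc_labels, vocab, dtm, tokens = pickle.load(f)

print('re-loaded DTM for %d documents and %d vocabulary tokens' % (len(doc_labels), len(vocab)))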