Example #1
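# assumptions for this example: preproc is a preprocessing pipeline object
# (judging by the methods used, a tmtoolkit TMPreproc instance) that was
# already created with the input documents, and add_timing() is a timing
# helper defined elsewhere in the script (a possible sketch follows this example)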
preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc.remove_special_chars_in_tokens()
add_timing('remove_special_chars_in_tokens')

preproc.tokens_to_lowercase()
add_timing('tokens_to_lowercase')

preproc.clean_tokens()
add_timing('clean_tokens')

preproc.remove_common_tokens(0.9)      # remove tokens with a relative document frequency above 0.9
preproc.remove_uncommon_tokens(0.05)   # remove tokens with a relative document frequency below 0.05
add_timing('remove_common_tokens / remove_uncommon_tokens')

vocab = preproc.vocabulary
add_timing('get vocab')

tokens = preproc.tokens
add_timing('get tokens')

tokens_tagged = preproc.get_tokens(with_metadata=True, as_datatables=False)
add_timing('get tagged tokens')

dtm = preproc.get_dtm()
add_timing('get dtm')
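
The add_timing(...) calls above rely on a small timing helper that is defined elsewhere in the original script. A minimal sketch of what such a helper might look like, assuming it simply records the time elapsed since the previous step under the given label (an illustration, not the original implementation):

import time

timings = []           # list of (step label, seconds elapsed since the previous step)
_t_prev = time.time()

def add_timing(label):
    global _t_prev
    t_now = time.time()
    timings.append((label, t_now - _t_prev))
    _t_prev = t_now
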
Example #2
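# assumptions for this example: preproc is the same kind of preprocessing
# pipeline object as in Example #1, and vocab_doc_freq is assumed to be a dict
# mapping each token in the vocabulary to its (relative) document frequency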
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

vocab_doc_freq_df = pd.DataFrame({'token': list(vocab_doc_freq.keys()),
                                  'freq': list(vocab_doc_freq.values())})

print('top 50 tokens by relative document frequency:')
vocab_top = vocab_doc_freq_df.sort_values('freq', ascending=False).head(50)
print(vocab_top)

# plot the top 50 tokens as a bar chart
plt.figure()
vocab_top.plot(x='token', y='freq', kind='bar')
plt.show()

#%% Further token cleanup

# we can remove tokens above a certain threshold of (relative or absolute) document frequency
preproc.remove_common_tokens(0.8)   # this will only remove "müssen"

# since we'll later use tf-idf, common words don't have much influence on the result and can remain
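
# a minimal sketch of that later tf-idf weighting step, shown here with
# scikit-learn's TfidfTransformer as one possible implementation (an assumption;
# the original script may compute tf-idf differently); dtm is the sparse
# document-term matrix obtained in Example #1
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_dtm = TfidfTransformer().fit_transform(dtm)   # tf-idf weighted matrix, same shape as dtm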

#%% Document lengths (number of tokens per document)

doc_labels = np.array(list(preproc.doc_lengths.keys()))
doc_lengths = np.array(list(preproc.doc_lengths.values()))

print('range of document lengths: %d tokens minimum, %d tokens maximum' % (np.min(doc_lengths), np.max(doc_lengths)))
print('mean document length:', np.mean(doc_lengths))
print('median document length:', np.median(doc_lengths))

plt.figure()
plt.hist(doc_lengths, bins=100)
plt.title('Histogram of document lengths')