def test_corpus_apply(texts):
    """Check that ``Corpus.apply`` transforms documents in place while keeping
    document labels and recorded lengths consistent.

    Builds a corpus from *texts* (labels are the stringified indices), applies
    ``str.upper`` to every document, and verifies:
      - ``apply`` returns a ``Corpus`` instance,
      - labels and lengths match the values captured before the call,
      - each document now equals the uppercased original text.
    """
    corpus = Corpus({str(i): txt for i, txt in enumerate(texts)})
    snapshot = corpus.copy()
    labels_before = corpus.doc_labels
    lengths_before = corpus.doc_lengths

    # apply() operates in place but should still hand back a Corpus
    assert isinstance(corpus.apply(str.upper), Corpus)
    assert corpus.doc_labels == labels_before
    assert corpus.doc_lengths == lengths_before

    # every transformed document must be the uppercased original
    for label, text in corpus.items():
        assert snapshot[label].upper() == text
# Apply the character translation table to every document in the corpus,
# then report which non-ASCII characters still remain.
print('replacing characters in each document of the corpus')
corpus.replace_characters(char_transl_table)

print('these non-ASCII characters are left:')
pprint(corpus.unique_characters - set(string.printable))

#%% Correct contractions

# Some contractions carry a stray space, e.g. "EU -Hilfen" where it should be
# "EU-Hilfen". Fix this by running a regular-expression substitution over each
# document: group 1 is the leading word, group 2 the stray whitespace,
# group 3 the hyphenated tail.
pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)')

print('correcting wrong contractions')

# For each match `m` in document text `t`, rebuild the contraction without
# the whitespace group (i.e. drop group 2).
corpus.apply(lambda t: pttrn_contraction_ws.sub(lambda m: m.group(1) + m.group(3), t))

#%% Create a TMPreproc object for token processing

# This step tokenizes all documents immediately, so it takes a while.
print('creating TMPreproc object from corpus')
preproc = TMPreproc(corpus, language='german')
print('created: %s' % preproc)

# The raw corpus is no longer needed; free the memory.
del corpus

#%% Calculate the total number of tokens in the whole corpus

print('total number of tokens in the whole corpus:')
print(sum(preproc.doc_lengths.values()))