def test_corpus_apply(texts):
    """Check ``Corpus.apply`` with ``str.upper``.

    The call must hand back a ``Corpus``; afterwards the corpus must report
    the same document labels and lengths as before the call, and each
    document must equal the upper-cased text of the untouched copy.
    """
    corpus = Corpus({str(idx): text for idx, text in enumerate(texts)})
    untouched = corpus.copy()

    # Snapshot the metadata before the transformation is applied.
    labels_before = corpus.doc_labels
    lengths_before = corpus.doc_lengths

    result = corpus.apply(str.upper)
    assert isinstance(result, Corpus)

    assert corpus.doc_labels == labels_before
    assert corpus.doc_lengths == lengths_before

    # Compare every document against the upper-cased original text.
    for label, text in corpus.items():
        assert untouched[label].upper() == text
def test_corpus_copy(texts):
    """Check that ``Corpus.copy`` yields an equal but independent corpus.

    ``docs`` and ``doc_paths`` of the copy must be distinct objects that
    compare equal to the source's; ``doc_labels``, ``doc_lengths`` and
    ``unique_characters`` must compare equal.
    """
    source = Corpus({str(idx): text for idx, text in enumerate(texts)})
    duplicate = source.copy()

    # Containers that must be copied: different identity, same content.
    for attr in ('docs', 'doc_paths'):
        assert getattr(source, attr) is not getattr(duplicate, attr)
        assert getattr(source, attr) == getattr(duplicate, attr)

    # Remaining attributes only need to compare equal.
    for attr in ('doc_labels', 'doc_lengths', 'unique_characters'):
        assert getattr(source, attr) == getattr(duplicate, attr)
def test_corpus_filter_characters(texts):
    """Check ``Corpus.filter_characters`` for several sets of allowed characters.

    Scenarios covered:

    * allowing every character already in the corpus changes nothing;
    * allowing only printable characters absent from the corpus, or no
      characters at all, leaves every document with length 0;
    * allowing three randomly sampled corpus characters (passed as a set and
      as a string) keeps the labels, changes the lengths and leaves exactly
      those characters.
    """
    corpus = Corpus({str(idx): text for idx, text in enumerate(texts)})
    pristine = corpus.copy()

    labels_before = corpus.doc_labels
    lengths_before = corpus.doc_lengths
    chars_before = corpus.unique_characters

    # Allowing all characters that are present must be a no-op.
    filtered = corpus.filter_characters(chars_before)
    assert isinstance(filtered, Corpus)
    assert corpus.doc_labels == labels_before
    assert corpus.doc_lengths == lengths_before
    assert corpus.unique_characters == chars_before

    # Allowing only characters that never occur empties every document.
    absent_chars = set(string.printable) - chars_before
    if absent_chars:
        corpus.filter_characters(absent_chars)
        assert corpus.doc_labels == labels_before
        assert corpus.doc_lengths == dict.fromkeys(corpus.doc_labels, 0)
        assert corpus.unique_characters == set()

    # Allowing no characters at all empties every document as well.
    corpus = pristine.copy()
    corpus.filter_characters(set())
    assert corpus.doc_labels == labels_before
    assert corpus.doc_lengths == dict.fromkeys(corpus.doc_labels, 0)
    assert corpus.unique_characters == set()

    # The sampling scenarios need more than three distinct characters.
    if len(chars_before) <= 3:
        return

    # First pass the allowed characters as a set, then as a string.
    for as_string in (False, True):
        corpus = pristine.copy()
        only_chars = set(sample(list(chars_before), 3))
        if as_string:
            corpus.filter_characters(''.join(only_chars))  # as char sequence
        else:
            corpus.filter_characters(only_chars)
        assert corpus.doc_labels == labels_before
        assert corpus.doc_lengths != lengths_before
        assert corpus.unique_characters == only_chars