def test_corpus_from_tabular():
    """Load 100 news articles from CSV and Excel and verify merged additions."""
    for file_ext in ('csv', 'xlsx'):
        path = 'tests/data/100NewsArticles.' + file_ext
        corp = Corpus.from_tabular(path, 'article_id', 'text')
        assert len(corp.docs) == 100
        assert all(label.startswith('100NewsArticles') for label in corp.doc_labels)

        # adding the same file again with prepended columns doubles the corpus
        corp.add_tabular(path, 'article_id', 'text',
                         prepend_columns=['title', 'subtitle'],
                         doc_label_fmt='added-{id}')
        assert len(corp.docs) == 200

        added_count = 0
        for label, n_chars in corp.doc_lengths.items():
            if not label.startswith('added'):
                continue
            added_count += 1
            _, doc_id = label.split('-')
            # prepending title/subtitle can only make a document longer
            assert n_chars >= corp.doc_lengths['100NewsArticles-' + doc_id]
        assert added_count == 100

    assert len(Corpus.from_tabular('tests/data/bt18_speeches_sample.csv', 0, 2)) == 1000
def test_corpus_replace_characters_simple():
    """Character replacement: deletion via None, str and codepoint targets, maketrans tables."""
    corp = Corpus({'doc1': 'ABC', 'doc2': 'abcDeF'})

    # mixed mapping: delete 'a', replace 'C'->'c' and 'e'->'X'
    corp.replace_characters({'a': None, 'C': 'c', 'e': ord('X')})
    assert corp.docs == {'doc1': 'ABc', 'doc2': 'bcDXF'}

    # mapping keyed by codepoint also deletes
    corp.replace_characters({ord('A'): None})
    assert corp.docs == {'doc1': 'Bc', 'doc2': 'bcDXF'}

    # a table built with str.maketrans works too
    corp.replace_characters(str.maketrans('DXFY', '1234'))
    assert corp.docs == {'doc1': 'Bc', 'doc2': 'bc123'}

    # an empty table is a no-op
    corp.replace_characters({})
    assert corp.docs == {'doc1': 'Bc', 'doc2': 'bc123'}
def test_corpus_builtin_corpora_expected_count():
    """Exactly two built-in corpora are shipped, each loading non-empty.

    NOTE(review): this function originally shared its name with another
    `test_corpus_builtin_corpora` defined later in the file, so this earlier
    definition was shadowed and never collected by pytest. Renamed so both
    tests run.
    """
    builtin_corp = Corpus.builtin_corpora()
    assert len(builtin_corp) == 2
    for corp in builtin_corp:
        c = Corpus.from_builtin_corpus(corp)
        assert len(c) > 0
def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    """Evaluate topic models over a range of topic counts and save a metrics plot.

    Parameters
    ----------
    file_name :
        Corpus file(s) passed to ``Corpus.add_files``.
    date : str
        Date tag embedded in the output file name.
    n_iter : int
        Number of sampling iterations per evaluated model.
    scope : str
        Tag embedded in the output file name.
    lang : str
        Tag embedded in the output file name.
    n_eval : int, optional
        Step size for the evaluated topic counts (range 5 .. n_eval*10).

    Side effects: writes a PNG plot to the ``out/`` directory.
    """
    # build the corpus and derive the document-term matrix
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # evaluate a range of topic counts; fixed seed keeps results reproducible
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }
    eval_results = evaluate_topic_models(
        dtm_bg,
        varying_parameters=var_params,
        constant_parameters=const_params,
        metric=['loglikelihood', 'cao_juan_2009', 'arun_2010']
    )
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    # plot the evaluation metrics and save the figure
    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(
        date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6),
                      metric_direction_font_size='x-small',
                      title_fontsize='small',
                      axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
def test_corpus_builtin_corpora():
    """Every registered built-in corpus is listed and loads as a non-empty corpus."""
    available = Corpus.builtin_corpora()
    registered = Corpus._BUILTIN_CORPORA_LOAD_KWARGS.keys()
    assert sorted(available) == sorted(registered)
    for corpname in available:
        loaded = Corpus.from_builtin_corpus(corpname)
        assert len(loaded) > 0
def test_corpus_from_pickle():
    """Round-trip a corpus through pickle serialization."""
    original = Corpus({'a': '1', 'b': '22', 'c': '333'})
    with tempfile.TemporaryFile(suffix='.pickle') as f:
        original.to_pickle(f)
        f.seek(0)  # rewind before reading the pickle back
        restored = Corpus.from_pickle(f)
    assert original.docs == restored.docs
def test_corpus_pass_tmpreproc():
    """A Corpus built via item assignment feeds into TMPreproc tokenization."""
    c = Corpus()
    c['doc1'] = 'A simple example in simple English.'
    c['doc2'] = 'It contains only three very simple documents.'
    c['doc3'] = 'Simply written documents are very brief.'

    tokens = TMPreproc(c).tokenize().tokens
    assert set(tokens.keys()) == set(c.keys())
    assert len(tokens['doc1']) == 7
def test_corpus_from_folder_valid_ext():
    """Only files whose extension matches valid_extensions are loaded."""
    def n_loaded(exts):
        # helper: number of documents loaded for a given extension filter
        return len(Corpus.from_folder('examples/data/gutenberg',
                                      valid_extensions=exts).docs)

    assert n_loaded('txt') == 3
    assert n_loaded('foo') == 0
    assert n_loaded(('foo', 'txt')) == 3
def test_corpus_copy(texts):
    """copy() yields independent containers holding equal contents."""
    original = Corpus({str(i): t for i, t in enumerate(texts)})
    clone = original.copy()

    # dict containers are fresh objects but compare equal
    assert clone.docs is not original.docs
    assert clone.docs == original.docs
    assert clone.doc_paths is not original.doc_paths
    assert clone.doc_paths == original.doc_paths

    # derived properties agree as well
    assert clone.doc_labels == original.doc_labels
    assert clone.doc_lengths == original.doc_lengths
    assert clone.unique_characters == original.unique_characters
def test_corpus_apply(texts):
    """apply() transforms each document text in place and returns the Corpus."""
    corp = Corpus({str(i): t for i, t in enumerate(texts)})
    snapshot = corp.copy()
    labels_before = corp.doc_labels
    lengths_before = corp.doc_lengths

    assert isinstance(corp.apply(str.upper), Corpus)

    # labels and lengths are unaffected by upper-casing
    assert corp.doc_labels == labels_before
    assert corp.doc_lengths == lengths_before
    for label, text in corp.items():
        assert snapshot[label].upper() == text
def test_corpus_filter_by_min_length_no_linebreak_conversion():
    """filter_by_min_length() drops too-short docs in place (no Unix linebreak conversion).

    NOTE(review): renamed — this test originally shared its name with
    `test_corpus_filter_by_min_length` defined later in the file, so this
    earlier definition was shadowed and never collected by pytest.
    """
    c = Corpus.from_folder('examples/data/gutenberg', force_unix_linebreaks=False)
    assert len(c.filter_by_min_length(1).docs) == 3
    assert len(c.filter_by_min_length(142694).docs) == 1
    assert len(c.filter_by_min_length(142695).docs) == 0
    # filtering mutates the corpus, so even the loosest filter now matches nothing
    assert len(c.filter_by_min_length(1).docs) == 0
def test_corpus_unique_characters(texts):
    """unique_characters is the set of all characters across all documents."""
    expected = set(''.join(texts))
    corp = Corpus({str(i): t for i, t in enumerate(texts)})

    observed = corp.unique_characters
    assert isinstance(observed, set)
    assert observed == expected
def test_corpus_filter_by_max_length_no_linebreak_conversion():
    """filter_by_max_length() drops too-long docs in place (no Unix linebreak conversion).

    NOTE(review): renamed — this test originally shared its name with
    `test_corpus_filter_by_max_length` defined later in the file, so this
    earlier definition was shadowed and never collected by pytest.
    """
    c = Corpus.from_folder('tests/data/gutenberg', force_unix_linebreaks=False)
    assert len(c.filter_by_max_length(999999).docs) == 3
    assert len(c.filter_by_max_length(142694).docs) == 3
    assert len(c.filter_by_max_length(142693).docs) == 2
    assert len(c.filter_by_max_length(0).docs) == 0
    # filtering mutates the corpus, so the loosest filter now matches nothing
    assert len(c.filter_by_max_length(999999).docs) == 0
def test_corpus_filter_by_max_length():
    """Successive filter_by_max_length() calls shrink the corpus in place."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    # (max_length, expected remaining docs); the corpus shrinks cumulatively
    expectations = [(999999, 3), (142694, 3), (142693, 2), (0, 0), (999999, 0)]
    for max_len, n_expected in expectations:
        assert len(corp.filter_by_max_length(max_len).docs) == n_expected
def test_corpus_from_files():
    """from_files() and add_files() load the same single document with its path."""
    doc_path = 'examples/data/gutenberg/kafka_verwandlung.txt'
    via_classmethod = Corpus.from_files([doc_path])
    via_add = Corpus().add_files([doc_path])

    assert len(via_classmethod.docs) == len(via_classmethod.doc_paths) == 1
    assert len(via_add.docs) == len(via_add.doc_paths) == 1
    assert (via_classmethod.docs.keys() == via_add.docs.keys()
            == via_classmethod.doc_paths.keys() == via_add.doc_paths.keys())

    label = next(iter(via_classmethod.docs.keys()))
    assert label.endswith('kafka_verwandlung')
    assert len(via_classmethod.docs[label]) > 0
    assert via_classmethod.doc_paths[label] == doc_path
def test_corpus_split_by_paragraphs_rejoin():
    """split_by_paragraphs(join_paragraphs=5) yields at least as many docs as before."""
    # TODO: better tests here
    unsplit = Corpus.from_folder('tests/data/gutenberg', doc_label_fmt='{basename}')
    to_split = Corpus.from_folder('tests/data/gutenberg', doc_label_fmt='{basename}')

    orig_docs = unsplit.docs
    joined_docs = to_split.split_by_paragraphs(join_paragraphs=5).docs
    assert len(joined_docs) >= len(orig_docs)

    for basename in orig_docs:
        assert basename in ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')
        # every original document maps to at least one (joined) paragraph doc
        matching = [joined_docs[k] for k in sorted(joined_docs.keys())
                    if k.startswith(basename)]
        assert len(matching) > 0
def test_corpus_from_files2():
    """from_files() with two explicit paths loads both Werther volumes."""
    corp = Corpus.from_files([
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'examples/data/gutenberg/werther/goethe_werther2.txt'
    ])
    assert len(corp.docs) == len(corp.doc_paths) == 2
    for label, text in corp.docs.items():
        # labels end with 'goethe_werther1' / 'goethe_werther2'
        assert label[:-1].endswith('goethe_werther')
        assert len(text) > 0
def test_corpus_sample_modes():
    """sample() returns a Corpus or dict and only mutates with inplace=True.

    NOTE(review): renamed — this test originally shared its name with
    `test_corpus_sample` defined later in the file, so this earlier
    definition was shadowed and never collected by pytest.
    """
    c = Corpus.from_folder('tests/data/gutenberg')
    n_docs_orig = c.n_docs

    # default: returns a new Corpus, source untouched
    sampled_docs = c.sample(2)
    assert isinstance(sampled_docs, Corpus)
    assert len(sampled_docs) == 2
    assert c.n_docs == n_docs_orig

    # as_corpus=False: returns a plain dict, source untouched
    sampled_docs = c.sample(2, as_corpus=False)
    assert isinstance(sampled_docs, dict)
    assert len(sampled_docs) == 2
    assert c.n_docs == n_docs_orig

    # inplace=True: shrinks the corpus itself
    assert isinstance(c.sample(2, inplace=True), Corpus)
    assert c.n_docs == 2
def test_corpus_split_by_paragraphs():
    """split_by_paragraphs() expands each document into its paragraphs, keeping paths."""
    corp = Corpus.from_folder('tests/data/gutenberg', doc_label_fmt='{basename}')
    docs_before = corp.docs
    paths_before = corp.doc_paths

    corp.split_by_paragraphs()
    par_docs = corp.docs
    assert len(par_docs) >= len(docs_before)
    # paragraph docs still map back to the same set of original file paths
    assert len(set(paths_before.values())) == len(set(corp.doc_paths.values()))

    for basename, fulltext in docs_before.items():
        assert basename in ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')
        pars = [par_docs[k] for k in sorted(par_docs.keys())
                if k.startswith(basename)]
        assert len(pars) > 0
        # paragraphs from the split corpus match those computed directly
        expected_pars = paragraphs_from_lines(fulltext)
        assert len(expected_pars) == len(pars)
        assert set(expected_pars) == set(pars)
def test_corpus_dict_methods():
    """Corpus supports the dict protocol with validation on labels and texts."""
    corp = Corpus()
    assert len(corp) == 0

    # reading a missing label raises KeyError
    with pytest.raises(KeyError):
        x = corp['x']
    # labels must be non-empty strings
    with pytest.raises(KeyError):
        corp[1] = 'abc'
    with pytest.raises(KeyError):
        corp[''] = 'abc'
    # document text must be a string
    with pytest.raises(ValueError):
        corp['d1'] = None

    corp['d1'] = 'd1 text'
    assert len(corp) == 1
    assert 'd1' in corp
    assert set(corp.keys()) == {'d1'}
    assert corp['d1'] == 'd1 text'

    corp['d2'] = 'd2 text'
    assert len(corp) == 2
    for label in corp:
        assert label in {'d1', 'd2'}
    assert set(corp.keys()) == {'d1', 'd2'}
    for label, text in corp.items():
        assert label in {'d1', 'd2'}
        assert corp[label] == text

    # deleting a missing label raises; deleting existing labels shrinks the corpus
    with pytest.raises(KeyError):
        del corp['d3']
    del corp['d1']
    assert len(corp) == 1
    assert set(corp.keys()) == {'d2'}
    del corp['d2']
    assert len(corp) == 0
    assert set(corp.keys()) == set()
def test_corpus_from_folder_not_existent():
    """Loading from a missing folder raises IOError."""
    with pytest.raises(IOError):
        Corpus.from_folder('not_existent')
def test_corpus_filter_characters(texts):
    """filter_characters() keeps only the allowed characters in every document."""
    corp = Corpus({str(i): t for i, t in enumerate(texts)})
    backup = corp.copy()
    labels = corp.doc_labels
    lengths = corp.doc_lengths
    uniq = corp.unique_characters

    # keeping every present character is a no-op and returns the Corpus
    assert isinstance(corp.filter_characters(uniq), Corpus)
    assert corp.doc_labels == labels
    assert corp.doc_lengths == lengths
    assert corp.unique_characters == uniq

    # keeping only absent characters empties every document
    absent = set(string.printable) - uniq
    if len(absent) > 0:
        corp.filter_characters(absent)
        assert corp.doc_labels == labels
        assert corp.doc_lengths == {lbl: 0 for lbl in corp.doc_labels}
        assert corp.unique_characters == set()

    # an empty allow-set also empties every document
    corp = backup.copy()
    corp.filter_characters(set())
    assert corp.doc_labels == labels
    assert corp.doc_lengths == {lbl: 0 for lbl in corp.doc_labels}
    assert corp.unique_characters == set()

    if len(uniq) > 3:
        # allow-set given as a set of characters
        corp = backup.copy()
        keep = set(sample(list(uniq), 3))
        corp.filter_characters(keep)
        assert corp.doc_labels == labels
        assert corp.doc_lengths != lengths
        assert corp.unique_characters == keep

        # allow-set given as a plain character sequence
        corp = backup.copy()
        keep = set(sample(list(uniq), 3))
        corp.filter_characters(''.join(keep))  # as char sequence
        assert corp.doc_labels == labels
        assert corp.doc_lengths != lengths
        assert corp.unique_characters == keep
def test_corpus_from_files_not_existent():
    """from_files() raises IOError when any listed file is missing."""
    paths_with_missing = [
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'not_existent'
    ]
    with pytest.raises(IOError):
        Corpus.from_files(paths_with_missing)
def test_corpus_from_files_nonlist_arg():
    """from_files() rejects a plain string in place of a list of paths."""
    with pytest.raises(ValueError):
        Corpus.from_files('wrong')
def test_corpus_get_doc_labels():
    """get_doc_labels() returns exactly the keys of the docs dict."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    assert set(corp.get_doc_labels()) == set(corp.docs.keys())
def test_corpus_add_doc():
    """add_doc() validates labels and texts and rejects duplicate labels."""
    corp = Corpus()

    # invalid labels: empty string, non-string
    with pytest.raises(ValueError):
        corp.add_doc('', 'x')
    with pytest.raises(ValueError):
        corp.add_doc(123, 'x')
    # invalid text: None
    with pytest.raises(ValueError):
        corp.add_doc('d1', None)

    corp.add_doc('d1', 'd1 text')
    # adding the same label twice is rejected
    with pytest.raises(ValueError):
        corp.add_doc('d1', 'd1 text')

    # empty text is allowed
    corp.add_doc('d2', '')
    assert set(corp.keys()) == {'d1', 'd2'}
def test_corpus_sample():
    """sample(n) returns a corpus containing n documents."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    sampled = corp.sample(2)
    assert len(sampled.docs) == 2
def test_empty_corpora():
    """All empty construction paths yield an empty docs dict."""
    plain = Corpus()
    from_empty_list = Corpus.from_files([])
    chained = Corpus.from_files([]).add_files([])
    assert plain.docs == from_empty_list.docs == chained.docs == {}
def test_corpus_filter_by_min_length():
    """Successive filter_by_min_length() calls shrink the corpus in place."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    # (min_length, expected remaining docs); the corpus shrinks cumulatively
    expectations = [(1, 3), (142694, 1), (142695, 0), (1, 0)]
    for min_len, n_expected in expectations:
        assert len(corp.filter_by_min_length(min_len).docs) == n_expected
def test_corpus_from_zip():
    """from_zip() loads tabular and raw-text members from a ZIP archive."""
    corp = Corpus.from_zip('tests/data/zipdata.zip',
                           id_column='article_id', text_column='text')
    labels = list(corp.doc_labels)
    # 100 news articles from the tabular member, one raw text file
    assert sum(lbl.startswith('100NewsArticles-') for lbl in labels) == 100
    assert sum(lbl == 'german-goethe_werther1' for lbl in labels) == 1