def test_corpus_split_by_paragraphs():
    """Check that ``Corpus.split_by_paragraphs`` splits each document into
    the same paragraphs that ``paragraphs_from_lines`` yields for it.

    The corpus is loaded from ``tests/data/gutenberg`` with the file basename
    as document label. After splitting, the number of documents must not
    shrink, the number of distinct document paths must stay the same, and for
    every original document the paragraph documents whose label starts with
    the original label must match ``paragraphs_from_lines`` in count and
    content (compared as sets).
    """
    expected_labels = ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')

    corpus = Corpus.from_folder('tests/data/gutenberg', doc_label_fmt='{basename}')

    # Snapshot the state before the corpus is split.
    docs_before = corpus.docs
    paths_before = corpus.doc_paths

    corpus.split_by_paragraphs()
    docs_after = corpus.docs

    assert len(docs_after) >= len(docs_before)

    n_distinct_paths_before = len(set(paths_before.values()))
    n_distinct_paths_after = len(set(corpus.doc_paths.values()))
    assert n_distinct_paths_before == n_distinct_paths_after

    # The split result is not modified below, so sorting once is enough.
    sorted_par_labels = sorted(docs_after.keys())

    for label, text in docs_before.items():
        assert label in expected_labels

        # Collect the paragraph documents that were derived from this document.
        split_pars = []
        for par_label in sorted_par_labels:
            if par_label.startswith(label):
                split_pars.append(docs_after[par_label])
        assert len(split_pars) > 0

        reference_pars = paragraphs_from_lines(text)
        assert len(reference_pars) == len(split_pars)
        assert set(reference_pars) == set(split_pars)
def test_paragraphs_from_lines_already_split_hypothesis(lines):
    """Property check for ``paragraphs_from_lines`` on pre-split input.

    ``lines`` is passed with ``splitchar=None``. There can never be more
    paragraphs than input lines, and no returned paragraph may be empty.

    NOTE(review): ``lines`` is presumably generated by a hypothesis strategy
    or provided by a fixture that is not visible in this section -- confirm.
    """
    paragraphs = paragraphs_from_lines(lines, splitchar=None)

    assert len(paragraphs) <= len(lines)

    for par in paragraphs:
        assert len(par) > 0
def test_paragraphs_from_lines_hypothesis(lines):
    """Property check for ``paragraphs_from_lines`` with default arguments.

    The number of returned paragraphs must not exceed ``len(lines)`` and no
    returned paragraph may be empty.

    NOTE(review): ``lines`` is presumably generated by a hypothesis strategy
    or provided by a fixture that is not visible in this section -- confirm.
    """
    paragraphs = paragraphs_from_lines(lines)

    assert len(paragraphs) <= len(lines)

    for par in paragraphs:
        assert len(par) > 0
def test_paragraphs_from_lines():
    """Exercise ``paragraphs_from_lines`` on fixed inputs.

    Covers:

    * a plain string combined with ``splitchar=None`` raises ``ValueError``;
    * empty and whitespace/newline-only input yields no paragraphs;
    * a single word yields exactly one paragraph;
    * a multi-paragraph fixture is split as expected for
      ``break_on_num_newlines`` of 2 (default), 1 and 3, and passing the
      pre-split lines with ``splitchar=None`` gives the same result as
      passing the string.
    """
    with pytest.raises(ValueError):
        paragraphs_from_lines("foo", splitchar=None)

    # Input without any non-blank content must not produce paragraphs.
    for blank in ('', ' ', '\n', '\n\n', '\n\n\n'):
        assert len(paragraphs_from_lines(blank)) == 0

    pars = paragraphs_from_lines("foo")
    assert len(pars) == 1
    assert pars[0] == "foo"

    # The fixture is spelled out line by line because its meaning depends
    # entirely on where the blank lines are. The layout below is the one the
    # assertions require: a single blank line between par1 and par2, two
    # blank lines before par3 and before par4.
    # NOTE(review): the exact number of leading/trailing blank lines was
    # reconstructed from the assertions -- confirm against the original
    # fixture if it matters.
    testlines1 = '\n'.join([
        '',
        '',
        'par1 lorem',
        'ipsum',
        '',
        'par2 lorem',
        '',
        '',
        'par3 ipsum',
        'lorem',
        'dorem',
        '',
        '',
        'par4',
        '',
    ])

    pars = paragraphs_from_lines(testlines1)  # with default break_on_num_newlines=2
    assert len(pars) == 4
    assert pars[0] == 'par1 lorem ipsum'
    assert pars[1] == 'par2 lorem'
    assert pars[2] == 'par3 ipsum lorem dorem'
    assert pars[3] == 'par4'

    # Already-split input must give the same paragraphs as the raw string.
    assert paragraphs_from_lines(testlines1.split('\n'), splitchar=None) == pars

    # Every non-blank line becomes its own paragraph.
    pars = paragraphs_from_lines(testlines1, break_on_num_newlines=1)
    assert len(pars) == 7
    assert pars[0] == 'par1 lorem'
    assert pars[1] == 'ipsum'
    assert pars[6] == 'par4'

    # A single blank line no longer separates paragraphs, so par1 and par2
    # are merged.
    pars = paragraphs_from_lines(testlines1, break_on_num_newlines=3)
    assert len(pars) == 3
    assert pars[0] == 'par1 lorem ipsum par2 lorem'
    assert pars[1] == 'par3 ipsum lorem dorem'
    assert pars[2] == 'par4'