Exemple #1
0
def test_get_plaintext_document_body(tmpdir):
    """Check plain-text extraction and the rejection of an HTML document.

    A two-line text file must come back as the list of its lines (line
    endings kept), while an ``.html`` file must raise
    ``UnknownDocumentTypeError`` with ``'text/html'`` among the exception
    arguments.
    """
    expected_lines = [u"Some text\n", u"on multiple lines\n"]
    plain_file = tmpdir.join("plain.txt")
    plain_file.write("".join(expected_lines))
    assert expected_lines == get_plaintext_document_body(str(plain_file))

    # Build the HTML fixture *outside* the ``pytest.raises`` block so that
    # only the call under test can satisfy the expected exception.
    html_file = tmpdir.join("page.html")
    html_file.write("<html><body>Some page</body></html>")
    with pytest.raises(UnknownDocumentTypeError) as excinfo:
        get_plaintext_document_body(str(html_file))
    assert 'text/html' in excinfo.value.args
Exemple #2
0
def test_clean_pdf_before_run(tmp_path, pdf_files):
    """Check text extraction from a copy of the sample PDF at index 7.

    The sample is copied byte-for-byte to ``packed.pdf`` inside the
    temporary directory, and the extractor is run on that copy; the
    expected result is the single line ``'Test\\n'`` followed by a form
    feed entry.
    """
    copied_pdf = tmp_path / "packed.pdf"
    # Read the source first so a missing sample fails before any output
    # file is created; ``write_bytes`` opens, writes and closes the copy.
    with open(pdf_files[7], 'rb') as source:
        copied_pdf.write_bytes(source.read())

    text = get_plaintext_document_body(copied_pdf.as_posix())
    assert text == ['Test\n', '\x0c']