def test_get_form(src, expected, expected_get_fields):
    """Check if we can read out form data.

    ``src`` is a file name relative to ``RESOURCE_ROOT``; ``expected`` is the
    value ``get_form_text_fields()`` must return and ``expected_get_fields``
    the value ``get_fields()`` must return for that file.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfReader(pdf_path)

    text_fields = reader.get_form_text_fields()
    assert text_fields == expected

    report_path = "tmp-fields-report.txt"
    try:
        with open(report_path, "w") as f:
            all_fields = reader.get_fields(fileobj=f)
        assert all_fields == expected_get_fields

        if all_fields:
            # Just access the attributes: none of them may raise.
            attribute_names = (
                "field_type",
                "parent",
                "kids",
                "name",
                "alternate_name",
                "mapping_name",
                "flags",
                "value",
                "default_value",
                "additional_actions",
            )
            for field in all_fields.values():
                for attribute_name in attribute_names:
                    getattr(field, attribute_name)
    finally:
        # Cleanup must also run when an assertion above fails, otherwise the
        # report file is left behind in the working directory.
        if os.path.exists(report_path):
            os.remove(report_path)
def test_get_fields():
    """Check that ``get_fields()`` reports the ``c1-1`` button field.

    The PDF bytes are obtained through ``get_pdf_from_url`` and read from an
    in-memory buffer.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/972/972486.pdf",
        name="tika-972486.pdf",
    )
    stream = BytesIO(pdf_bytes)
    form_fields = PdfReader(stream).get_fields()

    assert form_fields is not None
    assert "c1-1" in form_fields

    # Compare as a plain dict so only the key/value pairs are checked.
    expected_entry = {"/FT": "/Btn", "/T": "c1-1"}
    assert dict(form_fields["c1-1"]) == expected_entry
def test_get_fields_read_write_report():
    """Check that ``get_fields()`` returns fields while writing a report file.

    The report is written to ``tmp-fields-report.txt`` in the current working
    directory and removed again at the end of the test.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/909/909655.pdf"
    name = "tika-909655.pdf"
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))

    report_path = "tmp-fields-report.txt"
    try:
        with open(report_path, "w") as fp:
            fields = reader.get_fields(fileobj=fp)
        assert fields
    finally:
        # Cleanup must also run when get_fields() raises or the assertion
        # fails, otherwise the report file is left behind.
        if os.path.exists(report_path):
            os.remove(report_path)
def test_get_fields_warns_on_undefined_object():
    """Check that ``get_fields()`` warns about an undefined object.

    For this PDF, ``get_fields()`` is expected to emit a ``PdfReadWarning``
    matching "Object 2 0 not defined." and to return an empty dict.

    This test used to be named ``test_get_fields``, which is also the name of
    another test in this module; the later definition replaced the earlier
    one, so only one of the two was ever collected. The unique name lets
    both run.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/961/961883.pdf"
    name = "tika-961883.pdf"
    data = BytesIO(get_pdf_from_url(url, name=name))
    reader = PdfReader(data)

    report_path = "tmp.txt"
    try:
        with open(report_path, "w") as fp:
            with pytest.warns(PdfReadWarning, match="Object 2 0 not defined."):
                retrieved_fields = reader.get_fields(fileobj=fp)
        assert retrieved_fields == {}
    finally:
        # Cleanup must also run when the expected warning is missing or the
        # assertion fails, otherwise the report file is left behind.
        if os.path.exists(report_path):
            os.remove(report_path)
def test_get_fields_read_else_block2():
    """Check that ``get_fields()`` returns ``None`` for this document."""
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/914/914902.pdf",
        name="tika-914902.pdf",
    )
    stream = BytesIO(pdf_bytes)
    result = PdfReader(stream).get_fields()
    assert result is None