Exemple #1
0
def test_override_metadata(output_type, resources, outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    assert after.docinfo.Title == german, after.docinfo
    assert after.docinfo.Author == chinese, after.docinfo
    assert after.docinfo.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.docinfo.CreationDate))
    after_date = decode_pdf_date(str(after.docinfo.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #2
0
def test_creation_date_preserved(output_type, resources, infile, outpdf):
    input_file = resources / infile

    check_ocrmypdf(
        input_file,
        outpdf,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    pdf_before = pikepdf.open(input_file)
    pdf_after = pikepdf.open(outpdf)

    before = pdf_before.trailer.get('/Info', {})
    after = pdf_after.trailer.get('/Info', {})

    if not before:
        assert after.get('/CreationDate', '') != ''
    else:
        # We expect that the creation date stayed the same
        date_before = decode_pdf_date(str(before['/CreationDate']))
        date_after = decode_pdf_date(str(after['/CreationDate']))
        assert seconds_between_dates(date_before, date_after) < 1000

    # We expect that the modified date is quite recent
    date_after = decode_pdf_date(str(after['/ModDate']))
    assert seconds_between_dates(date_after, datetime.datetime.now(timezone.utc)) < 1000
Exemple #3
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    assert after.docinfo.Title == german, after.docinfo
    assert after.docinfo.Author == chinese, after.docinfo
    assert after.docinfo.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.docinfo.CreationDate))
    after_date = decode_pdf_date(str(after.docinfo.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #4
0
def test_creation_date_preserved(
    spoof_tesseract_noop, output_type, resources, infile, outpdf
):
    input_file = resources / infile

    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop
    )

    pdf_before = pikepdf.open(input_file)
    pdf_after = pikepdf.open(outpdf)

    before = pdf_before.trailer.get('/Info', {})
    after = pdf_after.trailer.get('/Info', {})

    if not before:
        assert after.get('/CreationDate', '') != ''
    else:
        # We expect that the creation date stayed the same
        date_before = decode_pdf_date(str(before['/CreationDate']))
        date_after = decode_pdf_date(str(after['/CreationDate']))
        assert seconds_between_dates(date_before, date_after) < 1000

    # We expect that the modified date is quite recent
    date_after = decode_pdf_date(str(after['/ModDate']))
    assert seconds_between_dates(date_after, datetime.datetime.now(timezone.utc)) < 1000
def test_build_metadata(trivial, graph, outdir):
    with trivial.open_metadata(set_pikepdf_as_editor=False) as xmp:
        xmp.load_from_docinfo(graph.docinfo)
    trivial.save(outdir / 'tmp.pdf')

    pdf = pikepdf.open(outdir / 'tmp.pdf')
    assert pdf.Root.Metadata.Type == Name.Metadata
    assert pdf.Root.Metadata.Subtype == Name.XML
    with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp:
        assert 'pdf:Producer' not in xmp
        xmp_date = xmp['xmp:CreateDate']
        docinfo_date = decode_pdf_date(trivial.docinfo[Name.CreationDate])
        assert xmp_date == docinfo_date.isoformat()
Exemple #6
0
def test_decode_pdf_date():
    VALS = [
        ('20160220040559', datetime(2016, 2, 20, 4, 5, 59)),
        ("20180101010101Z00'00'", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        ("20180101010101Z", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        ("20180101010101+0000", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        (
            "20180101010101+0100",
            datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
    ]
    for s, d in VALS:
        assert decode_pdf_date(s) == d