Esempio n. 1
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    """Check that --title and --author override the output document info.

    Runs ocrmypdf on ``c02-22.pdf`` with a German title and a Chinese author,
    then verifies with pypdf that:

    * the process exits with ``ExitCode.ok``;
    * ``/Title`` and ``/Author`` equal the values passed on the command line;
    * ``/Keywords`` is absent or empty;
    * ``/CreationDate`` decodes to the same date as in the input file;
    * ``file_claims_pdfa`` reports the requested output type.

    All parameters are fixtures provided by the test suite.
    """
    source_pdf = resources / 'c02-22.pdf'
    title_de = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author_zh = '孔子'

    cli_args = [
        '--title', title_de,
        '--author', author_zh,
        '--output-type', output_type,
    ]
    proc, _, stderr = run_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop)

    # Surface the captured stderr if the run did not succeed.
    assert proc.returncode == ExitCode.ok, stderr

    reader_before = pypdf.PdfFileReader(str(source_pdf))
    reader_after = pypdf.PdfFileReader(outpdf)

    # Non-ASCII metadata must come back exactly as supplied.
    info_after = reader_after.documentInfo
    assert info_after['/Title'] == title_de
    assert info_after['/Author'] == author_zh
    assert info_after.get('/Keywords', '') == ''

    # The creation date of the input must survive processing unchanged.
    created_before = decode_pdf_date(
        reader_before.documentInfo['/CreationDate'])
    created_after = decode_pdf_date(info_after['/CreationDate'])
    assert created_before == created_after

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
Esempio n. 2
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    """Check that --title and --author override the output metadata.

    Runs ocrmypdf on ``c02-22.pdf`` with a German title and a Chinese author,
    then inspects the result with pikepdf. The test is marked as an expected
    failure on Ghostscript 9.24 or newer, because (per the xfail message)
    those versions do not support Unicode DOCINFO.

    Verified when the xfail does not trigger:

    * ``Title`` and ``Author`` equal the values passed on the command line;
    * ``/Keywords`` is absent or empty;
    * ``CreationDate`` decodes to the same date as in the input file;
    * ``file_claims_pdfa`` reports the requested output type.

    All parameters are fixtures provided by the test suite.
    """
    # Local import: only needed here to parse the Ghostscript version string.
    import re

    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    # Surface the captured stderr if the run did not succeed.
    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    # Compare versions numerically. A plain string comparison is
    # lexicographic, so '10.0' >= '9.24' is False and the xfail would be
    # silently skipped on Ghostscript 10 and later.
    gs_version = tuple(
        int(number) for number in re.findall(r'\d+', ghostscript.version()))
    if gs_version >= (9, 24):
        pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')

    assert after.metadata.Title == german, after.metadata
    assert after.metadata.Author == chinese, after.metadata
    assert after.metadata.get('/Keywords', '') == ''

    # The creation date of the input must survive processing unchanged.
    before_date = decode_pdf_date(str(before.metadata.CreationDate))
    after_date = decode_pdf_date(str(after.metadata.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Esempio n. 3
0
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """Check how /CreationDate and /ModDate are carried into the output.

    The document info of ``resources / infile`` is read with pypdf before
    ocrmypdf runs, and the output's document info is read afterwards.

    * If the input had document info, the input and output creation dates
      must be less than 1000 seconds apart.
    * Otherwise the output ``/CreationDate`` must be falsy or a pypdf
      ``NullObject``.
    * In both cases ``/ModDate`` must be within 1000 seconds of the current
      UTC time.

    All parameters are fixtures provided by the test suite.
    """
    source_pdf = resources / infile

    info_before = pypdf.PdfFileReader(str(source_pdf)).getDocumentInfo()
    check_ocrmypdf(
        source_pdf, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)
    info_after = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo()

    if info_before:
        # We expect that the creation date stayed the same
        created_before = decode_pdf_date(info_before['/CreationDate'])
        created_after = decode_pdf_date(info_after['/CreationDate'])
        assert seconds_between_dates(created_before, created_after) < 1000
    else:
        # The input had no document info, so no creation date should be
        # output; because of Ghostscript quirks we set it to null.
        # This test would be better if we had a test file with /DocumentInfo
        # but no /CreationDate, which we don't.
        created = info_after['/CreationDate']
        assert not created or isinstance(created, pypdf.generic.NullObject)

    # We expect that the modified date is quite recent
    modified = decode_pdf_date(info_after['/ModDate'])
    now_utc = datetime.datetime.now(timezone.utc)
    assert seconds_between_dates(modified, now_utc) < 1000
Esempio n. 4
0
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """Check how /CreationDate and /ModDate are carried into the output.

    ocrmypdf is run on ``resources / infile``; afterwards both the input and
    the output are opened with pikepdf and their trailer ``/Info``
    dictionaries are compared (an empty dict stands in when ``/Info`` is
    missing).

    * If the input ``/Info`` is truthy, the input and output creation dates
      must be less than 1000 seconds apart.
    * Otherwise the output ``/CreationDate`` must be absent or equal to ``''``.
    * In both cases ``/ModDate`` must be within 1000 seconds of the current
      UTC time.

    All parameters are fixtures provided by the test suite.
    """
    source_pdf = resources / infile

    check_ocrmypdf(
        source_pdf, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    pdf_in = pikepdf.open(source_pdf)
    pdf_out = pikepdf.open(outpdf)

    info_before = pdf_in.trailer.get('/Info', {})
    info_after = pdf_out.trailer.get('/Info', {})

    if info_before:
        # We expect that the creation date stayed the same
        created_before = decode_pdf_date(str(info_before['/CreationDate']))
        created_after = decode_pdf_date(str(info_after['/CreationDate']))
        assert seconds_between_dates(created_before, created_after) < 1000
    else:
        # The input had no document info, so no creation date should be
        # output; because of Ghostscript quirks we set it to null.
        # This test would be better if we had a test file with /DocumentInfo
        # but no /CreationDate, which we don't.
        assert info_after.get('/CreationDate', '') == ''

    # We expect that the modified date is quite recent
    modified = decode_pdf_date(str(info_after['/ModDate']))
    now_utc = datetime.datetime.now(timezone.utc)
    assert seconds_between_dates(modified, now_utc) < 1000