def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that --title/--author given on the command line reach the output.

    Runs OCRmyPDF on ``c02-22.pdf`` with a German title and a Chinese author,
    then verifies through pypdf that the output document info carries both
    values, has no keywords, keeps the input's creation date, and that
    ``file_claims_pdfa`` reports the requested output type.
    """
    source_pdf = resources / 'c02-22.pdf'
    title_text = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author_text = '孔子'

    cli_args = [
        '--title', title_text,
        '--author', author_text,
        '--output-type', output_type,
    ]
    proc, _stdout, stderr = run_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop)

    # Surface the captured stderr if the run did not succeed.
    assert proc.returncode == ExitCode.ok, stderr

    reader_in = pypdf.PdfFileReader(str(source_pdf))
    reader_out = pypdf.PdfFileReader(outpdf)

    # Overridden fields must match exactly; keywords must be absent or empty.
    info_out = reader_out.documentInfo
    assert info_out['/Title'] == title_text
    assert info_out['/Author'] == author_text
    assert info_out.get('/Keywords', '') == ''

    # Overriding metadata must leave the creation date untouched.
    created_in = decode_pdf_date(reader_in.documentInfo['/CreationDate'])
    created_out = decode_pdf_date(reader_out.documentInfo['/CreationDate'])
    assert created_in == created_out

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that --title/--author given on the command line reach the output.

    Runs OCRmyPDF on ``c02-22.pdf`` with a German title and a Chinese author
    and verifies through pikepdf that the output metadata carries both values,
    has no keywords, keeps the input's creation date, and that
    ``file_claims_pdfa`` reports the requested output type.

    The test is marked xfail when Ghostscript is 9.24 or newer, because (per
    the xfail reason) those versions do not support Unicode DOCINFO.
    """
    # Local import so this block does not rely on a file-level ``import re``.
    import re

    input_file = resources / 'c02-22.pdf'

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    # Standard output is not inspected; only stderr is used, as the failure
    # message of the return code assertion.
    p, _out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    # Compare versions numerically. A plain string comparison orders
    # '10.0' before '9.24' (because '1' < '9'), which would skip the xfail
    # on newer Ghostscript releases.
    # NOTE(review): assumes ghostscript.version() yields a dotted version
    # string (or something whose str() is one) - confirm against the helper.
    gs_version = tuple(
        int(number)
        for number in re.findall(r'\d+', str(ghostscript.version())))
    if gs_version >= (9, 24):
        pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')

    assert after.metadata.Title == german, after.metadata
    assert after.metadata.Author == chinese, after.metadata
    assert after.metadata.get('/Keywords', '') == ''

    # Overriding metadata must leave the creation date untouched.
    before_date = decode_pdf_date(str(before.metadata.CreationDate))
    after_date = decode_pdf_date(str(after.metadata.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """Check creation date handling and that the modification date is recent.

    Reads the input's document info with pypdf, runs OCRmyPDF, then reads the
    output's document info. If the input had document info, the creation
    dates must be within 1000 seconds of each other; otherwise the output's
    creation date must be falsy or a pypdf null object. In both cases the
    output's modification date must be within 1000 seconds of the current
    UTC time.
    """
    source_pdf = resources / infile
    info_in = pypdf.PdfFileReader(str(source_pdf)).getDocumentInfo()

    check_ocrmypdf(
        source_pdf, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    info_out = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo()

    if info_in:
        # The creation date is expected to survive processing unchanged.
        created_in = decode_pdf_date(info_in['/CreationDate'])
        created_out = decode_pdf_date(info_out['/CreationDate'])
        assert seconds_between_dates(created_in, created_out) < 1000
    else:
        # With no document info on the input there is no creation date to
        # carry over, so none should be output; because of Ghostscript quirks
        # it is set to null instead. A test file that has /DocumentInfo but
        # no /CreationDate would make this check better, but none exists.
        created_out = info_out['/CreationDate']
        assert not created_out or isinstance(
            created_out, pypdf.generic.NullObject)

    # The modification date should be quite recent.
    modified_out = decode_pdf_date(info_out['/ModDate'])
    now_utc = datetime.datetime.now(timezone.utc)
    assert seconds_between_dates(modified_out, now_utc) < 1000
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """Check creation date handling and that the modification date is recent.

    Runs OCRmyPDF, then opens the input and output with pikepdf and compares
    their trailer ``/Info`` dictionaries. If the input had one, the creation
    dates must be within 1000 seconds of each other; otherwise the output's
    creation date must be absent or equal to the empty string. In both cases
    the output's modification date must be within 1000 seconds of the current
    UTC time.
    """
    source_pdf = resources / infile

    check_ocrmypdf(
        source_pdf, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    opened_in = pikepdf.open(source_pdf)
    opened_out = pikepdf.open(outpdf)

    # A missing /Info entry is treated as an empty dictionary.
    info_in = opened_in.trailer.get('/Info', {})
    info_out = opened_out.trailer.get('/Info', {})

    if info_in:
        # The creation date is expected to survive processing unchanged.
        created_in = decode_pdf_date(str(info_in['/CreationDate']))
        created_out = decode_pdf_date(str(info_out['/CreationDate']))
        assert seconds_between_dates(created_in, created_out) < 1000
    else:
        # With no /Info on the input there is no creation date to carry
        # over, so none should be output. A test file that has /DocumentInfo
        # but no /CreationDate would make this check better, but none exists.
        assert info_out.get('/CreationDate', '') == ''

    # The modification date should be quite recent.
    modified_out = decode_pdf_date(str(info_out['/ModDate']))
    now_utc = datetime.datetime.now(timezone.utc)
    assert seconds_between_dates(modified_out, now_utc) < 1000