def test_override_metadata(output_type, resources, outpdf):
    """Verify that ``--title`` and ``--author`` override the output metadata.

    Runs ocrmypdf on ``c02-22.pdf`` with non-ASCII title and author values
    and the ``tesseract_noop`` plugin, then checks that the output docinfo
    carries those values, has no keywords, keeps the input's creation date,
    and that ``file_claims_pdfa`` reports the requested ``output_type``.
    """
    # NOTE(review): another ``test_override_metadata`` is defined later in
    # this source; if both live in the same module, only the later one is
    # collected by pytest -- confirm and rename one of them if so.
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    assert p.returncode == ExitCode.ok, err

    # Open both PDFs as context managers so the file handles are released
    # even when one of the assertions below fails.
    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
        assert after.docinfo.Title == german, after.docinfo
        assert after.docinfo.Author == chinese, after.docinfo
        assert after.docinfo.get('/Keywords', '') == ''

        # Overriding title/author must not disturb the creation date.
        before_date = decode_pdf_date(str(before.docinfo.CreationDate))
        after_date = decode_pdf_date(str(after.docinfo.CreationDate))
        assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
def test_creation_date_preserved(output_type, resources, infile, outpdf):
    """Verify that the creation date survives processing and ModDate is fresh.

    Runs ocrmypdf on ``infile`` with the ``tesseract_noop`` plugin. If the
    input has no ``/Info`` dictionary, the output must still gain a
    non-empty ``/CreationDate``; otherwise the output's creation date must be
    within 1000 seconds of the input's (as measured by
    ``seconds_between_dates``). In both cases the output's ``/ModDate`` must
    be within 1000 seconds of the current UTC time.
    """
    # NOTE(review): another ``test_creation_date_preserved`` is defined later
    # in this source; if both live in the same module, only the later one is
    # collected by pytest -- confirm and rename one of them if so.
    input_file = resources / infile

    check_ocrmypdf(
        input_file,
        outpdf,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Context managers guarantee both PDFs are closed, even on assertion
    # failure; the /Info objects are only used while their PDF is open.
    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
        before = pdf_before.trailer.get('/Info', {})
        after = pdf_after.trailer.get('/Info', {})

        if not before:
            assert after.get('/CreationDate', '') != ''
        else:
            # We expect that the creation date stayed the same
            date_before = decode_pdf_date(str(before['/CreationDate']))
            date_after = decode_pdf_date(str(after['/CreationDate']))
            assert seconds_between_dates(date_before, date_after) < 1000

        # We expect that the modified date is quite recent
        date_after = decode_pdf_date(str(after['/ModDate']))
        now_utc = datetime.datetime.now(timezone.utc)
        assert seconds_between_dates(date_after, now_utc) < 1000
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Verify that ``--title`` and ``--author`` override the output metadata.

    Runs ocrmypdf on ``c02-22.pdf`` with non-ASCII title and author values,
    passing the ``spoof_tesseract_noop`` fixture as the process environment.
    The output docinfo must carry those values, have no keywords, keep the
    input's creation date, and ``file_claims_pdfa`` must report the requested
    ``output_type``.
    """
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    # stdout is not inspected by this test, hence the underscore name.
    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )
    assert p.returncode == ExitCode.ok, err

    # Open both PDFs as context managers so the file handles are released
    # even when one of the assertions below fails.
    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
        assert after.docinfo.Title == german, after.docinfo
        assert after.docinfo.Author == chinese, after.docinfo
        assert after.docinfo.get('/Keywords', '') == ''

        # Overriding title/author must not disturb the creation date.
        before_date = decode_pdf_date(str(before.docinfo.CreationDate))
        after_date = decode_pdf_date(str(after.docinfo.CreationDate))
        assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
def test_creation_date_preserved(
    spoof_tesseract_noop, output_type, resources, infile, outpdf
):
    """Verify that the creation date survives processing and ModDate is fresh.

    Runs ocrmypdf on ``infile`` with the ``spoof_tesseract_noop`` fixture as
    the process environment. If the input has no ``/Info`` dictionary, the
    output must still gain a non-empty ``/CreationDate``; otherwise the
    output's creation date must be within 1000 seconds of the input's (as
    measured by ``seconds_between_dates``). In both cases the output's
    ``/ModDate`` must be within 1000 seconds of the current UTC time.
    """
    input_file = resources / infile

    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop
    )

    # Context managers guarantee both PDFs are closed, even on assertion
    # failure; the /Info objects are only used while their PDF is open.
    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
        before = pdf_before.trailer.get('/Info', {})
        after = pdf_after.trailer.get('/Info', {})

        if not before:
            assert after.get('/CreationDate', '') != ''
        else:
            # We expect that the creation date stayed the same
            date_before = decode_pdf_date(str(before['/CreationDate']))
            date_after = decode_pdf_date(str(after['/CreationDate']))
            assert seconds_between_dates(date_before, date_after) < 1000

        # We expect that the modified date is quite recent
        date_after = decode_pdf_date(str(after['/ModDate']))
        now_utc = datetime.datetime.now(timezone.utc)
        assert seconds_between_dates(date_after, now_utc) < 1000
def test_build_metadata(trivial, graph, outdir):
    """Verify XMP metadata built from a docinfo dictionary persists on save.

    Loads ``graph``'s docinfo into ``trivial``'s XMP metadata, saves
    ``trivial`` to ``tmp.pdf`` and reopens it. The reopened file must have a
    ``/Metadata`` stream typed ``/Metadata`` with subtype ``/XML``, must not
    contain ``pdf:Producer``, and its ``xmp:CreateDate`` must equal the ISO
    form of the creation date decoded from ``trivial.docinfo``.
    """
    with trivial.open_metadata(set_pikepdf_as_editor=False) as xmp:
        xmp.load_from_docinfo(graph.docinfo)
    trivial.save(outdir / 'tmp.pdf')

    # Reopen from disk inside a context manager so the handle on tmp.pdf is
    # released even if an assertion fails.
    with pikepdf.open(outdir / 'tmp.pdf') as pdf:
        assert pdf.Root.Metadata.Type == Name.Metadata
        assert pdf.Root.Metadata.Subtype == Name.XML
        with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp:
            assert 'pdf:Producer' not in xmp
            xmp_date = xmp['xmp:CreateDate']
            docinfo_date = decode_pdf_date(trivial.docinfo[Name.CreationDate])
            assert xmp_date == docinfo_date.isoformat()
def test_decode_pdf_date():
    """Check ``decode_pdf_date`` against a table of PDF date strings.

    Covers a string without a timezone suffix, three spellings of a zero
    UTC offset, and a ``+0100`` offset.
    """
    # Three of the inputs are expected to decode to this same UTC instant.
    utc_instant = datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)
    plus_one_hour = timezone(timedelta(hours=1))

    cases = [
        ('20160220040559', datetime(2016, 2, 20, 4, 5, 59)),
        ("20180101010101Z00'00'", utc_instant),
        ("20180101010101Z", utc_instant),
        ("20180101010101+0000", utc_instant),
        ("20180101010101+0100", datetime(2018, 1, 1, 1, 1, 1, tzinfo=plus_one_hour)),
    ]

    for pdf_date, expected in cases:
        assert decode_pdf_date(pdf_date) == expected