def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet in the converted PDF contains no nul.

    Copies ``enron1.pdf`` to ``layers.rendered.pdf``, calls
    ``convert_to_pdfa`` on it together with the generated ``pdfa.ps``, and
    then scans the raw bytes of ``outdir / 'pdfa.pdf'`` (presumably written
    by ``convert_to_pdfa`` -- confirm against its implementation) for a
    literal or escaped nul character inside the XMP packet.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(resources / 'enron1.pdf')
    context = PDFContext(options, outdir, resources / 'enron1.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f, mmap.mmap(
        f.fileno(), 0, flags=mmap.MAP_PRIVATE, prot=mmap.PROT_READ
    ) as mm:
        # Since the XML may be invalid, we scan instead of actually feeding it
        # to a parser.
        XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
        xmp_start = mm.find(XMP_MAGIC)
        # Last '<?xpacket end' marker at or after the packet start.
        xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
        assert 0 < xmp_start < xmp_end

        # b'&#0;' is the XML character reference for nul; the literal must be
        # pure ASCII to be a valid bytes literal.
        assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
        assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check when ``metadata_fixup`` logs a WARNING record.

    First runs ``metadata_fixup`` with a context built on an unmodified copy
    of ``graph.pdf`` and asserts that no WARNING was captured. Then writes
    ``graph_mod.pdf`` with an extra ``prism2:publicationName`` metadata
    entry, rebuilds the context on that file, runs ``metadata_fixup`` again
    and asserts that at least one WARNING was captured.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import metadata_fixup

    options = parser.parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )
    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PDFContext(options, outdir, outdir / 'graph.pdf', None)
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable
    # The context manager closes the PDF once the modified copy is saved, so
    # the handle on graph.pdf is released before metadata_fixup runs again.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None)
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet in the converted PDF contains no nul.

    Copies ``enron1.pdf`` to ``layers.rendered.pdf``, calls
    ``convert_to_pdfa`` with explicit input files, output file, logger and a
    ``JobContext``, and then scans the raw bytes of ``outdir / 'pdfa.pdf'``
    for a literal or escaped nul character inside the XMP packet.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    input_files = [str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps')]
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    log = logging.getLogger()
    context = JobContext()

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    context.options = options
    context.pdfinfo = PdfInfo(resources / 'enron1.pdf')

    convert_to_pdfa(
        input_files_groups=input_files,
        output_file=outdir / 'pdfa.pdf',
        log=log,
        context=context,
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f, mmap.mmap(
        f.fileno(), 0, flags=mmap.MAP_PRIVATE, prot=mmap.PROT_READ
    ) as mm:
        # Since the XML may be invalid, we scan instead of actually feeding it
        # to a parser.
        XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
        xmp_start = mm.find(XMP_MAGIC)
        # Last '<?xpacket end' marker at or after the packet start.
        xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
        assert 0 < xmp_start < xmp_end

        # b'&#0;' is the XML character reference for nul; the literal must be
        # pure ASCII to be a valid bytes literal.
        assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
        assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a nul in document info is not carried into the XMP packet.

    Copies ``trivial.pdf`` to ``layers.rendered.pdf``, calls
    ``convert_to_pdfa`` on it together with the generated ``pdfa.ps``, and
    then scans the raw bytes of ``outdir / 'pdfa.pdf'`` (presumably written
    by ``convert_to_pdfa`` -- confirm against its implementation) for a
    literal or escaped nul character inside the XMP packet.
    """
    from ocrmypdf.cli import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): nothing in this block saves the modified PDF, so the file
    # on disk appears to stay unchanged and the injection may have no effect
    # on the conversion below -- confirm and add a save if that is the intent.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f, mmap.mmap(f.fileno(), 0) as mm:
        # Since the XML may be invalid, we scan instead of actually feeding it
        # to a parser.
        XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
        xmp_start = mm.find(XMP_MAGIC)
        # Last '<?xpacket end' marker at or after the packet start.
        xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
        assert 0 < xmp_start < xmp_end

        # Ensure we did not carry the nul forward.
        # b'&#0;' is the XML character reference for nul; the literal must be
        # pure ASCII to be a valid bytes literal.
        assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
        assert mm.find(b'\x00', xmp_start, xmp_end) == -1