def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF/A contains no NUL.

    Runs ``convert_to_pdfa`` on a copy of ``enron1.pdf`` with
    ``--output-type pdfa-2`` and then scans ``pdfa.pdf`` in the output
    directory for a raw NUL byte or its escaped XML form inside the XMP
    packet.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(resources / 'enron1.pdf')
    context = PDFContext(options, outdir, resources / 'enron1.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        with mmap.mmap(
            f.fileno(), 0, flags=mmap.MAP_PRIVATE, prot=mmap.PROT_READ
        ) as mm:
            # Since the XML may be invalid, we scan instead of actually
            # feeding it to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            # XML character reference for NUL. It is assembled from two
            # pieces so that tooling which decodes character references
            # cannot corrupt the literal in this source file.
            escaped_nul = b'&' + b'#0;'

            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # find()/rfind() return -1 on a miss, so this also proves that
            # both markers were located and appear in the expected order.
            assert 0 < xmp_start < xmp_end
            assert mm.find(escaped_nul, xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF/A contains no NUL.

    Runs ``convert_to_pdfa`` on a copy of ``trivial.pdf`` with
    ``--output-type pdfa-2`` and then scans ``pdfa.pdf`` in the output
    directory for a raw NUL byte or its escaped XML form inside the XMP
    packet.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): nothing in this block saves the modified PDF, so the
    # injected Title may never reach the file on disk -- confirm whether a
    # save call is intended here.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options,
        outdir,
        outdir / 'layers.rendered.pdf',
        pdfinfo,
        get_plugin_manager([]),
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually
            # feeding it to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            # XML character reference for NUL. It is assembled from two
            # pieces so that tooling which decodes character references
            # cannot corrupt the literal in this source file.
            escaped_nul = b'&' + b'#0;'

            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # find()/rfind() return -1 on a miss, so this also proves that
            # both markers were located and appear in the expected order.
            assert 0 < xmp_start < xmp_end

            # Ensure we did not carry the nul forward.
            assert mm.find(escaped_nul, xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_srgb_in_unicode_path(tmp_path):
    """Verify that pdfmark generation works when the ICC path is not ASCII."""
    # Directory whose name is a single non-ASCII character.
    unicode_dir = tmp_path / b'\xe4\x80\x80'.decode('utf-8')
    unicode_dir.mkdir()

    # Place a copy of the sRGB profile inside that directory.
    icc_copy = unicode_dir / 'sRGB.icc'
    copyfile(SRGB_ICC_PROFILE, fspath(icc_copy))

    ps_target = unicode_dir / 'out.ps'
    # Point the pdfa module at the relocated profile while generating.
    with patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(icc_copy)):
        generate_pdfa_ps(ps_target)
def test_srgb_in_unicode_path(tmpdir):
    """Verify that pdfmark generation works when the ICC path is not ASCII."""
    # Convert the fixture value to a pathlib.Path before building on it.
    base_dir = Path(fspath(tmpdir))
    # Directory whose name is a single non-ASCII character.
    unicode_dir = base_dir / b'\xe4\x80\x80'.decode('utf-8')
    unicode_dir.mkdir()

    # Place a copy of the sRGB profile inside that directory.
    icc_copy = unicode_dir / 'sRGB.icc'
    copyfile(SRGB_ICC_PROFILE, fspath(icc_copy))

    ps_target = unicode_dir / 'out.ps'
    # Point the pdfa module at the relocated profile while generating.
    with patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(icc_copy)):
        generate_pdfa_ps(ps_target)
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a malformed DocumentInfo block is reported in the log.

    Builds an input PDF whose trailer ``Info`` entry is a stream, runs
    ``convert_to_pdfa`` on it, and asserts that some captured log record
    mentions ``malformed DocumentInfo block``.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')

    # Write a variant of trivial.pdf whose trailer Info entry is a stream
    # holding XML-looking bytes.
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    # The captured records double as the failure message, so they are only
    # shown when the expected warning is missing.
    assert any(
        'malformed DocumentInfo block' in record.message
        for record in caplog.records
    ), caplog.records
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF/A contains no NUL.

    Runs ``convert_to_pdfa`` on a copy of ``enron1.pdf`` with
    ``--output-type pdfa-2``, writing to ``pdfa.pdf`` in the output
    directory, and then scans that file for a raw NUL byte or its escaped
    XML form inside the XMP packet.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    input_files = [str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps')]
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    log = logging.getLogger()
    context = JobContext()

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    context.options = options
    context.pdfinfo = PdfInfo(resources / 'enron1.pdf')

    convert_to_pdfa(
        input_files_groups=input_files,
        output_file=outdir / 'pdfa.pdf',
        log=log,
        context=context,
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        with mmap.mmap(
            f.fileno(), 0, flags=mmap.MAP_PRIVATE, prot=mmap.PROT_READ
        ) as mm:
            # Since the XML may be invalid, we scan instead of actually
            # feeding it to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            # XML character reference for NUL. It is assembled from two
            # pieces so that tooling which decodes character references
            # cannot corrupt the literal in this source file.
            escaped_nul = b'&' + b'#0;'

            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # find()/rfind() return -1 on a miss, so this also proves that
            # both markers were located and appear in the expected order.
            assert 0 < xmp_start < xmp_end
            assert mm.find(escaped_nul, xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def generate_postscript_stub(context: PdfContext):
    """Generate the PDF/A PostScript stub and return its path.

    The destination is whatever ``context.get_path('pdfa.ps')`` yields;
    ``generate_pdfa_ps`` is called with that path, and the same path is
    handed back to the caller.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path