Example #1
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF contains no nul.

    Runs ``convert_to_pdfa`` on a copy of ``enron1.pdf`` and then scans the
    raw bytes of ``outdir / 'pdfa.pdf'`` between the XMP packet id and the
    closing ``<?xpacket end`` marker for a literal nul byte or an escaped
    ``&#0;`` entity.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(resources / 'enron1.pdf')
    context = PDFContext(options, outdir, resources / 'enron1.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        # access=ACCESS_READ is accepted on every platform; the flags=/prot=
        # keywords and the MAP_PRIVATE/PROT_READ constants exist only on Unix.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            # Last end-of-packet marker located after the packet id.
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both searches return -1 on a miss, which also fails this check.
            assert 0 < xmp_start < xmp_end
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
Example #2
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a nul in the document info does not reach the XMP packet.

    Writes a copy of ``trivial.pdf`` whose document information dictionary
    holds a Title ending in a nul byte, runs ``convert_to_pdfa`` on it, and
    scans the raw bytes of ``outdir / 'pdfa.pdf'`` between the XMP packet id
    and the closing ``<?xpacket end`` marker for a literal or escaped nul.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # The dictionary is attached to the trailer's /Info entry (where the
    # document information dictionary lives) and the PDF is saved; without
    # the save the in-memory change is discarded when the file is closed.
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pike.make_indirect(
            pikepdf.Dictionary(Title=b'String with trailing nul\x00')
        )
        pike.save(outdir / 'layers.rendered.pdf')

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            # Last end-of-packet marker located after the packet id.
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both searches return -1 on a miss, which also fails this check.
            assert 0 < xmp_start < xmp_end
            # Ensure we did not carry the nul forward.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
Example #3
0
def test_srgb_in_unicode_path(tmp_path):
    """Test that we can produce pdfmark when install path is not ASCII"""

    # Directory whose name is a single non-ASCII character.
    unicode_dir = tmp_path / b'\xe4\x80\x80'.decode('utf-8')
    unicode_dir.mkdir()

    # Place a copy of the ICC profile inside that directory.
    icc_copy = unicode_dir / 'sRGB.icc'
    copyfile(SRGB_ICC_PROFILE, fspath(icc_copy))

    # Point ocrmypdf.pdfa at the copied profile while generating the stub.
    profile_patch = patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(icc_copy))
    with profile_patch:
        generate_pdfa_ps(unicode_dir / 'out.ps')
Example #4
0
def test_srgb_in_unicode_path(tmpdir):
    """Test that we can produce pdfmark when install path is not ASCII"""

    # Convert the fixture value to a pathlib.Path before joining the
    # single non-ASCII character directory name onto it.
    base_dir = Path(fspath(tmpdir))
    unicode_dir = base_dir / b'\xe4\x80\x80'.decode('utf-8')
    unicode_dir.mkdir()

    # Place a copy of the ICC profile inside that directory.
    icc_copy = unicode_dir / 'sRGB.icc'
    copyfile(SRGB_ICC_PROFILE, fspath(icc_copy))

    # Point ocrmypdf.pdfa at the copied profile while generating the stub.
    profile_patch = patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(icc_copy))
    with profile_patch:
        generate_pdfa_ps(unicode_dir / 'out.ps')
Example #5
0
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a malformed document info block is reported in the log.

    Replaces the trailer ``/Info`` entry of ``trivial.pdf`` with a stream,
    runs ``convert_to_pdfa`` on the result, and expects a captured log record
    whose message contains ``malformed DocumentInfo block``.
    """
    rendered_pdf = outdir / 'layers.rendered.pdf'
    pdfa_ps = outdir / 'pdfa.ps'
    generate_pdfa_ps(pdfa_ps)

    # The input PDF is written by pike.save() below rather than copied from
    # the resources directory.
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(rendered_pdf, fix_metadata_version=False)

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = parser.parse_args(args=cli_args)
    pdfinfo = PdfInfo(rendered_pdf)
    context = PDFContext(options, outdir, rendered_pdf, pdfinfo)

    convert_to_pdfa(str(rendered_pdf), str(pdfa_ps), context)

    print(caplog.records)
    messages = [record.message for record in caplog.records]
    assert any('malformed DocumentInfo block' in message for message in messages)
Example #6
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF contains no nul.

    Runs ``convert_to_pdfa`` on a copy of ``enron1.pdf`` with
    ``outdir / 'pdfa.pdf'`` as the output file, then scans the raw bytes of
    that file between the XMP packet id and the closing ``<?xpacket end``
    marker for a literal nul byte or an escaped ``&#0;`` entity.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    input_files = [str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps')]
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')
    log = logging.getLogger()
    context = JobContext()

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    context.options = options
    context.pdfinfo = PdfInfo(resources / 'enron1.pdf')

    convert_to_pdfa(
        input_files_groups=input_files,
        output_file=outdir / 'pdfa.pdf',
        log=log,
        context=context,
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        # access=ACCESS_READ is accepted on every platform; the flags=/prot=
        # keywords and the MAP_PRIVATE/PROT_READ constants exist only on Unix.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            # Last end-of-packet marker located after the packet id.
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both searches return -1 on a miss, which also fails this check.
            assert 0 < xmp_start < xmp_end
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
Example #7
0
def generate_postscript_stub(context: PdfContext):
    """Generate the ``pdfa.ps`` stub for *context* and return its path.

    The path is obtained from ``context.get_path('pdfa.ps')`` and handed to
    ``generate_pdfa_ps``; the same path object is returned to the caller.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path