Example #1
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF/A contains no nul bytes.

    Copies ``enron1.pdf`` to ``layers.rendered.pdf``, runs
    ``convert_to_pdfa`` on it, then scans the raw bytes of ``pdfa.pdf`` for a
    literal or XML-escaped nul inside the XMP packet.

    Args:
        resources: Path-like directory that contains ``enron1.pdf``.
        outdir: Path-like working directory; inputs are staged here and
            ``pdfa.pdf`` is read back from here.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')

    # The file names given on the command line are placeholders; the real
    # input is passed to PDFContext and convert_to_pdfa below.
    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(resources / 'enron1.pdf')
    context = PDFContext(options, outdir, resources / 'enron1.pdf', pdfinfo)

    # NOTE(review): the output path is not passed explicitly; the test relies
    # on convert_to_pdfa writing 'pdfa.pdf' into outdir -- confirm.
    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        # access=ACCESS_READ is accepted on every platform, whereas the
        # flags=/prot= keywords (and MAP_PRIVATE/PROT_READ) exist only on Unix.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            # XMP_MAGIC is the id carried by the "<?xpacket begin" header.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both markers must be present, in order; find()/rfind() return -1
            # on a miss, which this chained comparison rejects.
            assert 0 < xmp_start < xmp_end
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
Example #2
0
def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check when ``metadata_fixup`` emits a WARNING log record.

    First runs ``metadata_fixup`` with an unmodified ``graph.pdf`` as origin
    and asserts that no WARNING was logged.  Then creates ``graph_mod.pdf``
    with an extra ``prism2:publicationName`` metadata entry, uses it as the
    origin, and asserts that at least one WARNING is now present.

    Args:
        resources: Path-like directory that contains ``graph.pdf``.
        outdir: Path-like working directory for the copied/modified PDFs.
        caplog: Log capture object exposing ``records``.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import metadata_fixup

    options = parser.parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf'])

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PDFContext(options, outdir, outdir / 'graph.pdf', None)
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable.  The PDF is opened in
    # a ``with`` block so its file handle is released once the modified copy
    # has been written.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    # The modified file is the origin; the working file is still the
    # unmodified graph.pdf.
    context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None)
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    # The first pass was verified to be warning-free above, so any WARNING
    # found here was produced by the second call.
    assert any(record.levelname == 'WARNING' for record in caplog.records)
Example #3
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the converted PDF/A contains no nul bytes.

    Copies ``enron1.pdf`` to ``layers.rendered.pdf``, runs
    ``convert_to_pdfa`` with an explicit input list, output file, logger and
    ``JobContext``, then scans the raw bytes of ``pdfa.pdf`` for a literal or
    XML-escaped nul inside the XMP packet.

    Args:
        resources: Path-like directory that contains ``enron1.pdf``.
        outdir: Path-like working directory for inputs and ``pdfa.pdf``.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    input_files = [str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps')]
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')
    log = logging.getLogger()
    context = JobContext()

    # The file names given on the command line are placeholders; the real
    # inputs are handed to convert_to_pdfa via input_files.
    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    context.options = options
    context.pdfinfo = PdfInfo(resources / 'enron1.pdf')

    convert_to_pdfa(
        input_files_groups=input_files,
        output_file=outdir / 'pdfa.pdf',
        log=log,
        context=context,
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        # access=ACCESS_READ is accepted on every platform, whereas the
        # flags=/prot= keywords (and MAP_PRIVATE/PROT_READ) exist only on Unix.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            # XMP_MAGIC is the id carried by the "<?xpacket begin" header.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both markers must be present, in order; find()/rfind() return -1
            # on a miss, which this chained comparison rejects.
            assert 0 < xmp_start < xmp_end
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
Example #4
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a nul in the document info is not carried into the XMP.

    Writes a copy of ``trivial.pdf`` whose document info ``Title`` ends with
    a nul byte, runs ``convert_to_pdfa`` on it, then scans the raw bytes of
    ``pdfa.pdf`` for a literal or XML-escaped nul inside the XMP packet.

    Args:
        resources: Path-like directory that contains ``trivial.pdf``.
        outdir: Path-like working directory; inputs are staged here and
            ``pdfa.pdf`` is read back from here.
    """
    from ocrmypdf.cli import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # The dictionary is installed through ``docinfo`` (the trailer's /Info
    # entry, which must be an indirect object) and the result is saved as the
    # conversion input; without the save the change would exist only in
    # memory and the conversion would run on an unmodified file.
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.docinfo = pike.make_indirect(
            pikepdf.Dictionary(Title=b'String with trailing nul\x00'))
        pike.save(outdir / 'layers.rendered.pdf')

    # The file names given on the command line are placeholders; the real
    # input is passed to PDFContext and convert_to_pdfa below.
    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'])
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf',
                         pdfinfo)

    # NOTE(review): the output path is not passed explicitly; the test relies
    # on convert_to_pdfa writing 'pdfa.pdf' into outdir -- confirm.
    convert_to_pdfa(str(outdir / 'layers.rendered.pdf'),
                    str(outdir / 'pdfa.ps'), context)

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            # XMP_MAGIC is the id carried by the "<?xpacket begin" header.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            # Both markers must be present, in order; find()/rfind() return -1
            # on a miss, which this chained comparison rejects.
            assert 0 < xmp_start < xmp_end
            # Ensure we did not carry the nul forward.
            assert mm.find(b'&#0;', xmp_start,
                           xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1