Beispiel #1
0
def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check that metadata_fixup only warns when input XMP cannot be copied.

    First runs metadata_fixup with the unmodified ``graph.pdf`` as origin and
    expects no WARNING records. Then writes ``graph_mod.pdf`` carrying an
    extra ``prism2:publicationName`` XMP entry, uses it as the origin, and
    expects at least one WARNING record.
    """
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable.
    # Open the PDF in a context manager so the file handle is released as soon
    # as the modified copy has been written.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
Beispiel #2
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the generated PDF/A contains no NUL.

    Neither a raw NUL byte nor the escaped entity ``&#0;`` may appear between
    the XMP magic marker and the ``<?xpacket end`` trailer of ``pdfa.pdf``.
    """
    ps_stub = outdir / 'pdfa.ps'
    rendered = outdir / 'layers.rendered.pdf'

    generate_pdfa_ps(ps_stub)
    copyfile(resources / 'trivial.pdf', rendered)

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): nothing here saves `pike`, and the entry is set on Root
    # rather than on the trailer, so the injected value may never reach the
    # file handed to convert_to_pdfa -- confirm this test exercises the NUL
    # path as intended.
    with pikepdf.open(rendered) as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = get_parser().parse_args(args=cli_args)
    pdfinfo = PdfInfo(rendered)
    context = PdfContext(options, outdir, rendered, pdfinfo, get_plugin_manager([]))

    convert_to_pdfa(str(rendered), str(ps_stub), context)

    # Since the XML may be invalid, we scan the raw bytes instead of actually
    # feeding them to a parser.
    xmp_magic = b'W5M0MpCehiHzreSzNTczkc9d'
    with open(outdir / 'pdfa.pdf', 'r+b') as f, mmap.mmap(f.fileno(), 0) as mm:
        packet_start = mm.find(xmp_magic)
        packet_end = mm.rfind(b'<?xpacket end', packet_start)
        assert 0 < packet_start < packet_end
        # Ensure we did not carry the nul forward.
        assert mm.find(b'&#0;', packet_start, packet_end) == -1, "found escaped nul"
        assert mm.find(b'\x00', packet_start, packet_end) == -1
Beispiel #3
0
def exec_concurrent(context: PdfContext, executor: Executor):
    """Run exec_page_sync over every page context, then assemble the outputs.

    Each finished page is grafted into the output through an OcrGrafter and
    its sidecar text recorded. Afterwards the sidecar text is merged and
    copied (when ``options.sidecar`` is set), and, unless the output type is
    ``'none'``, the grafted PDF is post-processed and copied to
    ``options.output_file``.
    """
    options = context.options
    page_count = len(context.pdfinfo)

    # Never ask for more workers than there are pages.
    max_workers = min(page_count, options.jobs)
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)

    sidecars: List[Optional[Path]] = [None] * page_count
    ocrgraft = OcrGrafter(context)

    def on_page_finished(result: PageResult, pbar):
        # The progress bar is advanced twice per page: once when the result
        # arrives and once more after grafting.
        try:
            tls.pageno = result.pageno + 1
            sidecars[result.pageno] = result.text
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None

    if options.tesseract_timeout > 0:
        progress_label = 'OCR'
    else:
        progress_label = 'Image processing'

    # Two progress ticks per page, scaled by 0.5 so the bar counts pages.
    progress_settings = {
        'total': 2 * page_count,
        'desc': progress_label,
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }

    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        tqdm_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=on_page_finished,
    )

    # Output sidecar text
    if options.sidecar:
        merged_text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(merged_text, options.sidecar, context)

    # Merge layers to one single pdf
    pdf = ocrgraft.finalize()

    if options.output_type == 'none':
        return

    # PDF/A and metadata
    log.info("Postprocessing...")
    pdf = post_process(pdf, context, executor)

    # Copy PDF file to destination
    copy_final(pdf, options.output_file, context)
Beispiel #4
0
def optimize_pdf(input_file: Path, context: PdfContext, executor: Executor):
    """Optimize *input_file* into the context's ``optimize.pdf`` and return it.

    The save settings are those returned by get_pdf_save_settings for the
    configured output type, plus a ``linearize`` flag from should_linearize.
    """
    optimized_path = context.get_path('optimize.pdf')
    linearize = should_linearize(input_file, context)
    base_settings = get_pdf_save_settings(context.options.output_type)
    optimize(
        input_file,
        optimized_path,
        context,
        dict(linearize=linearize, **base_settings),
        executor,
    )
    return optimized_path
Beispiel #5
0
def metadata_fixup(working_file: Path, context: PdfContext):
    """Reconcile the working PDF's XMP metadata with the original input.

    Loads the document info obtained from the original file
    (``context.origin``) into the XMP metadata of *working_file*, fills in a
    missing ``xmp:CreateDate``, drops an ``Untitled`` title that the original
    did not have, and logs any XMP fields of the original that are absent
    from the result. The outcome is saved to the context's ``metafix.pdf``.

    Args:
        working_file: PDF whose metadata is to be fixed up.
        context: supplies the options, the origin file and output paths.

    Returns:
        Path of the saved ``metafix.pdf``.
    """
    output_file = context.get_path('metafix.pdf')
    options = context.options

    def report_on_metadata(missing):
        """Log the set of original XMP fields that were not carried over."""
        if not missing:
            return
        if options.output_type.startswith('pdfa'):
            log.warning(
                "Some input metadata could not be copied because it is not "
                "permitted in PDF/A. You may wish to examine the output "
                "PDF's XMP metadata.")
            log.debug("The following metadata fields were not copied: %r",
                      missing)
        else:
            # The two literals are concatenated, so the first must end with a
            # space to keep the sentences apart in the emitted message.
            log.error("Some input metadata could not be copied. "
                      "You may wish to examine the output PDF's XMP metadata.")
            log.info("The following metadata fields were not copied: %r",
                     missing)

    with pikepdf.open(
            context.origin) as original, pikepdf.open(working_file) as pdf:
        docinfo = get_docinfo(original, context)
        with pdf.open_metadata() as meta:
            meta.load_from_docinfo(docinfo,
                                   delete_missing=False,
                                   raise_failure=False)
            # If xmp:CreateDate is missing, set it to the modify date to
            # match Ghostscript, for consistency
            if 'xmp:CreateDate' not in meta:
                meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')

            # Ghostscript likes to set title to Untitled if omitted from input.
            # Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
            # and the XMP Spec do not make this recommendation.
            if meta.get('dc:title') == 'Untitled':
                with original.open_metadata(
                        set_pikepdf_as_editor=False,
                        update_docinfo=False) as original_meta:
                    if 'dc:title' not in original_meta:
                        del meta['dc:title']

            # Only the keys are read here, so the metadata object is used
            # without entering its context manager.
            meta_original = original.open_metadata()
            missing = set(meta_original.keys()) - set(meta.keys())
            report_on_metadata(missing)

        pdf.save(
            output_file,
            compress_streams=True,
            preserve_pdfa=True,
            object_stream_mode=pikepdf.ObjectStreamMode.generate,
            linearize=(  # Don't linearize if optimize() will be linearizing too
                should_linearize(working_file, context)
                if options.optimize == 0 else False),
        )

    return output_file
Beispiel #6
0
def optimize_pdf(input_file: Path, context: PdfContext):
    """Optimize *input_file* into the context's ``optimize.pdf`` and return it.

    Streams are compressed, PDF/A is preserved, object streams are generated,
    and linearization follows should_linearize.
    """
    optimized_path = context.get_path('optimize.pdf')
    optimize(
        input_file,
        optimized_path,
        context,
        {
            'compress_streams': True,
            'preserve_pdfa': True,
            'object_stream_mode': pikepdf.ObjectStreamMode.generate,
            'linearize': should_linearize(input_file, context),
        },
    )
    return optimized_path
Beispiel #7
0
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    """Produce the context's ``pdfa.pdf`` from *input_pdf* via the plugin hook.

    Before conversion, NUL bytes are removed from every DocumentInfo value.
    The cleaned file is saved as ``fix_docinfo.pdf``; when nothing had to be
    changed, ``fix_docinfo.pdf`` is a symlink to *input_pdf* instead.

    Returns:
        Path of the generated ``pdfa.pdf``.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    docinfo_fixed_pdf = context.get_path('fix_docinfo.pdf')
    pdfa_output = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    nul = b'\x00'
    scrubbed = False
    with pikepdf.open(input_pdf) as pdf:
        try:
            # Taking the length is only a probe for a malformed DocumentInfo.
            len(pdf.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf.docinfo:
                for key, value in pdf.docinfo.items():
                    raw = bytes(value)
                    if nul in raw:
                        pdf.docinfo[key] = raw.replace(nul, b'')
                        scrubbed = True
        if scrubbed:
            pdf.save(docinfo_fixed_pdf)
        else:
            safe_symlink(input_pdf, docinfo_fixed_pdf)

    context.plugin_manager.hook.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[docinfo_fixed_pdf],
        pdfmark=input_ps_stub,
        output_file=pdfa_output,
        compression=options.pdfa_image_compression,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(context.plugin_manager.hook.get_progressbar_class()
                           if options.progress_bar else None),
    )

    return pdfa_output
Beispiel #8
0
def main(infile, outfile, level, jobs=1):
    """Optimize *infile* at the given *level* and copy the result to *outfile*.

    The optimization runs inside a temporary directory with a minimal options
    object standing in for ocrmypdf's parsed options.
    """
    from shutil import copy  # pylint: disable=import-outside-toplevel
    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel

    class OptimizeOptions:
        """Emulate ocrmypdf's options"""

        def __init__(
            self, input_file, jobs, optimize_, jpeg_quality, png_quality, jb2lossy
        ):
            # Values supplied by the caller.
            self.input_file = input_file
            self.jobs = jobs
            self.optimize = optimize_
            self.jpeg_quality = jpeg_quality
            self.png_quality = png_quality
            self.jbig2_lossy = jb2lossy
            # Fixed values.
            self.jbig2_page_group_size = 0
            self.quiet = True
            self.progress_bar = False

    source = Path(infile)
    options = OptimizeOptions(
        input_file=source,
        jobs=jobs,
        optimize_=int(level),
        jpeg_quality=0,  # Use default
        png_quality=0,
        jb2lossy=False,
    )

    with TemporaryDirectory() as td:
        context = PdfContext(options, td, source, None, None)
        optimized = Path(td) / 'out.pdf'
        save_settings = {
            'compress_streams': True,
            'preserve_pdfa': True,
            'object_stream_mode': pikepdf.ObjectStreamMode.generate,
        }
        optimize(source, optimized, context, save_settings)
        copy(fspath(optimized), fspath(outfile))
Beispiel #9
0
def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
    """Concatenate per-page text files into the context's ``sidecar.txt``.

    Pages are separated by a single form feed. A falsy entry in *txt_files*
    is written as an ``[OCR skipped on page N]`` placeholder (1-based page
    number).

    Returns:
        Path of the merged sidecar file.
    """
    merged_path = context.get_path('sidecar.txt')
    with open(merged_path, 'w', encoding="utf-8") as out:
        for index, page_txt in enumerate(txt_files):
            if index:
                out.write('\f')  # Form feed between pages
            if not page_txt:
                out.write(f'[OCR skipped on page {(index + 1)}]')
                continue
            with open(page_txt, 'r', encoding="utf-8") as src:
                content = src.read()
            # Some OCR engines (e.g. Tesseract v4 alpha) add form feeds
            # between pages, and some do not. For consistency, we ignore
            # any added by the OCR engine and add them on our own.
            out.write(content[:-1] if content.endswith('\f') else content)
    return merged_path
Beispiel #10
0
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that convert_to_pdfa logs a malformed DocumentInfo block.

    The trailer's Info entry is replaced with a stream before conversion, and
    the log is expected to mention 'malformed DocumentInfo block'.
    """
    ps_stub = outdir / 'pdfa.ps'
    rendered = outdir / 'layers.rendered.pdf'
    generate_pdfa_ps(ps_stub)

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(rendered, fix_metadata_version=False)

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = get_parser().parse_args(args=cli_args)
    pdfinfo = PdfInfo(rendered)
    context = PdfContext(options, outdir, rendered, pdfinfo, get_plugin_manager([]))

    convert_to_pdfa(str(rendered), str(ps_stub), context)

    print(caplog.records)
    logged = [record.message for record in caplog.records]
    assert any('malformed DocumentInfo block' in message for message in logged)
Beispiel #11
0
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the complete pipeline for one input file and return an exit code.

    Creates a temporary work folder, triages the input into a PDF, gathers
    its PdfInfo, builds a PdfContext, validates the options against it and
    runs exec_concurrent. The work folder is always handed to
    cleanup_working_files on the way out.

    Args:
        options: pipeline options. ``options.jobs`` is set from
            available_cpu_count() when it is falsy.
        plugin_manager: plugin manager to use; when falsy, one is created
            from ``options.plugins``.
        api: when true, the debug log file is never configured and each
            ``except`` clause is given ``NeverRaise`` instead of a real
            exception class (presumably so exceptions propagate to the API
            caller -- confirm against NeverRaise's definition).

    Returns:
        ``ExitCode.ok`` on success; otherwise
        ``ExitCode.pdfa_conversion_failed``, ``ExitCode.invalid_output_pdf``,
        ``ExitCode.ctrl_c``, ``ExitCode.other_error``, or the ``exit_code``
        of a caught ExitCodeException.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        # Report on the output. Only a regular output file (not stdout, not a
        # writable stream, not the null device) is checked for PDF/A
        # conformance and validity.
        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    # Each handler below is active only when `api` is false; with `api` true
    # the clause names NeverRaise instead of the real exception class.
    except (KeyboardInterrupt if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        # Runs on every exit path, including the early returns above.
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
Beispiel #12
0
def generate_postscript_stub(context: PdfContext):
    """Generate the PDF/A PostScript stub at the context's ``pdfa.ps`` path.

    Returns:
        The path that was passed to generate_pdfa_ps.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path