コード例 #1
0
def test_metadata_fixup_warning(resources, outdir, caplog):
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable
    graph = pikepdf.open(outdir / 'graph.pdf')
    with graph.open_metadata() as meta:
        meta['prism2:publicationName'] = 'OCRmyPDF Test'
    graph.save(outdir / 'graph_mod.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
コード例 #2
0
def test_user_words(caplog):
    with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False):
        opts = make_opts(user_words='foo')
        plugin_manager = get_plugin_manager(opts.plugins)
        vd._check_options(opts, plugin_manager, set())
        assert '4.0 ignores --user-words' in caplog.text
    caplog.clear()
    with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True):
        opts = make_opts(user_patterns='foo')
        plugin_manager = get_plugin_manager(opts.plugins)
        vd._check_options(opts, plugin_manager, set())
        assert '4.0 ignores --user-words' not in caplog.text
コード例 #3
0
def test_prevent_gs_invalid_xml(resources, outdir):
    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            assert 0 < xmp_start < xmp_end
            # Ensure we did not carry the nul forward.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
コード例 #4
0
def test_rasterize_rotates(resources, tmp_path):
    pm = get_plugin_manager([])

    img = tmp_path / 'img90.png'
    pm.hook.rasterize_pdf_page(
        input_file=resources / 'graph.pdf',
        output_file=img,
        raster_device='pngmono',
        raster_dpi=Resolution(20, 20),
        page_dpi=Resolution(20, 20),
        pageno=1,
        rotation=90,
        filter_vector=False,
    )
    assert Image.open(img).size == (123, 151), "Image not rotated"

    img = tmp_path / 'img180.png'
    pm.hook.rasterize_pdf_page(
        input_file=resources / 'graph.pdf',
        output_file=img,
        raster_device='pngmono',
        raster_dpi=Resolution(20, 20),
        page_dpi=Resolution(20, 20),
        pageno=1,
        rotation=180,
        filter_vector=False,
    )
    assert Image.open(img).size == (151, 123), "Image not rotated"
コード例 #5
0
def test_old_tesseract_error():
    with patch('ocrmypdf._exec.tesseract.has_textonly_pdf',
               return_value=False):
        with pytest.raises(MissingDependencyError):
            opts = make_opts(pdf_renderer='sandwich', language='eng')
            plugin_manager = get_plugin_manager(opts.plugins)
            vd.check_options(opts, plugin_manager)
コード例 #6
0
def test_old_tesseract_error():
    with patch('ocrmypdf._exec.tesseract.version',
               return_value='4.00.00alpha'):
        with pytest.raises(MissingDependencyError):
            opts = make_opts(pdf_renderer='sandwich', language='eng')
            plugin_manager = get_plugin_manager(opts.plugins)
            vd._check_options(opts, plugin_manager, {'eng'})
コード例 #7
0
def test_no_progress_bar(progress_bar, resources):
    opts = make_opts(progress_bar=progress_bar, input_file=(resources / 'trivial.pdf'))
    plugin_manager = get_plugin_manager(opts.plugins)
    with patch('ocrmypdf._concurrent.tqdm', autospec=True) as tqdmpatch:
        vd._check_options(opts, plugin_manager, set())
        pdfinfo = PdfInfo(opts.input_file, progbar=opts.progress_bar)
        assert pdfinfo is not None
        assert tqdmpatch.called
        _args, kwargs = tqdmpatch.call_args
        assert kwargs['disable'] != progress_bar
コード例 #8
0
def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    if language is not None:
        kwargs['language'] = language
    parser = get_parser()
    pm = get_plugin_manager(kwargs.get('plugins', []))
    pm.hook.add_options(parser=parser)  # pylint: disable=no-member
    return (
        create_options(
            input_file=input_file, output_file=output_file, parser=parser, **kwargs
        ),
        pm,
    )
コード例 #9
0
def test_language_warning(caplog):
    opts = make_opts(language=None)
    plugin_manager = get_plugin_manager(opts.plugins)
    caplog.set_level(logging.DEBUG)
    with patch('ocrmypdf._validation.locale.getlocale',
               return_value=('en_US', 'UTF-8')):
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        assert '' in caplog.text

    opts = make_opts(language=None)
    with patch('ocrmypdf._validation.locale.getlocale',
               return_value=('fr_FR', 'UTF-8')):
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        assert 'assuming --language' in caplog.text
コード例 #10
0
def test_no_progress_bar(progress_bar, resources):
    opts = make_opts(progress_bar=progress_bar, input_file=(resources / 'trivial.pdf'))
    plugin_manager = get_plugin_manager(opts.plugins)

    vd._check_options(opts, plugin_manager, set())

    pbar_disabled = None

    class CheckProgressBar(NullProgressBar):
        def __init__(self, disable, **kwargs):
            nonlocal pbar_disabled
            pbar_disabled = disable
            super().__init__(disable=disable, **kwargs)

    executor = SerialExecutor(pbar_class=CheckProgressBar)
    pdfinfo = PdfInfo(opts.input_file, progbar=opts.progress_bar, executor=executor)

    assert pdfinfo is not None
    assert pbar_disabled is not None and pbar_disabled != progress_bar
コード例 #11
0
ファイル: test_metadata.py プロジェクト: riku7032/OCRmyPDF
def test_malformed_docinfo(caplog, resources, outdir):
    generate_pdfa_ps(outdir / 'pdfa.ps')
    # copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'])
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(options, outdir, outdir / 'layers.rendered.pdf',
                         pdfinfo, get_plugin_manager([]))

    convert_to_pdfa(str(outdir / 'layers.rendered.pdf'),
                    str(outdir / 'pdfa.ps'), context)

    print(caplog.records)
    assert any('malformed DocumentInfo block' in record.message
               for record in caplog.records)
コード例 #12
0
def test_pagesegmode_warning(caplog):
    opts = make_opts(tesseract_pagesegmode='0')
    plugin_manager = get_plugin_manager(opts.plugins)
    vd._check_options(opts, plugin_manager, set())
    assert 'disable OCR' in caplog.text
コード例 #13
0
ファイル: _sync.py プロジェクト: knobix/OCRmyPDF
def run_pipeline(options, *, plugin_manager, api=False):
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except (KeyboardInterrupt if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
コード例 #14
0
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.read()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfERror: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
コード例 #15
0
ファイル: api.py プロジェクト: riku7032/OCRmyPDF
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfERror: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if not plugins:
        plugins = []

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    create_options_kwargs = {
        k: v
        for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options,
                        plugin_manager=_plugin_manager,
                        api=True)
コード例 #16
0
 def __setstate__(self, state):
     self.__dict__.update(state)
     self.plugin_manager = get_plugin_manager(self.options.plugins)