Beispiel #1
0
def test_old_tesseract_error():
    """Expect MissingDependencyError for the sandwich renderer without text-only PDF.

    ``has_textonly_pdf`` is patched to return ``False`` for the duration of
    the test; building the options, creating the plugin manager and
    validating must then raise ``MissingDependencyError``.
    """
    no_textonly = patch(
        'ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=False
    )
    with no_textonly, pytest.raises(MissingDependencyError):
        options = make_opts(pdf_renderer='sandwich', language='eng')
        pm = get_plugin_manager(options.plugins)
        vd.check_options(options, pm)
Beispiel #2
0
def run(args=None):
    """Parse arguments, set up logging, validate options and run the pipeline.

    Args:
        args: Argument list handed to ``get_parser_options_plugins``.

    Returns:
        ``ExitCode.bad_args`` when the stream check fails or option validation
        raises ``ValueError``; the exception's own ``exit_code`` for
        ``BadArgsError``; ``ExitCode.missing_dependency`` for
        ``MissingDependencyError``; otherwise whatever ``run_pipeline``
        returns.
    """
    _, options, plugin_manager = get_parser_options_plugins(args=args)
    if not check_closed_streams(options):
        return ExitCode.bad_args

    # os.nice does not exist on every platform; only call it where it does.
    if hasattr(os, 'nice'):
        os.nice(5)

    verbosity = options.verbose
    stderr_is_tty = os.isatty(sys.stderr.fileno())
    if not stderr_is_tty:
        # No progress bar when stderr is not a terminal.
        options.progress_bar = False
    if options.quiet:
        verbosity = Verbosity.quiet
        options.progress_bar = False

    configure_logging(
        verbosity,
        progress_bar_friendly=options.progress_bar,
        manage_root_logger=True,
    )
    log.debug('ocrmypdf %s', __version__)

    try:
        check_options(options, plugin_manager)
    except (ValueError, BadArgsError, MissingDependencyError) as err:
        log.error(err)
        # Checked in the same precedence order as separate except clauses.
        if isinstance(err, ValueError):
            return ExitCode.bad_args
        if isinstance(err, BadArgsError):
            return err.exit_code
        return ExitCode.missing_dependency

    return run_pipeline(options=options, plugin_manager=plugin_manager)
Beispiel #3
0
def test_hocr_notlatin_warning(caplog):
    """The hocr renderer with a non-Latin language must log a warning."""
    # The language-installed check is stubbed out: the test only needs to
    # pretend that a non-Latin language is available.
    skip_language_check = patch(
        'ocrmypdf._validation.check_options_languages', return_value=None
    )
    with skip_language_check:
        opts_and_pm = make_opts_pm(
            language='chi_sim', pdf_renderer='hocr', output_type='pdfa'
        )
        vd.check_options(*opts_and_pm)
    assert 'PDF renderer is known to cause' in caplog.text
Beispiel #4
0
def test_no_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when the unpaper version lookup fails."""
    src_path = fspath(resources / "c02-22.pdf")
    dst_path = fspath(no_outpdf)
    cli_args = ["--clean", src_path, dst_path]
    options = parser.parse_args(args=cli_args)

    # Make the version lookup behave as if the executable is absent.
    with patch(
        "ocrmypdf.exec.unpaper.version",
        side_effect=FileNotFoundError("unpaper"),
    ):
        with pytest.raises(MissingDependencyError):
            check_options(options)
def test_no_progress_bar(progress_bar, resources):
    """tqdm must be called with ``disable`` opposite to the ``progress_bar`` option."""
    options = make_opts(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    with patch('ocrmypdf.pdfinfo.info.tqdm', autospec=True) as fake_tqdm:
        vd.check_options(options)
        info = PdfInfo(options.input_file, progbar=options.progress_bar)
        assert fake_tqdm.called
        # call_args is an (args, kwargs) pair; only the keywords matter here.
        call_kwargs = fake_tqdm.call_args[1]
        assert call_kwargs['disable'] != progress_bar
Beispiel #6
0
def test_old_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when unpaper reports version 0.5."""
    src_path = fspath(resources / "c02-22.pdf")
    dst_path = fspath(no_outpdf)
    cli_args = ["--clean", src_path, dst_path]

    _, options, plugin_manager = get_parser_options_plugins(cli_args)
    with patch("ocrmypdf._exec.unpaper.version", return_value='0.5'):
        with pytest.raises(MissingDependencyError):
            check_options(options, plugin_manager)
Beispiel #7
0
def test_no_progress_bar(progress_bar, resources):
    """tqdm must be called with ``disable`` opposite to the ``progress_bar`` option."""
    options = make_opts(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    pm = get_plugin_manager(options.plugins)
    with patch('ocrmypdf._concurrent.tqdm', autospec=True) as fake_tqdm:
        vd.check_options(options, pm)
        info = PdfInfo(options.input_file, progbar=options.progress_bar)
        assert info is not None
        assert fake_tqdm.called
        # call_args is an (args, kwargs) pair; only the keywords matter here.
        call_kwargs = fake_tqdm.call_args[1]
        assert call_kwargs['disable'] != progress_bar
Beispiel #8
0
def test_no_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when the unpaper version lookup fails."""
    src_path = fspath(resources / "c02-22.pdf")
    dst_path = fspath(no_outpdf)
    cli_args = ["--clean", src_path, dst_path]

    _, options, plugin_manager = get_parser_options_plugins(cli_args)
    with patch("ocrmypdf._exec.unpaper.version") as version_mock:
        # Make the version lookup behave as if the executable is absent.
        version_mock.side_effect = FileNotFoundError("unpaper")
        with pytest.raises(MissingDependencyError):
            check_options(options, plugin_manager)
        # The failure must come from actually querying the version.
        version_mock.assert_called()
Beispiel #9
0
def test_user_words(caplog):
    """Check when the "ignores --user-words" warning is and is not logged.

    With ``has_user_words`` patched to ``False`` and ``user_words`` set, the
    warning must appear; with it patched to ``True`` and ``user_patterns``
    set, the warning must be absent.
    """
    warning = '4.0 ignores --user-words'
    target = 'ocrmypdf._exec.tesseract.has_user_words'

    def validate(**option_kwargs):
        # Build options plus a matching plugin manager and validate them.
        options = make_opts(**option_kwargs)
        vd.check_options(options, get_plugin_manager(options.plugins))

    with patch(target, return_value=False):
        validate(user_words='foo')
        assert warning in caplog.text

    # Drop records from the first scenario so they cannot leak into the second.
    caplog.clear()

    with patch(target, return_value=True):
        validate(user_patterns='foo')
        assert warning not in caplog.text
Beispiel #10
0
def test_old_ghostscript(caplog):
    """Exercise option validation against three patched Ghostscript versions.

    * 9.19 with ``chi_sim`` and ``pdfa`` output: a warning is logged.
    * 9.18 with ``pdfa-3`` output: ``MissingDependencyError``.
    * 9.24 with default options: ``MissingDependencyError``.
    """

    def gs_version(version):
        # Patch that makes the Ghostscript version lookup return ``version``.
        return patch(
            'ocrmypdf._exec.ghostscript.version', return_value=version
        )

    def textonly_supported():
        # Patch that makes has_textonly_pdf return True.
        return patch(
            'ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=True
        )

    def skip_language_check():
        # Patch that replaces the language check with a stub returning None.
        return patch(
            'ocrmypdf._validation.check_options_languages', return_value=None
        )

    with gs_version('9.19'), textonly_supported(), skip_language_check():
        opts_and_pm = make_opts_pm(language='chi_sim', output_type='pdfa')
        vd.check_options(*opts_and_pm)
        assert 'Ghostscript does not work correctly' in caplog.text

    with gs_version('9.18'), textonly_supported():
        with pytest.raises(MissingDependencyError):
            vd.check_options(*make_opts_pm(output_type='pdfa-3'))

    with gs_version('9.24'), textonly_supported():
        with pytest.raises(MissingDependencyError):
            vd.check_options(*make_opts_pm())
Beispiel #11
0
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: Plugins to load. A falsy value is treated as an empty list, a
            single ``str`` or :class:`pathlib.Path` is wrapped in a list, and any
            other iterable is copied into a list.
        **kwargs: Additional keyword arguments, forwarded to ``create_options``
            together with the named arguments. Passing ``verbose`` triggers a
            warning pointing to ``ocrmypdf.configure_logging()``.
    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` to a list before it is used and forwarded below.
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    # The leading underscore keeps the plugin manager out of the locals()
    # snapshot taken below.
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Collect every local of this function whose name does not start with '_'
    # (except ``kwargs``): all named arguments, the normalized ``plugins`` and
    # ``parser``. locals() is the comprehension's outermost iterable, so it is
    # evaluated in this function's scope. Renaming or adding locals above this
    # line changes what gets forwarded.
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
Beispiel #12
0
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        plugins: Plugins to load. A falsy value is replaced by an empty list;
            any other value is passed to ``get_plugin_manager`` unchanged.
        **kwargs: Additional keyword arguments, forwarded to ``create_options``
            together with the named arguments.
    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize a missing/empty ``plugins`` to a list before it is forwarded.
    if not plugins:
        plugins = []

    parser = get_parser()
    # The leading underscore keeps the plugin manager out of the locals()
    # snapshot taken below.
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Collect every local of this function whose name does not start with '_'
    # (except ``kwargs``): all named arguments, the normalized ``plugins`` and
    # ``parser``. locals() is the comprehension's outermost iterable, so it is
    # evaluated in this function's scope. Renaming or adding locals above this
    # line changes what gets forwarded.
    create_options_kwargs = {
        k: v
        for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options,
                        plugin_manager=_plugin_manager,
                        api=True)
Beispiel #13
0
def test_pagesegmode_warning(caplog):
    """Page segmentation mode '0' must produce a 'disable OCR' log message."""
    options = make_opts(tesseract_pagesegmode='0')
    pm = get_plugin_manager(options.plugins)
    vd.check_options(options, pm)
    logged = caplog.text
    assert 'disable OCR' in logged