def test_old_tesseract_error():
    """Option validation must fail when text-only PDF support is unavailable.

    ``has_textonly_pdf`` is patched to return ``False`` and the sandwich
    renderer is requested; ``check_options`` is then expected to raise
    ``MissingDependencyError``.
    """
    textonly_unavailable = patch(
        'ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=False
    )
    with textonly_unavailable, pytest.raises(MissingDependencyError):
        options = make_opts(pdf_renderer='sandwich', language='eng')
        pm = get_plugin_manager(options.plugins)
        vd.check_options(options, pm)
def run(args=None):
    """Parse arguments, validate the options and run the pipeline.

    The progress bar is switched off when standard error is not a terminal
    or when ``options.quiet`` is set; ``options.quiet`` additionally forces
    the logging verbosity to ``Verbosity.quiet``.

    Args:
        args: Forwarded to ``get_parser_options_plugins``.

    Returns:
        ``ExitCode.bad_args`` when the closed-streams check fails or
        ``check_options`` raises ``ValueError``; the exception's
        ``exit_code`` when it raises ``BadArgsError``;
        ``ExitCode.missing_dependency`` when it raises
        ``MissingDependencyError``; otherwise the result of
        ``run_pipeline``.
    """
    _parser, options, plugin_manager = get_parser_options_plugins(args=args)

    if not check_closed_streams(options):
        return ExitCode.bad_args

    # os.nice() does not exist on every platform.
    if hasattr(os, 'nice'):
        os.nice(5)

    log_level = options.verbose
    stderr_is_tty = os.isatty(sys.stderr.fileno())
    if not stderr_is_tty:
        options.progress_bar = False
    if options.quiet:
        log_level = Verbosity.quiet
        options.progress_bar = False

    configure_logging(
        log_level,
        progress_bar_friendly=options.progress_bar,
        manage_root_logger=True,
    )
    log.debug('ocrmypdf %s', __version__)

    # The order of the handlers is significant and mirrors the original.
    try:
        check_options(options, plugin_manager)
    except ValueError as err:
        log.error(err)
        return ExitCode.bad_args
    except BadArgsError as err:
        log.error(err)
        return err.exit_code
    except MissingDependencyError as err:
        log.error(err)
        return ExitCode.missing_dependency

    return run_pipeline(options=options, plugin_manager=plugin_manager)
def test_hocr_notlatin_warning(caplog):
    """The hocr renderer with ``chi_sim`` must log the renderer warning."""
    # Bypass the test to see if the language is installed; we just want to
    # pretend that a non-Latin language is installed.
    skip_language_check = patch(
        'ocrmypdf._validation.check_options_languages', return_value=None
    )
    with skip_language_check:
        check_args = make_opts_pm(
            language='chi_sim', pdf_renderer='hocr', output_type='pdfa'
        )
        vd.check_options(*check_args)
        assert 'PDF renderer is known to cause' in caplog.text
def test_no_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when the unpaper version lookup fails.

    The unpaper ``version`` function is patched to raise
    ``FileNotFoundError``; ``check_options`` is expected to raise
    ``MissingDependencyError``.
    """
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options = parser.parse_args(args=cli_args)

    unpaper_missing = patch(
        "ocrmypdf.exec.unpaper.version", side_effect=FileNotFoundError("unpaper")
    )
    with unpaper_missing, pytest.raises(MissingDependencyError):
        check_options(options)
def test_no_progress_bar(progress_bar, resources):
    """tqdm must be called with ``disable`` differing from ``progress_bar``."""
    source_pdf = resources / 'trivial.pdf'
    options = make_opts(progress_bar=progress_bar, input_file=source_pdf)

    with patch('ocrmypdf.pdfinfo.info.tqdm', autospec=True) as tqdm_mock:
        vd.check_options(options)
        info = PdfInfo(options.input_file, progbar=options.progress_bar)
        assert tqdm_mock.called
        # call_args is an (args, kwargs) pair; only the keywords matter here.
        tqdm_kwargs = tqdm_mock.call_args[1]
        assert tqdm_kwargs['disable'] != progress_bar
def test_old_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when unpaper reports version 0.5."""
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    _parser, options, pm = get_parser_options_plugins(cli_args)

    outdated_unpaper = patch("ocrmypdf._exec.unpaper.version", return_value='0.5')
    with outdated_unpaper, pytest.raises(MissingDependencyError):
        check_options(options, pm)
def test_no_progress_bar(progress_bar, resources):
    """tqdm must be called with ``disable`` differing from ``progress_bar``."""
    source_pdf = resources / 'trivial.pdf'
    options = make_opts(progress_bar=progress_bar, input_file=source_pdf)
    pm = get_plugin_manager(options.plugins)

    with patch('ocrmypdf._concurrent.tqdm', autospec=True) as tqdm_mock:
        vd.check_options(options, pm)
        info = PdfInfo(options.input_file, progbar=options.progress_bar)
        assert info is not None
        assert tqdm_mock.called
        # call_args is an (args, kwargs) pair; only the keywords matter here.
        tqdm_kwargs = tqdm_mock.call_args[1]
        assert tqdm_kwargs['disable'] != progress_bar
def test_no_unpaper(resources, no_outpdf):
    """``--clean`` must fail validation when the unpaper version lookup fails.

    The unpaper ``version`` function is patched to raise
    ``FileNotFoundError``; ``check_options`` is expected to raise
    ``MissingDependencyError`` and the patched function must have been
    called.
    """
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    _parser, options, pm = get_parser_options_plugins(cli_args)

    with patch(
        "ocrmypdf._exec.unpaper.version", side_effect=FileNotFoundError("unpaper")
    ) as version_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        # Checked outside the raises block so the assertion is reachable.
        version_mock.assert_called()
def test_user_words(caplog):
    """Check when the "ignores --user-words" message is logged.

    With ``has_user_words`` patched to ``False`` and ``user_words`` set, the
    message must appear in the log.  With it patched to ``True`` and only
    ``user_patterns`` set, the message must be absent.
    """
    target = 'ocrmypdf._exec.tesseract.has_user_words'
    message = '4.0 ignores --user-words'

    with patch(target, return_value=False):
        options = make_opts(user_words='foo')
        pm = get_plugin_manager(options.plugins)
        vd.check_options(options, pm)
        assert message in caplog.text

    # Start the second scenario from an empty log.
    caplog.clear()

    with patch(target, return_value=True):
        options = make_opts(user_patterns='foo')
        pm = get_plugin_manager(options.plugins)
        vd.check_options(options, pm)
        assert message not in caplog.text
def test_old_ghostscript(caplog):
    """Exercise option validation against three patched Ghostscript versions.

    * 9.19 with ``chi_sim`` and ``pdfa`` output: a warning is logged.
    * 9.18 with ``pdfa-3`` output: ``MissingDependencyError`` is raised.
    * 9.24 with default options: ``MissingDependencyError`` is raised.
    """
    gs_version = 'ocrmypdf._exec.ghostscript.version'
    textonly_pdf = 'ocrmypdf._exec.tesseract.has_textonly_pdf'
    check_languages = 'ocrmypdf._validation.check_options_languages'

    with patch(gs_version, return_value='9.19'):
        with patch(textonly_pdf, return_value=True):
            with patch(check_languages, return_value=None):
                vd.check_options(
                    *make_opts_pm(language='chi_sim', output_type='pdfa')
                )
                assert 'Ghostscript does not work correctly' in caplog.text

    # (patched version, make_opts_pm keyword arguments) pairs that must fail.
    rejected = (
        ('9.18', {'output_type': 'pdfa-3'}),
        ('9.24', {}),
    )
    for version, opt_kwargs in rejected:
        with patch(gs_version, return_value=version), patch(
            textonly_pdf, return_value=True
        ):
            with pytest.raises(MissingDependencyError):
                vd.check_options(*make_opts_pm(**opt_kwargs))
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    Most arguments mirror the command line parameter of the same name; see
    the command line documentation for those. Arguments with API-specific
    behavior are described below.

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to
            set breakpoints.
        input_file: A :class:`pathlib.Path`, ``str`` or ``bytes`` is
            interpreted as the file system path of the input file. An object
            that appears to be a readable stream (with methods such as
            ``.read()`` and ``.seek()``) is read in its entirety and saved
            to a temporary file. ``"-"`` means standard input is read.
        output_file: A :class:`pathlib.Path`, ``str`` or ``bytes`` is
            interpreted as the file system path of the output file. If the
            object appears to be a writable stream, the output is written to
            it. ``"-"`` writes the output to ``sys.stdout`` (provided that
            standard output does not seem to be a terminal device). When a
            stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (the
            stream is not read back after it is written).

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed,
            preventing merging with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is
            missing or was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an
            image that could not be read, or some other file type that is
            not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution
            of the image is not credible (allowing it to proceed would cause
            poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the
            intended output file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or
            digital text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a
            subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password
            protected). OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its
            configuration was not valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # NOTE: every local bound before the locals() snapshot below that does
    # not start with an underscore is forwarded to create_options(). Helper
    # variables in this function must therefore be underscore-prefixed.
    if not plugins:
        plugins = []
    else:
        plugins = [plugins] if isinstance(plugins, (str, Path)) else list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    _snapshot = dict(locals())
    _forwarded = {
        name: value
        for name, value in _snapshot.items()
        if name != 'kwargs' and not name.startswith('_')
    }
    # Explicit extra keyword arguments override the captured parameters.
    create_options_kwargs = {**_forwarded, **kwargs}

    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line
    parameter. A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This
            reduces performance but may make debugging easier since it is
            easier to set breakpoints.
        plugins: Plugins to load. A single ``str`` or path-like value is
            treated as one plugin rather than as an iterable of characters;
            any other iterable is copied into a list.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed,
            preventing merging with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is
            missing or was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an
            image that could not be read, or some other file type that is
            not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution
            of the image is not credible (allowing it to proceed would cause
            poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the
            intended output file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or
            digital text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a
            subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password
            protected). OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its
            configuration was not valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` to a list. A bare string or path would otherwise
    # be handed on as an iterable of characters, and a one-shot iterable
    # would be handed on twice: to get_plugin_manager() and, through the
    # locals() snapshot below, to create_options().
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, os.PathLike)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Every local bound so far that is not underscore-prefixed (the named
    # parameters plus ``parser``) is forwarded to create_options(). Do not
    # introduce new non-underscore locals above this line.
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def test_pagesegmode_warning(caplog):
    """``tesseract_pagesegmode='0'`` must log a message containing 'disable OCR'."""
    options = make_opts(tesseract_pagesegmode='0')
    pm = get_plugin_manager(options.plugins)
    vd.check_options(options, pm)
    assert 'disable OCR' in caplog.text