def run(args=None):
    """Command-line entry point: parse arguments, validate them, run the pipeline.

    Args:
        args: Argument list handed to ``get_parser_options_plugins``; ``None``
            is passed through unchanged.

    Returns:
        ``ExitCode.bad_args`` if the stream check fails or option validation
        raises ``ValueError``; the ``exit_code`` carried by a ``BadArgsError``;
        ``ExitCode.missing_dependency`` for a ``MissingDependencyError``;
        otherwise whatever ``run_pipeline`` returns.
    """
    _, options, plugin_manager = get_parser_options_plugins(args=args)

    if not check_closed_streams(options):
        return ExitCode.bad_args

    # Raise our niceness by 5 on platforms where os.nice exists.
    if hasattr(os, 'nice'):
        os.nice(5)

    verbosity = options.verbose
    # The progress bar is disabled when stderr is not a terminal or when the
    # user asked for quiet output. The isatty() test is always evaluated first.
    if not os.isatty(sys.stderr.fileno()) or options.quiet:
        options.progress_bar = False
    if options.quiet:
        verbosity = Verbosity.quiet

    configure_logging(
        verbosity,
        progress_bar_friendly=options.progress_bar,
        manage_root_logger=True,
    )
    log.debug('ocrmypdf %s', __version__)

    try:
        check_options(options, plugin_manager)
    except (ValueError, BadArgsError, MissingDependencyError) as err:
        log.error(err)
        # Dispatch in the same precedence order as separate except clauses.
        if isinstance(err, ValueError):
            return ExitCode.bad_args
        if isinstance(err, BadArgsError):
            return err.exit_code
        return ExitCode.missing_dependency

    return run_pipeline(options=options, plugin_manager=plugin_manager)
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line
    parameter. A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to
            set breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and
            saved to a temporary file. If ``input_file`` is ``"-"``, standard
            input will be read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()``
            and ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to
            ``sys.stdout`` (provided that standard output does not seem to be
            a terminal device). When a stream is used as output, whether via a
            writable object or ``"-"``, some final validation steps are not
            performed (we do not read back the stream after it is written).
        plugins: Plugins to load. A single ``str`` or :class:`pathlib.Path`
            is wrapped in a one-element list; any other iterable is copied
            into a list; a falsy value becomes an empty list.
        **kwargs: Additional options, merged into the arguments given to
            ``create_options``. Passing ``verbose`` emits a warning; use
            ``ocrmypdf.configure_logging()`` instead.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing
            merging with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is
            missing or was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an
            image that could not be read, or some other file type that is not
            a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of
            the image is not credible (allowing it to proceed would cause poor
            OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended
            output file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or
            digital text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a
            subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password
            protected). OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration
            was not valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` to a concrete list before it is used and captured.
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Snapshot every local defined so far -- all named parameters plus
    # ``plugins`` and ``parser`` -- as keyword arguments for create_options().
    # Names starting with an underscore (``_plugin_manager``) and the
    # ``kwargs`` dict itself are excluded.
    # NOTE: any new non-underscore local introduced above this line would be
    # forwarded as well, so keep helper variables underscore-prefixed.
    create_options_kwargs = {
        k: v
        for k, v in locals().items()
        if not k.startswith('_') and k != 'kwargs'
    }
    # Extra keyword arguments are merged last.
    create_options_kwargs.update(kwargs)

    # The warning is advisory only: 'verbose' has already been merged into
    # create_options_kwargs above.
    # NOTE(review): presumably create_options() discards it -- confirm.
    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line
    parameter. A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to
            set breakpoints.
        plugins: Plugins to load. A single ``str`` or path-like value is
            wrapped in a one-element list; any other iterable is copied into
            a list; a falsy value becomes an empty list.
        **kwargs: Additional options, merged into the arguments given to
            ``create_options``.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing
            merging with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is
            missing or was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an
            image that could not be read, or some other file type that is not
            a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of
            the image is not credible (allowing it to proceed would cause poor
            OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended
            output file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or
            digital text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a
            subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password
            protected). OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration
            was not valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` to a concrete list. A lone string or path must not
    # be treated as an iterable of plugin names, and a one-shot iterable must
    # survive being used twice (plugin manager and create_options).
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, os.PathLike)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Snapshot every local defined so far -- all named parameters plus
    # ``plugins`` and ``parser`` -- as keyword arguments for create_options().
    # Names starting with an underscore (``_plugin_manager``) and the
    # ``kwargs`` dict itself are excluded.
    # NOTE: any new non-underscore local introduced above this line would be
    # forwarded as well, so keep helper variables underscore-prefixed.
    create_options_kwargs = {
        k: v
        for k, v in locals().items()
        if not k.startswith('_') and k != 'kwargs'
    }
    # Extra keyword arguments are merged last.
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)