def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check that metadata_fixup only warns when metadata cannot be copied.

    First runs ``metadata_fixup`` with an unmodified copy of ``graph.pdf`` and
    expects no WARNING log records. Then writes ``graph_mod.pdf`` with an extra
    ``prism2:publicationName`` metadata entry, passes that file to
    ``PdfContext`` and expects at least one WARNING record.
    """
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable.
    # Open the PDF in a context manager so its file handle is released before
    # metadata_fixup() works on the same path again.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def test_user_words(caplog):
    """Check when the '4.0 ignores --user-words' message is logged.

    With ``has_user_words`` patched to return False and ``user_words`` set,
    the message must appear in the log; with it patched to return True and
    ``user_patterns`` set, the message must not appear.
    """
    probe_target = 'ocrmypdf._exec.tesseract.has_user_words'
    ignored_message = '4.0 ignores --user-words'

    with patch(probe_target, return_value=False):
        options = make_opts(user_words='foo')
        vd._check_options(options, get_plugin_manager(options.plugins), set())
        assert ignored_message in caplog.text

    # Start the second scenario with an empty log.
    caplog.clear()

    with patch(probe_target, return_value=True):
        options = make_opts(user_patterns='foo')
        vd._check_options(options, get_plugin_manager(options.plugins), set())
        assert ignored_message not in caplog.text
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a nul character is not carried into the PDF/A's XMP packet.

    Converts a copy of ``trivial.pdf`` with ``convert_to_pdfa`` and then scans
    the raw bytes of ``pdfa.pdf`` between the XMP magic marker and the closing
    ``<?xpacket end`` for either a literal nul byte or its escaped XML form.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): the Pdf object is modified here but never saved in this
    # block, so the file on disk may be unchanged - confirm whether a save()
    # is missing.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            assert 0 < xmp_start < xmp_end
            # Ensure we did not carry the nul forward, neither as an XML
            # character reference nor as a raw byte. The reference is spelled
            # out in ASCII: bytes literals cannot contain non-ASCII characters.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_rasterize_rotates(resources, tmp_path):
    """Check that the rasterize hook honours the ``rotation`` argument.

    Page 1 of ``graph.pdf`` is rasterized at 20 dpi with rotation 90 and 180;
    the resulting image sizes must be (123, 151) and (151, 123) respectively.
    """
    pm = get_plugin_manager([])

    cases = [
        ('img90.png', 90, (123, 151)),
        ('img180.png', 180, (151, 123)),
    ]
    for filename, rotation, expected_size in cases:
        img = tmp_path / filename
        pm.hook.rasterize_pdf_page(
            input_file=resources / 'graph.pdf',
            output_file=img,
            raster_device='pngmono',
            raster_dpi=Resolution(20, 20),
            page_dpi=Resolution(20, 20),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
        )
        # Use a context manager so the image file handle is closed promptly.
        with Image.open(img) as rendered:
            assert rendered.size == expected_size, "Image not rotated"
def test_old_tesseract_error():
    """Expect MissingDependencyError for 'sandwich' without text-only PDF support.

    ``has_textonly_pdf`` is patched to return False while options requesting
    the ``sandwich`` renderer are built and checked.
    """
    no_textonly_pdf = patch(
        'ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=False
    )
    with no_textonly_pdf, pytest.raises(MissingDependencyError):
        options = make_opts(pdf_renderer='sandwich', language='eng')
        vd.check_options(options, get_plugin_manager(options.plugins))
def test_old_tesseract_error():
    """Expect MissingDependencyError for 'sandwich' on tesseract 4.00.00alpha.

    ``tesseract.version`` is patched to return ``'4.00.00alpha'`` while options
    requesting the ``sandwich`` renderer are built and checked.
    """
    alpha_version = patch(
        'ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'
    )
    with alpha_version, pytest.raises(MissingDependencyError):
        options = make_opts(pdf_renderer='sandwich', language='eng')
        vd._check_options(options, get_plugin_manager(options.plugins), {'eng'})
def test_no_progress_bar(progress_bar, resources):
    """Check that tqdm's ``disable`` keyword is the opposite of ``progress_bar``.

    ``ocrmypdf._concurrent.tqdm`` is replaced by an autospec mock; building a
    ``PdfInfo`` must call it, and the ``disable`` keyword of the most recent
    call must differ from the ``progress_bar`` value under test.
    """
    options = make_opts(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    manager = get_plugin_manager(options.plugins)

    with patch('ocrmypdf._concurrent.tqdm', autospec=True) as fake_tqdm:
        vd._check_options(options, manager, set())
        info = PdfInfo(options.input_file, progbar=options.progress_bar)
        assert info is not None
        assert fake_tqdm.called
        # call_args is an (args, kwargs) pair; only the keywords matter here.
        call_kwargs = fake_tqdm.call_args[1]
        assert call_kwargs['disable'] != progress_bar
def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Create an options object and the plugin manager used to build it.

    ``language`` is forwarded to ``create_options`` unless it is None. The
    plugin manager is created from ``kwargs['plugins']`` (default: no plugins)
    and gets a chance to add its options to the parser.

    Returns:
        A tuple ``(options, plugin_manager)``.
    """
    if language is not None:
        kwargs['language'] = language

    arg_parser = get_parser()
    plugin_manager = get_plugin_manager(kwargs.get('plugins', []))
    plugin_manager.hook.add_options(parser=arg_parser)  # pylint: disable=no-member

    options = create_options(
        input_file=input_file, output_file=output_file, parser=arg_parser, **kwargs
    )
    return options, plugin_manager
def test_language_warning(caplog):
    """Check the 'assuming --language' message against the system locale.

    With no language option and the locale patched to ``en_US``, the languages
    must be set to ``{'eng'}`` and the message must not be logged. With the
    locale patched to ``fr_FR``, the languages must still be set to
    ``{'eng'}`` and the message must be logged.
    """
    opts = make_opts(language=None)
    plugin_manager = get_plugin_manager(opts.plugins)
    caplog.set_level(logging.DEBUG)
    with patch('ocrmypdf._validation.locale.getlocale', return_value=('en_US', 'UTF-8')):
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        # The previous check, `'' in caplog.text`, was always true and so
        # verified nothing; assert the absence of the message instead.
        assert 'assuming --language' not in caplog.text

    opts = make_opts(language=None)
    with patch('ocrmypdf._validation.locale.getlocale', return_value=('fr_FR', 'UTF-8')):
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        assert 'assuming --language' in caplog.text
def test_no_progress_bar(progress_bar, resources):
    """Check that the progress bar's ``disable`` flag opposes ``progress_bar``.

    A ``NullProgressBar`` subclass records the ``disable`` argument it is
    constructed with. After building a ``PdfInfo`` through a ``SerialExecutor``
    using that class, a value must have been recorded and it must differ from
    the ``progress_bar`` value under test.
    """
    options = make_opts(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    vd._check_options(options, get_plugin_manager(options.plugins), set())

    # Holds the most recent `disable` value seen by the progress bar class.
    recorded = {'disable': None}

    class RecordingProgressBar(NullProgressBar):
        def __init__(self, disable, **kwargs):
            recorded['disable'] = disable
            super().__init__(disable=disable, **kwargs)

    serial_executor = SerialExecutor(pbar_class=RecordingProgressBar)
    info = PdfInfo(
        options.input_file, progbar=options.progress_bar, executor=serial_executor
    )

    assert info is not None
    assert recorded['disable'] is not None
    assert recorded['disable'] != progress_bar
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a malformed DocumentInfo block is reported in the log.

    The trailer ``Info`` entry of ``trivial.pdf`` is replaced with a
    ``pikepdf.Stream`` holding XML-like bytes and saved as
    ``layers.rendered.pdf``. Converting that file with ``convert_to_pdfa``
    must produce a log record mentioning 'malformed DocumentInfo block'.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    assert any(
        'malformed DocumentInfo block' in record.message for record in caplog.records
    )
def test_pagesegmode_warning(caplog):
    """Check that page segmentation mode '0' logs a 'disable OCR' message."""
    options = make_opts(tesseract_pagesegmode='0')
    manager = get_plugin_manager(options.plugins)

    vd._check_options(options, manager, set())

    assert 'disable OCR' in caplog.text
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the OCR pipeline for the input described by ``options``.

    Creates a temporary work folder, prepares and triages the input file,
    gathers its PdfInfo, builds a PdfContext, validates the options against
    that PDF and executes the pipeline through the executor. Afterwards the
    output is reported on (unless it went to stdout, a stream or the null
    device) and the working files are cleaned up.

    Args:
        options: Options namespace. ``options.jobs`` is replaced with
            ``available_cpu_count()`` when it is falsy.
        plugin_manager: Plugin manager to use. When falsy, a new one is
            created from ``options.plugins``.
        api: When true, the debug log file is not configured and every
            ``except`` clause below is bound to ``NeverRaise`` instead of the
            real exception type. NOTE(review): ``NeverRaise`` is defined
            outside this block; presumably it is never raised, so exceptions
            propagate to the API caller - confirm.

    Returns:
        ``ExitCode.ok`` on success. Otherwise one of
        ``ExitCode.pdfa_conversion_failed``, ``ExitCode.invalid_output_pdf``,
        ``ExitCode.ctrl_c``, ``ExitCode.other_error``, or the ``exit_code`` of
        a caught ``ExitCodeException``.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        # Report on the output; the file checks in the final branch only run
        # when the output is neither stdout, a writable stream nor os.devnull.
        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    # Each handler catches its real exception type only when api is false;
    # otherwise the clause is bound to NeverRaise.
    except (KeyboardInterrupt if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        # Runs on every exit path, including the early returns above.
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: A single plugin given as ``str`` or :class:`pathlib.Path`, or
            an iterable of them. A single value is wrapped in a list; a falsy
            value means no plugins.
        **kwargs: Additional options, merged over the named arguments before
            the options object is created. ``verbose`` is accepted but only
            triggers a warning that it is ignored.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize `plugins` to a list: nothing -> [], one plugin -> [plugin].
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Every local whose name does not start with '_' (the function arguments
    # plus `parser`) is forwarded to create_options(). Any local introduced
    # above this line that must NOT be forwarded therefore needs a leading
    # underscore, as `_plugin_manager` has.
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        **kwargs: Additional options, merged over the named arguments before
            the options object is created.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if not plugins:
        plugins = []

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Every local whose name does not start with '_' (the function arguments
    # plus `parser`) is forwarded to create_options(). Any local introduced
    # above this line that must NOT be forwarded therefore needs a leading
    # underscore, as `_plugin_manager` has.
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def __setstate__(self, state):
    """Restore instance attributes from ``state`` and rebuild the plugin manager.

    The plugin manager is always created anew from ``self.options.plugins``
    after the attributes are restored, replacing any ``plugin_manager`` entry
    that ``state`` may have carried.
    """
    vars(self).update(state)
    restored_plugins = self.options.plugins
    self.plugin_manager = get_plugin_manager(restored_plugins)