def test_mono_image(blank_hocr, outdir):
    """Render a blank hOCR page over a 1-bit image and validate the PDF.

    Builds an 8x8 mode-'1' image with its main diagonal set, saves it as a
    TIFF in ``outdir``, converts ``blank_hocr`` to a PDF that uses that image,
    and asserts that ``check_pdf`` accepts the result.
    """
    tif_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    im = Image.new('1', (8, 8), 0)
    for n in range(8):
        im.putpixel((n, n), 1)
    im.save(tif_path, format='TIFF')

    hocr = hocrtransform.HocrTransform(str(blank_hocr), 300)
    hocr.to_pdf(str(pdf_path), image_filename=str(tif_path))

    # The result of check_pdf must be asserted; calling it bare would let an
    # invalid PDF pass silently.
    assert check_pdf(str(pdf_path))
def test_stdout(ocrmypdf_exec, resources, outpdf):
    """Run the command line with ``-`` as the output and capture stdout to a file.

    The captured stream is written to ``outpdf`` and must pass ``check_pdf``.
    Skipped when ``COV_CORE_DATAFILE`` is set in the environment.
    """
    if 'COV_CORE_DATAFILE' in os.environ:
        # Pass the reason positionally: the ``msg=`` keyword was deprecated in
        # pytest 7 and later removed, while the positional form works on both
        # old and new pytest releases.
        pytest.skip("Coverage uses stdout")

    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    # Runs: ocrmypdf francais.pdf - > test_stdout.pdf
    with open(output_file, 'wb') as output_stream:
        p_args = ocrmypdf_exec + [
            input_file,
            '-',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        ]
        run(p_args, stdout=output_stream, stderr=PIPE, stdin=DEVNULL, check=True)

    assert check_pdf(output_file)
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the whole pipeline for ``options`` and return an exit code.

    Args:
        options: Options object. ``options.jobs`` is filled in from
            ``available_cpu_count()`` when it is falsy.
        plugin_manager: Plugin manager to use; when falsy, one is created
            from ``options.plugins``.
        api: When true, the debug log file is not configured and each
            ``except`` clause matches ``NeverRaise`` instead of the real
            exception class (presumably so that exceptions propagate to the
            caller rather than being turned into exit codes -- confirm
            against the definition of ``NeverRaise``).

    Returns:
        ``ExitCode.ok`` on success; otherwise the exit code matching the
        failure (PDF/A conversion failure, invalid output PDF, Ctrl-C, the
        ``exit_code`` of an ``ExitCodeException``, or ``ExitCode.other_error``).

    The temporary work folder is always passed to ``cleanup_working_files``
    on the way out, including on early returns and handled exceptions.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))

    debug_log_handler = None
    wants_debug_log = options.keep_temporary_files or options.verbose >= 1
    under_pytest = os.environ.get('PYTEST_CURRENT_TEST', '')
    if wants_debug_log and not under_pytest and not api:
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            work_folder / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            # A single worker when using threads, to help debug
            max_workers=1 if options.use_threads else options.jobs,
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif not samefile(options.output_file, os.devnull):
            # Output went to a real file (nothing is said for dev null):
            # verify it before reporting its size.
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if not pdfa_info['pass']:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
                log.info(
                    "Output file is a %s (as expected)", pdfa_info['conformance']
                )
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except (NeverRaise if api else KeyboardInterrupt):
        # Include the traceback only in verbose mode
        report = log.exception if options.verbose >= 1 else log.error
        report("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (NeverRaise if api else ExitCodeException) as exc:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        else:
            exc_name = type(exc).__name__
            exc_text = str(exc)
            if exc_text:
                log.error("%s: %s", exc_name, exc_text)
            else:
                log.error(exc_name)
        return exc.exit_code
    except (NeverRaise if api else Exception):  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except OSError as err:
                print(err, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
def test_pdf_error(resources):
    """check_pdf accepts ``blank.pdf`` and rejects this module's own file."""
    valid_pdf = resources / 'blank.pdf'
    assert check_pdf(valid_pdf)

    # __file__ is the path of the module this test is defined in, which is
    # not a PDF.
    not_a_pdf = __file__
    assert not check_pdf(not_a_pdf)