def test_override_metadata(spoof_tesseract_noop, output_type):
    """Check that metadata given on the command line appears in ``pdfinfo`` output.

    OCRmyPDF is run on ``c02-22.pdf`` with non-ASCII ``--title``, ``--author``
    and ``--subject`` values. The resulting file is then inspected with the
    external ``pdfinfo`` tool, whose ``key: value`` lines are parsed into a
    dict and compared with the values that were passed in.

    ``spoof_tesseract_noop`` is forwarded to ``run_ocrmypdf`` as ``env``;
    ``output_type`` is forwarded as ``--output-type`` and must match what
    ``file_claims_pdfa`` reports for the output file.
    """
    source_pdf = _infile('c02-22.pdf')
    result_pdf = _outfile('test_override_metadata.pdf')

    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'
    subject = 'U+1030C is: 𐌌'

    cli_args = [
        '--title', title,
        '--author', author,
        '--subject', subject,
        '--output-type', output_type,
    ]
    proc, _stdout, _stderr = run_ocrmypdf(
        source_pdf, result_pdf, *cli_args, env=spoof_tesseract_noop
    )
    assert proc.returncode == ExitCode.ok

    report = check_output(['pdfinfo', result_pdf], universal_newlines=True)
    # Split on the first colon only: values (e.g. the subject) may contain colons
    pairs = (row.strip().split(':', maxsplit=1) for row in report.splitlines())
    info = {key.strip(): value.strip() for key, value in pairs}

    assert info['Title'] == title
    assert info['Author'] == author
    assert info['Subject'] == subject
    assert info.get('Keywords', '') == ''

    claims = file_claims_pdfa(result_pdf)
    assert claims['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that ``--title``/``--author`` override the output metadata.

    OCRmyPDF is run on ``c02-22.pdf`` with non-ASCII title and author values.
    The test then verifies that the output carries those values, has no
    keywords, keeps the input's creation date, and that ``file_claims_pdfa``
    reports the requested ``output_type``.

    Args:
        spoof_tesseract_noop: forwarded to ``run_ocrmypdf`` as ``env``.
        output_type: value passed to ``--output-type``.
        resources: path-like object; ``resources / 'c02-22.pdf'`` is the input.
        outpdf: path the output PDF is written to.
    """
    # Function-scope import: only the version check below needs it.
    import re

    input_file = resources / 'c02-22.pdf'

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )
    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    # Compare (major, minor) numerically. A plain string comparison orders
    # '10.0.0' before '9.24', which would skip the xfail on Ghostscript 10.x.
    gs_version = tuple(
        int(number) for number in re.findall(r'\d+', str(ghostscript.version()))[:2]
    )
    if gs_version >= (9, 24):
        pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')

    assert after.metadata.Title == german, after.metadata
    assert after.metadata.Author == chinese, after.metadata
    assert after.metadata.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.metadata.CreationDate))
    after_date = decode_pdf_date(str(after.metadata.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
def test_override_metadata(output_type, resources, outpdf):
    """Check that ``--title``/``--author`` override the output's docinfo.

    OCRmyPDF is run on ``c02-22.pdf`` with the ``tesseract_noop`` test plugin
    and non-ASCII title and author values. The output must carry those values,
    have no keywords, keep the input's creation date, and be reported by
    ``file_claims_pdfa`` as the requested ``output_type``.
    """
    source_pdf = resources / 'c02-22.pdf'

    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'

    cli_args = [
        '--title', title,
        '--author', author,
        '--output-type', output_type,
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    proc, _stdout, stderr = run_ocrmypdf(source_pdf, outpdf, *cli_args)
    assert proc.returncode == ExitCode.ok, stderr

    original = pikepdf.open(source_pdf)
    converted = pikepdf.open(outpdf)

    assert converted.docinfo.Title == title, converted.docinfo
    assert converted.docinfo.Author == author, converted.docinfo
    assert converted.docinfo.get('/Keywords', '') == ''

    # The creation date must survive the conversion unchanged
    created_before = decode_pdf_date(str(original.docinfo.CreationDate))
    created_after = decode_pdf_date(str(converted.docinfo.CreationDate))
    assert created_before == created_after

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that ``--title``/``--author`` override the output's docinfo.

    OCRmyPDF is run on ``c02-22.pdf`` with non-ASCII title and author values;
    ``spoof_tesseract_noop`` is forwarded as ``env``. The output must carry
    those values, have no keywords, keep the input's creation date, and be
    reported by ``file_claims_pdfa`` as the requested ``output_type``.
    """
    source_pdf = resources / 'c02-22.pdf'

    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'

    cli_args = [
        '--title', title,
        '--author', author,
        '--output-type', output_type,
    ]
    proc, _stdout, stderr = run_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop
    )
    assert proc.returncode == ExitCode.ok, stderr

    original = pikepdf.open(source_pdf)
    converted = pikepdf.open(outpdf)

    assert converted.docinfo.Title == title, converted.docinfo
    assert converted.docinfo.Author == author, converted.docinfo
    assert converted.docinfo.get('/Keywords', '') == ''

    # The creation date must survive the conversion unchanged
    created_before = decode_pdf_date(str(original.docinfo.CreationDate))
    created_after = decode_pdf_date(str(converted.docinfo.CreationDate))
    assert created_before == created_after

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that ``--title``/``--author`` override the output's document info.

    OCRmyPDF is run on ``c02-22.pdf`` with non-ASCII title and author values;
    ``spoof_tesseract_noop`` is forwarded as ``env``. Both files are read back
    with ``pypdf.PdfFileReader``: the output must carry the new values, have
    no keywords, keep the input's creation date, and be reported by
    ``file_claims_pdfa`` as the requested ``output_type``.
    """
    source_pdf = resources / 'c02-22.pdf'

    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'

    cli_args = [
        '--title', title,
        '--author', author,
        '--output-type', output_type,
    ]
    proc, _stdout, stderr = run_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop
    )
    assert proc.returncode == ExitCode.ok, stderr

    # The input path is converted to str for the reader; outpdf is passed as-is
    original = pypdf.PdfFileReader(str(source_pdf))
    converted = pypdf.PdfFileReader(outpdf)

    assert converted.documentInfo['/Title'] == title
    assert converted.documentInfo['/Author'] == author
    assert converted.documentInfo.get('/Keywords', '') == ''

    created_before = decode_pdf_date(original.documentInfo['/CreationDate'])
    created_after = decode_pdf_date(converted.documentInfo['/CreationDate'])
    assert created_before == created_after

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
    """Check that ``--output-type pdfa-1`` yields a file claiming PDF/A-1B.

    ``spoof_tesseract_cache`` is forwarded to ``check_ocrmypdf`` as ``env``.
    """
    source_pdf = resources / 'ccitt.pdf'
    cli_args = ['--output-type', 'pdfa-1']
    check_ocrmypdf(source_pdf, outpdf, *cli_args, env=spoof_tesseract_cache)

    claims = file_claims_pdfa(outpdf)
    assert claims['conformance'] == 'PDF/A-1B'
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
    """Convert ``ccitt.pdf`` to PDF/A-1 and check the claimed conformance.

    The conversion is run through ``check_ocrmypdf`` with
    ``spoof_tesseract_cache`` as ``env``; the output must be reported by
    ``file_claims_pdfa`` as ``PDF/A-1B``.
    """
    requested_type = ('--output-type', 'pdfa-1')
    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        *requested_type,
        env=spoof_tesseract_cache,
    )

    conformance = file_claims_pdfa(outpdf)['conformance']
    assert conformance == 'PDF/A-1B'
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
    """Check that ``--output-type pdfa-N`` yields a file claiming PDF/A-NB.

    Args:
        spoof_tesseract_cache: forwarded to ``check_ocrmypdf`` as ``env``.
        pdfa_level: PDF/A part as a string; it is appended to ``'pdfa-'``.
        resources: path-like object; ``resources / 'ccitt.pdf'`` is the input.
        outpdf: path the output PDF is written to.

    The PDF/A-3 case is marked as an expected failure when the installed
    Ghostscript is older than 9.19.
    """
    # Function-scope import: only the version check below needs it.
    import re

    if pdfa_level == '3':
        # Compare (major, minor) numerically. A plain string comparison orders
        # '10.0.0' before '9.19', which would wrongly xfail on Ghostscript 10.x.
        gs_version = tuple(
            int(number)
            for number in re.findall(r'\d+', str(ghostscript.version()))[:2]
        )
        if gs_version and gs_version < (9, 19):
            pytest.xfail(reason='Ghostscript >= 9.19 required')

    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        '--output-type',
        'pdfa-' + pdfa_level,
        env=spoof_tesseract_cache,
    )

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    """Check that the title and author of ``graph.pdf`` survive processing.

    The input is read with ``pypdf.PdfFileReader`` before OCRmyPDF runs
    (``spoof_tesseract_noop`` is forwarded as ``env``), and the ``/Title`` and
    ``/Author`` entries are then compared with those of the output. The output
    must also be reported by ``file_claims_pdfa`` as ``output_type``.
    """
    original = pypdf.PdfFileReader(_infile('graph.pdf'))

    cli_args = ['--output-type', output_type]
    result_pdf = check_ocrmypdf(
        'graph.pdf',
        'test_metadata_preserve.pdf',
        *cli_args,
        env=spoof_tesseract_noop,
    )

    converted = pypdf.PdfFileReader(result_pdf)

    for field in ('/Title', '/Author'):
        assert original.documentInfo[field] == converted.documentInfo[field]

    claims = file_claims_pdfa(result_pdf)
    assert claims['output'] == output_type
def test_pdfa_n(pdfa_level, resources, outpdf):
    """Check that ``--output-type pdfa-N`` yields a file claiming PDF/A-NB.

    Args:
        pdfa_level: PDF/A part as a string; it is appended to ``'pdfa-'``.
        resources: path-like object; ``resources / 'ccitt.pdf'`` is the input.
        outpdf: path the output PDF is written to.

    OCRmyPDF is run with the ``tesseract_cache`` test plugin. The PDF/A-3 case
    is marked as an expected failure when the installed Ghostscript is older
    than 9.19.
    """
    # Function-scope import: only the version check below needs it.
    import re

    if pdfa_level == '3':
        # Compare (major, minor) numerically. A plain string comparison orders
        # '10.0.0' before '9.19', which would wrongly xfail on Ghostscript 10.x.
        gs_version = tuple(
            int(number)
            for number in re.findall(r'\d+', str(ghostscript.version()))[:2]
        )
        if gs_version and gs_version < (9, 19):
            pytest.xfail(reason='Ghostscript >= 9.19 required')

    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        '--output-type',
        'pdfa-' + pdfa_level,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}B'
def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that the title and author of ``graph.pdf`` survive processing.

    The input is opened with ``pikepdf`` before OCRmyPDF runs
    (``spoof_tesseract_noop`` is forwarded as ``env``), and its ``/Title`` and
    ``/Author`` docinfo entries are compared with those of the output. The
    output must also be reported by ``file_claims_pdfa`` as ``output_type``.
    """
    source_pdf = resources / 'graph.pdf'
    original = pikepdf.open(source_pdf)

    cli_args = ['--output-type', output_type]
    result_pdf = check_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop
    )

    converted = pikepdf.open(result_pdf)

    for field in ('/Title', '/Author'):
        assert original.docinfo[field] == converted.docinfo[field]

    # file_claims_pdfa is given the path as a string
    claims = file_claims_pdfa(str(result_pdf))
    assert claims['output'] == output_type
def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Verify that OCRmyPDF keeps the input's ``/Title`` and ``/Author``.

    ``graph.pdf`` is opened with ``pikepdf``, processed via ``check_ocrmypdf``
    with ``spoof_tesseract_noop`` as ``env``, and the two docinfo entries of
    input and output are compared. ``file_claims_pdfa`` must report the
    requested ``output_type`` for the result.
    """
    graph_pdf = resources / 'graph.pdf'
    pdf_in = pikepdf.open(graph_pdf)

    produced = check_ocrmypdf(
        graph_pdf,
        outpdf,
        *('--output-type', output_type),
        env=spoof_tesseract_noop,
    )
    pdf_out = pikepdf.open(produced)

    for entry in ('/Title', '/Author'):
        assert pdf_in.docinfo[entry] == pdf_out.docinfo[entry]

    # file_claims_pdfa is given the path as a string
    reported = file_claims_pdfa(str(produced))
    assert reported['output'] == output_type
def test_preserve_docinfo(output_type, resources, outpdf):
    """Check that the title and author of ``graph.pdf`` survive processing.

    The input is opened with ``pikepdf`` before OCRmyPDF runs with the
    ``tesseract_noop`` test plugin, and its ``/Title`` and ``/Author`` docinfo
    entries are compared with those of the output. The output must also be
    reported by ``file_claims_pdfa`` as ``output_type``.
    """
    source_pdf = resources / 'graph.pdf'
    original = pikepdf.open(source_pdf)

    cli_args = [
        '--output-type', output_type,
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    result_pdf = check_ocrmypdf(source_pdf, outpdf, *cli_args)

    converted = pikepdf.open(result_pdf)

    for field in ('/Title', '/Author'):
        assert original.docinfo[field] == converted.docinfo[field]

    # file_claims_pdfa is given the path as a string
    claims = file_claims_pdfa(str(result_pdf))
    assert claims['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that ``--title``/``--author`` override the output's document info.

    OCRmyPDF is run on ``c02-22.pdf`` with non-ASCII title and author values;
    ``spoof_tesseract_noop`` is forwarded as ``env``. The output is read back
    with ``pypdf.PdfFileReader`` and must carry the new values, have no
    keywords, and be reported by ``file_claims_pdfa`` as ``output_type``.
    """
    source_pdf = resources / 'c02-22.pdf'

    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'

    cli_args = [
        '--title', title,
        '--author', author,
        '--output-type', output_type,
    ]
    proc, _stdout, stderr = run_ocrmypdf(
        source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop
    )
    assert proc.returncode == ExitCode.ok, stderr

    converted = pypdf.PdfFileReader(outpdf)

    assert converted.documentInfo['/Title'] == title
    assert converted.documentInfo['/Author'] == author
    assert converted.documentInfo.get('/Keywords', '') == ''

    claims = file_claims_pdfa(outpdf)
    assert claims['output'] == output_type
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the whole processing pipeline for *options* and return an exit code.

    Args:
        options: Options namespace. ``options.jobs`` is set to the available
            CPU count when it is falsy.
        plugin_manager: Plugin manager to use; when falsy, one is obtained
            from ``get_plugin_manager(options.plugins)``.
        api: When true, each ``except`` clause below catches ``NeverRaise``
            instead of its usual exception type, and no debug log file is
            configured. (NeverRaise is presumably an exception type that is
            never raised, so errors propagate to the caller -- confirm.)

    Returns:
        ``ExitCode.ok`` on success; ``ExitCode.pdfa_conversion_failed`` or
        ``ExitCode.invalid_output_pdf`` when the output file fails its checks;
        ``ExitCode.ctrl_c``, the exception's own ``exit_code``, or
        ``ExitCode.other_error`` when the corresponding exception is caught.

    The temporary work folder is handed to ``cleanup_working_files`` in all
    cases.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))

    debug_log_handler = None
    wants_debug_log = options.keep_temporary_files or options.verbose >= 1
    if (
        wants_debug_log
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            work_folder / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            # To help debug
            max_workers=1 if options.use_threads else options.jobs,
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        destination = options.output_file
        if destination == '-':
            log.info("Output sent to stdout")
        elif hasattr(destination, 'writable') and destination.writable():
            log.info("Output written to stream")
        elif not samefile(destination, os.devnull):
            # Say nothing when sending to dev null; otherwise verify the file
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(destination)
                if not pdfa_info['pass']:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
                log.info(
                    "Output file is a %s (as expected)", pdfa_info['conformance']
                )

            if not check_pdf(destination):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf

            report_output_file_size(options, start_input_file, destination)

    except (NeverRaise if api else KeyboardInterrupt):
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (NeverRaise if api else ExitCodeException) as exc:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        else:
            detail = str(exc)
            if detail:
                log.error("%s: %s", type(exc).__name__, detail)
            else:
                log.error(type(exc).__name__)
        return exc.exit_code
    except (NeverRaise if api else Exception):  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as err:
                print(err, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok