def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check that metadata_fixup warns only when metadata cannot be copied.

    First runs ``metadata_fixup`` with the same file as origin and working
    file and asserts that no WARNING record was logged. Then writes a copy of
    the file with an extra ``prism2:publicationName`` XMP field, uses that copy
    as the origin, and asserts that a WARNING record is logged.
    """
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable.
    # Open the PDF in a ``with`` block so the file handle is released as soon
    # as the modified copy has been written.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    # The modified copy is the origin; the unmodified file is the working file.
    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a NUL from DocumentInfo is not carried into the output XMP.

    Runs ``convert_to_pdfa`` on a copy of ``trivial.pdf`` and scans the XMP
    packet of the resulting ``pdfa.pdf`` for a raw NUL byte and for the
    escaped form ``&#0;``.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): the document is modified in memory but no save() call is
    # visible here, so the injected value may never reach the file on disk;
    # confirm whether that is intended.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            assert 0 < xmp_start < xmp_end

            # Ensure we did not carry the nul forward, neither as an XML
            # numeric character reference nor as a raw byte.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def exec_concurrent(context: PdfContext, executor: Executor):
    """Process every page through the executor, then assemble the outputs.

    Each page context is handed to ``exec_page_sync`` via ``executor``. As
    each page finishes, its sidecar text is recorded and its layers are
    grafted. Afterwards the sidecar text is merged and copied (if requested),
    the grafted PDF is finalized and, unless the output type is ``'none'``,
    post-processed and copied to the output file.
    """
    options = context.options
    page_count = len(context.pdfinfo)

    # Never start more workers than there are pages
    worker_count = min(page_count, options.jobs)
    if worker_count > 1:
        log.info("Start processing %d pages concurrently", worker_count)

    sidecars: List[Optional[Path]] = [None] * page_count
    grafter = OcrGrafter(context)

    def on_page_finished(result: PageResult, pbar):
        # Called with each finished page; the progress bar is advanced twice
        # per page, once for the sidecar and once for the graft.
        tls.pageno = result.pageno + 1
        try:
            sidecars[result.pageno] = result.text
            pbar.update()
            grafter.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None

    if options.tesseract_timeout > 0:
        progress_label = 'OCR'
    else:
        progress_label = 'Image processing'

    # Two progress updates per page, each counted as half a page
    progress_settings = {
        'total': 2 * page_count,
        'desc': progress_label,
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }

    # Run exec_page_sync on every page context
    executor(
        use_threads=options.use_threads,
        max_workers=worker_count,
        tqdm_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=on_page_finished,
    )

    # Output sidecar text and copy the text file to its destination
    if options.sidecar:
        merged_text = merge_sidecars(sidecars, context)
        copy_final(merged_text, options.sidecar, context)

    # Merge layers to one single pdf
    merged_pdf = grafter.finalize()

    if options.output_type != 'none':
        # PDF/A and metadata
        log.info("Postprocessing...")
        merged_pdf = post_process(merged_pdf, context, executor)

        # Copy PDF file to destination
        copy_final(merged_pdf, options.output_file, context)
def optimize_pdf(input_file: Path, context: PdfContext, executor: Executor):
    """Optimize ``input_file`` into the context's ``optimize.pdf``.

    The save settings combine the linearization decision for this file with
    the settings returned by ``get_pdf_save_settings`` for the requested
    output type. Returns the path of the optimized file.
    """
    target = context.get_path('optimize.pdf')
    linearize = should_linearize(input_file, context)
    type_settings = get_pdf_save_settings(context.options.output_type)
    settings = dict(linearize=linearize, **type_settings)
    optimize(input_file, target, context, settings, executor)
    return target
def metadata_fixup(working_file: Path, context: PdfContext):
    """Copy document metadata from the origin PDF into the working PDF.

    Loads the origin's DocumentInfo (as returned by ``get_docinfo``) into the
    XMP metadata of ``working_file``, applies a few fixups, reports any XMP
    fields of the origin that are absent from the result, and saves the
    outcome as the context's ``metafix.pdf``.

    Args:
        working_file: PDF whose metadata is updated; it is opened for reading
            and the result is written to a separate file.
        context: Supplies the origin file, the options and the output path.

    Returns:
        Path of the saved ``metafix.pdf``.
    """
    output_file = context.get_path('metafix.pdf')
    options = context.options

    def report_on_metadata(missing):
        # Log which metadata fields were lost; stay silent if none were.
        if not missing:
            return
        if options.output_type.startswith('pdfa'):
            log.warning(
                "Some input metadata could not be copied because it is not "
                "permitted in PDF/A. You may wish to examine the output "
                "PDF's XMP metadata."
            )
            log.debug("The following metadata fields were not copied: %r", missing)
        else:
            # The trailing space keeps the two sentences apart once the
            # adjacent string literals are joined.
            log.error(
                "Some input metadata could not be copied. "
                "You may wish to examine the output PDF's XMP metadata."
            )
            log.info("The following metadata fields were not copied: %r", missing)

    with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
        docinfo = get_docinfo(original, context)
        with pdf.open_metadata() as meta:
            meta.load_from_docinfo(docinfo, delete_missing=False, raise_failure=False)
            # If xmp:CreateDate is missing, set it to the modify date to
            # match Ghostscript, for consistency
            if 'xmp:CreateDate' not in meta:
                meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')

            # Ghostscript likes to set title to Untitled if omitted from input.
            # Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
            # and the XMP Spec do not make this recommendation.
            if meta.get('dc:title') == 'Untitled':
                with original.open_metadata(
                    set_pikepdf_as_editor=False, update_docinfo=False
                ) as original_meta:
                    if 'dc:title' not in original_meta:
                        del meta['dc:title']

            # Anything present in the origin's XMP but absent from the result
            # counts as not copied.
            meta_original = original.open_metadata()
            missing = set(meta_original.keys()) - set(meta.keys())
            report_on_metadata(missing)

        pdf.save(
            output_file,
            compress_streams=True,
            preserve_pdfa=True,
            object_stream_mode=pikepdf.ObjectStreamMode.generate,
            linearize=(  # Don't linearize if optimize() will be linearizing too
                should_linearize(working_file, context)
                if options.optimize == 0
                else False
            ),
        )

    return output_file
def optimize_pdf(input_file: Path, context: PdfContext):
    """Optimize ``input_file`` into the context's ``optimize.pdf``.

    Streams are compressed, PDF/A is preserved, object streams are generated,
    and linearization follows ``should_linearize``. Returns the path of the
    optimized file.
    """
    target = context.get_path('optimize.pdf')
    settings = {
        'compress_streams': True,
        'preserve_pdfa': True,
        'object_stream_mode': pikepdf.ObjectStreamMode.generate,
        'linearize': should_linearize(input_file, context),
    }
    optimize(input_file, target, context, settings)
    return target
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    """Produce the context's ``pdfa.pdf`` from ``input_pdf`` via the plugin hook.

    Before the ``generate_pdfa`` hook is called, NUL bytes are removed from
    the DocumentInfo values. The cleaned document is saved as
    ``fix_docinfo.pdf``; when nothing had to be changed, that path is created
    with ``safe_symlink`` instead. Returns the path of the output file.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    docinfo_changed = False
    with pikepdf.open(input_pdf) as pdf:
        try:
            len(pdf.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf.docinfo:
                for key, value in pdf.docinfo.items():
                    raw = bytes(value)
                    if b'\x00' not in raw:
                        continue
                    pdf.docinfo[key] = raw.replace(b'\x00', b'')
                    docinfo_changed = True

        if docinfo_changed:
            pdf.save(fix_docinfo_file)
        else:
            safe_symlink(input_pdf, fix_docinfo_file)

    context.plugin_manager.hook.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        compression=options.pdfa_image_compression,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(
            context.plugin_manager.hook.get_progressbar_class()
            if options.progress_bar
            else None
        ),
    )

    return output_file
def main(infile, outfile, level, jobs=1):
    """Optimize ``infile`` at the given level and copy the result to ``outfile``.

    A minimal options object is built, ``optimize`` is run inside a temporary
    directory, and the optimized file is then copied to ``outfile``.
    """
    from shutil import copy  # pylint: disable=import-outside-toplevel
    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel

    class OptimizeOptions:
        """Emulate ocrmypdf's options"""

        def __init__(
            self, input_file, jobs, optimize_, jpeg_quality, png_quality, jb2lossy
        ):
            # Values supplied by the caller
            self.input_file = input_file
            self.jobs = jobs
            self.optimize = optimize_
            self.jpeg_quality = jpeg_quality
            self.png_quality = png_quality
            self.jbig2_page_group_size = 0
            self.jbig2_lossy = jb2lossy
            # Fixed settings: no console output, no progress bar
            self.quiet = True
            self.progress_bar = False

    infile = Path(infile)
    options = OptimizeOptions(
        input_file=infile,
        jobs=jobs,
        optimize_=int(level),
        jpeg_quality=0,  # Use default
        png_quality=0,
        jb2lossy=False,
    )
    save_settings = {
        'compress_streams': True,
        'preserve_pdfa': True,
        'object_stream_mode': pikepdf.ObjectStreamMode.generate,
    }

    with TemporaryDirectory() as workdir:
        context = PdfContext(options, workdir, infile, None, None)
        optimized = Path(workdir) / 'out.pdf'
        optimize(infile, optimized, context, save_settings)
        # Copy before the temporary directory is removed
        copy(fspath(optimized), fspath(outfile))
def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
    """Join the per-page text files into the context's ``sidecar.txt``.

    Pages are separated by a form feed. An entry that is falsy (no text file
    for that page) is replaced by a placeholder line naming the 1-based page
    number. Returns the path of the merged file.
    """
    output_file = context.get_path('sidecar.txt')
    with open(output_file, 'w', encoding="utf-8") as merged:
        for index, page_file in enumerate(txt_files):
            if index > 0:
                merged.write('\f')  # Form feed between pages

            if not page_file:
                merged.write(f'[OCR skipped on page {index + 1}]')
                continue

            with open(page_file, 'r', encoding="utf-8") as page_stream:
                page_text = page_stream.read()

            # Some OCR engines (e.g. Tesseract v4 alpha) add form feeds
            # between pages, and some do not. For consistency, we ignore
            # any added by the OCR engine and add them on our own.
            merged.write(page_text[:-1] if page_text.endswith('\f') else page_text)
    return output_file
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that convert_to_pdfa logs a message about a malformed DocumentInfo.

    The trailer's Info entry of ``trivial.pdf`` is replaced with a stream
    before the file is passed to ``convert_to_pdfa``.
    """
    ps_stub = outdir / 'pdfa.ps'
    rendered = outdir / 'layers.rendered.pdf'
    generate_pdfa_ps(ps_stub)

    # Write the input directly from the resource file with Info replaced
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(rendered, fix_metadata_version=False)

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(rendered)
    context = PdfContext(options, outdir, rendered, pdfinfo, get_plugin_manager([]))

    convert_to_pdfa(str(rendered), str(ps_stub), context)

    # Show the captured records in the test output
    print(caplog.records)
    logged_messages = [record.message for record in caplog.records]
    assert any(
        'malformed DocumentInfo block' in message for message in logged_messages
    )
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the pipeline for one input file and return an ``ExitCode``.

    Creates a temporary work folder, prepares and triages the input, gathers
    pdfinfo, runs ``exec_concurrent`` and then checks the output file. The
    work folder is cleaned up in all cases via ``cleanup_working_files``.

    Args:
        options: Parsed options. ``options.jobs`` is filled in from
            ``available_cpu_count()`` when it is falsy.
        plugin_manager: Plugin manager to use; when falsy, one is created
            from ``options.plugins``.
        api: When true, no debug log file is configured and the ``except``
            clauses name ``NeverRaise`` instead of the real exception types
            (presumably so that exceptions propagate to the caller; confirm
            against the definition of ``NeverRaise``).

    Returns:
        ``ExitCode.ok`` on success, otherwise the exit code matching the
        failed output check or the handled exception.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        # Report on the output; only a regular output file is checked
        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except (KeyboardInterrupt if not api else NeverRaise) as e:
        # Include the traceback only in verbose mode
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        # The exception carries the exit code to return
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        # Runs on every exit path, including the early returns above
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
def generate_postscript_stub(context: PdfContext):
    """Write the PDF/A PostScript stub to the context's ``pdfa.ps``.

    Returns the path that was passed to ``generate_pdfa_ps``.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path