def exec_concurrent(context: PdfContext, executor: Executor):
    """Run the per-page pipeline through ``executor`` and assemble the outputs.

    Every page context from ``context.get_page_contexts()`` is handed to
    ``exec_page_sync`` via ``executor``. Each finished result is passed to a
    callback that stores the page's sidecar text and grafts the page onto the
    output document. Once the executor returns, the sidecar text (when
    ``options.sidecar`` is set) and the final PDF (unless the output type is
    ``'none'``) are copied to their destinations.

    Args:
        context: Document-level context; supplies ``options``, ``pdfinfo``
            and the per-page contexts.
        executor: Callable that runs the page tasks; it is also forwarded to
            ``post_process``.
    """
    opts = context.options
    page_count = len(context.pdfinfo)

    # Never ask for more workers than there are pages.
    worker_count = min(page_count, opts.jobs)
    if worker_count > 1:
        log.info("Start processing %d pages concurrently", worker_count)

    # One slot per page, filled in as results arrive (indexed by pageno).
    page_texts: List[Optional[Path]] = [None] * page_count
    grafter = OcrGrafter(context)

    def handle_page_result(result: PageResult, pbar):
        try:
            page_index = result.pageno
            # Expose the 1-based page number on ``tls`` while this result
            # is being handled.
            tls.pageno = page_index + 1
            page_texts[page_index] = result.text
            pbar.update()
            grafter.graft_page(
                pageno=page_index,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            # Always clear the page number, even if grafting raised.
            tls.pageno = None

    # The callback ticks the bar twice per result, so the total is doubled
    # and unit_scale halves it back for display.
    progress_settings = {
        'total': 2 * page_count,
        'desc': 'OCR' if opts.tesseract_timeout > 0 else 'Image processing',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not opts.progress_bar,
    }

    executor(
        use_threads=opts.use_threads,
        max_workers=worker_count,
        tqdm_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=handle_page_result,
    )

    # Sidecar text: merge the per-page pieces and deliver the result.
    if opts.sidecar:
        merged_text = merge_sidecars(page_texts, context)
        copy_final(merged_text, opts.sidecar, context)

    # Collapse the grafted layers into a single PDF.
    final_pdf = grafter.finalize()

    if opts.output_type != 'none':
        # PDF/A conversion and metadata, then deliver the output file.
        log.info("Postprocessing...")
        final_pdf = post_process(final_pdf, context, executor)
        copy_final(final_pdf, opts.output_file, context)
def exec_concurrent(context):
    """Run the per-page pipeline in a progress pool and assemble the outputs.

    Every page context from ``context.get_page_contexts()`` is handed to
    ``exec_page_sync`` through ``exec_progress_pool``. Each finished result
    is passed to a callback that stores the page's sidecar text and grafts
    the page onto the output document. Afterwards the sidecar text (when
    ``options.sidecar`` is set) is written, and the merged PDF is
    post-processed and copied to ``options.output_file``.

    Args:
        context: Document-level context; supplies ``options``, ``pdfinfo``
            and the per-page contexts.
    """
    page_count = len(context.pdfinfo)
    opts = context.options

    # Never ask for more workers than there are pages.
    worker_count = min(page_count, opts.jobs)
    if worker_count > 1:
        log.info("Start processing %d pages concurrently", worker_count)

    # One slot per page, filled in as results arrive (indexed by pageno).
    page_texts = [None] * page_count
    grafter = OcrGrafter(context)

    def handle_page_result(result, pbar):
        page_index = result.pageno
        page_texts[page_index] = result.text
        pbar.update()
        grafter.graft_page(
            pageno=page_index,
            image=result.pdf_page_from_image,
            textpdf=result.ocr,
            autorotate_correction=result.orientation_correction,
        )
        pbar.update()

    # The callback ticks the bar twice per result, so the total is doubled
    # and unit_scale halves it back for display.
    progress_settings = {
        'total': 2 * page_count,
        'desc': 'OCR',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not opts.progress_bar,
    }

    exec_progress_pool(
        use_threads=opts.use_threads,
        max_workers=worker_count,
        tqdm_kwargs=progress_settings,
        task_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=handle_page_result,
    )

    # Sidecar text: merge the per-page pieces and deliver the result.
    if opts.sidecar:
        merged_text = merge_sidecars(page_texts, context)
        copy_final(merged_text, opts.sidecar, context)

    # Collapse the grafted layers into a single PDF.
    merged_pdf = grafter.finalize()

    # PDF/A conversion and metadata.
    processed_pdf = post_process(merged_pdf, context)

    # Deliver the output file.
    copy_final(processed_pdf, opts.output_file, context)