def _pdf_pageinfo_concurrent(
    pdf, infile, progbar, max_workers, check_pages, detailed_analysis=False
):
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis) for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    exec_progress_pool(
        use_threads=use_threads,
        max_workers=n_workers,
        tqdm_kwargs=dict(
            total=total, desc="Scanning contents", unit='page', disable=not progbar
        ),
        task_initializer=partial(_pdf_pageinfo_sync_init, infile),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages
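# Every function in this module fans work out through exec_progress_pool.
# For orientation, here is a minimal sketch of such a helper, built only on
# concurrent.futures and tqdm. It is an illustrative assumption, not the
# actual exec_progress_pool implementation: it assumes each task takes one
# argument and that task_finished receives (result, pbar).
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from tqdm import tqdm


def _exec_progress_pool_sketch(
    *,
    use_threads,
    max_workers,
    tqdm_kwargs,
    task,
    task_arguments,
    task_initializer=None,
    task_finished=None,
):
    executor_class = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
    with tqdm(**tqdm_kwargs) as pbar, executor_class(
        max_workers=max_workers, initializer=task_initializer
    ) as executor:
        # executor.map preserves input order, so results arrive page by page.
        for result in executor.map(task, task_arguments):
            if task_finished is not None:
                task_finished(result, pbar)
            else:
                pbar.update()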
def exec_concurrent(context: PdfContext):
    """Execute the pipeline concurrently"""
    # Run exec_page_sync on every page context
    options = context.options
    max_workers = min(len(context.pdfinfo), options.jobs)
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)

    sidecars: List[Optional[Path]] = [None] * len(context.pdfinfo)
    ocrgraft = OcrGrafter(context)

    def update_page(result: PageResult, pbar):
        try:
            tls.pageno = result.pageno + 1
            sidecars[result.pageno] = result.text
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None

    exec_progress_pool(
        use_threads=options.use_threads,
        max_workers=max_workers,
        tqdm_kwargs=dict(
            total=(2 * len(context.pdfinfo)),
            desc='OCR' if options.tesseract_timeout > 0 else 'Image processing',
            unit='page',
            unit_scale=0.5,
            disable=not options.progress_bar,
        ),
        task_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=update_page,
    )

    # Output sidecar text
    if options.sidecar:
        text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(text, options.sidecar, context)

    # Merge layers to one single pdf
    pdf = ocrgraft.finalize()

    # PDF/A and metadata
    log.info("Postprocessing...")
    pdf = post_process(pdf, context)

    # Copy PDF file to destination
    copy_final(pdf, options.output_file, context)
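# update_page above records the current page number in module-level
# thread-local storage (tls) so that log records emitted while a page is
# being grafted can be attributed to that page. A minimal sketch of that
# plumbing, assuming a logging.Filter is what consumes tls.pageno (the
# filter class name here is hypothetical):
import logging
import threading

tls = threading.local()


class PageNumberFilter(logging.Filter):
    def filter(self, record):
        # Attach the page number owned by the calling thread, if any, so a
        # formatter can prefix messages with it.
        pageno = getattr(tls, 'pageno', None)
        record.pageno = f'{pageno:4d} ' if pageno is not None else ''
        return True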
def _pdf_pageinfo_concurrent(
    pdf, infile, progbar, max_workers, check_pages, detailed_analysis=False
):
    global worker_pdf  # pylint: disable=global-statement
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis) for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    try:
        exec_progress_pool(
            use_threads=use_threads,
            max_workers=n_workers,
            tqdm_kwargs=dict(
                total=total, desc="Scanning contents", unit='page', disable=not progbar
            ),
            task_initializer=partial(
                _pdf_pageinfo_sync_init, infile, logging.getLogger('pdfminer').level
            ),
            task=_pdf_pageinfo_sync,
            task_arguments=contexts,
            task_finished=update_pageinfo,
        )
    finally:
        if worker_pdf and use_threads:
            assert n_workers == 1, "Should have only one worker when threaded"
            # This is messy, but if we ran in a thread, close worker_pdf
            worker_pdf.close()

    return pages
def _produce_jbig2_images(
    jbig2_groups: Dict[int, List[XrefExt]], root: Path, options
) -> None:
    """Produce JBIG2 images from their groups"""

    def jbig2_group_args(root: Path, groups: Dict[int, List[XrefExt]]):
        for group, xref_exts in groups.items():
            prefix = f'group{group:08d}'
            yield dict(
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix,
            )

    def jbig2_single_args(root, groups: Dict[int, List[XrefExt]]):
        for group, xref_exts in groups.items():
            prefix = f'group{group:08d}'
            # Second loop is to ensure multiple images per page are unpacked
            for n, xref_ext in enumerate(xref_exts):
                xref, ext = xref_ext
                yield dict(
                    cwd=fspath(root),
                    infile=img_name(root, xref, ext),
                    outfile=root / f'{prefix}.{n:04d}',
                )

    def convert_generic(fn, kwargs_dict):
        return fn(**kwargs_dict)

    if options.jbig2_page_group_size > 1:
        jbig2_args = jbig2_group_args
        jbig2_convert = partial(convert_generic, jbig2enc.convert_group)
    else:
        jbig2_args = jbig2_single_args
        jbig2_convert = partial(convert_generic, jbig2enc.convert_single)

    exec_progress_pool(
        use_threads=True,
        max_workers=options.jobs,
        tqdm_kwargs=dict(
            total=len(jbig2_groups),
            desc="JBIG2",
            unit='item',
            disable=not options.progress_bar,
        ),
        task=jbig2_convert,
        task_arguments=jbig2_args(root, jbig2_groups),
    )
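# The jbig2_groups mapping consumed by _produce_jbig2_images pairs a group
# number with the images that will share one JBIG2 symbol dictionary. The
# builder below is a hypothetical sketch of how candidates could be binned
# by options.jbig2_page_group_size; the real grouping code lives elsewhere
# and may differ. Each candidate is assumed to be a (pageno, xref_ext) pair.
from collections import defaultdict


def _group_jbig2_candidates_sketch(candidates, page_group_size):
    # Larger groups let jbig2enc share symbols across pages, improving
    # compression at the cost of memory; a group size of 1 keeps every
    # image in its own group.
    groups = defaultdict(list)
    for pageno, xref_ext in candidates:
        group = pageno // page_group_size if page_group_size > 1 else pageno
        groups[group].append(xref_ext)
    return dict(groups)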
def transcode_pngs(
    pike: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
) -> None:
    modified: MutableSet[Xref] = set()
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )

        def pngquant_args():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                modified.add(xref)

        def pngquant_fn(args):
            pngquant.quantize(*args)

        exec_progress_pool(
            use_threads=True,
            max_workers=options.jobs,
            tqdm_kwargs=dict(
                desc="PNGs",
                total=len(images),
                unit='image',
                disable=not options.progress_bar,
            ),
            task=pngquant_fn,
            task_arguments=pngquant_args(),
        )

    for xref in modified:
        filename = png_name(root, xref)
        _transcode_png(pike, filename, xref)
def transcode_pngs(pike, images, image_name_fn, root, options):
    modified = set()
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )

        def pngquant_args():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                modified.add(xref)

        def pngquant_fn(args):
            pngquant.quantize(*args)

        exec_progress_pool(
            use_threads=True,
            max_workers=options.jobs,
            tqdm_kwargs=dict(
                desc="PNGs",
                total=len(images),
                unit='image',
                disable=not options.progress_bar,
            ),
            task=pngquant_fn,
            task_arguments=pngquant_args(),
        )

    for xref in modified:
        im_obj = pike.get_object(xref, 0)
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.mode == '1':
                compdata = pix.generate_pdf_ci_data(leptonica.lept.L_G4_ENCODE, 0)
            else:
                compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If the re-coded image is larger, don't use it. We test here because
        # pngquant knows the size of the temporary output file but not the
        # actual object in the PDF.
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        if compdata.type == leptonica.lept.L_FLATE_ENCODE:
            rewrite_png(pike, im_obj, compdata)
        elif compdata.type == leptonica.lept.L_G4_ENCODE:
            rewrite_png_as_g4(pike, im_obj, compdata)
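# png_name and img_name, used throughout the transcoding helpers above,
# presumably just map an image's xref number to a temporary file under the
# working directory. The exact naming scheme below is an assumption; a
# plausible sketch:
from pathlib import Path


def _png_name_sketch(root: Path, xref: int) -> Path:
    # One temporary PNG per PDF image object, keyed by its xref number.
    return root / f'{xref:08d}.png'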