def exec_page_sync(page_context: PageContext):
    """Process a single page synchronously and return its ``PageResult``.

    If ``is_ocr_required`` reports that the page needs no OCR, an empty
    result (no page image, no OCR output, no text, zero orientation
    correction) is returned right away. Otherwise the page passes through:

    1. orientation detection, only when ``options.rotate_pages`` is set;
    2. ``make_intermediate_images`` and ``create_ocr_image``;
    3. construction of a PDF page from the visible image, skipped when
       ``options.lossless_reconstruction`` is set;
    4. the OCR step selected by ``options.pdf_renderer``.

    Raises:
        NotImplementedError: ``options.pdf_renderer`` neither starts with
            ``'hocr'`` nor equals ``'sandwich'``.
    """
    options = page_context.options
    # Published before anything else happens for this page.
    tls.pageno = page_context.pageno + 1

    if not is_ocr_required(page_context):
        return PageResult(
            pageno=page_context.pageno,
            pdf_page_from_image=None,
            ocr=None,
            text=None,
            orientation_correction=0,
        )

    # The preview is only rasterized when page rotation was requested.
    rotation = (
        get_orientation_correction(
            rasterize_preview(page_context.origin, page_context), page_context
        )
        if options.rotate_pages
        else 0
    )

    ocr_source, visible_source = make_intermediate_images(page_context, rotation)
    ocr_input = create_ocr_image(ocr_source, page_context)

    page_pdf = None
    if not options.lossless_reconstruction:
        # A visible image is mandatory when the page is being reconstructed.
        assert visible_source
        visible = visible_source
        if should_visible_page_image_use_jpg(page_context.pageinfo):
            visible = create_visible_page_jpg(visible, page_context)
        # A falsy hook result leaves the current visible image in place.
        visible = (
            page_context.plugin_manager.hook.filter_page_image(
                page=page_context, image_filename=visible
            )
            or visible
        )
        page_pdf = create_pdf_page_from_image(visible, page_context, rotation)

    renderer = options.pdf_renderer
    if renderer.startswith('hocr'):
        hocr, page_text = ocr_engine_hocr(ocr_input, page_context)
        ocr_result = render_hocr_page(hocr, page_context)
    elif renderer == 'sandwich':
        ocr_result, page_text = ocr_engine_textonly_pdf(ocr_input, page_context)
    else:
        raise NotImplementedError(f"pdf_renderer {options.pdf_renderer}")

    return PageResult(
        pageno=page_context.pageno,
        pdf_page_from_image=page_pdf,
        ocr=ocr_result,
        text=page_text,
        orientation_correction=rotation,
    )
def exec_page_sync(page_context):
    """Process a single page synchronously and return its ``PageResult``.

    If ``is_ocr_required`` reports that the page needs no OCR, a result with
    no page image, no OCR output, no text and zero orientation correction is
    returned. Otherwise the page is rasterized and preprocessed, an OCR
    image is created, a PDF page is built from the visible image (unless
    ``options.lossless_reconstruction`` is set) and the OCR step matching
    ``options.pdf_renderer`` (``'hocr'`` or ``'sandwich'``) is run.

    For any other ``pdf_renderer`` value no OCR step runs and the result's
    ``ocr`` and ``text`` fields stay ``None``.
    """
    options = page_context.options
    # Published before anything else happens for this page.
    tls.pageno = page_context.pageno + 1

    if not is_ocr_required(page_context):
        return PageResult(
            pageno=page_context.pageno,
            pdf_page_from_image=None,
            ocr=None,
            text=None,
            orientation_correction=0,
        )

    rotation = 0
    if options.rotate_pages:
        preview = rasterize_preview(page_context.origin, page_context)
        rotation = get_orientation_correction(preview, page_context)

    base_raster = rasterize(
        page_context.origin,
        page_context,
        correction=rotation,
        remove_vectors=False,
    )

    if options.clean or options.clean_final or options.remove_vectors:
        # The visible image and the OCR image may differ, so they are
        # preprocessed separately.
        if not options.lossless_reconstruction:
            visible_source = preprocess(
                page_context,
                base_raster,
                options.remove_background,
                options.deskew,
                clean=options.clean_final,
            )
        if options.remove_vectors:
            ocr_raster = rasterize(
                page_context.origin,
                page_context,
                correction=rotation,
                remove_vectors=True,
                output_tag='_ocr',
            )
        else:
            ocr_raster = base_raster
        ocr_source = preprocess(
            page_context,
            ocr_raster,
            options.remove_background,
            options.deskew,
            clean=options.clean,
        )
    else:
        # No cleaning and no vector removal: one preprocessed image serves
        # as both the visible image and the OCR input.
        visible_source = preprocess(
            page_context,
            base_raster,
            options.remove_background,
            options.deskew,
            clean=False,
        )
        ocr_source = visible_source

    ocr_input = create_ocr_image(ocr_source, page_context)

    page_pdf = None
    if not options.lossless_reconstruction:
        visible = visible_source
        if should_visible_page_image_use_jpg(page_context.pageinfo):
            visible = create_visible_page_jpg(visible, page_context)
        filtered = page_context.plugin_manager.hook.filter_page_image(
            page=page_context, image_filename=Path(visible)
        )
        # A falsy hook result leaves the current visible image in place.
        if filtered:
            visible = filtered
        page_pdf = create_pdf_page_from_image(visible, page_context)

    ocr_result = None
    page_text = None
    renderer = options.pdf_renderer
    if renderer == 'hocr':
        hocr, page_text = ocr_engine_hocr(ocr_input, page_context)
        ocr_result = render_hocr_page(hocr, page_context)
    elif renderer == 'sandwich':
        ocr_result, page_text = ocr_engine_textonly_pdf(ocr_input, page_context)

    return PageResult(
        pageno=page_context.pageno,
        pdf_page_from_image=page_pdf,
        ocr=ocr_result,
        text=page_text,
        orientation_correction=rotation,
    )