def test_rasterize_rotates(resources, tmp_path):
    """Check that the rasterize hook honours the ``rotation`` argument.

    The first page of ``graph.pdf`` is rendered twice at 20 DPI. With a
    rotation of 90 the expected pixel size is (123, 151); with a rotation of
    180 it is (151, 123).
    """
    pm = get_plugin_manager([])

    img = tmp_path / 'img90.png'
    pm.hook.rasterize_pdf_page(
        input_file=resources / 'graph.pdf',
        output_file=img,
        raster_device='pngmono',
        raster_dpi=Resolution(20, 20),
        page_dpi=Resolution(20, 20),
        pageno=1,
        rotation=90,
        filter_vector=False,
    )
    # Use a context manager so the file handle is released deterministically
    # instead of lingering until garbage collection.
    with Image.open(img) as im:
        assert im.size == (123, 151), "Image not rotated"

    img = tmp_path / 'img180.png'
    pm.hook.rasterize_pdf_page(
        input_file=resources / 'graph.pdf',
        output_file=img,
        raster_device='pngmono',
        raster_dpi=Resolution(20, 20),
        page_dpi=Resolution(20, 20),
        pageno=1,
        rotation=180,
        filter_vector=False,
    )
    with Image.open(img) as im:
        assert im.size == (151, 123), "Image not rotated"
def _get_dpi(ctm_shorthand, image_size) -> Resolution:
    """Compute the effective image DPI from a transformation matrix.

    A PDF image carries no resolution of its own. The page content stream
    says where the image is drawn, and the effective resolution is the ratio
    of the image's pixel size to the size of the area it is drawn into.

    A scanned PDF usually has a paper size that matches the image, but that
    is not guaranteed. The most common exception is cropping, which changes
    the page size (/CropBox) without touching the content stream, so we
    cannot simply assume that the image fills the page.

    A PDF image may be scaled (always), cropped, translated, rotated in place
    to an arbitrary angle (rarely) and skewed. Only equal area mappings can
    be expressed, so there is no need to consider distortions where the
    effective DPI varies with position.

    To find the image scale, transform an offset vector v0 (0, 0), a
    width-axis vector vw (1, 0) and a height-axis vector vh (0, 1) with the
    matrix; this gives the dimensions of the image in PDF units, which are
    then compared with the actual pixel dimensions.

    PDF uses row vector * matrix_transposed rather than the traditional
    matrix * column vector. The offset, width and height vectors can be
    stacked into a matrix and multiplied by the transform matrix. What we
    want is
        magnitude(width_vector - offset_vector)
    and
        magnitude(height_vector - offset_vector)

    Worked out algebraically, the translation cancels and the magnitudes
    depend only on the nonzero entries of the transformation matrix. This
    code uses the result of that derivation directly.

    pdfimages -list computes DPI in some way that is not completely naive,
    but it gets rotated images wrong, so it can no longer be used to validate
    this. Photoshop works, as does rotating the image back with Acrobat.

    It does not matter if the image is partially cropped, or even outside
    the /MediaBox.
    """
    a, b, c, d, _, _ = ctm_shorthand

    # Lengths of the transformed width and height axes, in PDF units
    drawn_width = hypot(a, b)
    drawn_height = hypot(c, d)

    def calc(drawn, pixels, points_per_inch=72.0):
        # Pixels per unit of default user space (1/72"); a zero-length axis
        # is reported as infinite resolution.
        if drawn != 0:
            scale = pixels / drawn
        else:
            scale = inf
        return scale * points_per_inch

    dpi_w = calc(drawn_width, image_size[0])
    dpi_h = calc(drawn_height, image_size[1])
    return Resolution(dpi_w, dpi_h)
def test_remove_background(resources, outdir):
    """Check that --remove-background stretches the image to full range."""
    full_range = ((0, 255), (0, 255), (0, 255))
    source_jpg = resources / 'congress.jpg'

    # Precondition: the input must not already contain pure white/black
    with Image.open(source_jpg) as im:
        assert im.getextrema() != full_range

    output_pdf = check_ocrmypdf(
        source_jpg,
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    output_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        output_pdf,
        output_png,
        raster_device='png16m',
        raster_dpi=Resolution(100, 100),
        pageno=1,
    )

    # Postcondition: the rendered output spans pure black to pure white
    with Image.open(output_png) as im:
        assert im.getextrema() == full_range
def test_deskew(resources, outdir):
    """Check that the -d option leaves a page with under 0.5 units of skew."""
    # Produce the deskewed PDF
    deskewed_pdf = check_ocrmypdf(
        resources / 'skew.pdf',
        outdir / 'skew.pdf',
        '-d',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Render it back to an image so Leptonica can measure the remaining skew
    deskewed_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        deskewed_pdf,
        deskewed_png,
        raster_device='pngmono',
        raster_dpi=Resolution(150, 150),
        pageno=1,
    )

    skew_angle, _skew_confidence = Pix.open(deskewed_png).find_skew()
    print(skew_angle)
    assert -0.5 < skew_angle < 0.5, "Deskewing failed"
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    Ghostscript renders the page to stdout; the result is then optionally
    rotated and written to ``output_file`` with DPI metadata attached.

    Args:
        input_file: PDF to render.
        output_file: Where the rendered image is saved.
        raster_device: Ghostscript output device name (``-sDEVICE``).
        raster_dpi: Resolution passed to Ghostscript (``-r``), rounded to 6
            decimal places first.
        pageno: The single page to render (used as both first and last page).
        page_dpi: DPI recorded in the saved image. Falls back to the rounded
            ``raster_dpi`` when not given (or falsy).
        rotation: When 90, 180 or 270, the image is transposed by the
            matching ``Image.ROTATE_*``; any other value leaves the pixels
            untouched. For 90/270 the axes of ``page_dpi`` are flipped too.
        filter_vector: Adds ``-dFILTERVECTOR`` to the Ghostscript arguments.

    Raises:
        SubprocessOutputError: Ghostscript exited with a nonzero status. The
            original ``CalledProcessError`` is attached as ``__cause__``.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
        ]
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly so the Ghostscript failure is kept as the cause
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        # Ghostscript may report problems on stderr even when it exits 0
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so the DPI must be swapped too
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Save ``image`` as the page's 'visible.jpg' and return that path.

    The DPI written to the JPEG comes from the source image's own metadata
    when present, otherwise from the page-implied square DPI.
    """
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # The image should be a .png by now, but deskew or unpaper may have
        # dropped its DPI information. The preview was rasterized at square
        # resolution, which is also what the OCR engine should get, so the
        # fallback stays square.
        dpi = (
            Resolution(*im.info['dpi'])
            if 'dpi' in im.info
            else get_page_square_dpi(page_context.pageinfo, page_context.options)
        )
        # Pillow requires integer DPI
        im.save(output_file, format='JPEG', dpi=dpi.to_int())
    return output_file
def triage_image_file(input_file, output_file, options):
    """Validate a non-PDF input as an image and convert it to a PDF.

    Args:
        input_file: Path of the file to open as an image.
        output_file: Path the converted PDF is written to.
        options: Reads ``options.image_dpi`` (user-supplied DPI override) and
            ``options.input_file`` (name shown in error messages).

    Raises:
        UnsupportedImageFormatError: The file cannot be opened as an image,
            has an alpha channel, is CMYK without an ICC profile, or img2pdf
            cannot open it.
        DpiError: The image has no DPI metadata, or DPI at or below
            (96, 96), and ``options.image_dpi`` was not given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        # Recover the original filename
        log.error(str(e).replace(str(input_file), str(options.input_file)))
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            # NOTE(review): this is a lexicographic tuple comparison, so the
            # x resolution dominates the result - confirm that is intended.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)", *im.size)
                log.info("Image resolution: (%d, %d)", *im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)", *im.size)
            log.error("Input file is an image, but has no resolution (DPI) "
                      "in its metadata.  Estimate the resolution at which "
                      "image was scanned and specify it using --image-dpi.")
            raise DpiError()

        if im.mode in ('RGBA', 'LA'):
            log.error("The input image has an alpha channel. Remove the alpha "
                      "channel first.")
            raise UnsupportedImageFormatError()

        # Pillow exposes an embedded ICC profile as info['icc_profile']; the
        # previously tested key 'iccprofile' is never set, so every image was
        # treated as having no profile.
        if not im.info.get('icc_profile'):
            if im.mode == 'RGB':
                log.info("Input image has no ICC profile, assuming sRGB")
            elif im.mode == 'CMYK':
                log.error("Input CMYK image has no ICC profile, not usable")
                raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                Resolution(options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                os.fspath(input_file),
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf,
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
def get_canvas_square_dpi(pageinfo, options) -> Resolution:
    """Get the DPI when we require xres == yres, in Postscript units.

    The result is the largest of: the page's x and y DPI (each replaced by
    VECTOR_PAGE_DPI when falsy), VECTOR_PAGE_DPI if the page has vector
    content, and the requested oversample value.
    """
    candidates = (
        pageinfo.dpi.x or VECTOR_PAGE_DPI,
        pageinfo.dpi.y or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        options.oversample or 0.0,
    )
    side = float(max(candidates))
    return Resolution(side, side)
def rasterize(pdf, pageno, png):
    """Render page ``pageno`` of ``pdf`` to ``png`` unless it already exists.

    When ``png`` exists its path is printed and nothing is rendered;
    otherwise the page is rasterized as 100 DPI monochrome PNG.
    """
    if not png.exists():
        ghostscript.rasterize_pdf(
            pdf,
            png,
            raster_device='pngmono',
            raster_dpi=Resolution(100, 100),
            pageno=pageno,
            rotation=0,
        )
        return
    print(png)
def test_rasterize_size(francais, outdir):
    """Check the pixel size and DPI metadata of a rasterized page.

    The raster DPI is chosen so the page comes out at exactly 50 x 30
    pixels, while a deliberately odd page DPI is forced into the metadata.
    """
    path, pdf = francais
    mediabox = pdf.pages[0].MediaBox
    page_size_pts = (mediabox[2], mediabox[3])
    assert mediabox[0] == mediabox[1] == 0
    page_size = tuple(pts / Decimal(72) for pts in page_size_pts)

    target_size = (Decimal('50.0'), Decimal('30.0'))
    forced_dpi = Resolution(42.0, 4242.0)
    output_png = outdir / 'out.png'

    requested_dpi = Resolution(
        target_size[0] / page_size[0], target_size[1] / page_size[1]
    )
    rasterize_pdf(
        path,
        output_png,
        raster_device='pngmono',
        raster_dpi=requested_dpi,
        page_dpi=forced_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi
def test_mono_not_inverted(resources, outdir):
    """Check that level 3 optimization keeps the top-left pixel white."""
    optimized_pdf = outdir / 'out.pdf'
    rendered_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)

    rasterize_pdf(
        optimized_pdf,
        rendered_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(rendered_png)) as im:
        corner = im.getpixel((0, 0))
        assert corner == 255, "Expected white background"
def get_page_square_dpi(pageinfo, options) -> Resolution:
    """Get the DPI when we require xres == yres, scaled to physical units.

    Each axis resolution is multiplied by the page's userunit (1.0 when the
    userunit is falsy). A zero product is replaced by VECTOR_PAGE_DPI.
    """
    userunit = float(pageinfo.userunit) or 1.0
    scaled_x = (pageinfo.dpi.x or 0.0) * userunit
    scaled_y = (pageinfo.dpi.y or 0.0) * userunit

    candidates = (
        scaled_x or VECTOR_PAGE_DPI,
        scaled_y or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        options.oversample or 0.0,
    )
    side = float(max(candidates))
    return Resolution(side, side)
def get_page_dpi(pageinfo, options):
    """Get the DPI when nonsquare DPI is tolerable.

    Each axis is computed independently as the largest of the page's DPI on
    that axis (VECTOR_PAGE_DPI when falsy), the requested oversample value,
    and VECTOR_PAGE_DPI if the page has vector content.
    """

    def axis_dpi(measured):
        # Apply the same lower bounds to whichever axis is passed in
        return max(
            measured or VECTOR_PAGE_DPI,
            options.oversample or 0.0,
            VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        )

    xres = axis_dpi(pageinfo.dpi.x)
    yres = axis_dpi(pageinfo.dpi.y)
    return Resolution(float(xres), float(yres))
def test_rasterize_rotated(francais, outdir, caplog):
    """Check that rotation=90 swaps both the pixel size and the DPI axes."""
    path, pdf = francais
    mediabox = pdf.pages[0].MediaBox
    page_size_pts = (mediabox[2], mediabox[3])
    assert mediabox[0] == mediabox[1] == 0
    page_size = tuple(pts / Decimal(72) for pts in page_size_pts)

    target_size = (Decimal('50.0'), Decimal('30.0'))
    forced_dpi = Resolution(42.0, 4242.0)
    output_png = outdir / 'out.png'

    caplog.set_level(logging.DEBUG)
    requested_dpi = Resolution(
        target_size[0] / page_size[0], target_size[1] / page_size[1]
    )
    rasterize_pdf(
        path,
        output_png,
        raster_device='pngmono',
        raster_dpi=requested_dpi,
        page_dpi=forced_dpi,
        rotation=90,
    )

    with Image.open(output_png) as im:
        # Width/height and x/y DPI are both expected in swapped order
        assert im.size == (target_size[1], target_size[0])
        assert im.info['dpi'] == (forced_dpi[1], forced_dpi[0])
def test_image_scale0(resources, outpdf):
    """Check DPI reporting for an image drawn with an all-zero matrix.

    A form XObject taken from ``cmyk.pdf`` is placed on a blank 72 x 72 page
    using the matrix ``0 0 0 0 0 0``. The image DPI is then expected to be
    non-finite and the page DPI to be (0, 0).
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        p = pikepdf.Pdf.new()
        p.add_blank_page(page_size=(72, 72))
        target_page = pikepdf.Page(p.pages[0])
        objname = target_page.add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)
        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        p.pages[0].Contents = pikepdf.Stream(p, content)
        p.save(outpdf)

    pi = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = pi.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert first_page.dpi == Resolution(0, 0)
def dpi(self) -> Resolution:
    """Return the stored 'dpi' entry, or Resolution(0.0, 0.0) if absent."""
    fallback = Resolution(0.0, 0.0)
    return self._pageinfo.get('dpi', fallback)
def _pdf_get_pageinfo( pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool ): pageinfo: Dict[str, Any] = {} pageinfo['pageno'] = pageno pageinfo['images'] = [] page = pdf.pages[pageno] mediabox = [Decimal(d) for d in page.MediaBox.as_list()] width_pt = mediabox[2] - mediabox[0] height_pt = mediabox[3] - mediabox[1] check_this_page = pageno in check_pages if check_this_page and detailed_analysis: pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') miner = get_page_analysis(infile, pageno, pscript5_mode) pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) bboxes = (box.bbox for box in pageinfo['textboxes']) pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) else: pageinfo['textboxes'] = [] pageinfo['has_text'] = None # i.e. "no information" userunit = page.get('/UserUnit', Decimal(1.0)) if not isinstance(userunit, Decimal): userunit = Decimal(userunit) pageinfo['userunit'] = userunit pageinfo['width_inches'] = width_pt * userunit / Decimal(72.0) pageinfo['height_inches'] = height_pt * userunit / Decimal(72.0) try: pageinfo['rotate'] = int(page['/Rotate']) except KeyError: pageinfo['rotate'] = 0 userunit_shorthand = (userunit, 0, 0, userunit, 0, 0) if check_this_page: pageinfo['has_vector'] = False pageinfo['has_text'] = False pageinfo['images'] = [] for ci in _process_content_streams( pdf=pdf, container=page, shorthand=userunit_shorthand ): if isinstance(ci, VectorMarker): pageinfo['has_vector'] = True elif isinstance(ci, TextMarker): pageinfo['has_text'] = True elif isinstance(ci, ImageInfo): pageinfo['images'].append(ci) else: raise NotImplementedError() else: pageinfo['has_vector'] = None # i.e. 
"no information" pageinfo['has_text'] = None pageinfo['images'] = None if pageinfo['images']: dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo['images']) pageinfo['dpi'] = dpi pageinfo['width_pixels'] = int(round(dpi.x * float(pageinfo['width_inches']))) pageinfo['height_pixels'] = int(round(dpi.y * float(pageinfo['height_inches']))) return pageinfo
from reportlab.pdfgen.canvas import Canvas from ocrmypdf import _pipeline, pdfinfo from ocrmypdf.helpers import Resolution @pytest.fixture(scope='session') def rgb_image(): im = Image.new('RGB', (8, 8)) im.putpixel((4, 4), (255, 0, 0)) im.putpixel((5, 5), (0, 255, 0)) im.putpixel((6, 6), (0, 0, 255)) return ImageReader(im) DUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0) VECTOR_RESOLUTION = Resolution(_pipeline.VECTOR_PAGE_DPI, _pipeline.VECTOR_PAGE_DPI) @pytest.mark.parametrize( 'image, text, vector, result', [ (False, False, False, VECTOR_RESOLUTION), (False, True, False, VECTOR_RESOLUTION), (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION), (True, True, False, VECTOR_RESOLUTION), (False, False, True, VECTOR_RESOLUTION), (False, True, True, VECTOR_RESOLUTION), (True, False, True, VECTOR_RESOLUTION), (True, True, True, VECTOR_RESOLUTION),
def dpi(self) -> Resolution:
    """Return the stored DPI, or Resolution(0.0, 0.0) when it is None."""
    return Resolution(0.0, 0.0) if self._dpi is None else self._dpi
def _gather_pageinfo(
    self,
    pdf: Pdf,
    pageno: int,
    infile: PathLike,
    check_pages: Container[int],
    detailed_analysis: bool,
):
    """Populate this object's page attributes from page ``pageno`` of ``pdf``.

    Sets ``_textboxes``, ``_has_text``, ``_userunit``, ``_width_inches``,
    ``_height_inches``, ``_rotate``, ``_has_vector``, ``_images`` and
    ``_dpi``. ``_width_pixels`` and ``_height_pixels`` are set only when the
    page has at least one image. For a page not listed in ``check_pages``,
    ``_has_vector``, ``_has_text`` and ``_images`` are None, meaning
    "no information".
    """
    page = pdf.pages[pageno]
    media = [Decimal(coord) for coord in page.MediaBox.as_list()]
    width_pt = media[2] - media[0]
    height_pt = media[3] - media[1]

    check_this_page = pageno in check_pages

    if check_this_page and detailed_analysis:
        creator = str(pdf.docinfo.get('/Creator'))
        pscript5_mode = creator.startswith('PScript5')
        miner = get_page_analysis(infile, pageno, pscript5_mode)
        self._textboxes = list(simplify_textboxes(miner, get_text_boxes))
        self._has_text = _page_has_text(
            (textbox.bbox for textbox in self._textboxes), width_pt, height_pt
        )
    else:
        self._textboxes = []
        self._has_text = None  # i.e. "no information"

    userunit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(userunit, Decimal):
        userunit = Decimal(userunit)
    self._userunit = userunit

    points_per_inch = Decimal(72.0)
    self._width_inches = width_pt * userunit / points_per_inch
    self._height_inches = height_pt * userunit / points_per_inch

    try:
        rotate = int(page['/Rotate'])
    except KeyError:
        rotate = 0
    self._rotate = rotate

    if check_this_page:
        # NOTE(review): _has_text is recomputed here from the content stream,
        # replacing whatever the detailed analysis above stored - confirm
        # this is intended.
        self._has_vector = False
        self._has_text = False
        self._images = []
        stream_items = _process_content_streams(
            pdf=pdf, container=page, shorthand=(userunit, 0, 0, userunit, 0, 0)
        )
        for item in stream_items:
            if isinstance(item, VectorMarker):
                self._has_vector = True
            elif isinstance(item, TextMarker):
                self._has_text = True
            elif isinstance(item, ImageInfo):
                self._images.append(item)
            else:
                raise NotImplementedError()
    else:
        self._has_vector = None  # i.e. "no information"
        self._has_text = None
        self._images = None

    self._dpi = None
    if self._images:
        # Page DPI is the per-axis maximum over all images on the page
        page_dpi = Resolution(0.0, 0.0).take_max(
            image.dpi for image in self._images
        )
        self._dpi = page_dpi
        self._width_pixels = int(round(page_dpi.x * float(self._width_inches)))
        self._height_pixels = int(round(page_dpi.y * float(self._height_inches)))