def test_remove_background(spoof_tesseract_noop, resources, outdir):
    """Check that ``--remove-background`` yields pure black and pure white.

    The input JPEG must not already span 0-255 in all three channels; the
    rasterized output of the processed PDF must.
    """
    from PIL import Image

    full_range = ((0, 255), (0, 255), (0, 255))

    # Ensure the input image does not contain pure white/black.
    # The context manager closes the file handle once the check is done.
    with Image.open(resources / 'congress.jpg') as im:
        assert im.getextrema() != full_range

    output_pdf = check_ocrmypdf(
        resources / 'congress.jpg',
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        env=spoof_tesseract_noop,
    )

    log = logging.getLogger()

    output_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        output_pdf,
        output_png,
        xres=100,
        yres=100,
        raster_device='png16m',
        log=log,
        pageno=1,
    )

    # The output image should contain pure white and black
    with Image.open(output_png) as im:
        assert im.getextrema() == full_range
def test_deskew(spoof_tesseract_noop, resources, outdir):
    """Run OCRmyPDF with ``-d`` on a skewed PDF and verify the result is level.

    The output page is rasterized and its skew angle is measured with
    Leptonica; the angle must lie strictly between -0.5 and 0.5.
    """
    source_pdf = resources / 'skew.pdf'
    target_pdf = outdir / 'skew.pdf'

    # Run with deskew
    result_pdf = check_ocrmypdf(
        source_pdf, target_pdf, '-d', env=spoof_tesseract_noop
    )

    # Render the result as an image again so Leptonica can measure the
    # remaining skew and confirm that the page was deskewed.
    logger = logging.getLogger()
    rendered_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        result_pdf,
        rendered_png,
        xres=150,
        yres=150,
        raster_device='pngmono',
        log=logger,
        pageno=1,
    )

    from ocrmypdf.leptonica import Pix

    page_pix = Pix.read(str(rendered_png))
    angle, _confidence = page_pix.find_skew()

    print(angle)
    assert abs(angle) < 0.5, "Deskewing failed"
def test_deskew(spoof_tesseract_noop, resources, outdir):
    """Verify that the ``-d`` option leaves a page with negligible skew.

    After processing ``skew.pdf``, the first page is rendered to a
    monochrome PNG and Leptonica's skew finder must report an angle
    strictly inside (-0.5, 0.5).
    """
    # Run with deskew
    processed = check_ocrmypdf(
        resources / 'skew.pdf',
        outdir / 'skew.pdf',
        '-d',
        env=spoof_tesseract_noop,
    )

    # Now render as an image again and use Leptonica to find the skew angle
    # to confirm that it was deskewed
    raster_options = dict(
        xres=150,
        yres=150,
        raster_device='pngmono',
        log=logging.getLogger(),
        pageno=1,
    )
    png_path = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(processed, png_path, **raster_options)

    from ocrmypdf.leptonica import Pix

    measurement = Pix.read(str(png_path)).find_skew()
    measured_angle, _ = measurement

    print(measured_angle)
    within_tolerance = -0.5 < measured_angle < 0.5
    assert within_tolerance, "Deskewing failed"
def rasterize(pdf, pageno, png):
    """Render page ``pageno`` of ``pdf`` to ``png`` unless ``png`` already exists.

    When the PNG is already present, its path is printed and nothing is
    rendered. Always returns ``None``.
    """
    if not png.exists():
        ghostscript.rasterize_pdf(
            pdf,
            png,
            xres=100,
            yres=100,
            raster_device='pngmono',
            log=gslog,
            pageno=pageno,
        )
        return

    # Output file is already there: report it instead of re-rendering.
    print(png)
def test_rasterize_rotated(linn, outdir, caplog):
    """Rasterize with ``rotation=90`` and expect swapped size and DPI.

    The resolution passed to ``rasterize_pdf`` is the target pixel size
    divided by the page size (MediaBox extent divided by 72). The resulting
    PNG must have the target dimensions and the requested ``page_dpi``
    values, each with its two components exchanged.
    """
    path, pdf = linn
    mediabox = pdf.pages[0].MediaBox

    # The size computation below assumes the MediaBox origin is (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    per_inch = Decimal(72)
    width_in = mediabox[2] / per_inch
    height_in = mediabox[3] / per_inch

    target_size = Decimal('200.0'), Decimal('150.0')
    target_dpi = 42.0, 4242.0
    target_width, target_height = target_size

    log = logging.getLogger()
    caplog.set_level(logging.DEBUG)

    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        target_width / width_in,
        target_height / height_in,
        raster_device='pngmono',
        log=log,
        page_dpi=target_dpi,
        rotation=90,
    )

    with Image.open(out_png) as im:
        # Both the pixel size and the DPI pair are expected reversed.
        assert im.size == target_size[::-1]
        assert im.info['dpi'] == target_dpi[::-1]
def rasterize(pdf, pageno, png):
    """Rasterize one page of ``pdf`` into ``png``, skipping existing files.

    If ``png`` exists, only its path is printed. Otherwise the page is
    rendered at 100x100 resolution with the ``pngmono`` device.
    """
    already_rendered = png.exists()
    if already_rendered:
        print(png)
    else:
        render_options = {
            'xres': 100,
            'yres': 100,
            'raster_device': 'pngmono',
            'log': gslog,
            'pageno': pageno,
        }
        ghostscript.rasterize_pdf(pdf, png, **render_options)
def test_remove_background(spoof_tesseract_noop, resources, outdir):
    """Verify ``--remove-background`` produces pure black and pure white.

    Precondition: the source JPEG's per-channel extrema are not all
    (0, 255). Postcondition: the rasterized first page of the output PDF
    has extrema of exactly (0, 255) in each of its three channels.
    """
    from PIL import Image

    source_image = resources / 'congress.jpg'
    pure_black_and_white = ((0, 255), (0, 255), (0, 255))

    # Ensure the input image does not contain pure white/black.
    # ``with`` releases the underlying file as soon as the check finishes.
    with Image.open(source_image) as im:
        assert im.getextrema() != pure_black_and_white

    output_pdf = check_ocrmypdf(
        source_image,
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        env=spoof_tesseract_noop,
    )

    log = logging.getLogger()

    output_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        output_pdf,
        output_png,
        xres=100,
        yres=100,
        raster_device='png16m',
        log=log,
        pageno=1,
    )

    # The output image should contain pure white and black
    with Image.open(output_png) as im:
        assert im.getextrema() == pure_black_and_white
def test_mono_not_inverted(resources, outdir):
    """Check that optimizing ``2400dpi.pdf`` does not invert the image.

    The file is optimized at level 3, rendered to a grayscale PNG, and the
    top-left pixel must be 255 (white).
    """
    infile = resources / '2400dpi.pdf'
    opt.main(infile, outdir / 'out.pdf', level=3)

    rasterize_pdf(
        outdir / 'out.pdf',
        outdir / 'im.png',
        xres=10,
        yres=10,
        raster_device='pnggray',
        log=logging.getLogger(name='test_mono_flip'),
    )

    # Open via a context manager so the PNG file handle is closed
    # deterministically instead of being left to garbage collection.
    with Image.open(fspath(outdir / 'im.png')) as im:
        assert im.getpixel((0, 0)) == 255, "Expected white background"
def test_mono_not_inverted(resources, outdir):
    """Ensure a level-3 optimized monochrome page keeps a white background.

    ``2400dpi.pdf`` is run through ``opt.main``, the output is rasterized
    with the ``pnggray`` device, and pixel (0, 0) is asserted to be 255.
    """
    infile = resources / '2400dpi.pdf'
    optimized_pdf = outdir / 'out.pdf'
    rendered_png = outdir / 'im.png'

    opt.main(infile, optimized_pdf, level=3)

    rasterize_pdf(
        optimized_pdf,
        rendered_png,
        xres=10,
        yres=10,
        raster_device='pnggray',
        log=logging.getLogger(name='test_mono_flip'),
    )

    # The context manager closes the image file once the pixel is read.
    with Image.open(fspath(rendered_png)) as im:
        assert im.getpixel((0, 0)) == 255, "Expected white background"
def test_rasterize_size(linn, outdir, caplog):
    """Rasterize to a fixed pixel size and check the size and DPI metadata.

    The x/y resolutions are chosen as target pixels divided by the page
    size (MediaBox extent divided by 72). The written PNG must have exactly
    the target dimensions and report the ``page_dpi`` passed in.
    """
    path, pdf = linn
    mediabox = pdf.pages[0].MediaBox

    # A non-zero MediaBox origin would invalidate the size computation.
    assert mediabox[0] == mediabox[1] == 0

    per_inch = Decimal(72)
    page_width = mediabox[2] / per_inch
    page_height = mediabox[3] / per_inch

    target_size = Decimal('200.0'), Decimal('150.0')
    target_dpi = 42.0, 4242.0
    want_width, want_height = target_size

    log = logging.getLogger()
    png_path = outdir / 'out.png'
    rasterize_pdf(
        path,
        png_path,
        want_width / page_width,
        want_height / page_height,
        raster_device='pngmono',
        log=log,
        page_dpi=target_dpi,
    )

    with Image.open(png_path) as im:
        assert im.size == target_size
        assert im.info['dpi'] == target_dpi
def test_rasterize_size(francais, outdir, caplog):
    """Check that rasterizing hits the requested pixel size and forced DPI.

    Resolutions are derived from a 50x30 pixel target and the page size
    (MediaBox extent divided by 72). The output PNG must match the target
    size and carry the ``page_dpi`` value in its ``dpi`` info field.
    """
    path, pdf = francais
    first_page_box = pdf.pages[0].MediaBox

    # The page is expected to be anchored at the origin.
    assert first_page_box[0] == first_page_box[1] == 0

    divisor = Decimal(72)
    page_size = (first_page_box[2] / divisor, first_page_box[3] / divisor)

    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = 42.0, 4242.0

    xres = target_size[0] / page_size[0]
    yres = target_size[1] / page_size[1]

    log = logging.getLogger()
    destination = outdir / 'out.png'
    rasterize_pdf(
        path,
        destination,
        xres,
        yres,
        raster_device='pngmono',
        log=log,
        page_dpi=forced_dpi,
    )

    with Image.open(destination) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi