Ejemplo n.º 1
0
def test_remove_background(spoof_tesseract_noop, resources, outdir):
    """Check that ``--remove-background`` yields pure white and black.

    The input JPEG must not already have extrema of (0, 255) in all three
    bands. After running with ``--remove-background`` and rasterizing page 1
    of the result, every band is expected to have extrema of exactly
    (0, 255).
    """
    from PIL import Image

    full_range = ((0, 255), (0, 255), (0, 255))

    # Ensure the input image does not contain pure white/black.
    # The context manager closes the file even if the assertion fails.
    with Image.open(resources / 'congress.jpg') as im:
        assert im.getextrema() != full_range

    output_pdf = check_ocrmypdf(resources / 'congress.jpg',
                                outdir / 'test_remove_bg.pdf',
                                '--remove-background',
                                '--image-dpi',
                                '150',
                                env=spoof_tesseract_noop)

    log = logging.getLogger()

    output_png = outdir / 'remove_bg.png'

    ghostscript.rasterize_pdf(output_pdf,
                              output_png,
                              xres=100,
                              yres=100,
                              raster_device='png16m',
                              log=log,
                              pageno=1)

    # The output image should contain pure white and black
    with Image.open(output_png) as im:
        assert im.getextrema() == full_range
Ejemplo n.º 2
0
def test_deskew(spoof_tesseract_noop, resources, outdir):
    """Run with ``-d`` on ``skew.pdf`` and confirm the result is deskewed.

    Page 1 of the output is rasterized to a monochrome PNG, and Leptonica's
    skew estimate for that image must lie strictly between -0.5 and 0.5.
    """
    # Run with deskew
    result_pdf = check_ocrmypdf(
        resources / 'skew.pdf',
        outdir / 'skew.pdf',
        '-d',
        env=spoof_tesseract_noop,
    )

    # Render the result back to an image so Leptonica can measure its skew
    raster_options = dict(
        xres=150,
        yres=150,
        raster_device='pngmono',
        log=logging.getLogger(),
        pageno=1,
    )
    raster_path = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(result_pdf, raster_path, **raster_options)

    from ocrmypdf.leptonica import Pix
    # find_skew() returns a pair; only the angle is checked here
    skew_angle, _confidence = Pix.read(str(raster_path)).find_skew()

    print(skew_angle)
    assert -0.5 < skew_angle < 0.5, "Deskewing failed"
Ejemplo n.º 3
0
def test_deskew(spoof_tesseract_noop, resources, outdir):
    """Verify that the ``-d`` option removes the skew from ``skew.pdf``.

    The processed PDF's first page is rendered with the ``pngmono`` device
    and the skew angle reported by Leptonica must fall inside (-0.5, 0.5).
    """
    skewed_input = resources / 'skew.pdf'
    pdf_target = outdir / 'skew.pdf'
    # Run with deskew
    processed = check_ocrmypdf(
        skewed_input, pdf_target, '-d', env=spoof_tesseract_noop
    )

    # Rasterize the processed page; the skew is measured on the image
    root_logger = logging.getLogger()
    png_target = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        processed, png_target,
        xres=150, yres=150,
        raster_device='pngmono',
        log=root_logger,
        pageno=1,
    )

    from ocrmypdf.leptonica import Pix
    rendered = Pix.read(str(png_target))
    measurement = rendered.find_skew()
    skew_angle = measurement[0]  # second element (confidence) is not checked

    print(skew_angle)
    assert -0.5 < skew_angle < 0.5, "Deskewing failed"
Ejemplo n.º 4
0
 def rasterize(pdf, pageno, png):
     """Render page ``pageno`` of ``pdf`` into ``png`` unless it already exists.

     If ``png`` exists, its path is printed and nothing is rendered.
     Otherwise the page is rasterized at 100x100 resolution with the
     ``pngmono`` device, logging through ``gslog``.
     """
     if not png.exists():
         ghostscript.rasterize_pdf(
             pdf,
             png,
             xres=100,
             yres=100,
             raster_device='pngmono',
             log=gslog,
             pageno=pageno,
         )
         return
     # Existing output: report the path and skip rendering
     print(png)
Ejemplo n.º 5
0
def test_rasterize_rotated(linn, outdir, caplog):
    """Rasterize with ``rotation=90`` and check size and DPI come out swapped.

    The resolution passed to ``rasterize_pdf`` is the target size divided by
    the page size, and ``page_dpi`` is passed separately. The resulting PNG
    must report the target dimensions and the DPI pair with their axes
    exchanged.
    """
    path, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    page_size_pts = (mediabox[2], mediabox[3])
    # The size computation below requires the box origin to be (0, 0)
    assert mediabox[0] == mediabox[1] == 0

    # presumably converts PDF points to inches (72 per inch)
    divisor = Decimal(72)
    page_width = page_size_pts[0] / divisor
    page_height = page_size_pts[1] / divisor

    target_width, target_height = Decimal('200.0'), Decimal('150.0')
    dpi_x, dpi_y = 42.0, 4242.0

    caplog.set_level(logging.DEBUG)
    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        target_width / page_width,
        target_height / page_height,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=(dpi_x, dpi_y),
        rotation=90,
    )

    with Image.open(out_png) as im:
        # Both pairs are expected with their axes exchanged
        assert im.size == (target_height, target_width)
        assert im.info['dpi'] == (dpi_y, dpi_x)
Ejemplo n.º 6
0
 def rasterize(pdf, pageno, png):
     """Rasterize one page of ``pdf`` to ``png``, skipping existing outputs.

     An existing ``png`` is left untouched and its path is printed. A missing
     one is produced by ``ghostscript.rasterize_pdf`` at 100x100 resolution
     using the ``pngmono`` device.
     """
     already_rendered = png.exists()
     if already_rendered:
         print(png)
     else:
         render_options = dict(
             xres=100,
             yres=100,
             raster_device='pngmono',
             log=gslog,
             pageno=pageno,
         )
         ghostscript.rasterize_pdf(pdf, png, **render_options)
Ejemplo n.º 7
0
def test_remove_background(spoof_tesseract_noop, resources, outdir):
    """Check that ``--remove-background`` produces pure white and black.

    Precondition: the input JPEG's per-band extrema are not all (0, 255).
    Postcondition: after processing and rasterizing page 1 of the output
    PDF, the per-band extrema are all exactly (0, 255).
    """
    from PIL import Image

    pure_extrema = ((0, 255), (0, 255), (0, 255))

    # Ensure the input image does not contain pure white/black.
    # Opened with a context manager so the file handle is always released.
    with Image.open(resources / 'congress.jpg') as im:
        assert im.getextrema() != pure_extrema

    output_pdf = check_ocrmypdf(
        resources / 'congress.jpg',
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi', '150',
        env=spoof_tesseract_noop)

    log = logging.getLogger()

    output_png = outdir / 'remove_bg.png'

    ghostscript.rasterize_pdf(
        output_pdf,
        output_png,
        xres=100,
        yres=100,
        raster_device='png16m',
        log=log,
        pageno=1)

    # The output image should contain pure white and black
    with Image.open(output_png) as im:
        assert im.getextrema() == pure_extrema
Ejemplo n.º 8
0
def test_mono_not_inverted(resources, outdir):
    """Check that running ``opt.main`` at level 3 does not invert the image.

    ``2400dpi.pdf`` is processed into ``out.pdf``, which is rasterized to a
    10x10-resolution grayscale PNG. Pixel (0, 0) of that PNG must be 255.
    """
    infile = resources / '2400dpi.pdf'
    opt.main(infile, outdir / 'out.pdf', level=3)

    rasterize_pdf(outdir / 'out.pdf',
                  outdir / 'im.png',
                  xres=10,
                  yres=10,
                  raster_device='pnggray',
                  log=logging.getLogger(name='test_mono_flip'))

    # The context manager closes the image file even if the assertion fails
    with Image.open(fspath(outdir / 'im.png')) as im:
        assert im.getpixel((0, 0)) == 255, "Expected white background"
Ejemplo n.º 9
0
def test_mono_not_inverted(resources, outdir):
    """Check that the background stays white after ``opt.main`` at level 3.

    The processed PDF is rendered with the ``pnggray`` device at 10x10
    resolution, and pixel (0, 0) of the rendering is required to be 255.
    """
    infile = resources / '2400dpi.pdf'
    opt.main(infile, outdir / 'out.pdf', level=3)

    rasterize_pdf(
        outdir / 'out.pdf',
        outdir / 'im.png',
        xres=10,
        yres=10,
        raster_device='pnggray',
        log=logging.getLogger(name='test_mono_flip'),
    )

    # Use a context manager so the PNG file handle is released on any outcome
    with Image.open(fspath(outdir / 'im.png')) as im:
        assert im.getpixel((0, 0)) == 255, "Expected white background"
Ejemplo n.º 10
0
def test_rasterize_size(linn, outdir, caplog):
    """Rasterize to a chosen size and check the PNG's dimensions and DPI.

    The resolution passed to ``rasterize_pdf`` is the target size divided by
    the page size. The output image must report exactly the target size and
    the ``page_dpi`` value that was passed in.
    """
    path, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    page_size_pts = (mediabox[2], mediabox[3])
    # The size computation below requires the box origin to be (0, 0)
    assert mediabox[0] == mediabox[1] == 0

    # presumably converts PDF points to inches (72 per inch)
    divisor = Decimal(72)
    page_width = page_size_pts[0] / divisor
    page_height = page_size_pts[1] / divisor

    target_size = (Decimal('200.0'), Decimal('150.0'))
    target_dpi = (42.0, 4242.0)

    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        target_size[0] / page_width,
        target_size[1] / page_height,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=target_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == target_dpi
Ejemplo n.º 11
0
def test_rasterize_size(francais, outdir, caplog):
    """Rasterize to 50x30 and check the size and the forced DPI of the PNG.

    The resolution is computed per axis as target size over page size, and
    ``page_dpi`` is supplied separately. The output image must report the
    target size and that same DPI pair.
    """
    path, pdf = francais
    first_page_box = pdf.pages[0].MediaBox
    page_size_pts = (first_page_box[2], first_page_box[3])
    # The size computation below requires the box origin to be (0, 0)
    assert first_page_box[0] == first_page_box[1] == 0

    # presumably points -> inches; verify against rasterize_pdf's contract
    page_size = tuple(extent / Decimal(72) for extent in page_size_pts)
    target_size = (Decimal('50.0'), Decimal('30.0'))
    forced_dpi = (42.0, 4242.0)

    x_resolution = target_size[0] / page_size[0]
    y_resolution = target_size[1] / page_size[1]

    destination = outdir / 'out.png'
    rasterize_pdf(
        path,
        destination,
        x_resolution,
        y_resolution,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=forced_dpi,
    )

    with Image.open(destination) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi