Esempio n. 1
0
def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
                                      resources, outpdf):
    """Check that --force-ocr turns a non-square-resolution page square.

    The output page must keep the physical size of the input page, and its
    first image must cover the whole page.
    """
    import math

    source_pdf = resources / 'aspect.pdf'

    # Precondition: the input page has differing x and y resolutions
    in_p0 = pdf_get_all_pageinfo(str(source_pdf))[0]
    assert in_p0['xres'] != in_p0['yres']

    # --force-ocr means a forced conversion to square resolution
    check_ocrmypdf(source_pdf, outpdf,
                   '--force-ocr',
                   '--pdf-renderer', renderer,
                   env=spoof_tesseract_cache)

    out_p0 = pdf_get_all_pageinfo(str(outpdf))[0]

    # Resolution should now be equal on both axes
    assert out_p0['xres'] == out_p0['yres']

    # The physical page size must be unchanged
    for dimension in ('width_inches', 'height_inches'):
        assert math.isclose(in_p0[dimension], out_p0[dimension])

    # The page was rasterized into a new image, so that image should span
    # the entire page
    image = out_p0['images'][0]
    image_w_inches = image['width'] / image['dpi_w']
    image_h_inches = image['height'] / image['dpi_h']
    assert math.isclose(out_p0['width_inches'], image_w_inches)
    assert math.isclose(out_p0['height_inches'], image_h_inches)
Esempio n. 2
0
def test_rotated_skew_timeout():
    """Check page geometry after deskew combined with a tesseract timeout.

    The input document contains an image that is rotated 90 degrees into
    place with a /Rotate tag and intentionally skewed by altering the
    transformation matrix.

    This guards against a bug where the combination of preprocessing and a
    tesseract timeout produced a page whose dimensions did not match the
    original's.
    """
    source_info = pdf_get_all_pageinfo(_infile('rotated_skew.pdf'))[0]
    src_height = source_info['height_pixels']
    src_width = source_info['width_pixels']

    assert src_height < src_width, "Expected the input page to be landscape"
    assert source_info['rotate'] == 90, "Expected a rotated page"

    ocr_args = ['--pdf-renderer', 'hocr',
                '--deskew', '--tesseract-timeout', '0']
    result_pdf = check_ocrmypdf(
        'rotated_skew.pdf', 'test_rotated_skew.pdf', *ocr_args)

    result_info = pdf_get_all_pageinfo(result_pdf)[0]
    dst_height = result_info['height_pixels']
    dst_width = result_info['width_pixels']

    assert dst_height > dst_width, "Expected the output page to be portrait"
    assert result_info['rotate'] == 0, "Expected no page rotation for output"

    # Width and height must have swapped places between input and output
    assert src_width == dst_height and src_height == dst_width, \
        "Expected page rotation to be baked in"
Esempio n. 3
0
def test_rotated_skew_timeout(resources, outpdf):
    """Check page geometry after deskew combined with a tesseract timeout.

    The input document contains an image that is rotated 90 degrees into
    place with a /Rotate tag and intentionally skewed by altering the
    transformation matrix.

    This guards against a bug where the combination of preprocessing and a
    tesseract timeout produced a page whose dimensions did not match the
    original's.
    """
    source_path = str(resources / 'rotated_skew.pdf')
    source_info = pdf_get_all_pageinfo(source_path)[0]
    src_height = source_info['height_pixels']
    src_width = source_info['width_pixels']

    assert src_height < src_width, "Expected the input page to be landscape"
    assert source_info['rotate'] == 90, "Expected a rotated page"

    ocr_args = ['--pdf-renderer', 'hocr',
                '--deskew', '--tesseract-timeout', '0']
    result_pdf = check_ocrmypdf(source_path, outpdf, *ocr_args)

    result_info = pdf_get_all_pageinfo(str(result_pdf))[0]
    dst_height = result_info['height_pixels']
    dst_width = result_info['width_pixels']

    assert dst_height > dst_width, "Expected the output page to be portrait"
    assert result_info['rotate'] == 0, "Expected no page rotation for output"

    # Width and height must have swapped places between input and output
    assert src_width == dst_height and src_height == dst_width, \
        "Expected page rotation to be baked in"
Esempio n. 4
0
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """Check that a non-square page resolution is unchanged in the output."""
    # Precondition: x and y resolution of the input page differ
    before = pdf_get_all_pageinfo(_infile('aspect.pdf'))[0]
    assert before['xres'] != before['yres']

    output_name = 'aspect_%s.pdf' % renderer
    check_ocrmypdf('aspect.pdf', output_name,
                   '--pdf-renderer', renderer,
                   env=spoof_tesseract_cache)

    after = pdf_get_all_pageinfo(_outfile(output_name))[0]

    # Both axes must report the same resolution as the input
    for axis in ('xres', 'yres'):
        assert before[axis] == after[axis]
Esempio n. 5
0
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """Check that a non-square page resolution is unchanged in the output."""
    # Precondition: x and y resolution of the input page differ
    page_before = pdf_get_all_pageinfo(_infile('aspect.pdf'))[0]
    assert page_before['xres'] != page_before['yres']

    result_name = 'aspect_%s.pdf' % renderer
    cli_args = ('aspect.pdf', result_name, '--pdf-renderer', renderer)
    check_ocrmypdf(*cli_args, env=spoof_tesseract_cache)

    page_after = pdf_get_all_pageinfo(_outfile(result_name))[0]

    # The resolution must have been kept the same on each axis
    assert page_before['xres'] == page_after['xres']
    assert page_before['yres'] == page_after['yres']
Esempio n. 6
0
def test_single_page_inline_image():
    """Check that pageinfo rejects a PDF containing an inline image.

    Builds a one-page PDF with an 8x8 1-bit inline image and expects
    ``pdf_get_all_pageinfo`` to raise ``NotImplementedError``.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    canvas = Canvas(pdf_path, pagesize=(8 * 72, 6 * 72))

    with NamedTemporaryFile() as png_file:
        bitmap = Image.new('1', (8, 8), 0)
        # Set every pixel on the main diagonal
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        # Place the image in a 72x72 pt (1"x1") area
        canvas.drawInlineImage(png_file.name, 0, 0, width=72, height=72)
        canvas.showPage()
        canvas.save()

    with pytest.raises(NotImplementedError):
        pageinfo.pdf_get_all_pageinfo(pdf_path)
Esempio n. 7
0
def test_non_square_resolution(renderer, spoof_tesseract_cache,
                               resources, outpdf):
    """Check that a non-square page resolution is unchanged in the output."""
    source_pdf = resources / 'aspect.pdf'

    # Precondition: x and y resolution of the input page differ
    before = pdf_get_all_pageinfo(str(source_pdf))[0]
    assert before['xres'] != before['yres']

    check_ocrmypdf(source_pdf, outpdf,
                   '--pdf-renderer', renderer,
                   env=spoof_tesseract_cache)

    after = pdf_get_all_pageinfo(str(outpdf))[0]

    # Both axes must report the same resolution as the input
    for axis in ('xres', 'yres'):
        assert before[axis] == after[axis]
def test_single_page_inline_image():
    """Check that pageinfo rejects a PDF containing an inline image.

    Builds a one-page PDF with an 8x8 1-bit inline image and expects
    ``pdf_get_all_pageinfo`` to raise ``NotImplementedError``.
    """
    target = os.path.join(TEST_OUTPUT, "image-mono-inline.pdf")
    page_size = (8 * 72, 6 * 72)
    doc = Canvas(target, pagesize=page_size)

    with NamedTemporaryFile() as scratch:
        mono = Image.new("1", (8, 8), 0)
        # Set every pixel on the main diagonal
        for k in range(8):
            mono.putpixel((k, k), 1)
        mono.save(scratch.name, format="PNG")

        # The image occupies a 72x72 pt (1"x1") area
        doc.drawInlineImage(scratch.name, 0, 0, width=72, height=72)
        doc.showPage()
        doc.save()

    with pytest.raises(NotImplementedError):
        pageinfo.pdf_get_all_pageinfo(target)
Esempio n. 9
0
def test_single_page_image():
    """Check pageinfo for a one-page PDF holding a single 8x8 mono image.

    The PDF is drawn with a Canvas on a 72x72 pt page, with the image
    filling a 72x72 pt area.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')
    canvas = Canvas(pdf_path, pagesize=(72, 72))

    with NamedTemporaryFile() as png_file:
        bitmap = Image.new('1', (8, 8), 0)
        # Set every pixel on the main diagonal
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        # Place the image in a 72x72 pt (1"x1") area
        canvas.drawImage(png_file.name, 0, 0, width=72, height=72)
        canvas.showPage()
        canvas.save()

    pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(pages) == 1

    page = pages[0]
    assert not page['has_text']

    images = page['images']
    assert len(images) == 1

    image = images[0]
    assert image['width'] == 8
    # The color check is disabled in this variant:
    # assert image['color'] == 'gray'

    # Original author's note: while unexpected, this is correct. The PDF
    # spec says a /FlateDecode image must have /BitsPerComponent 8, so mono
    # images get upgraded to 8-bit.
    assert image['bpc'] == 8

    # In a 1"x1" area the DPI equals the image width in pixels
    for key in ('dpi_w', 'dpi_h'):
        assert image[key] == 8
Esempio n. 10
0
def test_force_ocr(spoof_tesseract_cache):
    """Check that running with ``-f`` yields a first page that has text."""
    cli_args = ('graph_ocred.pdf', 'test_force.pdf', '-f')
    result_pdf = check_ocrmypdf(*cli_args, env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert first_page['has_text']
Esempio n. 11
0
def test_single_page_image():
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PNG is converted with ``dpi=8`` and the resulting PDF is inspected
    for page count, text, image count, width, color, bit depth and DPI.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile() as png_file:
        bitmap = Image.new('1', (8, 8), 0)
        # Set every pixel on the main diagonal
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        pdf_bytes = img2pdf.convert([png_file.name], dpi=8)
        with open(pdf_path, 'wb') as pdf_out:
            pdf_out.write(pdf_bytes)

    pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(pages) == 1

    page = pages[0]
    assert not page['has_text']

    images = page['images']
    assert len(images) == 1

    image = images[0]
    assert image['width'] == 8
    assert image['color'] == 'gray'

    # Original author's note: while unexpected, this is correct. The PDF
    # spec says a /FlateDecode image must have /BitsPerComponent 8, so mono
    # images get upgraded to 8-bit.
    assert image['bpc'] == 8

    # In a 1"x1" area the DPI equals the image width in pixels
    for key in ('dpi_w', 'dpi_h'):
        assert image[key] == 8
Esempio n. 12
0
def test_single_page_image():
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PNG is converted with ``dpi=8`` and the resulting PDF is inspected
    for page count, text, image count, width, color, bit depth and DPI.
    """
    target = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile() as scratch:
        mono = Image.new('1', (8, 8), 0)
        # Set every pixel on the main diagonal
        for k in range(8):
            mono.putpixel((k, k), 1)
        mono.save(scratch.name, format='PNG')

        converted = img2pdf.convert([scratch.name], dpi=8)
        with open(target, 'wb') as sink:
            sink.write(converted)

    info = pageinfo.pdf_get_all_pageinfo(target)
    assert len(info) == 1

    only_page = info[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    only_image = only_page['images'][0]
    assert only_image['width'] == 8
    assert only_image['color'] == 'gray'

    # Original author's note: while unexpected, this is correct. The PDF
    # spec says a /FlateDecode image must have /BitsPerComponent 8, so mono
    # images get upgraded to 8-bit.
    assert only_image['bpc'] == 8

    # In a 1"x1" area the DPI equals the image width in pixels
    assert only_image['dpi_w'] == 8
    assert only_image['dpi_h'] == 8
Esempio n. 13
0
def test_jpeg():
    """Check that the first image of c02-22.pdf is reported as 'jpeg'."""
    pdf_path = resource_filename(req, 'tests/resources/c02-22.pdf')

    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    first_image = first_page['images'][0]

    assert first_image['enc'] == 'jpeg'
Esempio n. 14
0
def test_jpeg():
    """Check that the first image of c02-22.pdf is reported as "jpeg"."""
    pdf_path = resource_filename(req, "tests/resources/c02-22.pdf")

    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    encoding = first_page["images"][0]["enc"]

    assert encoding == "jpeg"
Esempio n. 15
0
def test_single_page_image():
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PNG is converted with ``dpi=8`` and the resulting PDF is inspected
    for page count, text, image count, width, color, bit depth and DPI.
    """
    pdf_path = os.path.join(TEST_OUTPUT, "image-mono.pdf")

    with NamedTemporaryFile() as png_file:
        bitmap = Image.new("1", (8, 8), 0)
        # Set every pixel on the main diagonal
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format="PNG")

        pdf_bytes = img2pdf.convert([png_file.name], dpi=8)
        with open(pdf_path, "wb") as pdf_out:
            pdf_out.write(pdf_bytes)

    pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(pages) == 1

    page = pages[0]
    assert not page["has_text"]

    images = page["images"]
    assert len(images) == 1

    image = images[0]
    assert image["width"] == 8
    assert image["color"] == "gray"

    # Original author's note: while unexpected, this is correct. The PDF
    # spec says a /FlateDecode image must have /BitsPerComponent 8, so mono
    # images get upgraded to 8-bit.
    assert image["bpc"] == 8

    # In a 1"x1" area the DPI equals the image width in pixels
    for key in ("dpi_w", "dpi_h"):
        assert image[key] == 8
Esempio n. 16
0
def test_jpeg():
    """Check that the first image of c02-22.pdf is reported as 'jpeg'."""
    source = resource_filename(req, 'tests/resources/c02-22.pdf')

    pages = pageinfo.pdf_get_all_pageinfo(source)
    image = pages[0]['images'][0]

    assert image['enc'] == 'jpeg'
Esempio n. 17
0
def test_jpeg():
    """Check that the first image of c02-22.pdf is reported as 'jpeg'."""
    pdf_path = _make_input('c02-22.pdf')

    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    first_image = first_page['images'][0]

    assert first_image['enc'] == 'jpeg'
Esempio n. 18
0
def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
    """Check that running with ``-f`` yields a first page that has text."""
    out = check_ocrmypdf(resources / 'graph_ocred.pdf',
                         outpdf,
                         '-f',
                         env=spoof_tesseract_cache)
    # str() matches the sibling fixture-based tests and is a no-op when
    # check_ocrmypdf already returns a str
    pdfinfo = pdf_get_all_pageinfo(str(out))
    assert pdfinfo[0]['has_text']
Esempio n. 19
0
def test_single_page_image(outdir):
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PNG and the PDF are both written into ``outdir``. The PDF is built
    with an img2pdf layout function whose image size is given as
    ``(ImgSize.dpi, 8)`` on each axis.
    """
    pdf_path = outdir / 'image-mono.pdf'
    png_path = outdir / 'tmp.png'

    bitmap = Image.new('1', (8, 8), 0)
    # Set every pixel on the main diagonal
    for i in range(8):
        bitmap.putpixel((i, i), 1)
    bitmap.save(str(png_path), format='PNG')

    axis_spec = (img2pdf.ImgSize.dpi, 8)
    layout_fun = img2pdf.get_layout_fun(
        None, (axis_spec, axis_spec), None, None, None)

    pdf_bytes = img2pdf.convert(png_path.read_bytes(),
                                producer="img2pdf",
                                with_pdfrw=False,
                                layout_fun=layout_fun)
    pdf_path.write_bytes(pdf_bytes)

    pages = pageinfo.pdf_get_all_pageinfo(str(pdf_path))
    assert len(pages) == 1

    page = pages[0]
    assert not page['has_text']

    images = page['images']
    assert len(images) == 1

    image = images[0]
    assert image['width'] == 8
    assert image['color'] == 'gray'

    # In a 1"x1" area the DPI equals the image width in pixels
    for key in ('dpi_w', 'dpi_h'):
        assert abs(image[key] - 8) < 1e-5
Esempio n. 20
0
def test_skip_big(spoof_tesseract_cache):
    """Check that enormous.pdf gets no text when run with ``--skip-big 10``."""
    cli_args = ('enormous.pdf', 'test_enormous.pdf', '--skip-big', '10')
    result_pdf = check_ocrmypdf(*cli_args, env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']
Esempio n. 21
0
def test_jpeg(resources, outdir):
    """Check encoding and horizontal DPI of the first image in c02-22.pdf.

    The image must be reported as 'jpeg' with a horizontal DPI within 1e-5
    of 150.
    """
    filename = resources / 'c02-22.pdf'

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))

    pdfimage = pdfinfo[0]['images'][0]
    assert pdfimage['enc'] == 'jpeg'
    # abs() is required here: without it any DPI below 150 would pass
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
Esempio n. 22
0
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
    """Check that enormous.pdf gets no text when run with ``--skip-big 10``."""
    source_pdf = resources / 'enormous.pdf'
    result_pdf = check_ocrmypdf(source_pdf, outpdf,
                                '--skip-big', '10',
                                env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(str(result_pdf))[0]
    assert not first_page['has_text']
Esempio n. 23
0
def test_jpeg():
    """Check encoding and horizontal DPI of the first image in c02-22.pdf.

    The image must be reported as 'jpeg' with a horizontal DPI within 1e-5
    of 150.
    """
    filename = _make_input('c02-22.pdf')

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)

    pdfimage = pdfinfo[0]['images'][0]
    assert pdfimage['enc'] == 'jpeg'
    # abs() is required here: without it any DPI below 150 would pass
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
Esempio n. 24
0
def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
    """Check that the first output image of jbig2.pdf is still 'jbig2'."""
    cli_options = ['--output-type', 'pdf', '--pdf-renderer', 'hocr']
    result_pdf = check_ocrmypdf(resources / 'jbig2.pdf', outpdf,
                                *cli_options,
                                env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(str(result_pdf))[0]
    assert first_page['images'][0]['enc'] == 'jbig2'
Esempio n. 25
0
def check_oversample(renderer):
    """Run skew.pdf with ``--oversample 300`` and check the x resolution.

    The first output page's ``xres`` is printed and must be within 1 of 300.
    """
    output_name = 'test_oversample_%s.pdf' % renderer
    result_pdf = check_ocrmypdf('skew.pdf', output_name,
                                '--oversample', '300',
                                '--pdf-renderer', renderer)

    pages = pdf_get_all_pageinfo(result_pdf)

    print(pages[0]['xres'])
    assert abs(pages[0]['xres'] - 300) < 1
Esempio n. 26
0
def test_jbig2_passthrough(spoof_tesseract_cache):
    """Check that the first output image of jbig2.pdf is still 'jbig2'."""
    cli_options = ['--output-type', 'pdf', '--pdf-renderer', 'hocr']
    result_pdf = check_ocrmypdf('jbig2.pdf', 'jbig2_out.pdf',
                                *cli_options,
                                env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert first_page['images'][0]['enc'] == 'jbig2'
Esempio n. 27
0
def test_jbig2_passthrough(spoof_tesseract_cache):
    """Check that the first output image of jbig2.pdf is still 'jbig2'."""
    produced = check_ocrmypdf(
        'jbig2.pdf',
        'jbig2_out.pdf',
        '--output-type', 'pdf',
        '--pdf-renderer', 'hocr',
        env=spoof_tesseract_cache,
    )

    first_image = pdf_get_all_pageinfo(produced)[0]['images'][0]
    assert first_image['enc'] == 'jbig2'
Esempio n. 28
0
def test_oversample(spoof_tesseract_cache, renderer):
    """Run skew.pdf with ``--oversample 350 -f`` and check the x resolution.

    The first output page's ``xres`` is printed and must be within 1 of 350.
    """
    output_name = 'test_oversample_%s.pdf' % renderer
    result_pdf = check_ocrmypdf('skew.pdf', output_name,
                                '--oversample', '350',
                                '-f',
                                '--pdf-renderer', renderer,
                                env=spoof_tesseract_cache)

    pages = pdf_get_all_pageinfo(result_pdf)

    print(pages[0]['xres'])
    assert abs(pages[0]['xres'] - 350) < 1
Esempio n. 29
0
def test_oversample(spoof_tesseract_cache, renderer):
    """Run skew.pdf with ``--oversample 350 -f`` and check the x resolution.

    The first output page's ``xres`` is printed and must be within 1 of 350.
    """
    cli_args = [
        'skew.pdf', 'test_oversample_%s.pdf' % renderer,
        '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer,
    ]
    produced = check_ocrmypdf(*cli_args, env=spoof_tesseract_cache)

    info = pdf_get_all_pageinfo(produced)

    print(info[0]['xres'])
    assert abs(info[0]['xres'] - 350) < 1
Esempio n. 30
0
def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    """Run skew.pdf with ``--oversample 350 -f`` and check the x resolution.

    The first output page's ``xres`` is printed and must be within 1 of 350.
    """
    result_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf,
                                '--oversample', '350',
                                '-f',
                                '--pdf-renderer', renderer,
                                env=spoof_tesseract_cache)

    pages = pdf_get_all_pageinfo(str(result_pdf))

    print(pages[0]['xres'])
    assert abs(pages[0]['xres'] - 350) < 1
Esempio n. 31
0
def check_oversample(renderer):
    """Run skew.pdf with ``--oversample 300`` and check the x resolution.

    The first output page's ``xres`` is printed and must be within 1 of 300.
    """
    cli_args = [
        'skew.pdf', 'test_oversample_%s.pdf' % renderer,
        '--oversample', '300',
        '--pdf-renderer', renderer,
    ]
    produced = check_ocrmypdf(*cli_args)

    info = pdf_get_all_pageinfo(produced)

    print(info[0]['xres'])
    assert abs(info[0]['xres'] - 300) < 1
Esempio n. 32
0
def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
    """Checks for a Decimal quantize error with high DPI, etc.

    After processing 2400dpi.pdf, the first image of the first output page
    must report equal horizontal and vertical DPI, both 2400.
    """
    check_ocrmypdf(resources / '2400dpi.pdf',
                   outpdf,
                   env=spoof_tesseract_cache)
    # str() matches the sibling fixture-based tests and is a no-op when
    # outpdf is already a str
    pdfinfo = pdf_get_all_pageinfo(str(outpdf))

    image = pdfinfo[0]['images'][0]
    assert image['dpi_w'] == image['dpi_h']
    assert image['dpi_w'] == 2400
Esempio n. 33
0
def test_skip_pages_does_not_replicate(
        ensure_tess4, resources, basename, outdir):
    """Check that pages are not replicated when run with a timeout of 0.

    Runs ``resources / basename`` with the tess4 renderer, ``--force-ocr``
    and ``--tesseract-timeout 0``. Every output page must hold exactly one
    image, the page count must match the input, and each page must keep
    the input page's width.
    """
    infile = resources / basename
    outpdf = outdir / basename

    check_ocrmypdf(
        infile,
        outpdf, '--pdf-renderer', 'tess4', '--force-ocr',
        '--tesseract-timeout', '0',
        env=ensure_tess4
    )

    info_in = pageinfo.pdf_get_all_pageinfo(str(infile))
    info = pageinfo.pdf_get_all_pageinfo(str(outpdf))

    for page in info:
        assert len(page['images']) == 1, "skipped page was replicated"

    # Fail with a clear message rather than an IndexError (too few output
    # pages) or a silent pass (extra output pages)
    assert len(info) == len(info_in), "output page count differs from input"

    for page_in, page_out in zip(info_in, info):
        assert page_out['width_inches'] == page_in['width_inches']
Esempio n. 34
0
def test_content_preservation(ensure_tess4, resources, outpdf):
    """Check that masks.pdf still has more than one image on its first page.

    The file is run with the tess4 renderer and ``--tesseract-timeout 0``.
    """
    source_pdf = resources / 'masks.pdf'
    cli_options = ['--pdf-renderer', 'tess4', '--tesseract-timeout', '0']

    check_ocrmypdf(source_pdf, outpdf, *cli_options, env=ensure_tess4)

    first_page = pageinfo.pdf_get_all_pageinfo(str(outpdf))[0]
    image_count = len(first_page['images'])
    assert image_count > 1, "masked were rasterized"
Esempio n. 35
0
def test_single_page_text():
    """Check pageinfo for a one-page PDF that holds only a line of text.

    The page must be reported as having text and no images.
    """
    pdf_path = os.path.join(TEST_OUTPUT, "text.pdf")
    canvas = Canvas(pdf_path, pagesize=(8 * 72, 6 * 72))

    quote = ("Methink'st thou art a general offence and every"
             " man should beat thee.")

    text_obj = canvas.beginText()
    text_obj.setFont("Helvetica", 12)
    text_obj.setTextOrigin(1 * 72, 3 * 72)
    text_obj.textLine(quote)
    canvas.drawText(text_obj)
    canvas.showPage()
    canvas.save()

    pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(pages) == 1

    page = pages[0]
    assert page["has_text"]
    assert len(page["images"]) == 0
Esempio n. 36
0
def test_single_page_inline_image():
    """Check pageinfo for a one-page PDF holding an 8x8 mono inline image.

    The image is drawn into a 72x72 pt area. The reported horizontal DPI
    must be within 1e-5 of 8, the color must not be '-', and the width must
    be 8.
    """
    filename = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    pdf = Canvas(filename, pagesize=(8*72, 6*72))
    with NamedTemporaryFile() as im_tmp:
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')
        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)
        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)
    print(pdfinfo)
    pdfimage = pdfinfo[0]['images'][0]
    # abs() is required here: without it any DPI below 8 would pass
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert pdfimage['color'] != '-'
    assert pdfimage['width'] == 8
Esempio n. 37
0
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check that the sidecar text file has one formfeed between pages.

    Runs multipage.pdf with ``--skip-text`` and ``--sidecar``, then compares
    the number of formfeed characters in the sidecar with the page count of
    the input PDF.
    """
    input_pdf = resources / 'multipage.pdf'
    # str() keeps the concatenation valid whether outpdf is a str or a
    # path-like object
    sidecar = str(outpdf) + '.txt'
    check_ocrmypdf(input_pdf,
                   outpdf,
                   '--skip-text',
                   '--sidecar',
                   sidecar,
                   env=spoof_tesseract_cache)

    pdfinfo = pdf_get_all_pageinfo(str(input_pdf))
    num_pages = len(pdfinfo)

    # Decode explicitly so the result does not depend on the locale's
    # default encoding
    # NOTE(review): assumes the sidecar is written as UTF-8 - confirm
    with open(sidecar, 'r', encoding='utf-8') as f:
        ocr_text = f.read()

    # There should be a formfeed between each pair of pages, so the count
    # of formfeeds is the page count less one
    assert ocr_text.count('\f') == num_pages - 1, \
        "Sidecar page count does not match PDF page count"
Esempio n. 38
0
def test_single_page_inline_image(outdir):
    """Check pageinfo for a one-page PDF holding an 8x8 mono inline image.

    The PDF is written into ``outdir`` and the image is drawn into a
    72x72 pt area. The reported horizontal DPI must be within 1e-5 of 8,
    the color must not be '-', and the width must be 8.
    """
    filename = outdir / 'image-mono-inline.pdf'
    pdf = Canvas(str(filename), pagesize=(8 * 72, 6 * 72))
    with NamedTemporaryFile() as im_tmp:
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')
        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)
        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    print(pdfinfo)
    pdfimage = pdfinfo[0]['images'][0]
    # abs() is required here: without it any DPI below 8 would pass
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert pdfimage['color'] != '-'
    assert pdfimage['width'] == 8
Esempio n. 39
0
def test_single_page_text(outdir):
    """Check pageinfo for a one-page PDF that holds only a line of text.

    The PDF is written into ``outdir``; the page must be reported as having
    text and no images.
    """
    pdf_path = outdir / 'text.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    quote = ("Methink'st thou art a general offence and every"
             " man should beat thee.")

    text_obj = canvas.beginText()
    text_obj.setFont('Helvetica', 12)
    text_obj.setTextOrigin(1 * 72, 3 * 72)
    text_obj.textLine(quote)
    canvas.drawText(text_obj)
    canvas.showPage()
    canvas.save()

    pages = pageinfo.pdf_get_all_pageinfo(str(pdf_path))
    assert len(pages) == 1

    page = pages[0]
    assert page['has_text']
    assert len(page['images']) == 0
Esempio n. 40
0
def test_single_page_image():
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PDF is built from the PNG bytes with an img2pdf layout function
    whose image size is given as ``(ImgSize.dpi, 8)`` on each axis.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile(mode='wb+', suffix='.png') as png_file:
        bitmap = Image.new('1', (8, 8), 0)
        # Set every pixel on the main diagonal
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        axis_spec = (img2pdf.ImgSize.dpi, 8)
        layout_fun = img2pdf.get_layout_fun(
            None, (axis_spec, axis_spec), None, None, None)

        # The PNG was saved via its path; rewind the handle and read it back
        png_file.seek(0)
        png_bytes = png_file.read()
        pdf_bytes = img2pdf.convert(png_bytes,
                                    producer="img2pdf",
                                    with_pdfrw=False,
                                    layout_fun=layout_fun)

        with open(pdf_path, 'wb') as pdf_out:
            pdf_out.write(pdf_bytes)

    pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(pages) == 1

    page = pages[0]
    assert not page['has_text']

    images = page['images']
    assert len(images) == 1

    image = images[0]
    assert image['width'] == 8
    assert image['color'] == 'gray'

    # Original author's note: while unexpected, this is correct. The PDF
    # spec says a /FlateDecode image must have /BitsPerComponent 8, so mono
    # images get upgraded to 8-bit.
    assert image['bpc'] == 8

    # In a 1"x1" area the DPI equals the image width in pixels
    for key in ('dpi_w', 'dpi_h'):
        assert abs(image[key] - 8) < 1e-5
Esempio n. 41
0
def test_single_page_image():
    """Check pageinfo for a PDF that img2pdf builds from one 8x8 mono PNG.

    The PDF is built from the PNG bytes with an img2pdf layout function
    whose image size is given as ``(ImgSize.dpi, 8)`` on each axis.
    """
    filename = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile(mode='wb+', suffix='.png') as im_tmp:
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')

        imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))
        layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)

        # The PNG was saved via its path; rewind the handle and read it back
        im_tmp.seek(0)
        im_bytes = im_tmp.read()
        pdf_bytes = img2pdf.convert(
                im_bytes, producer="img2pdf", with_pdfrw=False,
                layout_fun=layout_fun)

        with open(filename, 'wb') as pdf:
            pdf.write(pdf_bytes)

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)

    assert len(pdfinfo) == 1
    page = pdfinfo[0]

    assert not page['has_text']
    assert len(page['images']) == 1

    pdfimage = page['images'][0]
    assert pdfimage['width'] == 8
    assert pdfimage['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert pdfimage['bpc'] == 8

    # DPI in a 1"x1" is the image width. Compare with a tolerance instead
    # of exact equality, as the sibling variant of this test does.
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert abs(pdfimage['dpi_h'] - 8) < 1e-5
Esempio n. 42
0
def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec, resources,
                               image, outpdf):
    """Check that image compression and colorspace survive the conversion.

    The image file ``resources / image`` is piped to ocrmypdf on stdin with
    ``--image-dpi 150 --output-type pdf``. A .png input must not become
    'jpeg', a .jpg input must stay 'jpeg', and RGB/BGR or L-mode inputs must
    be reported as 'rgb' or 'gray' respectively.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed, so close the image file right away instead
    # of leaving the handle open for the rest of the test
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150', '--output-type', 'pdf', '-', output_file
        ]
        p = Popen(p_args,
                  close_fds=True,
                  stdout=PIPE,
                  stderr=PIPE,
                  stdin=input_stream,
                  env=spoof_tesseract_noop)
        _, err = p.communicate()

        # Show the child's stderr on failure to make diagnosis possible
        assert p.returncode == ExitCode.ok, err.decode(errors='replace')

    pdfinfo = pdf_get_all_pageinfo(output_file)

    pdfimage = pdfinfo[0]['images'][0]

    if input_file.endswith('.png'):
        assert pdfimage['enc'] != 'jpeg', \
            "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage['enc'] == 'jpeg', \
            "Lossy compression changed to lossless!"
    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
Esempio n. 43
0
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` of the first page of *pdf*.

    *pdf* is converted with ``str()`` before being handed to pageinfo.
    """
    from ocrmypdf import pageinfo

    first_page = pageinfo.pdf_get_all_pageinfo(str(pdf))[0]
    width = first_page['width_inches']
    height = first_page['height_inches']
    return (width, height)
Esempio n. 44
0
def test_skip_big(spoof_tesseract_cache):
    """Check that enormous.pdf gets no text when run with ``--skip-big 10``."""
    produced = check_ocrmypdf(
        'enormous.pdf',
        'test_enormous.pdf',
        '--skip-big', '10',
        env=spoof_tesseract_cache,
    )

    info = pdf_get_all_pageinfo(produced)
    assert not info[0]['has_text']
Esempio n. 45
0
def test_ocr_timeout(renderer):
    """Check that skew.pdf gets no text with ``--tesseract-timeout 1.0``.

    ``renderer`` is used only to build the output file name here.
    """
    output_name = 'test_timeout_%s.pdf' % renderer
    result_pdf = check_ocrmypdf('skew.pdf', output_name,
                                '--tesseract-timeout', '1.0')

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']
Esempio n. 46
0
def test_force_ocr(spoof_tesseract_cache):
    """Check that running with ``-f`` yields a first page that has text."""
    produced = check_ocrmypdf(
        'graph_ocred.pdf',
        'test_force.pdf',
        '-f',
        env=spoof_tesseract_cache,
    )

    info = pdf_get_all_pageinfo(produced)
    assert info[0]['has_text']
Esempio n. 47
0
def test_force_ocr():
    """Check that running with ``-f`` yields a first page that has text."""
    cli_args = ('graph_ocred.pdf', 'test_force.pdf', '-f')
    result_pdf = check_ocrmypdf(*cli_args)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert first_page['has_text']
Esempio n. 48
0
def test_skip_big():
    """Check that enormous.pdf gets no text when run with ``--skip-big 10``."""
    out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
                         '--skip-big', '10')
    pdfinfo = pdf_get_all_pageinfo(out)
    # Truthiness test rather than `== False`, matching the sibling tests
    assert not pdfinfo[0]['has_text']
Esempio n. 49
0
def test_form_xobject(resources):
    """Check that the first image of formxobject.pdf has a width of 50."""
    pdf_path = str(resources / 'formxobject.pdf')

    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    first_image = first_page['images'][0]

    assert first_image['width'] == 50
Esempio n. 50
0
def test_no_contents(resources):
    """Check that no_contents.pdf reports no images and no text."""
    filename = resources / 'no_contents.pdf'

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    assert len(pdfinfo[0]['images']) == 0
    # Truthiness test rather than `== False`, matching the sibling tests
    assert not pdfinfo[0]['has_text']