Example #1
0
def test_document_save():
    '''Save the sample document, reload it, and compare it with the source.

    The reloaded copy must have the same number of pages as the document it
    was saved from, and its ``Title`` metadata must still be 'Red Stork'.
    '''
    original = Document(res('sample.pdf'))

    with tempfile.TemporaryDirectory() as tmpdir:
        target = f'{tmpdir}/temp.pdf'
        original.save(target)

        reloaded = Document(target)

        assert len(original) == len(reloaded)
        assert reloaded.meta['Title'] == 'Red Stork'
Example #2
0
def test_geometry_sane():
    '''Check glyph boxes against the page crop box on several sample PDFs.

    For every page of every listed file, each text object found by
    ``page.flat_iter()`` is asked for its ``text_geometry_iter()`` items, and
    each yielded box must satisfy ``inside(page.crop_box, box)``.

    On failure the assertion message carries, in order: file name, page
    index, the page crop box, the index of the object within
    ``flat_iter()``, and the offending box.
    '''
    fnames = [
        'arxiv1901.01387.pdf',
        'arxiv1901.02668.pdf',
        'arxiv1901.09059.pdf',
        'arxiv1901.02066.pdf',
        'arxiv1901.08145v1.pdf',
        'arxiv1901.10092.pdf',
        # The next two fixtures were already disabled; the reason is not
        # recorded here.
        # 'arxiv1901.02527.pdf',
        # 'arxiv1901.08637.pdf',
        'arxiv1901.11067.pdf',
    ]

    for fname in fnames:
        doc = Document(res(fname))

        for pageidx, page in enumerate(doc):
            # enumerate() numbers every flattened object, text or not, so the
            # index reported on failure matches the flat_iter() position.
            for objcount, obj in enumerate(page.flat_iter()):
                if obj.type != PageObject.OBJ_TYPE_TEXT:
                    continue

                for _, bx in obj.text_geometry_iter():
                    assert inside(page.crop_box, bx), (
                        fname, pageidx, page.crop_box, objcount, bx)

                # NOTE: a stricter per-object check was already disabled in
                # this test; kept for reference:
                # assert all(inside(obj.rect, bx)
                #            for _, bx in obj.text_geometry_iter()), (
                #     fname, pageidx, obj.rect, objcount)
Example #3
0
def test_text_geometry_iter():
    '''Pin the items yielded by ``text_geometry_iter()`` for one text object.

    Uses the first text object (in ``flat_iter()`` order) of the page at
    index 2 of the sample document and compares the full list of
    (character, box) items against recorded values.
    '''
    expected = [
        ('M', (489.1150207519531, 619.3770141601562, 500.0322828043718,
               629.8354499922134)),
        ('a', (501.0654058456421, 619.0477800783701, 508.73936194344424,
               627.1105125918984)),
        ('n', (509.0418930053711, 619.3770141601562, 516.0293610177469,
               627.1105125918984)),
        ('u', (516.8749179840088, 619.0477800783701, 523.7888337015174,
               626.8933581975289)),
        ('a', (524.8514060974121, 619.0477800783701, 532.5253621952143,
               627.1105125918984)),
        ('l', (532.8278923034668, 619.3770141601562, 535.0099437178578,
               629.8354499922134)),
        (':', (536.0127487182617, 619.3770141601562, 539.0844326515216,
               626.8933581975289)),
    ]

    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    flat_text_objects = [
        candidate for candidate in page.flat_iter()
        if candidate.type == PageObject.OBJ_TYPE_TEXT
    ]
    first_text = flat_text_objects[0]
    assert len(first_text) == 8

    geometry = list(first_text.text_geometry_iter())
    assert geometry == expected
Example #4
0
def test_unicode_map_multichar3():
    '''Read the text of every text object on the first page.

    Accessing ``.text`` must not crash, and the last 20 characters of the
    concatenated text are pinned.
    '''
    doc = Document(res('arxiv1901.09059.pdf'))

    # Reading .text is the operation that should not crash.
    chunks = [
        item.text for item in doc[0]
        if item.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(chunks)[-20:]
    assert tail == 'd applicability.    '
Example #5
0
def test_unicode_map_multichar2():
    '''Read the text of every text object on the first page.

    Accessing ``.text`` must not crash, and the last 20 characters of the
    concatenated text are pinned.
    '''
    doc = Document(res('arxiv1901.08145v1.pdf'))

    # Reading .text is the operation that should not crash.
    chunks = [
        item.text for item in doc[0]
        if item.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(chunks)[-20:]
    assert tail == 'ing-centerpositionX,'
Example #6
0
def test_page_smoke():
    '''The first page of the sample reports the expected media and crop box.'''
    doc = Document(res('sample.pdf'))
    first_page = doc[0]

    # Both boxes are expected to be the same 4-tuple.
    expected_box = (0., 0., 612., 792.)
    assert first_page.media_box == expected_box
    assert first_page.crop_box == expected_box
Example #7
0
def test_document():
    '''Basic document checks: page count, ``Title`` metadata and ``str()``.'''
    doc = Document(res('sample.pdf'))

    page_count = len(doc)
    assert page_count == 15

    title = doc.meta['Title']
    assert title == 'Red Stork'

    # No value is asserted here: the conversion just should not crash.
    str(doc)
Example #8
0
def test_rotated_text():
    '''Every glyph box of one specific text object must pass ``inside(rect, box)``.

    The object is the one at position 296 in ``flat_iter()`` order on the
    first page of the fixture.
    '''
    doc = Document(res('arxiv1901.02066.pdf'))

    first_page = doc[0]
    target = list(first_page.flat_iter())[296]
    bounds = target.rect
    glyph_boxes = [bx for _, bx in target.text_geometry_iter()]
    assert all(inside(bounds, bx) for bx in glyph_boxes)
Example #9
0
def test_page_object_smoke():
    '''Count text and path objects on the page at index 2 of the sample.'''
    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    def count_of(obj_type):
        # One full pass over the page per call.
        return len([item for item in page if item.type == obj_type])

    text_count = count_of(PageObject.OBJ_TYPE_TEXT)
    path_count = count_of(PageObject.OBJ_TYPE_PATH)
    assert text_count == 16
    assert path_count == 3
Example #10
0
def test_flatiter():
    '''Regression test (this used to crash): read text via ``flat_iter()``.

    Collects ``.text`` from every text object of the first page and pins the
    last 20 characters of the concatenated result.
    '''
    doc = Document(res('arxiv1901.10092.pdf'))

    # Reading .text is the operation that should not crash.
    chunks = [
        item.text for item in doc[0].flat_iter()
        if item.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(chunks)[-20:]
    assert tail == 're.com/naturephysics'
Example #11
0
def test_document_edit_meta():
    '''An edited ``Title`` must survive a save and reload.

    The title is changed in memory, the document is saved into a temporary
    directory, and the reloaded copy must report the new title and the same
    number of pages.
    '''
    new_title = 'Best PDF Parsing tool in this world!'

    with tempfile.TemporaryDirectory() as tmpdir:
        doc = Document(res('sample.pdf'))
        doc.meta['Title'] = new_title

        out_path = f'{tmpdir}/temp.pdf'
        doc.save(out_path)

        reloaded = Document(out_path)

        assert len(doc) == len(reloaded)
        assert reloaded.meta['Title'] == new_title
Example #12
0
def test_page_object_chars():
    '''Pin the raw items of the first text object on the page at index 2.

    Iterating the text object must yield exactly the recorded 3-tuples, and
    its ``text`` attribute must read 'Manual:'.
    '''
    expected_items = [
        (77, 0.0, 0.0),
        (97, 11.950385093688965, 0.0),
        (110, 19.92687225341797, 0.0),
        (-1, 10.0, 0.0),
        (117, 27.759897232055664, 0.0),
        (97, 35.736385345458984, 0.0),
        (108, 43.71287155151367, 0.0),
        (58, 46.897727966308594, 0.0),
    ]

    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    page_text_objects = [
        item for item in page if item.type == PageObject.OBJ_TYPE_TEXT
    ]
    first_text = page_text_objects[0]
    assert len(first_text) == 8

    actual_items = list(first_text)
    assert actual_items == expected_items

    assert first_text.text == 'Manual:'
Example #13
0
def test_page_label():
    '''Check the ``label`` attribute of two pages of the sample document.'''
    doc = Document(res('sample.pdf'))

    # (page index, expected label), checked in this order.
    expectations = (
        (2, 'i'),
        (-3, '9'),
    )
    for index, label in expectations:
        assert doc[index].label == label
Example #14
0
from redstork import Document, PageObject, Glyph
from redstork.test import res

# Usage walkthrough: open the sample PDF and print a few document and page
# properties, render the first page, dump its text, and list the fonts.
doc = Document(res('sample.pdf'))

# len() of a document is printed as its page count.
print('Number of pages:', len(doc))

print('MediaBox of the first page is:', doc[0].media_box)

print('Rotation of the first page is:', doc[0].rotation)

# Document metadata is read by key from doc.meta.
print('Document title:', doc.meta['Title'])

# len() of a page is printed as its object count.
print('First page has', len(doc[0]), 'objects')

doc[0].render('page-0.ppm', scale=2)  # render page #1 (index 0) as an image

# Print the text of the first page, one line per text object: each item of a
# text object unpacks to three values, the first of which is looked up in the
# object's font to get something printable.
page = doc[0]
for o in page:
    if o.type == PageObject.OBJ_TYPE_TEXT:
        for code, _, _ in o:
            print(o.font[code], end='')
        print()

# List every font of the document as "<simple name> <font id>".
for fid, font in doc.fonts.items():
    print(font.simple_name, fid)

# Let's generate an SVG file of the first letter on page 1.
# NOTE(review): only the lookup of that first character is visible here; the
# SVG step itself is not part of this excerpt.
text_object = [o for o in page
               if o.type == PageObject.OBJ_TYPE_TEXT][0]  # first text object
charcode, _, _ = text_object[0]  # first character of the first text object