def test_document_save():
    '''Save ``sample.pdf`` to a temporary file and re-open the copy.

    The re-opened document must report the same length as the source
    document and the same ``Title`` metadata entry.
    '''
    with tempfile.TemporaryDirectory() as d:
        source = Document(res('sample.pdf'))
        target_path = f'{d}/temp.pdf'
        source.save(target_path)

        # Re-open while the temporary directory still exists.
        reopened = Document(target_path)
        assert len(source) == len(reopened)
        assert reopened.meta['Title'] == 'Red Stork'
def test_geometry_sane():
    '''Check that every text box lies inside the crop box of its page.

    For each listed sample document, every page is walked with
    ``flat_iter()``.  For each text object, every box produced by
    ``text_geometry_iter()`` must satisfy ``inside(page.crop_box, box)``.

    On failure the assertion message is the tuple
    ``(fname, pageidx, page.crop_box, objcount, bx)``, where ``objcount`` is
    the position of the object in the page's flattened iteration.
    '''
    fnames = [
        'arxiv1901.01387.pdf',
        'arxiv1901.02668.pdf',
        'arxiv1901.09059.pdf',
        'arxiv1901.02066.pdf',
        'arxiv1901.08145v1.pdf',
        'arxiv1901.10092.pdf',
        # NOTE(review): the next two documents are disabled; the reason is
        # not recorded in this file.
        # 'arxiv1901.02527.pdf',
        # 'arxiv1901.08637.pdf',
        'arxiv1901.11067.pdf',
    ]
    for fname in fnames:
        doc = Document(res(fname))
        for pageidx, page in enumerate(doc):
            # enumerate() replaces the hand-maintained counter; it numbers
            # every flattened object, text or not.
            for objcount, obj in enumerate(page.flat_iter()):
                if obj.type != PageObject.OBJ_TYPE_TEXT:
                    continue
                for _, bx in obj.text_geometry_iter():
                    assert inside(page.crop_box, bx), (
                        fname, pageidx, page.crop_box, objcount, bx)
                # Stricter per-object check, currently disabled:
                # assert all(inside(obj.rect, bx) for _, bx in obj.text_geometry_iter()), (fname, pageidx, obj.rect, objcount)
def test_text_geometry_iter():
    '''Pin the output of ``text_geometry_iter()`` for a known text object.

    Uses the first text object found by ``flat_iter()`` on page index 2 of
    ``sample.pdf`` and compares the produced ``(character, box)`` pairs with
    the recorded values exactly.
    '''
    expected = [
        ('M', (489.1150207519531, 619.3770141601562, 500.0322828043718, 629.8354499922134)),
        ('a', (501.0654058456421, 619.0477800783701, 508.73936194344424, 627.1105125918984)),
        ('n', (509.0418930053711, 619.3770141601562, 516.0293610177469, 627.1105125918984)),
        ('u', (516.8749179840088, 619.0477800783701, 523.7888337015174, 626.8933581975289)),
        ('a', (524.8514060974121, 619.0477800783701, 532.5253621952143, 627.1105125918984)),
        ('l', (532.8278923034668, 619.3770141601562, 535.0099437178578, 629.8354499922134)),
        (':', (536.0127487182617, 619.3770141601562, 539.0844326515216, 626.8933581975289)),
    ]

    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    flat_text_objects = [
        candidate
        for candidate in page.flat_iter()
        if candidate.type == PageObject.OBJ_TYPE_TEXT
    ]
    first_text = flat_text_objects[0]
    assert len(first_text) == 8

    geometry = list(first_text.text_geometry_iter())
    assert geometry == expected
def test_unicode_map_multichar3():
    '''Read the text of every text object on the first page of
    ``arxiv1901.09059.pdf`` and check the tail of the joined result.
    '''
    doc = Document(res('arxiv1901.09059.pdf'))
    # Accessing ``.text`` on each text object should not crash.
    pieces = [
        obj.text
        for obj in doc[0]
        if obj.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(pieces)[-20:]
    # NOTE(review): the expected literal is 17 characters long while the
    # slice keeps up to 20 -- confirm the literal's whitespace is intact.
    assert tail == 'd applicability. '
def test_unicode_map_multichar2():
    '''Read the text of every text object on the first page of
    ``arxiv1901.08145v1.pdf`` and check the last 20 characters of the
    joined result.
    '''
    doc = Document(res('arxiv1901.08145v1.pdf'))
    # Accessing ``.text`` on each text object should not crash.
    pieces = [
        obj.text
        for obj in doc[0]
        if obj.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(pieces)[-20:]
    assert tail == 'ing-centerpositionX,'
def test_page_smoke():
    '''The first page of ``sample.pdf`` reports the expected media and crop boxes.'''
    doc = Document(res('sample.pdf'))
    first_page = doc[0]
    expected_box = (0., 0., 612., 792.)

    # Both boxes are expected to be the same 4-tuple for this page.
    assert first_page.media_box == expected_box
    assert first_page.crop_box == expected_box
def test_document():
    '''Basic checks on ``sample.pdf``: length, title metadata and ``str()``.'''
    doc = Document(res('sample.pdf'))

    page_count = len(doc)
    assert page_count == 15

    title = doc.meta['Title']
    assert title == 'Red Stork'

    # Converting the document to a string should not crash.
    str(doc)
def test_rotated_text():
    '''Every text box of one specific object must lie inside that object's rect.

    The object is item 296 of the flattened iteration of the first page of
    ``arxiv1901.02066.pdf``.
    '''
    doc = Document(res('arxiv1901.02066.pdf'))
    first_page = doc[0]

    # Keep the full object list referenced while the selected object is used.
    flat_objects = list(first_page.flat_iter())
    target = flat_objects[296]

    bounds = target.rect
    geometry = list(target.text_geometry_iter())
    assert all(inside(bounds, box) for _, box in geometry)
def test_page_object_smoke():
    '''Page index 2 of ``sample.pdf`` holds 19 objects: 16 text and 3 path.'''
    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    # The page is iterated once per object type, as two separate passes.
    text_count = sum(1 for item in page if item.type == PageObject.OBJ_TYPE_TEXT)
    path_count = sum(1 for item in page if item.type == PageObject.OBJ_TYPE_PATH)

    assert text_count == 16
    assert path_count == 3
def test_flatiter():
    '''Regression test: this flattened iteration used to crash.

    Collects the text of every text object yielded by ``flat_iter()`` on the
    first page of ``arxiv1901.10092.pdf`` and checks the last 20 characters
    of the joined result.
    '''
    doc = Document(res('arxiv1901.10092.pdf'))
    # Accessing ``.text`` on each text object should not crash.
    pieces = [
        obj.text
        for obj in doc[0].flat_iter()
        if obj.type == PageObject.OBJ_TYPE_TEXT
    ]
    tail = ''.join(pieces)[-20:]
    assert tail == 're.com/naturephysics'
def test_document_edit_meta():
    '''Change the ``Title`` metadata, save, and verify the saved copy.

    The re-opened document must report the same length as the edited
    document and carry the new title.
    '''
    new_title = 'Best PDF Parsing tool in this world!'
    with tempfile.TemporaryDirectory() as d:
        source = Document(res('sample.pdf'))
        source.meta['Title'] = new_title

        target_path = f'{d}/temp.pdf'
        source.save(target_path)

        # Re-open while the temporary directory still exists.
        reopened = Document(target_path)
        assert len(source) == len(reopened)
        assert reopened.meta['Title'] == new_title
def test_page_object_chars():
    '''Pin the raw items and the text of the first text object on page index 2.

    Iterating the text object yields eight 3-tuples (one of which has the
    code ``-1``), while its ``text`` attribute is ``'Manual:'``.
    '''
    expected_items = [
        (77, 0.0, 0.0),
        (97, 11.950385093688965, 0.0),
        (110, 19.92687225341797, 0.0),
        (-1, 10.0, 0.0),
        (117, 27.759897232055664, 0.0),
        (97, 35.736385345458984, 0.0),
        (108, 43.71287155151367, 0.0),
        (58, 46.897727966308594, 0.0),
    ]

    doc = Document(res('sample.pdf'))
    page = doc[2]
    assert len(page) == 19

    page_text_objects = [
        candidate for candidate in page
        if candidate.type == PageObject.OBJ_TYPE_TEXT
    ]
    first_text = page_text_objects[0]
    assert len(first_text) == 8

    actual_items = list(first_text)
    assert actual_items == expected_items

    assert first_text.text == 'Manual:'
def test_page_label():
    '''Page labels of ``sample.pdf`` are exposed through ``page.label``.'''
    doc = Document(res('sample.pdf'))

    # (page index, expected label), checked in this order.
    expectations = (
        (2, 'i'),
        (-3, '9'),
    )
    for index, label in expectations:
        assert doc[index].label == label
# Example script: open a sample PDF, print some document and page properties,
# render the first page, dump its text and list the document's fonts.
# NOTE(review): Glyph is not used in the visible part of this script;
# presumably the SVG generation that follows needs it -- confirm.
from redstork import Document, PageObject, Glyph
from redstork.test import res

# res() presumably resolves the path of a bundled test resource -- confirm.
doc = Document(res('sample.pdf'))

# Document-level and first-page properties.
print('Number of pages:', len(doc))
print('MediaBox of the first page is:', doc[0].media_box)
print('Rotation of the first page is:', doc[0].rotation)
print('Document title:', doc.meta['Title'])
print('First page has', len(doc[0]), 'objects')

doc[0].render('page-0.ppm', scale=2)  # render page #1 as image

# Print the text objects of the first page. Each item of a text object
# unpacks to a 3-tuple; its first element is looked up in the object's font
# and the result is printed without a trailing newline.
page = doc[0]
for o in page:
    if o.type == PageObject.OBJ_TYPE_TEXT:
        for code, _, _ in o:
            print(o.font[code], end='')
        print()  # end the output line after each text object

# List the document's fonts: simple name followed by the key in doc.fonts.
for fid, font in doc.fonts.items():
    print(font.simple_name, fid)

# Let's generate an SVG file of the first letter on page 1.
text_object = [o for o in page if o.type == PageObject.OBJ_TYPE_TEXT][0]  # first text object
charcode, _, _ = text_object[0]  # first character of the first text object