def test_orderedgroup_export_order():
    """
    Indexed children of an OrderedGroup come back sorted by @index, both
    from ``get_AllIndexed`` and after a serialize/parse round trip.

    See https://github.com/OCR-D/core/issues/475
    """
    in_order = list(range(0, 22))

    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    # serialized here but not compared against anything in this test
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [child.index for child in children] == in_order

    # mix up the indexes
    children[0].index = 11
    children[11].index = 3
    children[3].index = 0
    # the list fetched earlier keeps its positions, only the @index values moved
    assert [child.index for child in children] == [
        11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    # a fresh query is sorted by @index again
    assert [child.index for child in og.get_AllIndexed()] == in_order
    assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType

    # serialize and make sure the correct order was serialized
    serialized = to_xml(pcgts).encode('utf8')
    new_pcgts = parseString(serialized, silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [child.index for child in new_og.get_AllIndexed()] == in_order
    # serialized here but not compared against xml_before in this test
    xml_after = to_xml(new_og)
def test_issue_269(self):
    """
    @conf is parsed as str but should be float
    https://github.com/OCR-D/core/issues/269
    """
    text_equiv = self.pcgts.get_Page().get_TextRegion()[0].get_TextEquiv()[0]

    # GIGO: the getter hands back the same type that was put in
    text_equiv.set_conf(1.0)
    self.assertEqual(type(text_equiv.get_conf()), float)
    text_equiv.set_conf('1.0')
    self.assertEqual(type(text_equiv.get_conf()), str)

    # test with parseString that @conf in TextEquiv won't throw an error
    parseString(simple_page, silence=True)
def test_simpletypes(self):
    """Check the Python types of parsed simple-typed attributes (imageWidth, conf)."""
    pcgts = parseString(simple_page, silence=True)
    page = pcgts.get_Page()
    self.assertIsInstance(page.imageWidth, int)

    first_word = page.get_TextRegion()[0].get_TextLine()[0].get_Word()[0]
    el = first_word.get_TextEquiv()[0]
    self.assertIsInstance(el.conf, float)

    # XXX no validation on setting attributes :-(
    # c.f. https://www.davekuhlman.org/generateDS.html#simpletype
    # el.set_conf('2.0987')
    # self.assertTrue(isinstance(el.conf, float))

    # NOTE(review): per the XXX note above the setter does not validate, so
    # the re-serialisation is kept inside the guarded block as well.
    with self.assertRaisesRegex(TypeError, ''):
        el.set_conf('I AM NOT A FLOAT DEAL WITH IT')
        parseString(to_xml(pcgts).encode('utf8'))
def test_get_UnorderdGroupChildren(self):
    """The first indexed UnorderedGroup of the sample page reports one child."""
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)

    ordered_group = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    ug = ordered_group.get_UnorderedGroupIndexed()[0]
    self.assertEqual(len(ug.get_UnorderedGroupChildren()), 1)
def setUp(self):
    """Read the FAULTY_GLYPHS fixture and keep both the raw content and the parsed tree."""
    fixture_path = assets.path_to(
        'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
    with open(fixture_path, 'rb') as fixture_file:
        raw = fixture_file.read()
    # despite the attribute name this is bytes: the file is opened in binary mode
    self.xml_as_str = raw
    self.pcgts = parseString(raw, silence=True)
def test_image_from_page_basic(self):
    """
    ``image_from_page`` on PHYS_0017 reports the features 'binarized,clipped',
    both with an explicit feature selector/filter and with the defaults.
    """
    # Relative paths below ('mets.xml', 'TEMP1/...') are resolved against the
    # workspace directory, so everything runs inside pushd_popd.
    with pushd_popd(assets.path_to('gutachten/data')):
        ws = self.resolver.workspace_from_url('mets.xml')
        with open('TEMP1/PAGE_TEMP1.xml', 'r') as f:
            pcgts = parseString(f.read().encode('utf8'), silence=True)

        # Only the info dict is inspected; image and EXIF data are discarded.
        _, info, _ = ws.image_from_page(
            pcgts.get_Page(),
            page_id='PHYS_0017',
            feature_selector='clipped',
            feature_filter='cropped')
        # assertEqual instead of the deprecated alias assertEquals, which was
        # removed from unittest in Python 3.12
        self.assertEqual(info['features'], 'binarized,clipped')

        _, info, _ = ws.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
        self.assertEqual(info['features'], 'binarized,clipped')
def test_to_xml_unicode_nsprefix(self):
    """Input without namespace prefixes is serialized by to_xml with the 'pc:' prefix."""
    fixture_path = assets.path_to(
        'kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
    with open(fixture_path, 'rb') as fixture_file:
        from_xml = fixture_file.read()

    # the source document uses unprefixed element names
    source_text = from_xml.decode('utf-8')
    self.assertIn('<Unicode>', source_text, 'without NS prefix')
    self.assertIn('<Created', source_text, 'without NS prefix')

    # after parsing and re-serializing, the same elements carry the prefix
    as_xml = to_xml(parseString(from_xml, silence=True))
    self.assertIn('<pc:Unicode>', as_xml, 'with NS prefix')
    self.assertIn('<pc:Created>', as_xml, 'with NS prefix')
def test_serialize_no_empty_readingorder():
    """
    An empty ReadingOrder set on a page is gone after a to_xml/parseString
    round trip.

    https://github.com/OCR-D/core/issues/602
    """
    image_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
    ocrd_file = create_ocrd_file_with_defaults(url=image_path)
    pcgts = page_from_image(ocrd_file)

    # attach an empty reading order; in memory it is present
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()

    # round trip through XML
    serialized = to_xml(pcgts, skip_declaration=True)
    pcgts = parseString(serialized)
    assert not pcgts.get_Page().get_ReadingOrder()
def test_delete_region():
    """Deleting from the list returned by get_TextRegion removes the region from the page."""
    # arrange
    page = parseString(simple_page, silence=True).get_Page()
    assert len(page.get_TextRegion()) == 1

    # act
    del page.get_TextRegion()[0]

    # assert
    assert len(page.get_TextRegion()) == 0
def test_get_unorderd_group_children():
    """The first indexed UnorderedGroup of the sample page reports one child."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        raw = page_file.read().encode('utf8')
    pcgts = parseString(raw, silence=True)

    # act
    ordered_group = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    ug = ordered_group.get_UnorderedGroupIndexed()[0]

    # assert
    assert len(ug.get_UnorderedGroupChildren()) == 1
def test_get_all_regions_invalid_depth_raises_exeption():
    """get_AllRegions(depth=-1) raises with a message naming the bad argument."""
    expected_message = "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"

    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(depth=-1)

    # assert
    assert expected_message in str(exc.value)
def test_get_AllIndexed_classes(self):
    """get_AllIndexed(classes=[...]) returns the expected number of entries per class."""
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # (class filter, expected number of indexed entries)
    expectations = (
        ('RegionRef', 17),
        ('OrderedGroup', 3),
        ('UnorderedGroup', 2),
    )
    for class_name, expected in expectations:
        self.assertEqual(len(og.get_AllIndexed(classes=[class_name])), expected)
def test_empty_groups_to_regionrefindexed(self):
    """
    Corollary of https://github.com/OCR-D/core/issues/475

    After a serialize/parse round trip, the entries at positions 1 and 21
    of the sample OrderedGroup come back as RegionRefIndexed.
    """
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)

    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    self.assertIsInstance(children[1], OrderedGroupIndexedType)
    self.assertIsInstance(children[21], UnorderedGroupIndexedType)

    # empty all the elements in the first OrderedGroupIndexed
    children[1].set_RegionRefIndexed([])

    # serialize and parse to see the empty groups converted
    round_tripped = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    og = round_tripped.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    self.assertIsInstance(children[1], RegionRefIndexedType)
    self.assertIsInstance(children[21], RegionRefIndexedType)
def test_get_all_regions_invalid_order_raises_exeption():
    """get_AllRegions(order='random') raises with a message listing the valid orders."""
    expected_message = "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'"

    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(order='random')

    # assert
    assert expected_message in str(exc.value)
def test_get_all_indexed_classes():
    """get_AllIndexed(classes=[...]) returns the expected number of entries per class."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        raw = page_file.read().encode('utf8')
    pcgts = parseString(raw, silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    def count(class_name):
        return len(og.get_AllIndexed(classes=[class_name]))

    # assert
    assert count('RegionRef') == 17
    assert count('OrderedGroup') == 3
    assert count('UnorderedGroup') == 2
def test_get_AllAlternativeImages():
    """get_AllAlternativeImages honours its level switches and returns AlternativeImageType objects."""
    expected_page_level = [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
    ]

    page_path = assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    page = pcgts.get_Page()

    # every level switched off yields nothing
    assert page.get_AllAlternativeImages(page=False, region=False, line=False) == []

    # page level only
    page_level = page.get_AllAlternativeImages(page=True, region=False, line=False)
    assert [image.filename for image in page_level] == expected_page_level

    # default arguments
    assert isinstance(page.get_AllAlternativeImages()[0], AlternativeImageType)
def test_extend_AllIndexed_validate_continuity(self):
    """extend_AllIndexed(..., validate_continuity=True) rejects an @index that is already used."""
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    with self.assertRaisesRegex(Exception, "@index already used: 1"):
        # the refs are built inside the guarded block, as in the call itself
        colliding_refs = [
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ]
        og.extend_AllIndexed(colliding_refs, validate_continuity=True)
def test_extend_AllIndexed_no_validation(self):
    """Without validation, refs added via extend_AllIndexed end up with the indexes 22, 23, 24."""
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # the requested indexes (3, 2, 1) are not the ones asserted below
    new_refs = [
        RegionRefIndexedType(index=3, id='r3'),
        RegionRefIndexedType(index=2, id='r2'),
        RegionRefIndexedType(index=1, id='r1'),
    ]
    og.extend_AllIndexed(new_refs)

    all_indexes = [ref.index for ref in og.get_RegionRefIndexed()]
    self.assertEqual(all_indexes[-3:], [22, 23, 24])
def test_to_xml_unicode_nsprefix():
    """see https://github.com/OCR-D/core/pull/474#issuecomment-621477590"""
    # arrange
    fixture_path = assets.path_to(
        'kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
    with open(fixture_path, 'rb') as fixture_file:
        from_xml = fixture_file.read()

    # assert: the source document uses unprefixed element names
    source_text = from_xml.decode('utf-8')
    assert '<Unicode>' in source_text, 'without NS prefix'
    assert '<Created' in source_text, 'without NS prefix'

    # assert: the re-serialized document carries the 'pc:' prefix
    as_xml = to_xml(parseString(from_xml, silence=True))
    assert '<pc:Unicode>' in as_xml, 'with NS prefix'
    assert '<pc:Created>' in as_xml, 'with NS prefix'
def test_extend_all_indexed_validate_continuity():
    """extend_AllIndexed(..., validate_continuity=True) rejects an @index that is already used."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    with pytest.raises(Exception) as index_exc:
        # the refs are built inside the guarded block, as in the call itself
        colliding_refs = [
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ]
        og.extend_AllIndexed(colliding_refs, validate_continuity=True)

    # assert
    assert "@index already used: 1" in str(index_exc.value)
def test_image_from_page_basic(workspace_gutachten_data):
    """
    ``image_from_page`` on PHYS_0017 reports the features 'binarized,clipped',
    both with an explicit feature selector/filter and with the defaults.
    """
    expected_features = 'binarized,clipped'

    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)

    # act + assert: explicit selector and filter
    result = workspace_gutachten_data.image_from_page(
        pcgts.get_Page(),
        page_id='PHYS_0017',
        feature_selector='clipped',
        feature_filter='cropped')
    _, info, _ = result
    assert info['features'] == expected_features

    # act + assert: defaults
    result = workspace_gutachten_data.image_from_page(
        pcgts.get_Page(), page_id='PHYS_0017')
    _, info, _ = result
    assert info['features'] == expected_features
def test_get_all_indexed_index_sort():
    """index_sort controls the order get_AllIndexed returns; sort_AllIndexed reorders the stored list."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    def unordered_indexes(index_sort):
        groups = og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=index_sort)
        return [group.index for group in groups]

    # act
    unogs = og.get_UnorderedGroupIndexed()

    # assert
    assert [group.index for group in unogs] == [20, 21]

    # swap the two index values without moving the elements
    unogs[0].index = 21
    unogs[1].index = 20
    assert unordered_indexes(True) == [20, 21]
    assert unordered_indexes(False) == [21, 20]

    og.sort_AllIndexed()
    assert unordered_indexes(False) == [20, 21]
def test_get_all_alternative_image_paths():
    """get_AllAlternativeImagePaths returns the expected paths/counts per hierarchy level."""
    expected_page_level = [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
    ]

    # arrange
    page_path = assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)

    def paths(**levels):
        return pcgts.get_AllAlternativeImagePaths(**levels)

    # assert
    assert paths(page=False, region=False, line=False) == []
    assert paths(page=True, region=False, line=False) == expected_page_level
    assert len(paths(page=True, region=True, line=False)) == 12
    # NOTE(review): identical to the previous assertion; possibly a different
    # flag combination was intended here. Confirm.
    assert len(paths(page=True, region=True, line=False)) == 12
    assert len(paths(page=True, region=True, line=True)) == 36
def test_all_regions_with_reading_order(self):
    """
    get_AllRegions with an explicit ``order``: invalid arguments raise, valid
    combinations of order/depth/classes return the expected region counts.

    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797
    """
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()

    # invalid arguments are rejected with a descriptive message
    bad_order = "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'"
    with self.assertRaisesRegex(Exception, bad_order):
        pg.get_AllRegions(order='random')
    bad_depth = "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"
    with self.assertRaisesRegex(Exception, bad_depth):
        pg.get_AllRegions(depth=-1)

    # (keyword arguments for get_AllRegions, expected number of regions)
    expectations = [
        ({'order': 'reading-order-only'}, 40),
        ({'order': 'reading-order-only', 'depth': 1}, 20),
        ({'order': 'reading-order-only', 'depth': 2}, 40),
        ({'order': 'reading-order', 'depth': 0}, 65),
        ({'order': 'reading-order', 'depth': 1}, 45),
        ({'order': 'reading-order', 'depth': 2}, 65),
        ({'classes': ['Table'], 'order': 'reading-order'}, 3),
        ({'classes': ['Text'], 'order': 'reading-order'}, 37),
        ({'classes': ['Text'], 'order': 'reading-order', 'depth': 1}, 17),
    ]
    for kwargs, expected in expectations:
        self.assertEqual(len(pg.get_AllRegions(**kwargs)), expected, kwargs)
def test_image_feature_selectoro(self):
    """image_from_page reports the expected feature string for a given selector/filter."""
    # relative paths below are resolved against the sample workspace directory
    with pushd_popd('tests/data/sample-features'):
        ws = self.resolver.workspace_from_url('mets.xml')
        with open('image_features.page.xml', 'r') as page_file:
            pcgts = parseString(page_file.read().encode('utf8'), silence=True)

        # richest feature set is not last:
        _, info, _ = ws.image_from_page(
            pcgts.get_Page(),
            page_id='page1',
            feature_selector='dewarped')
        # recropped because foo4 contains cropped+deskewed but not recropped yet:
        self.assertEqual(
            info['features'],
            'cropped,dewarped,binarized,despeckled,deskewed,recropped')

        # richest feature set is also last:
        _, info, _ = ws.image_from_page(
            pcgts.get_Page(),
            page_id='page1',
            feature_selector='dewarped',
            feature_filter='binarized')
        # no deskewing here, thus no recropping:
        self.assertEqual(info['features'], 'cropped,dewarped,despeckled')
def test_image_feature_selectoro(workspace_sample_features):
    """image_from_page reports the expected feature string for a given selector/filter."""
    # arrange
    workspace_dir = str(workspace_sample_features.directory)
    page_path = join(workspace_dir, 'image_features.page.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'))

    # richest feature set is not last:
    result = workspace_sample_features.image_from_page(
        pcgts.get_Page(),
        page_id='page1',
        feature_selector='dewarped')
    _, info, _ = result
    # recropped because foo4 contains cropped+deskewed but not recropped yet:
    # NOTE(review): the expected string below does not contain 'recropped',
    # which does not match the comment above. Confirm which one is current.
    assert info['features'] == 'cropped,dewarped,binarized,despeckled,deskewed'

    # richest feature set is also last:
    result = workspace_sample_features.image_from_page(
        pcgts.get_Page(),
        page_id='page1',
        feature_selector='dewarped',
        feature_filter='binarized')
    _, info, _ = result
    # no deskewing here, thus no recropping:
    assert info['features'] == 'cropped,dewarped,despeckled'
def test_all_regions_with_reading_order():
    """
    get_AllRegions with an explicit ``order`` returns the expected region
    counts for several order/depth/classes combinations.

    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797
    """
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()

    # (keyword arguments for get_AllRegions, expected number of regions)
    expectations = [
        ({'order': 'reading-order-only'}, 40),
        ({'order': 'reading-order-only', 'depth': 1}, 20),
        ({'order': 'reading-order-only', 'depth': 2}, 40),
        ({'order': 'reading-order', 'depth': 0}, 65),
        ({'order': 'reading-order', 'depth': 1}, 45),
        ({'order': 'reading-order', 'depth': 2}, 65),
        ({'classes': ['Table'], 'order': 'reading-order'}, 3),
        ({'classes': ['Text'], 'order': 'reading-order'}, 37),
        ({'classes': ['Text'], 'order': 'reading-order', 'depth': 1}, 17),
    ]

    # assert
    for kwargs, expected in expectations:
        assert len(pg.get_AllRegions(**kwargs)) == expected, kwargs
def get_lines(fname, flist=False):
    """Return the text lines found in ``fname``.

    The file is first tried as PAGE-XML (lines extracted with
    ``page_get_lines``); if anything goes wrong in that attempt, the file's
    own lines, stripped of trailing CR/LF, are used instead.

    With ``flist`` set, those lines are treated as file names, and the
    result is the first line of each named file, read with ``readline`` and
    therefore not stripped.
    """
    with open(fname, 'r') as fd:
        rawlines = [raw.rstrip('\r\n') for raw in fd]

    try:
        # PAGE-XML case
        if rawlines and rawlines[0].startswith('<?xml'):
            # Drop the XML declaration before parsing (presumably the parser
            # rejects str input with an encoding declaration; confirm).
            # The change to rawlines[0] persists if the parse then fails.
            declared = rawlines[0]
            rawlines[0] = declared[declared.index('?>') + 2:]
        lines = page_get_lines(parseString(''.join(rawlines)))
    except Exception:
        # plaintext case
        lines = rawlines

    if not flist:
        return lines

    # ocropy style (e.g. -F <(ls -1 *.gt.txt) <(ls -1 *.ocr.txt))
    first_lines = []
    for listed_name in lines:
        with open(listed_name, 'r') as listed_fd:
            first_lines.append(listed_fd.readline())
    return first_lines
def test_all_regions_without_reading_order(self):
    """
    get_AllRegions without an ``order`` argument returns the expected region
    counts for several depth/classes combinations.

    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797
    """
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()

    # (keyword arguments for get_AllRegions, expected number of regions)
    expectations = [
        ({}, 65),
        ({'depth': 0}, 65),
        ({'depth': 1}, 45),
        ({'depth': 2}, 65),
        ({'depth': 3}, 65),
        ({'classes': ['Separator']}, 25),
        ({'classes': ['Table']}, 3),
        ({'classes': ['Text']}, 37),
        ({'classes': ['Text'], 'depth': 1}, 17),
        ({'classes': ['Text'], 'depth': 2}, 37),
    ]
    for kwargs, expected in expectations:
        self.assertEqual(len(pg.get_AllRegions(**kwargs)), expected, kwargs)
def test_get_AllIndexed_index_sort(self):
    """index_sort controls the order get_AllIndexed returns; sort_AllIndexed reorders the stored list."""
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as page_file:
        pcgts = parseString(page_file.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    def unordered_indexes(index_sort):
        groups = og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=index_sort)
        return [group.index for group in groups]

    unogs = og.get_UnorderedGroupIndexed()
    self.assertEqual([group.index for group in unogs], [20, 21])

    # swap the two index values without moving the elements
    unogs[0].index = 21
    unogs[1].index = 20
    self.assertEqual(unordered_indexes(True), [20, 21])
    self.assertEqual(unordered_indexes(False), [21, 20])

    og.sort_AllIndexed()
    self.assertEqual(unordered_indexes(False), [20, 21])