def test_orderedgroup_export_order():
    """
    Indexed reading-order children come back sorted by @index, both when
    queried directly and after a serialize/parse round trip.

    See https://github.com/OCR-D/core/issues/475
    """
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    in_order = list(range(22))
    shuffled = [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [child.index for child in children] == in_order
    # mix up the indexes (positions 0, 11 and 3 trade their values)
    children[0].index, children[11].index, children[3].index = 11, 3, 0
    assert [child.index for child in children] == shuffled
    # a fresh query is expected to be ordered by index again
    assert [child.index for child in og.get_AllIndexed()] == in_order
    assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType
    # serialize and make sure the correct order was serialized
    roundtrip_xml = to_xml(pcgts).encode('utf8')
    new_pcgts = parseString(roundtrip_xml, silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [child.index for child in new_og.get_AllIndexed()] == in_order

    # NOTE(review): xml_before and xml_after are never compared in the code
    # visible here -- confirm whether a final assertion is missing.
    xml_after = to_xml(new_og)
Beispiel #2
0
 def test_issue_269(self):
     """
     @conf is parsed as str but should be float
     https://github.com/OCR-D/core/issues/269

     Asserts that the getter hands back a value of exactly the type that was
     passed to the setter, then checks that parsing a page with @conf on a
     TextEquiv does not raise.
     """
     def first_text_equiv():
         # Walk the document anew on every access.
         return self.pcgts.get_Page().get_TextRegion()[0].get_TextEquiv()[0]

     # GIGO
     for stored, expected_type in ((1.0, float), ('1.0', str)):
         first_text_equiv().set_conf(stored)
         self.assertEqual(type(first_text_equiv().get_conf()), expected_type)
     # test with parseString that @conf in TextEquiv won't throw an error
     parseString(simple_page, silence=True)
Beispiel #3
0
 def test_simpletypes(self):
     """
     Parsed simple-typed attributes carry Python types (int, float), and a
     non-numeric @conf leads to a TypeError once the document is
     serialized and parsed again.
     """
     pcgts = parseString(simple_page, silence=True)
     page = pcgts.get_Page()
     self.assertIsInstance(page.imageWidth, int)
     first_word = page.get_TextRegion()[0].get_TextLine()[0].get_Word()[0]
     el = first_word.get_TextEquiv()[0]
     self.assertIsInstance(el.conf, float)
     # XXX no validation on setting attributes :-(
     # c.f. https://www.davekuhlman.org/generateDS.html#simpletype
     #  el.set_conf('2.0987')
     #  self.assertTrue(isinstance(el.conf, float))
     with self.assertRaisesRegex(TypeError, ''):
         el.set_conf('I AM NOT A FLOAT DEAL WITH IT')
         roundtrip_xml = to_xml(pcgts).encode('utf8')
         parseString(roundtrip_xml)
Beispiel #4
0
 def test_get_UnorderdGroupChildren(self):
     """The first indexed UnorderedGroup of the fixture reports exactly one child."""
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     ordered_group = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
     ug = ordered_group.get_UnorderedGroupIndexed()[0]
     self.assertEqual(len(ug.get_UnorderedGroupChildren()), 1)
Beispiel #5
0
 def setUp(self):
     """
     Read the FAULTY_GLYPHS fixture and keep both the raw content
     (``self.xml_as_str``, bytes because the file is opened in binary mode)
     and the parsed document (``self.pcgts``).
     """
     fixture = assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
     with open(fixture, 'rb') as f:
         self.xml_as_str = f.read()
     self.pcgts = parseString(self.xml_as_str, silence=True)
Beispiel #6
0
 def test_image_from_page_basic(self):
     """
     ``image_from_page`` reports the features 'binarized,clipped' for page
     PHYS_0017, both with an explicit feature selector/filter and without.
     """
     with pushd_popd(assets.path_to('gutachten/data')):
         ws = self.resolver.workspace_from_url('mets.xml')
         with open('TEMP1/PAGE_TEMP1.xml', 'r') as f:
             pcgts = parseString(f.read().encode('utf8'), silence=True)
         # Only the info dict is inspected; image and EXIF data are ignored.
         _, info, _ = ws.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped')
         # assertEqual instead of the deprecated alias assertEquals, which
         # was removed in Python 3.12.
         self.assertEqual(info['features'], 'binarized,clipped')
         _, info, _ = ws.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
         self.assertEqual(info['features'], 'binarized,clipped')
Beispiel #7
0
 def test_to_xml_unicode_nsprefix(self):
     """
     The fixture uses unprefixed element names; after parsing and
     re-serializing, the same elements carry the ``pc:`` prefix.
     """
     fixture = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
     with open(fixture, 'rb') as f:
         from_xml = f.read()
     decoded = from_xml.decode('utf-8')
     self.assertIn('<Unicode>', decoded, 'without NS prefix')
     self.assertIn('<Created', decoded, 'without NS prefix')
     as_xml = to_xml(parseString(from_xml, silence=True))
     self.assertIn('<pc:Unicode>', as_xml, 'with NS prefix')
     self.assertIn('<pc:Created>', as_xml, 'with NS prefix')
def test_serialize_no_empty_readingorder():
    """
    An empty ReadingOrder set on a page is gone after a serialize/parse
    round trip.

    https://github.com/OCR-D/core/issues/602
    """
    image_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
    ocrd_file = create_ocrd_file_with_defaults(url=image_path)
    pcgts = page_from_image(ocrd_file)
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()
    serialized = to_xml(pcgts, skip_declaration=True)
    pcgts = parseString(serialized)
    assert not pcgts.get_Page().get_ReadingOrder()
def test_delete_region():
    """Deleting the first entry of the page's TextRegion list leaves no text regions."""
    # arrange
    pcgts = parseString(simple_page, silence=True)
    region_count_before = len(pcgts.get_Page().get_TextRegion())
    assert region_count_before == 1

    # act
    text_regions = pcgts.get_Page().get_TextRegion()
    del text_regions[0]

    # assert
    region_count_after = len(pcgts.get_Page().get_TextRegion())
    assert region_count_after == 0
def test_get_unorderd_group_children():
    """The first indexed UnorderedGroup of the fixture reports exactly one child."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    ordered_group = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    ug = ordered_group.get_UnorderedGroupIndexed()[0]

    # assert
    assert len(ug.get_UnorderedGroupChildren()) == 1
def test_get_all_regions_invalid_depth_raises_exeption():
    """``get_AllRegions`` raises with a descriptive message for ``depth=-1``."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()
    expected_message = "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(depth=-1)

    # assert
    assert expected_message in str(exc.value)
Beispiel #12
0
 def test_get_AllIndexed_classes(self):
     """``get_AllIndexed(classes=[...])`` returns the expected number of children per class."""
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
     expected_counts = (
         ('RegionRef', 17),
         ('OrderedGroup', 3),
         ('UnorderedGroup', 2),
     )
     for class_name, count in expected_counts:
         self.assertEqual(len(og.get_AllIndexed(classes=[class_name])), count)
Beispiel #13
0
 def test_empty_groups_to_regionrefindexed(self):
     """
     Indexed groups without children are expected to come back as
     RegionRefIndexed after a serialize/parse round trip.

     Corollary, see https://github.com/OCR-D/core/issues/475
     """
     def indexed_children(doc):
         return doc.get_Page().get_ReadingOrder().get_OrderedGroup().get_AllIndexed()

     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     children = indexed_children(pcgts)
     self.assertIsInstance(children[1], OrderedGroupIndexedType)
     self.assertIsInstance(children[21], UnorderedGroupIndexedType)
     # empty all the elements in the first OrderedGroupIndexed
     children[1].set_RegionRefIndexed([])
     # serialize and parse to see the empty group converted
     pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
     children = indexed_children(pcgts)
     self.assertIsInstance(children[1], RegionRefIndexedType)
     self.assertIsInstance(children[21], RegionRefIndexedType)
def test_get_all_regions_invalid_order_raises_exeption():
    """``get_AllRegions`` raises with a descriptive message for ``order='random'``."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()
    expected_message = "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'"

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(order='random')

    # assert
    assert expected_message in str(exc.value)
def test_get_all_indexed_classes():
    """``get_AllIndexed(classes=[...])`` returns the expected number of children per class."""
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    expected_counts = (
        ('RegionRef', 17),
        ('OrderedGroup', 3),
        ('UnorderedGroup', 2),
    )

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # assert
    for class_name, count in expected_counts:
        assert len(og.get_AllIndexed(classes=[class_name])) == count
def test_get_AllAlternativeImages():
    """
    ``get_AllAlternativeImages`` honours the page/region/line switches and
    returns AlternativeImageType objects.
    """
    page_path = assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml')
    expected_page_level = [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    page = pcgts.get_Page()
    # with every level switched off nothing is returned
    assert page.get_AllAlternativeImages(page=False, region=False, line=False) == []
    page_level = page.get_AllAlternativeImages(page=True, region=False, line=False)
    assert [image.filename for image in page_level] == expected_page_level
    assert isinstance(page.get_AllAlternativeImages()[0], AlternativeImageType)
Beispiel #17
0
 def test_extend_AllIndexed_validate_continuity(self):
     """
     With ``validate_continuity=True``, extending by entries whose @index
     is already taken raises "@index already used: 1".
     """
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
     with self.assertRaisesRegex(Exception, "@index already used: 1"):
         additions = [
             RegionRefIndexedType(index=3, id='r3'),
             RegionRefIndexedType(index=2, id='r2'),
             RegionRefIndexedType(index=1, id='r1'),
         ]
         og.extend_AllIndexed(additions, validate_continuity=True)
Beispiel #18
0
 def test_extend_AllIndexed_no_validation(self):
     """
     Without continuity validation, the three appended refs end up with
     the indexes 22, 23 and 24 regardless of the indexes they were given.
     """
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
     additions = [
         RegionRefIndexedType(index=3, id='r3'),
         RegionRefIndexedType(index=2, id='r2'),
         RegionRefIndexedType(index=1, id='r1'),
     ]
     og.extend_AllIndexed(additions)
     all_indexes = [ref.index for ref in og.get_RegionRefIndexed()]
     self.assertEqual(all_indexes[-3:], [22, 23, 24])
def test_to_xml_unicode_nsprefix():
    """
    The fixture uses unprefixed element names; after parsing and
    re-serializing, the same elements carry the ``pc:`` prefix.

    see https://github.com/OCR-D/core/pull/474#issuecomment-621477590
    """

    # arrange
    fixture = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
    with open(fixture, 'rb') as f:
        from_xml = f.read()
    decoded = from_xml.decode('utf-8')

    # assert
    assert '<Unicode>' in decoded, 'without NS prefix'
    assert '<Created' in decoded, 'without NS prefix'
    as_xml = to_xml(parseString(from_xml, silence=True))
    assert '<pc:Unicode>' in as_xml, 'with NS prefix'
    assert '<pc:Created>' in as_xml, 'with NS prefix'
def test_extend_all_indexed_validate_continuity():
    """
    With ``validate_continuity=True``, extending by entries whose @index
    is already taken raises "@index already used: 1".
    """
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    with pytest.raises(Exception) as index_exc:
        additions = [
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ]
        og.extend_AllIndexed(additions, validate_continuity=True)

    # assert
    assert "@index already used: 1" in str(index_exc.value)
Beispiel #21
0
def test_image_from_page_basic(workspace_gutachten_data):
    """
    ``image_from_page`` reports the features 'binarized,clipped' for page
    PHYS_0017, both with an explicit feature selector/filter and without.
    """
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    workspace = workspace_gutachten_data

    # act + assert
    selected = workspace.image_from_page(
        pcgts.get_Page(),
        page_id='PHYS_0017',
        feature_selector='clipped',
        feature_filter='cropped')
    # the middle element of the returned triple is the info dict
    _, info, _ = selected
    assert info['features'] == 'binarized,clipped'
    unfiltered = workspace.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
    _, info, _ = unfiltered
    assert info['features'] == 'binarized,clipped'
def test_get_all_indexed_index_sort():
    """
    ``get_AllIndexed`` orders by @index only when ``index_sort`` is true;
    ``sort_AllIndexed`` makes the order visible without it as well.
    """
    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    def unordered_group_indexes(index_sort):
        groups = og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=index_sort)
        return [group.index for group in groups]

    # act
    unogs = og.get_UnorderedGroupIndexed()

    # assert
    assert [group.index for group in unogs] == [20, 21]
    # swap the two indexes so that list position and @index disagree
    unogs[0].index, unogs[1].index = 21, 20
    assert unordered_group_indexes(True) == [20, 21]
    assert unordered_group_indexes(False) == [21, 20]
    og.sort_AllIndexed()
    assert unordered_group_indexes(False) == [20, 21]
def test_get_all_alternative_image_paths():
    """
    ``get_AllAlternativeImagePaths`` honours the page/region/line switches:
    nothing with all off, six paths at page level, 12 with regions and 36
    with regions and lines.
    """
    # arrange
    page_path = assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    expected_page_level = [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']

    # assert
    nothing_selected = pcgts.get_AllAlternativeImagePaths(page=False, region=False, line=False)
    assert nothing_selected == []
    page_only = pcgts.get_AllAlternativeImagePaths(page=True, region=False, line=False)
    assert page_only == expected_page_level
    # NOTE(review): the next two assertions are identical -- possibly a
    # different flag combination was intended for one of them; confirm.
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=False)) == 12
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=False)) == 12
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=True)) == 36
Beispiel #24
0
 def test_all_regions_with_reading_order(self):
     """
     ``get_AllRegions`` rejects invalid ``order``/``depth`` arguments and
     returns the expected number of regions for reading-order queries.

     https://github.com/OCR-D/core/pull/479
     https://github.com/OCR-D/core/issues/240#issuecomment-493135797
     """
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     pg = pcgts.get_Page()

     # invalid arguments
     order_error = "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'"
     with self.assertRaisesRegex(Exception, order_error):
         pg.get_AllRegions(order='random')
     depth_error = "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"
     with self.assertRaisesRegex(Exception, depth_error):
         pg.get_AllRegions(depth=-1)

     # (keyword arguments, expected number of regions)
     expectations = (
         ({'order': 'reading-order-only'}, 40),
         ({'order': 'reading-order-only', 'depth': 1}, 20),
         ({'order': 'reading-order-only', 'depth': 2}, 40),
         ({'order': 'reading-order', 'depth': 0}, 65),
         ({'order': 'reading-order', 'depth': 1}, 45),
         ({'order': 'reading-order', 'depth': 2}, 65),
         ({'classes': ['Table'], 'order': 'reading-order'}, 3),
         ({'classes': ['Text'], 'order': 'reading-order'}, 37),
         ({'classes': ['Text'], 'order': 'reading-order', 'depth': 1}, 17),
     )
     for kwargs, count in expectations:
         self.assertEqual(len(pg.get_AllRegions(**kwargs)), count)
Beispiel #25
0
 def test_image_feature_selectoro(self):
     """
     ``image_from_page`` with ``feature_selector='dewarped'`` reports the
     expected feature list, with and without filtering out 'binarized'.
     """
     with pushd_popd('tests/data/sample-features'):
         ws = self.resolver.workspace_from_url('mets.xml')
         with open('image_features.page.xml', 'r') as f:
             pcgts = parseString(f.read().encode('utf8'), silence=True)
         page = pcgts.get_Page()
         # richest feature set is not last:
         img, info, exif = ws.image_from_page(
             page, page_id='page1', feature_selector='dewarped')
         # recropped because foo4 contains cropped+deskewed but not recropped yet:
         expected = 'cropped,dewarped,binarized,despeckled,deskewed,recropped'
         self.assertEqual(info['features'], expected)
         # richest feature set is also last:
         page = pcgts.get_Page()
         img, info, exif = ws.image_from_page(
             page,
             page_id='page1',
             feature_selector='dewarped',
             feature_filter='binarized')
         # no deskewing here, thus no recropping:
         self.assertEqual(info['features'], 'cropped,dewarped,despeckled')
Beispiel #26
0
def test_image_feature_selectoro(workspace_sample_features):
    """
    ``image_from_page`` with ``feature_selector='dewarped'`` reports the
    expected feature list, with and without filtering out 'binarized'.
    """
    # arrange
    workspace = workspace_sample_features
    page_path = join(str(workspace.directory), 'image_features.page.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'))

    # richest feature set is not last:
    _, info, _ = workspace.image_from_page(
        pcgts.get_Page(),
        page_id='page1',
        feature_selector='dewarped')
    # NOTE(review): the original comment here read "recropped because foo4
    # contains cropped+deskewed but not recropped yet", yet the expected
    # value contains no 'recropped' -- confirm which one is intended.
    expected = 'cropped,dewarped,binarized,despeckled,deskewed'
    assert info['features'] == expected
    # richest feature set is also last:
    _, info, _ = workspace.image_from_page(
        pcgts.get_Page(),
        page_id='page1',
        feature_selector='dewarped',
        feature_filter='binarized')
    # no deskewing here, thus no recropping:
    assert info['features'] == 'cropped,dewarped,despeckled'
def test_all_regions_with_reading_order():
    """
    ``get_AllRegions`` returns the expected number of regions for
    reading-order queries with various ``depth`` and ``classes`` arguments.

    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797
    """

    # arrange
    page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
    with open(page_path, 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    pg = pcgts.get_Page()
    # (keyword arguments, expected number of regions)
    expectations = (
        ({'order': 'reading-order-only'}, 40),
        ({'order': 'reading-order-only', 'depth': 1}, 20),
        ({'order': 'reading-order-only', 'depth': 2}, 40),
        ({'order': 'reading-order', 'depth': 0}, 65),
        ({'order': 'reading-order', 'depth': 1}, 45),
        ({'order': 'reading-order', 'depth': 2}, 65),
        ({'classes': ['Table'], 'order': 'reading-order'}, 3),
        ({'classes': ['Text'], 'order': 'reading-order'}, 37),
        ({'classes': ['Text'], 'order': 'reading-order', 'depth': 1}, 17),
    )

    # assert
    for kwargs, count in expectations:
        assert len(pg.get_AllRegions(**kwargs)) == count
Beispiel #28
0
def get_lines(fname, flist=False):
    """
    Return the text lines of *fname*.

    The file is first treated as PAGE-XML: its content is handed to
    ``parseString`` and the lines are extracted with ``page_get_lines``.
    If anything in that attempt raises, the file is treated as plaintext.

    Args:
        fname: path of the file to read.
        flist: only used in the plaintext case. When true, every line of
            *fname* is taken as the path of another file, and the result
            holds the first line of each of those files (as returned by
            ``readline``, i.e. including a trailing newline if present).

    Returns:
        The result of ``page_get_lines`` in the PAGE-XML case, otherwise a
        list of strings.
    """
    rawlines = []
    with open(fname, 'r') as fd:
        for raw in fd:
            rawlines.append(raw.rstrip('\r\n'))
    try:
        # PAGE-XML case
        if rawlines and rawlines[0].startswith('<?xml'):
            # Drop everything up to and including the end of the XML
            # declaration (presumably because the parser rejects str input
            # that carries a declaration -- TODO confirm).
            declaration_end = rawlines[0].index('?>') + 2
            rawlines[0] = rawlines[0][declaration_end:]
        document = ''.join(rawlines)
        return page_get_lines(parseString(document))
    except Exception:
        # plaintext case
        if not flist:
            return rawlines
        # ocropy style (e.g. -F <(ls -1 *.gt.txt) <(ls -1 *.ocr.txt))
        first_lines = []
        for listed_path in rawlines:
            with open(listed_path, 'r') as fd:
                first_lines.append(fd.readline())
        return first_lines
Beispiel #29
0
 def test_all_regions_without_reading_order(self):
     """
     ``get_AllRegions`` returns the expected number of regions when no
     ``order`` is given, for various ``depth`` and ``classes`` arguments.

     https://github.com/OCR-D/core/pull/479
     https://github.com/OCR-D/core/issues/240#issuecomment-493135797
     """
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     pg = pcgts.get_Page()
     # (keyword arguments, expected number of regions)
     expectations = (
         ({}, 65),
         ({'depth': 0}, 65),
         ({'depth': 1}, 45),
         ({'depth': 2}, 65),
         ({'depth': 3}, 65),
         ({'classes': ['Separator']}, 25),
         ({'classes': ['Table']}, 3),
         ({'classes': ['Text']}, 37),
         ({'classes': ['Text'], 'depth': 1}, 17),
         ({'classes': ['Text'], 'depth': 2}, 37),
     )
     for kwargs, count in expectations:
         self.assertEqual(len(pg.get_AllRegions(**kwargs)), count)
Beispiel #30
0
 def test_get_AllIndexed_index_sort(self):
     """
     ``get_AllIndexed`` orders by @index only when ``index_sort`` is true;
     ``sort_AllIndexed`` makes the order visible without it as well.
     """
     page_path = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(page_path, 'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

     def unordered_group_indexes(index_sort):
         groups = og.get_AllIndexed(classes=['UnorderedGroup'],
                                    index_sort=index_sort)
         return [group.index for group in groups]

     unogs = og.get_UnorderedGroupIndexed()
     self.assertEqual([group.index for group in unogs], [20, 21])
     # swap the two indexes so that list position and @index disagree
     unogs[0].index, unogs[1].index = 21, 20
     self.assertEqual(unordered_group_indexes(True), [20, 21])
     self.assertEqual(unordered_group_indexes(False), [21, 20])
     og.sort_AllIndexed()
     self.assertEqual(unordered_group_indexes(False), [20, 21])