def test_fix(self):
    '''
    Validate the faulty-glyph page, validate again with ``strictness='fix'``
    and expect a final validation to report no ConsistencyError at all.
    '''
    page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)

    # Baseline: 17 ConsistencyError entries are expected before fixing.
    before = PageValidator.validate(ocrd_page=page)
    found = [err for err in before.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(found), 17, '17 textequiv consistency errors')

    # Same page object, this time validated with strictness='fix'.
    PageValidator.validate(ocrd_page=page, strictness='fix')

    # A fresh default validation must not find any ConsistencyError anymore.
    after = PageValidator.validate(ocrd_page=page)
    remaining = [err for err in after.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(remaining), 0, 'no more textequiv consistency errors')
Esempio n. 2
0
 def test_fix(self):
     '''
     Validate the FAULTY_GLYPHS asset, validate again with
     ``strictness='fix'`` and expect the error list to be empty afterwards.
     '''
     page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
     page = parse(page_path, silence=True)

     # Baseline: 17 errors are expected before fixing.
     before = PageValidator.validate(ocrd_page=page)
     self.assertEqual(len(before.errors), 17, 'errors')

     # Same page object, this time validated with strictness='fix'.
     PageValidator.validate(ocrd_page=page, strictness='fix')

     after = PageValidator.validate(ocrd_page=page)
     self.assertEqual(len(after.errors), 0, 'no more errors')
Esempio n. 3
0
 def test_validate_err(self):
     '''
     Check the exception messages raised for a missing input and for
     unsupported strategy / strictness arguments.
     '''
     # validate() called without any of ocrd_page / ocrd_file / filename
     no_input_msg = 'At least one of ocrd_page, ocrd_file or filename must be set'
     with self.assertRaisesRegex(Exception, no_input_msg):
         PageValidator.validate()

     # constructor called with 'best' as third positional argument
     bad_strategy_msg = 'Element selection strategy best not implemented'
     with self.assertRaisesRegex(Exception, bad_strategy_msg):
         PageValidator(None, None, 'best')

     # constructor called with 'superstrictest' as second positional argument
     bad_strictness_msg = 'Strictness level superstrictest not implemented'
     with self.assertRaisesRegex(Exception, bad_strictness_msg):
         PageValidator(None, 'superstrictest', 'index1')
Esempio n. 4
0
    def test_validate_lax(self):
        '''
        After overwriting one word's text, expect 26 errors with default
        strictness and a single error with ``strictness='lax'``.
        '''
        page_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE')
        page = parse(page_path, silence=True)

        # introduce a single word error (not just whitespace inconsistency)
        first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
        second_word = first_line.get_Word()[1]
        second_word.get_TextEquiv()[0].set_Unicode('FOO')

        strict_report = PageValidator.validate(ocrd_page=page)
        self.assertEqual(len(strict_report.errors), 26, '26 errors - strict')

        lax_report = PageValidator.validate(ocrd_page=page, strictness='lax')
        self.assertEqual(len(lax_report.errors), 1, '1 error - lax')
    def test_validate_lax(self):
        '''
        After overwriting one word's text, expect 26 ConsistencyError entries
        with default strictness and a single one with ``strictness='lax'``.
        '''
        page_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml')
        page = parse(page_path, silence=True)

        # introduce a single word error (not just whitespace inconsistency)
        first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
        second_word = first_line.get_Word()[1]
        second_word.get_TextEquiv()[0].set_Unicode('FOO')

        # Only ConsistencyError instances are counted; other error types are ignored.
        strict_report = PageValidator.validate(ocrd_page=page)
        strict_hits = [err for err in strict_report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(strict_hits), 26, '26 textequiv consistency errors - strict')

        lax_report = PageValidator.validate(ocrd_page=page, strictness='lax')
        lax_hits = [err for err in lax_report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(lax_hits), 1, '1 textequiv consistency errors - lax')
 def test_validate_ocrd_file(self):
     '''
     Validate a page passed as ``ocrd_file`` (looked up in the workspace METS
     by ID) and expect 17 ConsistencyError entries.
     '''
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)

     # Lookup and validation both run inside the workspace directory.
     with pushd_popd(workspace.directory):
         matches = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")
         report = PageValidator.validate(ocrd_file=matches[0])
         hits = [err for err in report.errors if isinstance(err, ConsistencyError)]
         self.assertEqual(len(hits), 17, '17 textequiv consistency errors')
Esempio n. 7
0
 def test_validate_ocrd_file(self):
     '''
     Validate a page passed as ``ocrd_file`` (looked up in the workspace METS
     by ID) and expect 17 errors.
     '''
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)
     page_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]

     # Download the file first when it has no local filename yet.
     if not page_file.local_filename:
         workspace.download_file(page_file)

     report = PageValidator.validate(ocrd_file=page_file)
     self.assertEqual(len(report.errors), 17, 'errors')
Esempio n. 8
0
 def test_validate_ocrd_file(self):
     '''
     Validate a page passed as ``ocrd_file`` from within the workspace
     directory and expect 17 errors.
     '''
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)

     # Lookup and validation both run inside the workspace directory.
     with pushd_popd(workspace.directory):
         matches = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")
         report = PageValidator.validate(ocrd_file=matches[0])
         self.assertEqual(len(report.errors), 17, 'errors')
 def test_validate_err(self):
     '''
     Check the exception messages raised for a missing input and for
     unsupported ``page_textequiv_strategy`` / ``page_textequiv_consistency``
     values, including the deprecated ``strategy`` keyword.
     '''
     no_input_msg = 'At least one of ocrd_page, ocrd_file or filename must be set'
     bad_strategy_msg = 'page_textequiv_strategy best not implemented'
     bad_consistency_msg = 'page_textequiv_consistency level superstrictest not implemented'

     with self.assertRaisesRegex(Exception, no_input_msg):
         PageValidator.validate()

     with self.assertRaisesRegex(Exception, bad_strategy_msg):
         PageValidator.validate(
             filename=FAULTY_GLYPH_PAGE_FILENAME,
             page_textequiv_strategy='best')

     # test with deprecated name
     with self.assertRaisesRegex(Exception, bad_strategy_msg):
         PageValidator.validate(
             filename=FAULTY_GLYPH_PAGE_FILENAME,
             strategy='best')

     with self.assertRaisesRegex(Exception, bad_consistency_msg):
         PageValidator.validate(
             filename=FAULTY_GLYPH_PAGE_FILENAME,
             page_textequiv_consistency='superstrictest',
             strategy='first')
Esempio n. 10
0
    def test_validate_multi_textequiv(self):
        '''
        Expect 25 errors on the unmodified page, then exercise ``set_text`` /
        ``get_text`` with the 'index1' strategy on a word that carries more
        than one TextEquiv.
        '''
        page_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE')
        page = parse(page_path, silence=True)
        strict_report = PageValidator.validate(ocrd_page=page)
        self.assertEqual(len(strict_report.errors), 25, '25 errors - strict')

        first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
        word = first_line.get_Word()[1]

        # remove the word's first TextEquiv entry
        del word.get_TextEquiv()[0]

        # Add textequiv
        set_text(word, 'FOO', 'index1')
        extra_equiv = TextEquivType(Unicode='BAR', conf=.7)
        word.add_TextEquiv(extra_equiv)

        self.assertEqual(get_text(word, 'index1'), 'FOO')

        # Overwriting through set_text must be visible through get_text.
        set_text(word, 'BAR', 'index1')
        self.assertEqual(get_text(word, 'index1'), 'BAR')
    def test_validate_multi_textequiv_first(self):
        '''
        Expect 25 ConsistencyError entries on the unmodified page, then
        exercise ``set_text`` / ``get_text`` with the 'first' strategy on a
        word that carries several TextEquivs.
        '''
        page_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml')
        page = parse(page_path, silence=True)
        report = PageValidator.validate(ocrd_page=page)
        hits = [err for err in report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(hits), 25, '25 textequiv consistency errors - strict')

        first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
        word = first_line.get_Word()[1]

        # delete all textequivs
        word.set_TextEquiv([])

        # Add textequiv
        set_text(word, 'FOO', 'first')
        for equiv in (
                TextEquivType(Unicode='BAR', conf=.7),
                TextEquivType(Unicode='BAZ', conf=.5, index=0),
        ):
            word.add_TextEquiv(equiv)

        # With 'first', the entry added with index=0 is the one expected back.
        self.assertEqual(get_text(word, 'first'), 'BAZ')

        set_text(word, 'XYZ', 'first')
        self.assertEqual(get_text(word, 'first'), 'XYZ')
Esempio n. 12
0
 def test_validate_filename_off(self):
     '''
     With ``page_textequiv_consistency='off'`` no ConsistencyError is
     expected for the faulty-glyph page.
     '''
     report = PageValidator.validate(
         filename=FAULTY_GLYPH_PAGE_FILENAME,
         page_textequiv_consistency='off')
     hits = [err for err in report.errors if isinstance(err, ConsistencyError)]
     self.assertEqual(len(hits), 0, '0 textequiv consistency errors')
Esempio n. 13
0
 def test_validate_filename(self):
     '''
     Validating the faulty-glyph page by filename is expected to yield
     17 ConsistencyError entries.
     '''
     report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME)
     hits = [err for err in report.errors if isinstance(err, ConsistencyError)]
     self.assertEqual(len(hits), 17, '17 textequiv consistency errors')
Esempio n. 14
0
def validate_page(page, **kwargs):
    '''
    Validate PAGE against OCR-D conventions
    '''
    # `page` is passed on as the filename; all other keyword arguments are
    # forwarded to PageValidator.validate unchanged.
    report = PageValidator.validate(filename=page, **kwargs)
    _inform_of_result(report)
Esempio n. 15
0
 def test_validate_filename_off(self):
     '''
     With ``strictness='off'`` no errors are expected for the
     FAULTY_GLYPHS asset.
     '''
     page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
     report = PageValidator.validate(filename=page_path, strictness='off')
     self.assertEqual(len(report.errors), 0, 'no errors')
Esempio n. 16
0
 def test_validate_filename(self):
     '''
     Validating the FAULTY_GLYPHS asset by filename is expected to yield
     17 errors.
     '''
     page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
     report = PageValidator.validate(filename=page_path)
     self.assertEqual(len(report.errors), 17, '17 errors')