def test_fix(self):
    # Validate the faulty-glyph page, run a validation pass with
    # strictness='fix', then validate again and expect that no
    # ConsistencyError entries are reported any more.
    def count_consistency_errors(validation_report):
        # Only ConsistencyError entries are counted; other error kinds are ignored.
        return sum(1 for err in validation_report.errors if isinstance(err, ConsistencyError))

    page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)

    report_before = PageValidator.validate(ocrd_page=page)
    self.assertEqual(count_consistency_errors(report_before), 17, '17 textequiv consistency errors')

    # The return value of the 'fix' pass is deliberately not inspected.
    PageValidator.validate(ocrd_page=page, strictness='fix')

    report_after = PageValidator.validate(ocrd_page=page)
    self.assertEqual(count_consistency_errors(report_after), 0, 'no more textequiv consistency errors')
def test_fix(self):
    # Validate the faulty-glyph asset page (17 errors expected), run a
    # validation pass with strictness='fix', then expect a clean report.
    page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
    page = parse(page_path, silence=True)

    errors_before = PageValidator.validate(ocrd_page=page).errors
    self.assertEqual(len(errors_before), 17, 'errors')

    # The return value of the 'fix' pass is deliberately not inspected.
    PageValidator.validate(ocrd_page=page, strictness='fix')

    errors_after = PageValidator.validate(ocrd_page=page).errors
    self.assertEqual(len(errors_after), 0, 'no more errors')
def test_validate_err(self):
    # Each case pairs the expected exception message (used as a regex) with
    # a callable that must raise it. Cases run in the listed order.
    failure_cases = [
        (
            'At least one of ocrd_page, ocrd_file or filename must be set',
            lambda: PageValidator.validate(),
        ),
        (
            'Element selection strategy best not implemented',
            lambda: PageValidator(None, None, 'best'),
        ),
        (
            'Strictness level superstrictest not implemented',
            lambda: PageValidator(None, 'superstrictest', 'index1'),
        ),
    ]
    for expected_message, trigger in failure_cases:
        with self.assertRaisesRegex(Exception, expected_message):
            trigger()
def test_validate_lax(self):
    # Compare the number of reported errors under the default strictness
    # and under strictness='lax' after overwriting one word's text.
    page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)

    # Introduce a single word error (not just a whitespace inconsistency):
    # second word of the first line of the first text region.
    first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
    second_word = first_line.get_Word()[1]
    second_word.get_TextEquiv()[0].set_Unicode('FOO')

    strict_report = PageValidator.validate(ocrd_page=page)
    self.assertEqual(len(strict_report.errors), 26, '26 errors - strict')

    lax_report = PageValidator.validate(ocrd_page=page, strictness='lax')
    self.assertEqual(len(lax_report.errors), 1, '1 error - lax')
def test_validate_lax(self):
    # Compare the number of ConsistencyError entries under the default
    # strictness and under strictness='lax' after overwriting one word's text.
    page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)

    # Introduce a single word error (not just a whitespace inconsistency):
    # second word of the first line of the first text region.
    first_line = page.get_Page().get_TextRegion()[0].get_TextLine()[0]
    second_word = first_line.get_Word()[1]
    second_word.get_TextEquiv()[0].set_Unicode('FOO')

    strict_report = PageValidator.validate(ocrd_page=page)
    strict_consistency = [err for err in strict_report.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(strict_consistency), 26, '26 textequiv consistency errors - strict')

    lax_report = PageValidator.validate(ocrd_page=page, strictness='lax')
    lax_consistency = [err for err in lax_report.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(lax_consistency), 1, '1 textequiv consistency errors - lax')
def test_validate_ocrd_file(self):
    # Validate a file object looked up in the workspace METS (instead of a
    # parsed page or a filename) and expect 17 ConsistencyError entries.
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))

    # Lookup and validation both run with the workspace directory pushed
    # via pushd_popd.
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        consistency_count = sum(1 for err in report.errors if isinstance(err, ConsistencyError))
        self.assertEqual(consistency_count, 17, '17 textequiv consistency errors')
def test_validate_ocrd_file(self):
    # Validate a file object looked up in the workspace METS and expect
    # 17 reported errors.
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))

    matching_files = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")
    faulty_file = matching_files[0]

    # Download only when the file has no local filename yet.
    if not faulty_file.local_filename:
        workspace.download_file(faulty_file)

    reported_errors = PageValidator.validate(ocrd_file=faulty_file).errors
    self.assertEqual(len(reported_errors), 17, 'errors')
def test_validate_ocrd_file(self):
    # Validate a file object looked up in the workspace METS and expect
    # 17 reported errors.
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))

    # Lookup and validation both run with the workspace directory pushed
    # via pushd_popd.
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
        reported_errors = PageValidator.validate(ocrd_file=faulty_file).errors
        self.assertEqual(len(reported_errors), 17, 'errors')
def test_validate_err(self):
    # Each case pairs the expected exception message (used as a regex) with
    # the keyword arguments passed to PageValidator.validate. Cases run in
    # the listed order.
    unknown_strategy = 'page_textequiv_strategy best not implemented'
    failure_cases = (
        # no input source given at all
        ('At least one of ocrd_page, ocrd_file or filename must be set', {}),
        (unknown_strategy, {'filename': FAULTY_GLYPH_PAGE_FILENAME, 'page_textequiv_strategy': 'best'}),
        # same check, but through the deprecated keyword name ``strategy``
        (unknown_strategy, {'filename': FAULTY_GLYPH_PAGE_FILENAME, 'strategy': 'best'}),
        (
            'page_textequiv_consistency level superstrictest not implemented',
            {
                'filename': FAULTY_GLYPH_PAGE_FILENAME,
                'page_textequiv_consistency': 'superstrictest',
                'strategy': 'first',
            },
        ),
    )
    for expected_message, validate_kwargs in failure_cases:
        with self.assertRaisesRegex(Exception, expected_message):
            PageValidator.validate(**validate_kwargs)
def test_validate_multi_textequiv(self):
    # Exercise set_text/get_text with the 'index1' strategy on a word that
    # ends up carrying more than one TextEquiv.
    strategy = 'index1'
    page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)

    strict_errors = PageValidator.validate(ocrd_page=page).errors
    self.assertEqual(len(strict_errors), 25, '25 errors - strict')

    # Second word of the first line of the first text region.
    word = page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

    # Remove the word's first TextEquiv entry (index 0).
    del word.get_TextEquiv()[0]

    # Set the text, then append a competing TextEquiv with a confidence value.
    set_text(word, 'FOO', strategy)
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    self.assertEqual(get_text(word, strategy), 'FOO')

    # Overwriting through the same strategy must be visible on read-back.
    set_text(word, 'BAR', strategy)
    self.assertEqual(get_text(word, strategy), 'BAR')
def test_validate_multi_textequiv_first(self):
    # Exercise set_text/get_text with the 'first' strategy on a word that
    # carries several TextEquiv entries, one of them with index=0.
    strategy = 'first'
    page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)

    report = PageValidator.validate(ocrd_page=page)
    consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(consistency_errors), 25, '25 textequiv consistency errors - strict')

    # Second word of the first line of the first text region.
    word = page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

    # Start from an empty TextEquiv list.
    word.set_TextEquiv([])

    # Set the text, then append two more TextEquivs; the last one has index=0.
    set_text(word, 'FOO', strategy)
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=0))
    self.assertEqual(get_text(word, strategy), 'BAZ')

    # Overwriting through the same strategy must be visible on read-back.
    set_text(word, 'XYZ', strategy)
    self.assertEqual(get_text(word, strategy), 'XYZ')
def test_validate_filename_off(self):
    # With page_textequiv_consistency='off', validating the faulty-glyph
    # page by filename must report no ConsistencyError entries.
    report = PageValidator.validate(
        filename=FAULTY_GLYPH_PAGE_FILENAME,
        page_textequiv_consistency='off',
    )
    consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
    self.assertEqual(len(consistency_errors), 0, '0 textequiv consistency errors')
def test_validate_filename(self):
    # Validating the faulty-glyph page by filename with default settings
    # must report 17 ConsistencyError entries.
    report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME)
    consistency_count = sum(1 for err in report.errors if isinstance(err, ConsistencyError))
    self.assertEqual(consistency_count, 17, '17 textequiv consistency errors')
def validate_page(page, **kwargs):
    '''
    Validate PAGE against OCR-D conventions
    '''
    # NOTE(review): docstring text left untouched in case it is surfaced as
    # user-facing help text by a decorator not visible here.
    # ``page`` is passed as the ``filename`` argument; any extra keyword
    # arguments are forwarded unchanged to PageValidator.validate.
    report = PageValidator.validate(filename=page, **kwargs)
    _inform_of_result(report)
def test_validate_filename_off(self):
    # With strictness='off', validating the faulty-glyph asset page by
    # filename must report no errors.
    page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
    reported_errors = PageValidator.validate(filename=page_path, strictness='off').errors
    self.assertEqual(len(reported_errors), 0, 'no errors')
def test_validate_filename(self):
    # Validating the faulty-glyph asset page by filename with default
    # settings must report 17 errors.
    page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS')
    report = PageValidator.validate(filename=page_path)
    self.assertEqual(len(report.errors), 17, '17 errors')