class TestSerializations(unittest.TestCase):
    """
    Serialization tests: every output template must yield a valid document.
    """

    def setUp(self):
        """
        Read the record fixtures and build the 'standard' hOCR validator.
        """
        fixture_path = os.path.join(resources, 'records.json')
        with open(fixture_path, 'r') as fixture:
            self.records = [rpred.ocr_record(**entry) for entry in json.load(fixture)]
        self.validator = HocrValidator('standard')

    def _serialize_to_buffer(self, **options):
        """
        Serialize the fixture records for 'foo.png' into a fresh StringIO.

        The buffer is returned as left by the write, i.e. positioned at its
        end; callers that read from it have to rewind first.
        """
        buffer = StringIO()
        rendered = serialization.serialize(self.records,
                                           image_name='foo.png',
                                           **options)
        buffer.write(rendered)
        return buffer

    def _assert_hocr_valid(self, buffer):
        """
        Rewind `buffer` and require a valid report from the hOCR validator.
        """
        buffer.seek(0)
        outcome = self.validator.validate(buffer, parse_strict=True)
        self.assertTrue(outcome.is_valid())

    def _assert_schema_valid(self, buffer, schema_file):
        """
        Parse the buffer contents and validate them against an XML schema
        file from the resources directory.
        """
        # The document is parsed before the schema is opened.
        document = etree.fromstring(buffer.getvalue().encode('utf-8'))
        schema_path = os.path.join(resources, schema_file)
        with open(schema_path) as schema_fp:
            schema = etree.XMLSchema(etree.parse(schema_fp))
            schema.assertValid(document)

    def test_vertical_hocr_serialization(self):
        """
        Test vertical line hOCR serialization
        """
        output = self._serialize_to_buffer(writing_mode='vertical-lr',
                                           template='hocr')
        self._assert_hocr_valid(output)

    def test_hocr_serialization(self):
        """
        Test hOCR serialization
        """
        output = self._serialize_to_buffer(template='hocr')
        self._assert_hocr_valid(output)

    def test_alto_serialization_validation(self):
        """
        Validates output against ALTO schema
        """
        output = self._serialize_to_buffer(template='alto')
        self._assert_schema_valid(output, 'alto-4-0.xsd')

    def test_abbyyxml_serialization_validation(self):
        """
        Validates output against abbyyXML schema
        """
        output = self._serialize_to_buffer(template='abbyyxml')
        self._assert_schema_valid(output, 'FineReader10-schema-v1.xml')
def validate_hocr(self, fp):
    """
    Assert that the buffer holds valid hOCR in which no `id` value repeats.

    Args:
        fp: In-memory text buffer holding the serialized document. It is
            rewound with `seek(0)` and later read back with `getvalue()`.

    Fails the calling test if the 'standard' profile validation report is
    not valid, or if any `id` attribute value found by the lookup occurs
    more than once.
    """
    fp.seek(0)
    validator = HocrValidator('standard')
    report = validator.validate(fp, parse_strict=True)
    self.assertTrue(report.is_valid())

    doc = etree.fromstring(fp.getvalue().encode('utf-8'))
    counts = Counter(x.get('id') for x in doc.findall('.//*[@id]'))
    # Collect the offending ids instead of indexing most_common(1): a
    # document without any id then passes rather than raising IndexError,
    # and a failure shows exactly which ids collide.
    duplicates = sorted(ident for ident, seen in counts.items() if seen > 1)
    self.assertEqual(duplicates, [], msg='Duplicate IDs in hOCR output')
def main():
    """
    Command line entry point: validate every source and exit accordingly.

    Parses the arguments, validates each entry of `sources` with a single
    validator, prints the formatted report unless `silent` is set, and
    exits with status 0 when all reports are valid and 1 otherwise.
    """
    options = parser.parse_args()
    validator = HocrValidator(
        options.profile,
        skip_check=options.skip_check,
        implicit_capabilities=options.implicit_capabilities,
    )

    invalid_count = 0
    for source in options.sources:
        report = validator.validate(
            source,
            parse_strict=options.parse_strict,
            filename=options.filename,
        )
        if not report.is_valid():
            invalid_count += 1
        if not options.silent:
            print(report.format(options.format))

    sys.exit(1 if invalid_count else 0)
class TestSerializations(unittest.TestCase):
    """
    Tests for output serialization
    """

    def setUp(self):
        """
        Load the record fixtures and create the 'standard' hOCR validator.
        """
        with open(os.path.join(resources, 'records.json'), 'r') as fp:
            self.records = [rpred.ocr_record(**x) for x in json.load(fp)]
        self.validator = HocrValidator('standard')

    def test_vertical_hocr_serialization(self):
        """
        Test vertical line hOCR serialization
        """
        fp = StringIO()
        fp.write(serialization.serialize(self.records,
                                         image_name='foo.png',
                                         writing_mode='vertical-lr',
                                         template='hocr'))
        # Rewind so the validator reads the document from the start.
        fp.seek(0)
        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_hocr_serialization(self):
        """
        Test hOCR serialization
        """
        fp = StringIO()
        fp.write(serialization.serialize(self.records,
                                         image_name='foo.png',
                                         template='hocr'))
        # Rewind so the validator reads the document from the start.
        fp.seek(0)
        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_alto_serialization_validation(self):
        """
        Validates output against ALTO schema
        """
        fp = StringIO()
        fp.write(serialization.serialize(self.records,
                                         image_name='foo.png',
                                         template='alto'))
        # Parse bytes rather than text: lxml raises ValueError for a unicode
        # string that carries an XML encoding declaration.
        doc = etree.fromstring(fp.getvalue().encode('utf-8'))
        with open(os.path.join(resources, 'alto-3-1.xsd')) as schema_fp:
            alto_schema = etree.XMLSchema(etree.parse(schema_fp))
            alto_schema.assertValid(doc)
def setUp(self):
    """
    Prepare the fixtures shared by the tests.

    Builds one `rpred.ocr_record` per entry of `records.json` in the
    resources directory and creates a 'standard' profile hOCR validator.
    """
    fixture_path = os.path.join(resources, 'records.json')
    with open(fixture_path, 'r') as fixture:
        entries = json.load(fixture)
        self.records = [rpred.ocr_record(**entry) for entry in entries]
    self.validator = HocrValidator('standard')