コード例 #1
0
class TestSerializations(unittest.TestCase):
    """
    Tests for output serialization
    """
    def setUp(self):
        with open(os.path.join(resources, 'records.json'), 'r') as fp:
            self.records = [rpred.ocr_record(**x) for x in json.load(fp)]
        self.validator = HocrValidator('standard')

    def test_vertical_hocr_serialization(self):
        """
        Test vertical line hOCR serialization
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', writing_mode='vertical-lr', template='hocr'))
        fp.seek(0)

        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_hocr_serialization(self):
        """
        Test hOCR serialization
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', template='hocr'))
        fp.seek(0)

        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_alto_serialization_validation(self):
        """
        Validates output against ALTO schema
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', template='alto'))
        doc = etree.fromstring(fp.getvalue().encode('utf-8'))
        with open(os.path.join(resources, 'alto-4-0.xsd')) as schema_fp:
            alto_schema = etree.XMLSchema(etree.parse(schema_fp))
            alto_schema.assertValid(doc)

    def test_abbyyxml_serialization_validation(self):
        """
        Validates output against abbyyXML schema
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', template='abbyyxml'))
        doc = etree.fromstring(fp.getvalue().encode('utf-8'))
        with open(os.path.join(resources, 'FineReader10-schema-v1.xml')) as schema_fp:
            abbyy_schema = etree.XMLSchema(etree.parse(schema_fp))
            abbyy_schema.assertValid(doc)
コード例 #2
0
def validate_hocr(self, fp):
    """
    Assert that ``fp`` holds valid hOCR whose element IDs are all unique.

    The buffer is rewound, checked with a ``HocrValidator('standard')`` in
    strict parsing mode, and then parsed again to look for repeated ``id``
    attributes.

    Args:
        fp: In-memory text buffer with the hOCR document. It must support
            ``seek`` and ``getvalue`` (e.g. ``io.StringIO``).

    Fails the test (through ``self``'s assertion methods) when the validator
    reports the document as invalid or when any ``id`` value occurs more
    than once.
    """
    fp.seek(0)

    validator = HocrValidator('standard')
    report = validator.validate(fp, parse_strict=True)
    self.assertTrue(report.is_valid())

    doc = etree.fromstring(fp.getvalue().encode('utf-8'))

    id_counts = Counter(node.get('id') for node in doc.findall('.//*[@id]'))
    # Collect the offending ids instead of indexing most_common(1)[0]: that
    # lookup raises IndexError when no element carries an id, and comparing
    # against an empty list names the duplicates in the failure output.
    duplicates = sorted(value for value, seen in id_counts.items() if seen > 1)
    self.assertEqual(duplicates,
                     [],
                     msg='Duplicate IDs in hOCR output')
コード例 #3
0
def main():
    """
    Command-line entry point: validate every source and exit with a status.

    Parses the arguments with the module-level ``parser``, validates each
    entry of ``args.sources`` with one shared ``HocrValidator``, prints each
    formatted report unless ``args.silent`` is set, and exits with status 0
    when all reports are valid and 1 otherwise.
    """
    options = parser.parse_args()

    checker = HocrValidator(
        options.profile,
        skip_check=options.skip_check,
        implicit_capabilities=options.implicit_capabilities,
    )

    invalid_count = 0
    for src in options.sources:
        result = checker.validate(
            src,
            parse_strict=options.parse_strict,
            filename=options.filename,
        )
        if not result.is_valid():
            invalid_count += 1
        if not options.silent:
            print(result.format(options.format))

    exit_status = 1 if invalid_count else 0
    sys.exit(exit_status)
コード例 #4
0
ファイル: test_serialization.py プロジェクト: yufish/kraken
class TestSerializations(unittest.TestCase):
    """
    Tests for output serialization
    """
    def setUp(self):
        with open(os.path.join(resources, 'records.json'), 'r') as fp:
            self.records = [rpred.ocr_record(**x) for x in json.load(fp)]
        self.validator = HocrValidator('standard')

    def test_vertical_hocr_serialization(self):
        """
        Test vertical line hOCR  serialization
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', writing_mode='vertical-lr', template='hocr'))
        fp.seek(0)

        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_hocr_serialization(self):
        """
        Test hOCR serialization
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', template='hocr'))
        fp.seek(0)

        report = self.validator.validate(fp, parse_strict=True)
        self.assertTrue(report.is_valid())

    def test_alto_serialization_validation(self):
        """
        Validates output against ALTO schema
        """
        fp = StringIO()

        fp.write(serialization.serialize(self.records, image_name='foo.png', template='alto'))
        doc = etree.fromstring(fp.getvalue())
        print(fp.getvalue()[:2000])
        with open(os.path.join(resources, 'alto-3-1.xsd')) as schema_fp:
            alto_schema = etree.XMLSchema(etree.parse(schema_fp))
            alto_schema.assertValid(doc)
コード例 #5
0
 def setUp(self):
     """
     Load the record fixtures and create the hOCR validator
     """
     fixture_path = os.path.join(resources, 'records.json')
     with open(fixture_path, 'r') as fixture:
         entries = json.load(fixture)
     # Each JSON entry supplies the keyword arguments for one OCR record.
     self.records = [rpred.ocr_record(**entry) for entry in entries]
     self.validator = HocrValidator('standard')
コード例 #6
0
 def setUp(self):
     """
     Prepare the OCR records and the validator used by the tests
     """
     loaded = []
     with open(os.path.join(resources, 'records.json'), 'r') as handle:
         # Every fixture entry is expanded into keyword arguments.
         for fields in json.load(handle):
             loaded.append(rpred.ocr_record(**fields))
     self.records = loaded
     self.validator = HocrValidator('standard')