Beispiel #1
0
 def test_get_AllTextLine(self):
     """Check that the TEMP1 sample page yields 55 text lines overall.

     The sample is decoded explicitly as UTF-8 so the text read does not
     depend on the locale's default encoding before it is re-encoded as
     UTF-8 bytes for ``parseString``.
     """
     sample = assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml')
     with open(sample, 'r', encoding='utf-8') as f:
         page = parseString(f.read().encode('utf8'),
                            silence=True).get_Page()
         assert len(page.get_AllTextLines()) == 55
Beispiel #2
0
 def setUp(self):
     """Read the faulty-glyphs sample as raw bytes and parse it."""
     sample = assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
     with open(sample, 'rb') as stream:
         raw = stream.read()
     # Keep the raw content around next to the parsed document.
     self.xml_as_str = raw
     self.pcgts = parseString(raw, silence=True)
Beispiel #3
0
# -*- coding: utf-8 -*-

from os.path import join as pjoin
from pathlib import Path
from tempfile import TemporaryDirectory

from tests.base import TestCase, assets, main, copy_of_directory

from ocrd.resolver import Resolver
from ocrd_utils import pushd_popd, initLogging

# Location of the METS file of the 'SBB0000F29300010000' sample as returned by
# assets.url_of() (presumably a URL into the test assets -- confirm in tests.base).
METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml')
# Location of the 'kant_aufklaerung_1784' sample folder as returned by assets.path_to().
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')

# pylint: disable=redundant-unittest-assert, broad-except, deprecated-method, too-many-public-methods

class TestResolver(TestCase):
    """Tests exercising ``Resolver.workspace_from_url``."""

    def setUp(self):
        """Initialise logging and create a fresh ``Resolver`` for every test."""
        initLogging()
        self.resolver = Resolver()

    def test_workspace_from_url_bad(self):
        """Passing ``None`` as the METS URL must raise a descriptive error."""
        expected_message = "Must pass 'mets_url'"
        with self.assertRaisesRegex(Exception, expected_message):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        """Call ``workspace_from_url`` with a remote METS URL and a custom basename."""
        remote_mets = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
        self.resolver.workspace_from_url(mets_basename='foo.xml',
                                         mets_url=remote_mets)
Beispiel #4
0
def _fixture_kant_complex(tmp_path):
    """Yield a workspace over a private copy of the 'kant_aufklaerung_1784-complex' sample.

    The sample's ``data`` directory is copied into ``tmp_path`` and a
    ``Workspace`` rooted at ``tmp_path`` is yielded.

    The workspace receives a ``Resolver`` *instance*: handing over the bare
    class would make any resolver method invoked through the workspace fail
    for lack of a bound ``self``.
    """
    copytree(assets.path_to('kant_aufklaerung_1784-complex/data'),
             str(tmp_path))
    yield Workspace(Resolver(), directory=tmp_path)
Beispiel #5
0
 def test_str(self):
     """Build an ``OcrdExif`` from the sample TIFF and print its XML form."""
     image_path = assets.path_to('SBB0000F29300010000/data/OCR-D-IMG/FILE_0001_IMAGE.tif')
     with Image.open(image_path) as img:
         exif = OcrdExif(img)
     xml_text = str(exif.to_xml())
     print(xml_text)
Beispiel #6
0
from tests.base import TestCase, main, assets, create_ocrd_file, create_ocrd_file_with_defaults

from ocrd_utils import MIMETYPE_PAGE
from ocrd_models import OcrdMets
from ocrd_modelfactory import (exif_from_filename, page_from_image,
                               page_from_file)

# Sample TIFF image of the 'kant_aufklaerung_1784' data set, resolved via assets.path_to().
SAMPLE_IMG = assets.path_to(
    'kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
# Sample XML file from the OCR-D-GT-PAGE group of the same data set.
SAMPLE_PAGE = assets.path_to(
    'kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')


class TestModelFactory(TestCase):
    def test_exif_from_filename(self):
        """``exif_from_filename`` accepts the sample image and rejects ``None``."""
        exif_from_filename(SAMPLE_IMG)
        expected_message = "Must pass 'image_filename' to 'exif_from_filename'"
        with self.assertRaisesRegex(Exception, expected_message):
            exif_from_filename(None)

    def test_page_from_file(self):
        """Create a page from an image-typed file and check its ID and image width."""
        ocrd_file = create_ocrd_file_with_defaults(
            mimetype='image/tiff',
            local_filename=SAMPLE_IMG,
            ID='file1',
        )
        self.assertEqual(ocrd_file.mimetype, 'image/tiff')
        pcgts = page_from_file(ocrd_file)
        self.assertEqual(pcgts.pcGtsId, ocrd_file.ID)
        page = pcgts.get_Page()
        self.assertEqual(page.imageWidth, 1457)

    def test_page_from_file_page(self):
Beispiel #7
0
def _fixture_sbb_data_tmp(tmp_path):
    """Copy the 'SBB0000F29300010000' sample data into ``tmp_path`` and yield that path as a string."""
    source_dir = assets.path_to('SBB0000F29300010000/data')
    target_dir = str(tmp_path)
    copytree(source_dir, target_dir)
    yield target_dir
Beispiel #8
0
 def test_validate_filename_off(self):
     """With ``strictness='off'`` the faulty-glyphs sample reports no errors."""
     page_path = assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
     report = PageValidator.validate(filename=page_path, strictness='off')
     error_count = len(report.errors)
     self.assertEqual(error_count, 0, 'no errors')
Beispiel #9
0
from tests.base import TestCase, assets, main  # pylint: disable=import-error,no-name-in-module
from ocrd.resolver import Resolver
from ocrd_validators import PageValidator
from ocrd_validators.page_validator import get_text, set_text, ConsistencyError
from ocrd_models.ocrd_page import parse, TextEquivType
from ocrd_utils import pushd_popd

# Path of the faulty-glyphs sample, resolved via assets.path_to().
# NOTE(review): the chained assignment also binds a module-level name
# `filename` to the same value -- looks unintentional; confirm before removing.
FAULTY_GLYPH_PAGE_FILENAME = filename = assets.path_to(
    'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')


class TestPageValidator(TestCase):
    def setUp(self):
        """No per-test preparation is performed."""
        return None

    def test_validate_err(self):
        with self.assertRaisesRegex(
                Exception,
                'At least one of ocrd_page, ocrd_file or filename must be set'
        ):
            PageValidator.validate()
        with self.assertRaisesRegex(
                Exception, 'page_textequiv_strategy best not implemented'):
            PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME,
                                   page_textequiv_strategy='best')
        # test with deprecated name
        with self.assertRaisesRegex(
                Exception, 'page_textequiv_strategy best not implemented'):
            PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME,
                                   strategy='best')
        with self.assertRaisesRegex(
Beispiel #10
0
 def test_resolve_image_exif(self):
     """Resolve EXIF data of a sample image from within the sample's data directory."""
     data_dir = assets.path_to('kant_aufklaerung_1784/data/')
     with pushd_popd(data_dir):
         workspace = self.resolver.workspace_from_url('mets.xml')
         image_exif = workspace.resolve_image_exif('OCR-D-IMG/INPUT_0017.tif')
         self.assertEqual(image_exif.compression, 'jpeg')
         self.assertEqual(image_exif.width, 1457)
Beispiel #11
0
 def test_validate_filename(self):
     """Validating the faulty-glyphs sample by filename reports 17 errors."""
     page_path = assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
     report = PageValidator.validate(filename=page_path)
     error_count = len(report.errors)
     self.assertEqual(error_count, 17, '17 errors')
Beispiel #12
0
 def test_validate_page(self):
     """``page`` validation of the faulty-glyphs sample exits with 1 and reports it invalid."""
     cli_args = ['page', assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')]
     outcome = self.runner.invoke(validate_cli, cli_args)
     self.assertEqual(outcome.exit_code, 1)
     self.assertIn('<report valid="false">', outcome.stdout)
Beispiel #13
0
def _fixture_sbb(tmp_path):
    """Yield an ``OcrdMets`` loaded from a private copy of the 'SBB0000F29300010000' sample.

    The sample data is copied to ``tmp_path / 'SBB_directory'`` and the
    ``mets.xml`` inside that copy is opened.
    """
    source_dir = assets.path_to('SBB0000F29300010000/data')
    target_dir = tmp_path / 'SBB_directory'
    shutil.copytree(source_dir, target_dir)
    yield OcrdMets(filename=str(join(target_dir, 'mets.xml')))
Beispiel #14
0
def test_merge(sbb_sample_01):
    """Merging another METS with a remapped file group grows the group count from 17 to 18."""
    assert len(sbb_sample_01.file_groups) == 17
    kant_mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
    other_mets = OcrdMets(filename=kant_mets_path)
    group_mapping = {'OCR-D-IMG': 'FOO'}
    sbb_sample_01.merge(other_mets, fileGrp_mapping=group_mapping)
    assert len(sbb_sample_01.file_groups) == 18
 def test_validate_page(self):
     """``page`` validation of the faulty-glyphs sample returns code 1 and reports it invalid."""
     cli_args = ['page', assets.path_to(
         'glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')]
     exit_code, output, _unused = self.invoke_cli(validate_cli, cli_args)
     self.assertEqual(exit_code, 1)
     self.assertIn('<report valid="false">', output)
Beispiel #16
0
import numpy as np

import pytest

from tests.base import (assets, main, FIFOIO)

from ocrd_models import (OcrdFile, OcrdMets)
from ocrd_models.ocrd_page import parseString
from ocrd_models.ocrd_page import TextRegionType, CoordsType, AlternativeImageType
from ocrd_utils import polygon_mask, xywh_from_polygon, bbox_from_polygon, points_from_polygon
from ocrd_modelfactory import page_from_file
from ocrd.resolver import Resolver
from ocrd.workspace import Workspace

# Presumably a scratch directory for these tests; hard-coded below /tmp.
# NOTE(review): not referenced in the visible part of the file -- confirm it is still used.
TMP_FOLDER = '/tmp/test-core-workspace'
# METS file of the 'kant_aufklaerung_1784' sample, resolved via assets.path_to().
SRC_METS = assets.path_to('kant_aufklaerung_1784/data/mets.xml')

# File group, file ID and the '<group>/<ID>.tif' relative path built from them.
SAMPLE_FILE_FILEGRP = 'OCR-D-IMG'
SAMPLE_FILE_ID = 'INPUT_0017'
SAMPLE_FILE_URL = join(SAMPLE_FILE_FILEGRP, '%s.tif' % SAMPLE_FILE_ID)


def copytree(src, dst, *args, **kwargs):
    """Replace ``dst`` with a fresh copy of the directory tree ``src``.

    Any existing tree at ``dst`` is removed first, then ``src`` is copied
    there. A ``dst`` that does not exist yet is accepted as well, so the
    helper can also be used for a first-time copy.

    Additional positional and keyword arguments are forwarded unchanged to
    the underlying ``copytree_`` call.
    """
    try:
        rmtree(dst)
    except FileNotFoundError:
        # Nothing to clear away: the destination has not been created yet.
        pass
    copytree_(src, dst, *args, **kwargs)


def count_files(d):
    """Return the number of files found below ``d``, counted over the whole tree walk."""
    total = 0
    for _dirpath, _dirnames, filenames in walk(d):
        total += len(filenames)
    return total

Beispiel #17
0
 def test_remove_page_after_remove_file(self):
     """Removing file 'FILE_0005_IMAGE' drops 'PHYS_0005' from the physical pages."""
     sample_data = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(sample_data) as tempdir:
         mets_path = join(tempdir, 'mets.xml')
         mets = OcrdMets(filename=mets_path)
         pages_before = ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
         self.assertEqual(mets.physical_pages, pages_before)
         mets.remove_one_file('FILE_0005_IMAGE')
         pages_after = ['PHYS_0001', 'PHYS_0002']
         self.assertEqual(mets.physical_pages, pages_after)
Beispiel #18
0
 def test_remove_file_group_rmdir(self):
     """Recursively removing file group 'OCR-D-IMG' also removes its directory."""
     sample_data = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(sample_data) as workdir:
         group_dir = join(workdir, 'OCR-D-IMG')
         workspace = Workspace(self.resolver, directory=workdir)
         self.assertTrue(exists(group_dir))
         workspace.remove_file_group('OCR-D-IMG', recursive=True)
         self.assertFalse(exists(group_dir))