def test_resolve_image_grayscale(self):
    """Resolve the OCR-D-IMG-NRM page image whole and cropped, checking both sizes."""
    img_url = assets.url_of(
        'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    workspace = self.resolver.workspace_from_url(mets_url)

    # Without coordinates the whole image is returned.
    full_image = workspace.resolve_image_as_pil(img_url)
    self.assertEqual(full_image.size, (1457, 2083))

    # With coordinates [[0, 0], [1, 1]] a 1x1 image is expected.
    cropped_image = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
    self.assertEqual(cropped_image.size, (1, 1))
def test_copies_ok(self):
    """Run DummyProcessor twice on a workspace copy and check the files it adds.

    The first run reads 'OCR-D-IMG' and writes 'OUTPUT'; the second run
    reads 'OUTPUT' and writes 'OUTPUT2'.
    """
    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)

        # Starting state: three input images, nothing in OUTPUT yet.
        source_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        self.assertEqual(len(source_files), 3)
        produced = workspace.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(produced), 0)

        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace,
        )

        produced = workspace.mets.find_files(fileGrp='OUTPUT')
        produced.sort(key=lambda x: x.url)
        print([str(s) for s in produced])

        # Sorted by URL, the first two entries are the .tif and the .xml of page 0001.
        first_image, first_page = produced[0], produced[1]
        self.assertEqual(first_image.url, 'OUTPUT/OUTPUT_0001.tif')
        self.assertEqual(first_page.url, 'OUTPUT/OUTPUT_0001.xml')
        self.assertEqual(page_from_file(first_page).pcGtsId, first_page.ID)
        self.assertEqual(
            page_from_file(first_page).get_Page().imageFilename, first_image.url)

        self.assertEqual(len(produced), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
        self.assertEqual(
            len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)

        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace,
        )

        second_pass = workspace.mets.find_files(fileGrp='OUTPUT2')
        second_pass.sort(key=lambda x: x.url)
        self.assertEqual(len(second_pass), 3)
def test_validate_ocrd_file(self):
    """Validate the FAULTY_GLYPHS_FILE page and expect 17 ConsistencyError entries."""
    mets_url = assets.url_of('glyph-consistency/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        # Only consistency errors are counted; other error types are ignored.
        consistency_errors = [
            err for err in report.errors if isinstance(err, ConsistencyError)
        ]
        self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
def test_resolve_image_bitonal(self):
    """Resolve the OCR-D-IMG-1BIT page image whole and cropped, checking both sizes."""
    img_url = assets.url_of(
        'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
    workspace = self.resolver.workspace_from_url(METS_HEROLD)

    # Without coordinates the whole image is returned.
    full_image = workspace._resolve_image_as_pil(img_url)
    self.assertEqual(full_image.size, (1457, 2083))

    # With coordinates [[0, 0], [1, 1]] a 1x1 image is expected.
    cropped_image = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
    self.assertEqual(cropped_image.size, (1, 1))
def test_check_file_grp_basic(self):
    """Check the error reported by check_file_grp for bad input/output fileGrps."""
    workspace = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))

    # (input fileGrp, output fileGrp, the single expected error message)
    failing_cases = [
        ('foo', 'bar', "Input fileGrp[@USE='foo'] not in METS!"),
        ('OCR-D-IMG', 'OCR-D-IMG-BIN',
         "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!"),
        ('OCR-D-IMG,FOO', 'FOO', "Input fileGrp[@USE='FOO'] not in METS!"),
        ('OCR-D-IMG,FOO', None, "Input fileGrp[@USE='FOO'] not in METS!"),
    ]
    for input_grp, output_grp, expected_error in failing_cases:
        report = WorkspaceValidator.check_file_grp(workspace, input_grp, output_grp)
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], expected_error)

    # No input group and an empty output group is reported as valid.
    report = WorkspaceValidator.check_file_grp(workspace, None, '')
    self.assertTrue(report.is_valid)
def test_check_file_grp_page_id_valid(self):
    """check_file_grp with page_id='PHYS_0004' is expected to report a valid result."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    workspace = self.resolver.workspace_from_url(mets_url)
    report = WorkspaceValidator.check_file_grp(
        workspace,
        'OCR-D-IMG',
        'OCR-D-IMG-BIN',
        page_id='PHYS_0004',
    )
    self.assertTrue(report.is_valid)
def test_no_input_file_grp(self):
    """Accessing input_files without an input fileGrp must raise an exception."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    processor = run_processor(
        DummyProcessor,
        resolver=self.resolver,
        mets_url=mets_url,
    )
    # The attribute access itself is what is expected to raise.
    with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
        _ = processor.input_files
def test_validate_ocrd_file(self):
    """Validate the FAULTY_GLYPHS_FILE page and expect 17 reported errors."""
    mets_url = assets.url_of('glyph-consistency/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
    # Download the file first when it has no local filename yet.
    if not faulty_file.local_filename:
        workspace.download_file(faulty_file)
    report = PageValidator.validate(ocrd_file=faulty_file)
    error_count = len(report.errors)
    self.assertEqual(error_count, 17, 'errors')
def test_validate_twice(self):
    """Run _validate() twice on one validator; the second report must be valid."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
    validator = WorkspaceValidator(self.resolver, mets_url, download=True)
    # The first result is deliberately discarded; only the repeated run is checked.
    validator._validate()  # pylint: disable=protected-access
    second_report = validator._validate()  # pylint: disable=protected-access
    self.assertTrue(second_report.is_valid)
def test_with_mets_url_input_files(self):
    """A processor built from a METS URL exposes 20 input files, all of PAGE mimetype."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    processor = run_processor(
        DummyProcessor,
        resolver=self.resolver,
        mets_url=mets_url,
    )
    self.assertEqual(len(processor.input_files), 20)
    is_page = [f.mimetype == MIMETYPE_PAGE for f in processor.input_files]
    self.assertTrue(all(is_page))
def test_validate_ocrd_file(self):
    """Validate the FAULTY_GLYPHS_FILE page from inside the workspace directory."""
    mets_url = assets.url_of('glyph-consistency/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        error_count = len(report.errors)
        self.assertEqual(error_count, 17, 'errors')
def test_resolve_image_bitonal(self):
    """Resolve a workspace-relative OCR-D-IMG-1BIT image whole and cropped."""
    mets_url = pjoin(assets.url_of('kant_aufklaerung_1784-binarized'), 'data/mets.xml')
    workspace = self.resolver.workspace_from_url(mets_url)
    img_url = 'OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png'

    # Without coordinates the whole image is returned.
    full_image = workspace._resolve_image_as_pil(img_url)
    self.assertEqual(full_image.size, (1457, 2083))

    # With coordinates [[0, 0], [1, 1]] a 1x1 image is expected.
    cropped_image = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
    self.assertEqual(cropped_image.size, (1, 1))
def test_run_cli(self):
    """Call run_cli with the 'echo' executable, once fully specified and once minimal."""
    with TemporaryDirectory() as tempdir:
        # First call: every keyword argument passed explicitly.
        full_options = {
            'mets_url': assets.url_of('SBB0000F29300010000/data/mets.xml'),
            'resolver': Resolver(),
            'workspace': None,
            'page_id': 'page1',
            'log_level': 'DEBUG',
            'input_file_grp': 'INPUT',
            'output_file_grp': 'OUTPUT',
            'parameter': '/path/to/param.json',
            'working_dir': tempdir,
        }
        run_cli('echo', **full_options)

        # Second call: only the METS URL and a resolver.
        minimal_options = {
            'mets_url': assets.url_of('SBB0000F29300010000/data/mets.xml'),
            'resolver': Resolver(),
        }
        run_cli('echo', **minimal_options)
def test_resolve_image_as_pil_deprecated():
    """Calling resolve_image_as_pil must emit exactly one DeprecationWarning."""
    assets_root = assets.url_of('kant_aufklaerung_1784-binarized')
    mets_path = os.path.join(assets_root, 'data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_path)

    with pytest.warns(DeprecationWarning) as caught:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # One warning, carrying the deprecation notice for this method.
    assert len(caught) == 1
    warning_text = str(caught[0].message)
    assert 'Call to deprecated method resolve_image_as_pil.' in warning_text
def test_check_file_grp_page_id_list(self):
    """check_file_grp with page_id given as a list must report exactly one error."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    workspace = self.resolver.workspace_from_url(mets_url)
    requested_pages = ['PHYS_0003', 'PHYS_0001']
    report = WorkspaceValidator.check_file_grp(
        workspace,
        'OCR-D-IMG',
        'OCR-D-IMG-BIN',
        page_id=requested_pages,
    )
    self.assertFalse(report.is_valid)
    self.assertEqual(len(report.errors), 1)
def test_run1(self):
    """Run KrakenSegment at line level on the binarized Kant workspace and save the METS."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
    segmenter = KrakenSegment(
        workspace,
        input_file_grp="OCR-D-IMG-BIN",
        output_file_grp="OCR-D-SEG-LINE-KRAKEN",
        parameter={'level-of-operation': 'line'},
    )
    segmenter.process()
    workspace.save_mets()
def test_param_json(self):
    """Run KrakenOcr via run_processor on the one-file workspace and save the METS."""
    resolver = Resolver()
    mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
    workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
    run_processor(
        KrakenOcr,
        resolver=resolver,
        workspace=workspace,
        input_file_grp="INPUT",
        output_file_grp="OCR-D-OCR-KRAKEN",
    )
    workspace.save_mets()
def test_parameter_url(self):
    """Pass the parameters as a file:// URL pointing at an empty JSON object."""
    with TemporaryDirectory() as tempdir:
        jsonpath = join(tempdir, 'params.json')
        with open(jsonpath, 'w') as param_file:
            param_file.write('{}')
        parameter_url = 'file://%s' % jsonpath
        processor = run_processor(
            DummyProcessor,
            parameter=parameter_url,
            resolver=self.resolver,
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
        )
        self.assertEqual(len(processor.input_files), 35)
def test_parameter(self):
    """Pass the parameters as a dict loaded from a JSON file written by the test."""
    with TemporaryDirectory() as tempdir:
        jsonpath = join(tempdir, 'params.json')
        with open(jsonpath, 'w') as param_file:
            param_file.write('{"baz": "quux"}')
        # Read the file back so the processor receives the parsed parameters.
        with open(jsonpath, 'r') as param_file:
            processor = run_processor(
                DummyProcessor,
                parameter=json.load(param_file),
                resolver=self.resolver,
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
            )
        self.assertEqual(len(processor.input_files), 20)
def test_check_file_grp_page_id_str(self):
    """check_file_grp with a comma-separated page_id string reports the existing page."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    workspace = self.resolver.workspace_from_url(mets_url)
    report = WorkspaceValidator.check_file_grp(
        workspace,
        'OCR-D-IMG',
        'OCR-D-IMG-BIN',
        page_id='PHYS_0003,PHYS_0001',
    )
    self.assertFalse(report.is_valid)
    self.assertEqual(len(report.errors), 1)
    expected_error = (
        "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
    )
    self.assertEqual(report.errors[0], expected_error)
def testProcessorProfiling(self):
    """Check that running a processor logs its execution time.

    A stream handler writing into a FIFOIO buffer is attached to the
    'ocrd.process.profile' logger, DummyProcessor is run, and the captured
    text is matched against the expected "took 0.<digits>s" message.
    """
    initLogging()
    log_capture_string = FIFOIO(256)
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(logging.Formatter(LOG_FORMAT))
    profile_logger = getLogger('ocrd.process.profile')
    profile_logger.setLevel('DEBUG')
    profile_logger.addHandler(ch)

    try:
        run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
        log_contents = log_capture_string.getvalue()
    finally:
        # Detach the handler before closing its stream; otherwise later
        # records on this logger would be written to a closed buffer.
        profile_logger.removeHandler(ch)
        log_capture_string.close()

    # with open('/tmp/debug.log', 'w') as f:
    #     f.write(log_contents)
    # Check whether profile information has been logged. The pattern accepts
    # any duration below one second ("0.<digits>s"); the dot is escaped so it
    # only matches a literal decimal point.
    self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0\.\d+s.*', log_contents))
def setUp(self):
    """Run the parent setUp, then load the METS document used by the tests."""
    super().setUp()
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    self.mets = OcrdMets(filename=mets_url)
import os
from os.path import join, exists
from shutil import copytree, rmtree
from re import sub
from tempfile import TemporaryDirectory

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd.workspace import Workspace

# from ocrd_utils.logging import setOverrideLogLevel
# setOverrideLogLevel('DEBUG')

# Scratch directory that setUp() removes and recreates.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'
METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml')
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')
TEST_ZIP = assets.path_to('test.ocrd.zip')

# Working directory at import time.
oldpwd = os.getcwd()

# pylint: disable=redundant-unittest-assert, broad-except, deprecated-method, too-many-public-methods
class TestResolver(TestCase):
    """Tests around Resolver, working on a copy of the Kant assets in TMP_FOLDER."""

    def setUp(self):
        """Create a resolver and copy FOLDER_KANT into a freshly emptied TMP_FOLDER."""
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
        # Remove anything left in the scratch directory before copying.
        if exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)
def setUp(self):
    """Create a resolver and open the SBB0000F29300010000 workspace from its METS URL."""
    mets_resolver = Resolver()
    self.resolver = mets_resolver
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    self.workspace = mets_resolver.workspace_from_url(mets_url)
# pylint: disable=import-error import os import shutil from tests.base import TestCase, assets, main from ocrd import Resolver from ocrd_kraken.binarize import KrakenBinarize from ocrd_utils.logging import setOverrideLogLevel setOverrideLogLevel('DEBUG') PARAM_JSON = assets.url_of('param-binarize.json') WORKSPACE_DIR = '/tmp/ocrd-kraken-binarize-test' class TestKrakenBinarize(TestCase): def setUp(self): if os.path.exists(WORKSPACE_DIR): shutil.rmtree(WORKSPACE_DIR) os.makedirs(WORKSPACE_DIR) # def test_param_json(self): # resolver = Resolver() # workspace = resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'), dst_dir=WORKSPACE_DIR) # run_processor( # KrakenBinarize, # resolver=resolver, # workspace=workspace,
def setUp(self):
    """Load the METS document used by the tests, then initialise logging."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    self.mets = OcrdMets(filename=mets_url)
    initLogging()
# pylint: disable=import-error

import os
import shutil

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd_kraken.segment import KrakenSegment

PARAM_JSON = assets.url_of('param-segment.json')

# Scratch directory that setUp() removes and recreates.
WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test'

class TestKrakenSegment(TestCase):
    """Runs KrakenSegment against a workspace cloned into WORKSPACE_DIR."""

    def setUp(self):
        """Recreate WORKSPACE_DIR as an empty directory."""
        leftover = os.path.exists(WORKSPACE_DIR)
        if leftover:
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def test_run1(self):
        """Run KrakenSegment at line level on the binarized Kant workspace and save the METS."""
        mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
        workspace = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
        segmenter = KrakenSegment(
            workspace,
            input_file_grp="OCR-D-IMG-BIN",
            output_file_grp="OCR-D-SEG-LINE-KRAKEN",
            parameter={'level-of-operation': 'line'},
        )
        segmenter.process()
        workspace.save_mets()
def test_resolve_image_as_pil(image_url, size_pil):
    """Resolve image_url with coordinates [[0, 0], [1, 1]] and compare its size to size_pil."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    crop_coords = [[0, 0], [1, 1]]
    cropped_image = workspace._resolve_image_as_pil(image_url, crop_coords)
    assert cropped_image.size == size_pil
def test_simple(self):
    """WorkspaceValidator.validate on the one-file METS (download=True) must be valid."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
    report = WorkspaceValidator.validate(self.resolver, mets_url, download=True)
    self.assertTrue(report.is_valid)
def test_with_mets_url_input_files(self):
    """A processor built from a METS URL is expected to expose 35 input files."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    processor = run_processor(
        DummyProcessor,
        resolver=self.resolver,
        mets_url=mets_url,
    )
    input_count = len(processor.input_files)
    self.assertEqual(input_count, 35)