def test_merge(self):
    """Merge one workspace into another and check the resulting file count.

    The first workspace starts with 6 files in its METS; after merging the
    second one it must list 41, and ``OCR-D-IMG/FILE_0001_IMAGE.tif`` must
    exist below the first workspace's directory.
    """
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as ws1dir:
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as ws2dir:
            ws1 = Workspace(self.resolver, ws1dir)
            ws2 = Workspace(self.resolver, ws2dir)

            files_before_merge = ws1.mets.find_all_files()
            assert len(files_before_merge) == 6

            ws1.merge(ws2)

            files_after_merge = ws1.mets.find_all_files()
            assert len(files_after_merge) == 41
            merged_image = join(ws1dir, 'OCR-D-IMG/FILE_0001_IMAGE.tif')
            assert exists(merged_image)
def test_find_all_files_multiple_physical_pages_for_fileids(self):
    """Invoke ``find --page-id`` with comma-separated page IDs and check stdout."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        # Same page given twice: stdout must be exactly one URL line.
        same_page_args = ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'url']
        result = self.runner.invoke(workspace_cli, same_page_args)
        self.assertEqual(result.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
        self.assertEqual(result.exit_code, 0)

        # Two different pages: splitting stdout on newlines must give 19 parts.
        two_pages_args = ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'url']
        result = self.runner.invoke(workspace_cli, two_pages_args)
        stdout_parts = result.stdout.split('\n')
        self.assertEqual(len(stdout_parts), 19)
def test_download_to_directory_default(self):
    """Download a METS file into a fresh directory using default options.

    The resolver must return the bare name ``mets.xml`` and that file must
    exist inside the destination directory.
    """
    with copy_of_directory(FOLDER_KANT) as src, TemporaryDirectory() as dst:
        mets_source = pjoin(src, 'data/mets.xml')
        fn = self.resolver.download_to_directory(dst, mets_source)
        self.assertEqual(fn, 'mets.xml')
        downloaded = Path(dst, fn)
        self.assertTrue(downloaded.exists())
def test_copies_ok(self):
    """Run ``DummyProcessor`` twice in a row and check the files it registers.

    First pass: ``OCR-D-IMG`` -> ``OUTPUT`` (6 files expected, 3 of them PAGE).
    Second pass: ``OUTPUT`` -> ``OUTPUT2`` (3 files expected).
    """
    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)

        # Starting point: three input files, empty output group.
        self.assertEqual(len(workspace.mets.find_files(fileGrp='OCR-D-IMG')), 3)
        self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT')), 0)

        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace,
        )

        # Sort by URL so the positional checks below are deterministic.
        output_files = sorted(workspace.mets.find_files(fileGrp='OUTPUT'), key=lambda f: f.url)
        print([str(s) for s in output_files])
        self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_0001.tif')
        self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_0001.xml')
        # The PAGE file must carry its own file ID and point at the image next to it.
        self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID)
        self.assertEqual(
            page_from_file(output_files[1]).get_Page().imageFilename,
            output_files[0].url,
        )
        self.assertEqual(len(output_files), 6)

        by_id_regex = workspace.mets.find_files(ID='//OUTPUT.*')
        self.assertEqual(len(by_id_regex), 6)
        by_page_suffix = workspace.mets.find_files(ID='//OUTPUT.*_PAGE')
        self.assertEqual(len(by_page_suffix), 3)
        by_mimetype = workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)
        self.assertEqual(len(by_mimetype), 3)

        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace,
        )
        output2_files = sorted(workspace.mets.find_files(fileGrp='OUTPUT2'), key=lambda f: f.url)
        self.assertEqual(len(output2_files), 3)
def test_remove_file_ocrdfile(self):
    """Remove a file by passing its file object to ``remove_one_file``.

    After ``FILE_0005_IMAGE`` is removed, ``PHYS_0005`` must no longer be
    listed among the physical pages.
    """
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets_path = join(tempdir, 'mets.xml')
        mets = OcrdMets(filename=mets_path)
        self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])

        matches = mets.find_all_files(ID='FILE_0005_IMAGE')
        mets.remove_one_file(matches[0])

        self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])
def test_physical_pages_for_fileids(self):
    """Look up the physical page of ``FILE_0002_IMAGE``; expect ``PHYS_0002``."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
        pages = mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
        self.assertEqual(pages, ['PHYS_0002'])
def test_remove_file_group_rmdir(self):
    """Recursively remove ``OCR-D-IMG`` and check that its directory is gone."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)
        group_dir = join(tempdir, 'OCR-D-IMG')
        self.assertTrue(exists(group_dir))
        workspace.remove_file_group('OCR-D-IMG', recursive=True)
        self.assertFalse(exists(group_dir))
def test_remove_file_page_recursive_same_group(self):
    """Remove a file with ``page_recursive`` limited to its own file group.

    With ``page_same_group=True`` the file count reported by ``count_files()``
    must drop by exactly one.
    """
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
        with pushd_popd(tempdir):
            ws = Workspace(self.resolver, directory=tempdir)
            before = count_files()
            ws.remove_file(
                'OCR-D-IMG-BINPAGE-sauvola_0001',
                page_recursive=True,
                page_same_group=True,
                force=False,
            )
            after = count_files()
            # The failure message used to claim two deletions although the
            # assertion checks for one; keep message and expectation in sync.
            self.assertEqual(after, before - 1, '1 file deleted')
def test_remove_file_page_recursive(self):
    """Remove a file with ``page_recursive`` across file groups.

    The METS starts with 119 files and must list 83 after the first removal.
    A second recursive removal of ``PAGE_0017_ALTO`` is then issued and only
    has to complete without raising.
    """
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir, \
            pushd_popd(tempdir):
        ws = Workspace(self.resolver, directory=tempdir)
        self.assertEqual(len(ws.mets.find_files()), 119)

        ws.remove_file(
            'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
            page_recursive=True,
            page_same_group=False,
            keep_file=True,
        )
        self.assertEqual(len(ws.mets.find_files()), 83)

        ws.remove_file('PAGE_0017_ALTO', page_recursive=True)
def test_cli_process_smoke(self):
    """Running ``process foo`` must raise because ``ocrd-foo`` is not in PATH."""
    disableLogging()
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir, \
            pushd_popd(wsdir):
        expected_message = "Executable not found in PATH: ocrd-foo"
        with self.assertRaisesRegex(Exception, expected_message):
            self.invoke_cli(process_cli, ['foo'])
def test_remove_file_regex(self):
    """Remove files by the ID pattern ``//FILE_0005.*``.

    Afterwards ``PHYS_0005`` must no longer be listed among the physical pages.
    """
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
        pages_before = ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
        self.assertEqual(mets.physical_pages, pages_before)
        mets.remove_file('//FILE_0005.*')
        pages_after = ['PHYS_0001', 'PHYS_0002']
        self.assertEqual(mets.physical_pages, pages_after)
def test_parameter_override_wo_param(self):
    """Pass ``-P baz two`` without any ``-p`` and check the printed parameter JSON."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir, \
            pushd_popd(tempdir):
        cli_args = ['-P', 'baz', 'two', *DEFAULT_IN_OUT]
        _code, out, _err = self.invoke_cli(cli_dummy_processor, cli_args)
        print(out)
        self.assertEqual(out, '{"baz": "two"}\n')
def test_processor_run(self):
    """Invoke the dummy processor CLI with a JSON parameter; expect exit code 0."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir, \
            pushd_popd(tempdir):
        cli_args = ['-p', '{"foo": 42}', '--mets', 'mets.xml']
        result = self.runner.invoke(cli_dummy_processor, cli_args)
        self.assertEqual(result.exit_code, 0)
def test_remove_file_group_force(self):
    """Removing an unknown file group raises unless ``force=True`` is given."""
    missing_group = 'I DO NOT EXIST'
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)
        # Without force the missing group is reported as an error ...
        with self.assertRaisesRegex(Exception, "No such fileGrp"):
            workspace.remove_file_group(missing_group)
        # ... whereas with force the call goes through.
        workspace.remove_file_group(missing_group, force=True)
def test_remove_file_force(self):
    """Removing an unknown file ID raises ``FileNotFoundError`` unless forced."""
    missing_id = 'non-existing-id'
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)
        # Expected to fail: no such file and no force flag.
        with self.assertRaisesRegex(FileNotFoundError, "not found"):
            workspace.remove_file(missing_id)
        # Expected to succeed: force suppresses the error.
        workspace.remove_file(missing_id, force=True)
def test_rename_file_group0(self):
    """Rename a file group in the METS.

    Renaming an unknown group must raise ``FileNotFoundError``; renaming
    ``OCR-D-GT-PAGE`` to ``FOOBAR`` must replace the old name with the new one
    in ``file_groups``.
    """
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets = OcrdMets(filename=join(tempdir, 'mets.xml'))

        # Unknown source group: error, and nothing gets created.
        with self.assertRaisesRegex(FileNotFoundError, "No such fileGrp 'FOOBAR'"):
            mets.rename_file_group('FOOBAR', 'FOOBAR')
        assert 'FOOBAR' not in mets.file_groups

        # Existing source group: old name disappears, new name appears.
        mets.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR')
        assert 'OCR-D-GT-PAGE' not in mets.file_groups
        assert 'FOOBAR' in mets.file_groups
def test_processor_run(self):
    """Invoke the dummy processor CLI with a valid parameter; expect a falsy exit code."""
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir, \
            pushd_popd(tempdir):
        cli_args = [
            '-p', '{"baz": "forty-two"}',
            '--mets', 'mets.xml',
            *DEFAULT_IN_OUT,
        ]
        exit_code, _out, _err = self.invoke_cli(cli_dummy_processor, cli_args)
        assert not exit_code
def test_remove_file_group_regex(self):
    """
    Test removal of file groups selected by the pattern ``//OCR-D-GT-.*``

    Expected counts: 17 groups / 35 files before, 15 groups / 31 files after.
    """
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets = OcrdMets(filename=join(tempdir, 'mets.xml'))

        group_count, file_count = len(mets.file_groups), len(mets.find_all_files())
        self.assertEqual(group_count, 17)
        self.assertEqual(file_count, 35)

        mets.remove_file_group('//OCR-D-GT-.*', recursive=True)

        group_count, file_count = len(mets.file_groups), len(mets.find_all_files())
        self.assertEqual(group_count, 15)
        self.assertEqual(file_count, 31)
def test_remove_file_group_force(self):
    """Removing an unknown file group raises unless forced or in overwrite mode."""
    missing_group = 'I DO NOT EXIST'
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)

        # Plain call: must fail.
        with self.assertRaisesRegex(Exception, "No such fileGrp"):
            workspace.remove_file_group(missing_group)

        # Explicit force: must succeed.
        workspace.remove_file_group(missing_group, force=True)

        # Overwrite mode on the workspace: must succeed even with force=False.
        workspace.overwrite_mode = True
        workspace.remove_file_group(missing_group, force=False)
def test_pcgtsid(self):
    """Check the validator warning for a ``pcGtsId`` that differs from the file ID.

    The PAGE file ``OCR-D-GT-PAGE/PAGE_0017_PAGE.xml`` is patched so that its
    ``pcGtsId`` attribute reads ``foo``; validating the workspace must then
    report the mismatch with ``PAGE_0017_PAGE`` among the warnings.
    """
    import re  # local import: only needed for the in-place patch below

    with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
        with pushd_popd(wsdir):
            # Overwrite the @pcGtsId attribute for testing. This is done
            # in-process rather than via `sed -i`, which behaves differently on
            # BSD/macOS and whose failure os.system() would silently ignore.
            # The substitution mirrors sed's `s,pcGtsId.*,pcGtsId="foo">,`:
            # `.` does not cross newlines, so on each line everything from the
            # first `pcGtsId` to the end of the line is replaced.
            page_path = 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'
            with open(page_path, 'rb') as page_file:
                page_bytes = page_file.read()
            page_bytes = re.sub(rb'pcGtsId.*', b'pcGtsId="foo">', page_bytes)
            with open(page_path, 'wb') as page_file:
                page_file.write(page_bytes)

            report = WorkspaceValidator.validate(self.resolver, join(wsdir, 'mets.xml'))
            self.assertIn(
                'pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"',
                report.warnings,
            )
def test_rename_file_group(self):
    """Rename ``OCR-D-IMG`` to ``FOOBAR`` at workspace level.

    Checks that the ``imageFilename`` in the PAGE file ``OCR-D-GT-SEG-WORD_0001``
    points at the new group afterwards and that the image exists under the new
    path but no longer under the old one.
    """
    page_id = 'OCR-D-GT-SEG-WORD_0001'
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data')) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)
        with pushd_popd(tempdir):
            page_file_before = next(workspace.mets.find_files(ID=page_id))
            pcgts_before = page_from_file(page_file_before)
            assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

            workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')

            page_file_after = next(workspace.mets.find_files(ID=page_id))
            pcgts_after = page_from_file(page_file_after)
            assert pcgts_after.get_Page().imageFilename == 'FOOBAR/OCR-D-IMG_0001.tif'
            # Paths are relative: the working directory is the workspace here.
            assert Path('FOOBAR/OCR-D-IMG_0001.tif').exists()
            assert not Path('OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
def test_crop(self):
    """Run ``OcrdAnybaseocrDewarper`` on ``BIN`` and expect one more PAGE file.

    Skipped when CUDA is unavailable.
    NOTE(review): despite the method name, this exercises the dewarper.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        pytest.skip('CUDA is not available, cannot test dewarping')
    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)
        pagexml_before = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        run_processor(
            OcrdAnybaseocrDewarper,
            resolver=self.resolver,
            mets_url=str(Path(wsdir, 'mets.xml')),
            input_file_grp='BIN',
            output_file_grp='DEWARP-TEST',
            parameter={'model_path': str(self.model_path)},
        )
        # Re-read the METS before counting PAGE files again.
        ws.reload_mets()
        pagexml_after = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        self.assertEqual(pagexml_after, pagexml_before + 1)
def test_crop(self):
    """Run ``OcrdAnybaseocrCropper`` on ``BIN`` and expect one more PAGE file."""
    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)
        pagexml_before = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))

        mets_location = str(Path(wsdir, 'mets.xml'))
        run_processor(
            OcrdAnybaseocrCropper,
            resolver=self.resolver,
            mets_url=mets_location,
            input_file_grp='BIN',
            output_file_grp='CROP-TEST',
            parameter={},
        )

        # Re-read the METS before counting PAGE files again.
        ws.reload_mets()
        pagexml_after = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        self.assertEqual(pagexml_after, pagexml_before + 1)
def test_remove_file_group0(self):
    """
    Test removal of filegrp

    A non-recursive removal of ``OCR-D-GT-ALTO`` must raise ("not empty");
    the recursive removal of ``OCR-D-GT-PAGE`` must leave 16 groups and 33 files.
    """
    with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
        mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
        self.assertEqual(len(mets.file_groups), 17)
        self.assertEqual(len(mets.find_all_files()), 35)

        # Refused without recursive=True.
        with self.assertRaisesRegex(Exception, "not empty"):
            mets.remove_file_group('OCR-D-GT-ALTO')

        mets.remove_file_group('OCR-D-GT-PAGE', recursive=True)

        remaining_groups = len(mets.file_groups)
        remaining_files = len(mets.find_all_files())
        self.assertEqual(remaining_groups, 16)
        self.assertEqual(remaining_files, 33)
def test_copy_vs_clone(self):
    """Compare a shallow clone, a full clone (``-a``) and a plain directory copy.

    The shallow clone must lack exactly the three file-group directories that
    the copy has; the full clone must have the same entries as the copy.
    """
    src_dir = assets.path_to('kant_aufklaerung_1784/data')
    src_mets = join(src_dir, 'mets.xml')
    with TemporaryDirectory() as tempdir:
        shallowcloneddir = join(tempdir, 'cloned-shallow')  # cloned without download
        fullcloneddir = join(tempdir, 'cloned-all')  # cloned with download
        copieddir = join(tempdir, 'copied')  # copied
        Path(fullcloneddir).mkdir()
        Path(shallowcloneddir).mkdir()

        result = self.runner.invoke(workspace_cli, ['clone', src_mets, shallowcloneddir])
        self.assertEqual(result.exit_code, 0)

        result = self.runner.invoke(workspace_cli, ['clone', '-a', src_mets, fullcloneddir])
        self.assertEqual(result.exit_code, 0)

        with copy_of_directory(src_dir, copieddir):
            shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
            only_in_copy = set(shallow_vs_copied.right_only)
            self.assertEqual(only_in_copy, {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

            full_vs_copied = dircmp(fullcloneddir, copieddir)
            # XXX mets.xml will not have the exact same content because
            # URLs that are actually files will be marked up as such with
            # @LOCTYPE/@OTHERLOCTYPE, so diff_files is deliberately not checked.
            self.assertEqual(full_vs_copied.left_only, [])
            self.assertEqual(full_vs_copied.right_only, [])
def test_task_run(self):
    """Run a two-step ``dummy`` task chain and check the number of added files."""
    resolver = Resolver()
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir, \
            pushd_popd(wsdir):
        ws = resolver.workspace_from_url('mets.xml')
        # Register one extra, empty PAGE file and persist it before counting.
        ws.add_file(
            'GRP0',
            content='',
            local_filename='GRP0/foo',
            ID='file0',
            mimetype=MIMETYPE_PAGE,
            pageId=None,
        )
        ws.save_mets()
        files_before = len(ws.mets.find_files())

        tasks = [
            "dummy -I OCR-D-IMG -O GRP1",
            "dummy -I GRP1 -O GRP2",
        ]
        run_tasks('mets.xml', 'DEBUG', None, tasks)
        ws.reload_mets()

        # step 1: 2 images in OCR-D-IMG -> 2 images 2 PAGEXML in GRP1
        # step 2: 2 images and 2 PAGEXML in GRP1 -> process just the PAGEXML
        files_after = len(ws.mets.find_files())
        self.assertEqual(files_after, files_before + 6)