def test_check_file_grp_basic(self):
    """check_file_grp flags unknown input groups and pre-existing output groups."""
    ws = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))
    # unknown input file group
    result = WorkspaceValidator.check_file_grp(ws, 'foo', 'bar')
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
    self.assertEqual(result.errors[0], "Input fileGrp[@USE='foo'] not in METS!")
    # output file group already present in the METS
    result = WorkspaceValidator.check_file_grp(ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN')
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
    self.assertEqual(
        result.errors[0],
        "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!")
    # comma-separated input list with one unknown member
    result = WorkspaceValidator.check_file_grp(ws, 'OCR-D-IMG,FOO', 'FOO')
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
    self.assertEqual(result.errors[0], "Input fileGrp[@USE='FOO'] not in METS!")
    # same, but with no output group given at all
    result = WorkspaceValidator.check_file_grp(ws, 'OCR-D-IMG,FOO', None)
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
    self.assertEqual(result.errors[0], "Input fileGrp[@USE='FOO'] not in METS!")
    # nothing to check is trivially valid
    result = WorkspaceValidator.check_file_grp(ws, None, '')
    self.assertTrue(result.is_valid)
def test_dimensions(self):
    """A PAGE @imageHeight that disagrees with the image's real height is an
    error, unless the 'dimension' check is skipped."""
    with TemporaryDirectory() as tempdir:
        wsdir = join(tempdir, 'foo')
        # work on a throwaway copy of the asset workspace
        copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
        with pushd_popd(wsdir):
            # corrupt @imageHeight so it no longer matches the actual image (2083)
            os.system(
                """sed -i 's,imageHeight="2083",imageHeight="1234",' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
            )
            # first pass: dimension check active -> exactly one error expected
            report = WorkspaceValidator.validate(
                self.resolver,
                join(wsdir, 'mets.xml'),
                src_dir=wsdir,
                skip=[
                    'page', 'mets_unique_identifier', 'mets_file_group_names',
                    'mets_files', 'pixel_density', 'imagefilename', 'page_xsd',
                    'mets_xsd'
                ],
                download=True)
            self.assertIn(
                "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)",
                report.errors)
            # print(report.errors)
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(report.is_valid, False)
            # second pass: additionally skip 'dimension' -> report becomes valid
            report2 = WorkspaceValidator.validate(
                self.resolver,
                join(wsdir, 'mets.xml'),
                src_dir=wsdir,
                skip=[
                    'page', 'mets_unique_identifier', 'mets_file_group_names',
                    'mets_files', 'pixel_density', 'imagefilename', 'dimension',
                    'page_xsd', 'mets_xsd'
                ],
                download=False)
            self.assertEqual(report2.is_valid, True)
def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
    """Validate a chain of processing tasks against a workspace.

    The first task's input file groups must exist in the METS; every later
    task's inputs must either exist in the METS or be produced by a previous
    task. Output file groups must not already exist unless ``overwrite`` is set.

    Args:
        tasks: non-empty sequence of task objects with ``validate()``,
            ``input_file_grps`` and ``output_file_grps``.
        workspace: the Workspace whose METS is checked.
        page_id: optional page selector passed through to check_file_grp.
        overwrite (bool): if True, skip the "output group already exists" checks.

    Returns:
        the accumulated ValidationReport (only if it is valid).

    Raises:
        Exception: if any input/output file group check failed.
    """
    report = ValidationReport()
    # Copy the list: workspace.mets.file_groups may be a live list owned by
    # the METS object, and the += below would otherwise mutate it in place.
    prev_output_file_grps = list(workspace.mets.file_groups)
    first_task = tasks[0]
    first_task.validate()

    # first task: check input/output file groups from METS
    WorkspaceValidator.check_file_grp(
        workspace, first_task.input_file_grps,
        '' if overwrite else first_task.output_file_grps, page_id, report)

    prev_output_file_grps += first_task.output_file_grps
    for task in tasks[1:]:
        task.validate()
        # check either existing fileGrp or output-file group of previous task matches current input_file_group
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in prev_output_file_grps:
                report.add_error(
                    "Input file group not contained in METS or produced by previous steps: %s"
                    % input_file_grp)
        if not overwrite:
            WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
        # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented
        # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever.
        #  if len(prev_output_file_grps) != len(set(prev_output_file_grps)):
        #      report.add_error("Output file group specified multiple times: %s" %
        #          [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2])
        prev_output_file_grps += task.output_file_grps
    if not report.is_valid:
        raise Exception("Invalid task sequence input/output file groups: %s" %
                        report.errors)
    return report
def test_validate_twice(self):
    """Running _validate a second time on the same validator still succeeds."""
    validator = WorkspaceValidator(
        self.resolver,
        assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
        download=True)
    report = validator._validate()  # pylint: disable=protected-access
    # second run must not fail or change the verdict
    report = validator._validate()  # pylint: disable=protected-access
    self.assertTrue(report.is_valid)
def test_validate_empty(self):
    """A freshly created workspace reports a missing identifier and no files."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        mets_path = join(tempdir, 'mets.xml')
        report = WorkspaceValidator.validate(self.resolver, mets_path)
        self.assertEqual(len(report.errors), 2)
        self.assertIn('no unique identifier', report.errors[0])
        self.assertIn('No files', report.errors[1])
        # supplying an identifier removes one of the two errors
        ws.mets.unique_identifier = 'foobar'
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver, mets_path)
        self.assertEqual(len(report.errors), 1)
def test_src_dir(self):
    """Validating from a bare src_dir (no METS URL) yields the expected 42 errors."""
    report = WorkspaceValidator.validate(
        self.resolver,
        None,
        src_dir=assets.path_to('kant_aufklaerung_1784/data'),
        skip=['imagefilename'],
        download=True,
    )
    errors = report.errors
    self.assertEqual(len(errors), 42)
def test_validate_weird_urls(self):
    """Non-standard file URLs produce warnings, GROUPID a notice — no errors."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        # Java-style single-slash file URL
        ws.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png',
                         pageId='page1', url='file:/java-file-url')
        # unusual non-HTTP scheme
        weird = ws.mets.add_file('OCR-D-GT-PAGE', ID='file2', mimetype='image/png',
                                 pageId='page2', url='nothttp://unusual.scheme')
        weird._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'),
                                             skip=['pixel_density'])
        self.assertEqual(len(report.errors), 0)
        self.assertEqual(len(report.warnings), 2)
        self.assertIn("Java-specific", report.warnings[0])
        self.assertIn("non-HTTP", report.warnings[1])
        self.assertEqual(len(report.notices), 1)
        self.assertIn("has GROUPID attribute", report.notices[0])
def test_imagefilename(self):
    """With only the imagefilename check active, the asset workspace is clean."""
    skip_all_but_imagefilename = [
        'page', 'mets_unique_identifier', 'mets_file_group_names',
        'mets_files', 'pixel_density'
    ]
    report = WorkspaceValidator.validate(
        self.resolver,
        None,
        src_dir=assets.path_to('kant_aufklaerung_1784/data'),
        skip=skip_all_but_imagefilename,
        download=False,
    )
    self.assertEqual(len(report.errors), 0)
def test_check_file_grp_page_id_valid(self):
    """An existing output group is fine when the requested page has no output yet."""
    ws = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))
    result = WorkspaceValidator.check_file_grp(ws,
                                               'OCR-D-IMG',
                                               'OCR-D-IMG-BIN',
                                               page_id='PHYS_0004')
    self.assertTrue(result.is_valid)
def test_validate_files_nopageid(self):
    """A file that is not linked to any physical page is an error."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        # deliberately no pageId on this file
        ws.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png')
        ws.save_mets()
        report = WorkspaceValidator.validate(
            self.resolver,
            join(tempdir, 'mets.xml'),
            skip=['pixel_density', 'imagefilename'])
        self.assertEqual(len(report.errors), 1)
        self.assertIn("does not manifest any physical page.", report.errors[0])
def test_validate_file_groups_unspecified(self):
    """A fileGrp USE with an unknown category is reported as an error."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        ws.mets.add_file_group('OCR-D-INVALID-FILEGRP')
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'))
        self.assertEqual(len(report.errors), 2)
        self.assertEqual(
            report.errors[0],
            "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'")
        self.assertIn('No files', report.errors[1])
def test_validate_file_groups_bad_name(self):
    """A fileGrp USE with an invalid name part is reported as an error."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        ws.mets.add_file_group('OCR-D-GT-X')
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'))
        self.assertEqual(len(report.errors), 2)
        self.assertIn("Invalid USE name 'X' in fileGrp", report.errors[0])
        self.assertIn('No files', report.errors[1])
def test_check_file_grp_page_id_list(self):
    """page_id may be a list; a clash on any listed page makes the check fail."""
    ws = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))
    result = WorkspaceValidator.check_file_grp(
        ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN',
        page_id=['PHYS_0003', 'PHYS_0001'])
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
def test_validate_file_groups_non_ocrd(self):
    """A non-'OCR-D-' fileGrp name is only a notice, not an error."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        ws.mets.add_file_group('FOO')
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'))
        self.assertEqual(len(report.errors), 1)
        self.assertIn('No files', report.errors[0])
        self.assertEqual(len(report.notices), 1)
        self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0])
def test_validate_pixel_no_download(self):
    """With download=False a local image validates cleanly, all checks active."""
    img_path = assets.path_to(
        'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        ws.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png',
                         pageId='page1', url=img_path)
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'),
                                             skip=[],
                                             download=False)
        self.assertEqual(len(report.errors), 0)
        self.assertEqual(len(report.warnings), 0)
        self.assertEqual(len(report.notices), 0)
def test_src_dir(self):
    """Validating from a bare src_dir yields 42 textequiv consistency errors."""
    report = WorkspaceValidator.validate(
        self.resolver,
        None,
        src_dir=assets.path_to('kant_aufklaerung_1784/data'),
        skip=['imagefilename'],
        download=True,
    )
    print(report.errors)
    consistency_errors = [
        e for e in report.errors if isinstance(e, ConsistencyError)
    ]
    self.assertEqual(len(consistency_errors), 42,
                     '42 textequiv consistency errors')
def test_pcgtsid(self):
    """A pc:PcGts/@pcGtsId that differs from the mets:file/@ID yields a warning."""
    with copy_of_directory(
            assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
        with pushd_popd(wsdir):
            # rewrite the @pcGtsId attribute to "foo" so it no longer matches
            # the mets:file/@ID "PAGE_0017_PAGE"
            os.system(
                """sed -i 's,pcGtsId.*,pcGtsId="foo">,' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
            )
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(wsdir, 'mets.xml'))
            self.assertIn(
                'pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"',
                report.warnings)
def test_check_file_grp_page_id_str(self):
    """page_id may be a comma-separated string; a clash on one page fails."""
    ws = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))
    result = WorkspaceValidator.check_file_grp(
        ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN',
        page_id='PHYS_0003,PHYS_0001')
    self.assertFalse(result.is_valid)
    self.assertEqual(len(result.errors), 1)
    self.assertEqual(
        result.errors[0],
        "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
    )
def test_skip_page(self):
    """Skipping all checks (incl. 'page') makes the asset workspace valid."""
    skipped_checks = [
        'page',
        'mets_unique_identifier',
        'mets_file_group_names',
        'mets_files',
        'pixel_density',
    ]
    report = WorkspaceValidator.validate(
        self.resolver,
        None,
        src_dir=assets.path_to('kant_aufklaerung_1784/data'),
        download=True,
        skip=skipped_checks)
    self.assertTrue(report.is_valid)
def test_validate_weird_urls(self):
    """Non-standard file URLs are reported as errors (newer, stricter behavior)."""
    with TemporaryDirectory() as tempdir:
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.mets.unique_identifier = 'foobar'
        # Java-style single-slash file URL
        ws.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png',
                         pageId='page1', url='file:/java-file-url')
        # unusual non-HTTP scheme
        weird = ws.mets.add_file('OCR-D-GT-PAGE', ID='file2', mimetype='image/png',
                                 pageId='page2', url='nothttp://unusual.scheme')
        weird._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
        ws.save_mets()
        report = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'),
                                             skip=['pixel_density'])
        assert not report.is_valid
        assert len(report.errors) == 2
        assert "invalid (Java-specific) file URL" in report.errors[0]
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, help=False, version=False, **kwargs):
    """CLI entry-point wrapper for an OCR-D processor class.

    Dispatches on the informational flags (dump_json/help/version) first;
    otherwise resolves the workspace from ``mets``, validates the
    input/output file groups, and runs the processor.

    Raises:
        Exception: if ``mets`` is missing, does not exist locally, or the
            input/output file group check fails.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')
    # informational modes: instantiate without a workspace and let the
    # processor print the requested information
    if dump_json:
        processorClass(workspace=None, dump_json=True)
    elif help:
        processorClass(workspace=None, show_help=True)
    elif version:
        processorClass(workspace=None, show_version=True)
    elif mets is None:
        msg = 'Error: Missing option "-m" / "--mets".'
        LOG.error(msg)
        raise Exception(msg)
    else:
        # fail early on a local METS path that does not exist
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        # TODO once we implement 'overwrite' CLI option and mechanism, disable the
        # `output_file_grp_ check by setting to False-y value if 'overwrite' is set
        report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], kwargs['output_file_grp'])
        if not report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
        run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def validate_many(bagsdir, report_dir): """ Update many OCR-D bags at once BAGSDIR must contain only directories thaty contain unserialized OCRD-ZIP """ # yes, that is bagsdir, bagdirs and bagdir. Deal with it 😎 🆒 bagsdir = Path(bagsdir) bagdirs = [ x for x in bagsdir.iterdir() if x.is_dir() and not x.name.startswith('.') ] total = len(bagdirs) cur = 0 report_dir = Path(report_dir) report_dir.mkdir(parents=True, exist_ok=True) for bagdir in bagdirs: directory = Path(bagdir, 'data') cur += 1 LOG.info(">>>>> OCR-D-ZIP [%05d / %05d] %s", cur, total, bagdir.name) report = WorkspaceValidator.validate(resolver, str(Path(directory, 'mets.xml')), src_dir=directory, skip=[], download=False, page_strictness='lax') Path(report_dir, '%s.workspace.txt' % bagdir.name).write_text(report.to_xml()) try: report = OcrdZipValidator(resolver, str(bagdir)).validate(skip_unzip=True) Path(report_dir, '%s.ocrd-zip.txt' % bagdir.name).write_text(report.to_xml()) except Exception as e: Path(report_dir, '%s.ocrd-zip.txt' % bagdir.name).write_text(str(e))
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    """CLI entry-point wrapper for an OCR-D processor class.

    Handles the informational flags (dump_json/help/version), merges
    parameter overrides, resolves the workspace from ``mets``, validates the
    input/output file groups (skipping the output check when ``overwrite``
    is set), and finally runs the processor.

    Raises:
        Exception: if a local ``mets`` path does not exist or the
            input/output file group check fails.
    """
    # no CLI arguments at all: print help and exit with an error status
    if not sys.argv[1:]:
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    # informational modes need no workspace
    if dump_json or help or version:
        processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version)
        sys.exit()
    else:
        initLogging()
        LOG = getLogger('ocrd_cli_wrap_processor')
        # LOG.info('kwargs=%s' % kwargs)
        # Merge parameter overrides and parameters
        if 'parameter_override' in kwargs:
            set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override'])
        # TODO OCR-D/core#274
        # Assert -I / -O
        # if not kwargs['input_file_grp']:
        #     raise ValueError('-I/--input-file-grp is required')
        # if not kwargs['output_file_grp']:
        #     raise ValueError('-O/--output-file-grp is required')
        # fail early on a local METS path that does not exist
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        page_id = kwargs.get('page_id')
        # XXX not possible while processors do not adhere to
        # https://github.com/OCR-D/core/issues/505
        # if overwrite
        #     if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
        #         raise Exception("--overwrite requires --output-file-grp")
        #     LOG.info("Removing files because of --overwrite")
        #     for grp in kwargs['output_file_grp'].split(','):
        #         if page_id:
        #             for one_page_id in kwargs['page_id'].split(','):
        #                 LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
        #                 for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
        #                     workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
        #         else:
        #             LOG.debug("Removing all files in output file group %s ", grp)
        #             # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
        #             workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
        #     workspace.save_mets()
        # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
        if overwrite:
            workspace.overwrite_mode = True
        # skip the output-group existence check entirely when overwriting
        report = WorkspaceValidator.check_file_grp(
            workspace,
            kwargs['input_file_grp'],
            '' if overwrite else kwargs['output_file_grp'],
            page_id)
        if not report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
        run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def test_simple(self):
    """A known-good single-file workspace validates without complaint."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
    report = WorkspaceValidator.validate(self.resolver, mets_url, download=True)
    self.assertTrue(report.is_valid)
def test_bad_workspace(self):
    """An unresolvable METS URL fails with an instantiation error."""
    report = WorkspaceValidator.validate(self.resolver, 'non existe')
    self.assertFalse(report.is_valid)
    first_error = report.errors[0]
    self.assertIn('Failed to instantiate workspace:', first_error)