def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
    report = ValidationReport()
    prev_output_file_grps = workspace.mets.file_groups

    first_task = tasks[0]
    first_task.validate()

    # first task: check input/output file groups from METS
    WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps,
                                      '' if overwrite else first_task.output_file_grps,
                                      page_id, report)

    prev_output_file_grps += first_task.output_file_grps
    for task in tasks[1:]:
        task.validate()
        # check that each current input file group either already exists in METS or is produced by a previous task
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in prev_output_file_grps:
                report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
        if not overwrite:
            WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
        # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented
        # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever.
        # if len(prev_output_file_grps) != len(set(prev_output_file_grps)):
        #     report.add_error("Output file group specified multiple times: %s" %
        #                      [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2])
        prev_output_file_grps += task.output_file_grps

    if not report.is_valid:
        raise Exception("Invalid task sequence input/output file groups: %s" % report.errors)
    return report
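# Illustrative usage sketch (not part of the source): how validate_tasks might be
# driven before running an `ocrd process` chain. The import paths, the METS location,
# the processor names and the use of ProcessorTask.parse to build `tasks` are all
# assumptions here, not taken from the code above.
from ocrd import Resolver
from ocrd.task_sequence import ProcessorTask  # assumed to provide parse()

def example_validate_tasks():
    # assumed: a workspace with data/mets.xml and an OCR-D-IMG fileGrp already exists
    workspace = Resolver().workspace_from_url('data/mets.xml')
    tasks = [ProcessorTask.parse(t) for t in (
        'sample-binarize -I OCR-D-IMG -O OCR-D-BIN',   # hypothetical processor names
        'sample-segment -I OCR-D-BIN -O OCR-D-SEG',
    )]
    # raises if an input fileGrp is neither in METS nor produced by a previous step,
    # or if an output fileGrp already exists (unless overwrite=True)
    return validate_tasks(tasks, workspace, page_id=None, overwrite=False)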
def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False,
             page_strictness='strict', page_coordinate_consistency='poly'):
    """
    Construct a new WorkspaceValidator.

    Args:
        resolver (Resolver):
        mets_url (string):
        src_dir (string):
        skip (list):
        download (boolean):
        page_strictness ("strict"|"lax"|"fix"|"off"):
        page_coordinate_consistency ("poly"|"baseline"|"both"|"off"):
    """
    self.report = ValidationReport()
    self.skip = skip if skip else []
    self.log = getLogger('ocrd.workspace_validator')
    self.log.debug('resolver=%s mets_url=%s src_dir=%s', resolver, mets_url, src_dir)
    self.resolver = resolver
    if mets_url is None and src_dir is not None:
        mets_url = '%s/mets.xml' % src_dir
    self.mets_url = mets_url
    self.download = download
    self.page_strictness = page_strictness
    self.page_coordinate_consistency = page_coordinate_consistency
    self.src_dir = src_dir
    self.workspace = None
    self.mets = None
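# Illustrative usage sketch (not from the source): running a validator constructed
# with these arguments end to end via the validate() entry point shown in the class
# further down. The import path, the METS location and the chosen skip values are
# assumptions; the skip names follow the validate() docstring.
from ocrd import Resolver

def example_workspace_validation():
    report = WorkspaceValidator.validate(Resolver(),
                                         'data/mets.xml',  # assumed METS location
                                         skip=['pixel_density', 'dimension'],
                                         download=False)
    if not report.is_valid:
        print(report.to_xml())
    return report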
def _validate(self, obj):
    """
    Do the actual validation

    Arguments:
        obj (dict): object to validate

    Returns: ValidationReport
    """
    report = ValidationReport()
    if not self.validator.is_valid(obj):
        for v in self.validator.iter_errors(obj):
            # print(">>>>>>>>> v='%s', obj='%s'" % (v, obj))
            report.add_error("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message))
    return report
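# Illustrative sketch (not from the source) of the jsonschema pattern _validate wraps:
# self.validator is assumed to be a jsonschema validator instance (e.g. Draft4Validator),
# whose iter_errors() yields errors carrying a .path into the object and a .message.
from jsonschema import Draft4Validator

def example_json_validation():
    schema = {'type': 'object',
              'required': ['dpi'],
              'properties': {'dpi': {'type': 'number'}}}
    validator = Draft4Validator(schema)
    report = ValidationReport()
    obj = {'dpi': 'not-a-number'}  # fails the schema on purpose
    if not validator.is_valid(obj):
        for v in validator.iter_errors(obj):
            report.add_error("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message))
    return report  # str(report) -> 'INVALID[ 1 errors ]'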
def __init__(self, resolver, path_to_zip):
    """
    Arguments:
        resolver (Resolver): resolver
        path_to_zip (string): Path to the OCRD-ZIP file
    """
    self.resolver = resolver
    self.path_to_zip = path_to_zip
    self.report = ValidationReport()
    self.profile_validator = Profile(OCRD_BAGIT_PROFILE_URL, profile=OCRD_BAGIT_PROFILE)
def _validate(self, doc):
    """
    Do the actual validation.

    Arguments:
        doc (etree.ElementTree|str|bytes|pathlib.Path): the document.
            If etree: use as-is. If str/bytes: parse as XML string.
            If Path: parse the file it points to.

    Returns: ValidationReport
    """
    report = ValidationReport()
    if isinstance(doc, Path):
        doc = ET.parse(str(doc))
    if isinstance(doc, (bytes, str)):
        doc = ET.fromstring(doc)
    try:
        self._xmlschema.assertValid(doc)
    except ET.DocumentInvalid as fail:
        for err in fail.error_log:  # pylint: disable=no-member
            report.add_error("Line %s: %s" % (err.line, err.message))
    return report
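# Illustrative usage sketch (not from the source): the public classmethod that wraps
# _validate accepts an lxml tree, an XML string/bytes or a pathlib.Path. The class
# name XsdPageValidator is taken from its use in WorkspaceValidator further down;
# the file path is an assumption.
from pathlib import Path

def example_xsd_validation():
    report = XsdPageValidator.validate(Path('OCR-D-SEG/PAGE_0001.xml'))  # assumed path
    for error in report.errors:
        print(error)  # e.g. "Line 42: Element '...': This element is not expected."
    return report.is_valid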
def test_toxml(self):
    report = ValidationReport()
    self.assertEqual(str(report), 'OK')
    report.add_warning('This is not good')
    self.assertEqual(str(report), 'INVALID[ 1 warnings ]')
    report.add_error('This is bad')
    self.assertEqual(str(report), 'INVALID[ 1 warnings 1 errors ]')
    report.add_notice('This is noticeable')
    self.assertEqual(str(report), 'INVALID[ 1 warnings 1 errors 1 notices ]')
    self.assertEqual(report.to_xml(), '''\
<report valid="false">
  <warning>This is not good</warning>
  <error>This is bad</error>
  <notice>This is noticeable</notice>
</report>''')
def check_file_grp(workspace, input_file_grp=None, output_file_grp=None, page_id=None, report=None):
    """
    Return a report on whether input_file_grp is/are in workspace.mets and output_file_grp is/are not.
    To be run before processing.

    Arguments:
        workspace (Workspace): the workspace to validate
        input_file_grp (list|string): list or comma-separated list of input file groups
        output_file_grp (list|string): list or comma-separated list of output file groups
        page_id (list|string): list or comma-separated list of page_ids to write to
        report (ValidationReport): report to add errors to, or None to create a new one
    """
    if not report:
        report = ValidationReport()
    if isinstance(input_file_grp, str):
        input_file_grp = input_file_grp.split(',') if input_file_grp else []
    if isinstance(output_file_grp, str):
        output_file_grp = output_file_grp.split(',') if output_file_grp else []
    if page_id and isinstance(page_id, str):
        page_id = page_id.split(',')
    log = getLogger('ocrd.workspace_validator')
    log.debug("input_file_grp=%s output_file_grp=%s", input_file_grp, output_file_grp)
    if input_file_grp:
        for grp in input_file_grp:
            if grp not in workspace.mets.file_groups:
                report.add_error("Input fileGrp[@USE='%s'] not in METS!" % grp)
    if output_file_grp:
        for grp in output_file_grp:
            if grp in workspace.mets.file_groups:
                if page_id:
                    for one_page_id in page_id:
                        if next(workspace.mets.find_files(fileGrp=grp, pageId=one_page_id), None):
                            report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (grp, one_page_id))
                else:
                    report.add_error("Output fileGrp[@USE='%s'] already in METS!" % grp)
    return report
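# Illustrative usage sketch (not from the source): pre-flight check before a processor
# reads OCR-D-SEG and writes OCR-D-OCR for two pages. The fileGrp and page names are
# assumptions; both lists and comma-separated strings are accepted.
def example_check_file_grp(workspace):
    report = check_file_grp(workspace,
                            input_file_grp='OCR-D-SEG',
                            output_file_grp='OCR-D-OCR',
                            page_id='PHYS_0001,PHYS_0002')
    if not report.is_valid:
        raise Exception("Cannot run processor: %s" % report.errors)
    return report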
def validate(filename=None, ocrd_page=None, ocrd_file=None,
             page_textequiv_consistency='strict', page_textequiv_strategy='first',
             check_baseline=True, check_coords=True):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
        page_textequiv_strategy (string): Currently only 'first'
        check_baseline (bool): whether Baseline must be fully within TextLine/Coords
        check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
            contained within Border/*Region/TextLine/Word, resp.

    Returns:
        report (:class:`ValidationReport`) Report on the validity
    """
    log = getLogger('ocrd.page_validator.validate')
    if ocrd_page:
        page = ocrd_page
        file_id = ocrd_page.get_pcGtsId()
    elif ocrd_file:
        page = page_from_file(ocrd_file)
        file_id = ocrd_file.ID
    elif filename:
        page = parse(filename, silence=True)
        file_id = filename
    else:
        raise Exception("At least one of ocrd_page, ocrd_file or filename must be set")
    # note the trailing comma: a bare ('first') is a string, so `not in` would do substring matching
    if page_textequiv_strategy not in ('first',):
        raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
    if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
        raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
    report = ValidationReport()
    log.info("Validating input file '%s'", file_id)
    validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy,
                         check_baseline, check_coords, report, file_id)
    return report
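# Illustrative usage sketch (not from the source): validating a single PAGE-XML file
# by path with lax text consistency and without baseline checks. The file path is an
# assumption; the keyword arguments are those documented above.
def example_page_validation():
    report = validate(filename='OCR-D-GT-SEG/PAGE_0001.xml',  # assumed path
                      page_textequiv_consistency='lax',
                      page_textequiv_strategy='first',
                      check_baseline=False,
                      check_coords=True)
    print(report)  # e.g. 'OK' or 'INVALID[ 2 errors ]'
    return report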
class WorkspaceValidator():
    """
    Validates an OCR-D/METS workspace against the specs.
    """

    @staticmethod
    def check_file_grp(workspace, input_file_grp=None, output_file_grp=None, page_id=None, report=None):
        """
        Return a report on whether input_file_grp is/are in workspace.mets and output_file_grp is/are not.
        To be run before processing.

        Arguments:
            workspace (Workspace): the workspace to validate
            input_file_grp (list|string): list or comma-separated list of input file groups
            output_file_grp (list|string): list or comma-separated list of output file groups
            page_id (list|string): list or comma-separated list of page_ids to write to
            report (ValidationReport): report to add errors to, or None to create a new one
        """
        if not report:
            report = ValidationReport()
        if isinstance(input_file_grp, str):
            input_file_grp = input_file_grp.split(',') if input_file_grp else []
        if isinstance(output_file_grp, str):
            output_file_grp = output_file_grp.split(',') if output_file_grp else []
        if page_id and isinstance(page_id, str):
            page_id = page_id.split(',')
        log = getLogger('ocrd.workspace_validator')
        log.debug("input_file_grp=%s output_file_grp=%s", input_file_grp, output_file_grp)
        if input_file_grp:
            for grp in input_file_grp:
                if grp not in workspace.mets.file_groups:
                    report.add_error("Input fileGrp[@USE='%s'] not in METS!" % grp)
        if output_file_grp:
            for grp in output_file_grp:
                if grp in workspace.mets.file_groups:
                    if page_id:
                        for one_page_id in page_id:
                            if next(workspace.mets.find_files(fileGrp=grp, pageId=one_page_id), None):
                                report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (grp, one_page_id))
                    else:
                        report.add_error("Output fileGrp[@USE='%s'] already in METS!" % grp)
        return report

    def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False,
                 page_strictness='strict', page_coordinate_consistency='poly'):
        """
        Construct a new WorkspaceValidator.

        Args:
            resolver (Resolver):
            mets_url (string):
            src_dir (string):
            skip (list):
            download (boolean):
            page_strictness ("strict"|"lax"|"fix"|"off"):
            page_coordinate_consistency ("poly"|"baseline"|"both"|"off"):
        """
        self.report = ValidationReport()
        self.skip = skip if skip else []
        log = getLogger('ocrd.workspace_validator')
        log.debug('resolver=%s mets_url=%s src_dir=%s', resolver, mets_url, src_dir)
        self.resolver = resolver
        if mets_url is None and src_dir is not None:
            mets_url = '%s/mets.xml' % src_dir
        self.mets_url = mets_url
        self.download = download
        self.page_strictness = page_strictness
        self.page_coordinate_consistency = page_coordinate_consistency
        self.src_dir = src_dir
        self.workspace = None
        self.mets = None

    @staticmethod
    def validate(*args, **kwargs):
        """
        Validates the workspace of a METS URL against the specs

        Arguments:
            resolver (:class:`ocrd.Resolver`): Resolver
            mets_url (string): URL of the METS file
            src_dir (string, None): Directory containing mets file
            skip (list): Tests to skip. One or more of 'mets_unique_identifier',
                'mets_file_group_names', 'mets_files', 'pixel_density', 'dimension', 'url'
            download (boolean): Whether to download files

        Returns:
            report (:class:`ValidationReport`) Report on the validity
        """
        validator = WorkspaceValidator(*args, **kwargs)
        return validator._validate()  # pylint: disable=protected-access

    def _validate(self):
        """
        Actual validation.
        """
        log = getLogger('ocrd.workspace_validator')
        try:
            self._resolve_workspace()
        except Exception as e:  # pylint: disable=broad-except
            log.warning("Failed to instantiate workspace: %s", e)
            self.report.add_error("Failed to instantiate workspace: %s" % e)
            return self.report
        with pushd_popd(self.workspace.directory):
            try:
                if 'mets_unique_identifier' not in self.skip:
                    self._validate_mets_unique_identifier()
                if 'mets_file_group_names' not in self.skip:
                    self._validate_mets_file_group_names()
                if 'mets_files' not in self.skip:
                    self._validate_mets_files()
                if 'pixel_density' not in self.skip:
                    self._validate_pixel_density()
                if 'multipage' not in self.skip:
                    self._validate_multipage()
                if 'dimension' not in self.skip:
                    self._validate_dimension()
                if 'imagefilename' not in self.skip:
                    self._validate_imagefilename()
                if 'page' not in self.skip:
                    self._validate_page()
                if 'page_xsd' not in self.skip:
                    self._validate_page_xsd()
                if 'mets_xsd' not in self.skip:
                    self._validate_mets_xsd()
            except Exception:  # pylint: disable=broad-except
                self.report.add_error("Validation aborted with exception: %s" % format_exc())
        return self.report

    def _resolve_workspace(self):
        """
        Clone workspace from mets_url unless workspace was provided.
        """
        if self.workspace is None:
            self.workspace = self.resolver.workspace_from_url(
                self.mets_url, src_baseurl=self.src_dir, download=self.download)
            self.mets = self.workspace.mets

    def _validate_mets_unique_identifier(self):
        """
        Validate METS unique identifier exists.

        See `spec <https://ocr-d.github.io/mets#unique-id-for-the-document-processed>`_.
        """
        if self.mets.unique_identifier is None:
            self.report.add_error("METS has no unique identifier")

    def _validate_imagefilename(self):
        """
        Validate that the imageFilename is correctly set to a filename relative to the workspace
        """
        for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
            if not is_local_filename(f.url) and not self.download:
                self.report.add_notice("Won't download remote PAGE XML <%s>" % f.url)
                continue
            self.workspace.download_file(f)
            page = page_from_file(f).get_Page()
            imageFilename = page.imageFilename
            if not next(self.mets.find_files(url=imageFilename), None):
                self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename))
            if is_local_filename(imageFilename) and not Path(imageFilename).exists():
                self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename))

    def _validate_dimension(self):
        """
        Validate image height and PAGE imageHeight match
        """
        for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
            if not is_local_filename(f.url) and not self.download:
                self.report.add_notice("_validate_dimension: Not executed because --download wasn't set and PAGE might reference remote (Alternative)Images <%s>" % f.url)
                continue
            page = page_from_file(f).get_Page()
            _, _, exif = self.workspace.image_from_page(page, f.pageId)
            if page.imageHeight != exif.height:
                self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height))
            if page.imageWidth != exif.width:
                self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width))

    def _validate_multipage(self):
        """
        Validate the number of images per file is 1 (TIFF allows multi-page images)

        See `spec <https://ocr-d.github.io/mets#no-multi-page-images>`_.
        """
        for f in [f for f in self.mets.find_files() if f.mimetype.startswith('image/')]:
            if not is_local_filename(f.url) and not self.download:
                self.report.add_notice("Won't download remote image <%s>" % f.url)
                continue
            exif = self.workspace.resolve_image_exif(f.url)
            if exif.n_frames > 1:
                self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames))

    def _validate_pixel_density(self):
        """
        Validate image pixel density

        See `spec <https://ocr-d.github.io/mets#pixel-density-of-images-must-be-explicit-and-high-enough>`_.
        """
        for f in [f for f in self.mets.find_files() if f.mimetype.startswith('image/')]:
            if not is_local_filename(f.url) and not self.download:
                self.report.add_notice("Won't download remote image <%s>" % f.url)
                continue
            exif = self.workspace.resolve_image_exif(f.url)
            for k in ['xResolution', 'yResolution']:
                v = exif.__dict__.get(k)
                if v is None or v <= 72:
                    self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (f.ID, k, v, exif.resolutionUnit))

    def _validate_mets_file_group_names(self):
        """
        Ensure ``USE`` attributes of ``mets:fileGrp`` conform to the OCR-D naming schema.

        See `spec <https://ocr-d.github.io/mets#file-group-use-syntax>`_.
        """
        for fileGrp in self.mets.file_groups:
            if not fileGrp.startswith(FILE_GROUP_PREFIX):
                self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp))
            else:
                # OCR-D-FOO-BAR -> ('FOO', 'BAR')
                # \____/\_/ \_/
                #    |    |   |
                # Prefix  |  Name
                #      Category
                category = fileGrp[len(FILE_GROUP_PREFIX):]
                name = None
                if '-' in category:
                    category, name = category.split('-', 1)
                if category not in FILE_GROUP_CATEGORIES:
                    self.report.add_notice("Unspecified USE category '%s' in fileGrp '%s'" % (category, fileGrp))
                if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name):
                    self.report.add_notice("Invalid USE name '%s' in fileGrp '%s'" % (name, fileGrp))

    def _validate_mets_files(self):
        """
        Validate ``mets:file`` URLs are sane.
        """
        try:
            next(self.mets.find_files())
        except StopIteration:
            self.report.add_error("No files")
        for f in self.mets.find_files():
            if f._el.get('GROUPID'):  # pylint: disable=protected-access
                self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID)
            if not f.pageId:
                self.report.add_error("File '%s' does not manifest any physical page." % f.ID)
            if not f.url:
                self.report.add_error("File '%s' has no mets:Flocat/@xlink:href" % f.ID)
                continue
            if 'url' not in self.skip and ':/' in f.url:
                if re.match(r'^file:/[^/]', f.url):
                    self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url))
                scheme = f.url[0:f.url.index(':')]
                if scheme not in ('http', 'https', 'file'):
                    self.report.add_warning("File '%s' has non-HTTP, non-file URL '%s'" % (f.ID, f.url))

    def _validate_page(self):
        """
        Run PageValidator on the PAGE-XML documents referenced in the METS.
        """
        for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
            self.workspace.download_file(ocrd_file)
            page_report = PageValidator.validate(
                ocrd_file=ocrd_file,
                page_textequiv_consistency=self.page_strictness,
                check_coords=self.page_coordinate_consistency in ['poly', 'both'],
                check_baseline=self.page_coordinate_consistency in ['baseline', 'both'])
            pg = page_from_file(ocrd_file)
            if pg.pcGtsId != ocrd_file.ID:
                page_report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pg.pcGtsId or '', ocrd_file.ID or ''))
            self.report.merge_report(page_report)

    def _validate_page_xsd(self):
        """
        Validate all PAGE-XML files against the PAGE XSD schema
        """
        log = getLogger('ocrd.workspace_validator')
        log.debug("Validating all PAGE-XML files against XSD")
        for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
            self.workspace.download_file(ocrd_file)
            for err in XsdPageValidator.validate(Path(ocrd_file.local_filename)).errors:
                self.report.add_error("%s: %s" % (ocrd_file.ID, err))
        log.debug("Finished validating all PAGE-XML files against XSD")

    def _validate_mets_xsd(self):
        """
        Validate METS against the METS XSD schema
        """
        log = getLogger('ocrd.workspace_validator')
        log.debug("Validating METS %s against XSD", self.workspace.mets_target)
        for err in XsdMetsValidator.validate(Path(self.workspace.mets_target)).errors:
            self.report.add_error("%s: %s" % (self.workspace.mets_target, err))
        log.debug("Finished validating METS against XSD")
def test_str(self):
    report = ValidationReport()
    report.add_error('This is bad')
    self.assertEqual(str(report), 'INVALID[ 1 errors ]')
def test_merge(self):
    report = ValidationReport()
    other_report = ValidationReport()
    report.add_error("foo")
    other_report.add_error("bar")
    other_report.add_warning("foo")
    report.merge_report(other_report)
    self.assertEqual(report.errors, ['foo', 'bar'])
    self.assertEqual(report.warnings, ['foo'])