def validation_report(package_id=None):
    '''Compare the validation errors each current harvest object had on
    last import against the errors the currently configured validators
    would produce now. Useful when preparing to update the validators.

    :param package_id: restrict the report to a single package (optional)
    :returns: a ReportTable with one row per harvest object
    '''
    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = model.Session.query(HarvestObject) \
        .filter_by(current=True) \
        .order_by(HarvestObject.fetch_finished.desc())
    if package_id:
        query = query.filter(HarvestObject.package_id == package_id)

    columns = ['Harvest Object id', 'GEMINI2 id', 'Date fetched',
               'Dataset name', 'Publisher', 'Source URL',
               'Old validation errors', 'New validation errors']
    report = ReportTable(columns)

    for obj in query:
        # Validation-related errors recorded at import time.
        old_errors = [e.message for e in obj.errors
                      if 'not a valid Gemini' in e.message
                      or 'Validating against' in e.message]

        groups = obj.package.get_groups()
        publisher = groups[0].title if groups else '(none)'

        # Re-validate the stored XML with the current validators.
        doc = etree.fromstring(obj.content.encode("utf-8"))
        valid, new_errors = validators.is_valid(doc)

        report.add_row_dict({
            'Harvest Object id': obj.id,
            'GEMINI2 id': obj.guid,
            'Date fetched': obj.fetch_finished,
            'Dataset name': obj.package.name,
            'Publisher': publisher,
            'Source URL': obj.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        })

    log.debug('%i results', query.count())
    return report
def validate_file(metadata_filepath):
    '''Validate a single metadata XML file with the currently configured
    validators and print a summary to stdout.

    Runs the XML (schema/schematron) validators first; if they pass, also
    attempts a CKAN read of the ISO values, treating any exception there
    as a validation failure.

    :param metadata_filepath: path to the metadata XML file
    Exits the process with status 1 if the file does not exist.
    '''
    from ckanext.spatial.harvesters import SpatialHarvester
    from ckanext.spatial.model import ISODocument

    if not os.path.exists(metadata_filepath):
        print('Filepath %s not found' % metadata_filepath)
        sys.exit(1)

    # Read raw bytes. lxml determines the document encoding itself from
    # the XML declaration, so no decode/encode round-trip is needed.
    # (The previous code called .encode("utf-8") on the bytes object,
    # which raises AttributeError on Python 3 - the UnicodeDecodeError
    # handler could never fire.)
    with open(metadata_filepath, 'rb') as f:
        xml_string = f.read()

    validators = SpatialHarvester()._get_validator()
    print('Validators: %r' % validators.profiles)

    xml = etree.fromstring(xml_string)

    # XML validation
    valid, errors = validators.is_valid(xml)

    # CKAN read of values - only attempted if the XML itself validated.
    if valid:
        try:
            iso_document = ISODocument(xml_string)
            iso_document.read_values()
        except Exception as e:
            valid = False
            errors.append(
                'CKAN exception reading values from ISODocument: %s' % e)

    print('***************')
    print('Summary')
    print('***************')
    print('File: \'%s\'' % metadata_filepath)
    print('Valid: %s' % valid)
    if not valid:
        print('Errors:')
        # pprint writes to stdout itself; wrapping it in print() printed
        # an extra stray "None" line.
        pprint(errors)
    print('***************')
def validate_file(self): from ckanext.spatial.harvesters import SpatialHarvester if len(self.args) > 2: print 'Too many parameters %i' % len(self.args) sys.exit(1) if len(self.args) < 2: print 'Not enough parameters %i' % len(self.args) sys.exit(1) metadata_filepath = self.args[1] if not os.path.exists(metadata_filepath): print 'Filepath %s not found' % metadata_filepath sys.exit(1) with open(metadata_filepath, 'rb') as f: metadata_xml = f.read() validators = SpatialHarvester()._get_validator() print 'Validators: %r' % validators.profiles xml = etree.fromstring(metadata_xml.encode("utf-8")) valid, errors = validators.is_valid(xml) print 'Valid: %s' % valid if not valid: print 'Errors:' print pprint(errors)
def validation_report(package_id=None):
    '''Compare the validation errors each current harvest object had on
    last import against the errors the currently configured validators
    would produce now. Useful when preparing to update the validators.

    :param package_id: restrict the report to a single package (optional)
    :returns: a ReportTable with one row per harvest object
    '''
    log = logging.getLogger(__name__ + '.validation_report')

    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = model.Session.query(HarvestObject) \
        .filter_by(current=True) \
        .order_by(HarvestObject.fetch_finished.desc())
    if package_id:
        query = query.filter(HarvestObject.package_id == package_id)

    report = ReportTable([
        'Harvest Object id', 'GEMINI2 id', 'Date fetched', 'Dataset name',
        'Publisher', 'Source URL', 'Old validation errors',
        'New validation errors',
    ])

    old_failures = 0
    new_failures = 0
    for obj in query:
        # Validation-related errors recorded at import time.
        old_errors = [e.message for e in obj.errors
                      if 'not a valid Gemini' in e.message
                      or 'Validating against' in e.message]
        if old_errors:
            old_failures += 1

        groups = obj.package.get_groups()
        publisher = groups[0].title if groups else '(none)'

        # Re-validate the stored XML with the current validators.
        doc = etree.fromstring(obj.content.encode("utf-8"))
        valid, new_errors = validators.is_valid(doc)
        if not valid:
            new_failures += 1

        report.add_row_dict({
            'Harvest Object id': obj.id,
            'GEMINI2 id': obj.guid,
            'Date fetched': obj.fetch_finished,
            'Dataset name': obj.package.name,
            'Publisher': publisher,
            'Source URL': obj.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        })

    log.debug('%i results', query.count())
    log.debug('%i failed old validation', old_failures)
    log.debug('%i failed new validation', new_failures)
    return report