Ejemplo n.º 1
0
def validation_report(package_id=None):
    '''
    Build a table comparing, for each current harvested metadata record,
    the validation errors recorded against it with the errors produced by
    running the current validators over its stored content.

    :param package_id: when truthy, only harvest objects with this
        package id are included; otherwise all current ones are.
    :returns: a ReportTable with one row per harvest object, ordered by
        fetch time, newest first.
    '''
    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = model.Session.query(HarvestObject)
    query = query.filter_by(current=True)
    query = query.order_by(HarvestObject.fetch_finished.desc())
    if package_id:
        query = query.filter(HarvestObject.package_id == package_id)

    column_names = [
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors',
    ]
    report = ReportTable(column_names)

    # A stored error counts as a validation error only if its message
    # contains one of these phrases.
    validation_markers = ('not a valid Gemini', 'Validating against')

    for harvest_object in query:
        old_errors = [
            err.message for err in harvest_object.errors
            if any(marker in err.message for marker in validation_markers)
        ]

        groups = harvest_object.package.get_groups()
        if groups:
            publisher = groups[0].title
        else:
            publisher = '(none)'

        # Re-validate the stored document with the current validators.
        content_bytes = harvest_object.content.encode("utf-8")
        _valid, new_errors = validators.is_valid(
            etree.fromstring(content_bytes))

        row = {
            'Harvest Object id': harvest_object.id,
            'GEMINI2 id': harvest_object.guid,
            'Date fetched': harvest_object.fetch_finished,
            'Dataset name': harvest_object.package.name,
            'Publisher': publisher,
            'Source URL': harvest_object.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        }
        report.add_row_dict(row)

    log.debug('%i results', query.count())
    return report
    def validate_file(self):
        '''
        Read the metadata file named by the second command argument and
        check that its content is valid UTF-8.

        Exits with status 1 when the number of arguments is not exactly
        two, when the path does not exist, or when the file content cannot
        be decoded as UTF-8.
        '''
        from ckanext.spatial.harvesters import SpatialHarvester
        from ckanext.spatial.model import ISODocument

        if len(self.args) > 2:
            print(u'Too many parameters %i' % len(self.args))
            sys.exit(1)
        if len(self.args) < 2:
            print(u'Not enough parameters %i' % len(self.args))
            sys.exit(1)
        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print(u'Filepath %s not found' % metadata_filepath)
            sys.exit(1)
        with open(metadata_filepath, u'rb') as f:
            metadata_xml = f.read()

        validators = SpatialHarvester()._get_validator()
        print(u'Validators: %r' % validators.profiles)
        try:
            # The file was read as bytes, so it is checked by *decoding*.
            # Calling .encode() on a byte string makes Python 2 decode it
            # as ASCII first, which rejects valid non-ASCII UTF-8 files.
            metadata_xml.decode(u'utf-8')
        except UnicodeDecodeError as e:
            print(u'ERROR: Unicode Error reading file \'%s\': %s' %
                  (metadata_filepath, e))
            sys.exit(1)
        # The verified bytes are already UTF-8 encoded; keep them under the
        # name the encode step used to produce.
        xml_string = metadata_xml
Ejemplo n.º 3
0
def validate_file(metadata_filepath):
    '''
    Validate the XML metadata file at ``metadata_filepath`` and print a
    summary of the result.

    The document is checked with the SpatialHarvester validators and, if
    that passes, read with ISODocument; an exception while reading values
    marks the file invalid and is added to the error list.

    Exits with status 1 if the path does not exist or if the file content
    is not valid UTF-8.

    :param metadata_filepath: path of the XML metadata file to check.
    '''
    if not os.path.exists(metadata_filepath):
        print('Filepath %s not found' % metadata_filepath)
        sys.exit(1)

    # Imported after the path check so that a bad path is reported without
    # having to load the spatial plugin first.
    from ckanext.spatial.harvesters import SpatialHarvester
    from ckanext.spatial.model import ISODocument

    with open(metadata_filepath, 'rb') as f:
        metadata_xml = f.read()

    validators = SpatialHarvester()._get_validator()
    print('Validators: %r' % validators.profiles)
    try:
        # The file is read as bytes, which have no .encode() on Python 3;
        # decoding is what actually verifies the content is UTF-8.
        metadata_xml.decode('utf-8')
    except UnicodeDecodeError as e:
        print('ERROR: Unicode Error reading file \'%s\': %s' %
              (metadata_filepath, e))
        sys.exit(1)
    # The verified bytes are already UTF-8 encoded.
    xml_string = metadata_xml
    xml = etree.fromstring(xml_string)

    # XML validation
    valid, errors = validators.is_valid(xml)

    # CKAN read of values
    if valid:
        try:
            # Only called to surface exceptions; the values are not used.
            ISODocument(xml_string).read_values()
        except Exception as e:
            valid = False
            errors.append(
                'CKAN exception reading values from ISODocument: %s' % e)

    print('***************')
    print('Summary')
    print('***************')
    print('File: \'%s\'' % metadata_filepath)
    print('Valid: %s' % valid)
    if not valid:
        print('Errors:')
        # pprint() writes to stdout itself and returns None, so it must not
        # be wrapped in print().
        pprint(errors)
    print('***************')
Ejemplo n.º 4
0
    def validate_file(self):
        '''
        Validate the XML metadata file named by the second command
        argument with the SpatialHarvester validators and print whether it
        is valid, followed by the errors if it is not.

        Exits with status 1 when the number of arguments is not exactly
        two or when the path does not exist.
        '''
        from ckanext.spatial.harvesters import SpatialHarvester

        if len(self.args) > 2:
            print('Too many parameters %i' % len(self.args))
            sys.exit(1)
        if len(self.args) < 2:
            print('Not enough parameters %i' % len(self.args))
            sys.exit(1)
        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print('Filepath %s not found' % metadata_filepath)
            sys.exit(1)
        with open(metadata_filepath, 'rb') as f:
            metadata_xml = f.read()

        validators = SpatialHarvester()._get_validator()
        print('Validators: %r' % validators.profiles)
        # Parse the raw bytes as read from disk. Calling .encode() on a
        # byte string makes Python 2 decode it as ASCII first, which raised
        # UnicodeDecodeError for any file containing non-ASCII characters.
        xml = etree.fromstring(metadata_xml)
        valid, errors = validators.is_valid(xml)
        print('Valid: %s' % valid)
        if not valid:
            print('Errors:')
            # pprint() prints by itself and returns None; wrapping it in a
            # print would emit a stray "None" line.
            pprint(errors)
Ejemplo n.º 5
0
def validation_report(package_id=None):
    '''
    Compare, for every current harvested metadata record, the validation
    errors recorded against it with the errors the current validators
    report for its stored content. Handy before changing the validators.

    The number of rows and the number of records failing old and new
    validation are written to the debug log.

    :param package_id: when truthy, restrict the report to harvest objects
        with this package id.
    :returns: a ReportTable with one row per harvest object, ordered by
        fetch time, newest first.
    '''
    log = logging.getLogger(__name__ + '.validation_report')

    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = (model.Session.query(HarvestObject)
             .filter_by(current=True)
             .order_by(HarvestObject.fetch_finished.desc()))
    if package_id:
        query = query.filter(HarvestObject.package_id == package_id)

    report = ReportTable([
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors',
    ])

    def _is_validation_message(message):
        # Stored errors are recognised as validation errors by these
        # phrases in their message.
        return ('not a valid Gemini' in message or
                'Validating against' in message)

    failed_old = 0
    failed_new = 0

    for harvest_object in query:
        old_errors = []
        for err in harvest_object.errors:
            if _is_validation_message(err.message):
                old_errors.append(err.message)
        if old_errors:
            failed_old += 1

        groups = harvest_object.package.get_groups()
        if groups:
            publisher = groups[0].title
        else:
            publisher = '(none)'

        # Re-validate the stored document with the current validators.
        document = etree.fromstring(harvest_object.content.encode("utf-8"))
        valid, new_errors = validators.is_valid(document)
        if not valid:
            failed_new += 1

        row = {
            'Harvest Object id': harvest_object.id,
            'GEMINI2 id': harvest_object.guid,
            'Date fetched': harvest_object.fetch_finished,
            'Dataset name': harvest_object.package.name,
            'Publisher': publisher,
            'Source URL': harvest_object.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        }
        report.add_row_dict(row)

    log.debug('%i results', query.count())
    log.debug('%i failed old validation', failed_old)
    log.debug('%i failed new validation', failed_new)
    return report
Ejemplo n.º 6
0
 def wms_check(self):
     '''
     Print the result of SpatialHarvester._is_wms for the URL given as
     the second command argument.

     Asserts that exactly two arguments were supplied.
     '''
     assert len(self.args) == 2, (
         'Wrong number of args. Got %s rather than 2' % len(self.args))
     from ckanext.spatial.harvesters import SpatialHarvester
     is_wms = SpatialHarvester._is_wms(self.args[1])
     print(is_wms)
Ejemplo n.º 7
0
 def wms_check(self):
     '''
     Check the URL passed as the second command argument with
     SpatialHarvester._is_wms and print what it returns.

     Asserts that the command received exactly two arguments.
     '''
     assert len(self.args) == 2, (
         'Wrong number of args. Got %s rather than 2' % len(self.args))
     from ckanext.spatial.harvesters import SpatialHarvester
     url_to_check = self.args[1]
     result = SpatialHarvester._is_wms(url_to_check)
     print(result)
Ejemplo n.º 8
0
class Validation(CkanCommand):
    '''Validation commands

    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or the one specified.

        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata in the db and
            writes a report in CSV format to the given filepath.
      
        validation file <filename>.xml
            Performs validation on the given metadata file.
    '''
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 3
    min_args = 0

    def command(self):
        '''
        Run the sub-command named by the first argument.

        Prints the usage text and exits with status 1 when there are no
        arguments or the first one asks for help. Otherwise the config is
        loaded and ``report``, ``report-csv`` or ``file`` is dispatched to
        the matching method; any other name just prints a message.
        '''
        wants_help = (not self.args or
                      self.args[0] in ('--help', '-h', 'help'))
        if wants_help:
            print(self.usage)
            sys.exit(1)

        self._load_config()

        cmd = self.args[0]
        # Map sub-command names to method names; the method is looked up
        # only for the command actually requested.
        handler_names = {
            'report': 'report',
            'report-csv': 'report_csv',
            'file': 'validate_file',
        }
        handler_name = handler_names.get(cmd)
        if handler_name is None:
            print('Command %s not recognized' % cmd)
            return
        getattr(self, handler_name)()

    def report(self):
        '''
        Print the validation report, one block of "column: value" lines
        per row.

        If a second command argument is given it is looked up as a package
        reference and the report is limited to that package; the command
        exits with status 1 if the package is not found. Without it the
        report covers all packages.
        '''
        from ckan import model
        # NOTE(review): HarvestObject is not used in this method; the import
        # is kept in case it is needed for its side effects - confirm.
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.lib.reports import validation_report

        # None means "no package filter". Previously pkg was set to None in
        # this case and pkg.id was then read, raising AttributeError.
        package_id = None
        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print('Package ref "%s" not recognised' % package_ref)
                sys.exit(1)
            package_id = pkg.id

        report = validation_report(package_id=package_id)
        for row in report.get_rows_html_formatted():
            # Blank line between rows.
            print('')
            for i, col_name in enumerate(report.column_names):
                print('  %s: %s' % (col_name, row[i]))

    def validate_file(self):
        from ckanext.spatial.harvesters import SpatialHarvester
        from ckanext.spatial.model import GeminiDocument

        if len(self.args) > 2:
            print 'Too many parameters %i' % len(self.args)
            sys.exit(1)
        if len(self.args) < 2:
            print 'Not enough parameters %i' % len(self.args)
            sys.exit(1)
        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print 'Filepath %s not found' % metadata_filepath
            sys.exit(1)

        with open(metadata_filepath, 'rb') as f:
            metadata_xml = f.read()

        # this is still encoded - hopefully as UTF8. If not, then it needs
        # decoding and recoding as UTF8.

        # Check it is UTF8, as that's what etree expects.
        try:
            decoded = metadata_xml.decode("utf-8")
            reencoded = decoded.encode("utf-8")
        except UnicodeDecodeError, e:
            print 'ERROR: File was not UTF8 \'%s\': %s' % \
                  (metadata_filepath, e)
            sys.exit(1)

        # etree.fromstring accepts either a unicode string or the encoding is
        # expressed in the <xml> tag. NB 'UTF-8' is correct, 'UTF8' is wrong.
        xml = etree.fromstring(metadata_xml)

        # XML validation
        validators = SpatialHarvester()._get_validator()
        print 'Validators: %r' % validators.profiles
        valid, errors = validators.is_valid(xml)

        # CKAN read of values
        if valid:
            try:
                gemini_document = GeminiDocument(metadata_xml)
                gemini_values = gemini_document.read_values()
            except Exception, e:
                valid = False
                errors.append(
                    'CKAN exception reading values from GeminiDocument: %s' %
                    e)