Example #1
0
def test_multiplicity_warning():
    # The fixture below has no Metadata Date value; parsing it should
    # only emit a log.warning rather than raising an exception.
    document = ISODocument(open_xml_fixture('FCSConservancyPolygons.xml'))
    values = document.read_values()
    assert_equal(values['guid'], 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
Example #2
0
def test_simple():
    '''Smoke test: parse the gemini fixture and spot-check two values.'''
    document = ISODocument(open_xml_fixture(u'gemini_dataset.xml'))
    values = document.read_values()
    assert_equal(values[u'guid'], u'test-dataset-1')
    assert_equal(values[u'metadata-date'], u'2011-09-23T10:06:08')
Example #3
0
    def get_record_type(cls, xml):
        '''
        For a given ISO19139 record, returns the "type"
        e.g. "dataset", "series", "service"

        Falls back to "dataset" when the record declares no
        resource type, rather than raising IndexError.

        xml - etree of the ISO19139 XML record
        '''
        iso_parser = ISODocument(xml_tree=xml)
        record_types = iso_parser.read_value('resource-type')
        # read_value can return an empty list when the hierarchyLevel
        # element is missing; indexing [0] unconditionally would crash.
        if record_types:
            return record_types[0]
        return 'dataset'
Example #4
0
    def get_record_type(cls, xml):
        '''
        For a given ISO19139 record, returns the "type"
        e.g. "dataset", "series", "service"

        Falls back to "dataset" when the record declares no
        resource type, rather than raising IndexError.

        xml - etree of the ISO19139 XML record
        '''
        iso_parser = ISODocument(xml_tree=xml)
        record_types = iso_parser.read_value('resource-type')
        # read_value can return an empty list when the hierarchyLevel
        # element is missing; indexing [0] unconditionally would crash.
        if record_types:
            return record_types[0]
        return 'dataset'
Example #5
0
    def get_record_type(cls, xml):
        '''Return the "type" of a given ISO19139 record,
        e.g. "dataset", "series", "service".

        Defaults to "dataset" when the record carries no
        resource-type value.

        xml - etree of the ISO19139 XML record
        '''
        parser = ISODocument(xml_tree=xml)
        types_found = parser.read_value(u'resource-type')
        return types_found[0] if types_found else u'dataset'
Example #6
0
def validate_file(metadata_filepath):
    '''Validate an ISO metadata XML file and print a summary report.

    Runs the harvester's configured XML validators against the file
    and, when it passes, additionally checks that CKAN can read values
    out of it via ISODocument.  Exits the process with status 1 when
    the file does not exist.

    metadata_filepath - path to the XML file to validate
    '''
    from ckanext.spatial.harvesters import SpatialHarvester
    from ckanext.spatial.model import ISODocument

    if not os.path.exists(metadata_filepath):
        print('Filepath %s not found' % metadata_filepath)
        sys.exit(1)
    with open(metadata_filepath, 'rb') as f:
        # Keep the raw bytes: etree.fromstring() honours any encoding
        # declared in the XML prolog.  (The previous code called
        # .encode('utf-8') on these bytes, which is an AttributeError
        # on Python 3 and made the UnicodeDecodeError handler dead.)
        xml_string = f.read()

    validators = SpatialHarvester()._get_validator()
    print('Validators: %r' % validators.profiles)
    xml = etree.fromstring(xml_string)

    # XML validation
    valid, errors = validators.is_valid(xml)

    # CKAN read of values
    if valid:
        try:
            iso_document = ISODocument(xml_string)
            iso_values = iso_document.read_values()
        except Exception as e:
            valid = False
            errors.append(
                'CKAN exception reading values from ISODocument: %s' % e)

    print('***************')
    print('Summary')
    print('***************')
    print('File: \'%s\'' % metadata_filepath)
    print('Valid: %s' % valid)
    if not valid:
        print('Errors:')
        # pprint() writes to stdout itself and returns None; wrapping
        # it in print() used to emit a spurious trailing "None".
        pprint(errors)
    print('***************')
    def _csw_resource_data_dict(self, dataset_name):
        '''Return an example open data dataset as expected as input
           to get_package_dict().'''

        fixture_xml = self._open_xml_fixture(dataset_name)
        iso_values = ISODocument(fixture_xml).read_values()

        base_harvester = SpatialHarvester()
        harvest_obj = HarvestObject(source=self._create_source())
        harvest_obj.save()

        return {
            'package_dict': base_harvester.get_package_dict(iso_values,
                                                            harvest_obj),
            'iso_values': iso_values,
        }
def test_simple():
    # End-to-end parse of the gemini fixture, checking two known values.
    values = ISODocument(open_xml_fixture('gemini_dataset.xml')).read_values()
    assert_equal(values['guid'], 'test-dataset-1')
    assert_equal(values['metadata-date'], '2011-09-23T10:06:08')
Example #9
0
    def import_stage(self, harvest_object):
        '''Import a harvested ISO document as a CKAN package.

        Transforms non-ISO originals if needed, validates, parses the
        ISO values, then creates/updates/deletes the corresponding
        package depending on the object's status.  Returns True on
        success, False on any error (which is also recorded against
        the harvest object).

        harvest_object - the HarvestObject to import
        '''
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
        }

        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for harvest object: %s', harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        self._set_source_config(harvest_object.source.config)

        if self.force_import:
            status = 'change'
        else:
            status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = model.Session.query(HarvestObject) \
                                       .filter(HarvestObject.guid == harvest_object.guid) \
                                       .filter(HarvestObject.current == True).first() # noqa

        if status == 'delete':
            # Delete package
            context.update({
                'ignore_auth': True,
            })
            if harvest_object.package_id:
                p.toolkit.get_action('package_delete')(
                    context, {
                        'id': harvest_object.package_id
                    })
                log.info('Deleted package {0} with guid {1}'.format(
                    harvest_object.package_id, harvest_object.guid))

            return True

        # Check if it is a non ISO document
        original_document = self._get_object_extra(harvest_object,
                                                   'original_document')
        original_format = self._get_object_extra(harvest_object,
                                                 'original_format')
        if original_document and original_format:
            # DEPRECATED use the ISpatialHarvester interface method
            self.__base_transform_to_iso_called = False
            content = self.transform_to_iso(original_document, original_format,
                                            harvest_object)
            if not self.__base_transform_to_iso_called:
                log.warn(
                    'Deprecation warning: calling transform_to_iso directly is deprecated. '
                    +
                    'Please use the ISpatialHarvester interface method instead.'
                )

            for harvester in p.PluginImplementations(ISpatialHarvester):
                content = harvester.transform_to_iso(original_document,
                                                     original_format,
                                                     harvest_object)

            if content:
                harvest_object.content = content
            else:
                self._save_object_error('Transformation to ISO failed',
                                        harvest_object, 'Import')
                return False
        else:
            if harvest_object.content is None:
                self._save_object_error(
                    'Empty content for object {0}'.format(harvest_object.id),
                    harvest_object, 'Import')
                return False

            # Validate ISO document
            is_valid, profile, errors = self._validate_document(
                harvest_object.content, harvest_object)
            if not is_valid:
                # If validation errors were found, import will stop unless
                # configuration per source or per instance says otherwise
                continue_import = p.toolkit.asbool(config.get('ckanext.spatial.harvest.continue_on_validation_errors',
                                                              False)) or \
                    self.source_config.get('continue_on_validation_errors')
                if not continue_import:
                    return False

        # Parse ISO document
        try:

            iso_parser = ISODocument(harvest_object.content)
            iso_values = iso_parser.read_values()
        except Exception as e:
            self._save_object_error(
                'Error parsing ISO document for object {0}: {1}'.format(
                    harvest_object.id, six.text_type(e)), harvest_object,
                'Import')
            return False

        # Flag previous object as not current anymore
        if previous_object and not self.force_import:
            previous_object.current = False
            previous_object.add()

        # Update GUID with the one on the document
        iso_guid = iso_values['guid']
        if iso_guid and harvest_object.guid != iso_guid:
            # First make sure there already aren't current objects
            # with the same guid
            existing_object = model.Session.query(HarvestObject.id) \
                            .filter(HarvestObject.guid == iso_guid) \
                            .filter(HarvestObject.current == True).first() # noqa
            if existing_object:
                self._save_object_error(
                    'Object {0} already has this guid {1}'.format(
                        existing_object.id, iso_guid), harvest_object,
                    'Import')
                return False

            harvest_object.guid = iso_guid
            harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
            m = hashlib.md5()
            m.update(harvest_object.content.encode('utf8', 'ignore'))
            harvest_object.guid = m.hexdigest()
            harvest_object.add()

        # Get document modified date
        try:
            metadata_modified_date = dateutil.parser.parse(
                iso_values['metadata-date'], ignoretz=True)
        except ValueError:
            self._save_object_error(
                'Could not extract reference date for object {0} ({1})'.format(
                    harvest_object.id, iso_values['metadata-date']),
                harvest_object, 'Import')
            return False

        harvest_object.metadata_modified_date = metadata_modified_date
        harvest_object.add()

        # Build the package dict
        package_dict = self.get_package_dict(iso_values, harvest_object)
        for harvester in p.PluginImplementations(ISpatialHarvester):
            package_dict = harvester.get_package_dict(
                context, {
                    'package_dict': package_dict,
                    'iso_values': iso_values,
                    'xml_tree': iso_parser.xml_tree,
                    'harvest_object': harvest_object,
                })
        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        # Create / update the package
        context.update({
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        })

        if self._site_user and context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty, six.text_type]

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be be able to link the extent to the package.
            package_dict['id'] = six.text_type(uuid.uuid4())
            package_schema['id'] = [six.text_type]

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('Created new package %s with guid %s', package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % six.text_type(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':

            # Check if the modified date is more recent
            if not self.force_import and previous_object \
                    and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:

                # Assign the previous job id to the new object to
                # avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the
                # harvest object
                if ((config.get('ckanext.spatial.harvest.reindex_unchanged',
                                True) != 'False'
                     or self.source_config.get('reindex_unchanged') != 'False')
                        and harvest_object.package_id):
                    context.update({'validate': False, 'ignore_auth': True})
                    try:
                        package_dict = logic.get_action('package_show')(
                            context, {
                                'id': harvest_object.package_id
                            })
                    except p.toolkit.ObjectNotFound:
                        pass
                    else:
                        # BUGFIX: this block was previously the `else` of the
                        # enclosing `if`, so it never ran when reindexing was
                        # enabled; it belongs to the try/except/else, i.e. only
                        # when package_show succeeded.
                        for extra in package_dict.get('extras', []):
                            if extra['key'] == 'harvest_object_id':
                                extra['value'] = harvest_object.id
                        if package_dict:
                            package_index = PackageSearchIndex()
                            package_index.index_package(package_dict)

                log.info('Document with GUID %s unchanged, skipping...' %
                         (harvest_object.guid))
            else:
                package_schema = logic.schema.default_update_package_schema()
                package_schema['tags'] = tag_schema
                context['schema'] = package_schema

                package_dict['id'] = harvest_object.package_id
                try:
                    package_id = p.toolkit.get_action('package_update')(
                        context, package_dict)
                    log.info('Updated package %s with guid %s', package_id,
                             harvest_object.guid)
                except p.toolkit.ValidationError as e:
                    self._save_object_error(
                        'Validation Error: %s' %
                        six.text_type(e.error_summary), harvest_object,
                        'Import')
                    return False

        model.Session.commit()

        return True
class Validation(toolkit.CkanCommand):
    '''Validation commands
    
    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or the one specified.
    
        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata in the db and
            writes a report in CSV format to the given filepath.
    
        validation file <filename>.xml
            Performs validation on the given metadata file.


    '''
    summary = __doc__.split(u'\n')[0]
    usage = __doc__
    max_args = 3
    min_args = 0

    def command(self):
        ''' '''
        if not self.args or self.args[0] in [u'--help', u'-h', u'help']:
            print self.usage
            sys.exit(1)

        self._load_config()

        cmd = self.args[0]
        if cmd == u'report':
            self.report()
        elif cmd == u'report-csv':
            self.report_csv()
        elif cmd == u'file':
            self.validate_file()
        else:
            print u'Command %s not recognized' % cmd

    def report(self):
        ''' '''
        from ckan import model
        from ckanext.spatial.lib.reports import validation_report

        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print u'Package ref "%s" not recognised' % package_ref
                sys.exit(1)
        else:
            pkg = None

        report = validation_report(package_id=pkg.id)
        for row in report.get_rows_html_formatted():
            print
            for i, col_name in enumerate(report.column_names):
                print u'  %s: %s' % (col_name, row[i])

    def validate_file(self):
        ''' '''
        from ckanext.spatial.harvesters import SpatialHarvester
        from ckanext.spatial.model import ISODocument

        if len(self.args) > 2:
            print u'Too many parameters %i' % len(self.args)
            sys.exit(1)
        if len(self.args) < 2:
            print u'Not enough parameters %i' % len(self.args)
            sys.exit(1)
        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print u'Filepath %s not found' % metadata_filepath
            sys.exit(1)
        with open(metadata_filepath, u'rb') as f:
            metadata_xml = f.read()

        validators = SpatialHarvester()._get_validator()
        print u'Validators: %r' % validators.profiles
        try:
            xml_string = metadata_xml.encode(u'utf-8')
        except UnicodeDecodeError, e:
            print u'ERROR: Unicode Error reading file \'%s\': %s' % \
                  (metadata_filepath, e)
            sys.exit(1)
            # import pdb; pdb.set_trace()
        xml = etree.fromstring(xml_string)

        # XML validation
        valid, errors = validators.is_valid(xml)

        # CKAN read of values
        if valid:
            try:
                iso_document = ISODocument(xml_string)
                iso_values = iso_document.read_values()
            except Exception, e:
                valid = False
                errors.append(
                    u'CKAN exception reading values from ISODocument: %s' % e)