Example #1
0
    def import_stage(self, harvest_object):
        '''Import the Gemini document carried by a harvest object.

        The harvest object is stored on ``self.obj`` before its content is
        handed to ``import_gemini_object``.

        :param harvest_object: object exposing ``content`` and ``id``.
        :returns: ``True`` when ``import_gemini_object`` completes, ``False``
            when no harvest object was received or its content is ``None``.
        :raises Exception: any error raised by ``import_gemini_object`` is
            logged, saved as an object error and then re-raised.
        '''
        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for harvest object: %r', harvest_object)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        # Save a reference
        self.obj = harvest_object

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        try:
            self.import_gemini_object(harvest_object.content)
        except Exception as e:
            log.error('Exception during import: %s' % text_traceback())
            detail = six.text_type(e)
            # Only append the exception text when it is not blank.
            if detail.strip():
                message = 'Error importing Gemini document: %s' % detail
            else:
                message = 'Error importing Gemini document.'
            self._save_object_error(message, harvest_object, 'Import')
            # Always propagate the failure to the caller after recording it.
            raise
        return True
Example #2
0
    def gather_stage(self, harvest_job):
        '''Create one HarvestObject per identifier returned by the CSW server.

        :param harvest_job: job whose ``source.url`` is the CSW endpoint.
        :returns: list of the ids of the HarvestObjects created, or ``None``
            when the server cannot be contacted, the identifier listing
            fails, or no object was created (a gather error is saved in
            each of those cases).
        '''
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        csw_url = harvest_job.source.url

        try:
            self._setup_csw_client(csw_url)
        except Exception as err:
            self._save_gather_error('Error contacting the CSW server: %s' % err,
                                    harvest_job)
            return None

        log.debug('Starting gathering for %s' % csw_url)
        seen_identifiers = []
        object_ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier in seen_identifiers:
                        # Duplicate of an identifier already turned into an object.
                        log.error(
                            'CSW identifier %r already used, skipping...' %
                            identifier)
                    elif identifier is None:
                        # log an error here? happens with the dutch data
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                    else:
                        # Create a new HarvestObject for this identifier
                        harvest_obj = HarvestObject(guid=identifier,
                                                    job=harvest_job)
                        harvest_obj.save()

                        object_ids.append(harvest_obj.id)
                        seen_identifiers.append(identifier)
                except Exception as err:
                    # A failure on one identifier must not stop the others.
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, err),
                        harvest_job)

        except Exception as err:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                six.text_type(err), harvest_job)
            return None

        if not object_ids:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        return object_ids
class ODMMimuSpatialCSW(GeminiCswHarvester):
    '''Gemini CSW harvester variant registered under the name ``odmcsw``.'''

    def info(self):
        '''Return the name, title and description describing this harvester.'''
        return {
            'name':
            'odmcsw',
            'title':
            'ODM MIMU CSW',
            'description':
            'Gemini Harvester customised for omd mimu dataset harvester'
        }

    def import_gemini_object(self, gemini_string):
        '''Imports the Gemini metadata into CKAN.
        The harvest_source_reference is an ID that the harvest_source uses
        for the metadata document. It is the same ID the Coupled Resources
        use to link dataset and service records.
        Some errors raise Exceptions.

        When ``VALIDATE`` is set and the document is not valid, the errors
        are saved against ``self.obj`` and the import still goes ahead.

        :param gemini_string: the Gemini XML document as a unicode string.
        '''
        log = logging.getLogger(__name__ + '.import')

        # gemini_string is unicode, but lxml doesn't like it when you tell it what
        # encoding to use. So we need to force it to use utf-8 encoding at all times.

        utf8_parser = etree.XMLParser(encoding='utf-8')

        def parse_from_unicode(unicode_str):
            s = unicode_str.encode('utf-8')
            return etree.fromstring(s, parser=utf8_parser)

        xml = parse_from_unicode(gemini_string)
        if VALIDATE:
            valid, profile, errors = self._get_validator().is_valid(xml)
            if not valid:
                out = errors[0][0] + ':\n' + '\n'.join(e[0]
                                                       for e in errors[1:])
                log.error('Errors found for object with GUID %s:' %
                          self.obj.guid)
                self._save_object_error(out, self.obj, 'Import')

        # The string 'unicode' asks lxml for a text (not bytes) result and,
        # unlike the ``unicode`` builtin, also exists on Python 3.
        unicode_gemini_string = etree.tostring(xml, encoding='unicode')

        # may raise Exception for errors
        self.write_package_from_gemini_string(unicode_gemini_string)

    def gather_stage(self, harvest_job):
        '''Create one HarvestObject per identifier returned by the CSW server.

        Changes from original -- additional error handling to skip the
        SyntaxError and Socket Timeouts: those two errors are only logged,
        and the identifiers gathered before they occurred are still returned.

        :param harvest_job: job whose ``source.url`` is the CSW endpoint.
        :returns: list of the ids of the HarvestObjects created, or ``None``
            when the server cannot be contacted, the listing fails with any
            other exception, or no object was created.
        '''
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error('Error contacting the CSW server: %s' % e,
                                    harvest_job)
            return None

        log.debug('Starting gathering for %s' % url)
        used_identifiers = []
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier in used_identifiers:
                        log.error(
                            'CSW identifier %r already used, skipping...' %
                            identifier)
                        continue
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                        # log an error here? happens with the dutch data
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()
                    ids.append(obj.id)
                    used_identifiers.append(identifier)
                except Exception as e:
                    log.error(e)
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, e),
                        harvest_job)
                    continue
        except XMLSyntaxError as e:
            # Skipped on purpose: keep whatever was gathered so far.
            log.error(
                "XML Syntax error gathering the identifiers from the CSW server [%s]",
                str(e))
        except socket.timeout as e:
            # Skipped on purpose: keep whatever was gathered so far.
            log.error(
                "Timeout error gathering the identifiers from the CSW server [%s]",
                str(e))
        except Exception as e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                str(e), harvest_job)
            return None

        if not ids:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        # Hand the created object ids back to the caller; without this the
        # gathered objects were silently dropped.
        return ids
Example #4
0
    def gather_stage(self, harvest_job):
        '''Compare the CSW server's identifiers with the current objects.

        Identifiers only on the server are marked ``new``, identifiers present
        on both sides ``change``, and identifiers only in the database
        ``delete``; one HarvestObject carrying that status extra is created
        for each of them.

        :param harvest_job: job whose ``source`` supplies the CSW url, the
            configuration and the harvest source id.
        :returns: list of the ids of the HarvestObjects created, or ``None``
            when the server cannot be contacted, the identifier listing
            fails, or there is nothing to create (a gather error is saved
            in each of those cases).
        '''
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('CswHarvester gather_stage for job: %r', harvest_job)

        source = harvest_job.source
        url = source.url
        self._set_source_config(source.config)

        try:
            self._setup_csw_client(url)
        except Exception as err:
            self._save_gather_error('Error contacting the CSW server: %s' % err,
                                    harvest_job)
            return None

        # Map every current object of this source to its package.
        current_objects = model.Session.query(
            HarvestObject.guid, HarvestObject.package_id
        ).filter(
            HarvestObject.current == True  # noqa: E712 (SQL expression)
        ).filter(
            HarvestObject.harvest_source_id == source.id
        )
        guid_to_package_id = {
            guid: package_id for guid, package_id in current_objects
        }
        guids_in_db = set(guid_to_package_id)

        # extract cql filter if any
        cql = self.source_config.get('cql')

        log.debug('Starting gathering for %s' % url)
        guids_in_harvest = set()
        try:
            for identifier in self.csw.getidentifiers(
                    page=10, outputschema=self.output_schema(), cql=cql):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier is not None:
                        guids_in_harvest.add(identifier)
                    else:
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                except Exception as err:
                    # A failure on one identifier must not stop the others.
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, err),
                        harvest_job)

        except Exception as err:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                six.text_type(err), harvest_job)
            return None

        batches = (
            ('new', guids_in_harvest - guids_in_db),
            ('change', guids_in_db & guids_in_harvest),
            ('delete', guids_in_db - guids_in_harvest),
        )

        ids = []
        for status, guids in batches:
            for guid in guids:
                fields = {'guid': guid, 'job': harvest_job}
                if status != 'new':
                    # Only guids already in the database have a package.
                    fields['package_id'] = guid_to_package_id[guid]
                fields['extras'] = [HOExtra(key='status', value=status)]
                obj = HarvestObject(**fields)
                if status == 'delete':
                    # Objects with this guid stop being current before the
                    # delete marker is saved.
                    model.Session.query(HarvestObject).filter_by(
                        guid=guid
                    ).update({'current': False}, False)
                obj.save()
                ids.append(obj.id)

        if not ids:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        return ids