def import_stage(self, harvest_object):
    """Import the content of one harvest object as a Gemini document.

    Stores ``harvest_object`` on ``self.obj`` and hands its content to
    ``self.import_gemini_object``.

    :param harvest_object: the harvest object to import; its ``content``
        and ``id`` attributes are read here.
    :returns: ``True`` when the import succeeded, ``False`` when no object
        was received or the object has no content.
    :raises Exception: any exception raised by ``import_gemini_object`` is
        logged, recorded with ``_save_object_error`` and then re-raised.
    """
    log = logging.getLogger(__name__ + '.import')
    log.debug('Import stage for harvest object: %r', harvest_object)

    if not harvest_object:
        log.error('No harvest object received')
        return False

    # Save a reference
    self.obj = harvest_object

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    try:
        self.import_gemini_object(harvest_object.content)
    except Exception as e:
        log.error('Exception during import: %s', text_traceback())
        detail = six.text_type(e)
        # Only append the exception text when it carries information.
        if detail.strip():
            message = 'Error importing Gemini document: %s' % detail
        else:
            message = 'Error importing Gemini document.'
        self._save_object_error(message, harvest_object, 'Import')
        # NOTE(review): the exception has always been re-raised
        # unconditionally here. A second re-raise guarded by
        # `debug_exception_mode` used to follow this statement but could
        # never execute, so it was removed. Confirm whether the guarded
        # form was the intended behaviour before changing this.
        raise

    return True
def gather_stage(self, harvest_job):
    """Collect record identifiers from the CSW server for a harvest job.

    One ``HarvestObject`` is created and saved per distinct, non-``None``
    identifier yielded by ``self.csw.getidentifiers``.

    :param harvest_job: the job being gathered; ``harvest_job.source.url``
        is used as the CSW endpoint.
    :returns: the list of ids of the saved harvest objects, or ``None``
        when the server could not be contacted, the identifier listing
        failed, or no object was created. Each of those cases is recorded
        with ``_save_gather_error``.
    """
    log = logging.getLogger(__name__ + '.CSW.gather')
    log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)

    # Get source URL
    url = harvest_job.source.url

    try:
        self._setup_csw_client(url)
    except Exception as e:
        self._save_gather_error(
            'Error contacting the CSW server: %s' % e, harvest_job)
        return None

    log.debug('Starting gathering for %s', url)

    # A set keeps the duplicate check O(1) per identifier.
    seen_identifiers = set()
    ids = []
    try:
        for identifier in self.csw.getidentifiers(page=10):
            try:
                log.info('Got identifier %s from the CSW', identifier)
                if identifier is None:
                    # log an error here? happens with the dutch data
                    log.error('CSW returned identifier %r, skipping...',
                              identifier)
                    continue
                if identifier in seen_identifiers:
                    log.error('CSW identifier %r already used, skipping...',
                              identifier)
                    continue

                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=identifier, job=harvest_job)
                obj.save()

                ids.append(obj.id)
                seen_identifiers.add(identifier)
            except Exception as e:
                # A failure on one identifier must not abort the whole run.
                self._save_gather_error(
                    'Error for the identifier %s [%r]' % (identifier, e),
                    harvest_job)
                continue
    except Exception as e:
        log.error('Exception: %s', text_traceback())
        self._save_gather_error(
            'Error gathering the identifiers from the CSW server [%s]'
            % six.text_type(e),
            harvest_job)
        return None

    if not ids:
        self._save_gather_error(
            'No records received from the CSW server', harvest_job)
        return None

    return ids
class ODMMimuSpatialCSW(GeminiCswHarvester):
    """Gemini CSW harvester customised for the ODM MIMU dataset source."""

    def info(self):
        """Return the name, title and description of this harvester."""
        return {
            'name': 'odmcsw',
            'title': 'ODM MIMU CSW',
            'description': 'Gemini Harvester customised for omd mimu dataset harvester'
        }

    def import_gemini_object(self, gemini_string):
        '''Imports the Gemini metadata into CKAN.

        The document is parsed, optionally validated (when ``VALIDATE`` is
        set) and then passed to ``write_package_from_gemini_string``.
        Validation errors are logged and saved against ``self.obj`` but do
        not stop the import.

        :param gemini_string: the Gemini XML document as a unicode string.

        Some errors raise Exceptions.
        '''
        log = logging.getLogger(__name__ + '.import')

        # gemini_string is unicode, but lxml doesn't like it when you tell
        # it what encoding to use. So we need to force it to use utf-8
        # encoding at all times.
        utf8_parser = etree.XMLParser(encoding='utf-8')
        xml = etree.fromstring(gemini_string.encode('utf-8'),
                               parser=utf8_parser)

        if VALIDATE:
            valid, profile, errors = self._get_validator().is_valid(xml)
            if not valid:
                out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:])
                log.error('Errors found for object with GUID %s:',
                          self.obj.guid)
                self._save_object_error(out, self.obj, 'Import')

        # The string 'unicode' asks lxml for a text (not bytes) result and,
        # unlike the Python 2 only `unicode` builtin, exists on every
        # Python version.
        unicode_gemini_string = etree.tostring(xml, encoding='unicode')

        # may raise Exception for errors
        self.write_package_from_gemini_string(unicode_gemini_string)

    def gather_stage(self, harvest_job):
        """Collect record identifiers from the CSW server for a harvest job.

        Changes from original -- additional error handling to skip the
        SyntaxError and Socket Timeouts: when the identifier listing stops
        on an ``XMLSyntaxError`` or ``socket.timeout`` the error is only
        logged and the identifiers gathered so far are kept.

        :param harvest_job: the job being gathered; ``harvest_job.source.url``
            is used as the CSW endpoint.
        :returns: the list of ids of the saved harvest objects, or ``None``
            when the server could not be contacted, the listing failed
            with any other exception, or no object was created.
        """
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)

        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error(
                'Error contacting the CSW server: %s' % e, harvest_job)
            return None

        log.debug('Starting gathering for %s', url)

        # A set keeps the duplicate check O(1) per identifier.
        seen_identifiers = set()
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier is None:
                        # log an error here? happens with the dutch data
                        log.error('CSW returned identifier %r, skipping...',
                                  identifier)
                        continue
                    if identifier in seen_identifiers:
                        log.error(
                            'CSW identifier %r already used, skipping...',
                            identifier)
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()

                    ids.append(obj.id)
                    seen_identifiers.add(identifier)
                except Exception as e:
                    # A failure on one identifier must not abort the run.
                    log.error(e)
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, e),
                        harvest_job)
                    continue
        except XMLSyntaxError as e:
            # Deliberately non-fatal: keep whatever was gathered so far.
            log.error(
                "XML Syntax error gathering the identifiers from the CSW server [%s]",
                str(e))
        except socket.timeout as e:
            # Deliberately non-fatal: keep whatever was gathered so far.
            log.error(
                "Timeout error gathering the identifiers from the CSW server [%s]",
                str(e))
        except Exception as e:
            log.error('Exception: %s', text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]'
                % str(e),
                harvest_job)
            return None

        # Without this tail the method fell off the end and returned None
        # even after a successful gather.
        if not ids:
            self._save_gather_error(
                'No records received from the CSW server', harvest_job)
            return None

        return ids
def gather_stage(self, harvest_job):
    """Work out which CSW records are new, changed or deleted for a job.

    The identifiers yielded by ``self.csw.getidentifiers`` are compared
    with the guids of the source's current harvest objects. One
    ``HarvestObject`` is saved per guid, carrying a ``status`` extra of
    ``new``, ``change`` or ``delete``; for deleted guids the matching
    harvest objects are also updated to ``current = False``.

    :param harvest_job: the job being gathered; its ``source`` supplies
        the CSW url, the source config and the source id.
    :returns: the list of ids of the saved harvest objects, or ``None``
        when the server could not be contacted, the identifier listing
        failed, or there was nothing to save.
    """
    log = logging.getLogger(__name__ + '.CSW.gather')
    log.debug('CswHarvester gather_stage for job: %r', harvest_job)

    # Get source URL
    source = harvest_job.source
    url = source.url

    self._set_source_config(source.config)

    try:
        self._setup_csw_client(url)
    except Exception as e:
        self._save_gather_error(
            'Error contacting the CSW server: %s' % e, harvest_job)
        return None

    # Map every guid currently known for this source to its package id.
    current_objects = model.Session.query(
        HarvestObject.guid, HarvestObject.package_id
    ).filter(
        HarvestObject.current == True  # noqa: E712 (SQL expression)
    ).filter(
        HarvestObject.harvest_source_id == source.id
    )
    guid_to_package_id = {
        guid: package_id for guid, package_id in current_objects
    }
    guids_in_db = set(guid_to_package_id)

    # extract cql filter if any
    cql = self.source_config.get('cql')

    log.debug('Starting gathering for %s' % url)
    guids_in_harvest = set()
    try:
        identifiers = self.csw.getidentifiers(
            page=10, outputschema=self.output_schema(), cql=cql)
        for identifier in identifiers:
            try:
                log.info('Got identifier %s from the CSW', identifier)
                if identifier is not None:
                    guids_in_harvest.add(identifier)
                else:
                    log.error('CSW returned identifier %r, skipping...'
                              % identifier)
            except Exception as e:
                # Record the problem and carry on with the next identifier.
                self._save_gather_error(
                    'Error for the identifier %s [%r]' % (identifier, e),
                    harvest_job)
    except Exception as e:
        log.error('Exception: %s' % text_traceback())
        self._save_gather_error(
            'Error gathering the identifiers from the CSW server [%s]'
            % six.text_type(e),
            harvest_job)
        return None

    new = guids_in_harvest - guids_in_db
    delete = guids_in_db - guids_in_harvest
    change = guids_in_db & guids_in_harvest

    ids = []

    for guid in new:
        created = HarvestObject(
            guid=guid, job=harvest_job,
            extras=[HOExtra(key='status', value='new')])
        created.save()
        ids.append(created.id)

    for guid in change:
        changed = HarvestObject(
            guid=guid, job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HOExtra(key='status', value='change')])
        changed.save()
        ids.append(changed.id)

    for guid in delete:
        removed = HarvestObject(
            guid=guid, job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HOExtra(key='status', value='delete')])
        # Update the existing objects for this guid before saving the
        # delete marker (order kept as in the original implementation).
        model.Session.query(HarvestObject).filter_by(
            guid=guid
        ).update({'current': False}, False)
        removed.save()
        ids.append(removed.id)

    if not ids:
        self._save_gather_error(
            'No records received from the CSW server', harvest_job)
        return None

    return ids