def harvest_object_create(context, data_dict):
    """
    Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[
            HarvestObjectExtra(key=k, value=v)
            for k, v in data.get('extras', {}).items()
        ])

    obj.save()
    return harvest_object_dictize(obj, context)
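For reference, a minimal usage sketch of how the action above might be invoked from extension code, assuming CKAN's plugins toolkit is available; the job id, guid, content and extras shown are placeholders rather than values from the original source:

from ckan.plugins import toolkit

# Hedged usage sketch: 'job-id' and the other values are placeholders.
harvest_object = toolkit.get_action('harvest_object_create')(
    {'ignore_auth': True},                      # context
    {
        'job_id': 'job-id',                     # required by the schema above
        'guid': 'remote-dataset-guid',          # optional
        'content': '<xml>remote record</xml>',  # optional
        'extras': {'status': 'new'},            # stored as HarvestObjectExtra rows
    })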
def setup_class(cls):
    try:
        from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    cls.content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content2)

    hoe = HarvestObjectExtra(
        key='original_document',
        value=cls.original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    cls.object_id_1 = ho1.id
    cls.object_id_2 = ho2.id
def gather_stage(self, harvest_job):
    log.debug('In DotStatHarvester gather_stage')
    # For each row of data, use its ID as the GUID and save a harvest object.
    # Return a list of the IDs of these new harvest objects.
    try:
        harvest_obj_ids = []
        self._set_config(harvest_job.source.config)
        base_url = harvest_job.source.url

        try:
            # Get list of endpoint ids
            endpoints = self.get_endpoints(base_url)
        except (AccessTypeNotAvailableError, KeyError):
            log.debug('Endpoint function failed')

        # Make a harvest object for each dataset
        # Set the GUID to the dataset's ID (DF_SDG etc.)
        for agency_id, _id, version in endpoints:
            harvest_obj = HarvestObject(
                guid="{}-{}".format(agency_id, _id),
                job=harvest_job
            )
            harvest_obj.extras = [
                HarvestObjectExtra(key='stats_guid', value=_id),
                HarvestObjectExtra(key='version', value=version)
            ]
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)

        log.debug('IDs: {}'.format(harvest_obj_ids))
        return harvest_obj_ids

    except Exception as e:
        self._save_gather_error(
            'Unable to get content for URL: %s: %s / %s'
            % (base_url, str(e), traceback.format_exc()), harvest_job)
def delete_geocat_ids(self, harvest_job, harvest_obj_ids, packages_to_delete):
    delete_harvest_obj_ids = []

    for package_info in packages_to_delete:
        obj = HarvestObject(
            guid=package_info[1].name,
            job=harvest_job,
            extras=[HarvestObjectExtra(key='import_action', value='delete')])
        obj.save()
        delete_harvest_obj_ids.append(obj.id)

    return delete_harvest_obj_ids
def gather_stage(self, harvest_job):
    if harvest_job.source.url.startswith('basic_test'):
        obj = HarvestObject(guid='test1', job=harvest_job)
        obj.extras.append(HarvestObjectExtra(key='key', value='value'))
        obj2 = HarvestObject(guid='test2', job=harvest_job)
        obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
        obj.add()
        obj2.add()
        obj3.save()  # this will commit both
        return [obj.id, obj2.id, obj3.id]
    return []
def _make_harvest_objs(datasets):
    '''Create HarvestObject with Socrata dataset content.'''
    obj_ids = []
    guids = []
    for d in datasets:
        log.debug('Creating HarvestObject for {} {}'.format(
            d['resource']['name'], d['resource']['id']))
        obj = HarvestObject(
            guid=d['resource']['id'],
            job=harvest_job,
            content=json.dumps(d),
            extras=[HarvestObjectExtra(key='status', value='hi!')])
        obj.save()
        obj_ids.append(obj.id)
        guids.append(d['resource']['id'])
    return obj_ids, guids
def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
    '''
    Given a list of guids in the remote source, checks which in the DB
    need to be deleted.

    To do so it queries all guids in the DB for this source and calculates
    the difference.

    For each of these it creates a HarvestObject with the dataset id, marked
    for deletion.

    Returns a list with the ids of the Harvest Objects to delete.
    '''
    object_ids = []

    # Get all previous current guids and dataset ids for this source
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id)\
        .filter(
            HarvestObject.current == True  # noqa
        ).filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = list(guid_to_package_id.keys())

    # Get objects/datasets to delete (ie in the DB but not in the source)
    guids_to_delete = set(guids_in_db) - set(guids_in_source)

    # Create a harvest object for each of them, flagged for deletion
    for guid in guids_to_delete:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HarvestObjectExtra(key='status', value='delete')])

        # Mark the rest of objects for this guid as not current
        model.Session.query(HarvestObject) \
            .filter_by(guid=guid) \
            .update({'current': False}, False)
        obj.save()
        object_ids.append(obj.id)

    return object_ids
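A sketch of how a matching import stage might honour the 'status' = 'delete' extra created above; the _get_object_extra() and _get_user_name() helpers are assumed from the harvester base classes used in the other examples here, so treat this as illustrative rather than the actual downstream code:

def import_stage(self, harvest_object):
    # Illustrative sketch only: assumes the base-class helpers named above.
    status = self._get_object_extra(harvest_object, 'status')
    if status == 'delete':
        # Delete the package this harvest object points to and stop here.
        context = {'model': model, 'session': model.Session,
                   'user': self._get_user_name(), 'ignore_auth': True}
        p.toolkit.get_action('package_delete')(
            context, {'id': harvest_object.package_id})
        log.info('Deleted package %s with guid %s',
                 harvest_object.package_id, harvest_object.guid)
        return True
    # ... handle 'new' / 'change' statuses here ...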
def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
    # This is the same as the method in the base class, except that a different query is used.
    object_ids = []

    portal = self._get_portal_from_config(harvest_job.source.config)

    # Get all previous current guids and dataset ids for this harvested portal independent of
    # the harvest objects. This allows cleaning the harvest data without losing the
    # dataset mappings.

    # Build a subquery to get all the packages of the current portal first
    portal_packages = model.Session.query(model.PackageExtra.package_id.label('id')) \
        .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \
        .filter(model.PackageExtra.value == portal) \
        .subquery()

    # then get the extras.guid for those packages
    query = model.Session.query(model.PackageExtra.value, portal_packages.c.id) \
        .filter(model.PackageExtra.key == 'guid') \
        .filter(model.PackageExtra.package_id == portal_packages.c.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = guid_to_package_id.keys()

    # Get objects/datasets to delete (ie in the DB but not in the source)
    guids_to_delete = set(guids_in_db) - set(guids_in_source)

    # Create a harvest object for each of them, flagged for deletion
    for guid in guids_to_delete:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HarvestObjectExtra(key='status', value='delete')])

        # Mark the rest of objects for this guid as not current
        model.Session.query(HarvestObject) \
            .filter_by(guid=guid) \
            .update({'current': False}, False)
        obj.save()
        object_ids.append(obj.id)

    return object_ids
def fetch_stage(self, harvest_object):
    # Check harvest object status
    status = self._get_object_extra(harvest_object, 'status')

    if status == 'delete':
        # No need to fetch anything, just pass to the import stage
        return True

    # We need to fetch the remote document

    # Get location
    url = self._get_object_extra(harvest_object, 'waf_location')
    if not url:
        self._save_object_error(
            'No location defined for object {0}'.format(harvest_object.id),
            harvest_object)
        return False

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
        self._save_object_error(msg, harvest_object)
        return False

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
        harvest_object.save()
    else:
        extra = HOExtra(
            object=harvest_object,
            key='original_document',
            value=content)
        extra.save()

        extra = HOExtra(
            object=harvest_object,
            key='original_format',
            value=document_format)
        extra.save()

    return True
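The 'original_document' and 'original_format' extras stored above are typically read back in a later stage; a minimal sketch of that lookup, assuming the same _get_object_extra() helper used in the fetch_stage:

# Illustrative sketch: read the extras back, e.g. inside import_stage().
original_document = self._get_object_extra(harvest_object, 'original_document')
original_format = self._get_object_extra(harvest_object, 'original_format')
if original_document and original_format:
    log.debug('Object %s has a non-ISO original (%s) that still needs transforming',
              harvest_object.id, original_format)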
def _run_job_for_single_document(self, harvest_job, object_id):
    harvester = FisbrokerPlugin()

    # we circumvent gather_stage() and fetch_stage() and just load the
    # content with a known object_id and create the harvest object:
    url = harvest_job.source.url
    # _get_content() returns XML
    content = harvester._get_content(url)
    obj = HarvestObject(guid=object_id,
                        job=harvest_job,
                        content=content,
                        extras=[HarvestObjectExtra(key='status', value='new')])
    obj.save()

    assert obj, obj.content

    harvester.import_stage(obj)
    Session.refresh(obj)

    harvest_job.status = u'Finished'
    harvest_job.save()

    return obj
        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = u'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        return True


apache = parse.SkipTo(parse.CaselessLiteral(u'<a href='), include=True).suppress() \
    + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName(u'url') \
    + parse.SkipTo(u'</a>',
class DatasetHarvesterBase(HarvesterBase):
    """
    A Harvester for datasets.
    """
    _user_name = None

    def validate_config(self, config):
        if not config:
            return config
        config_obj = yaml.load(config)
        return config

    def load_config(self, harvest_source):
        # Load the harvest source's configuration data. We expect it to be a YAML
        # string. Unfortunately I went ahead of CKAN on this. The stock CKAN harvester
        # only allows JSON in the configuration box. My fork is necessary for this
        # to work: https://github.com/joshdata/ckanext-harvest

        ret = {
            "filters": {},  # map data.json field name to list of values one of which must be present
            "defaults": {},
        }

        source_config = yaml.load(harvest_source.config)

        try:
            ret["filters"].update(source_config["filters"])
        except TypeError:
            pass
        except KeyError:
            pass

        try:
            ret["defaults"].update(source_config["defaults"])
        except TypeError:
            pass
        except KeyError:
            pass

        return ret

    def _get_user_name(self):
        if not self._user_name:
            user = p.toolkit.get_action('get_site_user')({
                'model': model,
                'ignore_auth': True
            }, {})
            self._user_name = user['name']
        return self._user_name

    def context(self):
        return {"user": self._get_user_name(), "ignore_auth": True}

    # SUBCLASSES MUST IMPLEMENT
    def load_remote_catalog(self, harvest_job):
        raise Exception("Not implemented")

    def extra_schema(self):
        return {
            'validator_schema': [ignore_empty, unicode, validate_schema],
        }

    def gather_stage(self, harvest_job):
        log.debug('In %s gather_stage (%s)' %
                  (repr(self), harvest_job.source.url))
        try:
            source_datasets, catalog_values = self.load_remote_catalog(harvest_job)
        except ValueError as e:
            self._save_gather_error("Error loading json content: %s." % (e),
                                    harvest_job)
            return []

        tmp_superThemes = [
            "agri", "educ", "econ", "ener", "envi", "gove", "heal", "intr",
            "just", "regi", "soci", "tech", "tran"
        ]
        ckan_host = ''
        # Call to config.ini to load superTheme list
        if 'CKAN_CONFIG' in environ:
            if path.exists(environ['CKAN_CONFIG']):
                try:
                    tmp_ckan_config = ConfigParser()
                    tmp_ckan_config.read(environ['CKAN_CONFIG'])
                except IOError:
                    log.warn(
                        'Error loading CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])
                except Exception:
                    log.warn(
                        'Unknown error loading CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])
                try:
                    ckan_host = tmp_ckan_config.get('app:main', 'ckan.site_url')
                except Exception:
                    log.warn(
                        'Error loading "ckan.site_url" from CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])

        # Get superThemeTaxonomy
        try:
            if len(ckan_host) > 0:
                stt_url = '{site_url}/superThemeTaxonomy.json'.format(
                    site_url=ckan_host)
                superThemeTaxonomy = requests.get(stt_url)
                superThemeTaxonomy = superThemeTaxonomy.json()
                if len(superThemeTaxonomy) == 0:
                    raise Exception('SuperThemeTaxonomy JSON is empty')
                if 'id' not in [theme for theme in superThemeTaxonomy]:
                    raise Exception(
                        'SuperThemeTaxonomy JSON does not contain an "id" field')
                tmp_superThemes = [
                    theme['id'] for theme in superThemeTaxonomy
                ]
                log.info("superThemeTaxonomy loaded!")
            else:
                raise Exception('The config.ini field "site_url" is empty.')
        except Exception as e:
            log.warn("Error getting \"ThemeTaxonomy.json\", err: %s.", e)

        superThemes = tmp_superThemes

        for dataset in source_datasets:
            # Delete the @type key if it exists
            try:
                del dataset['@type']
            except Exception:
                pass
            try:
                foo = dataset['theme']
                log.info('Theme exists and its value is: {0}.'.format(foo))
            except IndexError:
                log.warn('The field "theme" does not exist; filling it with an empty list.')
                dataset.update({'theme': []})
            try:
                tags = dataset['keyword']
                themes = dataset['theme']
                if len(themes) > 0:
                    if type(tags) is list:
                        dataset['keyword'] = tags + themes
                    else:
                        dataset['keyword'] = [tags] + themes
            except IndexError:
                pass
            try:
                dataset.update({'author_email': dataset['publisher']['mbox']})
            except IndexError:
                log.warn(
                    'The "publisher" field for "{0}" does not contain an "mbox" field.'
                    .format(dataset['title']))
                dataset.update({'author_mail': "unknow"})
            except Exception:
                log.warn(
                    'The "publisher" field for "{0}" failed. This error is critical; '
                    'the "mbox" field will be filled in to avoid future errors.'
                    .format(dataset['title']))
                dataset.update({'author_email': "unknow"})
            try:
                dataset.update({'author': dataset['publisher']['name']})
            except IndexError:
                log.warn(
                    'The "publisher" field for "{0}" does not contain a "name" field.'
                    .format(dataset['title']))
                dataset.update({'author': "unknow"})
            except Exception:
                log.warn(
                    'The "publisher" field for "{0}" failed. This error is critical; '
                    'the "name" field will be filled in to avoid future errors.'
                    .format(dataset['title']))
                dataset.update({'author': "unknow"})
            try:
                del dataset['publisher']
            except Exception:
                pass
            try:
                dataset.update(
                    {'maintainer_email': dataset['contactPoint']['hasEmail']})
                dataset.update({'maintainer': dataset['contactPoint']['fn']})
                del dataset['contactPoint']
            except Exception:
                dataset.update({'maintainer_email': ""})
                dataset.update({'maintainer': ""})
                del dataset['contactPoint']

        DATAJSON_SCHEMA = source_datasets
        schema_version = '1.2'
        parent_identifiers = set()
        child_identifiers = set()
        catalog_extras = {}
        if isinstance(catalog_values, dict):
            schema_version = '1.2'

            for dataset in source_datasets:
                parent_identifier = dataset.get('isPartOf')
                if parent_identifier:
                    parent_identifiers.add(parent_identifier)
                    child_identifiers.add(dataset.get('identifier'))

            # get a list of needed catalog values and put into hobj
            catalog_fields = ['title', 'description']
            catalog_extras = dict(('catalog_' + k, v)
                                  for (k, v) in catalog_values.iteritems()
                                  if k in catalog_fields)

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_identifier,
        # which corresponds to the remote catalog's 'identifier' field.
        # Make a mapping so we know how to update existing records.
        # Added: mark all existing parent datasets.
        existing_datasets = {}
        existing_parents = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "identifier")
            is_parent = self.find_extra(pkg, "collection_metadata")
            if sid:
                existing_datasets[sid] = pkg
            if is_parent and pkg.get("state") == "active":
                existing_parents[sid] = pkg

        # which parent has been demoted to child level?
        existing_parents_demoted = set(
            identifier for identifier in existing_parents.keys()
            if identifier not in parent_identifiers)

        # which dataset has been promoted to parent level?
        existing_datasets_promoted = set(
            identifier for identifier in existing_datasets.keys()
            if identifier in parent_identifiers
            and identifier not in existing_parents.keys())

        # if there are any new parents, we will have to harvest parents
        # first, mark the status in harvest_source config, which
        # triggers a children harvest_job after the parents job is finished.
        source = harvest_job.source
        source_config = json.loads(source.config or '{}')
        # run status: None, or parents_run, or children_run?
        run_status = source_config.get('datajson_collection')
        if parent_identifiers:
            for parent in parent_identifiers & child_identifiers:
                self._save_gather_error(
                    "Collection identifier '%s' cannot be isPartOf another "
                    "collection." % parent, harvest_job)

            new_parents = set(identifier for identifier in parent_identifiers
                              if identifier not in existing_parents.keys())
            if new_parents:
                if not run_status:
                    # fresh start
                    run_status = 'parents_run'
                    source_config['datajson_collection'] = run_status
                    source.config = json.dumps(source_config)
                    source.save()
                elif run_status == 'children_run':
                    # it means new parents are tried and failed.
                    # but skip some which have previously reported with
                    # parent_identifiers & child_identifiers
                    for parent in new_parents - \
                            (parent_identifiers & child_identifiers):
                        self._save_gather_error(
                            "Collection identifier '%s' not found. Records "
                            "which are part of this collection will not be "
                            "harvested." % parent, harvest_job)
                else:
                    # run_status was parents_run, and did not finish.
                    # something wrong but not sure what happened.
                    # let's leave it as it is, let it run one more time.
                    pass
            else:
                # all parents are already in place. run it as usual.
                run_status = None
        elif run_status:
            # need to clear run_status
            run_status = None
            source_config['datajson_collection'] = run_status
            source.config = json.dumps(source_config)
            source.save()

        # Create HarvestObjects for any records in the remote catalog.
        object_ids = []
        seen_datasets = set()
        unique_datasets = set()

        filters = self.load_config(harvest_job.source)["filters"]

        for dataset in source_datasets:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Check the config's filters to see if we should import this dataset.
            # For each filter, check that the value specified in the data.json file
            # is among the permitted values in the filter specification.
            matched_filters = True
            for k, v in filters.items():
                if dataset.get(k) not in v:
                    matched_filters = False
            if not matched_filters:
                continue

            if parent_identifiers and new_parents \
                    and dataset['identifier'] not in parent_identifiers \
                    and dataset.get('isPartOf') in new_parents:
                if run_status == 'parents_run':
                    # skip those whose parents still need to run.
                    continue
                else:
                    # which is 'children_run'.
                    # error out since parents got issues.
                    self._save_gather_error(
                        "Record with identifier '%s': isPartOf '%s' points to "
                        "an erroneous record." % (dataset['identifier'],
                                                  dataset.get('isPartOf')),
                        harvest_job)
                    continue

            # Some sources contain duplicate identifiers. Skip all except the first one
            if dataset['identifier'] in unique_datasets:
                self._save_gather_error(
                    "Duplicate entry ignored for identifier: '%s'."
                    % (dataset['identifier']), harvest_job)
                continue
            unique_datasets.add(dataset['identifier'])

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.
            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this dataset
                # in the package so we can avoid updating datasets that
                # don't look like they've changed.
                if pkg.get("state") == "active" \
                        and dataset['identifier'] not in existing_parents_demoted \
                        and dataset['identifier'] not in existing_datasets_promoted \
                        and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(
                            dataset, harvest_job.source, catalog_extras, schema_version):
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the remote catalog file.
            extras = [
                HarvestObjectExtra(key='schema_version', value=schema_version)
            ]
            if dataset['identifier'] in parent_identifiers:
                extras.append(
                    HarvestObjectExtra(key='is_collection', value=True))
            elif dataset.get('isPartOf'):
                parent_pkg_id = existing_parents[dataset.get('isPartOf')]['id']
                extras.append(
                    HarvestObjectExtra(key='collection_pkg_id',
                                       value=parent_pkg_id))
            # FIX EXTRAS
            # for k,v in
            for k, v in catalog_extras.iteritems():
                extras.append(HarvestObjectExtra(key=k, value=v))
            # ----
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                extras=extras,
                content=json.dumps(dataset, sort_keys=True))
            # use sort_keys to preserve field order so hashes of this string
            # are constant from run to run
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets:
                continue  # was just updated
            if pkg.get("state") == "deleted":
                continue  # already deleted
            pkg["state"] = "deleted"
            log.warn('deleting package %s (%s) because it is no longer in %s'
                     % (pkg["name"], pkg["id"], harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)
            obj = HarvestObject(
                guid=pkg_id,
                package_id=pkg["id"],
                job=harvest_job,
            )
            obj.save()
            object_ids.append(obj.id)

        return object_ids
        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        return True


apache = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
    + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url') \
    + parse.SkipTo("</a>", include=True).suppress() \
def gather_stage(self, harvest_job):
    # The gather stage scans a remote resource (like a /data.json file) for
    # a list of datasets to import.

    log.debug('In %s gather_stage (%s)' %
              (repr(self), harvest_job.source.url))

    # Start gathering.
    try:
        source_datasets, catalog_values = self.load_remote_catalog(harvest_job)
    except ValueError as e:
        self._save_gather_error("Error loading json content: %s." % (e),
                                harvest_job)
        return []

    if len(source_datasets) == 0:
        return []

    DATAJSON_SCHEMA = {
        "https://project-open-data.cio.gov/v1.1/schema": '1.1',
    }

    # schema version is default 1.0, or a valid one (1.1, ...)
    schema_version = '1.0'
    parent_identifiers = set()
    child_identifiers = set()
    catalog_extras = {}
    if isinstance(catalog_values, dict):
        schema_value = catalog_values.get('conformsTo', '')
        if schema_value not in DATAJSON_SCHEMA.keys():
            self._save_gather_error(
                'Error reading json schema value. The given value is %s.'
                % ('empty' if schema_value == '' else schema_value),
                harvest_job)
            return []
        schema_version = DATAJSON_SCHEMA.get(schema_value, '1.0')

        for dataset in source_datasets:
            parent_identifier = dataset.get('isPartOf')
            if parent_identifier:
                parent_identifiers.add(parent_identifier)
                child_identifiers.add(dataset.get('identifier'))

        # get a list of needed catalog values and put into hobj
        catalog_fields = ['@context', '@id', 'conformsTo', 'describedBy']
        catalog_extras = dict(('catalog_' + k, v)
                              for (k, v) in catalog_values.iteritems()
                              if k in catalog_fields)

    # Loop through the packages we've already imported from this source
    # and go into their extra fields to get their source_identifier,
    # which corresponds to the remote catalog's 'identifier' field.
    # Make a mapping so we know how to update existing records.
    # Added: mark all existing parent datasets.
    existing_datasets = {}
    existing_parents = {}
    for hobj in model.Session.query(HarvestObject).filter_by(
            source=harvest_job.source, current=True):
        try:
            pkg = get_action('package_show')(self.context(), {
                "id": hobj.package_id
            })
        except:
            # reference is broken
            continue
        sid = self.find_extra(pkg, "identifier")
        is_parent = self.find_extra(pkg, "collection_metadata")
        if sid:
            existing_datasets[sid] = pkg
        if is_parent and pkg.get("state") == "active":
            existing_parents[sid] = pkg

    # which parent has been demoted to child level?
    existing_parents_demoted = set(
        identifier for identifier in existing_parents.keys()
        if identifier not in parent_identifiers)

    # which dataset has been promoted to parent level?
    existing_datasets_promoted = set(
        identifier for identifier in existing_datasets.keys()
        if identifier in parent_identifiers
        and identifier not in existing_parents.keys())

    source = harvest_job.source
    source_config = self.load_config(source)

    if parent_identifiers:
        for parent in parent_identifiers & child_identifiers:
            self._save_gather_error(
                "Collection identifier '%s' cannot be isPartOf another "
                "collection." % parent, harvest_job)

        new_parents = set(identifier for identifier in parent_identifiers
                          if identifier not in existing_parents.keys())

    # Create HarvestObjects for any records in the remote catalog.
    object_ids = []
    seen_datasets = set()
    unique_datasets = set()

    filters = source_config["filters"]

    for dataset in source_datasets:
        # Create a new HarvestObject for this dataset and save the
        # dataset metadata inside it for later.

        # Check the config's filters to see if we should import this dataset.
        # For each filter, check that the value specified in the data.json file
        # is among the permitted values in the filter specification.
        matched_filters = True
        for k, v in filters.items():
            if dataset.get(k) not in v:
                matched_filters = False
        if not matched_filters:
            continue

        # Some sources contain duplicate identifiers. Skip all except the first one
        if dataset['identifier'] in unique_datasets:
            self._save_gather_error(
                "Duplicate entry ignored for identifier: '%s'."
                % (dataset['identifier']), harvest_job)
            continue
        unique_datasets.add(dataset['identifier'])

        # Get the package_id of this resource if we've already imported
        # it into our system. Otherwise, assign a brand new GUID to the
        # HarvestObject. I'm not sure what the point is of that.
        log.info('Check existing dataset: {}'.format(dataset['identifier']))
        if dataset['identifier'] in existing_datasets:
            pkg = existing_datasets[dataset["identifier"]]
            pkg_id = pkg["id"]
            seen_datasets.add(dataset['identifier'])

            # We store a hash of the dict associated with this dataset
            # in the package so we can avoid updating datasets that
            # don't look like they've changed.
            if pkg.get("state") == "active" \
                    and dataset['identifier'] not in existing_parents_demoted \
                    and dataset['identifier'] not in existing_datasets_promoted \
                    and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(
                        dataset, source, catalog_extras, schema_version):
                log.info('SKIP: {}'.format(dataset['identifier']))
                continue
        else:
            pkg_id = uuid.uuid4().hex

        # Create a new HarvestObject and store in it the GUID of the
        # existing dataset (if it exists here already) and the dataset's
        # metadata from the remote catalog file.
        extras = [
            HarvestObjectExtra(key='schema_version', value=schema_version)
        ]
        if dataset['identifier'] in parent_identifiers:
            extras.append(
                HarvestObjectExtra(key='is_collection', value=True))
        elif dataset.get('isPartOf'):
            is_part_of = dataset.get('isPartOf')
            existing_parent = existing_parents.get(is_part_of, None)
            if existing_parent is None:
                # maybe the parent is not harvested yet
                parent_pkg_id = 'IPO:{}'.format(is_part_of)
            else:
                parent_pkg_id = existing_parent['id']
            extras.append(
                HarvestObjectExtra(key='collection_pkg_id',
                                   value=parent_pkg_id))
        for k, v in catalog_extras.iteritems():
            extras.append(HarvestObjectExtra(key=k, value=v))

        log.info('Datajson creates a HO: {}'.format(dataset['identifier']))
        obj = HarvestObject(
            guid=pkg_id,
            job=harvest_job,
            extras=extras,
            content=json.dumps(dataset, sort_keys=True))
        # use sort_keys to preserve field order so hashes of this string
        # are constant from run to run
        obj.save()

        # we are sorting parent datasets in the list first and then children
        # so that the parents are harvested first; we then use the parent id
        # to associate the children to the parent
        if dataset['identifier'] in parent_identifiers:
            object_ids.insert(0, obj.id)
        else:
            object_ids.append(obj.id)

    # Remove packages no longer in the remote catalog.
    for upstreamid, pkg in existing_datasets.items():
        if upstreamid in seen_datasets:
            continue  # was just updated
        if pkg.get("state") == "deleted":
            continue  # already deleted
        pkg["state"] = "deleted"
        log.warn('deleting package %s (%s) because it is no longer in %s'
                 % (pkg["name"], pkg["id"], harvest_job.source.url))
        get_action('package_update')(self.context(), pkg)
        obj = HarvestObject(
            guid=pkg_id,
            package_id=pkg["id"],
            job=harvest_job,
        )
        obj.save()
        object_ids.append(obj.id)

    return object_ids
def test_api(self, app):
    try:
        from ckanext.harvest.model import (
            HarvestObject,
            HarvestJob,
            HarvestSource,
            HarvestObjectExtra,
        )
    except ImportError:
        raise pytest.skip(
            "The harvester extension is needed for these tests")

    content1 = "<xml>Content 1</xml>"
    ho1 = HarvestObject(
        guid="test-ho-1",
        job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
        content=content1,
    )

    content2 = "<xml>Content 2</xml>"
    original_content2 = "<xml>Original Content 2</xml>"
    ho2 = HarvestObject(
        guid="test-ho-2",
        job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
        content=content2,
    )

    hoe = HarvestObjectExtra(
        key="original_document", value=original_content2, object=ho2
    )

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    object_id_1 = ho1.id
    object_id_2 = ho2.id

    # Access object content
    url = "/harvest/object/{0}".format(object_id_1)
    r = app.get(url, status=200)
    assert (
        r.headers["Content-Type"] == "application/xml; charset=utf-8"
    )
    assert (
        r.body
        == '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>'
    )

    # Access original content in object extra (if present)
    url = "/harvest/object/{0}/original".format(object_id_1)
    r = app.get(url, status=404)

    url = "/harvest/object/{0}/original".format(object_id_2)
    r = app.get(url, status=200)
    assert (
        r.headers["Content-Type"] == "application/xml; charset=utf-8"
    )
    assert (
        r.body
        == '<?xml version="1.0" encoding="UTF-8"?>\n'
        + "<xml>Original Content 2</xml>"
    )
def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
    # This is the same as the method in the base class, except that a different query is used.
    object_ids = []

    portal = self._get_portal_from_config(harvest_job.source.config)

    starttime = time.time()
    # Get all previous current guids and dataset ids for this harvested portal independent of
    # the harvest objects. This allows cleaning the harvest data without losing the
    # dataset mappings.

    # Build a subquery to get all active packages having a GUID first
    subquery = model.Session.query(model.PackageExtra.value, model.Package.id) \
        .join(model.Package, model.Package.id == model.PackageExtra.package_id)\
        .filter(model.Package.state == model.State.ACTIVE) \
        .filter(model.PackageExtra.state == model.State.ACTIVE) \
        .filter(model.PackageExtra.key == 'guid') \
        .subquery()

    # then get all active packages of the current portal and join with their GUIDs if
    # available (outer join)
    query = model.Session.query(model.Package.id, subquery.c.value) \
        .join(model.PackageExtra, model.PackageExtra.package_id == model.Package.id)\
        .outerjoin(subquery, subquery.c.id == model.Package.id)\
        .filter(model.Package.state == model.State.ACTIVE) \
        .filter(model.PackageExtra.state == model.State.ACTIVE) \
        .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \
        .filter(model.PackageExtra.value == portal)

    checkpoint_start = time.time()
    guid_to_package_id = {}
    for package_id, guid in query:
        if guid:
            guid_to_package_id[guid] = package_id
        # Also remove all packages without a GUID, use ID as GUID to share logic below
        else:
            guid_to_package_id[package_id] = package_id
    checkpoint_end = time.time()
    LOGGER.debug('Time for query harvest source related datasets : %s',
                 str(checkpoint_end - checkpoint_start))

    guids_in_db = guid_to_package_id.keys()

    # Get objects/datasets to delete (ie in the DB but not in the source)
    guids_to_delete = set(guids_in_db) - set(guids_in_source)

    # Create a harvest object for each of them, flagged for deletion
    for guid in guids_to_delete:
        obj = HarvestObject(guid=guid, job=harvest_job,
                            package_id=guid_to_package_id[guid],
                            extras=[HarvestObjectExtra(key='status', value='delete')])

        # Mark the rest of objects for this guid as not current
        model.Session.query(HarvestObject) \
            .filter_by(guid=guid) \
            .update({'current': False}, False)
        obj.save()
        object_ids.append(obj.id)

    endtime = time.time()
    LOGGER.debug('Found %s packages for deletion. Time total: %s',
                 len(guids_to_delete), str(endtime - starttime))

    return object_ids
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
       harvest source.'''

    ckan_fb_mapping = {}

    # first, do checks that can be done without connection to FIS-Broker
    for package_id in package_ids:
        package = Package.get(package_id)

        if not package:
            raise PackageIdDoesNotExistError(package_id)

        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)

        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if not harvester_type == HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)

        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)

        ckan_fb_mapping[package.id] = fb_guid

    # get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)

    # Create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.gather_started = datetime.datetime.utcnow()
    assert harvest_job

    # instantiate the CSW connector (on the reasonable assumption that harvester_url is
    # the same for all package_ids)
    package_id = None
    reimported_packages = []
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # query connector to get resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

            # show resource document
            record = csw.records.get(fb_guid, None)
            if record:
                obj = HarvestObject(guid=fb_guid,
                                    job=harvest_job,
                                    content=record.xml,
                                    package_id=package_id,
                                    extras=[
                                        HarvestObjectExtra(key='status', value='change'),
                                        HarvestObjectExtra(key='type', value='reimport'),
                                    ])
                obj.save()

                assert obj, obj.content

                harvester = FisbrokerPlugin()
                harvester.force_import = True
                harvester.import_stage(obj)
                rejection_reason = self._dataset_rejected(obj)
                if rejection_reason:
                    raise FBImportError(package_id, rejection_reason)

                harvester.force_import = False
                Session.refresh(obj)

                reimported_packages.append(record)

            else:
                raise NotFoundInFisbrokerError(package_id, fb_guid)

    except RequestException as error:
        raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))

    # successfully finish harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()

    return reimported_packages
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.individual.gather')
    log.debug('DocHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (url, e), harvest_job)
        return None

    existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
        first()

    def create_extras(url, status):
        return [
            HOExtra(key='doc_location', value=url),
            HOExtra(key='status', value=status)
        ]

    if not existing_object:
        guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
        harvest_object = HarvestObject(job=harvest_job,
                                       extras=create_extras(url, 'new'),
                                       guid=guid)
    else:
        harvest_object = HarvestObject(
            job=harvest_job,
            extras=create_extras(url, 'change'),
            guid=existing_object.guid,
            package_id=existing_object.package_id)

    harvest_object.add()

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
    else:
        extra = HOExtra(
            object=harvest_object,
            key='original_document',
            value=content)
        extra.save()

        extra = HOExtra(
            object=harvest_object,
            key='original_format',
            value=document_format)
        extra.save()

    harvest_object.save()

    return [harvest_object.id]
def test_api(self):
    try:
        from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                           HarvestSource, HarvestObjectExtra)
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=content1)

    content2 = '<xml>Content 2</xml>'
    original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=content2)

    hoe = HarvestObjectExtra(
        key='original_document',
        value=original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    object_id_1 = ho1.id
    object_id_2 = ho2.id

    app = self._get_test_app()

    # Test redirects for old URLs
    url = '/api/2/rest/harvestobject/{0}/xml'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 301)
    assert ('/harvest/object/{0}'.format(object_id_1)
            in r.headers['Location'])

    url = '/api/2/rest/harvestobject/{0}/html'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 301)
    assert ('/harvest/object/{0}/html'.format(object_id_1)
            in r.headers['Location'])

    # Access object content
    url = '/harvest/object/{0}'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'application/xml; charset=utf-8')
    assert_equals(
        r.body,
        '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>')

    # Access original content in object extra (if present)
    url = '/harvest/object/{0}/original'.format(object_id_1)
    r = app.get(url, status=404)
    assert_equals(r.status_int, 404)

    url = '/harvest/object/{0}/original'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'application/xml; charset=utf-8')
    assert_equals(
        r.body,
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        + '<xml>Original Content 2</xml>')

    # Access HTML transformation
    url = '/harvest/object/{0}/html'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body

    url = '/harvest/object/{0}/html/original'.format(object_id_1)
    r = app.get(url, status=404)
    assert_equals(r.status_int, 404)

    url = '/harvest/object/{0}/html'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body

    url = '/harvest/object/{0}/html/original'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body
            if sorted(previous_guids) == sorted(batch_guids):
                # Server does not support pagination or no more pages
                log.debug('Same content, no more pages')
                break

            page = page + 1

            previous_guids = batch_guids

        # Check datasets that need to be deleted
        guids_to_delete = set(guids_in_db) - set(guids_in_source)
        for guid in guids_to_delete:
            obj = HarvestObject(
                guid=guid, job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HarvestObjectExtra(key='status', value='delete')])
            ids.append(obj.id)
            model.Session.query(HarvestObject).\
                filter_by(guid=guid).\
                update({'current': False}, False)
            obj.save()

        return ids

    def fetch_stage(self, harvest_object):
        return True

    def import_stage(self, harvest_object):
        log.debug('In DCATJSONHarvester import_stage')

        if not harvest_object:
            log.error('No harvest object received')
def gather_stage(self, harvest_job):
    log.debug('In DCATHarvester gather_stage')

    ids = []

    # Get the previous guids for this source
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = guid_to_package_id.keys()
    guids_in_source = []

    # Get file contents
    url = harvest_job.source.url

    previous_guids = []
    page = 1
    while True:

        try:
            content, content_type = self._get_content_and_type(
                url, harvest_job, page)
        except requests.exceptions.HTTPError as error:
            if error.response.status_code == 404:
                if page > 1:
                    # Server returned a 404 after the first page, no more
                    # records
                    log.debug('404 after first page, no more pages')
                    break
                else:
                    # Proper 404
                    msg = 'Could not get content. Server responded with 404 Not Found'
                    self._save_gather_error(msg, harvest_job)
                    return None
            else:
                # This should never happen. Raising just in case.
                raise

        if not content:
            return None

        try:
            batch_guids = []
            for guid, as_string in self._get_guids_and_datasets(content):
                '''
                When ABORT is received from the datanorgeHarvester.py, the
                dataset is skipped since it is not transport-related.

                NOTE: This way of filtering transport-related datasets should
                be changed when DIFI gets their new API working. With their
                current API, it is not possible to filter on category, so it
                must be done manually like this.

                THIS IS ALSO USED BY GEONORGE
                '''
                if (as_string == 'ABORT'):
                    log.debug('Dataset skipped, not relevant'.format(
                        guid.encode('utf8')))
                    continue
                log.debug('Got identifier: {0}'.format(
                    guid.encode('utf8')))
                batch_guids.append(guid)

                if guid not in previous_guids:

                    if guid in guids_in_db:
                        # Dataset needs to be updated
                        obj = HarvestObject(
                            guid=guid,
                            job=harvest_job,
                            package_id=guid_to_package_id[guid],
                            content=as_string,
                            extras=[
                                HarvestObjectExtra(key='status',
                                                   value='change')
                            ])
                    else:
                        # Dataset needs to be created
                        obj = HarvestObject(
                            guid=guid,
                            job=harvest_job,
                            content=as_string,
                            extras=[
                                HarvestObjectExtra(key='status',
                                                   value='new')
                            ])
                    obj.save()
                    ids.append(obj.id)

            if len(batch_guids) > 0:
                guids_in_source.extend(
                    set(batch_guids) - set(previous_guids))
            else:
                log.debug('Empty document, no more records')
                # Empty document, no more ids
                break

        except ValueError as e:
            msg = 'Error parsing file: {0}'.format(str(e))
            self._save_gather_error(msg, harvest_job)
            return None
def gather_stage(self, harvest_job):
    log.debug('In DCATJSONHarvester gather_stage')

    ids = []

    # Get the previous guids for this source
    query = \
        model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
        .filter(HarvestObject.current == True) \
        .filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = guid_to_package_id.keys()
    guids_in_source = []

    # Get file contents
    url = harvest_job.source.url

    previous_guids = []
    page = 1
    while True:

        try:
            content, content_type = \
                self._get_content_and_type(url, harvest_job, page)
        except requests.exceptions.HTTPError as error:
            if error.response.status_code == 404:
                if page > 1:
                    # Server returned a 404 after the first page, no more
                    # records
                    log.debug('404 after first page, no more pages')
                    break
                else:
                    # Proper 404
                    msg = 'Could not get content. Server responded with ' \
                          '404 Not Found'
                    self._save_gather_error(msg, harvest_job)
                    return None
            else:
                # This should never happen. Raising just in case.
                raise

        if not content:
            return None

        try:
            batch_guids = []
            for guid, as_string in self._get_guids_and_datasets(content):

                log.debug('Got identifier: {0}'
                          .format(guid.encode('utf8')))
                batch_guids.append(guid)

                if guid not in previous_guids:

                    if guid in guids_in_db:
                        # Dataset needs to be updated
                        obj = HarvestObject(
                            guid=guid,
                            job=harvest_job,
                            package_id=guid_to_package_id[guid],
                            content=as_string,
                            extras=[HarvestObjectExtra(key='status',
                                                       value='change')])
                    else:
                        # Dataset needs to be created
                        obj = HarvestObject(
                            guid=guid,
                            job=harvest_job,
                            content=as_string,
                            extras=[HarvestObjectExtra(key='status',
                                                       value='new')])
                    obj.save()
                    ids.append(obj.id)

            if len(batch_guids) > 0:
                guids_in_source.extend(set(batch_guids) -
                                       set(previous_guids))
            else:
                log.debug('Empty document, no more records')
                # Empty document, no more ids
                break

        except ValueError as e:
            msg = 'Error parsing file: {0}'.format(str(e))
            self._save_gather_error(msg, harvest_job)
            return None
def import_stage(self, harvest_object):
    log.debug('In DotStatHarvester import_stage')
    self._set_config(harvest_object.job.source.config)

    if not harvest_object:
        log.error('No harvest object received')
        self._save_object_error('No harvest object received', harvest_object)
        return False

    try:
        base_url = harvest_object.source.url

        # Parse the SDMX as XML with bs4
        soup = BeautifulSoup(harvest_object.content, 'xml')

        # Make a package dict
        pkg_dict = {}
        pkg_dict['id'] = harvest_object.guid

        # Added thematic string
        pkg_dict['thematic_area_string'] = ["Official Statistics"]

        # Open license for all dotStat resources
        pkg_dict['license_id'] = "other-open"

        # Get owner_org if there is one
        source_dataset = get_action('package_show')(
            {'ignore_auth': True},
            {'id': harvest_object.source.id})
        owner_org = source_dataset.get('owner_org')
        pkg_dict['owner_org'] = owner_org

        # Match other fields with tags in XML structure
        agency_id = self.config['agencyId']
        stats_guid = self._get_object_extra(harvest_object, 'stats_guid')
        structure = soup.find('Dataflow')
        pkg_dict['title'] = structure.find('Name', {"xml:lang": "en"}).text
        pkg_dict['publisher_name'] = structure['agencyID']
        pkg_dict['version'] = structure['version']

        # Need to change url to point to Data Explorer
        de_url = 'https://stats.pacificdata.org/vis?locale=en&dataflow[datasourceId]=SPC2&dataflow[agencyId]={}&dataflow[dataflowId]={}&dataflow[version]={}'.format(
            agency_id, stats_guid, structure['version'])
        pkg_dict['source'] = de_url

        # Set resource to metadata data dictionary (if available)
        annotation = structure.find('Annotations')
        annots = annotation.find_all('Annotation')
        metaurl = None
        for annot in annots:
            metalink = annot.find('AnnotationType')
            if metalink.text == 'EXT_RESOURCE':
                metaurl = annot.find('AnnotationText', {'xml:lang': 'en'}).text.split('|')[1]

        # Set default resource, and metadata pdf if it exists
        if metaurl:
            pkg_dict['resources'] = [
                {
                    'url': 'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                        agency_id, stats_guid, structure['version']),
                    'format': 'CSV',
                    'mimetype': 'CSV',
                    'description': 'All data for {}'.format(pkg_dict['title']),
                    'name': '{} Data CSV'.format(pkg_dict['title'])
                },
                {
                    'url': metaurl,
                    'format': 'PDF',
                    'mimetype': 'PDF',
                    'description': 'Detailed metadata dictionary for {}'.format(pkg_dict['title']),
                    'name': '{} Metadata PDF'.format(pkg_dict['title'])
                }]
        else:
            pkg_dict['resources'] = [
                {
                    'url': 'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                        agency_id, stats_guid, structure['version']),
                    'format': 'CSV',
                    'mimetype': 'CSV',
                    'description': 'All data for {}'.format(pkg_dict['title']),
                    'name': '{} Data CSV'.format(pkg_dict['title'])
                }]

        # Get notes/description if it exists
        try:
            desc = structure.find('Description', {"xml:lang": "en"}).text
            desc += '\nFind more Pacific data on PDH.stat : https://stats.pacificdata.org/'
            pkg_dict['notes'] = desc
        except Exception as e:
            log.error("An error occurred: {}".format(e))
            pkg_dict['notes'] = 'Find more Pacific data on PDH.stat : https://stats.pacificdata.org/'

        # Add tags from CategoryScheme and ConceptScheme
        # List of uninteresting tags
        generic_schemes = ['Time', 'Frequency', 'Observation value',
                           'Observation Status', 'Confidentiality status',
                           'Unit of measure', 'Unit multiplier', 'Base period',
                           'Comment', 'Decimals', 'Data source',
                           'Pacific Island Countries and territories',
                           'Indicator', 'Transformation', 'Reporting type',
                           'Composite breakdown']
        tag_strings = []

        # For finding Category Schemes for tags
        schemes = soup.find('CategorySchemes')
        if schemes is not None:
            catschemes = schemes.find_all('CategoryScheme')
            for catscheme in catschemes:
                cats = catscheme.find_all('Category')
                for cat in cats:
                    found = cat.find('Name', {'xml:lang': 'en'}).text
                    if found not in tag_strings:
                        tag_strings.append(found)

        # For finding Concept Schemes for tags
        concepts = soup.find('Concepts')
        if concepts is not None:
            concschemes = concepts.find_all('ConceptScheme')
            for concscheme in concschemes:
                concepts = concscheme.find_all('Concept')
                for concept in concepts:
                    found = concept.find('Name', {'xml:lang': 'en'}).text
                    if found not in tag_strings:
                        tag_strings.append(found)

        # Tag cleaning
        psp_mapping = {
            'Industry and Services': ['pacific-skills', 'industry', 'training'],
            'Education level': ['pacific-skills', 'education', 'training'],
            'Occupation': ['pacific-skills', 'occupation'],
            'Disability': ['pacific-skills', 'disability'],
            'Economic sector': ['pacific-skills', 'industry', 'training'],
            'Labour force status': ['pacific-skills', 'employment'],
            'Employment status': ['pacific-skills', 'employment'],
            'Labour and employment status': ['pacific-skills', 'employment']
        }
        if len(tag_strings) > 0:
            # Bring in PSP tags
            for tag in tag_strings:
                if tag in list(psp_mapping.keys()):
                    tag_strings.extend(psp_mapping[tag])
            # Remove duplicates
            tag_strings = list(set(tag_strings))
            # Remove tags found in generic_schemes list
            tags = [x.lower() for x in tag_strings if x not in generic_schemes]
            # Make a string of tags for CKAN
            pkg_dict['tag_string'] = ', '.join([munge_tag(tag) for tag in tags])

        '''
        May need modifying when DF_SDG is broken into several DFs.
        This gets the list of indicators for SDG-related dataflows.
        Stores the list of strings in the 'alternate_identifier' field.
        '''
        if soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES'}) is not None:
            pkg_dict['alternate_identifier'] = []
            codelist = soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES'})
            for indic in codelist.findAll('Name', {"xml:lang": "en"}):
                if not indic or indic.text == 'SDG Indicator or Series':
                    continue
                pkg_dict['alternate_identifier'].append(indic.text)

        '''
        When support for metadata endpoints arrives in .Stat, here is how
        more metadata may be found:

        # Use the metadata/flow endpoint
        metadata = requests.get('{}metadata/data/{}/all?detail=full'.format(base_url, harvest_object.guid))
        # Parse with bs4
        parsed = BeautifulSoup(metadata.text, 'xml')

        # Now search for tags which may be useful as metadata
        # example: getting the name and definition of metadata set
        # (may need tweaking depending on SPC's metadata setup)

        # We can get name from the metadata structure
        set = parsed.find('MetadataSet')
        pkg_dict['name'] = set.find('Name').text

        # Then we can go to the reported attribute structure for more details
        detail = set.find('ReportedAttribute', attrs={'id': 'DEF'})
        pkg_dict['notes'] = detail.find('StructuredText', attrs={'lang': 'en'})
        source_details = set.find('ReportedAttribute', attrs={'id': 'SOURCE_DEF'})
        pkg_dict['source'] = source_details.find('StructuredText', attrs={'lang': 'en'})
        '''

        log.debug('package dict: %s' % pkg_dict)

        content_hash = str(_hashify(pkg_dict))
        harvest_object.extras = [
            HarvestObjectExtra(key='content_hash', value=content_hash)
        ]
        harvest_object.save()

        prev_object = model.Session.query(HarvestObject).filter(
            HarvestObject.source == harvest_object.source,
            HarvestObject.guid == harvest_object.guid,
            ~HarvestObject.import_finished.is_(None)).order_by(
                HarvestObject.import_finished.desc()).first()
        obj_hash = self._get_object_extra(prev_object, 'content_hash')

        if obj_hash and obj_hash == content_hash:
            log.debug('Content is not changed. Skip..')
            return True

        # Create or update the package
        return self._create_or_update_package(
            pkg_dict, harvest_object, package_dict_form='package_show')

    except Exception as e:
        self._save_object_error(
            'Exception in import stage: %r / %s' % (e, traceback.format_exc()),
            harvest_object)
        return False
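The _hashify() helper referenced above is not part of this excerpt; a purely hypothetical sketch of what such a helper could look like, a stable digest of the package dict so that unchanged content can be skipped on the next run:

import hashlib
import json

def _hashify(pkg_dict):
    # Hypothetical helper: serialise deterministically, then hash.
    serialised = json.dumps(pkg_dict, sort_keys=True, default=str)
    return hashlib.sha1(serialised.encode('utf-8')).hexdigest()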