def import_stage(self, harvest_object):
    """Create or update the CKAN dataset described by ``harvest_object``.

    The dataset metadata is read as JSON from ``harvest_object.content``.
    If ``package_show`` finds a package whose id equals
    ``harvest_object.guid``, that package is updated; otherwise a new
    package is created. Afterwards every HarvestObject linked to the
    package is flagged as not current and ``harvest_object`` is flagged as
    the current one.

    :param harvest_object: object providing ``content`` (a JSON string with
        at least ``title`` and ``identifier``), ``guid`` and ``source.url``.
    :returns: ``True`` once the package and harvest objects are saved.
    :raises: whatever ``package_create`` raises (logged, then re-raised);
        errors from ``package_show`` other than ``NotFound`` and errors
        from ``package_update`` propagate unchanged.
    """
    log.debug('In datajson import_stage')

    # Get the metadata that we stored in the HarvestObject's content field.
    dataset = json.loads(harvest_object.content)

    # Assemble basic information about the dataset.
    pkg = {
        "name": self.make_package_name(dataset["title"], harvest_object.guid),
        "extras": [{
            "key": "source_datajson_url",
            "value": harvest_object.source.url,
        }, {
            "key": "source_datajson_identifier",
            "value": dataset["identifier"],
        }]
    }
    from parse_datajson import parse_datajson_entry
    parse_datajson_entry(dataset, pkg)

    # Try to update an existing package with the ID set in
    # harvest_object.guid. If that GUID corresponds with an existing
    # package, get its current metadata.
    try:
        existing_pkg = get_action('package_show')(
            self.context(), {"id": harvest_object.guid})
    except NotFound:
        existing_pkg = None

    if existing_pkg:
        # Update the existing metadata with the new information. Before
        # doing that, avoid replacing existing resources with new ones by
        # assigning resource IDs where the URLs match up.
        #
        # Index the existing resources by URL once instead of rescanning
        # them for every new resource. Later entries overwrite earlier
        # ones, so when several existing resources share a URL the last
        # one wins, exactly as the nested scan behaved.
        ids_by_url = {}
        for existing_res in existing_pkg.get("resources", []):
            if "url" in existing_res and "id" in existing_res:
                ids_by_url[existing_res["url"]] = existing_res["id"]

        for res in pkg.get("resources", []):
            # Resources without a URL cannot be matched; leave them as new.
            if "url" in res and res["url"] in ids_by_url:
                res["id"] = ids_by_url[res["url"]]

        # Preserve other fields that we're not setting, but clobber extras.
        existing_pkg.update(pkg)
        pkg = existing_pkg

        log.warning('updating package %s (%s) from %s',
                    pkg["name"], pkg["id"], harvest_object.source.url)
        pkg = get_action('package_update')(self.context(), pkg)
    else:
        # It doesn't exist yet. Create a new one.
        try:
            pkg = get_action('package_create')(self.context(), pkg)
        except Exception:
            # pkg is still the dict we assembled above, so "name" is safe.
            log.error('failed to create package %s from %s',
                      pkg["name"], harvest_object.source.url)
            raise
        log.warning('created package %s (%s) from %s',
                    pkg["name"], pkg["id"], harvest_object.source.url)

    # Flag the other HarvestObjects linking to this package as not current
    # anymore.
    for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
        ob.current = False
        ob.save()

    # Flag this HarvestObject as the current harvest object.
    harvest_object.package_id = pkg['id']
    harvest_object.current = True
    harvest_object.save()

    return True
def import_stage(self, harvest_object):
    """Create or update the CKAN dataset described by ``harvest_object``.

    The dataset metadata is read as JSON from ``harvest_object.content``.
    If ``package_show`` finds a package whose id equals
    ``harvest_object.guid``, that package is updated; otherwise a new
    package is created. Afterwards every HarvestObject linked to the
    package is flagged as not current and ``harvest_object`` is flagged as
    the current one.

    :param harvest_object: object providing ``content`` (a JSON string with
        at least ``title`` and ``identifier``), ``guid`` and ``source.url``.
    :returns: ``True`` once the package and harvest objects are saved.
    :raises: whatever ``package_create`` raises (logged, then re-raised);
        errors from ``package_show`` other than ``NotFound`` and errors
        from ``package_update`` propagate unchanged.
    """
    log.debug('In datajson import_stage')

    # Get the metadata that we stored in the HarvestObject's content field.
    dataset = json.loads(harvest_object.content)

    # Assemble basic information about the dataset.
    pkg = {
        "name": self.make_package_name(dataset["title"], harvest_object.guid),
        "extras": [{
            "key": "source_datajson_url",
            "value": harvest_object.source.url,
        }, {
            "key": "source_datajson_identifier",
            "value": dataset["identifier"],
        }]
    }
    from parse_datajson import parse_datajson_entry
    parse_datajson_entry(dataset, pkg)

    # Try to update an existing package with the ID set in
    # harvest_object.guid. If that GUID corresponds with an existing
    # package, get its current metadata.
    try:
        existing_pkg = get_action('package_show')(
            self.context(), {"id": harvest_object.guid})
    except NotFound:
        existing_pkg = None

    if existing_pkg:
        # Update the existing metadata with the new information. Before
        # doing that, avoid replacing existing resources with new ones by
        # assigning resource IDs where the URLs match up.
        #
        # A single URL -> id index replaces the per-resource rescan of the
        # existing list. Because later entries overwrite earlier ones, the
        # last existing resource with a given URL wins, which is what the
        # nested scan produced as well.
        ids_by_url = {}
        for existing_res in existing_pkg.get("resources", []):
            if "url" in existing_res and "id" in existing_res:
                ids_by_url[existing_res["url"]] = existing_res["id"]

        for res in pkg.get("resources", []):
            # Resources without a URL cannot be matched; leave them as new.
            if "url" in res and res["url"] in ids_by_url:
                res["id"] = ids_by_url[res["url"]]

        # Preserve other fields that we're not setting, but clobber extras.
        existing_pkg.update(pkg)
        pkg = existing_pkg

        log.warning('updating package %s (%s) from %s',
                    pkg["name"], pkg["id"], harvest_object.source.url)
        pkg = get_action('package_update')(self.context(), pkg)
    else:
        # It doesn't exist yet. Create a new one.
        try:
            pkg = get_action('package_create')(self.context(), pkg)
        except Exception:
            # pkg is still the dict we assembled above, so "name" is safe.
            log.error('failed to create package %s from %s',
                      pkg["name"], harvest_object.source.url)
            raise
        log.warning('created package %s (%s) from %s',
                    pkg["name"], pkg["id"], harvest_object.source.url)

    # Flag the other HarvestObjects linking to this package as not current
    # anymore.
    for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
        ob.current = False
        ob.save()

    # Flag this HarvestObject as the current harvest object.
    harvest_object.package_id = pkg['id']
    harvest_object.current = True
    harvest_object.save()

    return True
def set_dataset_info(self, pkg, dataset, dataset_defaults):
    """Hand a dataset entry, its package dict and defaults to the parser.

    Delegates to ``parse_datajson.parse_datajson_entry``; the parser's
    return value is discarded, so any result is presumably written into
    ``pkg`` in place (NOTE(review): confirm against parse_datajson).

    :param pkg: package dict passed to the parser as its second argument.
    :param dataset: dataset entry passed to the parser as its first argument.
    :param dataset_defaults: passed through unchanged as the third argument.
    """
    # Function-scope import: the parser module is only loaded when this
    # method actually runs.
    from parse_datajson import parse_datajson_entry as _parse_entry

    # Note the argument order: the parser takes the dataset before the package.
    _parse_entry(dataset, pkg, dataset_defaults)
def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version):
    """Hand a dataset entry, its package dict, defaults and schema version to the parser.

    Delegates to ``parse_datajson_entry``; the parser's return value is
    discarded, so any result is presumably written into ``pkg`` in place
    (NOTE(review): confirm against the parser's implementation).

    :param pkg: package dict passed to the parser as its second argument.
    :param dataset: dataset entry passed to the parser as its first argument.
    :param dataset_defaults: passed through unchanged as the third argument.
    :param schema_version: passed through unchanged as the fourth argument.
    """
    # Note the argument order: the parser takes the dataset before the package.
    parse_datajson_entry(dataset, pkg, dataset_defaults, schema_version)