def _amend_package(self, harvest_object):
    '''Amend the harvested package before import.

    Parses the JSON content of the harvest object, records the
    configured source portal in the package extras and makes sure
    every resource carries a license: resources without one get the
    fallback license, and (if a license upgrade mapping is
    configured) deprecated license IDs are replaced by their current
    DCAT equivalents. The amended dict is serialized back into the
    harvest object's content.
    '''
    pkg_dict = json.loads(harvest_object.content)
    pkg_dict.setdefault('extras', [])

    portal = self._get_portal_from_config(harvest_object.source.config)
    set_extras_field(pkg_dict, EXTRA_KEY_HARVESTED_PORTAL, portal)

    # every resource needs a (recent) license
    for res in pkg_dict.get('resources', []):
        prefix = u'{0}: Resource {1} of package {2} (GUID {3})'.format(
            harvest_object.source.title, res.get('uri', ''),
            pkg_dict.get('name', ''), harvest_object.guid)

        if res.get(RES_EXTRA_KEY_LICENSE, '') == '':
            # missing license -> use the configured fallback value
            LOGGER.info(prefix + u' has no license. Adding default value.')
            res[RES_EXTRA_KEY_LICENSE] = self._get_fallback_license()
            continue

        if not self.licenses_upgrade:
            continue

        old_license = res.get(RES_EXTRA_KEY_LICENSE)
        upgraded_license = self.licenses_upgrade.get(old_license, '')
        if upgraded_license == '':
            # unknown to the upgrade mapping -> keep what we have
            LOGGER.info(prefix + u' has a deprecated or unknown license {0}. '
                        u'Keeping old value.'.format(old_license))
        elif old_license != upgraded_license:
            LOGGER.info(prefix + u' had old license {0}. '
                        u'Updated value to recent DCAT list.'.format(old_license))
            res[RES_EXTRA_KEY_LICENSE] = upgraded_license

    # persist the changes on the harvest object
    harvest_object.content = json.dumps(pkg_dict)
def parse_dataset(self, dataset_dict, dataset_ref):
    """ Transforms DCAT-AP.de-Data to CKAN-Dictionary.

    Reads DCAT-AP.de specific predicates for ``dataset_ref`` from the
    RDF graph ``self.g`` and writes them into ``dataset_dict``
    (mostly as extras fields), then returns the modified dict.
    Values from newer DCATDE namespace versions overwrite values
    from older ones because the namespaces are processed in order.
    """
    # Manage different versions of DCATDE namespaces first.
    # Ensure that they are ordered from oldest to newest version, such that older values get overwritten
    # in case of multiple definitions
    dcatde_versions = [DCATDE_1_0, DCATDE]

    # geocodingText and legalbasisText got renamed, so handle them separately
    # (old 1.0 predicate first, then the current one, so the newer value wins)
    for key, predicate, in (
            ('legalbasisText', DCATDE_1_0.legalbasisText),
            ('geocodingText', DCATDE_1_0.geocodingText),
            ('legalbasisText', DCATDE.legalBasis),
            ('geocodingText', DCATDE.geocodingDescription),
    ):
        values = self._object_value_list(dataset_ref, predicate)
        if values:
            ds_utils.set_extras_field(dataset_dict, key, json.dumps(values))

    # iterate over all namespaces to import as much as possible
    for dcatde_namespace in dcatde_versions:

        # Simple additional fields (single string values)
        for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI', dcatde_namespace.politicalGeocodingLevelURI),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                ds_utils.set_extras_field(dataset_dict, key, value)

        # List fields (stored JSON-encoded in the extras)
        for key, predicate, in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI', dcatde_namespace.politicalGeocodingURI),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                ds_utils.set_extras_field(dataset_dict, key, json.dumps(values))

        self._parse_contact(dataset_dict, dataset_ref, dcatde_namespace.originator,
                            'originator', True)
        self._parse_contact(dataset_dict, dataset_ref, dcatde_namespace.maintainer,
                            'maintainer', False)

        # Add additional distribution fields
        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            for resource_dict in dataset_dict.get('resources', []):
                # Match distribution in graph and distribution in ckan-dict
                if unicode(distribution) == resource_uri(resource_dict):
                    for key, predicate in (
                            ('licenseAttributionByText', dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability', dcatde_namespace.plannedAvailability)):
                        value = self._object_value(distribution, predicate)
                        if value:
                            ds_utils.insert_resource_extra(resource_dict, key, value)
    # -- end loop over dcatde namespaces --

    # additions in other namespaces than DCATDE
    self._parse_contact(dataset_dict, dataset_ref, DCT.contributor, 'contributor', True)
    self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author', False)

    # dcat:landingPage
    landing_page = self._object_value(dataset_ref, DCAT.landingPage)
    if landing_page:
        ds_utils.set_extras_field(dataset_dict, 'metadata_original_html', landing_page)

    # dcat:contactPoint -> mapped onto the CKAN maintainer fields
    # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
    contact = self._object(dataset_ref, DCAT.contactPoint)
    self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

    contact_tel = self._object_value(contact, VCARD.hasTelephone)
    if contact_tel:
        # _without_tel presumably strips a "tel:" URI prefix -- TODO confirm
        ds_utils.insert(dataset_dict, 'maintainer_tel', self._without_tel(contact_tel), True)

    self._add_maintainer_field(dataset_dict, contact, 'street', VCARD.hasStreetAddress)
    self._add_maintainer_field(dataset_dict, contact, 'city', VCARD.hasLocality)
    self._add_maintainer_field(dataset_dict, contact, 'zip', VCARD.hasPostalCode)
    self._add_maintainer_field(dataset_dict, contact, 'country', VCARD.hasCountryName)

    # Groups: derive CKAN groups from dcat:theme URIs that use the known prefix
    groups = self._get_dataset_value(dataset_dict, 'groups')

    if not groups:
        groups = []

    for obj in self.g.objects(dataset_ref, DCAT.theme):
        current_theme = unicode(obj)

        if current_theme.startswith(dcat_theme_prefix):
            group = current_theme.replace(dcat_theme_prefix, '').lower()
            groups.append({'id': group, 'name': group})

    dataset_dict['groups'] = groups

    return dataset_dict
def amend_package(self, package, portal):
    '''Write the harvested-portal name into the package's extras.

    Makes sure the ``extras`` list exists on the package dict before
    setting the portal field.
    '''
    package.setdefault('extras', [])
    set_extras_field(package, EXTRA_KEY_HARVESTED_PORTAL, portal)
def parse_dataset(self, dataset_dict, dataset_ref):
    """ Transforms DCAT-AP.de-Data to CKAN-Dictionary.

    Reads DCAT-AP.de specific predicates for ``dataset_ref`` from the
    RDF graph ``self.g`` and writes them into ``dataset_dict``
    (mostly as extras fields), then returns the modified dict.
    Values from newer DCATDE namespace versions overwrite values
    from older ones because the namespaces are processed in order.
    """
    # Different implementation of clean tags for keywords
    do_clean_tags = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))

    if do_clean_tags:
        cleaned_tags = [_munge_tag(tag) for tag in self._keywords(dataset_ref)]
        tags = [{'name': tag} for tag in cleaned_tags]
        dataset_dict['tags'] = tags

    # Manage different versions of DCATDE namespaces first.
    # Ensure that they are ordered from oldest to newest version, such that older values get overwritten
    # in case of multiple definitions
    dcatde_versions = [DCATDE_1_0, DCATDE_1_0_1, DCATDE]

    # iterate over all namespaces to import as much as possible
    for dcatde_namespace in dcatde_versions:

        # Simple additional fields (single string values)
        for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI', dcatde_namespace.politicalGeocodingLevelURI),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                ds_utils.set_extras_field(dataset_dict, key, value)

        # geocodingText and legalbasisText got renamed after 1.0, so assign the respective names
        legalbasisTextProperty = dcatde_namespace.legalBasis
        geocodingTextProperty = dcatde_namespace.geocodingDescription

        if (dcatde_namespace == DCATDE_1_0):
            legalbasisTextProperty = DCATDE_1_0.legalbasisText
            geocodingTextProperty = DCATDE_1_0.geocodingText

        # List fields (stored JSON-encoded in the extras)
        for key, predicate, in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI', dcatde_namespace.politicalGeocodingURI),
                ('legalbasisText', legalbasisTextProperty),
                ('geocodingText', geocodingTextProperty),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                ds_utils.set_extras_field(dataset_dict, key, json.dumps(values))

        self._parse_contact(dataset_dict, dataset_ref, dcatde_namespace.originator,
                            'originator', True)
        self._parse_contact(dataset_dict, dataset_ref, dcatde_namespace.maintainer,
                            'maintainer', False)

        # Add additional distribution fields
        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            for resource_dict in dataset_dict.get('resources', []):
                # Match distribution in graph and distribution in ckan-dict
                if unicode(distribution) == resource_dict.get('uri'):
                    for key, predicate in (
                            ('licenseAttributionByText', dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability', dcatde_namespace.plannedAvailability)):
                        value = self._object_value(distribution, predicate)
                        if value:
                            ds_utils.insert_resource_extra(resource_dict, key, value)
    # -- end loop over dcatde namespaces --

    # additions in other namespaces than DCATDE
    self._parse_contact(dataset_dict, dataset_ref, DCT.contributor, 'contributor', True)
    self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author', False)

    # Simple additional fields to DCAT-AP 1.1
    for key, predicate in (('metadata_original_html', DCAT.landingPage),
                           ('granularity', DCAT.granularity)):
        value = self._object_value(dataset_ref, predicate)
        if value:
            ds_utils.set_extras_field(dataset_dict, key, value)

    # dcat:contactPoint -> mapped onto the CKAN maintainer fields
    # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
    self._parse_contact_vcard(dataset_dict, dataset_ref, DCAT.contactPoint, 'maintainer')

    # Groups: derive CKAN groups from dcat:theme URIs that use the known prefix
    groups = self._get_dataset_value(dataset_dict, 'groups')

    if not groups:
        groups = []

    for obj in self.g.objects(dataset_ref, DCAT.theme):
        current_theme = unicode(obj)

        if current_theme.startswith(dcat_theme_prefix):
            group = current_theme.replace(dcat_theme_prefix, '').lower()
            groups.append({'id': group, 'name': group})

    dataset_dict['groups'] = groups

    return dataset_dict
def migrate_contributor_identifier(self):
    ''' Add govdata-contributor-IDs to datasets that are missing one.

    For every dataset found by ``gather_dataset_ids()``, looks up the
    owning organization's contributor-ID extra and copies any IDs the
    dataset does not already have into the dataset's contributor-ID
    extra. Datasets are only saved via ``self.update_dataset`` when
    something actually changed; ``self.dry_run`` is reflected in the
    log messages (saving behavior is handled by update_dataset --
    presumably a no-op in dry-run mode, verify there).
    '''
    util.get_migrator_log().info(
        'Migrating dcatde:contributorID' + (' [dry run without saving]' if self.dry_run else ''))

    starttime = time.time()
    # collection of dataset IDs to check -- assumed to be a dict keyed by ID (TODO confirm)
    package_obj_to_update = gather_dataset_ids()
    endtime = time.time()
    print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \
        (len(package_obj_to_update), str(endtime - starttime))

    # fetch all organizations once, including their extras, to avoid per-dataset lookups
    organization_list = tk.get_action('organization_list')(
        self.create_context(), {'all_fields': True, 'include_extras': True})

    updated_count = created_count = 0

    starttime = time.time()
    for dataset in self.iterate_datasets(package_obj_to_update.keys()):
        print u'Updating dataset: {}'.format(dataset['title'])

        dataset_org_id = dataset['organization']['id']
        dataset_org = next((item for item in organization_list
                            if item['id'] == dataset_org_id), None)

        if not dataset_org:
            print u'Did not find a Organization for ID: ' + dataset_org_id
            continue

        org_contributor_field = get_extras_field(dataset_org, EXTRA_KEY_CONTRIBUTOR_ID)

        if not org_contributor_field:
            print u'Did not find a contributor ID for Organization: ' + dataset_org_id
            continue

        try:
            org_contributor_id_list = json.loads(org_contributor_field['value'])
        except ValueError:
            # json.loads failed -> value is not an array but a single string
            org_contributor_id_list = [org_contributor_field['value']]

        dataset_contributor_field = get_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID)
        requires_update = False

        if not dataset_contributor_field:
            # Contributor-id field does not exist yet
            set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                             json.dumps(org_contributor_id_list))
            created_count = created_count + 1
            requires_update = True
        else:
            try:
                current_ids_list = json.loads(dataset_contributor_field['value'])
            except ValueError:
                # json.loads failed -> value is not an array but a single string
                current_ids_list = [dataset_contributor_field['value']]

            # merge the organization's IDs into the dataset's existing list
            for contributor_id in org_contributor_id_list:
                if contributor_id not in current_ids_list:
                    current_ids_list.append(contributor_id)
                    requires_update = True

            if requires_update:
                updated_count = updated_count + 1
                set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                 json.dumps(current_ids_list))

        if requires_update:
            self.update_dataset(dataset)

    endtime = time.time()
    print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \
        created_count
    print "INFO: %s datasets were updated. Total time: %s." % \
        (updated_count, str(endtime - starttime))

    util.get_migrator_log().info(
        'Finished migration of dcatde:contributorID' +
        (' [dry run without saving]' if self.dry_run else ''))