def metadata_original_portal(self, dataset):
    '''metadata_original_portal -> contributorID'''
    orig_field = ds_utils.get_extras_field(dataset,
                                           u'metadata_original_portal')
    target_field = ds_utils.get_extras_field(dataset,
                                             EXTRA_KEY_HARVESTED_PORTAL)

    if orig_field:
        util.rename_extras_field_migration(dataset,
                                           u'metadata_original_portal',
                                           u'contributorID', True, False)
        if target_field is None:
            ds_utils.insert_new_extras_field(dataset,
                                             EXTRA_KEY_HARVESTED_PORTAL,
                                             orig_field['value'], False)
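
# A minimal, self-contained sketch of the intended extras transformation.
# Hypothetical values throughout; the inline helper only mimics what the
# ds_utils/util calls above are assumed to do on CKAN's usual extras layout
# of {'key': ..., 'value': ...} dicts, and EXTRA_KEY_HARVESTED_PORTAL is
# assumed to resolve to u'metadata_harvested_portal'.
def _demo_metadata_original_portal(dataset):
    extras = dataset['extras']
    orig = next((e for e in extras
                 if e['key'] == u'metadata_original_portal'), None)
    if orig is not None:
        orig['key'] = u'contributorID'  # rename, keep the value
        # keep a copy of the old portal URI under the harvested-portal key
        extras.append({'key': u'metadata_harvested_portal',
                       'value': orig['value']})

_demo_dataset = {'extras': [{'key': u'metadata_original_portal',
                             'value': u'http://example.org/portal'}]}
_demo_metadata_original_portal(_demo_dataset)
assert {'key': u'metadata_harvested_portal',
        'value': u'http://example.org/portal'} in _demo_dataset['extras']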
def contacts_role_veroeffentlichende_stelle(self, dataset):
    '''contacts.role.veroeffentlichende_stelle -> extras.publisher'''
    fields = util.get_extras_contacts_data(dataset,
                                           'veroeffentlichende_stelle')
    target_field = ds_utils.get_extras_field(dataset,
                                             u'publisher_contacttype')

    # only add if the field hasn't been migrated before (check for added field)
    if target_field is None:
        if fields is not None:
            ds_utils.insert_new_extras_field(dataset, u'publisher_name',
                                             fields.pop('name', ''), False)
            ds_utils.insert_new_extras_field(dataset, u'publisher_email',
                                             fields.pop('email', ''), False)
            ds_utils.insert_new_extras_field(dataset, u'publisher_url',
                                             fields.pop('url', ''), False)
            util.update_extras_contacts_data(dataset,
                                             'veroeffentlichende_stelle',
                                             fields)

            # Additional field
            ds_utils.insert_new_extras_field(dataset,
                                             u'publisher_contacttype',
                                             u'Organization', False)

            util.move_extras_contacts_address(dataset,
                                              'veroeffentlichende_stelle',
                                              'publisher', fields)
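
# A self-contained sketch of the contacts-to-publisher mapping. The data is
# hypothetical and assumes the 'contacts' extras entry holds a JSON list of
# dicts keyed by 'role', as the helpers above imply. The analogous
# 'ansprechpartner' -> maintainer migration below follows the same pattern.
import json

_demo_contacts = json.dumps([{u'role': u'veroeffentlichende_stelle',
                              u'name': u'Example Dept.',
                              u'email': u'dept@example.org',
                              u'url': u'http://example.org'}])
_entry = next(e for e in json.loads(_demo_contacts)
              if e[u'role'] == u'veroeffentlichende_stelle')
_publisher_extras = [
    {'key': u'publisher_name', 'value': _entry.pop(u'name', u'')},
    {'key': u'publisher_email', 'value': _entry.pop(u'email', u'')},
    {'key': u'publisher_url', 'value': _entry.pop(u'url', u'')},
    {'key': u'publisher_contacttype', 'value': u'Organization'},
]
# only the checked role pair remains in the contact entry afterwards
assert _entry == {u'role': u'veroeffentlichende_stelle'}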
def update_extras_json_list_data(dataset, extras_field, check_key,
                                 expected_val, content):
    '''Updates an extras value with key extras_field. This field is expected
    to contain a list of dicts as JSON string. The method updates the content
    of the list entry having check_key: expected_val. If the given content is
    nonempty and contains more key-value pairs than the checked pair, the
    data is updated. Otherwise, the dict is dropped from the list. If the
    whole list becomes empty, the extras field is dropped.'''
    fld_content = ds_utils.get_extras_field(dataset, extras_field)

    if fld_content is not None:
        fld_list = json.loads(fld_content['value'], encoding='utf-8')
        for index, entry in enumerate(fld_list):
            if entry.get(check_key) == expected_val:
                # update only if there are additional values given,
                # otherwise drop the entry
                if content and content != {check_key: expected_val}:
                    fld_list[index] = content
                else:
                    del fld_list[index]
                # stop after the first match: deleting while continuing to
                # iterate would silently skip the following entry
                break

        if fld_list:
            fld_content['value'] = unicode(json.dumps(fld_list,
                                                      sort_keys=True))
        else:
            # drop the field if its list became empty
            ds_utils.delete_extras_field(dataset, extras_field)
    else:
        log_warn(dataset, 'Could not update data, no field "' +
                 extras_field + '" in extras')
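
# A quick, runnable illustration of the update-or-drop rule above
# (hypothetical data): an entry that would be reduced to the checked pair
# alone is removed, and an emptied list means the extras field is deleted.
import json

_fld_list = json.loads(u'[{"role": "ansprechpartner", "name": "Jane Doe"}]')
_content = {u'role': u'ansprechpartner'}  # no additional values left

if _content and _content != {u'role': u'ansprechpartner'}:
    _fld_list[0] = _content  # still carries data -> update in place
else:
    del _fld_list[0]         # reduced to the check pair -> drop the entry

assert _fld_list == []       # empty list -> extras field would be deleted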
def contacts_role_ansprechpartner(self, dataset):
    '''contacts.role.ansprechpartner -> extras.maintainer'''
    fields = util.get_extras_contacts_data(dataset, 'ansprechpartner')
    target_field = ds_utils.get_extras_field(dataset,
                                             u'maintainer_contacttype')

    # only add if the field hasn't been migrated before (check for added field)
    if target_field is None:
        if fields is not None:
            if fields.get('name') and fields.get('email'):
                dataset['maintainer'] = fields.pop('name', '')
                dataset['maintainer_email'] = fields.pop('email', '')
            ds_utils.insert_new_extras_field(dataset, u'maintainer_url',
                                             fields.pop('url', ''), False)
            util.update_extras_contacts_data(dataset, 'ansprechpartner',
                                             fields)

            # Additional field
            ds_utils.insert_new_extras_field(
                dataset, u'maintainer_contacttype', u'Organization', False)

            util.move_extras_contacts_address(dataset, 'ansprechpartner',
                                              'maintainer', fields)
def move_extras_contacts_address(dataset, role, new_role, contact_data=None):
    # load the data if no preloaded dict is available
    if contact_data is None:
        contact_data = get_extras_contacts_data(dataset, role)

    if contact_data is not None:
        if 'address' in contact_data:
            parsed_addr = addr_parse(contact_data['address'])
            keys = ['addressee', 'details', 'street', 'zip', 'city',
                    'country']

            # first, check if any of the new fields is present. If yes, skip
            # the movement to avoid corrupt datasets
            for k in keys:
                if ds_utils.get_extras_field(dataset, new_role + '_' + k):
                    return

            for k in keys:
                if k in parsed_addr:
                    ds_utils.insert_new_extras_field(dataset,
                                                     new_role + '_' + k,
                                                     parsed_addr[k], False)

            addr_field_new = parsed_addr.get('unknown')
            if addr_field_new:
                contact_data['address'] = addr_field_new
                log_warn(dataset, u'The following address parts of role ' +
                         role + u' were not recognized: "' +
                         addr_field_new + u'"')
            else:
                del contact_data['address']

            update_extras_contacts_data(dataset, role, contact_data)
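
# A sketch of the address split this relies on. Hypothetical: addr_parse is
# assumed to return a dict with keys from the list above ('street', 'zip',
# 'city', ...) plus an 'unknown' entry for anything it could not classify.
_parsed_addr = {u'street': u'Musterstr. 1', u'zip': u'12345',
                u'city': u'Berlin', u'unknown': u'Building B'}

_new_extras = [{'key': u'maintainer_' + k, 'value': _parsed_addr[k]}
               for k in [u'addressee', u'details', u'street', u'zip',
                         u'city', u'country'] if k in _parsed_addr]
assert len(_new_extras) == 3

# the unrecognized rest stays behind in the contact's 'address' value
assert _parsed_addr.get(u'unknown') == u'Building B'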
def migrate_dates_field(dataset, from_field, to_field):
    '''extras.dates.<<from_field>> -> extras.<<to_field>>'''
    extras_dates = get_extras_dates_data(dataset, from_field)
    target_field = ds_utils.get_extras_field(dataset, to_field)

    if target_field is None and extras_dates:
        ds_utils.insert_new_extras_field(dataset, to_field,
                                         extras_dates.pop('date', ''),
                                         False)
        update_extras_dates_data(dataset, from_field, extras_dates)
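
# A self-contained sketch of the move. The role and field names here are
# hypothetical examples; the real helpers are assumed to read and rewrite a
# JSON 'dates' extras entry shaped like [{"role": ..., "date": ...}, ...].
import json

_dates = json.loads(u'[{"role": "veroeffentlicht", "date": "2017-01-01"}]')
_date_entry = next(e for e in _dates if e[u'role'] == u'veroeffentlicht')
_issued = {'key': u'issued', 'value': _date_entry.pop(u'date', u'')}

assert _issued['value'] == u'2017-01-01'
# only {"role": ...} is left, so the dates entry would be dropped afterwards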
def geographical_granularity(self, dataset):
    '''geographical_granularity -> politicalGeocodingLevelURI'''
    valid_values = {
        'bund': 'federal',
        'land': 'state',
        'kommune': 'municipality',
        'stadt': 'municipality',
        # DCAT values (without URI part) stay the same
        'federal': 'federal',
        'state': 'state',
        'municipality': 'municipality',
        # Additional non-OGD value
        'kreis': 'administrativeDistrict'
    }

    geo_level = ds_utils.get_extras_field(dataset,
                                          'geographical_granularity')
    target_field = ds_utils.get_extras_field(dataset,
                                             u'politicalGeocodingLevelURI')

    # only add if the field hasn't been migrated before
    if target_field is None:
        if geo_level is not None:
            geo_level_value = geo_level['value'].lower()
            if geo_level_value in valid_values:
                geo_level_value = (
                    'http://dcat-ap.de/def/politicalGeocoding/Level/' +
                    valid_values.get(geo_level_value))
            else:
                util.log_error(dataset,
                               'INVALID: politicalGeocodingLevelURI: ' +
                               geo_level_value)

            geo_level['value'] = geo_level_value
            util.rename_extras_field_migration(
                dataset, u'geographical_granularity',
                u'politicalGeocodingLevelURI', False)
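
# The mapping in action, using the table and URI prefix from the function
# above (input value is a hypothetical example):
_level = u'Bund'.lower()
_uri = u'http://dcat-ap.de/def/politicalGeocoding/Level/' + {
    'bund': 'federal', 'land': 'state', 'kommune': 'municipality',
    'stadt': 'municipality', 'kreis': 'administrativeDistrict',
    'federal': 'federal', 'state': 'state', 'municipality': 'municipality',
}[_level]
assert _uri == u'http://dcat-ap.de/def/politicalGeocoding/Level/federal'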
def get_extras_json_list_data(dataset, extras_field, check_key,
                              expected_val):
    '''Gets data from extras_field. The field is expected to contain a list
    of dicts as JSON string. This method returns the deserialized list entry
    having expected_val in check_key, or None if no such element exists.'''
    fld_content = ds_utils.get_extras_field(dataset, extras_field)

    if fld_content is not None:
        fld_list = json.loads(fld_content['value'], encoding='utf-8')
        for entry in fld_list:
            if entry.get(check_key) == expected_val:
                return entry

    return None
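
# A runnable sketch of the lookup semantics (hypothetical data; this mirrors
# what the function does once ds_utils has fetched the raw extras value):
import json

_raw = u'[{"role": "ansprechpartner", "name": "Jane"}, {"role": "other"}]'
_match = next((e for e in json.loads(_raw)
               if e.get(u'role') == u'ansprechpartner'), None)
assert _match == {u'role': u'ansprechpartner', u'name': u'Jane'}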
def languages(self, dataset):
    '''convert ISO 639-1 language codes to DCAT-AP conformant URIs
    (containing ISO 639-3 codes)'''
    field_name = u'language'

    # dataset
    language_field = ds_utils.get_extras_field(dataset, field_name)
    if language_field:
        util.update_language_in(dataset, language_field, 'value',
                                'language')

    # resources
    if 'resources' in dataset and dataset['resources']:
        for resource in dataset['resources']:
            if resource.get(field_name):
                util.update_language_in(dataset, resource, field_name,
                                        'Resource->language')
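
# A sketch of the conversion update_language_in is assumed to perform. The
# URI scheme follows the EU Publications Office language authority used by
# DCAT-AP; the mapping table here is a hypothetical two-entry excerpt.
_ISO_639_1_TO_3 = {u'de': u'DEU', u'en': u'ENG'}

_code = u'de'
_lang_uri = (u'http://publications.europa.eu/resource/authority/language/' +
             _ISO_639_1_TO_3[_code])
assert _lang_uri.endswith(u'/DEU')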
def terms_of_use_attribution_text(self, dataset):
    '''Add attribution text to every resource.

    Handles: dataset['extras']['terms_of_use']:
        "{\"attribution_text\": \"bla\"}"
    '''
    fieldname = u'terms_of_use'
    resources = dataset['resources']
    terms_of_use = ds_utils.get_extras_field(dataset, fieldname)

    if terms_of_use is not None:
        text = json.loads(
            terms_of_use.get('value')).get('attribution_text')
        if text and resources:
            for resource in resources:
                if '__extras' not in resource:
                    resource['__extras'] = dict()
                resource['__extras'][u'licenseAttributionByText'] = text
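
# A self-contained sketch with hypothetical data: the attribution text from
# the terms_of_use JSON ends up on every resource's __extras dict.
import json

_terms_value = u'{"attribution_text": "Example Office"}'
_resources = [{u'url': u'http://example.org/data.csv'}]

_text = json.loads(_terms_value).get(u'attribution_text')
for _resource in _resources:
    _resource.setdefault(u'__extras', {})[u'licenseAttributionByText'] = _text

assert _resources[0][u'__extras'][u'licenseAttributionByText'] == \
    u'Example Office'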
def spatial_reference_text(self, dataset):
    '''spatial_reference.text -> extras.geocodingText'''
    spatial_reference = ds_utils.get_extras_field(dataset,
                                                  'spatial_reference')

    if spatial_reference is not None:
        # convert the JSON string representation into an actual dictionary
        sr_value_dict = json.loads(spatial_reference['value'],
                                   encoding='utf-8')

        field = sr_value_dict.get('text')
        if field is not None:
            ds_utils.insert_new_extras_field(dataset, u'geocodingText',
                                             field, True)
            sr_value_dict.pop('text', None)
            spatial_reference['value'] = unicode(
                json.dumps(sr_value_dict, sort_keys=True))
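
# A sketch of the value rewrite with hypothetical data: 'text' moves into
# its own geocodingText extras field and is removed from the JSON blob.
import json

_sr_value_dict = json.loads(u'{"text": "Berlin", "nuts": "DE3"}')

_geocoding_text = _sr_value_dict.pop(u'text', None)
_remaining = json.dumps(_sr_value_dict, sort_keys=True)

assert _geocoding_text == u'Berlin'
assert _remaining == '{"nuts": "DE3"}'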
def migrate_adms_identifier(self):
    util.get_migrator_log().info(
        'Migrating adms:identifier to dct:identifier' +
        (' [dry run without saving]' if self.dry_run else ''))

    for dataset in self.iterate_adms_id_datasets():
        # only migrate if dct:identifier is not already present
        if not dataset_utils.get_extras_field(dataset,
                                              EXTRA_KEY_DCT_IDENTIFIER):
            util.rename_extras_field_migration(dataset,
                                               EXTRA_KEY_ADMS_IDENTIFIER,
                                               EXTRA_KEY_DCT_IDENTIFIER,
                                               False)
            self.update_dataset(dataset)
        else:
            util.get_migrator_log().info(
                '%sSkipping package as it already has a dct:identifier',
                util.log_dataset_prefix(dataset))

    util.get_migrator_log().info(
        'Finished migration of adms:identifier to dct:identifier' +
        (' [dry run without saving]' if self.dry_run else ''))
def migrate_contributor_identifier(self):
    ''' Add govdata-contributor-IDs to datasets that are missing one '''
    util.get_migrator_log().info(
        'Migrating dcatde:contributorID' +
        (' [dry run without saving]' if self.dry_run else ''))

    starttime = time.time()
    package_obj_to_update = gather_dataset_ids()
    endtime = time.time()
    print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \
        (len(package_obj_to_update), str(endtime - starttime))

    organization_list = tk.get_action('organization_list')(
        self.create_context(), {'all_fields': True, 'include_extras': True})
    updated_count = created_count = 0

    starttime = time.time()
    for dataset in self.iterate_datasets(package_obj_to_update.keys()):
        print u'Updating dataset: {}'.format(dataset['title'])

        dataset_org_id = dataset['organization']['id']
        dataset_org = next((item for item in organization_list
                            if item['id'] == dataset_org_id), None)
        if not dataset_org:
            print u'Did not find an organization for ID: ' + dataset_org_id
            continue

        org_contributor_field = get_extras_field(dataset_org,
                                                 EXTRA_KEY_CONTRIBUTOR_ID)
        if not org_contributor_field:
            print u'Did not find a contributor ID for organization: ' + \
                dataset_org_id
            continue

        try:
            org_contributor_id_list = json.loads(
                org_contributor_field['value'])
        except ValueError:
            # json.loads failed -> value is not an array but a single string
            org_contributor_id_list = [org_contributor_field['value']]

        dataset_contributor_field = get_extras_field(
            dataset, EXTRA_KEY_CONTRIBUTOR_ID)
        requires_update = False

        if not dataset_contributor_field:
            # contributor-ID field does not exist yet
            set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                             json.dumps(org_contributor_id_list))
            created_count = created_count + 1
            requires_update = True
        else:
            try:
                current_ids_list = json.loads(
                    dataset_contributor_field['value'])
            except ValueError:
                # json.loads failed -> value is not an array but a single string
                current_ids_list = [dataset_contributor_field['value']]

            for contributor_id in org_contributor_id_list:
                if contributor_id not in current_ids_list:
                    current_ids_list.append(contributor_id)
                    requires_update = True

            if requires_update:
                updated_count = updated_count + 1
                set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                 json.dumps(current_ids_list))

        if requires_update:
            self.update_dataset(dataset)

    endtime = time.time()
    print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \
        created_count
    print "INFO: %s datasets were updated. Total time: %s." % \
        (updated_count, str(endtime - starttime))

    util.get_migrator_log().info(
        'Finished migration of dcatde:contributorID' +
        (' [dry run without saving]' if self.dry_run else ''))
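
# A runnable sketch of the merge rule above (the contributor IDs are
# hypothetical examples): organization-level IDs are appended to the
# dataset's JSON list only when they are not already present.
import json

_org_ids = json.loads(u'["http://dcat-ap.de/def/contributors/exampleOrg"]')
_dataset_ids = [u'http://dcat-ap.de/def/contributors/exampleOrg',
                u'http://dcat-ap.de/def/contributors/otherOrg']

_requires_update = False
for _contributor_id in _org_ids:
    if _contributor_id not in _dataset_ids:
        _dataset_ids.append(_contributor_id)
        _requires_update = True

assert _requires_update is False  # already present -> no update needed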