def amend_package(self, package):
    '''Normalize a harvested Berlin dataset in place.

    Returns False when the dataset belongs to a non-public sector and
    must be skipped, True otherwise.
    '''
    extras = package['extras']

    # An empty license id maps to CKAN's explicit "not specified" value.
    if package['license_id'] == '':
        package['license_id'] = 'notspecified'

    # if sector is not set, set it to 'oeffentlich' (default)
    if not extras.get('sector'):
        extras['sector'] = 'oeffentlich'

    # Only public-sector ('oeffentlich') datasets are imported.
    if extras['sector'] != 'oeffentlich':
        return False

    # Coerce missing or unknown dataset types to 'datensatz'.
    if package.get('type') not in ('datensatz', 'dokument', 'app'):
        package['type'] = 'datensatz'

    package['groups'] = translate_groups(package['groups'], 'berlin')

    # Fall back to the Datenregister portal when no origin is recorded.
    if not extras.get('metadata_original_portal'):
        extras['metadata_original_portal'] = 'http://datenregister.berlin.de'

    for resource in package['resources']:
        resource['format'] = resource['format'].lower()

    return True
def amend_package(self, package):
    '''Amend the Berlin package data.

    Returns False when the dataset must be skipped (non-public sector),
    True otherwise.
    '''
    GovDataHarvester.amend_package(self, package)

    # An absent or empty license id becomes 'notspecified'.
    if package.get('license_id', '') == '':
        package['license_id'] = 'notspecified'

    extras = Extras(package['extras'])

    # if sector is not set, set it to 'oeffentlich' (default)
    if not extras.key('sector', disallow_empty=True):
        extras.update('sector', 'oeffentlich', True)

    if extras.value('sector') != 'oeffentlich':
        return False

    # avoid ValidationError when extra dict
    # key 'type' is also used by the internal CKAN validation,
    # see GOVDATA-651
    if extras.key('type'):
        extras.remove('type')

    package['extras'] = extras.get()

    # Coerce missing or unknown dataset types to 'datensatz'.
    if package.get('type') not in ('datensatz', 'dokument', 'app'):
        package['type'] = 'datensatz'

    package['groups'] = translate_groups(package['groups'], 'berlin')
    return True
def test_unmapable_group_flat_list(self):
    '''Unknown group names in a flat list translate to an empty list.'''
    translate_result = translate_groups(
        ['test-unmapable-1', 'test-unmapable-2'],
        'hamburg'
    )
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual.
    self.assertEqual(translate_result, [])
def amend_package(self, package):
    '''This function fixes some differences in the datasets retrieved
    from Bremen and our schema such as:
    - fix groups
    - set metadata_original_portal
    - fix terms_of_use
    - copy veroeffentlichende_stelle to maintainer
    - set spatial text
    '''
    package['id'] = self.generate_id_from_name(package['name'])
    GovDataHarvester.amend_package(self, package)

    # set correct groups
    if not package['groups']:
        package['groups'] = []

    LOGGER.debug(
        'groups before translate: {groups}'.format(
            groups=json.dumps(package['groups'])
        )
    )
    package['groups'] = translate_groups(package['groups'], 'bremen')
    LOGGER.debug(
        'groups after translate: {groups}'.format(
            groups=json.dumps(package['groups'])
        )
    )

    # copy veroeffentlichende_stelle to maintainer
    extras = Extras(package['extras'])
    if extras.key('contacts'):
        contacts_dict = json.loads(extras.value('contacts'))
        # List comprehension instead of filter(): on Python 3 filter()
        # returns a lazy iterator that is always truthy and cannot be
        # indexed, so the 'if quelle:' guard and quelle[0] would break.
        quelle = [
            contact for contact in contacts_dict
            if contact['role'] == 'veroeffentlichende_stelle'
        ]
        if quelle:
            package['maintainer'] = quelle[0]['name']
            package['maintainer_email'] = quelle[0]['email']
        else:
            LOGGER.info('Unable to resolve maintainer details')

    # fix typos in terms of use
    package['license_id'] = u'notspecified'
    if extras.key('terms_of_use'):
        self.fix_terms_of_use(extras)
        terms_of_use_dict = json.loads(extras.value('terms_of_use'))
        package['license_id'] = terms_of_use_dict['license_id']

    if not extras.key('spatial-text'):
        extras.update('spatial-text', 'Bremen 04 0 11 000', True)

    package['extras'] = extras.get()
def amend_package(self, package):
    '''Amend a GovApps package: normalize groups and derive a stable id.'''
    # fix groups (the empty-groups guard was duplicated; once suffices)
    if not package['groups']:
        package['groups'] = []
    package['groups'] = [
        group for group in translate_groups(package['groups'], 'govapps')
        if len(group) > 0
    ]

    # generate id based on OID namespace and package name, this makes
    # sure that packages with the same name get the same id
    package['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID, str(package['name'])))
def amend_package(self, package):
    '''
    This function fixes some differences in the datasets retrieved from
    Bremen and our schema such as:
    - fix groups
    - set metadata_original_portal
    - fix terms_of_use
    - copy veroeffentlichende_stelle to maintainer
    - set spatial text
    '''
    # set metadata original portal
    package['extras']['metadata_original_portal'] = (
        'http://daten.bremen.de/sixcms/detail.php?template=export_daten_json_d'
    )

    # set correct groups
    if not package['groups']:
        package['groups'] = []
    package['groups'] = translate_groups(package['groups'], 'bremen')

    # copy veroeffentlichende_stelle to maintainer; guard against a
    # missing contact instead of raising IndexError on filter(...)[0]
    if 'contacts' in package['extras']:
        quellen = [
            contact for contact in package['extras']['contacts']
            if contact['role'] == 'veroeffentlichende_stelle'
        ]
        if quellen:
            package['maintainer'] = quellen[0]['name']
            package['maintainer_email'] = quellen[0]['email']

    # fix typos in terms of use
    if 'terms_of_use' in package['extras']:
        self.fix_terms_of_use(package['extras']['terms_of_use'])
        # copy license id
        package['license_id'] = package['extras']['terms_of_use']['license_id']
    else:
        package['license_id'] = u'notspecified'

    if 'spatial-text' not in package['extras']:
        package['extras']['spatial-text'] = 'Bremen 04 0 11 000'

    # generate id based on OID namespace and package name, this makes
    # sure that packages with the same name get the same id
    package['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID, str(package['name'])))

    # lowercase resource formats (this loop was duplicated verbatim in
    # the original; running it once is equivalent)
    for resource in package['resources']:
        resource['format'] = resource['format'].lower()
def amend_package(self, package):
    '''Amend a GovApps package: normalize groups, derive a stable id and
    lowercase resource formats.'''
    # fix groups (the empty-groups guard was duplicated; once suffices)
    if not package['groups']:
        package['groups'] = []
    package['groups'] = [
        group for group in translate_groups(package['groups'], 'govapps')
        if len(group) > 0
    ]

    # generate id based on OID namespace and package name, this makes
    # sure that packages with the same name get the same id
    package['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID, str(package['name'])))

    for resource in package['resources']:
        resource['format'] = resource['format'].lower()
def amend_package(self, package):
    """
    This function fixes some differences in the datasets retrieved from
    Bremen and our schema such as:
    - fix groups
    - set metadata_original_portal
    - fix terms_of_use
    - copy veroeffentlichende_stelle to maintainer
    - set spatial text
    """
    # set metadata original portal
    package['extras']['metadata_original_portal'] = (
        'http://daten.bremen.de/sixcms/detail.php?template=export_daten_json_d'
    )

    # set correct groups
    if not package['groups']:
        package['groups'] = []
    package['groups'] = translate_groups(package['groups'], 'bremen')

    # copy veroeffentlichende_stelle to maintainer; guard against a
    # missing contact instead of raising IndexError on filter(...)[0]
    if 'contacts' in package['extras']:
        quellen = [
            contact for contact in package['extras']['contacts']
            if contact['role'] == 'veroeffentlichende_stelle'
        ]
        if quellen:
            package['maintainer'] = quellen[0]['name']
            package['maintainer_email'] = quellen[0]['email']

    # fix typos in terms of use
    if 'terms_of_use' in package['extras']:
        self.fix_terms_of_use(package['extras']['terms_of_use'])
        # copy license id
        package['license_id'] = package['extras']['terms_of_use']['license_id']
    else:
        package['license_id'] = u'notspecified'

    if "spatial-text" not in package["extras"]:
        package["extras"]["spatial-text"] = 'Bremen 04 0 11 000'

    # generate id based on OID namespace and package name, this makes
    # sure, that packages with the same name get the same id
    package['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID, str(package['name'])))

    # lowercase resource formats (this loop was duplicated verbatim in
    # the original; running it once is equivalent)
    for resource in package['resources']:
        resource['format'] = resource['format'].lower()
def test_mapable_group_flat_list(self):
    '''Known group names in a flat list map to id/name group dicts.'''
    translate_result = translate_groups(
        ['bevolkerung', 'umwelt-und-klima', 'transport-und-verkehr'],
        'hamburg'
    )
    # assertIsInstance/len() instead of comparing type(...).__name__ and
    # calling __len__() directly; assertEqual instead of the deprecated
    # assertEquals alias (removed in Python 3.12).
    self.assertIsInstance(translate_result, list)
    self.assertEqual(len(translate_result), 3)
    self.assertEqual(
        translate_result,
        [{'id': u'bevoelkerung', 'name': u'bevoelkerung'},
         {'id': u'umwelt_klima', 'name': u'umwelt_klima'},
         {'id': u'transport_verkehr', 'name': u'transport_verkehr'}]
    )
def test_mapable_group_dict_list(self):
    '''Known group names given as CKAN group dicts map to id/name dicts.'''
    dict_list = [{
        "vocabulary_id": 1,
        "state": "active",
        "display_name": "offene-daten-k\u00f6ln",
        "id": "07767723-df63-44fa-8bb1-002cf932c2f6",
        "name": "bevolkerung"
    }, {
        "vocabulary_id": 2,
        "state": "active",
        "display_name": "offene-daten-k\u00f6ln",
        "id": "07767723-df63-44fa-8bb1-002cf932c2f6",
        "name": "umwelt-und-klima"
    }, {
        "vocabulary_id": 3,
        "state": "active",
        "display_name": "offene-daten-k\u00f6ln",
        "id": "07767723-df63-44fa-8bb1-002cf932c2f6",
        "name": "transport-und-verkehr"
    }]
    translate_result = translate_groups(dict_list, 'hamburg')
    # assertIsInstance/len() instead of comparing type(...).__name__ and
    # calling __len__() directly; assertEqual instead of the deprecated
    # assertEquals alias (removed in Python 3.12).
    self.assertIsInstance(translate_result, list)
    self.assertEqual(len(translate_result), 3)
    self.assertEqual(
        translate_result,
        [{'id': u'bevoelkerung', 'name': u'bevoelkerung'},
         {'id': u'umwelt_klima', 'name': u'umwelt_klima'},
         {'id': u'transport_verkehr', 'name': u'transport_verkehr'}]
    )
def test_unmapable_group_dict_list(self):
    '''Unknown group names given as CKAN group dicts translate to [].'''
    dict_list = [{
        "vocabulary_id": 1,
        "state": "active",
        "display_name": "offene-daten-k\u00f6ln",
        "id": "07767723-df63-44fa-8bb1-002cf932c2f6",
        "name": "Group 1"
    }, {
        "vocabulary_id": 2,
        "state": "active",
        "display_name": "offene-daten-k\u00f6ln",
        "id": "07767723-df63-44fa-8bb1-002cf932c2f6",
        "name": "Group 2"
    }]
    translate_result = translate_groups(dict_list, 'hamburg')
    # assertEqual instead of the deprecated assertEquals alias
    # (removed in Python 3.12).
    self.assertEqual(translate_result, [])
def amend_package(self, package):
    '''Amend a Hamburg package.

    Returns False when the dataset must be skipped (superseded version
    or a document without the 'govdata' tag), True otherwise.
    '''
    # check if latestVersion of package
    extras = package['extras']
    is_latest_version = extras.get('latestVersion', None)
    if is_latest_version == "true":
        log.debug('received latestVersion == true. Continue with this dataset')
        # get metadata_original_id
        # TODO subject to change in the future
        remote_metadata_original_id = extras.get('metadata_original_id', None)
        registry = ckanapi.RemoteCKAN('http://localhost:80/ckan')
        local_search_result = registry.action.package_search(
            q='metadata_original_id:"' + remote_metadata_original_id + '"'
        )
        if local_search_result['count'] == 0:
            log.debug('Did not find this metadata original id. Import accepted.')
        elif local_search_result['count'] == 1:
            log.debug('Found local dataset for particular metadata_original_id')
            local_dataset_from_action_api = local_search_result['results'][0]
            # copy name and id from local dataset to remote dataset so
            # the import updates the existing record instead of
            # creating a duplicate
            log.debug('Copy id and name to remote dataset')
            log.debug(package['id'])
            log.debug(package['name'])
            package['id'] = local_dataset_from_action_api['id']
            package['name'] = local_dataset_from_action_api['name']
            log.debug(package['id'])
            log.debug(package['name'])
        else:
            log.debug(
                'Found more than one local dataset for particular '
                'metadata_original_id. Offending metadata_original_id is:'
            )
            log.debug(remote_metadata_original_id)
    elif is_latest_version == 'false':
        # do not import or update this particular remote dataset
        log.debug('received latestVersion == false. Skip this dataset')
        return False

    # check if import is desired: the 'document' and 'dokument' branches
    # were byte-identical apart from the type assignment, so they are
    # merged; both require the 'govdata' tag and end up as 'dokument'
    if package['type'] in ('document', 'dokument'):
        if not [tag for tag in package['tags'] if tag.lower() == 'govdata']:
            log.debug('Found invalid package')
            return False
        package['type'] = 'dokument'
    elif package['type'] == 'dataset':
        package['type'] = 'datensatz'

    # fix groups
    log.debug("Before: ")
    log.debug(package['groups'])
    package['groups'] = translate_groups(package['groups'], 'hamburg')
    log.debug("After: ")
    log.debug(package['groups'])

    # set original portal
    if not extras.get('metadata_original_portal'):
        extras['metadata_original_portal'] = self.PORTAL

    assert_author_fields(
        package,
        package.get('maintainer'),
        package.get('maintainer_email')
    )
    return True
def amend_package(self, package):
    '''Amend the Hamburg package data.

    Returns False when the dataset must be skipped (superseded version
    or a document without the 'govdata' tag), True otherwise.
    '''
    GovDataHarvester.amend_package(self, package)
    context = self.build_context()
    extras = Extras(package['extras'])

    is_latest_version = None
    if extras.key('latestVersion'):
        is_latest_version = extras.value('latestVersion')

    if is_latest_version == 'true':
        LOGGER.debug(
            'received latestVersion == true. Continue with this dataset')
        remote_metadata_original_id = extras.value('metadata_original_id')

        # compare harvested OGD-Dataset with local DCAT-AP.de-Dataset
        data_dict = {"q": 'identifier:"' + remote_metadata_original_id + '"'}
        local_search_result = get_action("package_search")(context, data_dict)
        if local_search_result['count'] == 0:
            LOGGER.debug(
                'Did not find this metadata original id. Import accepted.')
        elif local_search_result['count'] == 1:
            LOGGER.debug(
                'Found local dataset for particular metadata_original_id')
            local_dataset_from_action_api = local_search_result['results'][0]
            # copy name and id from local dataset to remote dataset so
            # the import updates the existing record instead of
            # creating a duplicate
            LOGGER.debug('Copy id and name to remote dataset')
            LOGGER.debug(package['id'])
            LOGGER.debug(package['name'])
            package['id'] = local_dataset_from_action_api['id']
            package['name'] = local_dataset_from_action_api['name']
            LOGGER.debug(package['id'])
            LOGGER.debug(package['name'])
        else:
            # single literal instead of repeated reassignment
            log_message = (
                'Found more than one local dataset for '
                'particular metadata_original_id. '
                'Offending metadata_original_id '
                'is:'
            )
            LOGGER.debug(log_message)
            LOGGER.debug(remote_metadata_original_id)
    elif is_latest_version == 'false':
        # do not import or update this particular remote dataset
        LOGGER.debug('received latestVersion == false. Skip this dataset')
        return False

    # check if import is desired
    if package['type'] == 'document' or package['type'] == 'dokument':
        if not self.has_tag(package['tags'], 'govdata'):
            # fixed log message: the package is rejected because the
            # 'govdata' tag is MISSING, not because it is present
            LOGGER.debug("Found invalid package without 'govdata' tag")
            return False
        package['type'] = 'dokument'
    elif package['type'] == 'dataset':
        package['type'] = 'datensatz'

    # fix groups
    LOGGER.debug('Before: ')
    LOGGER.debug(package['groups'])
    package['groups'] = translate_groups(package['groups'], 'hamburg')
    LOGGER.debug('After: ')
    LOGGER.debug(package['groups'])

    self.assert_author_fields(
        package,
        package.get('maintainer'),
        package.get('maintainer_email')
    )
    return True