def test_returns_false_on_dict_list_with_empty_or_none_value(self):
    extras = Extras([
        {'key': 'bar', 'value': 'bar-value'},
        {'key': 'baz', 'value': 'baz-value'},
        {'key': 'foo', 'value': ''},
        {'key': 'fuz', 'value': ' '},
        {'key': 'muz', 'value': None},
    ])

    self.assertFalse(extras.key('foo', disallow_empty=True))
    self.assertFalse(extras.key('fuz', disallow_empty=True))
    self.assertFalse(extras.key('muz', disallow_empty=True))
def test_returns_false_on_dict_list(self):
    extras = Extras([
        {'key': 'bar', 'value': 'bar-value'},
        {'key': 'baz', 'value': 'baz-value'},
    ])

    self.assertFalse(extras.key('foo'))
def test_returns_true_on_dict_list(self):
    extras = Extras([
        {'key': 'foo', 'value': 'foo-value'},
        {'key': 'bar', 'value': 'bar-value'},
    ])

    self.assertTrue(extras.key('bar'))
def test_key_on_large_dict_list(self):
    extras_in = [{
        "key": "contacts",
        "value": "[{'url': 'www.open.nrw.de', 'role': 'vertrieb', 'name': 'Gesch\\u00e4ftsstelle Open.NRW', 'email': '*****@*****.**'}]"
    }, {
        "key": "dates",
        "value": "[{'date': '2016-06-08T12:31:11+02:00', 'role': 'erstellt'}, {'date': '2014-05-26T12:39:03+02:00', 'role': 'veroeffentlicht'}, {'date': '2016-06-08T12:31:11+02:00', 'role': 'aktualisiert'}]"
    }, {
        "key": "images",
        "value": "['https://open.nrw/profiles/nrw_ressort/themes/custom/nrw_base/images/grayish-blue/files/koeln_klein.png']"
    }, {
        "key": "metadata_original_portal",
        "value": "http://open.nrw/"
    }, {
        "key": "metadata_transformer",
        "value": ""
    }, {
        "key": "non_open",
        "value": "false"
    }, {
        "key": "opennrw_spatial",
        "value": "Stadt Köln"
    }, {
        "key": "original_groups",
        "value": "['Politik und Wahlen']"
    }, {
        "key": "spatial",
        "value": "{'type': 'polygon', 'coordinates': [[[6.7838099999999999, 50.825465999999999], [7.1533170000000004, 50.825465999999999], [7.1533170000000004, 51.090167999999998], [6.7838099999999999, 51.090167999999998], [6.7838099999999999, 50.825465999999999]]]}"
    }]

    extras = Extras(extras_in)

    for extra in extras_in:
        self.assertTrue(extras.key(extra['key']))
def handle_duplicates(cls, harvest_object_content):
    '''Compares the new dataset with existing ones and checks if the dataset should be imported.'''
    method_prefix = 'handle_duplicates: '
    context = cls.build_context()

    remote_dataset = json.loads(harvest_object_content)
    remote_dataset_extras = Extras(remote_dataset['extras'])
    remote_dataset_name = remote_dataset.get('name', '')

    has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
    if has_orig_id:
        orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
        if orig_id:
            try:
                data_dict = {"q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'}
                # Add filter that local dataset guid is not equal to the guid of the remote dataset
                if remote_dataset_extras.key('guid'):
                    data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                local_search_result = p.toolkit.get_action("package_search")(context, data_dict)
                if local_search_result['count'] == 0:
                    LOGGER.debug('%sDid not find any existing dataset in the database. '
                                 'Import accepted for %s.', method_prefix, remote_dataset_name)
                    return True
                elif local_search_result['count'] == 1:
                    LOGGER.debug('%sFound duplicate entry for dataset %s.', method_prefix,
                                 remote_dataset_name)
                    local_dataset = local_search_result['results'][0]
                    local_dataset_extras = Extras(local_dataset['extras'])
                    # TODO: If in doubt, use the CKAN field "metadata_modified" of the local
                    # dataset in case "modified" is not present?
                    if remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED) and \
                            local_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                        return cls.compare_metadata_modified(
                            remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                            local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED))
                    else:
                        LOGGER.info(
                            '%sFound duplicate entry with the value "%s" in field "identifier", but '
                            'remote and/or local dataset does not contain a modified date. '
                            '-> Skipping import for %s!', method_prefix, orig_id,
                            remote_dataset_name)
                else:
                    LOGGER.info('%sFound multiple duplicates with the value "%s" in field '
                                '"identifier". -> Skipping import for %s!', method_prefix,
                                orig_id, remote_dataset_name)
            except Exception as exception:
                LOGGER.error(exception)
        else:
            LOGGER.debug(
                '%sNo original id in field identifier found. Import accepted for %s.',
                method_prefix, remote_dataset_name)
            return True
    else:
        LOGGER.debug(
            '%sNo field identifier found. Import accepted for %s.',
            method_prefix, remote_dataset_name)
        return True
    return False
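# Both variants of handle_duplicates delegate the date check to compare_metadata_modified,
# which is not part of this excerpt. The sketch below is only an assumption of its contract,
# inferred from the call sites: the "modified" values are ISO 8601 strings, python-dateutil is
# available, the method lives as a staticmethod on HarvestUtils, and it returns True when the
# remote date is strictly newer than the local one.
def compare_metadata_modified(remote_modified, local_modified):
    import dateutil.parser

    # parse both ISO 8601 timestamps; mixing naive and timezone-aware values would raise here
    remote_dt = dateutil.parser.parse(remote_modified)
    local_dt = dateutil.parser.parse(local_modified)
    # accept the import only if the remote dataset is newer than the local one
    return remote_dt > local_dt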
def test_returns_false_on_flat_list_with_empty_or_none_value(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz', 'foo': '', 'fuz': None})

    self.assertFalse(extras.key('foo', disallow_empty=True))
    self.assertFalse(extras.key('fuz', disallow_empty=True))
def test_returns_false_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    self.assertFalse(extras.key('foo'))
def test_returns_true_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    self.assertTrue(extras.key('bar'))
def test_returns_false_on_empty_extras(self):
    extras = Extras([])

    self.assertFalse(extras.key('foo'))
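# The tests above exercise an Extras helper that wraps CKAN "extras" in both supported shapes:
# a list of {'key': ..., 'value': ...} dicts and a flat dict. The real class lives in the
# harvester codebase; the following is only a minimal sketch under that assumption, showing the
# key()/value() behaviour the tests rely on (with disallow_empty, None and blank strings count
# as missing).
class Extras(object):

    def __init__(self, extras):
        # normalize both supported shapes into a flat dict
        if isinstance(extras, dict):
            self._data = dict(extras)
        else:
            self._data = {item['key']: item['value'] for item in extras}

    def key(self, key, disallow_empty=False):
        # True if the key exists; with disallow_empty, also require a non-blank value
        if key not in self._data:
            return False
        if disallow_empty:
            value = self._data[key]
            return value is not None and str(value).strip() != ''
        return True

    def value(self, key, default=None):
        # return the stored value, or the default if the key is missing
        return self._data.get(key, default)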
def handle_duplicates(harvest_object_content):
    '''Compares the new dataset with existing ones and checks if the dataset should be imported.'''
    method_prefix = 'handle_duplicates: '
    context = HarvestUtils.build_context()

    remote_dataset = json.loads(harvest_object_content)
    remote_dataset_extras = Extras(remote_dataset['extras'])
    remote_dataset_name = remote_dataset.get('name', '')

    has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
    if has_orig_id:
        orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
        # remote dataset contains identifier
        if orig_id:
            try:
                data_dict = {"q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'}
                # Add filter that local dataset guid is not equal to the guid of the remote dataset
                if remote_dataset_extras.key('guid'):
                    data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                # search for other datasets with the same identifier
                local_search_result = p.toolkit.get_action("package_search")(context, data_dict)
                if local_search_result['count'] == 0:
                    # no other dataset with the same identifier was found, import accepted
                    LOGGER.debug(u'%sDid not find any existing dataset in the database with '
                                 u'Identifier %s. Import accepted for dataset %s.',
                                 method_prefix, orig_id, remote_dataset_name)
                    return True
                else:
                    # other datasets with the same identifier were found
                    LOGGER.debug(u'%sFound duplicate entries with Identifier %s for dataset %s.',
                                 method_prefix, orig_id, remote_dataset_name)
                    remote_is_latest = True
                    local_dataset_has_modified = False
                    latest_local_dataset = {}
                    if not remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                        remote_is_latest = False
                    # compare modified date with all local datasets
                    for local_dataset in local_search_result['results']:
                        local_dataset_extras = Extras(local_dataset['extras'])
                        if local_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                            local_dataset_has_modified = True
                            # notice the local dataset with the latest date
                            _set_or_update_latest_dataset(
                                latest_local_dataset,
                                local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                local_dataset['id'])
                            # compare dct:modified if remote and local dataset contain the field
                            # "modified" and remote dataset is still not detected as older
                            if remote_is_latest and remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                                remote_is_latest = HarvestUtils.compare_metadata_modified(
                                    remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                    local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED))

                    if remote_is_latest:
                        # Import accepted. Delete all local datasets with the same identifier.
                        LOGGER.debug(u'%sRemote dataset with Identifier %s is the latest. '
                                     u'Modified date: %s. Import accepted for dataset %s.',
                                     method_prefix, orig_id,
                                     remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                     remote_dataset_name)
                        packages_deleted = _delete_packages_keep(local_search_result['results'])
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
                        return True
                    elif local_dataset_has_modified:
                        # Skip import. Delete local datasets, but keep the dataset with the latest
                        # date in the field "modified".
                        LOGGER.info(u'%sRemote dataset with Identifier %s is NOT the latest. '
                                    u'Modified date: %s. Keep local dataset with '
                                    u'latest date in field "modified". Skipping import for dataset %s!',
                                    method_prefix, orig_id,
                                    remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED, 'n/a'),
                                    remote_dataset_name)
                        packages_deleted = _delete_packages_keep(
                            local_search_result['results'], latest_local_dataset)
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
                    else:
                        # Skip import, because neither the remote dataset nor any local dataset
                        # contains the field "modified". Delete local datasets, but keep the
                        # dataset last modified in the database.
                        LOGGER.info(
                            u'%sFound duplicate entries with the value "%s" in field "identifier", but '
                            u'remote and local datasets do not contain a modified date. '
                            u'Keep local dataset last modified in database. Skipping import for %s!',
                            method_prefix, orig_id, remote_dataset_name)
                        last_modified_local_dataset = {}
                        for local_dataset in local_search_result['results']:
                            # notice the local dataset with the latest date
                            _set_or_update_latest_dataset(
                                last_modified_local_dataset,
                                local_dataset.get('metadata_modified', None),
                                local_dataset['id'])
                        packages_deleted = _delete_packages_keep(
                            local_search_result['results'], last_modified_local_dataset)
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
            except Exception as exception:
                LOGGER.error(exception)
        else:
            LOGGER.debug(
                u'%sNo original id in field identifier found. Import accepted for dataset %s.',
                method_prefix, remote_dataset_name)
            return True
    else:
        LOGGER.debug(u'%sNo field identifier found. Import accepted for dataset %s.',
                     method_prefix, remote_dataset_name)
        return True
    return False
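# The second variant relies on two module-level helpers that are not shown in this excerpt:
# _set_or_update_latest_dataset and _delete_packages_keep. The sketches below are assumptions
# inferred from the call sites only: the first remembers the dataset with the newest date seen
# so far, the second deletes every found local package except an optional one to keep and
# returns the ids of the deleted packages (via CKAN's package_delete action).
def _set_or_update_latest_dataset(latest_dataset, modified_candidate, package_id):
    # remember the package id with the latest "modified" value seen so far
    if modified_candidate is None:
        return
    if not latest_dataset or HarvestUtils.compare_metadata_modified(
            modified_candidate, latest_dataset['modified']):
        latest_dataset['modified'] = modified_candidate
        latest_dataset['id'] = package_id


def _delete_packages_keep(local_search_results, dataset_to_keep=None):
    # delete all found local packages except the one to keep; return the deleted ids
    packages_deleted = []
    for local_dataset in local_search_results:
        if dataset_to_keep and local_dataset['id'] == dataset_to_keep.get('id'):
            continue
        p.toolkit.get_action('package_delete')(
            HarvestUtils.build_context(), {'id': local_dataset['id']})
        packages_deleted.append(local_dataset['id'])
    return packages_deleted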