def test_returns_false_on_dict_list_with_empty_or_none_value(self):
    extras = Extras([
        { 'key': 'bar', 'value': 'bar-value' },
        { 'key': 'baz', 'value': 'baz-value' },
        { 'key': 'foo', 'value': '' },
        { 'key': 'fuz', 'value': ' ' },
        { 'key': 'muz', 'value': None },
    ])

    self.assertFalse(extras.key('foo', disallow_empty=True))
    self.assertFalse(extras.key('fuz', disallow_empty=True))
    self.assertFalse(extras.key('muz', disallow_empty=True))
def test_returns_modified_extras(self):
    extras = Extras([
        {
            'key': 'terms_of_use',
            'value': [{
                'license_id': 'some-id',
                'license_url': 'some-url',
            }]
        },
    ])

    expected_value = [{
        'license_id': 'license-id',
        'license_url': 'license-url',
        'license_type': 'license-mit',
    }]

    extras.update(
        'terms_of_use',
        expected_value,
    )

    expected_extras = [
        {
            'key': 'terms_of_use',
            'value': [{
                'license_id': 'license-id',
                'license_url': 'license-url',
                'license_type': 'license-mit',
            }]
        },
    ]

    self.assertEquals(expected_extras, extras.get())
def test_returns_default_on_dict_list(self):
    extras = Extras([
        { 'key': 'foo', 'value': 'foo-value' },
        { 'key': 'bar', 'value': 'baz' },
    ])

    self.assertEquals('OhNo', extras.value('baz', 'OhNo'))
def test_returns_value_on_dict_list(self):
    extras = Extras([
        { 'key': 'foo', 'value': 'foo-value' },
        { 'key': 'baz', 'value': 'baz-value' },
    ])

    self.assertEquals('foo-value', extras.value('foo'))
def test_returns_true_on_dict_list(self):
    extras = Extras([
        { 'key': 'foo', 'value': 'foo-value' },
        { 'key': 'bar', 'value': 'bar-value' },
    ])

    self.assertTrue(extras.key('bar'))
def test_returns_false_on_dict_list(self):
    extras = Extras([
        { 'key': 'bar', 'value': 'bar-value' },
        { 'key': 'baz', 'value': 'baz-value' },
    ])

    self.assertFalse(extras.key('foo'))
def test_removes_on_dict_list(self):
    extras = Extras([
        { 'key': 'one', 'value': 1 },
        { 'key': 'two', 'value': 2 },
    ])

    self.assertTrue(extras.remove('two'))
    self.assertEquals(1, extras.len())
def test_update_on_dict_list_works_as_expected(self):
    extras = Extras([
        { 'key': 'hash', 'value': 'tag' },
        { 'key': 'label', 'value': 'dot' },
    ])

    self.assertTrue(extras.update('label', 'doubledot'))
    self.assertEquals('doubledot', extras.value('label'))
def test_upsert_on_dict_list_works_as_expected(self):
    extras = Extras([
        { 'key': 'one', 'value': 1 },
        { 'key': 'two', 'value': 2 },
    ])

    self.assertTrue(extras.update('three', 3, True))
    self.assertEquals(3, extras.value('three'))
    self.assertEquals(3, extras.len())
def test_returns_modified_sector(self):
    extras = Extras([
        { 'key': 'metadata_original_portal', 'value': None },
        { 'key': 'sector', 'value': None },
    ])

    self.assertTrue(extras.update('sector', 'privat'))
    self.assertEquals('privat', extras.value('sector'))
def test_original_groups_are_updated_as_expected(self):
    extras_in = [{
        "key": "contacts",
        "value": "[{'url': 'www.open.nrw.de', 'role': 'vertrieb', 'name': 'Gesch\\u00e4ftsstelle Open.NRW', 'email': '*****@*****.**'}]"
    }, {
        "key": "dates",
        "value": "[{'date': '2016-06-08T12:31:11+02:00', 'role': 'erstellt'}, {'date': '2014-05-26T12:39:03+02:00', 'role': 'veroeffentlicht'}, {'date': '2016-06-08T12:31:11+02:00', 'role': 'aktualisiert'}]"
    }, {
        "key": "images",
        "value": "['https://open.nrw/profiles/nrw_ressort/themes/custom/nrw_base/images/grayish-blue/files/koeln_klein.png']"
    }, {
        "key": "metadata_original_portal",
        "value": "http://open.nrw/"
    }, {
        "key": "metadata_transformer",
        "value": "boo"
    }, {
        "key": "non_open",
        "value": "false"
    }, {
        "key": "opennrw_spatial",
        "value": "Stadt Köln"
    }, {
        "key": "original_groups",
        "value": "['Politik und Wahlen']"
    }, {
        "key": "spatial",
        "value": "{'type': 'polygon', 'coordinates': [[[6.7838099999999999, 50.825465999999999], [7.1533170000000004, 50.825465999999999], [7.1533170000000004, 51.090167999999998], [6.7838099999999999, 51.090167999999998], [6.7838099999999999, 50.825465999999999]]]}"
    }]

    extras = Extras(extras_in)

    self.assertTrue(
        extras.update('original_groups', ['group one', 'group two']))
    self.assertEquals(2, len(extras.value('original_groups')))
def test_returns_value_on_dict_list_nested(self):
    extras = Extras([
        {
            'key': 'foo',
            'value': {
                'nested': 'nested-value',
                'zoo': 'zoo-value',
            }
        },
        { 'key': 'baz', 'value': 'baz-value' },
    ])

    expected_value = {
        'nested': 'nested-value',
        'zoo': 'zoo-value',
    }

    self.assertEquals(expected_value, extras.value('foo'))
def test_extras_len_works_as_expected(self):
    extras = Extras([])
    self.assertEquals(0, extras.len())

    extras = Extras({'dates': 'foo', 'bar': 'baz'})
    self.assertEquals(2, extras.len())

    extras = Extras([
        { 'key': 'one', 'value': [1] },
        { 'key': 'two', 'value': [2] },
        { 'key': 'three', 'value': [3] },
    ])
    self.assertEquals(3, extras.len())
def test_upsert_on_empty_dict_list_works_as_expected(self):
    extras = Extras()

    expected_extras = [{
        'key': 'three',
        'value': 3,
    }]

    self.assertTrue(extras.update('three', 3, True))
    self.assertEquals(3, extras.value('three'))
    self.assertEquals(1, extras.len())
    self.assertEquals(expected_extras, extras.get())
def test_alternates_structure_as_expected(self):
    extras = Extras([
        {
            'key': 'terms_of_use',
            'value': [{
                'licence_id': 'some-id',
                'licence_url': 'some-url',
            }]
        },
    ])

    expected_value = [{
        'license_id': 'some-id',
        'license_url': 'some-url',
    }]

    extras.update(
        'terms_of_use',
        expected_value,
    )

    self.assertEquals(expected_value, extras.value('terms_of_use'))
    self.assertEquals(1, len(extras.value('terms_of_use')))
def test_raises_error_when_key_not_found_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    # The concrete exception type is an assumption: Extras.value is
    # expected to raise when the key is missing and no default is given.
    with self.assertRaises(KeyError):
        extras.value('raiser')
def test_returns_default_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    self.assertEquals('Default', extras.value('foo', 'Default'))
def test_returns_false_on_flat_list_with_empty_or_none_value(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz', 'foo': '', 'fuz': None})

    self.assertFalse(extras.key('foo', disallow_empty=True))
    self.assertFalse(extras.key('fuz', disallow_empty=True))
def test_removes_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz', 'some': 'thing'})

    self.assertTrue(extras.remove('bar'))
    self.assertEquals(2, extras.len())
def test_returns_false_on_empty_extras(self):
    extras = Extras([])

    self.assertFalse(extras.key('foo'))
def test_raises_error_when_key_not_found_on_dict_list(self):
    extras = Extras([{'dates': 'foo'}, {'bar': 'baz'}])

    # exception type assumed, as above
    with self.assertRaises(KeyError):
        extras.value('raiser')
def test_returns_false_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    self.assertFalse(extras.key('foo'))
def test_update_on_flat_list_works_as_expected(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz', 'some': 'thing'})

    self.assertTrue(extras.update('some', 'one'))
    self.assertEquals('one', extras.value('some'))
def test_returns_value_on_flat_list_with_dict(self):
    extras = Extras({'terms_of_use': {'license_id': 'some-license'}})

    self.assertEquals({'license_id': 'some-license'},
                      extras.value('terms_of_use'))
def test_returns_true_on_flat_list(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    self.assertTrue(extras.key('bar'))
def test_raises_error_when_list_empty(self):
    extras = Extras([])

    # exception type assumed, as above
    with self.assertRaises(KeyError):
        extras.value('raiser')
def test_upsert_on_flat_list_works_as_expected(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz', 'some': 'thing'})

    self.assertTrue(extras.update('new', 'kid', True))
    self.assertEquals('kid', extras.value('new'))
    self.assertEquals(4, extras.len())
def test_raises_error_when_key_not_found_for_update(self):
    extras = Extras({'dates': 'foo', 'bar': 'baz'})

    # Extras.update without upsert is expected to raise for an unknown
    # key; the concrete exception type is an assumption.
    with self.assertRaises(KeyError):
        extras.update('raiser', 'foo')
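# The tests above exercise an Extras helper that wraps CKAN's two extras
# shapes. The following is a minimal sketch of the interface they assume
# (key/value/update/remove/len/get), written for illustration only -- the
# project's actual implementation may differ, e.g. in the exception types
# it raises.
class Extras(object):

    def __init__(self, extras=None):
        # Supports a flat dict or a list of {'key': ..., 'value': ...} dicts.
        self.extras = extras if extras is not None else []

    def get(self):
        return self.extras

    def len(self):
        return len(self.extras)

    def _items(self):
        # Normalize both supported shapes to (key, value) pairs.
        if isinstance(self.extras, dict):
            return self.extras.items()
        return [(item['key'], item['value']) for item in self.extras]

    def key(self, key, disallow_empty=False):
        for existing_key, value in self._items():
            if existing_key == key:
                if disallow_empty:
                    # Treat None, '' and whitespace-only values as missing.
                    return value is not None and str(value).strip() != ''
                return True
        return False

    def value(self, key, default=None):
        for existing_key, value in self._items():
            if existing_key == key:
                return value
        if default is not None:
            return default
        raise KeyError(key)  # assumed failure mode when no default is given

    def remove(self, key):
        if isinstance(self.extras, dict):
            del self.extras[key]
        else:
            self.extras = [item for item in self.extras if item['key'] != key]
        return True

    def update(self, key, value, upsert=False):
        if isinstance(self.extras, dict):
            if key not in self.extras and not upsert:
                raise KeyError(key)  # assumed failure mode
            self.extras[key] = value
            return True
        for item in self.extras:
            if item['key'] == key:
                item['value'] = value
                return True
        if upsert:
            self.extras.append({'key': key, 'value': value})
            return True
        raise KeyError(key)  # assumed failure mode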
@classmethod
def handle_duplicates(cls, harvest_object_content):
    '''Compares the new dataset with existing ones and checks whether the dataset should be imported.'''
    method_prefix = 'handle_duplicates: '
    context = cls.build_context()

    remote_dataset = json.loads(harvest_object_content)
    remote_dataset_extras = Extras(remote_dataset['extras'])
    remote_dataset_name = remote_dataset.get('name', '')

    has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
    if has_orig_id:
        orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
        if orig_id:
            try:
                data_dict = {"q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'}
                # Add filter that the local dataset guid is not equal to the guid of the remote dataset
                if remote_dataset_extras.key('guid'):
                    data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                local_search_result = p.toolkit.get_action("package_search")(context, data_dict)
                if local_search_result['count'] == 0:
                    LOGGER.debug('%sDid not find any existing dataset in the database. '
                                 'Import accepted for %s.', method_prefix, remote_dataset_name)
                    return True
                elif local_search_result['count'] == 1:
                    LOGGER.debug('%sFound duplicate entry for dataset %s.', method_prefix,
                                 remote_dataset_name)
                    local_dataset = local_search_result['results'][0]
                    local_dataset_extras = Extras(local_dataset['extras'])
                    # TODO: If in doubt, use the CKAN field "metadata_modified" of the local
                    # dataset in case "modified" is not present?
                    if remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED) and \
                            local_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                        return cls.compare_metadata_modified(
                            remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                            local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED))
                    else:
                        LOGGER.info(
                            '%sFound duplicate entry with the value "%s" in field "identifier", '
                            'but the remote and/or local dataset does not contain a modified '
                            'date. -> Skipping import for %s!',
                            method_prefix, orig_id, remote_dataset_name)
                else:
                    LOGGER.info('%sFound multiple duplicates with the value "%s" in field '
                                '"identifier". -> Skipping import for %s!',
                                method_prefix, orig_id, remote_dataset_name)
            except Exception as exception:
                LOGGER.error(exception)
        else:
            LOGGER.debug('%sNo original id in field identifier found. Import accepted for %s.',
                         method_prefix, remote_dataset_name)
            return True
    else:
        LOGGER.debug('%sNo field identifier found. Import accepted for %s.',
                     method_prefix, remote_dataset_name)
        return True
    return False
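# Both handle_duplicates variants delegate the freshness check to
# compare_metadata_modified, which is not shown above. Below is a minimal
# sketch of the assumed semantics (True if the remote dct:modified
# timestamp is strictly newer than the local one), written as a plain
# function although the surrounding code calls it as a HarvestUtils
# method; the project's real implementation may parse dates differently
# or handle parse errors explicitly.
import dateutil.parser

def compare_metadata_modified(remote_modified, local_modified):
    remote_date = dateutil.parser.parse(remote_modified)
    local_date = dateutil.parser.parse(local_modified)
    # Accept the import only if the remote dataset is strictly newer.
    return remote_date > local_date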
def handle_duplicates(harvest_object_content):
    '''Compares the new dataset with existing ones and checks whether the dataset should be imported.'''
    method_prefix = 'handle_duplicates: '
    context = HarvestUtils.build_context()

    remote_dataset = json.loads(harvest_object_content)
    remote_dataset_extras = Extras(remote_dataset['extras'])
    remote_dataset_name = remote_dataset.get('name', '')

    has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
    if has_orig_id:
        orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
        # remote dataset contains an identifier
        if orig_id:
            try:
                data_dict = {"q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'}
                # Add filter that the local dataset guid is not equal to the guid of the remote dataset
                if remote_dataset_extras.key('guid'):
                    data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                # search for other datasets with the same identifier
                local_search_result = p.toolkit.get_action("package_search")(context, data_dict)
                if local_search_result['count'] == 0:
                    # no other dataset with the same identifier was found, import accepted
                    LOGGER.debug(u'%sDid not find any existing dataset in the database with '
                                 u'Identifier %s. Import accepted for dataset %s.',
                                 method_prefix, orig_id, remote_dataset_name)
                    return True
                else:
                    # another dataset with the same identifier was found
                    LOGGER.debug(u'%sFound duplicate entries with Identifier %s for dataset %s.',
                                 method_prefix, orig_id, remote_dataset_name)
                    remote_is_latest = True
                    local_dataset_has_modified = False
                    latest_local_dataset = {}
                    if not remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                        remote_is_latest = False
                    # compare the modified date with all local datasets
                    for local_dataset in local_search_result['results']:
                        local_dataset_extras = Extras(local_dataset['extras'])
                        if local_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                            local_dataset_has_modified = True
                            # remember the local dataset with the latest date
                            _set_or_update_latest_dataset(
                                latest_local_dataset,
                                local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                local_dataset['id'])
                            # compare dct:modified if the remote and local dataset contain the
                            # field "modified" and the remote dataset has not yet been detected
                            # as older
                            if remote_is_latest and remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                                remote_is_latest = HarvestUtils.compare_metadata_modified(
                                    remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                    local_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED))
                    if remote_is_latest:
                        # Import accepted. Delete all local datasets with the same identifier.
                        LOGGER.debug(u'%sRemote dataset with Identifier %s is the latest. '
                                     u'Modified date: %s. Import accepted for dataset %s.',
                                     method_prefix, orig_id,
                                     remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                     remote_dataset_name)
                        packages_deleted = _delete_packages_keep(local_search_result['results'])
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
                        return True
                    elif local_dataset_has_modified:
                        # Skip import. Delete local datasets, but keep the dataset with the
                        # latest date in the field "modified".
                        LOGGER.info(u'%sRemote dataset with Identifier %s is NOT the latest. '
                                    u'Modified date: %s. Keeping the local dataset with the '
                                    u'latest date in field "modified". Skipping import for dataset %s!',
                                    method_prefix, orig_id,
                                    remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED, 'n/a'),
                                    remote_dataset_name)
                        packages_deleted = _delete_packages_keep(
                            local_search_result['results'], latest_local_dataset)
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
                    else:
                        # Skip import, because neither the remote dataset nor any local dataset
                        # contains the field "modified". Delete local datasets, but keep the
                        # dataset last modified in the database.
                        LOGGER.info(u'%sFound duplicate entries with the value "%s" in field '
                                    u'"identifier", but the remote and local datasets do not '
                                    u'contain a modified date. Keeping the local dataset last '
                                    u'modified in the database. Skipping import for %s!',
                                    method_prefix, orig_id, remote_dataset_name)
                        last_modified_local_dataset = {}
                        for local_dataset in local_search_result['results']:
                            # remember the local dataset with the latest date
                            _set_or_update_latest_dataset(
                                last_modified_local_dataset,
                                local_dataset.get('metadata_modified', None),
                                local_dataset['id'])
                        packages_deleted = _delete_packages_keep(
                            local_search_result['results'], last_modified_local_dataset)
                        LOGGER.debug(u'%sDeleted packages: %s', method_prefix,
                                     ','.join(packages_deleted))
            except Exception as exception:
                LOGGER.error(exception)
        else:
            LOGGER.debug(u'%sNo original id in field identifier found. Import accepted for dataset %s.',
                         method_prefix, remote_dataset_name)
            return True
    else:
        LOGGER.debug(u'%sNo field identifier found. Import accepted for dataset %s.',
                     method_prefix, remote_dataset_name)
        return True
    return False
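# The second handle_duplicates variant relies on two module-level helpers
# that are not shown above. These are illustrative sketches of their
# assumed behavior; the bookkeeping keys ('date', 'id') and the use of
# CKAN's package_delete action are assumptions, not the project's
# confirmed implementation.
import dateutil.parser

def _set_or_update_latest_dataset(latest_dataset, modified_candidate, dataset_id):
    # Remember the dataset with the newest modified date in the mutable
    # dict latest_dataset, using the assumed keys 'date' and 'id'.
    if not modified_candidate:
        return
    candidate_date = dateutil.parser.parse(modified_candidate)
    if 'date' not in latest_dataset or candidate_date > latest_dataset['date']:
        latest_dataset['date'] = candidate_date
        latest_dataset['id'] = dataset_id


def _delete_packages_keep(local_dataset_list, dataset_to_keep=None):
    # Delete every local package except the one referenced by
    # dataset_to_keep (if any) and return the ids of the deleted packages.
    deleted_package_ids = []
    keep_id = dataset_to_keep.get('id') if dataset_to_keep else None
    context = HarvestUtils.build_context()
    for local_dataset in local_dataset_list:
        if local_dataset['id'] == keep_id:
            continue
        p.toolkit.get_action('package_delete')(context, {'id': local_dataset['id']})
        deleted_package_ids.append(local_dataset['id'])
    return deleted_package_ids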