def test_dataset_get_by_name(ckan_client_hl):
    """Datasets are retrievable by id or by name -- but the two lookup
    methods are not interchangeable (each 404s on the other's key)."""
    client = ckan_client_hl

    payload = generate_dataset()
    payload['name'] = 'example-dataset-name'
    dataset = CkanDataset(payload)
    created = client.create_dataset(dataset)
    assert created.is_equivalent(dataset)
    dataset_id = created.id

    # Lookup by id works
    assert created == client.get_dataset(dataset_id)

    # Lookup by name works
    assert created == client.get_dataset_by_name('example-dataset-name')

    # Passing a name where an id is expected -> 404
    with pytest.raises(HTTPError) as excinfo:
        client.get_dataset('example-dataset-name')
    assert excinfo.value.status_code == 404

    # Passing an id where a name is expected -> 404
    with pytest.raises(HTTPError) as excinfo:
        client.get_dataset_by_name(dataset_id)
    assert excinfo.value.status_code == 404
def test_ckandataset_creation():
    """Constructing a CkanDataset from a plain dict populates attributes
    and produces the expected serialize() output."""
    dataset = CkanDataset({
        'name': 'example-dataset',
        'title': 'Example Dataset',
        'author': 'Foo Bar',
        'author_email': '*****@*****.**',
        'extras': {'foo': 'bar', 'baz': 'SPAM!'},
        'groups': ['one', 'two', 'three'],
    })

    assert dataset.name == 'example-dataset'
    assert dataset.title == 'Example Dataset'
    assert dataset.groups == set(['one', 'two', 'three'])
    assert dataset.extras == {'foo': 'bar', 'baz': 'SPAM!'}
    assert isinstance(dataset.resources, ResourcesList)
    assert len(dataset.resources) == 0

    # Group ordering is irrelevant: pop and compare them sorted,
    # then check the remainder of the serialization exactly.
    serialized = dataset.serialize()
    assert sorted(serialized.pop('groups')) == sorted(['one', 'two', 'three'])
    assert serialized == {
        'id': None,
        'name': 'example-dataset',
        'title': 'Example Dataset',
        'author': 'Foo Bar',
        'author_email': '*****@*****.**',
        'license_id': '',
        'maintainer': '',
        'maintainer_email': '',
        'notes': '',
        'owner_org': '',
        'private': False,
        'state': 'active',
        'type': 'dataset',
        'url': '',
        'extras': {'foo': 'bar', 'baz': 'SPAM!'},
        'resources': [],
        'tags': [],
    }
def test_dataset_update_base_fields(ckan_client_hl):
    """Updating base fields must change only those fields and must not
    mutate any of the involved objects."""
    client = ckan_client_hl  # shortcut
    ckp = MutableCheckpoint()  # to check objects mutation

    # Create our dataset.
    # NOTE(review): the dataset is built from a *second*
    # generate_dataset() call, so the checkpointed dataset_dict is
    # never actually passed anywhere -- confirm this is intended.
    dataset_dict = generate_dataset()
    ckp.add(dataset_dict)
    dataset = CkanDataset(generate_dataset())
    dataset.author = 'original author'
    dataset.author_email = '*****@*****.**'
    dataset.license_id = 'cc-zero'
    created = client.create_dataset(dataset)

    # Keep a pristine copy of what we just created
    original_dataset = client.get_dataset(created.id)
    assert created.is_equivalent(original_dataset)
    ckp.add(original_dataset)

    # Change a few base fields on a deep copy
    to_be_updated = copy.deepcopy(original_dataset)
    to_be_updated.author = 'NEW_AUTHOR'
    to_be_updated.author_email = 'NEW_AUTHOR_EMAIL'
    to_be_updated.license_id = 'cc-by-sa'
    assert to_be_updated.is_modified()

    # Push the update, fetch it back, compare
    updated = client.update_dataset(to_be_updated)
    updated_2 = client.get_dataset(created.id)
    assert updated.is_equivalent(to_be_updated)
    assert updated.is_equivalent(updated_2)

    # Only the three fields we touched may differ
    diffs = diff_mappings(
        original_dataset.serialize(), updated.serialize())
    assert diffs['differing'] == set([
        'author', 'author_email', 'license_id'])
    assert diffs['left'] == set()
    assert diffs['right'] == set()

    # Make sure dicts did not mutate
    ckp.check()
def test_dataset_update_base_fields(ckan_client_hl):
    """Updating plain (base) fields changes only those fields, and no
    input object is ever mutated in the process.

    Fix: the dataset is now built from the *same* dict registered with
    the MutableCheckpoint. Previously a second ``generate_dataset()``
    call was passed to ``CkanDataset()``, so the checkpointed dict was
    never handed to any code and ``ckp.check()`` could not detect
    mutations at all.
    """
    client = ckan_client_hl  # shortcut
    ckp = MutableCheckpoint()  # to check objects mutation

    # Create our dataset from the checkpointed dict, so mutation of
    # the input is actually observable.
    dataset_dict = generate_dataset()
    ckp.add(dataset_dict)
    dataset = CkanDataset(dataset_dict)
    dataset.author = 'original author'
    dataset.author_email = '*****@*****.**'
    dataset.license_id = 'cc-zero'
    created = client.create_dataset(dataset)

    # Store a copy of the original dataset
    original_dataset = client.get_dataset(created.id)
    assert created.is_equivalent(original_dataset)
    ckp.add(original_dataset)

    # Update some base fields, send back & check
    to_be_updated = copy.deepcopy(original_dataset)
    to_be_updated.author = 'NEW_AUTHOR'
    to_be_updated.author_email = 'NEW_AUTHOR_EMAIL'
    to_be_updated.license_id = 'cc-by-sa'
    assert to_be_updated.is_modified()

    # Update, get back, check
    updated = client.update_dataset(to_be_updated)
    updated_2 = client.get_dataset(created.id)
    assert updated.is_equivalent(to_be_updated)
    assert updated.is_equivalent(updated_2)

    # Only the three fields we touched may differ
    diffs = diff_mappings(original_dataset.serialize(), updated.serialize())
    assert diffs['differing'] == set([
        'author', 'author_email', 'license_id'])
    assert diffs['left'] == set()
    assert diffs['right'] == set()

    # Make sure dicts did not mutate
    ckp.check()
def take_action(self, parsed_args):
    """Load a dataset definition from a JSON file, create it in Ckan
    and print the created dataset (serialized) to stdout."""
    client = self._get_client(parsed_args)
    dataset_json = json.loads(self._read_file(parsed_args.filename))

    # Build the dataset object from the file contents
    dataset = CkanDataset(dataset_json)

    # todo: we need to check whether this dataset exists
    #       -> try getting and check..
    dataset.id = None
    dataset.owner_org = None  # todo: fill this
    dataset.groups = []  # todo: fill this
    for resource in dataset.resources:
        resource.id = None

    created = client.create_dataset(dataset)
    self.app.stdout.write(json.dumps(created.serialize()))
def test_ckandataset_creation():
    """Creating a CkanDataset from a plain dict exposes its fields and
    serializes back with library defaults filled in."""
    dataset = CkanDataset({
        'name': 'example-dataset',
        'title': 'Example Dataset',
        'author': 'Foo Bar',
        'author_email': '*****@*****.**',
        'extras': {'foo': 'bar', 'baz': 'SPAM!'},
        'groups': ['one', 'two', 'three'],
    })

    assert dataset.name == 'example-dataset'
    assert dataset.title == 'Example Dataset'
    assert dataset.groups == set(['one', 'two', 'three'])
    assert dataset.extras == {'foo': 'bar', 'baz': 'SPAM!'}
    assert isinstance(dataset.resources, ResourcesList)
    assert len(dataset.resources) == 0

    # The order of groups doesn't matter: compare them sorted,
    # then verify the rest of the serialization.
    serialized = dataset.serialize()
    assert sorted(serialized.pop('groups')) == sorted(['one', 'two', 'three'])

    expected = {
        'id': None,
        'license_id': '',
        'maintainer': '',
        'maintainer_email': '',
        'notes': '',
        'owner_org': '',
        'private': False,
        'state': 'active',
        'type': 'dataset',
        'url': '',
        'resources': [],
        'tags': [],
    }
    expected.update({
        'name': 'example-dataset',
        'title': 'Example Dataset',
        'author': 'Foo Bar',
        'author_email': '*****@*****.**',
        'extras': {'foo': 'bar', 'baz': 'SPAM!'},
    })
    assert serialized == expected
def test_dataset_update_resources(ckan_client_hl):
    """Resources can be appended and prepended through dataset updates."""
    client = ckan_client_hl  # shortcut

    ds_dict = generate_dataset()
    ds_dict['resources'] = [
        {'name': 'example-csv-1',
         'url': 'http://example.com/dataset-1.csv',
         'format': 'CSV'},
        {'name': 'example-json-1',
         'url': 'http://example.com/dataset-1.json',
         'format': 'JSON'},
    ]
    stage_1pre = CkanDataset(ds_dict)
    stage_1 = client.create_dataset(stage_1pre)

    # --------------------------------------------------
    # Append a new resource
    stage_2pre = client.get_dataset(stage_1.id)
    stage_2pre.resources.append({
        'name': 'example-csv-2',
        'url': 'http://example.com/dataset-2.csv',
        'format': 'CSV',
    })
    assert len(stage_2pre.resources) == 3
    assert len(stage_2pre.serialize()['resources']) == 3

    stage_2 = client.update_dataset(stage_2pre)
    assert len(stage_2.resources) == 3
    assert len(stage_2.serialize()['resources']) == 3

    # --------------------------------------------------
    # Prepend a new resource
    stage_3pre = client.get_dataset(stage_1.id)
    stage_3pre.resources.insert(0, {
        'url': 'http://example.com/dataset-2.json',
        'format': 'JSON',
    })
    assert len(stage_3pre.resources) == 4
    assert len(stage_3pre.serialize()['resources']) == 4

    stage_3 = client.update_dataset(stage_3pre)
    assert len(stage_3.resources) == 4
    assert len(stage_3.serialize()['resources']) == 4
def test_dataset_update_extras(ckan_client_hl):
    """Extras records can be added and removed via dataset update."""
    client = ckan_client_hl  # shortcut

    ds_dict = generate_dataset()
    # key-0..key-9 -> value-0..value-9
    ds_dict['extras'] = dict(
        ('key-{0}'.format(i), 'value-{0}'.format(i)) for i in range(10))
    stage_1pre = CkanDataset(ds_dict)
    stage_1 = client.create_dataset(stage_1pre)

    # --------------------------------------------------
    # Add a new record
    stage_1b = client.get_dataset(stage_1.id)
    stage_2pre = copy.deepcopy(stage_1b)
    stage_2pre.extras['NEW_FIELD_NAME'] = 'NEW_FIELD_VALUE'

    stage_2 = client.update_dataset(stage_2pre)
    assert stage_2.is_equivalent(client.get_dataset(stage_1.id))
    diffs = diff_mappings(stage_1b.serialize(), stage_2.serialize())
    assert diffs['left'] == diffs['right'] == set()
    assert diffs['differing'] == set(['extras'])
    del stage_1b, stage_2pre, stage_2, diffs

    # --------------------------------------------------
    # Remove the custom field again
    stage_2pre = client.get_dataset(stage_1.id)
    del stage_2pre.extras['NEW_FIELD_NAME']

    stage_2 = client.update_dataset(stage_2pre)
    assert stage_2.is_equivalent(client.get_dataset(stage_1.id))
    assert 'NEW_FIELD_NAME' not in stage_2.extras
    stage_2b = client.get_dataset(stage_1.id)
    assert stage_2 == stage_2b

    # Make sure we brought it back to its original state
    assert stage_1.is_equivalent(stage_2)
    del stage_2pre, stage_2
def test_ckan_dataset_resources():
    """Assigning plain dicts to .resources converts them to internal
    types, flags the dataset as modified, and serializes cleanly."""
    dataset = CkanDataset({'name': 'example-dataset'})
    assert dataset.is_modified() is False

    # Merely reading .resources must not mark the dataset modified,
    # even though a copy is made internally.
    assert isinstance(dataset.resources, ResourcesList)
    assert len(dataset.resources) == 0
    assert dataset.is_modified() is False

    # Plain dicts get converted to CkanResource() objects..
    dataset.resources = [{'name': 'resource-1'}, {'name': 'resource-2'}]
    assert isinstance(dataset.resources, ResourcesList)
    assert all(isinstance(res, CkanResource) for res in dataset.resources)

    # ..and the assignment marks the dataset as modified.
    assert dataset.is_modified() is True

    # Comparison works against plain objects..
    assert dataset.resources == [
        {'name': 'resource-1'},
        {'name': 'resource-2'},
    ]

    # ..as well as against the internal types.
    assert dataset.resources == ResourcesList([
        CkanResource({'name': 'resource-1'}),
        CkanResource({'name': 'resource-2'}),
    ])

    # Serialization must yield plain, json-serializable structures.
    serialized = dataset.serialize()
    assert isinstance(serialized['resources'], list)
    assert len(serialized['resources']) == 2
    for idx, res in enumerate(serialized['resources']):
        assert isinstance(res, dict)
        assert res['name'] == 'resource-{0}'.format(idx + 1)
    json.dumps(serialized)
def test_dataset_wipe(ckan_client_hl):
    """delete_dataset() keeps the name allocated (recreate -> 409),
    while wipe_dataset() releases it (recreate succeeds)."""
    client = ckan_client_hl

    # ------------------------------------------------------------
    # A normally-deleted dataset still holds its name: creating a
    # new one with the same name must fail with 409.
    dataset = CkanDataset(generate_dataset())
    dataset.name = 'dataset-to-delete'
    created = client.create_dataset(dataset)
    assert created.is_equivalent(dataset)
    client.delete_dataset(created.id)

    new_dataset = CkanDataset(generate_dataset())
    new_dataset.name = 'dataset-to-delete'
    with pytest.raises(HTTPError) as excinfo:
        client.create_dataset(new_dataset)
    assert excinfo.value.status_code == 409
    del dataset, created, new_dataset, excinfo

    # ------------------------------------------------------------
    # A wiped dataset releases its name: re-creation must succeed.
    dataset = CkanDataset(generate_dataset())
    dataset.name = 'dataset-to-delete-2'
    created = client.create_dataset(dataset)
    assert created.is_equivalent(dataset)
    client.wipe_dataset(created.id)

    new_dataset = CkanDataset(generate_dataset())
    new_dataset.name = 'dataset-to-delete-2'
    created = client.create_dataset(new_dataset)  # should not fail anymore
    assert created.name == 'dataset-to-delete-2'
def test_dataset_delete(ckan_client_hl):
    """Deleted datasets disappear from listings and from get_dataset(),
    unless allow_deleted=True is passed."""
    client = ckan_client_hl

    dataset_dict = generate_dataset()
    dataset = CkanDataset(dataset_dict)
    created = client.create_dataset(dataset)
    assert created.is_equivalent(dataset)

    # It must show up in listings..
    assert created.id in client.list_datasets()

    # ..until it gets deleted.
    client.delete_dataset(created.id)
    assert created.id not in client.list_datasets()

    # Plain get -> 404; with allow_deleted we see the 'deleted' state.
    with pytest.raises(HTTPError) as excinfo:
        client.get_dataset(created.id)
    assert excinfo.value.status_code == 404

    retrieved = client.get_dataset(created.id, allow_deleted=True)
    assert retrieved.state == 'deleted'
def add_extras():
    # Assign a whole extras dict on a dataset that had none.
    dataset = CkanDataset()
    dataset.extras = {'KEY': 'VALUE'}
    return dataset
def repl_extras():
    # Replace a pre-existing extras dict with a brand new one.
    dataset = CkanDataset({'extras': {'KEY': 'ORIGINAL-VALUE'}})
    dataset.extras = {'KEY': 'VALUE'}
    return dataset
def set_extras():
    # Set a single extras key on a dataset that had none.
    dataset = CkanDataset()
    dataset.extras['KEY'] = 'VALUE'
    return dataset
def upd_extras():
    # Overwrite a single, pre-existing extras key in place.
    dataset = CkanDataset({'extras': {'KEY': 'ORIGINAL-VALUE'}})
    dataset.extras['KEY'] = 'VALUE'
    return dataset
def test_dataset_create(ckan_client_hl):
    """A freshly-created dataset is equivalent to what we sent."""
    client = ckan_client_hl
    dataset = CkanDataset(generate_dataset())
    created = client.create_dataset(dataset)
    assert created.is_equivalent(dataset)
def init_extras():
    # Extras provided directly at construction time.
    dataset = CkanDataset({'extras': {'KEY': 'VALUE'}})
    return dataset
def test_ckan_dataset():
    """Round-trip a fully-populated dataset dict through CkanDataset,
    checking input immutability, is_modified() and resource editing."""

    def _resource(num):
        # Build the canonical test resource dict for ``resource-<num>``:
        # every field's value is 'RES<num>-<FIELD_NAME_UPPERCASED>'.
        resource = {'id': 'resource-{0}'.format(num)}
        for field in ('description', 'format', 'mimetype',
                      'mimetype_inner', 'name', 'position',
                      'resource_type', 'size', 'url', 'url_type'):
            resource[field] = 'RES{0}-{1}'.format(num, field.upper())
        return resource

    # Top-level fields follow the 'DATASET-<FIELD_UPPERCASED>' pattern.
    raw_data = {'id': 'dataset-1'}
    for field in ('author', 'author_email', 'license_id', 'maintainer',
                  'maintainer_email', 'name', 'notes', 'owner_org',
                  'private', 'state', 'type', 'url'):
        raw_data[field] = 'DATASET-{0}'.format(field.upper())
    raw_data.update({
        'extras': {
            'EXTRA_KEY_1': 'EXTRA-VALUE-1',
            'EXTRA_KEY_2': 'EXTRA-VALUE-2',
            'EXTRA_KEY_3': 'EXTRA-VALUE-3',
        },
        'groups': ['GROUP-1', 'GROUP-2', 'GROUP-3'],
        'relationships': [],
        'resources': [_resource(1), _resource(2), _resource(3)],
    })

    _raw_data = copy.deepcopy(raw_data)
    assert raw_data == _raw_data

    # Construction must not touch the input dict
    dataset = CkanDataset.from_dict(_raw_data)
    assert raw_data == _raw_data
    assert dataset.to_dict() == raw_data

    # Changing a field is tracked and serialized back
    dataset.author = 'My author'
    assert dataset.is_modified()
    assert dataset.to_dict()['author'] == 'My author'

    # Create a new dataset; editing the resources list is tracked too
    dataset = CkanDataset.from_dict(raw_data)
    assert not dataset.is_modified()
    del dataset.resources[2]  # delete 'resource-3'
    assert dataset.is_modified()
    dataset.resources.append(CkanResource.from_dict(_resource(4)))
    assert dataset.to_dict()['resources'] == [
        _resource(1), _resource(2), _resource(4)]
"title": "Lure Dispenser comparison trial, thrips, Australia, Perth", "notes": "Assess three thrips Lure delivery mechanisms:\n\n* P Paint pen\n* D Deer wick\n* C Control (no wick)", "private": False, "owner_org": "plant-and-food-research-nz", "author": "Mette Nielson", } dataset_dict2 = { "name": "lure-dispenser-comparison-trial", "title": "Lure Dispenser comparison trial, thrips, Australia, Perth", "notes": "Assess three thrips Lure delivery mechanisms:\n\n* P Paint pen\n* D Deer wick\n* C Control (no wick)", "private": False, "owner_org": "plant-and-food-research-nz", "state": "active", "project_code": "P/1234", "author": "Mette Nielson", "project_leader_email": "*****@*****.**", "data_steward": "Mette Nielson", "data_steward_email": "*****@*****.**", "other_researcher": "David Teulon", "biometrician": "Ruth Butler", "credits": "Mel Walker", "license_id": "PFR Internal Use only" } new_dataset = client.create_dataset(CkanDataset(dataset_dict)) print(new_dataset)
def test_ckan_dataset_resources():
    """Setting .resources converts plain dicts and flags modification."""
    dataset = CkanDataset({'name': 'example-dataset'})
    assert dataset.is_modified() is False

    # By asking for resources, a copy will be made,
    # but the two items should match..
    resources = dataset.resources
    assert isinstance(resources, ResourcesList)
    assert len(resources) == 0
    assert dataset.is_modified() is False

    # Resources can be passed as normal objects and
    # will be converted to CkanResource() objects.
    dataset.resources = [{'name': 'resource-1'}, {'name': 'resource-2'}]

    # Make sure type conversions have been applied
    assert isinstance(dataset.resources, ResourcesList)
    for converted in dataset.resources:
        assert isinstance(converted, CkanResource)

    # Make sure dataset is marked as modified
    assert dataset.is_modified() is True

    # We allow comparison to plain objects..
    assert dataset.resources == [
        {'name': 'resource-1'},
        {'name': 'resource-2'},
    ]

    # ..or to the actual types used internally, of course
    assert dataset.resources == ResourcesList([
        CkanResource({'name': 'resource-1'}),
        CkanResource({'name': 'resource-2'}),
    ])

    # Do some tests for object serialization
    serialized = dataset.serialize()
    assert isinstance(serialized['resources'], list)
    assert len(serialized['resources']) == 2
    assert isinstance(serialized['resources'][0], dict)
    assert serialized['resources'][0]['name'] == 'resource-1'
    assert isinstance(serialized['resources'][1], dict)
    assert serialized['resources'][1]['name'] == 'resource-2'

    # Serialized data must be json-serializable
    json.dumps(serialized)
def test_ckandataset_resources_update():
    """Every mutation of the resources list (append / insert / setitem /
    reassignment) marks the dataset as modified and keeps the internal
    types intact."""

    def _check_types(resources):
        # Everything must stay wrapped in the internal types.
        assert isinstance(resources, ResourcesList)
        for resource in resources:
            assert isinstance(resource, CkanResource)

    dataset = CkanDataset({
        'name': 'example-dataset',
        'resources': [{'name': 'resource-1'}, {'name': 'resource-2'}],
    })
    assert dataset.is_modified() is False
    assert dataset.resources == [
        {'name': 'resource-1'},
        {'name': 'resource-2'},
    ]

    # Reading manipulates things internally, but must not be
    # reported as a modification.
    assert dataset.is_modified() is False

    dataset.resources.append({'name': 'resource-3'})
    assert dataset.is_modified() is True
    assert dataset.resources == [
        {'name': 'resource-1'},
        {'name': 'resource-2'},
        {'name': 'resource-3'},
    ]
    _check_types(dataset.resources)

    dataset.resources.insert(0, {'name': 'resource-0'})
    assert dataset.is_modified() is True
    assert dataset.resources == [
        {'name': 'resource-0'},
        {'name': 'resource-1'},
        {'name': 'resource-2'},
        {'name': 'resource-3'},
    ]
    _check_types(dataset.resources)

    dataset.resources[2] = {'name': 'RESOURCE-2'}
    assert dataset.is_modified() is True
    assert dataset.resources == [
        {'name': 'resource-0'},
        {'name': 'resource-1'},
        {'name': 'RESOURCE-2'},
        {'name': 'resource-3'},
    ]
    _check_types(dataset.resources)

    dataset.resources = [{'name': 'Hello'}]
    assert dataset.is_modified() is True
    assert dataset.resources == [{'name': 'Hello'}]
    _check_types(dataset.resources)

    # "Contains" test is successful as fields left to
    # default values just get ignored during comparison.
    assert {'name': 'Hello'} in dataset.resources
    assert {'name': 'WTF, seriously'} not in dataset.resources
def sync(self, source_name, data):
    """
    Synchronize data from a source into Ckan.

    - datasets are matched by _harvest_source
    - groups and organizations are matched by name

    :param source_name:
        String identifying the source of the data. Used to build
        ids that will be used in further synchronizations.
    :param data:
        Data to be synchronized. Should be a dict (or dict-like)
        with top level keys coresponding to the object type,
        mapping to dictionaries of ``{'id': <object>}``.
    """
    # Fix: use .items() and "except ... as e" so this code is valid
    # on both Python 2 and Python 3 (``iteritems()`` and
    # ``except E, e`` are Python-2-only and break a py3 parse/run).
    groups = dict(
        (key, CkanGroup(val)) for key, val in data['group'].items())
    organizations = dict(
        (key, CkanOrganization(val))
        for key, val in data['organization'].items())

    # Upsert groups and organizations
    groups_map = self._upsert_groups(groups)
    orgs_map = self._upsert_organizations(organizations)

    # Create list of datasets to be synced
    source_datasets = {}
    for source_id, dataset_dict in data['dataset'].items():
        _dataset_dict = copy.deepcopy(dataset_dict)

        # We need to make sure "source" datasets
        # don't have (otherwise misleading) ids
        _dataset_dict.pop('id', None)

        # We need to update groups and organizations, to map
        # their name from the source into a ckan id
        _dataset_dict['groups'] = [
            groups_map.to_ckan(grp_id)
            for grp_id in _dataset_dict['groups']]
        _dataset_dict['owner_org'] = \
            orgs_map.to_ckan(_dataset_dict['owner_org'])

        dataset = CkanDataset(_dataset_dict)

        # We also want to add the "source id", used for further
        # synchronizations to find stuff
        dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
            self._join_source_id(source_name, source_id)

        source_datasets[source_id] = dataset

    # Retrieve list of datasets from Ckan
    ckan_datasets = self._find_datasets_by_source(source_name)

    # Compare collections to find differences
    differences = self._compare_collections(
        ckan_datasets, source_datasets)

    # ------------------------------------------------------------
    # We now need to create/update/delete datasets.

    # todo: we need to make sure dataset names are not already used
    #       by another dataset. The only way is to randomize resource
    #       names and hope a 409 response indicates duplicate name..

    # We delete first, in order to (possibly) deallocate
    # some already-used names..
    for source_id in differences['left']:
        ckan_id = ckan_datasets[source_id].id
        logger.info('Deleting dataset {0}'.format(ckan_id))
        self._client.delete_dataset(ckan_id)

    def force_dataset_operation(operation, dataset, retry=5):
        # Maximum dataset name length is 100 characters; we trim it
        # down to 80 just to be safe. Note: we generally want to
        # preserve the original name and there should *never* be
        # problems with that when updating..
        _orig_name = dataset.name[:80]
        dataset.name = _orig_name
        while True:
            try:
                result = operation(dataset)
            except HTTPError as e:
                if e.status_code != 409:
                    raise
                retry -= 1
                if retry < 0:
                    raise
                # 409 means a name clash: retry with a random suffix
                dataset.name = '{0}-{1:06d}'.format(
                    _orig_name, random.randint(0, 999999))
                logger.debug('Got 409: trying to rename dataset to {0}'
                             .format(dataset.name))
            else:
                return result
def test_ckandataset_resources_update():
    """Changes to .resources must flip is_modified() and keep types."""

    def _expect(names):
        # Expected plain-dict form of a resources list.
        return [{'name': name} for name in names]

    def _typecheck_resources(resources):
        assert isinstance(resources, ResourcesList)
        for item in resources:
            assert isinstance(item, CkanResource)

    dataset = CkanDataset({
        'name': 'example-dataset',
        'resources': _expect(['resource-1', 'resource-2']),
    })
    assert dataset.is_modified() is False
    assert dataset.resources == _expect(['resource-1', 'resource-2'])

    # Getting should not affect is_modified(), although
    # it is manipulating things internally..
    assert dataset.is_modified() is False

    dataset.resources.append({'name': 'resource-3'})
    assert dataset.is_modified() is True
    assert dataset.resources == _expect(
        ['resource-1', 'resource-2', 'resource-3'])
    _typecheck_resources(dataset.resources)

    dataset.resources.insert(0, {'name': 'resource-0'})
    assert dataset.is_modified() is True
    assert dataset.resources == _expect(
        ['resource-0', 'resource-1', 'resource-2', 'resource-3'])
    _typecheck_resources(dataset.resources)

    dataset.resources[2] = {'name': 'RESOURCE-2'}
    assert dataset.is_modified() is True
    assert dataset.resources == _expect(
        ['resource-0', 'resource-1', 'RESOURCE-2', 'resource-3'])
    _typecheck_resources(dataset.resources)

    dataset.resources = [{'name': 'Hello'}]
    assert dataset.is_modified() is True
    assert dataset.resources == _expect(['Hello'])
    _typecheck_resources(dataset.resources)

    # "Contains" test is successful as fields left to
    # default values just get ignored during comparison.
    assert {'name': 'Hello'} in dataset.resources
    assert {'name': 'WTF, seriously'} not in dataset.resources
def test_ckan_dataset():
    """Exercise CkanDataset round-tripping, modification tracking and
    resource-list manipulation on a fully-populated dataset."""

    def _make_res(num):
        # Canonical resource fixture for ``resource-<num>``: every
        # field's value is 'RES<num>-<FIELD_NAME_UPPERCASED>'.
        fields = ('description', 'format', 'mimetype', 'mimetype_inner',
                  'name', 'position', 'resource_type', 'size', 'url',
                  'url_type')
        data = dict(
            (field, 'RES{0}-{1}'.format(num, field.upper()))
            for field in fields)
        data['id'] = 'resource-{0}'.format(num)
        return data

    # Top-level fields follow the 'DATASET-<FIELD_UPPERCASED>' pattern.
    base_fields = ('author', 'author_email', 'license_id', 'maintainer',
                   'maintainer_email', 'name', 'notes', 'owner_org',
                   'private', 'state', 'type', 'url')
    raw_data = dict(
        (field, 'DATASET-{0}'.format(field.upper()))
        for field in base_fields)
    raw_data['id'] = 'dataset-1'
    raw_data['extras'] = dict(
        ('EXTRA_KEY_{0}'.format(i), 'EXTRA-VALUE-{0}'.format(i))
        for i in (1, 2, 3))
    raw_data['groups'] = ['GROUP-1', 'GROUP-2', 'GROUP-3']
    raw_data['relationships'] = []
    raw_data['resources'] = [_make_res(i) for i in (1, 2, 3)]

    _raw_data = copy.deepcopy(raw_data)
    assert raw_data == _raw_data

    # from_dict() must leave its input untouched
    dataset = CkanDataset.from_dict(_raw_data)
    assert raw_data == _raw_data
    assert dataset.to_dict() == raw_data

    # Attribute assignment is tracked
    dataset.author = 'My author'
    assert dataset.is_modified()
    assert dataset.to_dict()['author'] == 'My author'

    # Create a new dataset and edit its resources list
    dataset = CkanDataset.from_dict(raw_data)
    assert not dataset.is_modified()
    del dataset.resources[2]  # delete 'resource-3'
    assert dataset.is_modified()
    dataset.resources.append(CkanResource.from_dict(_make_res(4)))
    assert dataset.to_dict()['resources'] == [
        _make_res(1), _make_res(2), _make_res(4)]
def sync(self, source_name, data):
    """
    Synchronize data from a source into Ckan.

    - datasets are matched by _harvest_source
    - groups and organizations are matched by name

    :param source_name:
        String identifying the source of the data. Used to build
        ids that will be used in further synchronizations.
    :param data:
        Data to be synchronized. Should be a dict (or dict-like)
        with top level keys coresponding to the object type,
        mapping to dictionaries of ``{'id': <object>}``.
    """
    # Fix: use .items(), "except ... as e" and next() so this code is
    # valid on both Python 2 and Python 3 (``iteritems()``,
    # ``except E, e`` and ``itertools.count(1).next`` are py2-only).
    groups = dict(
        (key, CkanGroup(val)) for key, val in data['group'].items())
    organizations = dict(
        (key, CkanOrganization(val))
        for key, val in data['organization'].items())

    # Upsert groups and organizations
    groups_map = self._upsert_groups(groups)
    orgs_map = self._upsert_organizations(organizations)

    # Create list of datasets to be synced
    logger.info('Creating list of datasets to be synchronized')
    source_datasets = {}
    for source_id, dataset_dict in data['dataset'].items():
        _dataset_dict = copy.deepcopy(dataset_dict)

        # We need to make sure "source" datasets
        # don't have (otherwise misleading) ids
        _dataset_dict.pop('id', None)

        # We need to update groups and organizations, to map
        # their name from the source into a ckan id
        _dataset_dict['groups'] = [
            groups_map.to_ckan(grp_id)
            for grp_id in _dataset_dict['groups']]
        _dataset_dict['owner_org'] = \
            orgs_map.to_ckan(_dataset_dict['owner_org'])

        dataset = CkanDataset(_dataset_dict)

        # We also want to add the "source id", used for further
        # synchronizations to find stuff
        dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
            self._join_source_id(source_name, source_id)

        source_datasets[source_id] = dataset

    # Retrieve list of datasets from Ckan
    logger.info('Retrieving current status from Ckan')
    ckan_datasets = self._find_datasets_by_source(source_name)

    # Compare collections to find differences
    differences = self._compare_collections(
        ckan_datasets, source_datasets)

    # ------------------------------------------------------------
    # We now need to create/update/delete datasets.

    # todo: we need to make sure dataset names are not already used
    #       by another dataset. The only way is to randomize resource
    #       names and hope a 409 response indicates duplicate name..

    def _counter():
        # Portable replacement for ``itertools.count(1).next``
        # (the ``.next`` attribute only exists on Python 2).
        counter = itertools.count(1)

        def _next():
            return next(counter)

        return _next

    _prog_tot_add = len(differences['right'])
    _prog_next_add = _counter()
    _prog_tot_remove = len(differences['left'])
    _prog_next_remove = _counter()
    _prog_tot_update = len(differences['differing'])
    _prog_next_update = _counter()

    # Create progress bars early..
    report_progress(('datasets', 'delete'), 0, _prog_tot_remove)
    report_progress(('datasets', 'create'), 0, _prog_tot_add)
    report_progress(('datasets', 'update'), 0, _prog_tot_update)

    # We delete first, in order to (possibly) deallocate
    # some already-used names..
    for source_id in differences['left']:
        ckan_id = ckan_datasets[source_id].id
        logger.info('Deleting dataset {0}'.format(ckan_id))
        self._client.delete_dataset(ckan_id)
        report_progress(('datasets', 'delete'),
                        _prog_next_remove(), _prog_tot_remove)

    def force_dataset_operation(operation, dataset, retry=5):
        # Maximum dataset name length is 100 characters; we trim it
        # down to 80 just to be safe. Note: we generally want to
        # preserve the original name and there should *never* be
        # problems with that when updating..
        _orig_name = dataset.name[:80]
        dataset.name = _orig_name
        while True:
            try:
                result = operation(dataset)
            except HTTPError as e:
                if e.status_code != 409:
                    raise
                retry -= 1
                if retry < 0:
                    raise
                # 409 means a name clash: retry with a random suffix
                dataset.name = '{0}-{1:06d}'.format(
                    _orig_name, random.randint(0, 999999))
                logger.debug('Got 409: trying to rename dataset to {0}'
                             .format(dataset.name))
            else:
                return result