def test_gather_normal(self):
    """Run the gather stage against the mock CKAN and check its output.

    Asserts that a list with one id per mock dataset is returned and that
    the first harvest object carries the first mock dataset's id as guid.
    """
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    job = HarvestJobObj(source=HarvestSourceObj(url=source_url))
    harvester = CKANHarvester()

    gathered_ids = harvester.gather_stage(job)

    assert_equal(type(gathered_ids), list)
    assert_equal(len(gathered_ids), len(mock_ckan.DATASETS))
    first_object = harvest_model.HarvestObject.get(gathered_ids[0])
    assert_equal(first_object.guid, mock_ckan.DATASETS[0]['id'])
def test_get_content_handles_request_exception(
        self, mock_requests_get, mock_config, mock_pyopenssl_inject):
    """Check that ``_get_content`` reports a request failure as ContentFetchError.

    The ``mock_*`` arguments are injected from outside this block
    (presumably by patch decorators that make the request raise -- confirm
    against the decorators). Only ``mock_config`` is configured here.
    """
    mock_config.return_value = {}
    harvester = CKANHarvester()
    target_url = "http://test.example.gov.uk"

    with assert_raises(ContentFetchError) as raised:
        harvester._get_content(target_url)

    # Checked after the block: code following the raising call inside the
    # ``with`` body would never run.
    assert str(raised.exception) == 'Request error: Test exception'
def test_get_content_handles_http_error(
        self, mock_requests_get, mock_config, mock_pyopenssl_inject):
    """Check that ``_get_content`` reports an HTTP error as ContentFetchError.

    The ``mock_*`` arguments are injected from outside this block
    (presumably by patch decorators that produce a 404 response -- confirm
    against the decorators). Only ``mock_config`` is configured here.
    """
    mock_config.return_value = {}
    harvester = CKANHarvester()
    target_url = "http://test.example.gov.uk"

    with pytest.raises(ContentFetchError) as raised:
        harvester._get_content(target_url)

    # Checked after the block: code following the raising call inside the
    # ``with`` body would never run.
    expected_message = 'HTTP error: 404 http://test.example.gov.uk'
    assert str(raised.value) == expected_message
def test_fetch_normal(self):
    """Run the fetch stage on a prepared harvest object and expect success.

    The harvest object is built with the first mock dataset as its JSON
    content; the stage must return True and record no errors.
    """
    first_dataset = mock_ckan.DATASETS[0]
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    job = HarvestJobObj(source=HarvestSourceObj(url=source_url))
    harvest_object = HarvestObjectObj(
        guid=first_dataset['id'],
        job=job,
        content=json.dumps(first_dataset),
    )
    harvester = CKANHarvester()

    outcome = harvester.fetch_stage(harvest_object)

    assert_equal(harvest_object.errors, [])
    assert_equal(outcome, True)
def test_gather_normal(self):
    """Run the gather stage against the mock CKAN and check its output.

    Asserts that no gather errors were recorded, that a list with one id
    per mock dataset is returned, and that the first harvest object holds
    the first mock dataset's id as guid and its JSON as content.
    """
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    job = HarvestJobObj(source=HarvestSourceObj(url=source_url))
    harvester = CKANHarvester()

    gathered_ids = harvester.gather_stage(job)

    assert job.gather_errors == []
    assert type(gathered_ids) == list
    assert len(gathered_ids) == len(mock_ckan.DATASETS)

    expected_dataset = mock_ckan.DATASETS[0]
    first_object = harvest_model.HarvestObject.get(gathered_ids[0])
    assert first_object.guid == expected_dataset['id']
    assert json.loads(first_object.content) == expected_dataset
def test_fetch_normal(self):
    """Run the fetch stage on a prepared harvest object and expect success.

    The harvest object is built with the first mock dataset as its JSON
    content; the stage must return True and record no errors.
    """
    dataset = mock_ckan.DATASETS[0]
    harvest_source = HarvestSourceObj(
        url='http://localhost:%s/' % mock_ckan.PORT)
    harvest_job = HarvestJobObj(source=harvest_source)
    harvest_object = HarvestObjectObj(
        guid=dataset['id'],
        job=harvest_job,
        content=json.dumps(dataset),
    )

    harvester = CKANHarvester()
    fetched = harvester.fetch_stage(harvest_object)

    assert_equal(harvest_object.errors, [])
    assert_equal(fetched, True)
def test_import_normal(self):
    """Run the import stage for the first mock dataset and check the result.

    The stage must return True without errors, set ``package_id`` on the
    harvest object, and create a package named like the mock dataset.
    """
    first_dataset = mock_ckan.DATASETS[0]
    owner_org = Organization()
    harvest_object = HarvestObjectObj(
        guid=first_dataset['id'],
        content=json.dumps(first_dataset),
        job__source__owner_org=owner_org['id'],
    )
    harvester = CKANHarvester()

    outcome = harvester.import_stage(harvest_object)

    assert_equal(harvest_object.errors, [])
    assert_equal(outcome, True)
    assert harvest_object.package_id
    imported = model.Package.get(harvest_object.package_id)
    assert_equal(imported.name, first_dataset['name'])
def test_harvest_not_modified(self):
    """Harvest the same source twice and check the second run reports no change.

    The result of the second run for the second mock dataset must be
    COMPLETE with report status 'not modified', carry no 'dataset' entry,
    and have no errors.
    """
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    run_harvest(url=source_url, harvester=CKANHarvester())

    results_by_guid = run_harvest(url=source_url, harvester=CKANHarvester())

    # The metadata_modified was the same for this dataset so the import
    # would have returned 'unchanged'.
    # The results are keyed by guid, so look the dataset up by its id
    # rather than by its name.
    result = results_by_guid[mock_ckan.DATASETS[1]['id']]
    assert_equal(result['state'], 'COMPLETE')
    assert_equal(result['report_status'], 'not modified')
    assert 'dataset' not in result
    assert_equal(result['errors'], [])
def test_import_normal(self):
    """Run the import stage for the first mock dataset and check the result.

    The stage must return True without errors, set ``package_id`` on the
    harvest object, and create a package named like the mock dataset.
    """
    org = Organization()
    source_dataset = mock_ckan.DATASETS[0]
    object_fields = {
        'guid': source_dataset['id'],
        'content': json.dumps(source_dataset),
        'job__source__owner_org': org['id'],
    }
    harvest_object = HarvestObjectObj(**object_fields)

    harvester = CKANHarvester()
    imported_ok = harvester.import_stage(harvest_object)

    assert_equal(harvest_object.errors, [])
    assert_equal(imported_ok, True)
    assert harvest_object.package_id
    package = model.Package.get(harvest_object.package_id)
    assert_equal(package.name, source_dataset['name'])
def test_harvest_not_modified(self):
    """Harvest the same source twice and check the second run reports no change.

    The result of the second run for the second mock dataset must be
    COMPLETE with report status 'not modified', carry no 'dataset' entry,
    have no errors, and leave the last job considered error free.
    """
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    run_harvest(url=source_url, harvester=CKANHarvester())

    second_run = run_harvest(url=source_url, harvester=CKANHarvester())

    # The metadata_modified was the same for this dataset so the import
    # would have returned 'unchanged'
    unchanged = second_run[mock_ckan.DATASETS[1]['id']]
    assert unchanged['state'] == 'COMPLETE'
    assert unchanged['report_status'] == 'not modified'
    assert 'dataset' not in unchanged
    assert unchanged['errors'] == []
    assert was_last_job_considered_error_free()
def test_default_extras(self):
    """Harvest with ``default_extras`` configured and check the resulting extras.

    The expected ``harvest_url`` is derived from the same source URL that is
    passed to ``run_harvest``, so the check does not depend on a particular
    value of ``mock_ckan.PORT``.
    """
    source_url = 'http://localhost:%s' % mock_ckan.PORT
    config = {
        'default_extras': {
            'encoding': 'utf8',
            'harvest_url': '{harvest_source_url}/dataset/{dataset_id}',
        }
    }
    tmp_c = toolkit.c
    try:
        # c.user is used by the validation (annoying),
        # however patch doesn't work because it's a weird
        # StackedObjectProxy, so we swap it manually
        toolkit.c = MagicMock(user='')
        results_by_guid = run_harvest(
            url=source_url,
            harvester=CKANHarvester(),
            config=json.dumps(config))
    finally:
        # Always restore the real context object, even if the harvest fails.
        toolkit.c = tmp_c

    assert_equal(results_by_guid['dataset1-id']['errors'], [])
    extras = results_by_guid['dataset1-id']['dataset']['extras']
    extras_dict = {e['key']: e['value'] for e in extras}
    assert_equal(extras_dict['encoding'], 'utf8')
    assert_equal(extras_dict['harvest_url'],
                 source_url + '/dataset/dataset1-id')
def test_default_groups(self):
    """Harvest with ``default_groups`` configured and check the dataset's groups.

    Three local groups are created first. The config lists one default
    group by id and one by name, with ``remote_groups`` set to
    'only_local'.
    """
    for suffix in ('1', '2', '3'):
        Group(id='group%s-id' % suffix, name='group%s' % suffix)

    config = {
        'default_groups': ['group2-id', 'group3'],
        'remote_groups': 'only_local',
    }
    original_c = toolkit.c
    try:
        # c.user is used by the validation (annoying),
        # however patch doesn't work because it's a weird
        # StackedObjectProxy, so we swap it manually
        toolkit.c = MagicMock(user='')
        results_by_guid = run_harvest(
            url='http://localhost:%s' % mock_ckan.PORT,
            harvester=CKANHarvester(),
            config=json.dumps(config))
    finally:
        toolkit.c = original_c

    dataset_result = results_by_guid['dataset1-id']
    assert_equal(dataset_result['errors'], [])
    group_names = {group['name'] for group in dataset_result['dataset']['groups']}
    # group1 comes from the harvested dataset
    # group2 & 3 come from the default_groups
    assert_equal(group_names, {'group1', 'group2', 'group3'})
def gather_stage(self, harvest_job):
    """Create missing remote organizations locally, then run the CKAN gather.

    Fetches ``organization_list`` from the harvest source. For every remote
    organization whose name is not already a local organization, fetches
    ``organization_show`` and passes the result to the local
    ``organization_create`` action. A response whose ``success`` flag is
    falsy is skipped silently.

    :param harvest_job: job whose ``source.url`` is the remote base URL
    :returns: whatever ``CKANHarvester.gather_stage`` returns for the job
    """
    # make sure we have all the right organizations
    url = harvest_job.source.url

    # One session is shared by all requests and closed when done.
    with requests.Session() as session:
        listing = session.get(
            "{}/api/action/organization_list".format(url)).json()
        if listing["success"]:
            # A set makes the per-organization membership test O(1).
            local_organization_names = {
                org.name for org in model.Group.all("organization")
            }
            for remote_org in listing['result']:
                if remote_org in local_organization_names:
                    continue
                # A fresh context per action call, built before the request
                # as in the original ordering.
                context = {
                    'model': model,
                    'session': Session,
                    'user': self._get_user_name(),
                    'ignore_auth': True,
                }
                shown = session.get(
                    "{}/api/action/organization_show?id={}".format(
                        url, remote_org)).json()
                if shown["success"]:
                    p.toolkit.get_action("organization_create")(
                        context, shown['result'])

    return CKANHarvester.gather_stage(self, harvest_job)
def was_last_job_considered_error_free():
    """Return whether the harvester finds an error-free job for the latest source.

    Takes the most recently created HarvestJob, builds a stand-in job with
    the same source and an empty id, and returns the truthiness of
    ``CKANHarvester._last_error_free_job`` for that stand-in.
    """
    job_model = harvest_model.HarvestJob
    newest_first = model.Session.query(job_model).order_by(
        job_model.created.desc())
    latest_job = newest_first.first()

    stand_in = MagicMock(source=latest_job.source, id='')
    found = CKANHarvester._last_error_free_job(stand_in)
    return bool(found)
def test_default_tags_invalid(self):
    """Check that a ``default_tags`` list of plain strings is rejected.

    The harvest must raise ``toolkit.ValidationError`` whose text mentions
    that default_tags must be a list of dictionaries.
    """
    # should be list of dicts
    bad_config = json.dumps({'default_tags': ['geo']})
    source_url = 'http://localhost:%s' % mock_ckan.PORT

    with assert_raises(toolkit.ValidationError) as harvest_context:
        run_harvest(url=source_url,
                    harvester=CKANHarvester(),
                    config=bad_config)

    error_text = str(harvest_context.exception)
    assert_in('default_tags must be a list of dictionaries', error_text)
def test_default_extras_invalid(self):
    """Check that a non-dict ``default_extras`` value is rejected.

    ``assert_raises`` takes the expected exception class first and the
    callable second. Passing ``run_harvest`` as the first argument, as the
    earlier form did, never calls it and so verifies nothing.
    """
    config = {
        'default_extras': 'utf8',  # value should be a dict
    }
    assert_raises(toolkit.ValidationError,
                  run_harvest,
                  url='http://localhost:%s' % mock_ckan.PORT,
                  harvester=CKANHarvester(),
                  config=json.dumps(config))
def test_harvest_whilst_datasets_added(self):
    """Harvest the ``/datasets_added`` mock endpoint and check which guids result.

    Both sides of the comparison are sorted, so the check does not depend
    on how the mock dataset ids happen to order lexically.
    """
    results_by_guid = run_harvest(
        url='http://localhost:%s/datasets_added' % mock_ckan.PORT,
        harvester=CKANHarvester())

    expected_guids = sorted(
        [mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[0]['id']])
    assert_equal(sorted(results_by_guid.keys()), expected_guids)
def test_include_groups(self):
    """Harvest with ``groups_filter_include`` set to group1.

    'dataset1-id' must be among the results and the second mock dataset's
    id must not be.
    """
    harvest_config = json.dumps({'groups_filter_include': ['group1']})
    source_url = 'http://localhost:%s' % mock_ckan.PORT

    results_by_guid = run_harvest(url=source_url,
                                  harvester=CKANHarvester(),
                                  config=harvest_config)

    excluded_guid = mock_ckan.DATASETS[1]['id']
    assert 'dataset1-id' in results_by_guid
    assert excluded_guid not in results_by_guid
def test_exclude_organizations(self):
    """Harvest with ``organizations_filter_exclude`` set to org1.

    'dataset1-id' must be absent from the results and the second mock
    dataset's id must be present.
    """
    harvest_config = json.dumps({'organizations_filter_exclude': ['org1']})
    source_url = 'http://localhost:%s' % mock_ckan.PORT

    results_by_guid = run_harvest(url=source_url,
                                  harvester=CKANHarvester(),
                                  config=harvest_config)

    kept_guid = mock_ckan.DATASETS[1]['id']
    assert 'dataset1-id' not in results_by_guid
    assert kept_guid in results_by_guid
def was_last_job_considered_error_free():
    """Return whether the harvester finds an error-free job for the latest source.

    Takes the most recently created HarvestJob, builds a stand-in job with
    the same source and an empty id, and returns the truthiness of
    ``CKANHarvester.last_error_free_job`` for that stand-in.
    """
    job_model = ckanext.harvest.model.HarvestJob
    jobs_newest_first = ckan.model.Session.query(job_model).order_by(
        job_model.created.desc())
    latest_job = jobs_newest_first.first()

    stand_in = mock.MagicMock(source=latest_job.source, id='')
    found = CKANHarvester.last_error_free_job(stand_in)
    return bool(found)
def test_default_tags(self):
    """Harvest with a ``default_tags`` entry and check it lands on the dataset."""
    harvest_config = json.dumps({'default_tags': [{'name': 'geo'}]})

    results_by_guid = run_harvest(
        url='http://localhost:%s' % mock_ckan.PORT,
        harvester=CKANHarvester(),
        config=harvest_config)

    harvested_tags = results_by_guid['dataset1-id']['dataset']['tags']
    assert 'geo' in [tag['name'] for tag in harvested_tags]
def test_remote_groups_create(self):
    """Harvest with ``remote_groups`` set to 'create' and look up the remote group.

    After the harvest, ``group_show`` is called locally for the first mock
    group's id; the test relies on that call not raising.
    """
    harvest_config = json.dumps({'remote_groups': 'create'})

    results_by_guid = run_harvest(
        url='http://localhost:%s' % mock_ckan.PORT,
        harvester=CKANHarvester(),
        config=harvest_config)
    assert 'dataset1-id' in results_by_guid

    # Check that the remote group was created locally
    remote_group_id = mock_ckan.GROUPS[0]['id']
    call_action('group_show', {}, id=remote_group_id)
def test_default_groups_invalid(self):
    """Check that a ``default_groups`` list of dicts is rejected.

    The config must use the ``default_groups`` key: a list of dicts under
    ``default_tags`` is a valid tag configuration and would not fail.
    ``assert_raises`` takes the expected exception class first and the
    callable second; passing ``run_harvest`` first never calls it.
    """
    Group(id='group2-id', name='group2')

    # should be list of strings
    config = {'default_groups': [{'name': 'group2'}]}
    assert_raises(toolkit.ValidationError,
                  run_harvest,
                  url='http://localhost:%s' % mock_ckan.PORT,
                  harvester=CKANHarvester(),
                  config=json.dumps(config))
def test_default_extras_invalid(self):
    """Check that a non-dict ``default_extras`` value is rejected.

    The harvest must raise ``toolkit.ValidationError`` whose text mentions
    that default_extras must be a dictionary.
    """
    # value should be a dict
    bad_config = json.dumps({'default_extras': 'utf8'})
    source_url = 'http://localhost:%s' % mock_ckan.PORT

    with assert_raises(toolkit.ValidationError) as harvest_context:
        run_harvest(url=source_url,
                    harvester=CKANHarvester(),
                    config=bad_config)

    error_text = str(harvest_context.exception)
    assert_in('default_extras must be a dictionary', error_text)
def test_harvest_invalid_tag(self):
    """Harvest the ``/invalid_tag`` mock endpoint (currently always skipped).

    The test raises SkipTest unconditionally, so the harvest and the
    assertions below it never run.
    """
    from nose.plugins.skip import SkipTest
    raise SkipTest()

    # Unreachable while the skip above is in place.
    source_url = 'http://localhost:%s/invalid_tag' % mock_ckan.PORT
    results_by_guid = run_harvest(url=source_url, harvester=CKANHarvester())

    added = results_by_guid['dataset1-id']
    assert_equal(added['state'], 'COMPLETE')
    assert_equal(added['report_status'], 'added')
    assert_equal(added['dataset']['name'], mock_ckan.DATASETS[0]['name'])
def test_default_groups_invalid(self):
    """Check that a ``default_groups`` list of dicts is rejected.

    The harvest must raise ``toolkit.ValidationError`` whose text mentions
    that default_groups must be a list of group names/ids.
    """
    Group(id='group2-id', name='group2')

    # should be list of strings
    bad_config = json.dumps({'default_groups': [{'name': 'group2'}]})
    source_url = 'http://localhost:%s' % mock_ckan.PORT

    with assert_raises(toolkit.ValidationError) as harvest_context:
        run_harvest(url=source_url,
                    harvester=CKANHarvester(),
                    config=bad_config)

    error_text = str(harvest_context.exception)
    assert_in('default_groups must be a list of group names/ids', error_text)
def test_harvest_twice(self):
    """Harvest, bump one dataset's modification date, and harvest again.

    The second run is made with ``mock_ckan.DATASETS`` patched so that the
    second dataset has a later ``metadata_modified``. That dataset must be
    reported as 'updated'; the first dataset must not appear in the second
    run's results at all.
    """
    source_url = 'http://localhost:%s/' % mock_ckan.PORT
    run_harvest(url=source_url, harvester=CKANHarvester())

    # change the modified date
    datasets = copy.deepcopy(mock_ckan.DATASETS)
    datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366'
    with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
               datasets):
        results_by_guid = run_harvest(url=source_url,
                                      harvester=CKANHarvester())

    # updated the dataset which has revisions
    # (results are keyed by guid, so look up by id, not by name)
    result = results_by_guid[mock_ckan.DATASETS[1]['id']]
    assert_equal(result['state'], 'COMPLETE')
    assert_equal(result['report_status'], 'updated')
    assert_equal(result['dataset']['name'], mock_ckan.DATASETS[1]['name'])
    assert_equal(result['errors'], [])

    # the other dataset is unchanged and not harvested: check the results
    # mapping for the *first* dataset, not the per-dataset result dict
    assert mock_ckan.DATASETS[0]['id'] not in results_by_guid
def test_harvest_info_in_package_show(self):
    """Harvest, then check ``package_show`` exposes the harvest extras.

    The first mock dataset is fetched with ``for_view`` set in the context
    and its extras must include the harvest object id, source id and
    source title keys.
    """
    results_by_guid = run_harvest(
        url='http://localhost:%s' % mock_ckan.PORT,
        harvester=CKANHarvester())
    assert 'dataset1-id' in results_by_guid

    # Check that the dataset extras has the harvest_object_id,
    # harvest_source_id, and harvest_source_title
    shown = call_action('package_show', {"for_view": True},
                        id=mock_ckan.DATASETS[0]['id'])
    extras_by_key = {extra['key']: extra['value'] for extra in shown['extras']}
    assert 'harvest_object_id' in extras_by_key
    assert 'harvest_source_id' in extras_by_key
    assert 'harvest_source_title' in extras_by_key
def test_default_extras(self):
    """Harvest with ``default_extras`` configured and check the resulting extras.

    The expected ``harvest_url`` is derived from the same source URL that is
    passed to ``run_harvest``, so the check does not depend on a particular
    value of ``mock_ckan.PORT``.
    """
    source_url = 'http://localhost:%s' % mock_ckan.PORT
    config = {
        'default_extras': {
            'encoding': 'utf8',
            'harvest_url': '{harvest_source_url}/dataset/{dataset_id}',
        }
    }

    results_by_guid = run_harvest(
        url=source_url,
        harvester=CKANHarvester(),
        config=json.dumps(config))

    assert_equal(results_by_guid['dataset1-id']['errors'], [])
    extras = results_by_guid['dataset1-id']['dataset']['extras']
    extras_dict = {e['key']: e['value'] for e in extras}
    assert_equal(extras_dict['encoding'], 'utf8')
    assert_equal(extras_dict['harvest_url'],
                 source_url + '/dataset/dataset1-id')
def test_harvest(self):
    """Run a full harvest and check both mock datasets are reported as added."""
    results_by_guid = run_harvest(
        url='http://localhost:%s/' % mock_ckan.PORT,
        harvester=CKANHarvester())

    # Pair each result key with the mock dataset it should correspond to.
    expectations = (
        ('dataset1-id', mock_ckan.DATASETS[0]),
        (mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[1]),
    )
    for guid, dataset in expectations:
        outcome = results_by_guid[guid]
        assert_equal(outcome['state'], 'COMPLETE')
        assert_equal(outcome['report_status'], 'added')
        assert_equal(outcome['dataset']['name'], dataset['name'])
        assert_equal(outcome['errors'], [])
def test_harvest(self):
    """Run a full harvest and check both mock datasets are reported as added.

    Also checks that the last job is considered error free afterwards.
    """
    results_by_guid = run_harvest(
        url='http://localhost:%s/' % mock_ckan.PORT,
        harvester=CKANHarvester())

    # Pair each result key with the mock dataset it should correspond to.
    expectations = (
        ('dataset1-id', mock_ckan.DATASETS[0]),
        (mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[1]),
    )
    for guid, dataset in expectations:
        outcome = results_by_guid[guid]
        assert outcome['state'] == 'COMPLETE'
        assert outcome['report_status'] == 'added'
        assert outcome['dataset']['name'] == dataset['name']
        assert outcome['errors'] == []

    assert was_last_job_considered_error_free()
def get_harvested_package_dict(cls, harvest_object):
    """Return the harvested package dict with DKAN conventions mapped to CKAN's.

    Starts from ``CKANHarvester.get_harvested_package_dict`` and then fills
    in ``extras``, ``name``, ``notes`` and ``license_id``, normalizes each
    resource, and forces ``private`` to False when the key is present.

    :param harvest_object: harvest object the package is read from
    :returns: the adjusted package dict, or None if any step of the
        conversion raised (the error is saved against ``harvest_object``)
    """
    package = CKANHarvester.get_harvested_package_dict(harvest_object)

    # change the DKAN-isms into CKAN-style
    try:
        if 'extras' not in package:
            package['extras'] = {}
        if 'name' not in package:
            package['name'] = munge.munge_title_to_name(package['title'])
        if 'description' in package:
            package['notes'] = package['description']

        # First registered license whose title matches, else 'notspecified'.
        matching_license_ids = (
            registered.id
            for registered in model.Package.get_license_register().values()
            if registered.title == package['license_title']
        )
        package['license_id'] = next(matching_license_ids, 'notspecified')

        if 'resources' not in package:
            raise PackageDictError('Dataset has no resources')
        for resource in package['resources']:
            resource['description'] = resource['title']
            if 'revision_id' in resource:
                del resource['revision_id']
            if 'format' not in resource:
                mimetype = resource.get('mimetype')
                resource['format'] = MIMETYPE_FORMATS.get(mimetype, '')

        if 'private' in package:
            # DKAN appears to have datasets with private=True which are
            # still public: https://github.com/NuCivic/dkan/issues/950. If
            # they were really private then we'd not get be able to access
            # them, so assume they are not private.
            package['private'] = False
    except Exception as err:
        # Any failure, including the PackageDictError raised above, is
        # recorded on the harvest object instead of propagating.
        cls._save_object_error(
            'Unable to get convert DKAN to CKAN package: %s' % err,
            harvest_object)
        return None

    return package