def test_include_org_filter(ckan):
    '''
    An `organization` filter restricts the harvest to that organization.

    Two organizations are created with two datasets each; the source is
    configured with a filter on the first organization's name, so only
    its two datasets are expected in the job and in the database.
    '''
    kept_org = ckan.action('organization_create', {'name': 'org-1'})['result']
    expected_ids = []
    for _ in range(2):
        created = package_factory(ckan, owner_org=kept_org['id'])
        expected_ids.append(created['id'])

    # Datasets of the second organization must be left out by the filter.
    other_org = ckan.action('organization_create', {'name': 'org-2'})['result']
    for _ in range(2):
        package_factory(ckan, owner_org=other_org['id'])

    org_filter = {'key': 'organization', 'value': kept_org['name']}
    source = HarvestSourceFactory(backend='ckan',
                                  url=ckan.BASE_URL,
                                  config={'filters': [org_filter]})

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) == len(expected_ids)

    for harvested in Dataset.objects:
        remote_id = harvested.extras['harvest:remote_id']
        assert remote_id in expected_ids
def feed_ckan_and_harvest(request, source, ckan, app):
    '''
    Feed CKAN with this module's data fixtures, harvest, return the results.

    Every collected test item of the requesting module that carries a
    `ckan_data` marker names a data fixture (the marker's first argument).
    Each of these fixtures is resolved and pushed to CKAN through
    `package_create`, then the source is harvested once.

    Returns a dict mapping each fixture name to a
    `(data, result, kwargs)` tuple, where `result` is the
    `package_create` response and `kwargs` is `{}` when the fixture
    value is not a tuple.
    '''
    current_module = request.module
    module_items = [
        item for item in request.session.items
        if item.module == current_module
    ]

    # Distinct fixture names referenced through the `ckan_data` marker.
    fixtures = set()
    for item in module_items:
        marker = item.get_closest_marker('ckan_data')
        if marker:
            fixtures.add(marker.args[0])

    rundata = {}
    for fixture in fixtures:
        values = request.getfixturevalue(fixture)
        # A fixture yields either a bare payload or a (payload, kwargs) pair.
        if isinstance(values, tuple):
            data, kwargs = values
        else:
            data, kwargs = values, {}
        result = ckan.action('package_create', data)
        rundata[fixture] = (data, result, kwargs)

    with app.app_context():
        actions.run(source.slug)
        source.reload()
        last_job = source.get_last_job()
        assert len(last_job.items) == len(fixtures)

    return rundata
def test_can_have_multiple_filters(ckan):
    '''
    Several filters can be combined on a single source.

    Two organizations are created with two datasets each (one tagged
    `tag-1`, one tagged `tag-2`). The source filters on both the first
    organization and `tag-1`, so exactly one dataset is expected.
    '''
    # create 2 organizations with 2 datasets each
    org = ckan.action('organization_create', {'name': 'org-1'})['result']
    # The only package matching both filters.
    package = package_factory(ckan, owner_org=org['id'],
                              tags=[{'name': 'tag-1'}])
    package_factory(ckan, owner_org=org['id'], tags=[{'name': 'tag-2'}])

    org2 = ckan.action('organization_create', {'name': 'org-2'})['result']
    # Plain call statements: the original had stray trailing commas here,
    # which wrapped each call in a discarded one-element tuple.
    package_factory(ckan, owner_org=org2['id'], tags=[{'name': 'tag-1'}])
    package_factory(ckan, owner_org=org2['id'], tags=[{'name': 'tag-2'}])

    source = HarvestSourceFactory(backend='ckan', url=ckan.BASE_URL, config={
        'filters': [
            {'key': 'organization', 'value': org['name']},
            {'key': 'tags', 'value': 'tag-1'},
        ]
    })

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    assert len(job.items) == 1
    assert Dataset.objects.count() == 1
    assert Dataset.objects.first().extras['harvest:remote_id'] == package['id']
def test_dkan_french_w_license(app, rmock):
    '''
    DKAN harvester should import the `dkan-french-w-license.json` payload.

    The remote API is mocked: `package_list` advertises a single package
    and `package_show` serves the JSON data file. The harvested dataset
    is then checked for its dates and resources.
    '''
    DKAN_URL = 'https://harvest.me/'
    API_URL = '{}api/3/action/'.format(DKAN_URL)
    PACKAGE_LIST_URL = '{}package_list'.format(API_URL)
    PACKAGE_SHOW_URL = '{}package_show'.format(API_URL)

    # Parse straight from the file object instead of read() + loads().
    with open(data_path('dkan-french-w-license.json')) as ifile:
        data = json.load(ifile)

    org = OrganizationFactory()
    source = HarvestSourceFactory(backend='dkan', url=DKAN_URL,
                                  organization=org)
    rmock.get(PACKAGE_LIST_URL,
              json={'success': True, 'result': ['fake-name']},
              status_code=200,
              headers={'Content-Type': 'application/json'})
    rmock.get(PACKAGE_SHOW_URL, json=data, status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == 'done'

    datasets = Dataset.objects.filter(organization=org)
    assert len(datasets) > 0

    # Look the dataset up by the remote identifier stored in its extras.
    q = {'extras__harvest:remote_id': '04be6288-696d-4331-850d-a144871a7e3a'}
    dataset = datasets.get(**q)
    assert dataset.created_at == datetime(2019, 12, 10, 0, 0)
    assert dataset.last_modified == datetime(2019, 9, 30, 0, 0)
    assert len(dataset.resources) == 2
    assert 'xlsx' in [r.format for r in dataset.resources]
def test_simple():
    '''Harvesting a source of this backend should produce at least one item.'''
    owner = OrganizationFactory()
    source = HarvestSourceFactory(
        backend='{{ cookiecutter.identifier }}',
        url=faker.url(),
        organization=owner,
    )

    # TODO: mock remote endpoints responses

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) > 0
def test_minimal_ckan_response(rmock):
    '''CKAN Harvester should accept the minimum dataset payload'''
    CKAN_URL = 'https://harvest.me/'
    API_URL = CKAN_URL + 'api/3/action/'
    PACKAGE_LIST_URL = API_URL + 'package_list'
    PACKAGE_SHOW_URL = API_URL + 'package_show'
    json_headers = {'Content-Type': 'application/json'}

    name = faker.unique_string()
    package = {
        'id': faker.uuid4(),
        'name': name,
        'title': faker.sentence(),
        'maintainer': faker.name(),
        'tags': [],
        'private': False,
        'maintainer_email': faker.email(),
        'license_id': None,
        'metadata_created': faker.iso8601(),
        'organization': None,
        'metadata_modified': faker.iso8601(),
        'author': None,
        'author_email': None,
        'notes': faker.paragraph(),
        'license_title': None,
        'state': None,
        'revision_id': faker.unique_string(),
        'type': 'dataset',
        'resources': [],
        # extras is not always present so we exclude it from the minimal payload
    }
    show_payload = {'success': True, 'result': package}
    list_payload = {'success': True, 'result': [name]}

    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    rmock.get(PACKAGE_LIST_URL, json=list_payload, status_code=200,
              headers={'Content-Type': 'application/json'})
    rmock.get(PACKAGE_SHOW_URL, json=show_payload, status_code=200,
              headers=json_headers)

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert last_job.status == 'done'
def test_dkan_demo_harvest(source, app):
    '''
    Harvest DKAN_TEST_INSTANCE and check some datasets are created
    '''
    # NOTE(review): the original indentation is lost, so the extent of the
    # app-context block is assumed; everything is kept inside it.
    with app.app_context():
        actions.run(source.slug)
        source.reload()

        last_job = source.get_last_job()
        assert len(last_job.items) > 0

        # One dataset per harvested item, all owned by the source organization.
        harvested = Dataset.objects.filter(organization=source.organization)
        assert len(last_job.items) == harvested.count()

        for dataset in harvested:
            assert len(dataset.resources) > 0

        assert last_job.status == 'done'
def test_standard_api_json_error(rmock):
    '''
    A JSON error response with a plain string `error` is reported as-is.

    The mocked API answers HTTP 200 with `success: False`; the job is
    expected to hold no item and a single error carrying that string.
    '''
    payload = {'success': False, 'error': 'an error'}
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    rmock.get(API_URL, json=payload, status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` against an int literal tests identity
    # and raises a SyntaxWarning on Python 3.8+.
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    assert error.message == 'an error'
def test_plain_text_error(rmock, code):
    '''
    A plain-text error body is reported as the job error message.

    `code` is the HTTP status code served by the mocked API; it is
    supplied from outside this function (presumably by parametrization,
    not visible here).
    '''
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    rmock.get(API_URL, text='"Some error"', status_code=code,
              headers={'Content-Type': 'text/plain'})

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` against an int literal tests identity
    # and raises a SyntaxWarning on Python 3.8+.
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    # Raw quoted string is properly unquoted
    assert error.message == 'Some error'
def test_html_error(rmock, code):
    '''
    An HTML error body must not end up verbatim in the job error message.

    `code` is the HTTP status code served by the mocked API; it is
    supplied from outside this function (presumably by parametrization,
    not visible here).
    '''
    # Happens with wrong source URL (html is returned instead of json)
    html = '<html><body>Error</body></html>'
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    rmock.get(API_URL, text=html, status_code=code,
              headers={'Content-Type': 'text/html'})

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` against an int literal tests identity
    # and raises a SyntaxWarning on Python 3.8+.
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    # HTML is detected and does not clutter the message
    assert html not in error.message
def test_tag_filter(ckan):
    '''
    A `tags` filter restricts the harvest to datasets carrying that tag.

    Two datasets are created with one tag each; only the dataset whose
    tag matches the filter value is expected to be harvested.
    '''
    wanted_tag = faker.word()
    expected = package_factory(ckan, tags=[{'name': wanted_tag}])
    # NOTE(review): the second tag is also drawn from faker.word();
    # presumably it could collide with the first one - confirm.
    package_factory(ckan, tags=[{'name': faker.word()}])

    tag_filter = {'key': 'tags', 'value': wanted_tag}
    source = HarvestSourceFactory(backend='ckan',
                                  url=ckan.BASE_URL,
                                  config={'filters': [tag_filter]})

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) == 1
    assert Dataset.objects.count() == 1

    harvested = Dataset.objects.first()
    assert harvested.extras['harvest:remote_id'] == expected['id']
def test_standard_api_json_error_with_details_and_type(rmock):
    '''
    A structured JSON error is reported as "<__type>: <message>".

    The mocked API answers HTTP 200 with `success: False` and an error
    object holding `message` and `__type`; the job is expected to hold
    no item and a single error combining both fields.
    '''
    payload = {
        'success': False,
        'error': {
            'message': 'Access denied',
            '__type': 'Authorization Error',
        }
    }
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    rmock.get(API_URL, json=payload, status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` against an int literal tests identity
    # and raises a SyntaxWarning on Python 3.8+.
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    assert error.message == 'Authorization Error: Access denied'
def test_simple(rmock):
    '''
    Harvest the mocked ODS search response and check the resulting datasets.

    The job is expected to hold 4 items, of which only `test-a` and
    `test-b` become datasets; their metadata and export resources are
    checked in detail.
    '''
    # Every license id the backend can map to must exist beforehand.
    for license_id in set(OdsBackend.LICENSES.values()):
        License.objects.create(id=license_id, title=license_id)

    owner = OrganizationFactory()
    source = HarvestSourceFactory(backend='ods',
                                  url=ODS_URL,
                                  organization=owner)

    api_url = ODS_URL + '/api/datasets/1.0/search/'
    rmock.get(api_url, text=ods_response('search.json'),
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)

    # Query string of the last request sent to the mocked API.
    last_query = urlparse(rmock.last_request.url).query
    assert parse_qs(last_query) == {
        'start': ['0'],
        'rows': ['50'],
        'interopmetas': ['true']
    }

    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) == 4
    assert last_job.status == 'done'

    # Index the harvested datasets by their remote identifier.
    datasets = {}
    for harvested in Dataset.objects:
        datasets[harvested.extras['harvest:remote_id']] = harvested

    assert len(datasets) == 2

    assert 'test-a' in datasets
    test_a = datasets['test-a']
    assert test_a.title == 'test-a'
    assert test_a.description == 'test-a-description'
    assert test_a.tags == [
        'culture', 'environment', 'heritage', 'keyword1', 'keyword2'
    ]
    assert test_a.extras['ods:references'] == 'http://example.com'
    assert test_a.extras['ods:has_records']
    assert test_a.extras['harvest:remote_id'] == 'test-a'
    assert test_a.extras['harvest:domain'] == 'etalab-sandbox.opendatasoft.com'
    assert test_a.extras[
        'ods:url'] == 'http://etalab-sandbox.opendatasoft.com/explore/dataset/test-a/'  # noqa
    assert test_a.license.id == 'fr-lo'

    assert len(test_a.resources) == 2

    csv_export = test_a.resources[0]
    assert csv_export.title == 'Export au format CSV'
    assert csv_export.description is not None
    assert csv_export.format == 'csv'
    assert csv_export.mime == 'text/csv'
    assert isinstance(csv_export.modified, datetime)
    assert csv_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                              'explore/dataset/test-a/download'
                              '?format=csv&timezone=Europe/Berlin'
                              '&use_labels_for_header=true')

    json_export = test_a.resources[1]
    assert json_export.title == 'Export au format JSON'
    assert json_export.description is not None
    assert json_export.format == 'json'
    assert json_export.mime == 'application/json'
    assert isinstance(json_export.modified, datetime)
    assert json_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                               'explore/dataset/test-a/download'
                               '?format=json&timezone=Europe/Berlin'
                               '&use_labels_for_header=true')

    # test-b has geo feature
    assert 'test-b' in datasets
    test_b = datasets['test-b']
    assert test_b.tags == [
        'buildings', 'equipment', 'housing', 'keyword1', 'spatial-planning',
        'town-planning'
    ]

    assert len(test_b.resources) == 4

    geojson_export = test_b.resources[2]
    assert geojson_export.title == 'Export au format GeoJSON'
    assert geojson_export.description is not None
    assert geojson_export.format == 'json'
    assert geojson_export.mime == 'application/vnd.geo+json'
    assert geojson_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                                  'explore/dataset/test-b/download'
                                  '?format=geojson&timezone=Europe/Berlin'
                                  '&use_labels_for_header=true')

    shp_export = test_b.resources[3]
    assert shp_export.title == 'Export au format Shapefile'
    assert shp_export.description is not None
    assert shp_export.format == 'shp'
    assert shp_export.mime is None
    assert shp_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                              'explore/dataset/test-b/download'
                              '?format=shp&timezone=Europe/Berlin'
                              '&use_labels_for_header=true')

    # test-c has no data
    assert 'test-c' not in datasets

    # test-d is INSPIRE
    assert 'test-d' not in datasets