def test_list_harvest_sources_with_pagination(mock_requests):
    """Verify list_harvest_sources walks result pages one item at a time."""
    ckan = RemoteCKAN(mock_url)
    first_sentinel = mock.sentinel.harvest_source_1
    second_sentinel = mock.sentinel.harvest_source_2

    def search_params(offset):
        # Query params expected for a single page of the package search.
        return {
            'start': offset,
            'rows': 1,
            'q': '(type:harvest)',
            'fq': '+dataset_type:harvest',
            'sort': 'metadata_created asc',
        }

    def page_payload(result):
        # Wrap one search result in the CKAN package_search envelope.
        return {'success': True, 'result': {'count': 2, 'results': [result]}}

    # Grab the generator; it issues no requests until consumed.
    pages = ckan.list_harvest_sources(start=0, page_size=1)

    # --- first page -----------------------------------------------------
    ckan.get_full_harvest_source = mock.Mock(return_value=first_sentinel)  # stub
    # NOTE(review): 'harest' below looks like a typo for 'harvest' in the
    # fixture data; preserved verbatim since the assertions pass regardless.
    mock_requests.return_value = mock_response(data=page_payload({
        'title': 'dataset 1',
        'name': 'dataset-1',
        'state': 'active',
        'type': 'harest',
        'source_type': 'waf',
    }))
    assert next(pages) == first_sentinel
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=search_params(0)),
        mock.call().json(),
    ]
    mock_requests.reset_mock()

    # --- second page ----------------------------------------------------
    ckan.get_full_harvest_source = mock.Mock(return_value=second_sentinel)
    mock_requests.return_value = mock_response(data=page_payload({
        'title': 'dataset 2',
        'name': 'dataset-2',
        'state': 'active',
        'source_type': 'ckan',
    }))
    assert next(pages) == second_sentinel
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=search_params(1)),
        mock.call().json(),
    ]
def test_list_all_sources():
    """Fetch every harvest source (names only) and check the known totals."""
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    # Materialize the generator once; index the sources by name afterwards.
    sources = list(ckan.list_harvest_sources(skip_full_source_info=True))
    by_name = {hs['name']: hs for hs in sources}
    assert 'doi-open-data' in by_name
    assert len(sources) == 1083
def test_list_ckan_sources():
    """Fetch only 'ckan'-type harvest sources and validate both of them."""
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    expected_names = ('doi-open-data', 'test-2016')
    found = {}
    count = 0
    for source in ckan.list_harvest_sources(source_type='ckan'):
        count += 1
        assert source['source_type'] == 'ckan'
        assert source['name'] in expected_names
        found[source['name']] = source

    assert count == 2
    doi = found['doi-open-data']
    assert doi['url'] == 'https://data.doi.gov'
    assert doi['status']['job_count'] == 1
def test_load_from_name():
    """Create one harvest source by name and verify its force_all config."""
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    ckan.set_destination(ckan_url='http://ckan:5000',
                         ckan_api_key='0602d7ed-1517-40a0-a92f-049d724962df')

    print('Getting harvest source ...')
    name = 'doi-open-data'
    source_payload = ckan.get_full_harvest_source(hs={'name': name})
    ckan.create_harvest_source(data=source_payload)

    # The importer records created/updated/error flags per source name.
    record = ckan.harvest_sources[name]
    assert 'created' in record
    assert record['created']
    assert 'updated' in record
    assert not record['updated']
    assert 'error' in record
    assert not record['error']
    print(record)

    # The source's CKAN config must carry force_all as a real boolean.
    config = json.loads(record['ckan_package']['config'])
    assert isinstance(config['force_all'], bool)
    assert config['force_all']
def test_list_datajson_sources():
    """Fetch all datajson harvest sources and spot-check doj-json."""
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    by_name = {}
    count = 0
    for source in ckan.list_harvest_sources(source_type='datajson'):
        count += 1
        # Some sources fail in production (the full source is not returned),
        # so tolerate a missing source_type field via the default.
        assert source.get('source_type', 'datajson') == 'datajson'
        by_name[source['name']] = source

    assert count == 152
    doj = by_name['doj-json']
    assert doj['url'] == 'http://www.justice.gov/data.json'
    assert doj['frequency'] == 'DAILY'
    assert doj['status']['job_count'] == 235
    assert doj['status']['total_datasets'] == 1236
def test_load_from_url():
    """Test with some previous harvesters already saved.

    Use a pytest cassette so real requests are not required.
    We import 3 harvest sources (so they already exist) and then run this
    test with 6 sources.
    """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    # NOTE(review): the next line is corrupted/redacted in this copy — the
    # set_destination arguments and a large chunk of the test body (the
    # import loop, the total/created/updated/errors counters, and the
    # fdic-gov extras setup) were collapsed into one fragment. Restore this
    # function from version control before editing.
    ckan.set_destination(ckan_url='http://*****:*****@fdic.gov\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]
    # Repeat the email_list check for the fcc-gov organization.
    extras = ckan.organizations['fcc-gov'].get('extras', [])
    expected_email_list = '[email protected]\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]
    # NOTE(review): 'assert expr, 1' asserts truthiness with message 1 —
    # this probably meant 'assert len(ckan.groups) == 1'; confirm intent.
    assert len(ckan.groups), 1
    assert 'local' in ckan.groups
    assert ckan.groups['local']['display_name'] == 'Local Government'
    print(
        'Finished: {} harvest sources. {} Added, {} already exists, {} failed'.
        format(total, created, updated, errors))
    # Every source processed should be tracked, with the expected split of
    # newly created vs already existing sources and no failures.
    assert total == len(ckan.harvest_sources)
    assert created == 4
    assert updated == 3
    assert errors == 0
# NOTE(review): this chunk begins mid-way through a parser.add_argument call;
# the opening of that call is outside this view.
    help="Wait this number of seconds between API calls to prevent timeout")
parser.add_argument(
    "--wait_for_create",
    type=int,
    default=5,
    help="Wait this number of seconds between API calls to prevent timeout")

args = parser.parse_args()

# Fall back to the checked-in api.key file (two directories up from this
# script) when no destination API key was given on the command line.
if (args.destination_api_key is None):
    api_key_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '../../api.key')
    api_key = open(api_key_file).read().rstrip()
    args.destination_api_key = api_key

# Origin instance to read harvest sources from; destination to write to.
ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)
ckan.set_destination(ckan_url=args.destination_url,
                     ckan_api_key=args.destination_api_key)

# Define the final list of sources to import (from a source type or an
# explicit list of names).
sources_to_import = []
if args.names is not None:
    # --names may be a path to a file with one name per line, or a
    # comma-separated list of source names.
    if os.path.isfile(args.names):
        f = open(args.names)
        names = f.read().splitlines()
        f.close()
    else:
        names = args.names.split(',')
def import_groups(origin_url, user_agent, destination_url, destination_api_key,
                  groups='ALL', skip_groups=''):
    """Copy groups from an origin CKAN instance to a destination instance.

    For each selected group, the group is created at the destination and
    every dataset belonging to it at the origin is updated at the
    destination (via package_update) to include the group.

    Args:
        origin_url: URL of the CKAN instance to read groups from.
        user_agent: User-Agent string for origin API requests.
        destination_url: URL of the CKAN instance to write to.
        destination_api_key: API key for the destination instance.
        groups: 'ALL' to process every group, or a comma-separated list of
            group names.
        skip_groups: comma-separated list of group names to skip.

    Returns:
        dict summarizing processed/skipped groups and per-dataset results
        (not_found, already_in_group, added_to_group, failed_to_add).
    """
    ckan = RemoteCKAN(url=origin_url, user_agent=user_agent)
    ckan.set_destination(ckan_url=destination_url,
                         ckan_api_key=destination_api_key)

    groups_processed = []
    groups_skipped = []
    not_found = []
    already_in_group = []
    added_to_group = []
    failed_to_add = []

    if groups == 'ALL':
        groups = ckan.get_group_list()
    else:
        groups = groups.split(',')

    # Parse the skip list once instead of re-splitting on every iteration.
    skip_set = set(skip_groups.split(','))

    for group in groups:
        print('Group Found {}'.format(group))
        if group in skip_set:
            print('Skipping group')
            groups_skipped.append(group)
            continue
        groups_processed.append(group)

        # Create this group at the destination.
        ckan.create_group(group)

        # Get all datasets from this group and (if they exist at the
        # destination) add each dataset to the group there.
        packages = ckan.get_datasets_in_group(group_name=group)
        for package in packages:
            name = package['name']
            # If this dataset exists in the new CKAN instance we need to
            # update it to add this group.
            package = ckan.get_full_package(name_or_id=name, url=destination_url)
            if package is None:
                print('Package not found {}'.format(name))
                not_found.append({'group': group, 'dataset_name': name})
                continue
            # Skip if the group already exists on the destination package.
            if group in [grp['name'] for grp in package.get('groups', [])]:
                print('Group {} already exists for {}'.format(group, name))
                already_in_group.append(package['name'])
                continue
            # Update the dataset at the destination to set the group.
            package_update_url = f'{destination_url}/api/3/action/package_update'
            print(' ** Updating package {}'.format(name))
            # Bug fix: the destination package may lack a 'groups' key
            # entirely (the check above tolerates that via .get); setdefault
            # avoids a KeyError on the direct append.
            package.setdefault('groups', []).append({'name': group})
            updated, status, error = ckan.request_ckan(url=package_update_url,
                                                       method='POST',
                                                       data=package)
            if updated:
                added_to_group.append(package['name'])
            else:
                failed_to_add.append(package['name'])
            print(' ** Updated ** Status {} ** Error {} **'.format(status, error))

    if len(ckan.errors) > 0:
        print('*******\nWITH ERRORS\n*******')
        print('\n\t'.join(ckan.errors))

    print('Datasets not found: {}'.format(len(not_found)))
    for nf in not_found:
        print('\tDataset {} at group {}'.format(nf['dataset_name'], nf['group']))

    print('Final results:')
    ret = {
        "groups_processed": groups_processed,
        "groups_skipped": groups_skipped,
        "not_found": not_found,
        "already_in_group": already_in_group,
        "added_to_group": added_to_group,
        "failed_to_add": failed_to_add,
    }
    print(ret)
    return ret
'''
# NOTE(review): the triple-quote above closes a module docstring that starts
# before this chunk.

import os
import argparse
import csv

from remote_ckan.lib import RemoteCKAN

# CLI for exporting a CSV report of harvest sources from a CKAN instance.
parser = argparse.ArgumentParser()
parser.add_argument("--file_name", type=str, default='report-harvests',
                    help="Name of file to save")
parser.add_argument("--origin_url", type=str, default='https://data.doi.gov',
                    help="CKAN instance URL")
parser.add_argument("--source_type", type=str, default='ALL',
                    help="Type of harvest source: ALL|datajson|csw|waf etc")
parser.add_argument("--user_agent", type=str,
                    default='CKAN-harvest-source-importer 1.0',
                    help="User agent")
parser.add_argument("--limit", type=int, default=0,
                    help="Limit the amount of Harvest sources to import")

args = parser.parse_args()

ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)

# The CSV report is written next to this script.
csv_output = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          args.file_name + '.csv')
# NOTE(review): csvfile is opened without a with-block; presumably it is
# closed after the loop, outside this view — confirm.
csvfile = open(csv_output, 'w')
fieldnames = ['title', 'name', 'type', 'url', 'frequency', 'job_count',
              'total_datasets', 'last_job_errored', 'last_job_created',
              'last_job_finished', 'last_job_status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

harvest_sources = []
total = 0
# NOTE(review): the loop body continues beyond this chunk.
for hs in ckan.list_harvest_sources(source_type=args.source_type):
    if args.limit > 0:
# NOTE(review): this chunk begins mid-way through a parser.add_argument call;
# the opening of that call is outside this view.
    default='http://localhost:5000',
    help="CKAN destination instance URL")
parser.add_argument("--destination_api_key", type=str,
                    help="CKAN destination instance API KEY")

args = parser.parse_args()

# Fall back to the checked-in api.key file (two directories up from this
# script) when no destination API key was given on the command line.
if (args.destination_api_key is None):
    api_key_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '../../api.key')
    api_key = open(api_key_file).read().rstrip()
    args.destination_api_key = api_key

# We will check locally for sources and import only those that don't exist.
local_ckan = RemoteCKAN(url=args.destination_url)
remote_ckan = RemoteCKAN(url=args.origin_url)
remote_ckan.set_destination(ckan_url=args.destination_url,
                            ckan_api_key=args.destination_api_key)

# We get a list of names from a file or a comma-separated list of names.
if args.names_to_test is not None:
    if os.path.isfile(args.names_to_test):
        f = open(args.names_to_test)
        names = f.read().splitlines()
        f.close()
    else:
        names = args.names_to_test.split(',')
else:
    names = []

# for hs in local_ckan.list_harvest_sources(source_type=args.source_type):