def test_list_harvest_sources_with_pagination(mock_requests):
    """Test list_harvest_sources pagination with multiple source types."""
    ckan = RemoteCKAN(mock_url)
    expected_harvest_source_1 = mock.sentinel.harvest_source_1
    expected_harvest_source_2 = mock.sentinel.harvest_source_2

    # Grab the generator
    harvest_sources = ckan.list_harvest_sources(start=0, page_size=1)

    # First page
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_1)  # stub
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 1',
                    'name': 'dataset-1',
                    'state': 'active',
                    'type': 'harvest',
                    'source_type': 'waf',
                },
            ],
        },
    })

    assert next(harvest_sources) == expected_harvest_source_1
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search',
                 params=dict(start=0, rows=1, q='(type:harvest)',
                             fq='+dataset_type:harvest',
                             sort='metadata_created asc')),
        mock.call().json(),
    ]
    mock_requests.reset_mock()

    # Second page
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_2)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 2',
                    'name': 'dataset-2',
                    'state': 'active',
                    'source_type': 'ckan',
                },
            ],
        },
    })

    assert next(harvest_sources) == expected_harvest_source_2
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search',
                 params=dict(start=1, rows=1, q='(type:harvest)',
                             fq='+dataset_type:harvest',
                             sort='metadata_created asc')),
        mock.call().json(),
    ]
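# For context: the test above drives a paginated generator. Below is a minimal
# sketch of how such a generator could be built on top of CKAN's package_search
# action. It is illustrative only and is not the RemoteCKAN implementation
# under test; the helper name iter_harvest_sources and the direct use of the
# requests library here are assumptions.
import requests


def iter_harvest_sources(base_url, start=0, page_size=100):
    """Yield harvest source packages one page at a time from package_search."""
    while True:
        response = requests.get(
            base_url + '/api/3/action/package_search',
            params={'start': start, 'rows': page_size,
                    'q': '(type:harvest)', 'fq': '+dataset_type:harvest',
                    'sort': 'metadata_created asc'})
        result = response.json()['result']
        packages = result['results']
        if not packages:
            break
        for package in packages:
            yield package
        start += page_size
        if start >= result['count']:
            break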
def test_list_all_sources():
    """ Test the list of sources """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    total = 0
    results = {}
    for hs in ckan.list_harvest_sources(skip_full_source_info=True):
        total += 1
        results[hs['name']] = hs

    assert 'doi-open-data' in results
    assert total == 1083
def test_list_ckan_sources():
    """ Test the list of sources """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    total = 0
    expected_names = ['doi-open-data', 'test-2016']
    results = {}
    for hs in ckan.list_harvest_sources(source_type='ckan'):
        total += 1
        assert hs['source_type'] == 'ckan'
        assert hs['name'] in expected_names
        results[hs['name']] = hs

    assert total == 2
    assert results['doi-open-data']['url'] == 'https://data.doi.gov'
    assert results['doi-open-data']['status']['job_count'] == 1
def test_list_datajson_sources():
    """ Test the list of sources """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    total = 0
    results = {}
    for hs in ckan.list_harvest_sources(source_type='datajson'):
        total += 1
        # some sources fail in production (didn't return the full source)
        assert hs.get('source_type', 'datajson') == 'datajson'
        results[hs['name']] = hs
        # just for the real requests
        # sleep(2)

    assert total == 152
    assert results['doj-json']['url'] == 'http://www.justice.gov/data.json'
    assert results['doj-json']['frequency'] == 'DAILY'
    assert results['doj-json']['status']['job_count'] == 235
    assert results['doj-json']['status']['total_datasets'] == 1236
    source_list_position = 0
    for hs in [{'name': name} for name in names]:
        time.sleep(args.wait_for_show)
        source_list_position = source_list_position + 1
        print('****** collecting {}: {} of {} sources'.format(
            hs['name'], source_list_position, len(names)))
        rhs = ckan.get_full_harvest_source(hs)
        if rhs is None:
            print('ERROR GETTING EXTERNAL SOURCE: {}'.format(hs['name']))
            continue
        sources_to_import.append(rhs)
else:
    for hs in ckan.list_harvest_sources(source_type=args.source_type,
                                        start=args.offset,
                                        limit=args.limit):
        sources_to_import.append(hs)

source_list_position = 0
for hs in sources_to_import:
    # save to destination CKAN
    source_list_position = source_list_position + 1
    print(' ****** creating {}: {} of {} sources'.format(
        hs['name'], source_list_position, len(sources_to_import)))

    if hs.get('error', False):
        print('Skipping failed source: {}'.format(hs['name']))
        continue

    time.sleep(args.wait_for_create)
    ckan.create_harvest_source(data=hs)
    assert 'created' in ckan.harvest_sources[hs['name']].keys()
args = parser.parse_args()

ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)

csv_output = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          args.file_name + '.csv')
csvfile = open(csv_output, 'w')
fieldnames = ['title', 'name', 'type', 'url', 'frequency', 'job_count',
              'total_datasets', 'last_job_errored', 'last_job_created',
              'last_job_finished', 'last_job_status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

harvest_sources = []
total = 0
for hs in ckan.list_harvest_sources(source_type=args.source_type):
    if args.limit > 0:
        if total >= args.limit:
            break
    harvest_sources.append(hs)

    status = hs.get('status', {})
    last_job = status.get('last_job', {})
    if last_job is None:
        last_job = {}
    stats = last_job.get('stats', {})

    row = {'title': hs.get('title', 'undefined'),
           'name': hs['name'],
           'type': hs.get('source_type', 'undefined'),