def test_list_harvest_sources_with_pagination(mock_requests):
    """Test list_harvest_sources pagination with multiple source types.

    Drives the generator returned by ``list_harvest_sources`` through two
    pages (page_size=1, count=2) and checks that each ``next()`` triggers
    exactly one ``package_search`` request with the correct ``start`` offset.
    """
    ckan = RemoteCKAN(mock_url)
    expected_harvest_source_1 = mock.sentinel.harvest_source_1
    expected_harvest_source_2 = mock.sentinel.harvest_source_2

    # Grab the generator; no HTTP request happens until next() is called,
    # which is why the mocks below can be (re)configured per page.
    harvest_sources = ckan.list_harvest_sources(start=0, page_size=1)

    # First page
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_1) # stub the per-source detail fetch
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 1',
                    'name': 'dataset-1',
                    'state': 'active',
                    'type': 'harest',  # NOTE(review): looks like a typo for 'harvest' — confirm fixture intent
                    'source_type': 'waf',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_1
    # Exactly one search request (start=0, rows=page_size) plus its .json() decode.
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=dict(start=0, rows=1, q='(type:harvest)', fq='+dataset_type:harvest', sort='metadata_created asc')),
        mock.call().json(),
    ]
    mock_requests.reset_mock()

    # Second page
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_2)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 2',
                    'name': 'dataset-2',
                    'state': 'active',
                    'source_type': 'ckan',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_2
    # The offset advanced by page_size: second request uses start=1.
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=dict(start=1, rows=1, q='(type:harvest)', fq='+dataset_type:harvest', sort='metadata_created asc')),
        mock.call().json(),
    ]
# Example #2
def test_list_all_sources():
    """Test the list of sources against the live catalog."""

    ckan = RemoteCKAN(url='https://catalog.data.gov')

    results = {}
    count = 0
    # skip_full_source_info avoids one extra request per source.
    for source in ckan.list_harvest_sources(skip_full_source_info=True):
        count += 1
        results[source['name']] = source

    assert 'doi-open-data' in results
    assert count == 1083
    
# Example #3
def test_list_ckan_sources():
    """Test the list of sources filtered to the 'ckan' type."""

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    expected_names = ['doi-open-data', 'test-2016']

    results = {}
    total = 0
    for source in ckan.list_harvest_sources(source_type='ckan'):
        total += 1
        # Every yielded source must match the requested type and name set.
        assert source['source_type'] == 'ckan'
        assert source['name'] in expected_names
        results[source['name']] = source

    assert total == 2
    doi = results['doi-open-data']
    assert doi['url'] == 'https://data.doi.gov'
    assert doi['status']['job_count'] == 1
# Example #4
def test_list_datajson_sources():
    """Test the list of sources filtered to the 'datajson' type."""

    ckan = RemoteCKAN(url='https://catalog.data.gov')

    results = {}
    total = 0
    for source in ckan.list_harvest_sources(source_type='datajson'):
        total += 1
        # Some production sources fail to return the full source info,
        # so fall back to the expected type when the key is missing.
        assert source.get('source_type', 'datajson') == 'datajson'
        results[source['name']] = source

    assert total == 152
    doj = results['doj-json']
    assert doj['url'] == 'http://www.justice.gov/data.json'
    assert doj['frequency'] == 'DAILY'
    assert doj['status']['job_count'] == 235
    assert doj['status']['total_datasets'] == 1236
# Example #5
    # Branch body: the opening `if` is above this excerpt; here a list of
    # bare names is expanded into minimal source dicts and each one is
    # fetched in full from the origin CKAN.
    source_list_position = 0
    for hs in [{'name': name} for name in names]:
        # Throttle requests against the origin CKAN.
        time.sleep(args.wait_for_show)
        source_list_position = source_list_position + 1
        print('****** collecting {}: {} of {} sources'.format(
            hs['name'], source_list_position, len(names)))
        rhs = ckan.get_full_harvest_source(hs)
        if rhs is None:
            # Best-effort: report the failure and keep collecting the rest.
            print('ERROR GETTING EXTERNAL SOURCE: {}'.format(hs['name']))
            continue
        sources_to_import.append(rhs)

else:
    # No explicit name list: page through sources by type/offset/limit.
    for hs in ckan.list_harvest_sources(source_type=args.source_type,
                                        start=args.offset,
                                        limit=args.limit):
        sources_to_import.append(hs)

# Re-create each collected source on the destination CKAN.
source_list_position = 0
for hs in sources_to_import:
    # save to destination CKAN
    source_list_position = source_list_position + 1
    print(' ****** creating {}: {} of {} sources'.format(
        hs['name'], source_list_position, len(sources_to_import)))
    if hs.get('error', False):
        # Sources that errored during collection are skipped, not created.
        print('Skipping failed source: {}'.format(hs['name']))
        continue
    # Throttle creation calls against the destination CKAN.
    time.sleep(args.wait_for_create)
    ckan.create_harvest_source(data=hs)
    # create_harvest_source records its outcome in ckan.harvest_sources.
    assert 'created' in ckan.harvest_sources[hs['name']].keys()
# Example #6
# Script setup: `parser` is defined above this excerpt.
args = parser.parse_args()

ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)

# Write the CSV report next to this script, named from the CLI argument.
csv_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.file_name + '.csv')
csvfile = open(csv_output, 'w')
# One row per harvest source: identity, schedule, and last-job statistics.
fieldnames = ['title', 'name', 'type', 'url', 'frequency',
              'job_count', 'total_datasets', 'last_job_errored', 'last_job_created',
              'last_job_finished', 'last_job_status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

writer.writeheader()

harvest_sources = []
total = 0
for hs in ckan.list_harvest_sources(source_type=args.source_type):

    if args.limit > 0:
        if total >= args.limit:
            break

    harvest_sources.append(hs)
    status = hs.get('status', {})
    last_job = status.get('last_job', {})
    if last_job is None:
        last_job = {}
    stats = last_job.get('stats', {})

    row = {'title': hs.get('title', 'undefined'),
           'name': hs['name'],
           'type': hs.get('source_type', 'undefined'),