def test_list_harvest_sources_with_pagination(mock_requests):
    """Test list_harvest_sources pagination with multiple source types.

    The generator is created with ``page_size=1`` and consumed one item at a
    time. For every page the test asserts that exactly one ``package_search``
    request was issued and that its ``start`` parameter advanced by one.
    """
    ckan = RemoteCKAN(mock_url)
    expected_harvest_source_1 = mock.sentinel.harvest_source_1
    expected_harvest_source_2 = mock.sentinel.harvest_source_2

    def expected_page_calls(start):
        """Return the request calls expected for the page beginning at *start*."""
        return [
            api_call('/api/3/action/package_search', params=dict(start=start, rows=1, q='(type:harvest)', fq='+dataset_type:harvest', sort='metadata_created asc')),
            mock.call().json(),
        ]

    # Grab the generator; no request is asserted until the first next() call.
    harvest_sources = ckan.list_harvest_sources(start=0, page_size=1)

    # First page (a "waf" source)
    # Stub the per-source lookup so only the search request is observed.
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_1)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 1',
                    'name': 'dataset-1',
                    'state': 'active',
                    'type': 'harvest',
                    'source_type': 'waf',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_1
    assert mock_requests.mock_calls == expected_page_calls(start=0)
    # Forget the recorded calls so the second page is asserted in isolation.
    mock_requests.reset_mock()

    # Second page (a "ckan" source)
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_2)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 2',
                    'name': 'dataset-2',
                    'state': 'active',
                    'source_type': 'ckan',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_2
    assert mock_requests.mock_calls == expected_page_calls(start=1)
Ejemplo n.º 2
0
def test_load_from_name():
    """Load one harvest source by name and verify its force_all config.

    The source is fetched from the remote catalog, created on the configured
    destination, and then its bookkeeping flags and ``force_all`` setting are
    checked.
    """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    ckan.set_destination(
        ckan_url='http://ckan:5000',
        ckan_api_key='0602d7ed-1517-40a0-a92f-049d724962df',
    )

    print('Getting harvest source ...')

    name = 'doi-open-data'
    ckan.create_harvest_source(
        data=ckan.get_full_harvest_source(hs={'name': name}),
    )

    outcome = ckan.harvest_sources[name]
    # Each flag must be present; only 'created' is expected to be truthy.
    for flag, should_be_set in (('created', True),
                                ('updated', False),
                                ('error', False)):
        assert flag in outcome
        if should_be_set:
            assert outcome[flag]
        else:
            assert not outcome[flag]

    print(outcome)

    # check the force_all config: it must be a real boolean and be enabled
    force_all = json.loads(outcome['ckan_package']['config'])['force_all']
    assert isinstance(force_all, bool)
    assert force_all
Ejemplo n.º 3
0
def test_load_from_url():
    """ Test with some previous harvester already saved
        Use a pytest cassette so real requests are not required. 
        We import 3 harvest sources (so they already exists) 
        and then run this test with 6 sources. """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    ckan.set_destination(ckan_url='http://*****:*****@fdic.gov\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]

    extras = ckan.organizations['fcc-gov'].get('extras', [])
    expected_email_list = '[email protected]\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]

    assert len(ckan.groups), 1
    assert 'local' in ckan.groups
    assert ckan.groups['local']['display_name'] == 'Local Government'

    print(
        'Finished: {} harvest sources. {} Added, {} already exists, {} failed'.
        format(total, created, updated, errors))

    assert total == len(ckan.harvest_sources)
    assert created == 4
    assert updated == 3
    assert errors == 0
Ejemplo n.º 4
0
        f.close()
    else:
        # Names were passed inline as a comma-separated list.
        names = args.names.split(',')

    # Apply the optional window: drop the first `offset` names, then keep at
    # most `limit` of what remains. Non-positive values leave the list as is.
    if args.offset > 0:
        names = names[args.offset:]
    if args.limit > 0:
        names = names[:args.limit]

    # Resolve every requested name to its full harvest source definition.
    source_list_position = 0
    for hs in [{'name': name} for name in names]:
        # Pause before each lookup (presumably to throttle requests to the
        # source CKAN - confirm).
        time.sleep(args.wait_for_show)
        source_list_position = source_list_position + 1
        print('****** collecting {}: {} of {} sources'.format(
            hs['name'], source_list_position, len(names)))
        rhs = ckan.get_full_harvest_source(hs)
        if rhs is None:
            # Lookup failed: report it and move on to the next name.
            print('ERROR GETTING EXTERNAL SOURCE: {}'.format(hs['name']))
            continue
        sources_to_import.append(rhs)

else:
    # This branch pairs with an `if` above this excerpt. Here the sources are
    # taken from the listing call, filtered by source type, offset and limit.
    for hs in ckan.list_harvest_sources(source_type=args.source_type,
                                        start=args.offset,
                                        limit=args.limit):
        sources_to_import.append(hs)

# Walk the collected sources, tracking the 1-based position of each one.
source_list_position = 0
for hs in sources_to_import:
    # save to destination CKAN
    source_list_position = source_list_position + 1
# Process every source name, printing a 1-based progress counter.
c = 0
for name in names:
    c += 1
    print(' ****** {}/{}: {}'.format(c, len(names), name))

    # skips already checked sources: a per-source marker file under the
    # remote client's temp_data directory means the source was handled before
    file_name = f'source-checks-{args.source_type}-{name}.txt'
    full_path = os.path.join(remote_ckan.temp_data, file_name)
    if os.path.isfile(full_path):
        print(f'SKIP already checked source {args.source_type} {name}')
        continue

    # Result row for this source, stamped with the current time.
    row = {'name': name, 'time': time.time()}

    # check if already exists locally
    hs = local_ckan.get_full_harvest_source(hs={'name': name})
    if hs is None:  # lookup returned nothing (missing or some error)
        # does not exist locally: fetch it from the remote instance
        rhs = remote_ckan.get_full_harvest_source(hs={'name': name})
        if rhs is None:
            # Remote lookup failed too: record the failure and skip the source.
            print(f'ERROR GETTING EXTERNAL SOURCE: {name}')
            row['status'] = 'Failed to get external source'
            writer.writerow(row)
            continue

        # save it locally
        # NOTE(review): this is called on remote_ckan, not local_ckan;
        # presumably it writes to a destination configured on that client -
        # confirm.
        remote_ckan.create_harvest_source(data=rhs)
        # get this new source data
        hs = local_ckan.get_full_harvest_source(hs={'name': name})

    # NOTE(review): hs can still be None here if the re-fetch above fails, in
    # which case the subscript below raises TypeError.
    title = hs['title']