def test_include_org_filter(ckan):
    '''An `organization` filter should only harvest that organization datasets'''
    # First organization: its 2 datasets are the ones expected to be harvested
    kept_org = ckan.action('organization_create', {'name': 'org-1'})['result']
    kept_packages = [
        package_factory(ckan, owner_org=kept_org['id']) for _ in range(2)
    ]
    expected_ids = [pkg['id'] for pkg in kept_packages]

    # Second organization: its 2 datasets are not targeted by the filter
    other_org = ckan.action('organization_create', {'name': 'org-2'})['result']
    for _ in range(2):
        package_factory(ckan, owner_org=other_org['id'])

    org_filter = {'key': 'organization', 'value': kept_org['name']}
    source = HarvestSourceFactory(backend='ckan',
                                  url=ckan.BASE_URL,
                                  config={'filters': [org_filter]})

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) == len(expected_ids)

    # Every stored dataset must come from the first organization
    for harvested in Dataset.objects:
        remote_id = harvested.extras['harvest:remote_id']
        assert remote_id in expected_ids
def feed_ckan_and_harvest(request, source, ckan, app):
    '''
    Feed CKAN with the data fixtures referenced by the tests
    of the requesting module, then run the harvesting once.

    Data fixtures are discovered through the `ckan_data` marker
    (its first argument is the fixture name). A data fixture may return
    either the `package_create` payload alone or a `(payload, kwargs)` tuple.

    Return a dict mapping each data fixture name
    to a `(data, result, kwargs)` tuple, `result` being
    the value returned by the `package_create` CKAN action.
    '''
    current_module = request.module

    # Collect the distinct data fixture names used by this module tests
    fixture_names = set()
    for test_item in request.session.items:
        if test_item.module == current_module:
            marker = test_item.get_closest_marker('ckan_data')
            if marker:
                fixture_names.add(marker.args[0])

    # Create one CKAN package per data fixture
    rundata = {}
    for name in fixture_names:
        values = request.getfixturevalue(name)
        if isinstance(values, tuple):
            data, kwargs = values
        else:
            data, kwargs = values, {}
        result = ckan.action('package_create', data)
        rundata[name] = (data, result, kwargs)

    with app.app_context():
        actions.run(source.slug)
        source.reload()
        last_job = source.get_last_job()
        # One harvested item is expected for each package created above
        assert len(last_job.items) == len(fixture_names)

    return rundata
Beispiel #3
0
def test_can_have_multiple_filters(ckan):
    '''Combining an `organization` and a `tags` filter should keep only
    the dataset matching both of them'''
    # create 2 organizations with 2 datasets each (one per tag)
    org = ckan.action('organization_create', {'name': 'org-1'})['result']
    # The only dataset matching both the organization and the tag filters
    package = package_factory(ckan,
                              owner_org=org['id'],
                              tags=[{
                                  'name': 'tag-1'
                              }])
    package_factory(ckan, owner_org=org['id'], tags=[{'name': 'tag-2'}])
    org2 = ckan.action('organization_create', {'name': 'org-2'})['result']
    # Plain statements: the original trailing commas built throwaway tuples
    package_factory(ckan, owner_org=org2['id'], tags=[{'name': 'tag-1'}])
    package_factory(ckan, owner_org=org2['id'], tags=[{'name': 'tag-2'}])

    source = HarvestSourceFactory(backend='ckan',
                                  url=ckan.BASE_URL,
                                  config={
                                      'filters': [
                                          {
                                              'key': 'organization',
                                              'value': org['name']
                                          },
                                          {
                                              'key': 'tags',
                                              'value': 'tag-1'
                                          },
                                      ]
                                  })

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    assert len(job.items) == 1
    assert Dataset.objects.count() == 1
    assert Dataset.objects.first().extras['harvest:remote_id'] == package['id']
Beispiel #4
0
def test_dkan_french_w_license(app, rmock):
    '''DKAN Harvester should import the `dkan-french-w-license.json` payload

    The mocked remote exposes a single package whose `package_show` response
    is the content of that data file. The test then checks the dates,
    the resources count and the presence of an `xlsx` resource
    on the harvested dataset.
    '''
    DKAN_URL = 'https://harvest.me/'
    API_URL = '{}api/3/action/'.format(DKAN_URL)
    PACKAGE_LIST_URL = '{}package_list'.format(API_URL)
    PACKAGE_SHOW_URL = '{}package_show'.format(API_URL)

    with open(data_path('dkan-french-w-license.json')) as ifile:
        # Parse straight from the file object instead of read() + loads()
        data = json.load(ifile)

    org = OrganizationFactory()
    source = HarvestSourceFactory(backend='dkan', url=DKAN_URL, organization=org)
    rmock.get(PACKAGE_LIST_URL, json={'success': True, 'result': ['fake-name']}, status_code=200,
              headers={'Content-Type': 'application/json'})
    rmock.get(PACKAGE_SHOW_URL, json=data, status_code=200,
              headers={'Content-Type': 'application/json'})
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == 'done'

    datasets = Dataset.objects.filter(organization=org)
    assert len(datasets) > 0

    # The lookup key contains a colon so it can't be written as a keyword
    # argument: it is passed through dict unpacking instead
    q = {'extras__harvest:remote_id': '04be6288-696d-4331-850d-a144871a7e3a'}
    dataset = datasets.get(**q)
    assert dataset.created_at == datetime(2019, 12, 10, 0, 0)
    assert dataset.last_modified == datetime(2019, 9, 30, 0, 0)
    assert len(dataset.resources) == 2
    assert 'xlsx' in [r.format for r in dataset.resources]
Beispiel #5
0
def test_simple():
    '''Running the harvester on a source should produce at least one item'''
    owner = OrganizationFactory()
    source = HarvestSourceFactory(
        backend='{{ cookiecutter.identifier }}',
        url=faker.url(),
        organization=owner,
    )

    # TODO: mock remote endpoints responses

    actions.run(source.slug)
    source.reload()

    harvested_items = source.get_last_job().items
    assert len(harvested_items) > 0
def test_minimal_ckan_response(rmock):
    '''CKAN Harvester should accept the minimum dataset payload'''
    base_url = 'https://harvest.me/'
    api_url = base_url + 'api/3/action/'
    package_list_url = api_url + 'package_list'
    package_show_url = api_url + 'package_show'

    dataset_name = faker.unique_string()
    # extras is not always present so we exclude it from the minimal payload
    minimal_dataset = {
        'id': faker.uuid4(),
        'name': dataset_name,
        'title': faker.sentence(),
        'maintainer': faker.name(),
        'tags': [],
        'private': False,
        'maintainer_email': faker.email(),
        'license_id': None,
        'metadata_created': faker.iso8601(),
        'organization': None,
        'metadata_modified': faker.iso8601(),
        'author': None,
        'author_email': None,
        'notes': faker.paragraph(),
        'license_title': None,
        'state': None,
        'revision_id': faker.unique_string(),
        'type': 'dataset',
        'resources': [],
    }
    package_list_response = {'success': True, 'result': [dataset_name]}
    package_show_response = {'success': True, 'result': minimal_dataset}

    source = HarvestSourceFactory(backend='ckan', url=base_url)

    # The remote lists a single package and serves the minimal payload for it
    rmock.get(package_list_url,
              json=package_list_response,
              status_code=200,
              headers={'Content-Type': 'application/json'})
    rmock.get(package_show_url,
              json=package_show_response,
              status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)
    source.reload()

    last_job = source.get_last_job()
    assert last_job.status == 'done'
Beispiel #7
0
def test_dkan_demo_harvest(source, app):
    '''
    Run the harvester on the `source` fixture and check that
    one dataset owning at least one resource is created per harvested item
    '''
    with app.app_context():
        actions.run(source.slug)
        source.reload()
        last_job = source.get_last_job()

    assert len(last_job.items) > 0

    org_datasets = Dataset.objects.filter(organization=source.organization)
    assert len(last_job.items) == org_datasets.count()

    # No harvested dataset should be left without resources
    assert all(len(dataset.resources) > 0 for dataset in org_datasets)

    assert last_job.status == 'done'
def test_standard_api_json_error(rmock):
    '''A JSON response with `success: False` and a plain string `error`
    should be stored as a single job error using that string as message'''
    payload = {'success': False, 'error': 'an error'}
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)

    rmock.get(API_URL,
              json=payload,
              status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)

    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` on int literals checks identity
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    assert error.message == 'an error'
def test_plain_text_error(rmock, code):
    '''A plain text error response should be stored as a single job error

    `code` is the HTTP status code returned by the mocked API
    (presumably injected by a parametrization not visible here).
    '''
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)

    rmock.get(API_URL,
              text='"Some error"',
              status_code=code,
              headers={'Content-Type': 'text/plain'})

    actions.run(source.slug)

    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` on int literals checks identity
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    # Raw quoted string is properly unquoted
    assert error.message == 'Some error'
def test_html_error(rmock, code):
    '''An HTML error response should be stored as a single job error
    whose message does not embed the raw HTML

    `code` is the HTTP status code returned by the mocked API
    (presumably injected by a parametrization not visible here).
    '''
    # Happens with wrong source URL (html is returned instead of json)
    html = '<html><body>Error</body></html>'
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)

    rmock.get(API_URL,
              text=html,
              status_code=code,
              headers={'Content-Type': 'text/html'})

    actions.run(source.slug)

    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` on int literals checks identity
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    # HTML is detected and does not clutter the message
    assert html not in error.message
def test_tag_filter(ckan):
    '''A `tags` filter should only harvest the dataset carrying that tag'''
    # create 2 datasets with a different tag each
    tag = faker.word()
    other_tag = faker.word()
    # Random words may collide: redraw until the second tag really differs,
    # otherwise both datasets would match the filter and the test would fail
    while other_tag == tag:
        other_tag = faker.word()
    package = package_factory(ckan, tags=[{'name': tag}])
    package_factory(ckan, tags=[{'name': other_tag}])

    source = HarvestSourceFactory(
        backend='ckan',
        url=ckan.BASE_URL,
        config={'filters': [{
            'key': 'tags',
            'value': tag
        }]})

    actions.run(source.slug)
    source.reload()

    job = source.get_last_job()
    assert len(job.items) == 1
    assert Dataset.objects.count() == 1
    assert Dataset.objects.first().extras['harvest:remote_id'] == package['id']
def test_standard_api_json_error_with_details_and_type(rmock):
    '''A JSON error object providing both `__type` and `message`
    should be stored as a single job error formatted as `<type>: <message>`'''
    payload = {
        'success': False,
        'error': {
            'message': 'Access denied',
            '__type': 'Authorization Error',
        }
    }
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)

    rmock.get(API_URL,
              json=payload,
              status_code=200,
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)

    source.reload()

    job = source.get_last_job()
    # Compare counts by value: `is` on int literals checks identity
    assert len(job.items) == 0
    assert len(job.errors) == 1
    error = job.errors[0]
    assert error.message == 'Authorization Error: Access denied'
Beispiel #13
0
def test_simple(rmock):
    '''Harvest the mocked ODS search response and check the created datasets

    The mocked search endpoint serves `search.json`: 4 items are processed
    by the job but only `test-a` and `test-b` end up as datasets.
    '''
    # Every license the backend can map to needs to exist beforehand
    known_license_ids = set(OdsBackend.LICENSES.values())
    for known_id in known_license_ids:
        License.objects.create(id=known_id, title=known_id)

    owner = OrganizationFactory()
    source = HarvestSourceFactory(backend='ods', url=ODS_URL, organization=owner)

    search_url = ODS_URL + '/api/datasets/1.0/search/'
    rmock.get(search_url,
              text=ods_response('search.json'),
              headers={'Content-Type': 'application/json'})

    actions.run(source.slug)

    # Check the query string sent with the last request to the remote
    sent_query = parse_qs(urlparse(rmock.last_request.url).query)
    assert sent_query == {
        'start': ['0'],
        'rows': ['50'],
        'interopmetas': ['true']
    }

    source.reload()

    last_job = source.get_last_job()
    assert len(last_job.items) == 4
    assert last_job.status == 'done'

    # Index the created datasets by their remote identifier
    by_remote_id = {}
    for stored in Dataset.objects:
        by_remote_id[stored.extras['harvest:remote_id']] = stored
    assert len(by_remote_id) == 2

    assert 'test-a' in by_remote_id
    test_a = by_remote_id['test-a']
    assert test_a.title == 'test-a'
    assert test_a.description == 'test-a-description'
    assert test_a.tags == [
        'culture', 'environment', 'heritage', 'keyword1', 'keyword2'
    ]
    assert test_a.extras['ods:references'] == 'http://example.com'
    assert test_a.extras['ods:has_records']
    assert test_a.extras['harvest:remote_id'] == 'test-a'
    assert test_a.extras['harvest:domain'] == 'etalab-sandbox.opendatasoft.com'
    assert test_a.extras['ods:url'] == (
        'http://etalab-sandbox.opendatasoft.com/explore/dataset/test-a/')
    assert test_a.license.id == 'fr-lo'

    assert len(test_a.resources) == 2
    csv_export, json_export = test_a.resources[0], test_a.resources[1]

    assert csv_export.title == 'Export au format CSV'
    assert csv_export.description is not None
    assert csv_export.format == 'csv'
    assert csv_export.mime == 'text/csv'
    assert isinstance(csv_export.modified, datetime)
    assert csv_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                              'explore/dataset/test-a/download'
                              '?format=csv&timezone=Europe/Berlin'
                              '&use_labels_for_header=true')

    assert json_export.title == 'Export au format JSON'
    assert json_export.description is not None
    assert json_export.format == 'json'
    assert json_export.mime == 'application/json'
    assert isinstance(json_export.modified, datetime)
    assert json_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                               'explore/dataset/test-a/download'
                               '?format=json&timezone=Europe/Berlin'
                               '&use_labels_for_header=true')

    # test-b has geo feature
    assert 'test-b' in by_remote_id
    test_b = by_remote_id['test-b']
    assert test_b.tags == [
        'buildings', 'equipment', 'housing', 'keyword1', 'spatial-planning',
        'town-planning'
    ]
    assert len(test_b.resources) == 4

    geojson_export = test_b.resources[2]
    assert geojson_export.title == 'Export au format GeoJSON'
    assert geojson_export.description is not None
    assert geojson_export.format == 'json'
    assert geojson_export.mime == 'application/vnd.geo+json'
    assert geojson_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                                  'explore/dataset/test-b/download'
                                  '?format=geojson&timezone=Europe/Berlin'
                                  '&use_labels_for_header=true')

    shapefile_export = test_b.resources[3]
    assert shapefile_export.title == 'Export au format Shapefile'
    assert shapefile_export.description is not None
    assert shapefile_export.format == 'shp'
    assert shapefile_export.mime is None
    assert shapefile_export.url == ('http://etalab-sandbox.opendatasoft.com/'
                                    'explore/dataset/test-b/download'
                                    '?format=shp&timezone=Europe/Berlin'
                                    '&use_labels_for_header=true')

    # test-c has no data
    assert 'test-c' not in by_remote_id

    # test-d is INSPIRE
    assert 'test-d' not in by_remote_id