Example #1
0
    def test_validate_origin_dataset(self, test_datajson_dataset):
        """Origin validation fails without an owner org and lists missing fields."""
        # No owner organization set: that is the only reported error.
        without_org = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
        assert without_org.validate_origin_dataset() == False
        assert without_org.errors == ['Owner organization ID is required']

        # Remove a batch of fields from the dataset before validating again.
        removed_fields = (
            'accessLevel',
            'contactPoint',
            'identifier',
            'programCode',
            'bureauCode',
            'publisher',
            'modified',
            'keyword',
        )
        for field_name in removed_fields:
            del test_datajson_dataset[field_name]

        stripped = DataJSONSchema1_1(schema='usmetadata',
                                     original_dataset=test_datajson_dataset)
        stripped.ckan_owner_org_id = 'XXXXX'
        assert stripped.validate_origin_dataset() == False

        # accessLevel does not error because it is added in the load_default_values method
        expected_errors = [
            '"identifier" field could not be empty at origin dataset',
            '"contactPoint__fn" field could not be empty at origin dataset',
            '"programCode" field could not be empty at origin dataset',
            '"bureauCode" field could not be empty at origin dataset',
            '"contactPoint__hasEmail" field could not be empty at origin dataset',
            '"publisher" field could not be empty at origin dataset',
            '"modified" field could not be empty at origin dataset',
            '"keyword" field could not be empty at origin dataset',
        ]
        assert stripped.errors == expected_errors
    def test_get_base_ckan_dataset(self, test_datajson_dataset,
                                   base_ckan_dataset,
                                   base_ckan_dataset_usmetadata):
        """The base CKAN dataset returned for each schema matches its expected value."""
        expected_by_schema = (
            ('default', base_ckan_dataset),
            ('usmetadata', base_ckan_dataset_usmetadata),
        )
        for schema_name, expected in expected_by_schema:
            # A fresh transformer is built for every schema under test.
            transformer = DataJSONSchema1_1(
                original_dataset=test_datajson_dataset)
            actual = transformer.get_base_ckan_dataset(schema=schema_name)
            assert actual == expected
Example #3
0
    def test_load_default_values(self, test_datajson_dataset):
        """Check the `accessLevel` value seen after construction for each schema."""

        def build_usmetadata():
            # Build a transformer over the (possibly mutated) dataset.
            return DataJSONSchema1_1(schema='usmetadata',
                                     original_dataset=test_datajson_dataset)

        default_schema = DataJSONSchema1_1(
            original_dataset=test_datajson_dataset)
        assert default_schema.original_dataset['accessLevel'] == ''

        assert build_usmetadata().original_dataset['accessLevel'] == 'public'

        # Same expectation when the key is absent from the input altogether.
        del test_datajson_dataset['accessLevel']
        assert build_usmetadata().original_dataset['accessLevel'] == 'public'
    def test_drop_distribution(self, test_datajson_dataset):
        """A dataset without `distribution` transforms to an empty resource list."""

        def transform(source):
            # Build a usmetadata transformer with an owner org and run it.
            transformer = DataJSONSchema1_1(original_dataset=source,
                                            schema='usmetadata')
            transformer.ckan_owner_org_id = 'XXXX'
            return transformer.transform_to_ckan_dataset()

        # First pass with the distribution still present; result is not inspected.
        transform(test_datajson_dataset)

        del test_datajson_dataset['distribution']
        without_distribution = transform(test_datajson_dataset)

        assert without_distribution['resources'] == []
    def test_catalog_extras(self, test_datajson_dataset):
        """Catalog-level keys on the dataset show up as CKAN extras."""
        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)
        # ORG is required!
        transformer.ckan_owner_org_id = 'XXXX'
        transformer.transform_to_ckan_dataset()

        catalog_values = {
            'catalog_@context':
            "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
            'catalog_describedBy':
            "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
            'catalog_conformsTo':
            "https://project-open-data.cio.gov/v1.1/schema",
            'catalog_@id':
            'https://healthdata.gov/data.json',
        }
        for catalog_key, catalog_value in catalog_values.items():
            test_datajson_dataset[catalog_key] = catalog_value

        transformer.original_dataset = test_datajson_dataset
        transformed = transformer.transform_to_ckan_dataset()

        # Each catalog key must appear exactly once among the extras.
        for catalog_key, catalog_value in catalog_values.items():
            found = [
                item['value'] for item in transformed['extras']
                if item['key'] == catalog_key
            ]
            assert [catalog_value] == found
Example #6
0
    def test_merge_resources(self, test_datajson_dataset):
        """A new resource with a matching URL picks up the existing resource id."""
        transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
        transformer.ckan_owner_org_id = 'XXXXX'

        already_in_ckan = [{'url': 'http://marketnews.usda.gov/', 'id': '4'}]
        incoming = [{
            'url': 'http://marketnews.usda.gov/',
            'description': '',
            'format': 'text/html',
            'name': 'Web Page',
            'mimetype': 'text/html',
        }]
        expected = [{
            'url': 'http://marketnews.usda.gov/',
            'description': '',
            'format': 'text/html',
            'name': 'Web Page',
            'mimetype': 'text/html',
            'id': '4',
        }]

        merged = transformer.merge_resources(
            existing_resources=already_in_ckan, new_resources=incoming)

        assert merged == expected
Example #7
0
 def test_upgrade_usmetadata_default_fields(
         self, test_datajson_dataset, datajson_usmetadata_mapped_fields):
     """Upgrading the mapped fields yields the expected usmetadata mapping."""
     transformer = DataJSONSchema1_1(schema='usmetadata',
                                     original_dataset=test_datajson_dataset)
     upgraded = transformer.upgrade_usmetadata_default_fields(
         transformer.mapped_fields)
     assert upgraded == datajson_usmetadata_mapped_fields
    def test_create_package_with_tags(self):
        """Create a package through CKANPortalAPI and verify its code extras.

        The bureau and program codes are checked twice: on the locally
        transformed package, and on the dataset read back with
        ``show_package`` after creation.
        """

        def extras_for(dataset, key):
            # Collect the values of every extra stored under `key`.
            return [
                extra['value'] for extra in dataset['extras']
                if extra['key'] == key
            ]

        djss = DataJSONSchema1_1(original_dataset=self.test_datajson_dataset)
        djss.ckan_owner_org_id = CKAN_ORG_ID
        package = djss.transform_to_ckan_dataset()
        assert 'extras' in package
        # TODO check what we expect here (a nested list such as [['005:45']]
        # or the flat string currently asserted).
        assert ['005:45'] == extras_for(package, 'bureauCode')
        assert ['005:047'] == extras_for(package, 'programCode')

        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        res = cpa.create_package(ckan_package=package, on_duplicated='DELETE')
        assert res['success'] == True
        result = res['result']

        # read it
        res = cpa.show_package(ckan_package_id_or_name=result['id'])
        assert res['success'] == True
        ckan_dataset = res['result']

        assert 'extras' in ckan_dataset
        # Inspect the dataset returned by CKAN, not the local `package`;
        # re-checking `package` would never exercise the round trip.
        assert ['005:45'] == extras_for(ckan_dataset, 'bureauCode')
        assert ['005:047'] == extras_for(ckan_dataset, 'programCode')
    def test_collections(self, test_datajson_dataset):
        """Collection flags on the dataset are exported as CKAN extras."""

        def extras_for(dataset, key):
            # Values of all extras stored under `key`.
            return [
                item['value'] for item in dataset['extras']
                if item['key'] == key
            ]

        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)
        # ORG is required!
        transformer.ckan_owner_org_id = 'XXXX'

        # Nothing collection-related is present initially.
        transformed = transformer.transform_to_ckan_dataset()
        assert [] == extras_for(transformed, 'is_collection')

        # Flag the dataset as a collection.
        test_datajson_dataset['is_collection'] = True
        transformer.original_dataset = test_datajson_dataset
        transformed = transformer.transform_to_ckan_dataset()
        assert [True] == extras_for(transformed, 'is_collection')
        assert [] == extras_for(transformed, 'collection_package_id')

        # `collection_pkg_id` on the input surfaces as `collection_package_id`.
        test_datajson_dataset['collection_pkg_id'] = 'XXXXX'
        transformer.original_dataset = test_datajson_dataset
        transformed = transformer.transform_to_ckan_dataset()
        assert ['XXXXX'] == extras_for(transformed, 'collection_package_id')
    def test_set_destination_element(self, test_datajson_dataset):
        """Setting an unknown destination field raises with a descriptive message."""
        datajson = DataJSONSchema1_1(original_dataset=test_datajson_dataset)

        # Call the method directly: wrapping it in `assert` would convert a
        # non-raising falsy return into an AssertionError, which
        # `pytest.raises(Exception)` would accept as the expected exception.
        with pytest.raises(Exception) as e:
            datajson.set_destination_element(raw_field='something',
                                             new_value='A Test Value')
        assert str(
            e.value) == 'Not found field "something" at CKAN destination dict'
    def test_required_fields(self, test_datajson_dataset):
        """Missing owner org and missing `name` are both reported as errors."""
        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)

        # ORG is required! Without it the transformation returns nothing.
        assert transformer.transform_to_ckan_dataset() is None
        assert 'Owner organization ID is required' in transformer.errors

        # With an owner org set, remove `name` from the result and
        # validate the final dataset.
        transformer.ckan_owner_org_id = 'XXXX'
        transformed = transformer.transform_to_ckan_dataset()
        del transformed['name']

        assert not transformer.validate_final_dataset()
        assert '"name" is a required field' in transformer.errors
Example #12
0
    def test_transform_resources(self, test_datajson_dataset):
        """A single data.json distribution becomes one CKAN resource dict."""
        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)
        transformer.ckan_owner_org_id = 'XXXXX'

        source_distribution = {
            '@type': 'dcat:Distribution',
            'downloadURL': 'http://marketnews.usda.gov/',
            'mediaType': 'text/html',
            'title': 'Web Page',
        }
        expected_resources = [{
            'url': 'http://marketnews.usda.gov/',
            'description': '',
            'format': 'text/html',
            'name': 'Web Page',
            'mimetype': 'text/html',
        }]

        assert transformer.transform_resources(
            source_distribution) == expected_resources
Example #13
0
    def test_infer_resources(self, test_datajson_dataset):
        """Resources are inferred from top-level URL fields, with whitespace trimmed."""
        del test_datajson_dataset['distribution']
        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)
        transformer.ckan_owner_org_id = 'XXXXX'

        origin = transformer.original_dataset
        origin['accessURL'] = "http://urlwithspaces.com  "
        #TODO check why we transform webService if its not used
        origin['webService'] = "http://webService.com  "
        origin['format'] = "distribution format"

        inferred = transformer.infer_resources()

        expected = [
            {
                'accessURL': 'http://urlwithspaces.com',
                'format': 'distribution format',
                'mimetype': 'distribution format',
            },
            {
                'webService': 'http://webService.com',
                'format': 'distribution format',
                'mimetype': 'distribution format',
            },
        ]
        assert inferred == expected
Example #14
0
    def test_fix_fields(self, test_datajson_dataset):
        """`fix_fields` normalizes each supported field as expected."""
        transformer = DataJSONSchema1_1(schema='usmetadata',
                                        original_dataset=test_datajson_dataset)
        transformer.ckan_owner_org_id = 'XXXXX'

        # (field name, raw value, expected fixed value), checked in order.
        cases = (
            ('tags', ['FOB', 'wholesale market'],
             [{'name': 'fob'}, {'name': 'wholesale-market'}]),
            ('contact_email', 'mailto:[email protected]', '*****@*****.**'),
            ('maintainer_email', 'mailto:[email protected]', '*****@*****.**'),
            ('extras__bureauCode', ['list', 'items'], 'list,items'),
            ('extras__programCode', ['list', 'items'], 'list,items'),
            ('accrual_periodicity', 'irregular', 'not updated'),
        )
        for field_name, raw_value, expected in cases:
            fixed = transformer.fix_fields(field_name, raw_value)
            assert fixed == expected
    def test_datajson_1_1_to_ckan(self, test_datajson_dataset):
        """End-to-end transformation checks, including the publisher hierarchy."""

        def extras_for(dataset, key):
            # Values of all extras stored under `key`.
            return [
                item['value'] for item in dataset['extras']
                if item['key'] == key
            ]

        def check_publisher(dataset):
            # The publisher lives in a top-level key or in the extras,
            # depending on the schema in use.
            if transformer.schema == 'usmetadata':
                assert dataset[
                    'publisher'] == 'Agricultural Marketing Service'
            else:
                assert ['Agricultural Marketing Service'] == extras_for(
                    dataset, 'publisher')

        def retransform():
            # Hand the mutated dataset back to the transformer and run it again.
            transformer.original_dataset = test_datajson_dataset
            return transformer.transform_to_ckan_dataset()

        transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
        # ORG is required!
        transformer.ckan_owner_org_id = 'XXXX'

        transformed = transformer.transform_to_ckan_dataset()

        assert transformed['owner_org'] == 'XXXX'
        assert transformed['notes'] == 'Some notes ...'
        assert len(transformed['resources']) == 2

        if transformer.schema == 'usmetadata':
            assert transformed['contact_email'] == '*****@*****.**'
            # test *Code
            assert transformed['bureau_code'] == '005:45'
            assert transformed['program_code'] == '005:047'
        else:
            assert transformed['maintainer_email'] == '*****@*****.**'
            # test *Code
            # TODO check what we expect here
            # (nested lists such as [['005:45']] or the flat strings below)
            assert ['005:45'] == extras_for(transformed, 'bureauCode')
            assert ['005:047'] == extras_for(transformed, 'programCode')
        # test publisher processor
        check_publisher(transformed)

        assert len(transformed['tags']) == 2
        assert transformed['license_id'] == 'cc-by'  # transformation
        assert [] == extras_for(transformed, 'publisher_hierarchy')

        # test publisher subOrganizationOf
        test_datajson_dataset['publisher']['subOrganizationOf'] = {
            "@type": "org:Organization",
            "name": "Department of Agriculture"
        }
        transformed = retransform()

        check_publisher(transformed)
        assert ['Department of Agriculture > Agricultural Marketing Service'
                ] == extras_for(transformed, 'publisher_hierarchy')

        # One more level on top of the parent organization.
        parent_org = test_datajson_dataset['publisher']['subOrganizationOf']
        parent_org['subOrganizationOf'] = {
            "@type": "org:Organization",
            "name": "USA GOV"
        }
        transformed = retransform()

        check_publisher(transformed)
        assert [
            'USA GOV > Department of Agriculture > Agricultural Marketing Service'
        ] == extras_for(transformed, 'publisher_hierarchy')

        # The harvest source id is carried over as an extra.
        test_datajson_dataset['harvest_source_id'] = 'XXXXX'
        transformed = retransform()
        assert ['XXXXX'] == extras_for(transformed, 'harvest_source_id')
    def test_resources(self, test_datajson_dataset):
        """Transform with pre-existing CKAN resources and check the resulting resources.

        Two resources are expected in the output; each is identified by its
        URL and checked field by field. Any other URL fails the test.
        """
        djss = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                                 schema='usmetadata')
        # ORG is required!
        djss.ckan_owner_org_id = 'XXXX'

        # sample from CKAN results
        existing_resources = [
            {  # the first is a real CKAN result from a data.json distribution/resource on test_datajson_dataset
                "conformsTo": "https://management.cio.gov/schema/",
                "cache_last_updated": None,
                "describedByType": "application/json",
                "package_id": "d84cac16-307f-4ed9-8353-82d303e2b581",
                "webstore_last_updated": None,
                "id": "d0eb660c-7734-4fe1-b106-70f817f1c99d",
                "size": None,
                "state": "active",
                "describedBy":
                "https://management.cio.gov/schemaexamples/costSavingsAvoidanceSchema.json",
                "hash": "",
                "description": "costsavings.json",
                "format": "JSON",
                "tracking_summary": {
                    "total": 20,
                    "recent": 1
                },
                "mimetype_inner": None,
                "url_type": None,
                "revision_id": "55598e72-79d2-4679-8095-aa4b1e67b2f5",
                "mimetype": "application/json",
                "cache_url": None,
                "name": "JSON File",
                "created": "2018-02-03T23:39:07.247009",
                "url": "http://www.usda.gov/digitalstrategy/costsavings.json",
                "webstore_url": None,
                "last_modified": None,
                "position": 0,
                "no_real_name": "True",
                "resource_type": None
            },
            {
                "cache_last_updated": None,
                "package_id": "6fdad934-75a4-44d3-aced-2a69a289356d",
                "webstore_last_updated": None,
                "id": "280dff75-cace-458a-bc4d-ff7c67a8366c",
                "size": None,
                "state": "active",
                "hash": "",
                "description": "Query tool",
                "format": "HTML",
                "tracking_summary": {
                    "total": 1542,
                    "recent": 41
                },
                "last_modified": None,
                "url_type": None,
                "mimetype": "text/html",
                "cache_url": None,
                "name": "Poverty",
                "created": "2018-02-04T00:02:06.320564",
                "url":
                "http://www.ers.usda.gov/data-products/county-level-data-sets/poverty.aspx",
                "webstore_url": None,
                "mimetype_inner": None,
                "position": 0,
                "revision_id": "ffb7058b-2606-4a13-9669-ccfde2547ff7",
                "resource_type": None
            }
        ]

        ckan_dataset = djss.transform_to_ckan_dataset(
            existing_resources=existing_resources)

        assert len(ckan_dataset['resources']) == 2

        # we expect for one dataset with an ID (merged)
        for resource in ckan_dataset['resources']:
            if resource['url'] == 'http://marketnews.usda.gov/':
                assert resource['format'] == 'text/html'
                assert resource['mimetype'] == 'text/html'
                assert resource['description'] == ''
                assert resource['name'] == 'Web Page'
            elif resource[
                    'url'] == "http://www.usda.gov/digitalstrategy/costsavings.json":
                assert resource['format'] == 'application/json'
                assert resource['mimetype'] == 'application/json'
                assert resource['description'] == ''
                assert 'name' not in resource
            else:
                # Fail explicitly and name the offending URL; comparing a
                # string literal to False gives no useful diagnostic.
                raise AssertionError('Unexpected URL: {}'.format(
                    resource['url']))
Example #17
0
 def test_get_field_mapping(self, test_datajson_dataset,
                            datajson_mapped_fields):
     """The default-schema transformer exposes the expected field mapping."""
     transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
     actual_mapping = transformer.mapped_fields
     assert actual_mapping == datajson_mapped_fields
 def test_identify_origin_element(self, test_datajson_dataset):
     """Double-underscore paths resolve to the contact point's values."""
     transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
     contact_name = transformer.identify_origin_element('contactPoint__fn')
     contact_email = transformer.identify_origin_element(
         'contactPoint__hasEmail')
     assert contact_name == 'Fred Teensma'
     assert contact_email == 'mailto:[email protected]'
 def test_validate_final_dataset(self, test_datajson_dataset):
     """Validating before any transformation fails and reports the empty `name`."""
     transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
     is_valid = transformer.validate_final_dataset()
     assert is_valid == False
     assert '"name" field could not be empty' in transformer.errors
Example #20
0
    def test_transform_to_ckan_dataset(self, test_datajson_dataset, caplog):
        """Check the full transformation output and the log lines it emits.

        Without an owner organization the transformation returns ``None``.
        With one set, and an existing resource passed in, the whole CKAN
        dataset dict is compared against the expected value.
        """
        djs = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
        result = djs.transform_to_ckan_dataset()

        # Identity comparison is the idiomatic check for None.
        assert result is None

        djs.ckan_owner_org_id = 'XXXXX'
        result = djs.transform_to_ckan_dataset(existing_resources=[{
            'url': 'http://marketnews.usda.gov/',
            'id': '1'
        }])

        assert 'Transforming data.json dataset USDA-26521' in caplog.text
        assert 'Dataset transformed USDA-26521 OK' in caplog.text
        assert 'Connecting fields "name", "name"' in caplog.text
        assert 'No data in origin for "name"' in caplog.text
        assert 'Connected OK fields "title"="Fruit and Vegetable Market News Search"' in caplog.text
        assert result == {
            'name': 'fruit-and-vegetable-market-news-search',
            'title': 'Fruit and Vegetable Market News Search',
            'owner_org': 'XXXXX',
            'private': False,
            'maintainer': 'Fred Teensma',
            'maintainer_email': '*****@*****.**',
            'notes': 'Some notes ...',
            'state': 'active',
            'resources': [{
                # The existing resource with the same URL contributes its id.
                'url': 'http://marketnews.usda.gov/',
                'description': '',
                'format': 'text/html',
                'name': 'Web Page',
                'mimetype': 'text/html',
                'id': '1'
            }, {
                'url': 'http://www.usda.gov/digitalstrategy/costsavings.json',
                'description': '',
                'format': 'application/json',
                'mimetype': 'application/json',
                'conformsTo': 'https://management.cio.gov/schema/',
                'describedBy':
                'https://management.cio.gov/schemaexamples/costSavingsAvoidanceSchema.json',
                'describedByType': 'application/json'
            }],
            'tags': [{
                'name': 'fob'
            }, {
                'name': 'wholesale-market'
            }],
            'extras': [{
                'key': 'resource-type',
                'value': 'Dataset'
            }, {
                'key': 'modified',
                'value': '2014-12-23'
            }, {
                'key': 'identifier',
                'value': 'USDA-26521'
            }, {
                'key': 'accessLevel',
                'value': ''
            }, {
                'key': 'bureauCode',
                'value': '005:45'
            }, {
                'key': 'programCode',
                'value': '005:047'
            }, {
                'key': 'license',
                'value': 'https://creativecommons.org/licenses/by/4.0'
            }, {
                'key': 'source_datajson_identifier',
                'value': True
            }, {
                'key': 'publisher',
                'value': 'Agricultural Marketing Service'
            }],
            'tag_string': 'fob,wholesale-market',
            'license_id': 'cc-by'
        }
 def test_build_tags(self, test_datajson_dataset):
     """Raw tag strings come back as lowercase, hyphenated `name` dicts."""
     transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
     transformer.ckan_owner_org_id = 'XXXXX'
     raw_tags = ['A tag ', 'Another tag ']
     expected_tags = [{'name': 'a-tag'}, {'name': 'another-tag'}]
     assert transformer.build_tags(raw_tags) == expected_tags
 def test_get_accrual_periodicity(self, test_datajson_dataset):
     """With `reverse=True`, 'irregular' maps to 'not updated'."""
     transformer = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
     periodicity = transformer.get_accrual_periodicity('irregular',
                                                       reverse=True)
     assert periodicity == 'not updated'