    def test_dataset_json_ld_with_at_graph(self):

        contents = self._get_file_contents('catalog_with_at_graph.jsonld')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='json-ld')

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset = datasets[0]
        extras = dict((e['key'], e['value']) for e in dataset['extras'])

        assert dataset['title'] == 'Title dataset'

        assert extras['contact_name'] == 'Jane Doe'
        # mailto gets removed for storage and is added again on output
        assert extras['contact_email'] == '*****@*****.**'

        assert len(dataset['resources']) == 1

        resource = dataset['resources'][0]
        assert resource['name'] == u'download.zip'
        assert resource['url'] == u'http://example2.org/files/download.zip'
        assert resource[
            'access_url'] == u'https://ckan.example.org/dataset/d4ce4e6e-ab89-44cb-bf5c-33a162c234de/resource/a289c289-55c9-410f-b4c7-f88e5f6f4e47'
        assert resource[
            'download_url'] == u'http://example2.org/files/download.zip'
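
The pattern above repeats throughout these examples: build an RDFParser with one or more profiles, feed it a serialized graph, and iterate the CKAN-style dataset dicts it yields. A minimal standalone sketch of that flow, assuming RDFParser can be imported from ckanext.dcat.processors (the import path is not shown in these examples and is an assumption):

# Minimal sketch of the parse-and-iterate pattern used throughout these tests.
# Assumption: RDFParser is importable from ckanext.dcat.processors.
from ckanext.dcat.processors import RDFParser

TTL = '''
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct:  <http://purl.org/dc/terms/> .

<http://example.org/dataset/1> a dcat:Dataset ;
    dct:title "Example dataset" .
'''

parser = RDFParser(profiles=['euro_dcat_ap'])
parser.parse(TTL, _format='turtle')

for dataset_dict in parser.datasets():
    # Each item is a CKAN-style dict with 'title', 'extras', 'resources', ...
    print(dataset_dict['title'])
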
Example #2
    def test_dataset_ttl(self):

        dataset = factories.Dataset(notes='Test dataset')

        url = url_for('dcat_dataset', _id=dataset['id'], _format='ttl')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'text/turtle')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents('catalog_pod.jsonld')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='json-ld')

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset = datasets[0]
        extras = dict((e['key'], e['value']) for e in dataset['extras'])

        assert dataset['title'] == 'U.S. Widget Manufacturing Statistics'

        assert extras['contact_name'] == 'Jane Doe'
        # mailto gets removed for storage and is added again on output
        assert extras['contact_email'] == '*****@*****.**'
        assert extras['publisher_name'] == 'Widget Services'
        assert extras['publisher_email'] == '*****@*****.**'

        assert len(dataset['resources']) == 4

        resource = [
            r for r in dataset['resources'] if r['name'] == 'widgets.csv'
        ][0]
        assert resource['name'] == u'widgets.csv'
        assert resource[
            'url'] == u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv'
        assert resource[
            'download_url'] == u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv'
Example #4
    def test_parse_dataset_default_lang_de(self):
        maxrdf = self._get_max_rdf()

        p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])

        p.parse(maxrdf)
        self._add_basic_fields_with_languages(p)

        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        dataset = datasets[0]

        # Title and description should be in the default language "de"
        self.assertEqual(dataset.get('title'),
                         u'Naturräume Geest und Marsch (DE)')
        self.assertEqual(
            dataset.get('notes'),
            u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt. (DE)'
        )
        # Publisher and ContactPoint
        extras = dataset.get('extras')
        self.assertTrue(len(extras) > 0)
        self._assert_extras_string(
            extras, 'publisher_name',
            u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz (DE)')
        self._assert_extras_string(extras, 'contact_name',
                                   u'Herr Dr. Michael Schröder (DE)')
        # Resources
        self._assert_resource_lang(dataset, 'DE')
    def test_dataset_compatibility_mode(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset = datasets[0]

        def _get_extra_value(key):
            v = [
                extra['value'] for extra in dataset['extras']
                if extra['key'] == key
            ]
            return v[0] if v else None

        assert _get_extra_value('dcat_issued') == u'2012-05-10'
        assert _get_extra_value('dcat_modified') == u'2012-05-10T21:04:00'
        assert _get_extra_value(
            'dcat_publisher_name') == 'Publishing Organization for dataset 1'
        assert _get_extra_value('dcat_publisher_email') == '*****@*****.**'
        assert _get_extra_value('language') == 'ca,en,es'
Example #6
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents('catalog_pod.jsonld')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='json-ld')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = dict((e['key'], e['value']) for e in dataset['extras'])

        eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')

        eq_(extras['contact_name'], 'Jane Doe')
        eq_(extras['contact_email'], 'mailto:[email protected]')
        eq_(extras['publisher_name'], 'Widget Services')
        eq_(extras['publisher_email'], '*****@*****.**')

        eq_(len(dataset['resources']), 4)

        resource = [
            r for r in dataset['resources'] if r['name'] == 'widgets.csv'
        ][0]
        eq_(resource['name'], u'widgets.csv')
        eq_(
            resource['url'],
            u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
        eq_(
            resource['download_url'],
            u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
Example #7
    def test_catalog_modified_date(self):

        dataset1 = factories.Dataset(title='First dataset')
        time.sleep(1)
        dataset2 = factories.Dataset(title='Second dataset')

        url = url_for('dcat_catalog',
                      _format='ttl',
                      modified_since=dataset2['metadata_modified'])

        app = self._get_test_app()

        response = app.get(url)

        content = response.body

        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        eq_(dcat_datasets[0]['title'], dataset2['title'])
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents("catalog_pod.jsonld")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents, _format="json-ld")

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = dict((e["key"], e["value"]) for e in dataset["extras"])

        eq_(dataset["title"], "U.S. Widget Manufacturing Statistics")

        eq_(extras["contact_name"], "Jane Doe")
        eq_(extras["contact_email"], "mailto:[email protected]")
        eq_(extras["publisher_name"], "Widget Services")
        eq_(extras["publisher_email"], "*****@*****.**")

        eq_(len(dataset["resources"]), 4)

        resource = [r for r in dataset["resources"] if r["name"] == "widgets.csv"][0]
        eq_(resource["name"], u"widgets.csv")
        eq_(resource["url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
        eq_(resource["download_url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
    def test_dataset_ttl(self):

        dataset = factories.Dataset(notes="Test dataset")

        url = url_for("dcat_dataset", _id=dataset["id"], _format="ttl")

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers["Content-Type"], "text/turtle")

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format="turtle")

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset["title"], dataset["title"])
        eq_(dcat_dataset["notes"], dataset["notes"])
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents('catalog_pod.jsonld')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='json-ld')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = dict((e['key'], e['value']) for e in dataset['extras'])

        eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')

        eq_(extras['contact_name'], 'Jane Doe')
        eq_(extras['contact_email'], 'mailto:[email protected]')
        eq_(extras['publisher_name'], 'Widget Services')
        eq_(extras['publisher_email'], '*****@*****.**')

        eq_(len(dataset['resources']), 4)

        resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0]
        eq_(resource['name'], u'widgets.csv')
        eq_(resource['url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
        eq_(resource['download_url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
Example #11
    def test_dataset_turtle_1(self):

        contents = self._get_file_contents('dataset_deri.ttl')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='n3')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        eq_(dataset['title'], 'Abandoned Vehicles')
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]
        eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
        eq_(
            resource['url'],
            u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv'
        )
        eq_(resource['uri'],
            u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
    def test_dataset_ttl(self):

        dataset = factories.Dataset(
            notes='Test dataset'
        )

        url = url_for('dcat_dataset', _id=dataset['id'], _format='ttl')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'text/turtle')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
Example #13
    def test_subthemes(self):

        load_themes()

        subthemes = [{
            'theme':
            'AGRI',
            'subthemes': [
                'http://eurovoc.europa.eu/100253',
                'http://eurovoc.europa.eu/100258'
            ]
        }, {
            'theme': 'ENVI',
            'subthemes': []
        }]

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{
                'name': 'Tag 1'
            }, {
                'name': 'Tag 2'
            }],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': json.dumps(subthemes),
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())

        assert len(datasets) == 1
        d = datasets[0]
        themes = json.loads(dataset['theme'])
        assert (len(themes) == len(subthemes) == 2)
        for t in themes:
            if t['theme'] == 'ENVI':
                assert t['subthemes'] == []
            elif t['theme'] == 'AGRI':
                assert set(t['subthemes']) == set(subthemes[0]['subthemes'])
            else:
                assert False, "Unknown theme: {}".format(t)
    def test_catalog_modified_date(self):

        dataset1 = factories.Dataset(title='First dataset')
        time.sleep(1)
        dataset2 = factories.Dataset(title='Second dataset')

        url = url_for('dcat_catalog',
                      _format='ttl',
                      modified_since=dataset2['metadata_modified'])

        app = self._get_test_app()

        response = app.get(url)

        content = response.body

        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        eq_(dcat_datasets[0]['title'], dataset2['title'])
    def test_publisher(self):

        contents = self._get_file_contents('catalog_dati_unibo.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        g = p.g

        datasets = [d for d in p.datasets()]
        assert len(datasets) > 1
        for d in datasets:
            did = d['identifier']
            pname = d.get('publisher_name')
            pid = d.get('publisher_identifier')
            dat_ref = list(g.subjects(DCT.identifier, Literal(did)))[0]
            pub_ref = g.value(dat_ref, DCT.publisher)
            pubnames = list(g.objects(pub_ref, FOAF.name))
            if not pubnames:
                assert pname is None and pid is None,\
                    "Got {}/{} for publisher, when no ref in graph".format(pname, pid)
            else:
                assert pname and pid, "no pname {} and pid {} for {}".format(pname, pid, pubnames)

                lang_hit = False
                for lname in pubnames:
                    if hasattr(lname, 'lang'):
                        if lname.lang and lname.lang == DEFAULT_LANG:
                            lang_hit = pname == lname.value
                    else:
                        if not lang_hit:
                            lang_hit = pname == lname.value
                assert lang_hit, "There should be lang hit"
Example #16
    def test_parse_dataset_default_lang_not_in_graph(self):
        maxrdf = self._get_max_rdf()

        p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])

        p.parse(maxrdf)
        self._add_basic_fields_with_languages(p)

        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        dataset = datasets[0]

        # Title and description come back in an arbitrary language
        self.assertIn(u'Naturräume Geest und Marsch', dataset.get('title'))
        self.assertIn(
            u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt',
            dataset.get('notes'))
        # Publisher and ContactPoint
        extras = dataset.get('extras')
        self.assertTrue(len(extras) > 0)
        self.assertIn(u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz', self._get_value_from_extras(extras, 'publisher_name'))
        self.assertIn(u'Herr Dr. Michael Schröder', self._get_value_from_extras(extras, 'contact_name'))
        # Resources
        resources = dataset.get('resources')
        self.assertEqual(len(resources), 2)
        for res in resources:
            # Resource name and description come back in an arbitrary language
            self.assertIn(u'Naturräume Geest und Marsch', res.get('name'))
            self.assertIn(
                u'Das ist eine deutsche Beschreibung der Distribution',
                res.get('description'))
    def test_creators(self):

        creators = [{'creator_name': {DEFAULT_LANG: 'abc', 'it': 'abc it'}, 'creator_identifier': "ABC"},
                    {'creator_name': {DEFAULT_LANG: 'cde', 'it': 'cde it'}, 'creator_identifier': "CDE"},
                    ]
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01',
            'temporal_end': '2016-11-30',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': '{ECON,ENVI}',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'creator': json.dumps(creators)
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
        
        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())
        
        assert len(datasets) == 1
        d = datasets[0]
        creators.append({'creator_identifier': dataset['creator_identifier'],
                         'creator_name': {DEFAULT_LANG: dataset['creator_name']}})

        creators_dict = dict((v['creator_identifier'], v) for v in creators)

        creators_in = json.loads(d['creator'])

        for c in creators_in:
            assert c['creator_identifier'] in creators_dict.keys(), "no {} key in {}".format(c['creator_identifier'],
                                                                                             creators_dict.keys())
            assert c['creator_name'] == creators_dict[c['creator_identifier']]['creator_name'],\
                "{} vs {}".format(c['creator_name'], creators_dict[c['creator_identifier']]['creator_name'])
        for c in creators_dict.keys():
            assert c in [_c['creator_identifier'] for _c in creators_in]
            cdata = creators_dict[c]
            assert cdata in creators_in
    def test_temporal_coverage(self):

        load_themes()
        temporal_coverage = [{'temporal_start': '2001-01-01T00:00:00', 'temporal_end': '2001-02-01T10:11:12'},
                             {'temporal_start': '2001-01-01T00:00:00', 'temporal_end': '2001-02-01T10:11:12'},
                            ]
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01T00:00:00',
            'temporal_end': '2016-11-30T00:00:00',
            'temporal_coverage': json.dumps(temporal_coverage),
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': '{ECON,ENVI}',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
        
        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())
        
        assert len(datasets) == 1
        d = datasets[0]

        temporal_coverage.append({'temporal_start': dataset['temporal_start'],
                                  'temporal_end': dataset['temporal_end']})

        try:
            validators.dcatapit_temporal_coverage(d['temporal_coverage'], {})
            # this should not raise an exception
            assert True
        except validators.Invalid as err:
            assert False, "Temporal coverage should be valid: {}".format(err)
    def test_alternate_identifiers(self):

        contents = self._get_file_contents('dataset_identifier.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])
        p.parse(contents)
        g = p.g
        datasets = [d for d in p.datasets()]
        assert len(datasets) == 1
        assert datasets[0]['alternate_identifier'] == '[{"identifier": "ISBN:alt id 123", "agent": {}}]', \
            datasets[0]['alternate_identifier']
Example #20
    def test_catalog(self):

        contents = self._get_file_contents('catalog.xml')

        p = RDFParser(profiles=['swiss_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 2)
Example #21
    def test_alternate_identifiers(self):

        with open(get_example_file('dataset_identifier.rdf'), 'r') as f:
            contents = f.read()

        p = RDFParser(profiles=['it_dcat_ap'])
        p.parse(contents)

        datasets = [d for d in p.datasets()]
        assert len(datasets) == 1
        assert datasets[0]['alternate_identifier'] == '[{"identifier": "ISBN:alt id 123", "agent": {}}]',\
            datasets[0]['alternate_identifier']
    def test_parse_without_pagination(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
        <rdfs:SomeClass rdf:about="http://example.org">
            <rdfs:label>Some label</rdfs:label>
        </rdfs:SomeClass>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        eq_(p.next_page(), None)
Example #23
    def test_parse_without_pagination(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
        <rdfs:SomeClass rdf:about="http://example.org">
            <rdfs:label>Some label</rdfs:label>
        </rdfs:SomeClass>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        assert p.next_page() is None
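
When a catalog is paginated, next_page() exposes the hydra paging link that the harvester examples further down follow in a loop. A hedged sketch of that loop; fetch_page() and handle_dataset() are hypothetical callables for downloading a page and processing one dataset dict:

# Hedged sketch of paginated parsing built on RDFParser.next_page().
# Assumptions: the import path, and fetch_page()/handle_dataset(), which are
# hypothetical callables supplied by the caller.
from ckanext.dcat.processors import RDFParser


def walk_catalog(first_page_url, fetch_page, handle_dataset, rdf_format=None):
    next_page_url = first_page_url
    while next_page_url:
        content = fetch_page(next_page_url)

        parser = RDFParser()
        parser.parse(content, _format=rdf_format)

        for dataset_dict in parser.datasets():
            handle_dataset(dataset_dict)

        # next_page() returns None when the document has no next-page link,
        # as the pagination tests above show
        next_page_url = parser.next_page()
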
    def test_parse_data_different_format(self):

        data = '''
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

        <http://example.org> a rdfs:SomeClass ;
            rdfs:label "Some label" .
        '''

        p = RDFParser()

        eq_(len(p.g), 0)

        p.parse(data, _format='n3')

        eq_(len(p.g), 2)
Example #25
    def test_parse_data_different_format(self):

        data = '''
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

        <http://example.org> a rdfs:SomeClass ;
            rdfs:label "Some label" .
        '''

        p = RDFParser()

        assert len(p.g) == 0

        p.parse(data, _format='n3')

        assert len(p.g) == 2
    def test_catalog_xml_rdf(self):

        contents = self._get_file_contents("catalog.rdf")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 2)

        dataset = datasets[0] if datasets[0]["title"] == "Example dataset 1" else datasets[1]

        eq_(dataset["title"], "Example dataset 1")
        eq_(len(dataset["resources"]), 3)
        eq_(len(dataset["tags"]), 2)
    def test_catalog_xml_rdf(self):

        contents = self._get_file_contents('catalog.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 2

        dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1'
                   else datasets[1])

        assert dataset['title'] == 'Example dataset 1'
        assert len(dataset['resources']) == 3
        assert len(dataset['tags']) == 2
Example #28
    def test_dataset_show_without_format(self):
        dataset = factories.Dataset(notes='Test dataset')

        content = helpers.call_action('dcat_dataset_show', id=dataset['id'])

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content)

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
    def test_catalog_xml_rdf(self):

        contents = self._get_file_contents('catalog.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 2)

        dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1'
                   else datasets[1])

        eq_(dataset['title'], 'Example dataset 1')
        eq_(len(dataset['resources']), 3)
        eq_(len(dataset['tags']), 2)
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        # Get file contents
        url = harvest_job.source.url

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            url, before_download_errors = harvester.before_download(
                url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not url:
                return False

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")
        content, rdf_format = self._get_content_and_type(
            url, harvest_job, 1, content_type=rdf_format)

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return False

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return False
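
The gather_stage above passes the source URL and downloaded content through every registered IDCATRDFHarvester implementation. A hedged sketch of a plugin providing those hooks; the method signatures are inferred from the calls above (each hook returns the possibly modified value plus a list of error strings), and the interface import path is an assumption:

# Hedged sketch of a plugin implementing the IDCATRDFHarvester hooks that
# gather_stage calls above. Signatures inferred from those calls; the
# interface import path is an assumption.
import ckan.plugins as plugins

from ckanext.dcat.interfaces import IDCATRDFHarvester


class ExampleDCATHarvesterPlugin(plugins.SingletonPlugin):
    plugins.implements(IDCATRDFHarvester, inherit=True)

    def before_download(self, url, harvest_job):
        # Rewrite the URL here, or return a falsy URL to abort the job
        return url, []

    def after_download(self, content, harvest_job):
        # Post-process the downloaded content before it is parsed
        return content, []
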
Example #31
    def test_parse_data(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
        <rdfs:SomeClass rdf:about="http://example.org">
            <rdfs:label>Some label</rdfs:label>
        </rdfs:SomeClass>
        </rdf:RDF>
        '''

        p = RDFParser()

        eq_(len(p.g), 0)

        p.parse(data)

        eq_(len(p.g), 2)
Example #32
    def parse_chunk(self, harvest_job, content, rdf_format, guids_in_source, object_ids):
        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return False

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return False
Example #33
def test_dataset_show_with_format():
    dataset = factories.Dataset(notes='Test dataset')

    content = helpers.call_action('dcat_dataset_show',
                                  id=dataset['id'],
                                  _format='xml')

    # Parse the contents to check it's an actual serialization
    p = RDFParser()

    p.parse(content, _format='xml')

    dcat_datasets = [d for d in p.datasets()]

    assert len(dcat_datasets) == 1

    dcat_dataset = dcat_datasets[0]

    assert dcat_dataset['title'] == dataset['title']
    assert dcat_dataset['notes'] == dataset['notes']
Example #34
    def test_dataset_show_without_format(self):
        dataset = factories.Dataset(
            notes='Test dataset'
        )

        content = helpers.call_action('dcat_dataset_show', id=dataset['id'])

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content)

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
    def test_dataset_turtle_1(self):

        contents = self._get_file_contents("dataset_deri.ttl")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents, _format="n3")

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        eq_(dataset["title"], "Abandoned Vehicles")
        eq_(len(dataset["resources"]), 1)

        resource = dataset["resources"][0]
        eq_(resource["name"], u"CSV distribution of: Abandoned Vehicles")
        eq_(resource["url"], u"http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv")
        eq_(resource["uri"], u"http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv")
    def test_dataset_turtle_1(self):

        contents = self._get_file_contents('dataset_deri.ttl')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='n3')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        eq_(dataset['title'], 'Abandoned Vehicles')
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]
        eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
        eq_(resource['url'], u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
        eq_(resource['uri'], u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
Example #37
    def test_dataset_issued_with_year_before_1900(self):

        contents = self._get_file_contents('1894.xml')

        p = RDFParser(profiles=['swiss_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Check date values
        eq_(dataset['issued'], -2398377600)
        issued = datetime.fromtimestamp(dataset['issued'])
        eq_(issued.date().isoformat(), u'1893-12-31')

        eq_(dataset['modified'], 1524528000)
        modified = datetime.fromtimestamp(dataset['modified'])
        eq_(modified.date().isoformat(), u'2018-04-24')
Example #38
    def test_parse_pagination_last_page(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
         <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
            <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        assert p.next_page() is None
    def test_parse_pagination_last_page(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
         <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
            <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        eq_(p.next_page(), None)
Example #40
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        # Get file contents
        url = harvest_job.source.url

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            url, before_download_errors = harvester.before_download(url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not url:
                return False

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")
        content, rdf_format = self._get_content_and_type(url, harvest_job, 1, content_type=rdf_format)

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return False

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return False
Example #41
    def test_catalog_ttl(self):

        for i in xrange(4):
            factories.Dataset()

        url = url_for('dcat_catalog', _format='ttl')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'text/turtle')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 4)
    def test_dataset_compatibility_mode(self):

        contents = self._get_file_contents("dataset.rdf")

        p = RDFParser(profiles=["euro_dcat_ap"], compatibility_mode=True)

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        def _get_extra_value(key):
            v = [extra["value"] for extra in dataset["extras"] if extra["key"] == key]
            return v[0] if v else None

        eq_(_get_extra_value("dcat_issued"), u"2012-05-10")
        eq_(_get_extra_value("dcat_modified"), u"2012-05-10T21:04:00")
        eq_(_get_extra_value("dcat_publisher_name"), "Publishing Organization for dataset 1")
        eq_(_get_extra_value("dcat_publisher_email"), "*****@*****.**")
        eq_(_get_extra_value("language"), "ca,en,es")
Example #43
    def test_catalog_ttl(self):

        for i in xrange(4):
            factories.Dataset()

        url = url_for("dcat_catalog", _format="ttl")

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers["Content-Type"], "text/turtle")

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format="turtle")

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 4)
Example #44
    def test_catalog_modified_date(self):

        dataset1 = factories.Dataset(title="First dataset")
        time.sleep(1)
        dataset2 = factories.Dataset(title="Second dataset")

        url = url_for("dcat_catalog", _format="ttl", modified_since=dataset2["metadata_modified"])

        app = self._get_test_app()

        response = app.get(url)

        content = response.body

        p = RDFParser()

        p.parse(content, _format="turtle")

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        eq_(dcat_datasets[0]["title"], dataset2["title"])
    def test_catalog_default(self):

        for i in xrange(4):
            factories.Dataset()

        url = url_for('dcat_catalog', _format='rdf')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'application/rdf+xml')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='xml')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 4)
    def _update_package_in_triplestore(self, package_id, package_org):
        '''Updates the package with the given package ID in the triple store.'''
        uri = 'n/a'
        # Get uri of dataset
        rdf = self._get_rdf(package_id)
        rdf_parser = RDFParser()
        rdf_parser.parse(rdf)
        # Should be only one dataset
        for uri in rdf_parser._datasets():
            self.triplestore_client.delete_dataset_in_triplestore(uri)
            self.triplestore_client.create_dataset_in_triplestore(rdf, uri)

            # shacl-validate the graph
            validation_rdf = self.shacl_validation_client.validate(
                rdf, uri, package_org)
            if validation_rdf:
                # update in mqa-triplestore
                self.triplestore_client.delete_dataset_in_triplestore_mqa(
                    uri, package_org)
                self.triplestore_client.create_dataset_in_triplestore_mqa(
                    validation_rdf, uri)

        return uri
    def test_dataset_compatibility_mode(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        def _get_extra_value(key):
            v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
            return v[0] if v else None

        eq_(_get_extra_value('dcat_issued'), u'2012-05-10')
        eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00')
        eq_(_get_extra_value('dcat_publisher_name'), 'Publishing Organization for dataset 1')
        eq_(_get_extra_value('dcat_publisher_email'), '*****@*****.**')
        eq_(_get_extra_value('language'), 'ca,en,es')
Example #48
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        # Get file contents
        url = harvest_job.source.url

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            url, before_download_errors = harvester.before_download(url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not url:
                return False

        content = self._get_content(url, harvest_job, 1)

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return False

        # TODO: profiles conf
        parser = RDFParser()
        # TODO: format conf
        try:
            parser.parse(content)
        except RDFParserException as e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return False
Example #49
    def _delete_dataset_in_triplestore(self, harvest_object):
        '''
        Deletes the package with the given package ID in the triple store.
        '''
        try:
            if self.triplestore_client.is_available():
                package_id = harvest_object.package_id
                LOGGER.debug(
                    u'Start deleting dataset with ID %s from triplestore.',
                    package_id)
                context = {'user': self._get_user_name()}
                rdf = toolkit.get_action('dcat_dataset_show')(context, {
                    'id': package_id
                })
                rdf_parser = RDFParser()
                rdf_parser.parse(rdf)
                # Should be only one dataset
                uri = next(rdf_parser._datasets(), None)
                source_dataset = model.Package.get(harvest_object.source.id)
                self._delete_dataset_in_triplestore_by_uri(uri, source_dataset)
        except RDFParserException as ex:
            LOGGER.warn(
                u'Error while parsing the RDF file for dataset with ID %s: %s',
                package_id, ex)
Example #50
    def test_parse_data_raises_on_parse_error(self):

        p = RDFParser()

        data = 'Wrong data'

        with pytest.raises(RDFParserException):
            p.parse('')

        with pytest.raises(RDFParserException):
            p.parse(data)

        with pytest.raises(RDFParserException):
            p.parse(data, _format='n3')
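
Malformed or empty input makes parse() raise RDFParserException, which is why the gather_stage examples wrap the call in a try/except. A small sketch of that defensive call using the except ... as form; the import path is again an assumption:

# Hedged sketch: guard parse() the way the harvester examples do.
# Assumption: RDFParserException is importable next to RDFParser.
from ckanext.dcat.processors import RDFParser, RDFParserException


def parse_or_none(content, rdf_format=None):
    parser = RDFParser()
    try:
        parser.parse(content, _format=rdf_format)
    except RDFParserException as e:
        # The harvesters report this via self._save_gather_error(...)
        print('Error parsing the RDF file: {0}'.format(e))
        return None
    return parser
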
Example #51
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
                                            harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid, job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)

            # get the next page
            next_page_url = parser.next_page()
Example #52
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None
        self._names_taken = []

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

            content_hash = hashlib.md5()
            if content:
                content_hash.update(content)

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning('Remote content was the same even when using a paginated URL, skipping')
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            try:

                source_dataset = model.Package.get(harvest_job.source.id)

                for dataset in parser.datasets():
                    if not dataset.get('name'):
                        dataset['name'] = self._gen_new_name(dataset['title'])
                    if dataset['name'] in self._names_taken:
                        suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1
                        dataset['name'] = '{}-{}'.format(dataset['name'], suffix)
                    self._names_taken.append(dataset['name'])

                    # Unless already set by the parser, get the owner organization (if any)
                    # from the harvest source dataset
                    if not dataset.get('owner_org'):
                        if source_dataset.owner_org:
                            dataset['owner_org'] = source_dataset.owner_org

                    # Try to get a unique identifier for the harvested dataset
                    guid = self._get_guid(dataset, source_url=source_dataset.url)

                    if not guid:
                        self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
                                                harvest_job)
                        continue

                    dataset['extras'].append({'key': 'guid', 'value': guid})
                    guids_in_source.append(guid)

                    obj = HarvestObject(guid=guid, job=harvest_job,
                                        content=json.dumps(dataset))

                    obj.save()
                    object_ids.append(obj.id)
            except Exception as e:
                self._save_gather_error('Error when processing dataset: %r / %s' % (e, traceback.format_exc()),
                                        harvest_job)
                return []
    def test_dataset_all_fields(self):

        contents = self._get_file_contents("dataset.rdf")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Basic fields

        eq_(dataset["title"], u"Zimbabwe Regional Geochemical Survey.")
        eq_(dataset["notes"], u"During the period 1982-86 a team of geologists from the British Geological Survey ...")
        eq_(dataset["url"], "http://dataset.info.org")
        eq_(dataset["version"], "2.3")

        # Tags

        eq_(
            sorted(dataset["tags"], key=lambda k: k["name"]),
            [{"name": u"exploration"}, {"name": u"geochemistry"}, {"name": u"geology"}],
        )
        # Extras

        def _get_extra_value(key):
            v = [extra["value"] for extra in dataset["extras"] if extra["key"] == key]
            return v[0] if v else None

        def _get_extra_value_as_list(key):
            value = _get_extra_value(key)
            return json.loads(value) if value else []

        #  Simple values
        eq_(_get_extra_value("issued"), u"2012-05-10")
        eq_(_get_extra_value("modified"), u"2012-05-10T21:04:00")
        eq_(_get_extra_value("identifier"), u"9df8df51-63db-37a8-e044-0003ba9b0d98")
        eq_(_get_extra_value("version_notes"), u"New schema added")
        eq_(_get_extra_value("temporal_start"), "1905-03-01")
        eq_(_get_extra_value("temporal_end"), "2013-01-05")
        eq_(_get_extra_value("frequency"), "http://purl.org/cld/freq/daily")
        eq_(_get_extra_value("spatial_uri"), "http://publications.europa.eu/mdr/authority/country/ZWE")
        eq_(_get_extra_value("publisher_uri"), "http://orgs.vocab.org/some-org")
        eq_(_get_extra_value("publisher_name"), "Publishing Organization for dataset 1")
        eq_(_get_extra_value("publisher_email"), "*****@*****.**")
        eq_(_get_extra_value("publisher_url"), "http://some.org")
        eq_(_get_extra_value("publisher_type"), "http://purl.org/adms/publishertype/NonProfitOrganisation")
        eq_(_get_extra_value("contact_name"), "Point of Contact")
        eq_(_get_extra_value("contact_email"), "mailto:[email protected]")
        eq_(_get_extra_value("access_rights"), "public")
        eq_(_get_extra_value("provenance"), "Some statement about provenance")
        eq_(_get_extra_value("dcat_type"), "test-type")

        #  Lists
        eq_(sorted(_get_extra_value_as_list("language")), [u"ca", u"en", u"es"])
        eq_(
            sorted(_get_extra_value_as_list("theme")),
            [u"Earth Sciences", u"http://eurovoc.europa.eu/100142", u"http://eurovoc.europa.eu/209065"],
        )
        eq_(sorted(_get_extra_value_as_list("conforms_to")), [u"Standard 1", u"Standard 2"])

        eq_(
            sorted(_get_extra_value_as_list("alternate_identifier")),
            [u"alternate-identifier-1", u"alternate-identifier-2"],
        )
        eq_(
            sorted(_get_extra_value_as_list("documentation")),
            [u"http://dataset.info.org/doc1", u"http://dataset.info.org/doc2"],
        )
        eq_(
            sorted(_get_extra_value_as_list("related_resource")),
            [u"http://dataset.info.org/related1", u"http://dataset.info.org/related2"],
        )
        eq_(
            sorted(_get_extra_value_as_list("has_version")),
            [
                u"https://data.some.org/catalog/datasets/derived-dataset-1",
                u"https://data.some.org/catalog/datasets/derived-dataset-2",
            ],
        )
        eq_(
            sorted(_get_extra_value_as_list("is_version_of")),
            [u"https://data.some.org/catalog/datasets/original-dataset"],
        )
        eq_(
            sorted(_get_extra_value_as_list("source")),
            [
                u"https://data.some.org/catalog/datasets/source-dataset-1",
                u"https://data.some.org/catalog/datasets/source-dataset-2",
            ],
        )
        eq_(
            sorted(_get_extra_value_as_list("sample")),
            [u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample"],
        )

        # Dataset URI
        eq_(_get_extra_value("uri"), u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98")

        # Resources
        eq_(len(dataset["resources"]), 1)

        resource = dataset["resources"][0]

        #  Simple values
        eq_(resource["name"], u"Some website")
        eq_(resource["description"], u"A longer description")
        eq_(resource["format"], u"HTML")
        eq_(resource["mimetype"], u"text/html")
        eq_(resource["issued"], u"2012-05-11")
        eq_(resource["modified"], u"2012-05-01T00:04:06")
        eq_(resource["status"], u"http://purl.org/adms/status/Completed")

        eq_(resource["hash"], u"4304cf2e751e6053c90b1804c89c0ebb758f395a")
        eq_(resource["hash_algorithm"], u"http://spdx.org/rdf/terms#checksumAlgorithm_sha1")

        # Lists
        for item in [
            (
                "documentation",
                [u"http://dataset.info.org/distribution1/doc1", u"http://dataset.info.org/distribution1/doc2"],
            ),
            ("language", [u"ca", u"en", u"es"]),
            ("conforms_to", [u"Standard 1", u"Standard 2"]),
        ]:
            eq_(sorted(json.loads(resource[item[0]])), item[1])

        # These two are likely to need clarification
        eq_(resource["license"], u"http://creativecommons.org/licenses/by/3.0/")
        eq_(resource["rights"], u"Some statement about rights")

        eq_(resource["url"], u"http://www.bgs.ac.uk/gbase/geochemcd/home.html")
        assert "download_url" not in resource

        eq_(resource["size"], 12323)

        # Distribution URI
        eq_(resource["uri"], u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1")
Example #54
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None

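        # Iterate over the catalogue pages: parser.next_page() at the end of
        # each iteration yields the URL of the next page, or None when done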
        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(
                    next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(
                next_page_url, harvest_job, 1, content_type=rdf_format)

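            # Hash the page content so that endpoints which ignore the
            # pagination parameters can be detected and the loop stopped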
            content_hash = hashlib.md5()
            content_hash.update(content)

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning(
                        'Remote content was the same even when using a paginated URL, skipping'
                    )
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(
                    content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error(
                    'Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

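            # Create one harvest object per dataset parsed from this page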
            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    self._save_gather_error(
                        'Could not get a unique identifier for dataset: {0}'.
                        format(dataset), harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid,
                                    job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)

            # get the next page
            next_page_url = parser.next_page()

        # Return the ids of the harvest objects created from all pages
        return object_ids
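# The gather_stage above calls two IDCATRDFHarvester hooks, before_download and
# after_download. Below is a minimal sketch of a plugin providing them; the class
# name is hypothetical and only the two hooks used above are implemented
# (inherit=True keeps the interface defaults for the rest).
import ckan.plugins as p

from ckanext.dcat.interfaces import IDCATRDFHarvester


class ExampleDCATHarvesterPlugin(p.SingletonPlugin):

    # inherit=True keeps the default implementations for any hook
    # not overridden here
    p.implements(IDCATRDFHarvester, inherit=True)

    def before_download(self, url, harvest_job):
        # Return the (possibly rewritten) URL plus a list of error messages
        errors = []
        if url and url.startswith('http://'):
            url = 'https://' + url[len('http://'):]
        return url, errors

    def after_download(self, content, harvest_job):
        # Return the (possibly transformed) content plus a list of error messages
        errors = []
        return content, errors
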
    def test_dataset_all_fields(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Basic fields

        eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
        eq_(dataset['notes'], u'During the period 1982-86 a team of geologists from the British Geological Survey ...')
        eq_(dataset['url'], 'http://dataset.info.org')
        eq_(dataset['version'], '2.3')

        # Tags

        eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{'name': u'exploration'},
                                                               {'name': u'geochemistry'},
                                                               {'name': u'geology'}])
        # Extras

        def _get_extra_value(key):
            v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
            return v[0] if v else None

        def _get_extra_value_as_list(key):
            value = _get_extra_value(key)
            return json.loads(value) if value else []

        #  Simple values
        eq_(_get_extra_value('issued'), u'2012-05-10')
        eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
        eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98')
        eq_(_get_extra_value('alternate_identifier'), u'alternate-identifier-x343')
        eq_(_get_extra_value('version_notes'), u'New schema added')
        eq_(_get_extra_value('temporal_start'), '1905-03-01')
        eq_(_get_extra_value('temporal_end'), '2013-01-05')
        eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
        eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE')
        eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org')
        eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1')
        eq_(_get_extra_value('publisher_email'), '*****@*****.**')
        eq_(_get_extra_value('publisher_url'), 'http://some.org')
        eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
        eq_(_get_extra_value('contact_name'), 'Point of Contact')
        eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')

        #  Lists
        eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en', u'es'])
        eq_(sorted(_get_extra_value_as_list('theme')), [u'Earth Sciences',
                                                        u'http://eurovoc.europa.eu/100142',
                                                        u'http://eurovoc.europa.eu/209065'])
        eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2'])

        # Dataset URI
        eq_(_get_extra_value('uri'), u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98')

        # Resources
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]

        #  Simple values
        eq_(resource['name'], u'Some website')
        eq_(resource['description'], u'A longer description')
        eq_(resource['format'], u'HTML')
        eq_(resource['mimetype'], u'text/html')
        eq_(resource['issued'], u'2012-05-11')
        eq_(resource['modified'], u'2012-05-01T00:04:06')
        eq_(resource['status'], u'http://purl.org/adms/status/Completed')

        # These two are likely to need clarification
        eq_(resource['license'], u'http://creativecommons.org/licenses/by/3.0/')
        eq_(resource['rights'], u'Some statement about rights')

        eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
        assert 'download_url' not in resource

        eq_(resource['size'], 12323)

        # Distribution URI
        eq_(resource['uri'], u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1')