Ejemplo n.º 1
0
    def test_datasets_none_found(self):
        """A parser backed by a freshly created ``Graph`` yields no datasets."""
        parser = RDFParser()
        parser.g = Graph()

        found = list(parser.datasets())

        eq_(len(found), 0)
Ejemplo n.º 2
0
    def test__datasets(self):
        """``RDFParser._datasets()`` yields three items for the default graph."""
        parser = RDFParser()
        parser.g = _default_graph()

        refs = list(parser._datasets())

        eq_(len(refs), 3)
Ejemplo n.º 3
0
    def test_profiles_are_called_on_datasets(self):
        """Every dataset yielded by the parser carries both profile markers.

        The parser is configured with ``MockRDFProfile1`` and
        ``MockRDFProfile2`` and fed the default graph; each yielded dataset
        dict must then hold truthy ``profile_1`` and ``profile_2`` entries
        (presumably set by the respective mock profiles, which are defined
        outside this method).
        """
        p = RDFParser()

        p._profiles = [MockRDFProfile1, MockRDFProfile2]

        p.g = _default_graph()

        datasets = list(p.datasets())

        # Without this check the loop below would pass vacuously if the
        # parser yielded nothing, hiding a regression in dataset discovery.
        assert datasets, 'No datasets were yielded by the parser'

        for dataset in datasets:
            assert dataset["profile_1"]
            assert dataset["profile_2"]
Ejemplo n.º 4
0
    def test_datasets(self):
        """``RDFParser.datasets()`` yields three titled dicts for the default graph."""
        parser = RDFParser()
        parser.g = _default_graph()

        # Count as we go instead of collecting the dicts; only the total and
        # the presence of a "title" key are checked.
        count = 0
        for item in parser.datasets():
            assert "title" in item
            count += 1

        eq_(count, 3)
Ejemplo n.º 5
0
    def test_parse_data_different_format(self):
        """Parsing a prefixed-triples document with ``_format="n3"`` adds two triples."""
        n3_doc = """
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

        <http://example.org> a rdfs:SomeClass ;
            rdfs:label "Some label" .
        """
        parser = RDFParser()

        # The graph starts out empty ...
        triples_before = len(parser.g)
        eq_(triples_before, 0)

        parser.parse(n3_doc, _format="n3")

        # ... and holds the type and label statements afterwards.
        triples_after = len(parser.g)
        eq_(triples_after, 2)
    def test_catalog_xml_rdf(self):
        """``catalog.rdf`` parses into two datasets; check 'Example dataset 1'."""
        rdf_text = self._get_file_contents('catalog.rdf')

        parser = RDFParser(profiles=['euro_dcat_ap'])
        parser.parse(rdf_text)

        found = list(parser.datasets())
        eq_(len(found), 2)

        # The order of the two datasets is not relied upon: pick the first
        # one if its title matches, otherwise fall back to the second.
        if found[0]['title'] == 'Example dataset 1':
            dataset = found[0]
        else:
            dataset = found[1]

        eq_(dataset['title'], 'Example dataset 1')
        eq_(len(dataset['resources']), 3)
        eq_(len(dataset['tags']), 2)
    def test_dataset_spatial_label(self):
        """``dataset_sweden.rdf`` yields one dataset whose ``spatial_text`` extra is 'Stockholm'."""
        rdf_text = self._get_file_contents('dataset_sweden.rdf')

        parser = RDFParser(profiles=['euro_dcat_ap', 'sweden_dcat_ap'])
        parser.parse(rdf_text)

        found = list(parser.datasets())
        eq_(len(found), 1)

        dataset = found[0]

        def _get_extra_value(key):
            # Value of the first extra whose key matches, or None if absent.
            values = [item['value'] for item in dataset['extras']
                      if item['key'] == key]
            if not values:
                return None
            return values[0]

        eq_(_get_extra_value('spatial_text'), u'Stockholm')
Ejemplo n.º 8
0
    def test_parse_data(self):
        """Parsing an RDF/XML string with the default format adds two triples."""
        xml_doc = """<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
        <rdfs:SomeClass rdf:about="http://example.org">
            <rdfs:label>Some label</rdfs:label>
        </rdfs:SomeClass>
        </rdf:RDF>
        """
        parser = RDFParser()

        # The graph starts out empty ...
        triples_before = len(parser.g)
        eq_(triples_before, 0)

        parser.parse(xml_doc)

        # ... and holds the type and label statements afterwards.
        triples_after = len(parser.g)
        eq_(triples_after, 2)
    def test_dataset_turtle_1(self):
        """``dataset_deri.ttl`` (parsed as n3) yields one dataset with one resource."""
        ttl_text = self._get_file_contents('dataset_deri.ttl')

        parser = RDFParser(profiles=['euro_dcat_ap'])
        parser.parse(ttl_text, _format='n3')

        found = list(parser.datasets())
        eq_(len(found), 1)

        dataset = found[0]
        eq_(dataset['title'], 'Abandoned Vehicles')

        resources = dataset['resources']
        eq_(len(resources), 1)

        # Expected resource fields, checked in this order.
        expected_fields = (
            ('name', u'CSV distribution of: Abandoned Vehicles'),
            ('url', u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv'),
            ('uri', u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv'),
        )
        resource = resources[0]
        for field, value in expected_fields:
            eq_(resource[field], value)
    def test_distribution_download_url(self):
        """A distribution's ``dcat:downloadURL`` fills both ``url`` and ``download_url``.

        The graph is built by hand: one dataset with one distribution that
        only carries a ``DCAT.downloadURL`` literal.
        """
        dataset_ref = URIRef("http://example.org/datasets/1")
        distribution_ref = URIRef("http://example.org/datasets/1/ds/1")

        graph = Graph()
        triples = (
            (dataset_ref, RDF.type, DCAT.Dataset),
            (distribution_ref, RDF.type, DCAT.Distribution),
            (distribution_ref, DCAT.downloadURL, Literal('http://download.url.org')),
            (dataset_ref, DCAT.distribution, distribution_ref),
        )
        for triple in triples:
            graph.add(triple)

        parser = RDFParser(profiles=['euro_dcat_ap'])
        parser.g = graph

        found = list(parser.datasets())
        resource = found[0]['resources'][0]

        for field in ('url', 'download_url'):
            eq_(resource[field], u'http://download.url.org')
    def test_dataset_compatibility_mode(self):
        """With ``compatibility_mode=True`` the ``dataset.rdf`` extras use ``dcat_*`` keys."""
        rdf_text = self._get_file_contents('dataset.rdf')

        parser = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)
        parser.parse(rdf_text)

        found = list(parser.datasets())
        eq_(len(found), 1)

        dataset = found[0]

        def _get_extra_value(key):
            # Value of the first extra whose key matches, or None if absent.
            values = [item['value'] for item in dataset['extras']
                      if item['key'] == key]
            if not values:
                return None
            return values[0]

        # NOTE(review): the publisher e-mail literal looks masked
        # ('*****@*****.**') -- confirm against the dataset.rdf fixture.
        expected_extras = (
            ('dcat_issued', u'2012-05-10'),
            ('dcat_modified', u'2012-05-10T21:04:00'),
            ('dcat_publisher_name', 'Publishing Organization for dataset 1'),
            ('dcat_publisher_email', '*****@*****.**'),
            ('language', 'ca,en,es'),
        )
        for key, value in expected_extras:
            eq_(_get_extra_value(key), value)
Ejemplo n.º 12
0
    def gather_stage(self, harvest_job):
        """Download and parse the RDF document behind a harvest source.

        The source URL and, later, the downloaded content are each passed
        through every ``IDCATRDFHarvester`` plugin implementation
        (``before_download`` / ``after_download``). Error messages reported
        by those hooks are stored via ``self._save_gather_error``.

        :param harvest_job: job object; its ``source.url`` is the document
            to fetch.
        :returns: ``False`` when a plugin hands back an empty URL, when there
            is no content after the ``after_download`` hooks, or when parsing
            raises ``RDFParserException``.
            NOTE(review): the code visible here returns nothing after a
            successful parse -- presumably the method continues in the full
            source; confirm.
        """

        log.debug('In DCATRDFHarvester gather_stage')

        # Get file contents
        url = harvest_job.source.url

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            url, before_download_errors = harvester.before_download(url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            # A plugin can veto the download by returning an empty URL.
            if not url:
                return False

        # NOTE(review): the literal 1 is presumably a page number -- confirm
        # against _get_content's signature.
        content = self._get_content(url, harvest_job, 1)

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return False

        # TODO: profiles conf
        parser = RDFParser()
        # TODO: format conf
        try:
            parser.parse(content)
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only "except X, e" form.
        except RDFParserException as e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return False
    def test_dataset_all_fields(self):
        """Check the fields parsed from ``dataset.rdf`` with ``euro_dcat_ap``.

        Parses the fixture, expects exactly one dataset and then verifies its
        basic fields, tags, extras (simple values and JSON-encoded lists),
        the dataset URI and the fields of its single resource.
        """

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Basic fields

        eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
        eq_(dataset['notes'], u'During the period 1982-86 a team of geologists from the British Geological Survey ...')
        eq_(dataset['url'], 'http://dataset.info.org')

        # Tags (sorted by name so the comparison does not depend on order)

        eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{'name': u'exploration'},
                                                               {'name': u'geochemistry'},
                                                               {'name': u'geology'}])
        # Extras

        def _get_extra_value(key):
            # Value of the first extra whose key matches, or None if absent.
            v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
            return v[0] if v else None

        def _get_extra_value_as_list(key):
            # List-valued extras are decoded with json.loads; a missing or
            # empty value gives an empty list.
            value = _get_extra_value(key)
            return json.loads(value) if value else []

        #  Simple values
        eq_(_get_extra_value('issued'), u'2012-05-10')
        eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
        eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98')
        eq_(_get_extra_value('alternate_identifier'), u'alternate-identifier-x343')
        eq_(_get_extra_value('dcat_version'), u'2.3')
        eq_(_get_extra_value('version_notes'), u'New schema added')
        eq_(_get_extra_value('temporal_start'), '1905-03-01')
        eq_(_get_extra_value('temporal_end'), '2013-01-05')
        eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
        eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE')
        eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org')
        eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1')
        # NOTE(review): the two e-mail literals below look masked/obfuscated
        # ('*****@*****.**', '[email protected]') -- confirm against the
        # dataset.rdf fixture.
        eq_(_get_extra_value('publisher_email'), '*****@*****.**')
        eq_(_get_extra_value('publisher_url'), 'http://some.org')
        eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
        eq_(_get_extra_value('contact_name'), 'Point of Contact')
        eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')

        #  Lists (sorted so the comparison does not depend on order)
        eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en' , u'es'])
        eq_(sorted(_get_extra_value_as_list('theme')), [u'Earth Sciences',
                                                        u'http://eurovoc.europa.eu/100142',
                                                        u'http://eurovoc.europa.eu/209065'])
        eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2'])

        # Dataset URI
        eq_(_get_extra_value('uri'), u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98')

        # Resources
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]

        #  Simple values
        eq_(resource['name'], u'Some website')
        eq_(resource['description'], u'A longer description')
        eq_(resource['format'], u'HTML')
        eq_(resource['mimetype'], u'text/html')
        eq_(resource['issued'], u'2012-05-11')
        eq_(resource['modified'], u'2012-05-01T00:04:06')
        eq_(resource['status'], u'http://purl.org/adms/status/Completed')

        # These two are likely to need clarification
        eq_(resource['license'], u'http://creativecommons.org/licenses/by/3.0/')
        eq_(resource['rights'], u'Some statement about rights')

        eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
        # This resource must not expose a separate download_url key.
        assert 'download_url' not in resource

        # size is compared against an int, not a string
        eq_(resource['size'], 12323)

        # Distribution URI
        eq_(resource['uri'], u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1')