Example #1
    def graph_add_resources(self, dataset_uri, dataset_dict):

        g = self.g

        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_uri, DCAT.distribution, distribution))

            # As we don't allow direct download of the data, we need to add a
            # landing page to the dataset - see http://www.w3.org/TR/vocab-dcat/#example-landing-page
            g.add((dataset_uri, DCAT.landingPage, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('name', DC.title, None),
                ('description', DC.description, None),
                ('status', ADMS.status, None),
                ('rights', DC.rights, None),
                ('license', DC.license, None),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DC['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            g.set((distribution, DCAT.accessURL, distribution))

            # Dates
            items = [
                ('issued', DC.issued, None),
                ('modified', DC.modified, None),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution, items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
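
The (key, predicate, fallback_keys) tuples above are consumed by a helper on the profile class that is not shown on this page. A minimal sketch of what such a helper might look like, assuming the profile keeps its rdflib graph in self.g and emits plain literals:

    def _add_triples_from_dict(self, _dict, subject, items):
        # For each (key, predicate, fallback_keys) tuple, add one triple
        # when the dict carries a value under the key or a fallback key.
        for key, predicate, fallback_keys in items:
            value = _dict.get(key)
            if not value and fallback_keys:
                for fallback in fallback_keys:
                    value = _dict.get(fallback)
                    if value:
                        break
            if value:
                self.g.add((subject, predicate, Literal(value)))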
Example #2
    def _resources_graph(self, dataset_ref, dataset_dict):
        g = self.g
        for resource_dict in dataset_dict.get('resources', []):
            distribution = URIRef(resource_uri(resource_dict))
            g.add((dataset_ref, SCHEMA.distribution, distribution))
            g.add((distribution, RDF.type, SCHEMA.DataDownload))

            self._distribution_graph(distribution, resource_dict)
Example #3
    def test_distribution_fields(self):
        resource = {
            'id': 'c041c635-054f-4431-b647-f9186926d021',
            'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'CSV file',
            'description': 'A CSV file',
            'url': 'http://example.com/data/file.csv',
            'status': 'http://purl.org/adms/status/Completed',
            'rights': 'Some statement about rights',
            'license': 'http://creativecommons.org/licenses/by/3.0/',
            'issued': '2015-06-26T15:21:09.034694',
            'modified': '2015-06-26T15:21:09.075774',
            'size': 1234,
            'language': '[\"en\", \"es\", \"ca\"]',
        }

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'resources': [
                resource
            ]
        }

        s = RDFSerializer(profiles=['schemaorg'])
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(len([t for t in g.triples((dataset_ref, SCHEMA.distribution, None))]), 1)

        # URI
        distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2]
        eq_(unicode(distribution), utils.resource_uri(resource))

        # Basic fields
        assert self._triple(g, distribution, RDF.type, SCHEMA.DataDownload)
        assert self._triple(g, distribution, SCHEMA.name, resource['name'])
        assert self._triple(g, distribution, SCHEMA.description, resource['description'])
        assert self._triple(g, distribution, SCHEMA.license, resource['license'])

        # List
        for item in [
            ('language', SCHEMA.inLanguage),
        ]:
            values = json.loads(resource[item[0]])
            eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, distribution, item[1], value)

        # Dates
        assert self._triple(g, distribution, SCHEMA.datePublished, resource['issued'])
        assert self._triple(g, distribution, SCHEMA.dateModified, resource['modified'])

        # Numbers
        assert self._triple(g, distribution, SCHEMA.contentSize, resource['size'])
Example #5
    def test_distribution_fields(self):

        resource = {
            'id': 'c041c635-054f-4431-b647-f9186926d021',
            'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'CSV file',
            'description': 'A CSV file',
            'url': 'http://example.com/data/file.csv',
            'status': 'http://purl.org/adms/status/Completed',
            'rights': 'Some statement about rights',
            'license': 'http://creativecommons.org/licenses/by/3.0/',
            'issued': '2015-06-26T15:21:09.034694',
            'modified': '2015-06-26T15:21:09.075774',
            'size': 1234,
        }

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'resources': [
                resource
            ]
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

        # URI
        distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
        eq_(unicode(distribution), utils.resource_uri(resource))

        # Basic fields
        assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
        assert self._triple(g, distribution, DCT.title, resource['name'])
        assert self._triple(g, distribution, DCT.description, resource['description'])
        assert self._triple(g, distribution, DCT.rights, resource['rights'])
        assert self._triple(g, distribution, DCT.license, resource['license'])
        assert self._triple(g, distribution, ADMS.status, resource['status'])

        # Dates
        assert self._triple(g, distribution, DCT.issued, resource['issued'], XSD.dateTime)
        assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime)

        # Numbers
        assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal)
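
The assertions above rely on a `_triple` test helper inherited from a base class that is not shown here. A plausible sketch, assuming rdflib terms: it wraps plain values as (optionally typed) literals and returns the first matching triple, or None.

    def _triple(self, graph, subject, predicate, _object, data_type=None):
        # Plain Python values are wrapped as rdflib Literals; None acts
        # as a wildcard so callers can fetch whatever object is present.
        if _object is not None and not isinstance(_object, (URIRef, BNode)):
            _object = Literal(_object, datatype=data_type)
        triples = [t for t in graph.triples((subject, predicate, _object))]
        return triples[0] if triples else None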
Example #7
    def test_distributions(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'resources': [
                {
                    'id': 'c041c635-054f-4431-b647-f9186926d021',
                    'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                    'name': 'CSV file'
                },
                {
                    'id': '8bceeda9-0084-477f-aa33-dad6148900d5',
                    'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                    'name': 'XLS file'
                },
                {
                    'id': 'da73d939-0f11-45a1-9733-5de108383133',
                    'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                    'name': 'PDF file'
                },

            ]
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 3)

        for resource in dataset['resources']:
            distribution = self._triple(g,
                                        dataset_ref,
                                        DCAT.distribution,
                                        URIRef(utils.resource_uri(resource)))[2]

            assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
            assert self._triple(g, distribution, DCT.title, resource['name'])
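
The test matches each distribution by the URI returned from `utils.resource_uri`. In ckanext-dcat that URI is derived from the site URL plus the package and resource ids; a simplified sketch, assuming the `ckantoolkit` config shim and ignoring the `uri` extra a harvested resource may carry:

    from ckantoolkit import config

    def resource_uri(resource_dict):
        # Build a stable distribution URI from the CKAN site URL and ids.
        site_url = config.get('ckan.site_url', '').rstrip('/')
        return '{0}/dataset/{1}/resource/{2}'.format(
            site_url, resource_dict['package_id'], resource_dict['id'])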
Example #10
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('identifier', DCT.identifier, ['guid', 'id']),
            ('alternate_identifier', ADMS.identifier, None),
            ('title', DCT.title, None),
            ('notes', DCT.description, None),
            ('url', DCAT.landingPage, None),
            ('version', OWL.versionInfo, None),
            # ('accrual-periodicity', DCT.accrualPeriodicity, None),
            # ('temporal', DCT.temporal, None),
            # ('language', DCT.language, None),
            # ('dcat-category-id', DCAT.theme, None),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)


        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created']),
            ('modified', DCT.modified, ['metadata_modified']),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)


        # Publisher
        publisher_uri = self._get_dataset_value(dataset_dict, 'spc')
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
            rightsHolder_details = URIRef(publisher_uri)
        else:
            # No publisher_uri
            publisher_details = BNode()
            rightsHolder_details = BNode()
        
        g.add((publisher_details, RDF.type, FOAF.Organization))
        publisher_name = (dataset_dict.get('organization') or {}).get('title')
        if publisher_name:
            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        # DCAT-AP_IT new properties
        
        # subTheme
        #g.add((dataset_ref, DCATAPIT.subTheme, ))
        
        # rightsHolder
        g.add((rightsHolder_details, RDF.type, DCATAPIT.Agent))
        g.add((rightsHolder_details, DCT.identifier, Literal(publisher_uri)))
        g.add((rightsHolder_details, FOAF.name, Literal(publisher_name)))
        g.add((dataset_ref, DCT.rightsHolder, rightsHolder_details))

        # creator
        # g.add((dataset_ref, DCT.creator, ...))  # object elided in the original

        # Contact details
        if any([
            self._get_dataset_value(dataset_dict, 'contact_uri'),
            self._get_dataset_value(dataset_dict, 'contact_name'),
            self._get_dataset_value(dataset_dict, 'contact_email'),
            self._get_dataset_value(dataset_dict, 'maintainer'),
            self._get_dataset_value(dataset_dict, 'maintainer_email'),
            self._get_dataset_value(dataset_dict, 'author'),
            self._get_dataset_value(dataset_dict, 'author_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(contact_uri)
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            items = [
                ('contact_name', VCARD.fn, ['maintainer', 'author']),
                ('contact_email', VCARD.hasEmail, ['maintainer_email',
                                                   'author_email']),
            ]

            self._add_triples_from_dict(dataset_dict, contact_details, items)

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('name', DCT.title, None),
                ('description', DCT.description, None),
                ('status', ADMS.status, None),
                ('rights', DCT.rights, None),
                ('license', DCT.license, None),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, Literal(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, Literal(url)))

            # Dates
            items = [
                ('issued', DCT.issued, None),
                ('modified', DCT.modified, None),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution, items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
Example #11
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('title', DCT.title, None),
            ('notes', DCT.description, None),
            ('url', DCAT.landingPage, None),
            ('identifier', DCT.identifier, ['guid', 'id']),
            ('version', OWL.versionInfo, ['dcat_version']),
            ('alternate_identifier', ADMS.identifier, None),
            ('version_notes', ADMS.versionNotes, None),
            ('frequency', DCT.accrualPeriodicity, None),
            ('accrualPeriodicity', DCT.accrualPeriodicity, None)
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created']),
            ('modified', DCT.modified, ['metadata_modified']),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        #  Lists
        items = [
            ('language', DCT.language, None),
            ('theme-primary', DCAT.theme, None),
            ('theme-secondary', DCAT.theme, None),
            ('conforms-to', DCAT.conformsTo, None),
            ('lineage', DCT.provenance, None)
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Contact details
        if any([
            self._get_dataset_value(dataset_dict, 'contact_uri'),
            self._get_dataset_value(dataset_dict, 'contact_name'),
            self._get_dataset_value(dataset_dict, 'contact_email'),
            self._get_dataset_value(dataset_dict, 'maintainer'),
            self._get_dataset_value(dataset_dict, 'maintainer_email'),
            self._get_dataset_value(dataset_dict, 'author'),
            self._get_dataset_value(dataset_dict, 'author_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(contact_uri)
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            items = [
                ('contact_name', VCARD.fn, ['maintainer', 'author']),
                ('contact_email', VCARD.hasEmail, ['maintainer_email',
                                                   'author_email']),
            ]

            self._add_triples_from_dict(dataset_dict, contact_details, items)

        license_id = self._get_dataset_value(dataset_dict, 'license_id')
        if license_id == 'cc-by':
            g.add((dataset_ref, DCT.license, Literal('https://creativecommons.org/licenses/by/4.0/')))
        else:
            g.add((dataset_ref, DCT.license, Literal(license_id)))

        # Publisher
        if any([
            self._get_dataset_value(dataset_dict, 'publisher_uri'),
            self._get_dataset_value(dataset_dict, 'publisher_name'),
            dataset_dict.get('organization'),
        ]):

            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
            if publisher_uri:
                publisher_details = URIRef(publisher_uri)
            else:
                # No organization nor publisher_uri
                publisher_details = BNode()

            g.add((publisher_details, RDF.type, FOAF.Organization))
            g.add((dataset_ref, DCT.publisher, publisher_details))

            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
            if not publisher_name and dataset_dict.get('organization'):
                publisher_name = dataset_dict['organization']['title']

            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
            # TODO: It would make sense to fallback these to organization
            # fields but they are not in the default schema and the
            # `organization` object in the dataset_dict does not include
            # custom fields
            items = [
                ('publisher_email', FOAF.mbox, None),
                ('publisher_url', FOAF.homepage, None),
                ('publisher_type', DCT.type, None),
            ]

            self._add_triples_from_dict(dataset_dict, publisher_details, items)

        # Update Frequency
        # accrualPeriodicity
        update_freq = self._get_dataset_value(dataset_dict, 'update_frequency')
        if update_freq:
            has_uri = False
            # check if there exists a URI for the update_frequency value
            from ckanext.dgu.forms.dataset_form import update_frequency_uri
            for freq_name, freq_uri in update_frequency_uri:
                if freq_name.lower() == update_freq.lower():
                    has_uri = True
                    break

            g.add((dataset_ref, DCT.accrualPeriodicity, URIRef(freq_uri) if has_uri else Literal(update_freq)))

        # Temporal
        start = self._get_dataset_value(dataset_dict, 'temporal_start')
        end = self._get_dataset_value(dataset_dict, 'temporal_end')
        if start or end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Spatial
        spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
        spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
        spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

        if spatial_uri or spatial_text or spatial_geom:
            if spatial_uri:
                spatial_ref = URIRef(spatial_uri)
            else:
                spatial_ref = BNode()

            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            if spatial_text:
                g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

            if spatial_geom:
                # GeoJSON
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(spatial_geom, datatype=GEOJSON_IMT)))
                # WKT, because GeoDCAT-AP says so
                try:
                    g.add((spatial_ref,
                           LOCN.geometry,
                           Literal(wkt.dumps(json.loads(spatial_geom),
                                             decimals=4),
                                   datatype=GSP.wktLiteral)))
                except (TypeError, ValueError, InvalidGeoJSONException):
                    pass

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('name', DCT.title, None),
                ('description', DCT.description, None),
                ('status', ADMS.status, None),
                ('rights', DCT.rights, None),
                ('license', DCT.license, None),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            # Format
            _format = resource_dict.get('format')
            if _format:
                if '/' in _format:
                    # add dct:format
                    dctFormat = _format.strip().replace("/", ".").replace(" ", "").lower()
                    g.add((distribution, DCT['format'], Literal(dctFormat)))
                else:
                    g.add((distribution, DCT['format'], Literal(_format.lower())))
                # add dcat:mediaType
                fmt = formats.Formats.match(_format.strip().lower())
                mime_types = fmt['mime_types'] if fmt else None
                if mime_types:
                    g.add((distribution, DCAT.mediaType, Literal(mime_types)))
                
            license_id = self._get_dataset_value(dataset_dict, 'license_id')
            if license_id == 'cc-by':
                g.add((distribution, DCT.license, Literal('https://creativecommons.org/licenses/by/4.0/')))
            else:
                g.add((distribution, DCT.license, Literal(license_id)))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, Literal(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, Literal(url)))

            # Dates
            items = [
                ('issued', DCT.issued, None),
                ('modified', DCT.modified, None),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution, items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
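
The spatial block above stores the geometry twice: once as GeoJSON and once as WKT, which `wkt.dumps` from the geomet package produces. A standalone illustration of that conversion with a made-up point:

    import json
    from geomet import wkt

    # A GeoJSON geometry as it would sit in the dataset's 'spatial' extra
    spatial_geom = '{"type": "Point", "coordinates": [2.170064, 41.387917]}'

    # The same geometry as WKT, coordinates rounded to 4 decimals
    print(wkt.dumps(json.loads(spatial_geom), decimals=4))
    # POINT (2.1701 41.3879)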
Example #12
def export_resource_to_rdf(resource_dict, dataset_dict, _format='xml'):
    """Export the resource in RDF format.

    Builds an RDF Graph containing only the selected resource and exports it to the
    selected format (default ``xml``).

    :param dict resource_dict: resource metadata.
    :param dict dataset_dict: dataset metadata.
    :param str _format: export format. Default is ``xml``.

    :returns: the serialized RDF graph of the resource.
    :rtype: str
    """
    g = Graph()

    distribution = URIRef(resource_uri(resource_dict))

    g.add((distribution, RDF.type, DCAT.Distribution))

    if 'license' not in resource_dict and 'license_id' in dataset_dict:
        lr = LicenseRegister()
        _license = lr.get(dataset_dict['license_id'])
        if _license:
            resource_dict['license'] = _license.url

    #  Simple values
    items = [
        ('name', DCT.title, None, Literal),
        ('description', DCT.description, None, Literal),
        ('status', ADMS.status, None, Literal),
        ('rights', DCT.rights, None, Literal),
        ('license', DCT.license, None, URIRef),
    ]

    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    #  Lists
    items = [
        ('documentation', FOAF.page, None, URIRef),
        ('language', DCT.language, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, URIRef),
    ]
    # self._add_list_triples_from_dict(resource_dict, distribution, items)
    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            if isinstance(value, list):
                for val in value:
                    g.add((distribution, rdf_prop, rdf_type(val)))
            else:
                g.add((distribution, rdf_prop, rdf_type(value)))

    # Format
    if '/' in resource_dict.get('format', ''):
        g.add((distribution, DCAT.mediaType, Literal(resource_dict['format'])))
    else:
        if resource_dict.get('format'):
            g.add((distribution, DCT['format'],
                   Literal(resource_dict['format'])))

        if resource_dict.get('mimetype'):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['mimetype'])))

    # URL
    url = resource_dict.get('url')
    download_url = resource_dict.get('download_url')
    if download_url:
        g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
    if (url and not download_url) or (url and url != download_url):
        g.add((distribution, DCAT.accessURL, URIRef(url)))

    # Dates
    items = [
        ('issued', DCT.issued, None, Literal),
        ('modified', DCT.modified, None, Literal),
    ]

    #self._add_date_triples_from_dict(resource_dict, distribution, items)
    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    # Numbers
    if resource_dict.get('size'):
        try:
            g.add((distribution, DCAT.byteSize,
                   Literal(float(resource_dict['size']),
                           datatype=XSD.decimal)))
        except (ValueError, TypeError):
            g.add(
                (distribution, DCAT.byteSize, Literal(resource_dict['size'])))
    # Checksum
    if resource_dict.get('hash'):
        checksum = BNode()
        g.add((checksum, SPDX.checksumValue,
               Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

        if resource_dict.get('hash_algorithm'):
            if resource_dict['hash_algorithm'].startswith('http'):
                g.add((checksum, SPDX.algorithm,
                       URIRef(resource_dict['hash_algorithm'])))
            else:
                g.add((checksum, SPDX.algorithm,
                       Literal(resource_dict['hash_algorithm'])))
        g.add((distribution, SPDX.checksum, checksum))

    return g.serialize(format=_format)
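
A short usage sketch for the exporter above; the dicts are illustrative, and the call assumes a CKAN context where LicenseRegister can resolve the license id:

    resource = {
        'id': 'c041c635-054f-4431-b647-f9186926d021',
        'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'CSV file',
        'url': 'http://example.com/data/file.csv',
        'size': 1234,
    }
    dataset = {'license_id': 'cc-by'}

    # Serialize the single distribution as Turtle instead of the default XML
    print(export_resource_to_rdf(resource, dataset, _format='turtle'))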
Example #13
  def graph_from_dataset(self, dataset_dict, dataset_ref):

    log.debug("ODMDCATBasicProfileDataset graph_from_dataset")

    g = self.g

    namespaces = odm_rdf_helper.get_namespaces_by_dataset_type(dataset_dict['type'])

    for prefix, namespace in namespaces.iteritems():
      g.bind(prefix, namespace)

    g.add((dataset_ref, DCT.identifier, Literal(dataset_dict.get('id'))))
    g.add((dataset_ref, DCT.type, Literal(dataset_dict.get('type', 'dataset'))))
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    items = [
      (dataset_ref, DCT.title, dataset_dict.get('title_translated') or dataset_dict.get('title')),
      (dataset_ref, DCT.description, dataset_dict.get('notes_translated') or dataset_dict.get('notes'))
    ]

    raw_triples = odm_rdf_helper.get_triples_by_dataset_type(dataset_ref,dataset_dict,dataset_dict['type'])
    raw_triples.extend(items)

    for raw_triple in raw_triples:
      triples = odm_rdf_helper.split_multilingual_object_into_triples(raw_triple)
      for triple in triples:
        g.add(triple)

    #Organization
    organization = dataset_dict.get('organization')
    g.add((dataset_ref, FOAF.organization, URIRef(config.get('ckan.site_url') + "organization/" + organization['name'])))

    #license
    license = URIRef(dataset_dict.get('license_url'))
    g.add((license, DCT.title, Literal(dataset_dict.get('license_title'))))
    g.add((dataset_ref, DCT.license, license))

    # odm_spatial_range
    for item in dataset_dict.get('odm_spatial_range', []):
      iso3_code = odm_rdf_helper.map_country_code_iso2_iso3(item.upper())
      g.add((dataset_ref, GN.countrycode, URIRef("http://data.landportal.info/geo/" + iso3_code)))

    #taxonomy
    for term in dataset_dict.get('taxonomy', []):
      matches = odm_rdf_helper.map_internal_to_standard_taxonomic_term(term)

      if isinstance(matches,basestring):
        g.add((dataset_ref, FOAF.topic, Literal(matches)))
      else:
        node = BNode()
        if 'exact_match' in matches:
          node = URIRef(matches['exact_match'])
        if 'broad_matches' in matches:
          for broad_match in matches['broad_matches']:
            g.add((node,SKOS.broadMatch, URIRef(broad_match)))
            g.add((node,DCT.title, Literal(term)))

        g.add((dataset_ref, FOAF.topic, node))

    #  Language
    for item in dataset_dict.get('odm_language', []):
      g.add((dataset_ref, DC.language, Literal(item.upper())))

    # Dates
    try:
      items = odm_rdf_helper.get_date_fields_by_dataset_type(dataset_dict['type'])
      self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)
    except ValueError:
      log.debug("Error adding date triples for dataset "  + dataset_dict['id'])

    # Resources
    for resource_dict in dataset_dict.get('resources', []):

      distribution = URIRef(resource_uri(resource_dict))
      g.add((dataset_ref, DCAT.distribution, distribution))
      g.add((distribution, RDF.type, DCAT.Distribution))

      items = [
        (distribution, DCT.title, resource_dict.get('name_translated') or resource_dict.get('name')),
        (distribution, DCT.description, resource_dict.get('description_translated') or resource_dict.get('description'))
      ]
      for item in items:
        triples = odm_rdf_helper.split_multilingual_object_into_triples(item)
        for triple in triples:
          g.add(triple)

      try:
        self._add_triples_from_dict(resource_dict, distribution, items)
      except ValueError:
        log.debug("Error adding triples for dataset "  + dataset_dict['id'])


      #  Language
      for item in resource_dict.get('odm_language', []):
        g.add((distribution, DC.language, Literal(item.upper())))

      # Format
      if '/' in resource_dict.get('format', ''):
        g.add((distribution, DCAT.mediaType,
               Literal(resource_dict['format'])))
      else:
        if resource_dict.get('format'):
          g.add((distribution, DCT['format'],
                 Literal(resource_dict['format'])))

        if resource_dict.get('mimetype'):
          g.add((distribution, DCAT.mediaType,
                 Literal(resource_dict['mimetype'])))

      # URL
      url = resource_dict.get('url')
      download_url = resource_dict.get('download_url')
      if download_url:
        g.add((distribution, DCAT.downloadURL, Literal(download_url)))
      if (url and not download_url) or (url and url != download_url):
        g.add((distribution, DCAT.accessURL, URIRef(url)))
Example #14
    def parse_dataset(self, dataset_dict, dataset_ref):
        """ Transforms DCAT-AP.de-Data to CKAN-Dictionary """
        # Manage different versions of DCATDE namespaces first.
        # Ensure that they are ordered from oldest to newest version, such that older values get overwritten
        # in case of multiple definitions
        dcatde_versions = [DCATDE_1_0, DCATDE]

        # geocodingText and legalbasisText got renamed, so handle them separately
        for key, predicate in (
            ('legalbasisText', DCATDE_1_0.legalbasisText),
            ('geocodingText', DCATDE_1_0.geocodingText),
            ('legalbasisText', DCATDE.legalBasis),
            ('geocodingText', DCATDE.geocodingDescription),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                ds_utils.set_extras_field(dataset_dict, key,
                                          json.dumps(values))

        # iterate over all namespaces to import as much as possible
        for dcatde_namespace in dcatde_versions:
            # Simple additional fields
            for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI',
                 dcatde_namespace.politicalGeocodingLevelURI),
            ):
                value = self._object_value(dataset_ref, predicate)
                if value:
                    ds_utils.set_extras_field(dataset_dict, key, value)

            # List fields
            for key, predicate in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI',
                 dcatde_namespace.politicalGeocodingURI),
            ):
                values = self._object_value_list(dataset_ref, predicate)
                if values:
                    ds_utils.set_extras_field(dataset_dict, key,
                                              json.dumps(values))

            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.originator, 'originator',
                                True)
            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.maintainer, 'maintainer',
                                False)

            # Add additional distribution fields
            for distribution in self.g.objects(dataset_ref, DCAT.distribution):
                for resource_dict in dataset_dict.get('resources', []):
                    # Match distribution in graph and distribution in ckan-dict
                    if unicode(distribution) == resource_uri(resource_dict):
                        for key, predicate in (
                            ('licenseAttributionByText',
                             dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability',
                             dcatde_namespace.plannedAvailability)):
                            value = self._object_value(distribution, predicate)
                            if value:
                                ds_utils.insert_resource_extra(
                                    resource_dict, key, value)
        # -- end loop over dcatde namespaces --

        # additions in other namespaces than DCATDE
        self._parse_contact(dataset_dict, dataset_ref, DCT.contributor,
                            'contributor', True)
        self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author',
                            False)

        # dcat:landingPage
        landing_page = self._object_value(dataset_ref, DCAT.landingPage)
        if landing_page:
            ds_utils.set_extras_field(dataset_dict, 'metadata_original_html',
                                      landing_page)

        # dcat:contactPoint
        # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
        contact = self._object(dataset_ref, DCAT.contactPoint)
        self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

        contact_tel = self._object_value(contact, VCARD.hasTelephone)
        if contact_tel:
            ds_utils.insert(dataset_dict, 'maintainer_tel',
                            self._without_tel(contact_tel), True)

        self._add_maintainer_field(dataset_dict, contact, 'street',
                                   VCARD.hasStreetAddress)
        self._add_maintainer_field(dataset_dict, contact, 'city',
                                   VCARD.hasLocality)
        self._add_maintainer_field(dataset_dict, contact, 'zip',
                                   VCARD.hasPostalCode)
        self._add_maintainer_field(dataset_dict, contact, 'country',
                                   VCARD.hasCountryName)

        # Groups
        groups = self._get_dataset_value(dataset_dict, 'groups')

        if not groups:
            groups = []

        for obj in self.g.objects(dataset_ref, DCAT.theme):
            current_theme = unicode(obj)

            if current_theme.startswith(dcat_theme_prefix):
                group = current_theme.replace(dcat_theme_prefix, '').lower()
                groups.append({'id': group, 'name': group})

        dataset_dict['groups'] = groups

        return dataset_dict
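
A parsing profile like this one is normally driven through ckanext-dcat's RDFParser. A hedged sketch of that flow; the profile name 'dcatap_de' and the file name are assumptions:

    from ckanext.dcat.processors import RDFParser

    parser = RDFParser(profiles=['dcatap_de'])  # profile name is an assumption
    with open('catalog.rdf', 'rb') as f:
        parser.parse(f.read(), _format='xml')

    for dataset_dict in parser.datasets():
        print(dataset_dict.get('title'))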
Example #15
            'notes': (dataset_ref, DCT.description),
            'publisher_name': (publisher_ref, FOAF.name),
        }
        if holder_use_dataset and holder_ref:
            loc_package_mapping['holder_name'] = (holder_ref, FOAF.name)

        self._add_multilang_values(loc_dict, loc_package_mapping)
        if not holder_use_dataset and holder_ref:
            loc_dict = interfaces.get_for_group_or_organization(org_dict['id'])
            loc_package_mapping = {'name': (holder_ref, FOAF.name)}
            self._add_multilang_values(loc_dict, loc_package_mapping)

        ### Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(
                resource_dict))  # TODO: preserve original info if harvested

            # Add the DCATAPIT type
            g.add((distribution, RDF.type, DCATAPIT.Distribution))

            ### format
            self._remove_node(resource_dict, distribution,
                              ('format', DCT['format'], None, Literal))
            if not self._add_uri_node(resource_dict, distribution,
                                      ('distribution_format', DCT['format'],
                                       None, URIRef), FORMAT_BASE_URI):
                guessed_format = guess_format(resource_dict)
                if guessed_format:
                    self.g.add((distribution, DCT['format'],
                                URIRef(FORMAT_BASE_URI + guessed_format)))
                else:
Example #16
  def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g
    dist_additons = {}

    for prefix, namespace in namespaces.items():
      g.bind(prefix, namespace)
    
    # dcat:contactPoint
    for contactPoint_ref in g.objects(dataset_ref, DCAT.contactPoint):
      for email in g.objects(contactPoint_ref, VCARD.hasEmail):
        g.remove((contactPoint_ref, VCARD.hasEmail, Literal(email)))
        g.add((contactPoint_ref, VCARD.hasEmail, URIRef('mailto:' + email)))
    
    # dcat:theme
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group in groups:
      mdrtheme_groups = self.category_mapping[group['name']]
      if mdrtheme_groups:
        for mdrtheme_group in mdrtheme_groups:
          g.add((dataset_ref, DCAT.theme, URIRef(MDRTHEME + mdrtheme_group)))
          
    # dcatde:contributorID
    contributor_id = config.get('ckanext.hro_dcatapde.contributorid')
    if contributor_id:
      g.add((dataset_ref, DCATDE.contributorID, URIRef('http://dcat-ap.de/def/contributors/' + contributor_id)))

    # dcatde:geocodingDescription
    # dcatde:politicalGeocodingLevelURI
    # dcatde:politicalGeocodingURI
    # dct:spatial
    geocoding = self._get_dataset_value(dataset_dict, 'spatial')
    if geocoding:
      for spatial_ref in g.objects(dataset_ref, DCT.spatial):
        g.remove((spatial_ref, LOCN.geometry, Literal(geocoding, datatype = GEOJSON)))
        if 'multipolygon' in geocoding:
          geocoding = geocoding.replace('multipolygon', 'MultiPolygon')
        elif 'polygon' in geocoding:
          geocoding = geocoding.replace('polygon', 'Polygon')
        g.add((spatial_ref, LOCN.geometry, Literal(geocoding, datatype = GEOJSON)))
    geocoding_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    if geocoding_text:
      for spatial_ref in g.objects(dataset_ref, DCT.spatial):
        g.remove((spatial_ref, SKOS.prefLabel, Literal(geocoding_text)))
      g.add((dataset_ref, DCATDE.geocodingDescription, Literal(geocoding_text)))
      if geocoding_text in self.geocoding_mapping:
        geocoding_object = self.geocoding_mapping[geocoding_text]
        if 'politicalGeocodingLevelURI' in geocoding_object:
          g.add((dataset_ref, DCATDE.politicalGeocodingLevelURI, URIRef(geocoding_object['politicalGeocodingLevelURI'])))
        if 'politicalGeocodingURI' in geocoding_object:
          g.add((dataset_ref, DCATDE.politicalGeocodingURI, URIRef(geocoding_object['politicalGeocodingURI'])))

    # dcatde:maintainer
    maintainer = self._get_dataset_value(dataset_dict, 'maintainer')
    maintainer_email = self._get_dataset_value(dataset_dict, 'maintainer_email')
    if maintainer or maintainer_email:
      maintainer_details = BNode()
      g.add((maintainer_details, RDF.type, FOAF.Organization))
      g.add((dataset_ref, DCATDE.maintainer, maintainer_details))
      if maintainer:
        g.add((maintainer_details, FOAF.name, Literal(maintainer)))
      if maintainer_email:
        g.add((maintainer_details, FOAF.mbox, Literal(maintainer_email)))

    # dct:accessRights
    g.add((dataset_ref, DCT.accessRights, Literal('public')))

    # dct:conformsTo
    g.add((dataset_ref, DCT.conformsTo, URIRef(DCATDE)))

    # dct:creator
    creator = self._get_dataset_value(dataset_dict, 'author')
    creator_email = self._get_dataset_value(dataset_dict, 'author_email')
    if creator or creator_email:
      creator_details = BNode()
      g.add((creator_details, RDF.type, FOAF.Organization))
      g.add((dataset_ref, DCT.creator, creator_details))
      if creator:
        g.add((creator_details, FOAF.name, Literal(creator)))
      if creator_email:
        g.add((creator_details, FOAF.mbox, Literal(creator_email)))
    
    # dct:language
    language = config.get('ckan.locale_default', 'en')
    if language in self.language_mapping:
      mdrlang_language = self.language_mapping[language]
      g.add((dataset_ref, DCT.language, Literal(getattr(MDRLANG, mdrlang_language))))

    # dct:temporal
    start_date = self._get_dataset_value(dataset_dict, 'temporal_coverage_from')
    end_date = self._get_dataset_value(dataset_dict, 'temporal_coverage_to')
    if start_date or end_date:
      temporal_extent = BNode()
      g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
      if start_date:
        self._add_date_triple(temporal_extent, DCAT.startDate, start_date)
      if end_date:
        self._add_date_triple(temporal_extent, DCAT.endDate, end_date)
      g.add((dataset_ref, DCT.temporal, temporal_extent))

    # attribution for resources (distributions) enhancement
    terms_of_use = self._get_dataset_value(dataset_dict, 'terms_of_use')
    terms_of_use = json.loads(terms_of_use) if terms_of_use else None
    if terms_of_use:
      if 'attribution_text' in terms_of_use:
        dist_additons['attribution_text'] = terms_of_use['attribution_text'].encode('utf-8')

    # license maping for resources (distributions) enhancement
    license_id = self._get_dataset_value(dataset_dict, 'license_id')
    if license_id in self.license_mapping:
      dist_additons['license_id'] = self.license_mapping[license_id]['dcatde-id']

    # resources (distributions) enhancement
    for resource_dict in dataset_dict.get('resources', []):
      for distribution in g.objects(dataset_ref, DCAT.distribution):
        if str(distribution) == resource_uri(resource_dict):
          self.enhance_resource(g, distribution, resource_dict, dist_additons)
Example #17
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('title', DCT.title, None, Literal),
            ('notes', DCT.description, None, Literal),
            ('url', DCAT.landingPage, None, URIRef),
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, URIRef),
            ('access_rights', DCT.accessRights, None, Literal),
            ('dcat_type', DCT.type, None, Literal),
            ('provenance', DCT.provenance, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        #  Lists
        items = [
            ('language', DCT.language, None, Literal),
            ('theme', DCAT.theme, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, Literal),
            ('alternate_identifier', ADMS.identifier, None, Literal),
            ('documentation', FOAF.page, None, URIRef),
            ('related_resource', DCT.relation, None, URIRef),
            ('has_version', DCT.hasVersion, None, URIRef),
            ('is_version_of', DCT.isVersionOf, None, URIRef),
            ('source', DCT.source, None, Literal),
            ('sample', ADMS.sample, None, Literal),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Contact details
        if any([
                self._get_dataset_value(dataset_dict, 'contact_uri'),
                self._get_dataset_value(dataset_dict, 'contact_name'),
                self._get_dataset_value(dataset_dict, 'contact_email'),
                self._get_dataset_value(dataset_dict, 'maintainer'),
                self._get_dataset_value(dataset_dict, 'maintainer_email'),
                self._get_dataset_value(dataset_dict, 'author'),
                self._get_dataset_value(dataset_dict, 'author_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(self._removeWhitespaces(contact_uri))
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            self._add_triple_from_dict(dataset_dict, contact_details, VCARD.fn,
                                       'contact_name',
                                       ['maintainer', 'author'])
            # Add mail address as URIRef, and ensure it has a mailto: prefix
            self._add_triple_from_dict(dataset_dict,
                                       contact_details,
                                       VCARD.hasEmail,
                                       'contact_email',
                                       ['maintainer_email', 'author_email'],
                                       _type=URIRef,
                                       value_modifier=self._add_mailto)

        # Publisher
        if any([
                self._get_dataset_value(dataset_dict, 'publisher_uri'),
                self._get_dataset_value(dataset_dict, 'publisher_name'),
                dataset_dict.get('organization'),
        ]):

            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
            if publisher_uri:
                publisher_details = URIRef(
                    self._removeWhitespaces(publisher_uri))
            else:
                # No organization nor publisher_uri
                publisher_details = BNode()

            g.add((publisher_details, RDF.type, FOAF.Organization))
            g.add((dataset_ref, DCT.publisher, publisher_details))

            publisher_name = self._get_dataset_value(dataset_dict,
                                                     'publisher_name')
            if not publisher_name and dataset_dict.get('organization'):
                publisher_name = dataset_dict['organization']['title']

            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
            # TODO: It would make sense to fallback these to organization
            # fields but they are not in the default schema and the
            # `organization` object in the dataset_dict does not include
            # custom fields
            items = [
                ('publisher_email', FOAF.mbox, None, Literal),
                ('publisher_url', FOAF.homepage, None, URIRef),
                ('publisher_type', DCT.type, None, URIRef),
            ]

            self._add_triples_from_dict(dataset_dict, publisher_details, items)

        # Temporal
        start = self._get_dataset_value(dataset_dict, 'temporal_start')
        end = self._get_dataset_value(dataset_dict, 'temporal_end')
        if start or end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Spatial
        spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
        spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
        spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

        if spatial_uri or spatial_text or spatial_geom:
            if spatial_uri:
                spatial_ref = URIRef(self._removeWhitespaces(spatial_uri))
            else:
                spatial_ref = BNode()

            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            if spatial_text:
                g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

            if spatial_geom:
                # GeoJSON
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial_geom, datatype=GEOJSON_IMT)))
                # WKT, because GeoDCAT-AP says so
                try:
                    g.add((spatial_ref, LOCN.geometry,
                           Literal(wkt.dumps(json.loads(spatial_geom),
                                             decimals=4),
                                   datatype=GSP.wktLiteral)))
                except (TypeError, ValueError, InvalidGeoJSONException):
                    pass

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(
                self._removeWhitespaces(resource_uri(resource_dict)))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [('name', DCT.title, None, Literal),
                     ('description', DCT.description, None, Literal),
                     ('status', ADMS.status, None, URIRef),
                     ('rights', DCT.rights, None, URIRef),
                     ('license', DCT.license, None, URIRef),
                     ('access_url', DCAT.accessURL, None, URIRef),
                     ('download_url', DCAT.downloadURL, None, URIRef)]

            self._add_triples_from_dict(resource_dict, distribution, items)

            #  Lists
            items = [
                ('documentation', FOAF.page, None, URIRef),
                ('language', DCT.language, None, Literal),
                ('conforms_to', DCT.conformsTo, None, Literal),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # Format
            mimetype = resource_dict.get('mimetype')
            fmt = resource_dict.get('format')

            # IANA media types (either URI or Literal) should be mapped as mediaType.
            # If format is set and mimetype is missing or identical to it,
            # decide which of the two properties is appropriate.
            if fmt and (not mimetype or mimetype == fmt):
                if ('iana.org/assignments/media-types' in fmt
                        or not fmt.startswith('http') and '/' in fmt):
                    # output format value as dcat:mediaType instead of dct:format
                    mimetype = fmt
                    fmt = None
                else:
                    # Use dct:format
                    mimetype = None

            if mimetype:
                if mimetype.startswith('http'):
                    g.add((distribution, DCAT.mediaType,
                           URIRef(self._removeWhitespaces(mimetype))))
                else:
                    g.add((distribution, DCAT.mediaType, Literal(mimetype)))

            if fmt:
                if fmt.startswith('http'):
                    g.add((distribution, DCT['format'],
                           URIRef(self._removeWhitespaces(fmt))))
                else:
                    g.add((distribution, DCT['format'], Literal(fmt)))

            # URL fallback and old behavior
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            access_url = resource_dict.get('access_url')
            # Fall back to url for dcat:accessURL when access_url is not set
            # and url does not merely duplicate download_url
            if (url and ((not (access_url or download_url)) or
                         ((not access_url) and
                          (download_url and url != download_url)))):
                self._add_triple_from_dict(resource_dict,
                                           distribution,
                                           DCAT.accessURL,
                                           'url',
                                           _type=URIRef)

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
            # Checksum
            if resource_dict.get('hash'):
                checksum = BNode()
                g.add((checksum, RDF.type, SPDX.Checksum))
                g.add((checksum, SPDX.checksumValue,
                       Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

                if resource_dict.get('hash_algorithm'):
                    if resource_dict['hash_algorithm'].startswith('http'):
                        g.add((checksum, SPDX.algorithm,
                               URIRef(
                                   self._removeWhitespaces(
                                       resource_dict['hash_algorithm']))))
                    else:
                        g.add((checksum, SPDX.algorithm,
                               Literal(resource_dict['hash_algorithm'])))
                g.add((distribution, SPDX.checksum, checksum))
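
The subtle part of the example above is the format/mediaType branching: a value containing an IANA media-type URI, or a bare "type/subtype" string, is emitted as dcat:mediaType, while everything else stays dct:format. A minimal sketch of the same decision rule as a pure function (the helper name and return convention are illustrative, not part of ckanext-dcat):

def split_format_and_mediatype(fmt, mimetype):
    """Return a (mediatype, format) pair following the rule above.

    The rule only applies when format is set and mimetype is missing
    or identical to it; otherwise both values pass through unchanged.
    """
    if fmt and (not mimetype or mimetype == fmt):
        if ('iana.org/assignments/media-types' in fmt
                or (not fmt.startswith('http') and '/' in fmt)):
            return fmt, None   # treat the format value as a media type
        return None, fmt       # keep it as a plain dct:format label
    return mimetype, fmt

# split_format_and_mediatype('text/csv', None) -> ('text/csv', None)
# split_format_and_mediatype('CSV', None)      -> (None, 'CSV')
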
Example #18
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, Literal),
            ('access_rights', DCT.accessRights, None, Literal),
            ('dcat_type', DCT.type, None, Literal),
            ('provenance', DCT.provenance, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        self._add_multilang_value(dataset_ref, DCT.description, 'description', dataset_dict) # noqa
        self._add_multilang_value(dataset_ref, DCT.title, 'title', dataset_dict) # noqa

        # LandingPage
        g.add((dataset_ref, DCAT.landingPage,
               Literal(dataset_dict['url'])))

        self._add_multilang_value(dataset_ref, DCAT.keyword, 'keywords', dataset_dict) # noqa

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        # Update Interval
        accrual_periodicity = dataset_dict.get('accrual_periodicity')
        if accrual_periodicity:
            g.add((
                dataset_ref,
                DCT.accrualPeriodicity,
                URIRef(accrual_periodicity)
            ))

        # Lists
        items = [
            ('language', DCT.language, None, Literal),
            ('theme', DCAT.theme, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, Literal),
            ('alternate_identifier', ADMS.identifier, None, Literal),
            ('documentation', FOAF.page, None, Literal),
            ('has_version', DCT.hasVersion, None, Literal),
            ('is_version_of', DCT.isVersionOf, None, Literal),
            ('source', DCT.source, None, Literal),
            ('sample', ADMS.sample, None, Literal),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Relations
        if dataset_dict.get('relations'):
            relations = dataset_dict.get('relations')
            for relation in relations:
                relation_name = relation['label']
                relation_url = relation['url']

                relation = URIRef(relation_url)
                g.add((relation, RDFS.label, Literal(relation_name)))
                g.add((dataset_ref, DCT.relation, relation))

        # References
        if dataset_dict.get('see_alsos'):
            references = dataset_dict.get('see_alsos')
            for reference in references:
                reference_identifier = reference['dataset_identifier']
                g.add((dataset_ref, RDFS.seeAlso, Literal(reference_identifier))) # noqa

        # Contact details
        if dataset_dict.get('contact_points'):
            contact_points = self._get_dataset_value(dataset_dict, 'contact_points')  # noqa
            for contact_point in contact_points:
                contact_details = BNode()
                contact_point_email = contact_point['email']
                contact_point_name = contact_point['name']

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email))) # noqa
                g.add((contact_details, VCARD.fn, Literal(contact_point_name)))

                g.add((dataset_ref, DCAT.contactPoint, contact_details))

        # Publisher
        if dataset_dict.get('publishers'):
            publishers = dataset_dict.get('publishers')
            for publisher in publishers:
                publisher_name = publisher['label']

                publisher_details = BNode()
                g.add((publisher_details, RDF.type, RDF.Description))
                g.add((publisher_details, RDFS.label, Literal(publisher_name)))
                g.add((dataset_ref, DCT.publisher, publisher_details))

        # Temporals
        temporals = dataset_dict.get('temporals')
        if temporals:
            for temporal in temporals:
                start = temporal['start_date']
                end = temporal['end_date']
                if start or end:
                    temporal_extent = BNode()
                    g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                    if start:
                        self._add_date_triple(temporal_extent, SCHEMA.startDate, start)  # noqa
                    if end:
                        self._add_date_triple(temporal_extent, SCHEMA.endDate, end)  # noqa
                    g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Themes
        groups = self._get_dataset_value(dataset_dict, 'groups')
        for group_name in groups:
            g.add((
                dataset_ref,
                DCAT.theme,
                URIRef(ogd_theme_base_url + group_name.get('name'))
            ))

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('status', ADMS.status, None, Literal),
                ('rights', DCT.rights, None, Literal),
                ('license', DCT.license, None, Literal),
                ('identifier', DCT.identifier, None, Literal),
                ('media_type', DCAT.mediaType, None, Literal),
                ('spatial', DCT.spatial, None, Literal),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            self._add_multilang_value(distribution, DCT.title, 'display_name', resource_dict) # noqa
            self._add_multilang_value(distribution, DCT.description, 'description', resource_dict) # noqa

            #  Lists
            items = [
                ('documentation', FOAF.page, None, Literal),
                ('language', DCT.language, None, Literal),
                ('conforms_to', DCT.conformsTo, None, Literal),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
                g.add((distribution, DCAT.accessURL, URIRef(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, URIRef(url)))

            # Format derived from the download URL
            if download_url:
                url_parts = str(download_url).rsplit('.', 1)
                # guard against URLs without a file extension
                if len(url_parts) == 2:
                    mapped_format = map_to_valid_format(url_parts[1])
                    g.add((distribution, DCT['format'],
                           Literal(mapped_format)))

            # Mime-Type
            if resource_dict.get('mimetype'):
                g.add((
                    distribution,
                    DCAT.mediaType,
                    Literal(resource_dict['mimetype'])
                ))

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # Numbers
            if resource_dict.get('byte_size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['byte_size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['byte_size'])))
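
A pattern that recurs across these profiles: emit dcat:byteSize as a typed xsd:decimal where the value is numeric, and fall back to a plain literal otherwise. A self-contained sketch with rdflib (the helper name is illustrative):

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import Namespace, XSD

DCAT = Namespace('http://www.w3.org/ns/dcat#')

def add_byte_size(g, distribution, size):
    """Add dcat:byteSize, typed as xsd:decimal when possible."""
    try:
        g.add((distribution, DCAT.byteSize,
               Literal(float(size), datatype=XSD.decimal)))
    except (ValueError, TypeError):
        # non-numeric sizes are kept as plain literals
        g.add((distribution, DCAT.byteSize, Literal(size)))

g = Graph()
dist = URIRef('http://example.com/dataset/1/resource/1')
add_byte_size(g, dist, '1234')  # -> "1234.0"^^xsd:decimal
add_byte_size(g, dist, 'n/a')   # -> plain literal fallback
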
Example #19
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        log.debug("dataset: {}".format(dataset_dict['name']))
        g = self.g

        dist_additions = {}

        # bind namespaces to get readable prefixes in the RDF document
        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        # TEMPORARY: fix whitespace in 'url':
        url = dataset_dict['url']
        if url:
            g.remove((dataset_ref, DCAT.landingPage, URIRef(url)))
            url = url.replace(" ", "+")
            g.add((dataset_ref, DCAT.landingPage, URIRef(url)))

        # Nr. 40 - Contributor
        contributorId = pylons.config.get('ckanext.dcatde.contributorid')
        if contributorId:
            g.add((dataset_ref, DCATDE.contributorID,
                   URIRef("{}{}".format(DCATDE_CONTRIBUTORS, contributorId))))

        # Nr. 41 - Contact Point
        # If a maintainer name is given, set this to be the name of the
        # contact point. If not, use the name of the author / publishing body (ckanext-dcat default).
        for contactPoint_ref in g.objects(dataset_ref, DCAT.contactPoint):
            for email in g.objects(contactPoint_ref, VCARD.hasEmail):
                g.remove((contactPoint_ref, VCARD.hasEmail, Literal(email)))
                g.add((contactPoint_ref, VCARD.hasEmail,
                       URIRef("mailto:" + email)))

        # Nr. 44 - Publisher
        publisher_ref = BNode()
        publisher_name = self._get_dataset_value(dataset_dict, 'author')
        publisher_url = self._get_dataset_value(dataset_dict, 'url')
        # first, remove the publishers added by the generic RDF profile, as they
        # are based on the CKAN Organization
        for publisher in g.objects(dataset_ref, DCT.publisher):
            g.remove((dataset_ref, DCT.publisher, publisher))

        g.add((publisher_ref, RDF.type, FOAF.Organization))
        g.add((publisher_ref, FOAF.name, Literal(publisher_name)))
        # if publisher_url:
        #     g.add( (publisher_ref, FOAF.homepage, URIRef(publisher_url)) )
        g.add((dataset_ref, DCT.publisher, publisher_ref))

        # Nr. 45 - Category
        groups = self._get_dataset_value(dataset_dict, 'groups')
        for group in groups:
            dcat_groups = self.category_mapping[group['name']]
            if dcat_groups is not None:
                for dcat_group in dcat_groups:
                    g.add((dataset_ref, DCAT.theme, MDRTHEME[dcat_group]))
                    # MDRTHEME.xyz is not dereferenceable, so we add some additional
                    # triples that link to the downloadable source:
                    g.add((MDRTHEME[dcat_group], RDFS.isDefinedBy,
                           URIRef(MDRTHEME)))
                    g.add((
                        URIRef(MDRTHEME), RDFS.seeAlso,
                        URIRef(
                            "http://publications.europa.eu/mdr/resource/authority/data-theme/skos-ap-eu/data-theme-skos-ap-act.rdf"
                        )))

        # Nr. 48 - conformsTo (application profile of the metadata)
        dcatapde_version = pylons.config.get('ckanext.dcatde.version')
        g.add((dataset_ref, DCT.conformsTo,
               URIRef("{}{}/".format(DCATDE, dcatapde_version))))

        # Nr. 49 - 52 (originator, maintainer, editor, author) - we don't know these

        # Nr. 59 - Language
        g.add((dataset_ref, DCT.language, MDRLANG.DEU))
        # MDRLANG.DEU is not dereferenceable, so we add some additional
        # triples that link to the downloadable source:
        g.add((MDRLANG.DEU, RDFS.isDefinedBy, URIRef(MDRLANG)))
        g.add((
            URIRef(MDRLANG), RDFS.seeAlso,
            URIRef(
                "http://publications.europa.eu/mdr/resource/authority/language/skos-ap-eu/languages-skos-ap-act.rdf"
            )))

        # Nr. 61 - Provenance

        # TODO: flag harvested datasets?

        # Nr. 66 - dct:spatial via geonames reference
        # Nr. 72 - dcatde:politicalGeocodingLevelURI
        # Nr. 73 - dcatde:politicalGeocodingURI
        # unfortunately only a partial fit for Berlin (only federal, state, administrativeDistrict)

        geographical_coverage = self._get_dataset_value(
            dataset_dict, 'geographical_coverage')
        if geographical_coverage in self.geo_coverage:
            coverage_object = self.geo_coverage[geographical_coverage]
            if 'geonames' in coverage_object:
                g.add((dataset_ref, DCT.spatial,
                       URIRef(coverage_object['geonames'])))
            if 'politicalGeocodingURI' in coverage_object:
                g.add((dataset_ref, DCATDE.politicalGeocodingURI,
                       URIRef(coverage_object['politicalGeocodingURI'])))
            if 'politicalGeocodingLevelURI' in coverage_object:
                g.add((dataset_ref, DCATDE.politicalGeocodingLevelURI,
                       URIRef(coverage_object['politicalGeocodingLevelURI'])))

        # Nr. 75 - dcatde:legalbasisText

        legalbasisText = self.legalBasis['default']
        org = dataset_dict.get('organization', {})
        if org and org['name'] in self.legalBasis['mapping']:
            legalbasisText = self.legalBasis['mapping'][org['name']]
        g.add((dataset_ref, DCATDE.legalbasisText, Literal(legalbasisText)))

        # Enhance Distributions
        ## License
        if 'license_id' in dataset_dict:
            ogd_license_code = dataset_dict['license_id']
            if ogd_license_code in self.license_mapping:
                dist_additions['license_id'] = self.license_mapping[
                    ogd_license_code]['dcatde-id']

        ## Attribution Text
        if 'attribution_text' in dataset_dict:
            dist_additions['attribution_text'] = dataset_dict.get(
                'attribution_text').encode('utf-8')

        for resource_dict in dataset_dict.get('resources', []):
            for distribution in g.objects(dataset_ref, DCAT.distribution):
                # Match distribution in graph and resource in ckan-dict
                if unicode(distribution) == resource_uri(resource_dict):
                    self.enhance_distribution_resource(g, distribution,
                                                       resource_dict,
                                                       dist_additions)

        # custom:

        # add information about the technical source of this dataset (webform, simplesearch, harvester, etc.)

        source = self._get_dataset_value(dataset_dict, 'berlin_source')
        if source:
            g.add((dataset_ref, DCT.accrualMethod, ACCRUAL_METHODS[source]))
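
The contact-point block above rewrites triples that an earlier, more generic profile already added: it iterates over the existing vcard:hasEmail objects, removes the plain literal and re-adds it as a mailto: URIRef. The same pattern in isolation (materialising the iterator with list() before mutating the graph):

from rdflib import Graph, Literal, URIRef, BNode
from rdflib.namespace import Namespace

VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')

g = Graph()
contact = BNode()
g.add((contact, VCARD.hasEmail, Literal('info@example.org')))

for email in list(g.objects(contact, VCARD.hasEmail)):
    g.remove((contact, VCARD.hasEmail, email))
    g.add((contact, VCARD.hasEmail, URIRef('mailto:' + str(email))))

assert (contact, VCARD.hasEmail,
        URIRef('mailto:info@example.org')) in g
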
Example #20
0
    def test_distribution_fields(self):

        resource = {
            'id': 'c041c635-054f-4431-b647-f9186926d021',
            'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'CSV file',
            'description': 'A CSV file',
            'url': 'http://example.com/data/file.csv',
            'status': 'http://purl.org/adms/status/Completed',
            'rights': 'Some statement about rights',
            'license': 'http://creativecommons.org/licenses/by/3.0/',
            'issued': '2015-06-26T15:21:09.034694',
            'modified': '2015-06-26T15:21:09.075774',
            'size': 1234,
            'documentation':
            '[\"http://dataset.info.org/distribution1/doc1\", \"http://dataset.info.org/distribution1/doc2\"]',
            'language':
            '[\"en\", \"es\", \"http://publications.europa.eu/resource/authority/language/ITA\"]',
            'conforms_to': '[\"Standard 1\", \"Standard 2\"]',
            'hash': '4304cf2e751e6053c90b1804c89c0ebb758f395a',
            'hash_algorithm':
            'http://spdx.org/rdf/terms#checksumAlgorithm_sha1',
        }

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'resources': [resource]
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(
            len([t
                 for t in g.triples((dataset_ref, DCAT.distribution, None))]),
            1)

        # URI
        distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
        eq_(unicode(distribution), utils.resource_uri(resource))

        # Basic fields
        assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
        assert self._triple(g, distribution, DCT.title, resource['name'])
        assert self._triple(g, distribution, DCT.description,
                            resource['description'])
        assert self._triple(g, distribution, DCT.rights, resource['rights'])
        assert self._triple(g, distribution, DCT.license,
                            URIRef(resource['license']))
        assert self._triple(g, distribution, ADMS.status,
                            URIRef(resource['status']))

        # List
        for item in [
            ('documentation', FOAF.page, URIRef),
            ('language', DCT.language, [Literal, Literal, URIRef]),
            ('conforms_to', DCT.conformsTo, Literal),
        ]:
            values = json.loads(resource[item[0]])
            eq_(len([t for t in g.triples((distribution, item[1], None))]),
                len(values))
            for num, value in enumerate(values):
                _type = item[2]
                if isinstance(item[2], list):
                    eq_(len(item[2]), len(values))
                    _type = item[2][num]
                assert self._triple(g, distribution, item[1], _type(value))

        # Dates
        assert self._triple(g, distribution, DCT.issued, resource['issued'],
                            XSD.dateTime)
        assert self._triple(g, distribution, DCT.modified,
                            resource['modified'], XSD.dateTime)

        # Numbers
        assert self._triple(g, distribution, DCAT.byteSize,
                            float(resource['size']), XSD.decimal)

        # Checksum
        checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
        assert checksum
        assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
        assert self._triple(
            g,
            checksum,
            SPDX.checksumValue,
            resource['hash'],
            data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
        assert self._triple(g, checksum, SPDX.algorithm,
                            URIRef(resource['hash_algorithm']))
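
The _triple helper used throughout this test is defined elsewhere in ckanext-dcat's test base class; its behaviour can be inferred from the calls above: it returns the first triple matching subject, predicate and object, wrapping plain values in a Literal (optionally typed). A minimal stand-in under those assumptions:

from rdflib import Literal, URIRef

def _triple(graph, subject, predicate, _object, data_type=None):
    """Return the first matching triple, or None (assumed behaviour)."""
    if _object is not None and not isinstance(_object, (URIRef, Literal)):
        if data_type:
            _object = Literal(_object, datatype=URIRef(data_type))
        else:
            _object = Literal(_object)
    triples = list(graph.triples((subject, predicate, _object)))
    return triples[0] if triples else None
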
Example #21
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa

        log.debug("Create graph from dataset '%s'" % dataset_dict['name'])

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, Literal),
            ('access_rights', DCT.accessRights, None, Literal),
            ('dcat_type', DCT.type, None, Literal),
            ('provenance', DCT.provenance, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        self._add_multilang_value(dataset_ref, DCT.description, 'description',
                                  dataset_dict)
        self._add_multilang_value(dataset_ref, DCT.title, 'title',
                                  dataset_dict)

        # LandingPage
        try:
            landing_page = uri_to_iri(dataset_dict['url'])
        except ValueError:
            landing_page = ''

        g.add((dataset_ref, DCAT.landingPage, Literal(landing_page)))

        # Keywords
        self._add_multilang_value(dataset_ref, DCAT.keyword, 'keywords',
                                  dataset_dict)

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        # Update Interval
        accrual_periodicity = dataset_dict.get('accrual_periodicity')
        if accrual_periodicity:
            g.add((dataset_ref, DCT.accrualPeriodicity,
                   URIRef(accrual_periodicity)))

        # Lists
        items = [
            ('language', DCT.language, None, Literal),
            ('theme', DCAT.theme, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, Literal),
            ('alternate_identifier', ADMS.identifier, None, Literal),
            ('documentation', FOAF.page, None, Literal),
            ('has_version', DCT.hasVersion, None, Literal),
            ('is_version_of', DCT.isVersionOf, None, Literal),
            ('source', DCT.source, None, Literal),
            ('sample', ADMS.sample, None, Literal),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Relations
        if dataset_dict.get('relations'):
            relations = dataset_dict.get('relations')
            for relation in relations:
                relation_name = relation['label']
                try:
                    relation_url = uri_to_iri(relation['url'])
                except ValueError:
                    # skip this relation if the URL is invalid
                    continue

                relation = URIRef(relation_url)
                g.add((relation, RDFS.label, Literal(relation_name)))
                g.add((dataset_ref, DCT.relation, relation))

        # References
        if dataset_dict.get('see_alsos'):
            references = dataset_dict.get('see_alsos')
            for reference in references:
                # we only expect dicts here
                if not isinstance(reference, dict):
                    continue
                reference_identifier = reference.get('dataset_identifier')
                if reference_identifier:
                    g.add((dataset_ref, RDFS.seeAlso,
                           Literal(reference_identifier)))

        # Contact details
        if dataset_dict.get('contact_points'):
            contact_points = self._get_dataset_value(dataset_dict,
                                                     'contact_points')  # noqa
            for contact_point in contact_points:
                contact_details = BNode()
                contact_point_email = contact_point['email']
                contact_point_name = contact_point['name']

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((contact_details, VCARD.hasEmail,
                       URIRef(contact_point_email)))  # noqa
                g.add((contact_details, VCARD.fn, Literal(contact_point_name)))

                g.add((dataset_ref, DCAT.contactPoint, contact_details))

        # Publisher
        if dataset_dict.get('publishers'):
            publishers = dataset_dict.get('publishers')
            for publisher in publishers:
                publisher_name = publisher['label']

                publisher_details = BNode()
                g.add((publisher_details, RDF.type, RDF.Description))
                g.add((publisher_details, RDFS.label, Literal(publisher_name)))
                g.add((dataset_ref, DCT.publisher, publisher_details))

        # Temporals
        temporals = dataset_dict.get('temporals')
        if temporals:
            for temporal in temporals:
                start = temporal['start_date']
                end = temporal['end_date']
                if start or end:
                    temporal_extent = BNode()
                    g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                    if start:
                        self._add_date_triple(temporal_extent,
                                              SCHEMA.startDate, start)
                    if end:
                        self._add_date_triple(temporal_extent, SCHEMA.endDate,
                                              end)
                    g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Themes
        groups = self._get_dataset_value(dataset_dict, 'groups')
        for group_name in groups:
            g.add((dataset_ref, DCAT.theme,
                   URIRef(ogd_theme_base_url + group_name.get('name'))))

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('status', ADMS.status, None, Literal),
                ('rights', DCT.rights, None, Literal),
                ('license', DCT.license, None, Literal),
                ('identifier', DCT.identifier, None, Literal),
                ('media_type', DCAT.mediaType, None, Literal),
                ('spatial', DCT.spatial, None, Literal),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            self._add_multilang_value(distribution, DCT.title, 'display_name',
                                      resource_dict)  # noqa
            self._add_multilang_value(distribution, DCT.description,
                                      'description', resource_dict)  # noqa

            #  Lists
            items = [
                ('documentation', FOAF.page, None, Literal),
                ('language', DCT.language, None, Literal),
                ('conforms_to', DCT.conformsTo, None, Literal),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # Download URL & Access URL
            download_url = resource_dict.get('download_url')
            if download_url:
                try:
                    download_url = uri_to_iri(download_url)
                    g.add(
                        (distribution, DCAT.downloadURL, URIRef(download_url)))
                except ValueError:
                    # only add valid URL
                    pass

            url = resource_dict.get('url')
            if (url and not download_url) or (url and url != download_url):
                try:
                    url = uri_to_iri(url)
                    g.add((distribution, DCAT.accessURL, URIRef(url)))
                except ValueError:
                    # only add valid URL
                    pass
            elif download_url:
                g.add((distribution, DCAT.accessURL, URIRef(download_url)))

            # Format
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))

            # Mime-Type
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # ByteSize
            if resource_dict.get('byte_size'):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['byte_size'])))
    def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa: C90
        try:

            g = self.g

            g.add((dataset_ref, RDF.type, DCAT.Dataset))

            for prefix, namespace in namespaces.iteritems():
                g.bind(prefix, namespace)

            # Basic fields
            basic_items = [
                ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ]
            self._add_triples_from_dict(dataset_dict, dataset_ref, basic_items)

            # landingPage is the original portal page
            site_url = pylons.config.get('ckan.site_url', '')
            g.add((
                dataset_ref,
                DCAT.landingPage,
                Literal(site_url + '/dataset/' + dataset_dict['name'])
            ))

            # Language
            g.add((dataset_ref, DCT.language, Literal(ckan_locale_default)))

            # Basic date fields
            date_items = [
                ('dateLastUpdated', DCT.modified, 'metadata_modified', Literal),  # noqa
                ('dateFirstPublished', DCT.issued, 'metadata_created', Literal),  # noqa
            ]
            self._add_date_triples_from_dict(
                dataset_dict,
                dataset_ref,
                date_items
            )

            # Organization
            organization_id = pylons.config.get(
                'ckanext.stadtzh-theme.dcat_ap_organization_slug',
                '',
            )
            id = self._get_dataset_value(dataset_dict, 'id')
            title = self._get_dataset_value(dataset_dict, 'title')
            description = self._get_dataset_value(dataset_dict, 'notes')
            g.add((
                dataset_ref,
                DCT.identifier,
                Literal(id + '@' + organization_id)
            ))
            g.add((
                dataset_ref,
                DCT.title,
                Literal(title, lang=ckan_locale_default)
            ))
            g.add((
                dataset_ref,
                DCT.description,
                Literal(description, lang=ckan_locale_default)
            ))

            # Update Interval
            try:
                update_interval = self._get_dataset_value(
                    dataset_dict,
                    'updateInterval'
                )
                accrualPeriodicity = mapping_accrualPeriodicity.get(
                    update_interval[0]
                )
            except IndexError:
                accrualPeriodicity = None
            if accrualPeriodicity:
                g.add((
                    dataset_ref,
                    DCT.accrualPeriodicity,
                    URIRef(accrualPeriodicity)
                ))

            # Temporal
            time_range = self._time_interval_from_dataset(dataset_dict)
            if time_range is not None and time_range.get('start_date') and time_range.get('end_date'):  # noqa
                start = time_range.get('start_date')
                end = time_range.get('end_date')

                temporal_extent = BNode()
                g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                g.add((
                    temporal_extent,
                    SCHEMA.startDate,
                    Literal(start, datatype=XSD.date)
                ))
                g.add((
                    temporal_extent,
                    SCHEMA.endDate,
                    Literal(end, datatype=XSD.date)
                ))
                g.add((dataset_ref, DCT.temporal, temporal_extent))

            # Themes
            groups = self._get_dataset_value(dataset_dict, 'groups')
            try:
                theme_names = set(itertools.chain.from_iterable(
                    [self._themes(group.get('name')) for group in
                     groups]))
                if any(tag['name'] == 'geodaten'
                       for tag in dataset_dict.get('tags', [])):
                    theme_names.add('geography')

                for theme_name in theme_names:
                    g.add((
                        dataset_ref,
                        DCAT.theme,
                        URIRef(ogd_theme_base_url + theme_name)
                    ))
            except IndexError:
                pass

            # Legal Information
            legal_information = self._get_dataset_value(
                dataset_dict,
                'legalInformation'
            )
            g.add((dataset_ref, DCT.accessRights, Literal(legal_information)))

            # Contact details
            if any([
                self._get_dataset_value(dataset_dict, 'contact_uri'),
                self._get_dataset_value(dataset_dict, 'contact_name'),
                self._get_dataset_value(dataset_dict, 'contact_email'),
                self._get_dataset_value(dataset_dict, 'maintainer'),
                self._get_dataset_value(dataset_dict, 'maintainer_email'),
                self._get_dataset_value(dataset_dict, 'author'),
                self._get_dataset_value(dataset_dict, 'author_email'),
            ]):

                contact_details = BNode()

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((dataset_ref, DCAT.contactPoint, contact_details))

                maintainer_email = self._get_dataset_value(
                    dataset_dict,
                    'maintainer_email'
                )
                g.add((contact_details, VCARD.hasEmail, URIRef(maintainer_email)))  # noqa

                items = [
                    ('contact_name', VCARD.fn, ['maintainer', 'author'], Literal),  # noqa
                ]
                self._add_triples_from_dict(dataset_dict, contact_details, items)   # noqa

            # Tags
            for tag in dataset_dict.get('tags', []):
                g.add((
                    dataset_ref,
                    DCAT.keyword,
                    Literal(tag['name'], lang=ckan_locale_default)
                ))

            # Resources
            for resource_dict in dataset_dict.get('resources', []):
                distribution = URIRef(resource_uri(resource_dict))

                g.add((dataset_ref, DCAT.distribution, distribution))
                g.add((distribution, RDF.type, DCAT.Distribution))
                g.add((distribution, DCT.language, Literal(ckan_locale_default)))  # noqa

                #  Simple values
                items = [
                    ('id', DCT.identifier, None, Literal),
                    ('name', DCT.title, None, Literal),
                    ('description', DCT.description, None, Literal),
                    ('state', ADMS.status, None, Literal),
                ]

                self._add_triples_from_dict(resource_dict, distribution, items)

                license_id = self._get_dataset_value(dataset_dict, 'license_id')  # noqa
                license_title = self._rights(license_id)
                g.add((distribution, DCT.rights, Literal(license_title)))
                g.add((distribution, DCT.license, Literal(license_title)))

                #  Lists
                items = [
                    ('conforms_to', DCT.conformsTo, None, Literal),
                ]
                self._add_list_triples_from_dict(
                    resource_dict,
                    distribution,
                    items
                )

                # Format
                if '/' in resource_dict.get('format', ''):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['format'])))
                else:
                    if resource_dict.get('format'):
                        g.add((distribution, DCT['format'],
                               Literal(resource_dict['format'])))

                    if resource_dict.get('mimetype'):
                        g.add((distribution, DCAT.mediaType,
                               Literal(resource_dict['mimetype'])))

                # URLs
                url = resource_dict.get('url')
                if url:
                    g.add((distribution, DCAT.accessURL, Literal(url)))

                # if the resource has one of the following formats, the
                # distribution is a service and therefore doesn't need a
                # downloadURL
                resource_format = (resource_dict.get('format') or '').lower()
                if resource_format not in ['xml', 'wms', 'wmts', 'wfs']:
                    download_url = resource_dict.get('url')
                    if download_url:
                        g.add((
                            distribution,
                            DCAT.downloadURL,
                            Literal(download_url)
                        ))

                # Dates
                items = [
                    ('created', DCT.issued, None, Literal),
                    ('last_modified', DCT.modified, None, Literal),
                ]

                self._add_date_triples_from_dict(
                    resource_dict,
                    distribution,
                    items
                )

                # Numbers
                if resource_dict.get('size'):
                    try:
                        g.add((distribution, DCAT.byteSize,
                               Literal(float(resource_dict['size']),
                                       datatype=XSD.decimal)))
                    except (ValueError, TypeError):
                        g.add((distribution, DCAT.byteSize,
                               Literal(resource_dict['size'])))
                # Checksum
                if resource_dict.get('hash'):
                    checksum = BNode()
                    g.add((checksum, SPDX.checksumValue,
                           Literal(resource_dict['hash'],
                                   datatype=XSD.hexBinary)))

                    if resource_dict.get('hash_algorithm'):
                        if resource_dict['hash_algorithm'].startswith('http'):
                            g.add((checksum, SPDX.algorithm,
                                   URIRef(resource_dict['hash_algorithm'])))
                        else:
                            g.add((checksum, SPDX.algorithm,
                                   Literal(resource_dict['hash_algorithm'])))
                    g.add((distribution, SPDX.checksum, checksum))

            # Publisher
            if dataset_dict.get('organization'):

                publisher_name = dataset_dict.get('author')

                publisher_details = BNode()

                g.add((publisher_details, RDF.type, RDF.Description))
                g.add((publisher_details, RDFS.label, Literal(publisher_name)))
                g.add((dataset_ref, DCT.publisher, publisher_details))
        except Exception as e:
            log.exception(
                "Something went wrong: %s / %s" % (e, traceback.format_exc())
            )
            raise
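
Several of the profiles above attach checksums the same way: a blank node typed spdx:Checksum carrying a hexBinary spdx:checksumValue, with the algorithm emitted as a URIRef when it looks like a URI. As a reusable sketch (note that the variant directly above omits the rdf:type triple; this one follows the fuller versions):

from rdflib import Graph, Literal, URIRef, BNode
from rdflib.namespace import Namespace, RDF, XSD

SPDX = Namespace('http://spdx.org/rdf/terms#')

def add_checksum(g, distribution, hash_value, hash_algorithm=None):
    """Attach an spdx:Checksum blank node to a distribution."""
    checksum = BNode()
    g.add((checksum, RDF.type, SPDX.Checksum))
    g.add((checksum, SPDX.checksumValue,
           Literal(hash_value, datatype=XSD.hexBinary)))
    if hash_algorithm:
        if hash_algorithm.startswith('http'):
            g.add((checksum, SPDX.algorithm, URIRef(hash_algorithm)))
        else:
            g.add((checksum, SPDX.algorithm, Literal(hash_algorithm)))
    g.add((distribution, SPDX.checksum, checksum))

g = Graph()
dist = URIRef('http://example.com/dataset/1/resource/1')
add_checksum(g, dist, '4304cf2e751e6053c90b1804c89c0ebb758f395a',
             'http://spdx.org/rdf/terms#checksumAlgorithm_sha1')
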
Example #23
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """ Transforms CKAN-Dictionary to DCAT-AP.de-Data """
        g = self.g

        # bind namespaces to get readable prefixes in the RDF document
        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        # Simple additional fields
        items = [('qualityProcessURI', DCATDE.qualityProcessURI, None, URIRef),
                 ('metadata_original_html', DCAT.landingPage, None, URIRef),
                 ('politicalGeocodingLevelURI',
                  DCATDE.politicalGeocodingLevelURI, None, URIRef)]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Additional Lists
        items = [('contributorID', DCATDE.contributorID, None, Literal),
                 ('politicalGeocodingURI', DCATDE.politicalGeocodingURI,
                  None, URIRef),
                 ('legalbasisText', DCATDE.legalBasis, None, Literal),
                 ('geocodingText', DCATDE.geocodingDescription, None, Literal)]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Add adminUnitL2 for every politicalGeocodingURI value (for compatibility).
        if self._get_dataset_value(dataset_dict, 'politicalGeocodingURI'):
            spatial_ref = BNode()
            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            items = [('politicalGeocodingURI', LOCN.adminUnitL2, None, URIRef)]
            self._add_list_triples_from_dict(dataset_dict, spatial_ref, items)

        # Contacts
        self._add_contact(dataset_dict, dataset_ref, DCATDE.originator,
                          'originator')
        self._add_contact(dataset_dict, dataset_ref, DCATDE.maintainer,
                          'maintainer')
        self._add_contact(dataset_dict, dataset_ref, DCT.contributor,
                          'contributor')
        self._add_contact(dataset_dict, dataset_ref, DCT.creator, 'author')

        # Add maintainer_url to contact_point
        maintainer_url = self._get_dataset_value(dataset_dict,
                                                 'maintainer_url')
        if maintainer_url:
            contact_point = self._get_or_create_contact_point(
                dataset_dict, dataset_ref)
            self._add_triple_from_dict(dataset_dict,
                                       contact_point,
                                       VCARD.hasURL,
                                       'maintainer_url',
                                       _type=URIRef)

        # add maintainer_tel to contact_point
        maintainer_tel = self._get_dataset_value(dataset_dict,
                                                 'maintainer_tel')
        if maintainer_tel:
            contact_point = self._get_or_create_contact_point(
                dataset_dict, dataset_ref)
            self._add_triple_from_dict(dataset_dict,
                                       contact_point,
                                       VCARD.hasTelephone,
                                       'maintainer_tel',
                                       _type=URIRef,
                                       value_modifier=self._add_tel)

        # add maintainer postal data to contact_point if available
        vcard_mapping = {
            'street': VCARD.hasStreetAddress,
            'city': VCARD.hasLocality,
            'zip': VCARD.hasPostalCode,
            'country': VCARD.hasCountryName
        }
        for vc_name in vcard_mapping:
            vcard_fld = self._get_dataset_value(dataset_dict,
                                                'maintainer_' + vc_name)
            if vcard_fld:
                contact_point = self._get_or_create_contact_point(
                    dataset_dict, dataset_ref)
                g.add((contact_point, vcard_mapping[vc_name],
                       Literal(vcard_fld)))

        # Groups
        groups = self._get_dataset_value(dataset_dict, 'groups')
        for group in groups:
            group_name_in_dict = group['name']
            if group_name_in_dict:
                value_to_add = self._removeWhitespaces(group_name_in_dict)
                if value_to_add:
                    g.add((dataset_ref, DCAT.theme,
                           URIRef(dcat_theme_prefix + value_to_add.upper())))

        # used_datasets
        items = [
            ('used_datasets', DCT.relation, None, URIRef),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Enhance Distributions
        for resource_dict in dataset_dict.get('resources', []):
            for distribution in g.objects(dataset_ref, DCAT.distribution):
                # Match distribution in graph and distribution in ckan-dict
                if unicode(distribution) == resource_uri(resource_dict):
                    items = [('licenseAttributionByText',
                              DCATDE.licenseAttributionByText, None, Literal),
                             ('plannedAvailability',
                              DCATDE.plannedAvailability, None, URIRef)]
                    self._add_triples_from_dict(resource_dict, distribution,
                                                items)
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        # -- start
        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('title', DCT.title, None, Literal),
            ('notes', DCT.description, None, Literal),
            ('url', DCAT.landingPage, None, URIRef),
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, URIRef),
            ('subject', DCT.subject, None,
             URIRef),  # Mentioned in the vocabulary
            ('provenance', DCT.provenance, None, URIRef)
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        #  Lists
        items = [('language', DCT.language, None, URIRef),
                 ('theme', DCAT.theme, None, URIRef),
                 ('spatial_uri', DCT.spatial, None, URIRef),
                 ('conforms_to', DCT.conformsTo, None, URIRef),
                 ('alternate_identifier', ADMS.identifier, None, Literal),
                 ('documentation', FOAF.page, None, URIRef),
                 ('access_rights', DCT.accessRights, None, URIRef),
                 ('related_resource', DCT.relation, None, URIRef),
                 ('has_version', DCT.hasVersion, None, Literal),
                 ('is_version_of', DCT.isVersionOf, None, Literal),
                 ('source', DCT.source, None, Literal),
                 ('sample', ADMS.sample, None, Literal)]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Contact details
        if any([
                self._get_dataset_value(dataset_dict, 'contact_uri'),
                self._get_dataset_value(dataset_dict, 'contact_name'),
                self._get_dataset_value(dataset_dict, 'contact_email'),
                self._get_dataset_value(dataset_dict, 'maintainer'),
                self._get_dataset_value(dataset_dict, 'maintainer_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(contact_uri)
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Kind))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            items = [
                ('contact_name', VCARD.fn, ['maintainer'], Literal),
                ('contact_email', VCARD.hasEmail, ['maintainer_email'],
                 Literal),
            ]

            self._add_triples_from_dict(dataset_dict, contact_details, items)

        # Publisher
        if any([
                self._get_dataset_value(dataset_dict, 'publisher_uri'),
                self._get_dataset_value(dataset_dict, 'publisher_name'),
                self._get_dataset_value(dataset_dict, 'publisher_identifier'),
                dataset_dict.get('organization'),
        ]):

            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
            if publisher_uri:
                publisher_details = URIRef(publisher_uri)
            else:
                # Neither organization nor publisher_uri
                publisher_details = BNode()

            g.add((publisher_details, RDF.type, FOAF.Agent))
            g.add((dataset_ref, DCT.publisher, publisher_details))

            publisher_name = self._get_dataset_value(dataset_dict,
                                                     'publisher_name')
            if not publisher_name and dataset_dict.get('organization'):
                publisher_name = dataset_dict['organization']['title']

            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
            # TODO: It would make sense to fallback these to organization
            # fields but they are not in the default schema and the
            # `organization` object in the dataset_dict does not include
            # custom fields
            items = [('publisher_email', FOAF.mbox, None, Literal),
                     ('publisher_identifier', DCT.identifier, None, Literal),
                     ('publisher_url', FOAF.homepage, None, URIRef),
                     ('publisher_type', DCT.type, None, Literal)]

            self._add_triples_from_dict(dataset_dict, publisher_details, items)

        # Temporal
        start = self._get_dataset_value(dataset_dict, 'temporal_start')
        end = self._get_dataset_value(dataset_dict, 'temporal_end')
        if start or end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # parts - has part/is part of

        if any([
                self._get_dataset_value(dataset_dict, 'has_part'),
                self._get_dataset_value(dataset_dict, 'is_part_of')
        ]):
            items = [('has_part', DCT.hasPart, None, URIRef),
                     ('is_part_of', DCT.isPartOf, None, URIRef)]

            self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Spatial
        spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
        spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
        spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

        if spatial_uri:
            spatial_uri = get_spatial_uri(spatial_uri)  # map from code to URI

        if spatial_uri or spatial_text or spatial_geom:
            if spatial_uri:
                spatial_ref = URIRef(spatial_uri)
            else:
                spatial_ref = BNode()

            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            if spatial_text:
                g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

            if spatial_geom:
                # GeoJSON
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial_geom, datatype=GEOJSON_IMT)))
                # WKT, because GeoDCAT-AP says so
                try:
                    g.add((spatial_ref, LOCN.geometry,
                           Literal(wkt.dumps(json.loads(spatial_geom),
                                             decimals=4),
                                   datatype=GSP.wktLiteral)))
                except (TypeError, ValueError, InvalidGeoJSONException):
                    pass

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            if 'license' not in resource_dict and 'license_id' in dataset_dict:
                lr = LicenseRegister()
                _license = lr.get(dataset_dict['license_id'])
                if _license:
                    resource_dict['license'] = _license.url

            #  Simple values
            items = [
                ('name', DCT.title, None, Literal),
                ('description', DCT.description, None, Literal),
                ('status', ADMS.status, None, Literal),
                ('rights', DCT.rights, None, Literal),
                ('license', DCT.license, None, URIRef),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            #  Lists
            items = [
                ('documentation', FOAF.page, None, URIRef),
                ('language', DCT.language, None, URIRef),
                ('conforms_to', DCT.conformsTo, None, URIRef),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, URIRef(url)))

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
            # Checksum
            if resource_dict.get('hash'):
                checksum = BNode()
                g.add((checksum, SPDX.checksumValue,
                       Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

                if resource_dict.get('hash_algorithm'):
                    if resource_dict['hash_algorithm'].startswith('http'):
                        g.add((checksum, SPDX.algorithm,
                               URIRef(resource_dict['hash_algorithm'])))
                    else:
                        g.add((checksum, SPDX.algorithm,
                               Literal(resource_dict['hash_algorithm'])))
                g.add((distribution, SPDX.checksum, checksum))
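Note: the `items` tuples used throughout these profiles follow the convention (dict_key, rdf_predicate, fallback_keys, node_type). A minimal sketch of what a helper like `_add_triples_from_dict` presumably does with each tuple (behaviour inferred from the calls above, not copied from ckanext-dcat):

from rdflib import Graph, URIRef, Literal

def add_triples_from_dict(g, data_dict, subject, items):
    # Sketch: for each (key, predicate, fallbacks, _type) tuple, read
    # data_dict[key], fall back to the listed keys when empty, and add
    # one triple whose object is built with the given node type.
    for key, predicate, fallbacks, _type in items:
        value = data_dict.get(key)
        for fallback in (fallbacks or []):
            if value:
                break
            value = data_dict.get(fallback)
        if value:
            g.add((subject, predicate, _type(value)))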
    def test_distribution_fields(self):

        resource = {
            "id": "c041c635-054f-4431-b647-f9186926d021",
            "package_id": "4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6",
            "name": "CSV file",
            "description": "A CSV file",
            "url": "http://example.com/data/file.csv",
            "status": "http://purl.org/adms/status/Completed",
            "rights": "Some statement about rights",
            "license": "http://creativecommons.org/licenses/by/3.0/",
            "issued": "2015-06-26T15:21:09.034694",
            "modified": "2015-06-26T15:21:09.075774",
            "size": 1234,
            "documentation": '["http://dataset.info.org/distribution1/doc1", "http://dataset.info.org/distribution1/doc2"]',
            "language": '["en", "es", "ca"]',
            "conforms_to": '["Standard 1", "Standard 2"]',
            "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a",
            "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1",
        }

        dataset = {
            "id": "4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6",
            "name": "test-dataset",
            "title": "Test DCAT dataset",
            "resources": [resource],
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

        # URI
        distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
        eq_(unicode(distribution), utils.resource_uri(resource))

        # Basic fields
        assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
        assert self._triple(g, distribution, DCT.title, resource["name"])
        assert self._triple(g, distribution, DCT.description, resource["description"])
        assert self._triple(g, distribution, DCT.rights, resource["rights"])
        assert self._triple(g, distribution, DCT.license, resource["license"])
        assert self._triple(g, distribution, ADMS.status, resource["status"])

        # List
        for item in [("documentation", FOAF.page), ("language", DCT.language), ("conforms_to", DCT.conformsTo)]:
            values = json.loads(resource[item[0]])
            eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, distribution, item[1], value)

        # Dates
        assert self._triple(g, distribution, DCT.issued, resource["issued"], XSD.dateTime)
        assert self._triple(g, distribution, DCT.modified, resource["modified"], XSD.dateTime)

        # Numbers
        assert self._triple(g, distribution, DCAT.byteSize, float(resource["size"]), XSD.decimal)

        # Checksum
        checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
        assert checksum
        assert self._triple(
            g, checksum, SPDX.checksumValue, resource["hash"], data_type="http://www.w3.org/2001/XMLSchema#hexBinary"
        )
        assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource["hash_algorithm"]))
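The assertions above rely on a `_triple` test helper that is not shown in this listing; a plausible sketch (an assumption, not the actual ckanext-dcat test base):

from rdflib import Literal, URIRef

def _triple(graph, subject, predicate, _object, data_type=None):
    # Sketch: None acts as a wildcard; plain values are normalised into
    # (optionally typed) Literals before matching against the graph.
    if _object is not None and not isinstance(_object, (Literal, URIRef)):
        _object = Literal(_object, datatype=data_type)
    triples = [t for t in graph.triples((subject, predicate, _object))]
    return triples[0] if triples else None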
Example #26
0
class ItalianDCATAPProfile(RDFProfile):
    '''
    An RDF profile for the Italian DCAT-AP recommendation for data portals
    It requires the European DCAT-AP profile (`euro_dcat_ap`)
    '''
    def parse_dataset(self, dataset_dict, dataset_ref):

        # check the dataset type
        if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
            # not a DCATAPIT dataset
            return dataset_dict

        # date info
        for predicate, key, logf in (
            (DCT.issued, 'issued', log.debug),
            (DCT.modified, 'modified', log.warn),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)

                value = helpers.format(value, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..1 predicates
        for predicate, key, logf in ((DCT.identifier, 'identifier',
                                      log.warn), ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..n predicates list
        for predicate, key, logf in (
            (ADMS.identifier, 'alternate_identifier', log.debug),
            (DCT.isVersionOf, 'is_version_of', log.debug),
        ):
            valueList = self._object_value_list(dataset_ref, predicate)
            if valueList:
                self._remove_from_extra(dataset_dict, key)
                value = ','.join(valueList)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # conformsTo
        self._remove_from_extra(dataset_dict, 'conforms_to')
        conform_list = []
        for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
            conform_list.append(self._object_value(conforms_to,
                                                   DCT.identifier))
        if conform_list:
            value = ','.join(conform_list)
            dataset_dict['conforms_to'] = value
        else:
            log.debug('No DCT.conformsTo found for dataset "%s"',
                      dataset_dict.get('title', '---'))

        # Temporal
        start, end = self._time_interval(dataset_ref, DCT.temporal)
        for v, key, logf in (
            (start, 'temporal_start', log.debug),
            (end, 'temporal_end', log.debug),
        ):
            if v:
                self._remove_from_extra(dataset_dict, key)

                value = helpers.format(v, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                log.warn('No %s Date found for dataset "%s"', key,
                         dataset_dict.get('title', '---'))

        # URI 0..1
        for predicate, key, base_uri in ((DCT.accrualPeriodicity, 'frequency',
                                          FREQ_BASE_URI), ):
            valueRef = self._object_value(dataset_ref, predicate)
            if valueRef:
                self._remove_from_extra(dataset_dict, key)
                value = self._strip_uri(valueRef, base_uri)
                dataset_dict[key] = value
            else:
                log.warn('No %s found for dataset "%s"', predicate,
                         dataset_dict.get('title', '---'))

        # URI lists
        for predicate, key, base_uri in (
            (DCT.language, 'language', LANG_BASE_URI),
            (DCAT.theme, 'theme', THEME_BASE_URI),
        ):
            self._remove_from_extra(dataset_dict, key)
            valueRefList = self._object_value_list(dataset_ref, predicate)
            valueList = [
                self._strip_uri(valueRef, base_uri)
                for valueRef in valueRefList
            ]
            value = ','.join(valueList)
            if len(valueList) > 1:
                value = '{' + value + '}'
            dataset_dict[key] = value

        # Spatial
        spatial_tags = []
        geonames_url = None

        for spatial in self.g.objects(dataset_ref, DCT.spatial):
            for spatial_literal in self.g.objects(
                    spatial, DCATAPIT.geographicalIdentifier):
                spatial_value = spatial_literal.value
                if GEO_BASE_URI in spatial_value:
                    spatial_tags.append(
                        self._strip_uri(spatial_value, GEO_BASE_URI))
                else:
                    if geonames_url:
                        log.warn(
                            "GeoName URL is already set to %s, value %s will not be imported",
                            geonames_url, spatial_value)
                    else:
                        geonames_url = spatial_value

        if len(spatial_tags) > 0:
            value = ','.join(spatial_tags)
            if len(spatial_tags) > 1:
                value = '{' + value + '}'
            dataset_dict['geographical_name'] = value

        if geonames_url:
            dataset_dict['geographical_geonames_url'] = geonames_url

        ### Collect strings from multilang fields

        # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...}
        localized_dict = {}

        for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
        ):
            self._collect_multilang_strings(dataset_dict, key, dataset_ref,
                                            predicate, localized_dict)

        # Agents
        for predicate, basekey in (
            (DCT.publisher, 'publisher'),
            (DCT.rightsHolder, 'holder'),
            (DCT.creator, 'creator'),
        ):
            agent_dict, agent_loc_dict = self._parse_agent(
                dataset_ref, predicate, basekey)
            for key, v in agent_dict.iteritems():
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = v
            localized_dict.update(agent_loc_dict)

        # when all localized data have been parsed, check whether there really is any, and add it to the dict
        if len(localized_dict) > 0:
            log.debug('Found multilang metadata')
            dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict

        ### Resources

        resources_loc_dict = {}

        # In CKAN, the license is a dataset property, not a resource one.
        # We'll collect all of the resources' licenses, then postprocess them.
        licenses = []  # contains (url, name) tuples

        for resource_dict in dataset_dict.get('resources', []):
            resource_uri = resource_dict['uri']
            if not resource_uri:
                log.warn("URI not defined for resource %s",
                         resource_dict['name'])
                continue

            distribution = URIRef(resource_uri)
            if (dataset_ref, DCAT.distribution, distribution) not in self.g:
                log.warn("Distribution not found in dataset %s", resource_uri)
                continue

            # URI 0..1
            for predicate, key, base_uri in (
                (DCT['format'], 'format', FORMAT_BASE_URI),  # Format
            ):
                valueRef = self._object_value(distribution, predicate)
                if valueRef:
                    value = self._strip_uri(valueRef, base_uri)
                    resource_dict[key] = value
                else:
                    log.warn('No %s found for resource "%s"::"%s"', predicate,
                             dataset_dict.get('title', '---'),
                             resource_dict.get('name', '---'))

            # License
            license = self._object(distribution, DCT.license)
            if license:
                # just add this info in the resource extras
                resource_dict['license_url'] = str(license)
                license_name = self._object_value(
                    license, FOAF.name)  # may be either the title or the id
                if license_name:
                    # just add this info in the resource extras
                    resource_dict['license_name'] = license_name
                else:
                    license_name = "unknown"
                licenses.append((str(license), license_name))
            else:
                log.warn('No license found for resource "%s"::"%s"',
                         dataset_dict.get('title', '---'),
                         resource_dict.get('name', '---'))

            # Multilang
            loc_dict = {}

            for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
            ):
                self._collect_multilang_strings(resource_dict, key,
                                                distribution, predicate,
                                                loc_dict)

            if len(loc_dict) > 0:
                log.debug('Found multilang metadata in resource %s',
                          resource_dict['name'])
                resources_loc_dict[resource_uri] = loc_dict

        if len(resources_loc_dict) > 0:
            log.debug('Found multilang metadata in resources')
            dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict

        # postprocess licenses
        # license_ids = {id for url,id in licenses}  # does not work in python 2.6
        license_ids = set()
        for url, id in licenses:
            license_ids.add(id)

        if license_ids:
            if len(license_ids) > 1:
                log.warn('More than one license found for dataset "%s"',
                         dataset_dict.get('title', '---'))
            dataset_dict['license_id'] = license_ids.pop()  # take an arbitrary one

        return dataset_dict
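The comma-joined, brace-wrapped strings built above for `language`, `theme` and `geographical_name` follow a PostgreSQL-array-like convention. A small round-trip sketch of that convention (helper names are illustrative):

def encode_multivalue(values):
    # ['ITA', 'ENG'] -> '{ITA,ENG}'; a single value stays bare: ['ITA'] -> 'ITA'
    value = ','.join(values)
    return '{' + value + '}' if len(values) > 1 else value

def decode_multivalue(value):
    # '{ITA,ENG}' -> ['ITA', 'ENG'], mirroring the brace-stripping above
    return [v.replace('{', '').replace('}', '') for v in value.split(',')]

assert encode_multivalue(['ITA', 'ENG']) == '{ITA,ENG}'
assert decode_multivalue('{ITA,ENG}') == ['ITA', 'ENG']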

    def _collect_multilang_strings(self, base_dict, key, subj, pred, loc_dict):
        '''
        Search for multilang Literals matching (subj, pred).
        - Non-localized Literals will be stored as base_dict[key] -- possibly replacing the value set by the EURO parser
        - Localized Literals will be stored into loc_dict[key][lang]
        '''

        for obj in self.g.objects(subj, pred):
            value = obj.value
            lang = obj.language
            if not lang:
                # force default value in dataset
                base_dict[key] = value
            else:
                # add localized string
                lang_dict = loc_dict.setdefault(key, {})
                lang_dict[lang_mapping_xmllang_to_ckan.get(lang)] = value

    def _remove_from_extra(self, dataset_dict, key):

        #  search and replace
        for extra in dataset_dict.get('extras', []):
            if extra['key'] == key:
                dataset_dict['extras'].pop(dataset_dict['extras'].index(extra))
                return

    def _add_or_replace_extra(self, dataset_dict, key, value):

        #  search and replace
        for extra in dataset_dict.get('extras', []):
            if extra['key'] == key:
                extra['value'] = value
                return

        # add if not found
        dataset_dict['extras'].append({'key': key, 'value': value})
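For context, CKAN keeps extras as a list of {'key': ..., 'value': ...} dicts, which is what both helpers above walk. A usage sketch (the profile constructor signature is assumed):

from rdflib import Graph

profile = ItalianDCATAPProfile(Graph())  # constructor signature assumed

dataset_dict = {'extras': [{'key': 'frequency', 'value': 'ANNUAL'}]}
profile._add_or_replace_extra(dataset_dict, 'frequency', 'BIENNIAL')
assert dataset_dict['extras'] == [{'key': 'frequency', 'value': 'BIENNIAL'}]

profile._remove_from_extra(dataset_dict, 'frequency')
assert dataset_dict['extras'] == []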

    def _parse_agent(self, subject, predicate, base_name):

        agent_dict = {}
        loc_dict = {}

        for agent in self.g.objects(subject, predicate):
            agent_dict[base_name + '_identifier'] = self._object_value(
                agent, DCT.identifier)
            self._collect_multilang_strings(agent_dict, base_name + '_name',
                                            agent, FOAF.name, loc_dict)

        return agent_dict, loc_dict

    def _strip_uri(self, value, base_uri):
        return value.replace(base_uri, '')
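`_strip_uri` simply drops a known vocabulary prefix to recover the bare code; for example (the base URI value is assumed):

LANG_BASE_URI = 'http://publications.europa.eu/resource/authority/language/'  # assumed

uri = 'http://publications.europa.eu/resource/authority/language/ITA'
assert uri.replace(LANG_BASE_URI, '') == 'ITA'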

    def graph_from_dataset(self, dataset_dict, dataset_ref):

        title = dataset_dict.get('title')

        g = self.g

        for prefix, namespace in it_namespaces.iteritems():
            g.bind(prefix, namespace)

        ### add a further type for the Dataset node
        g.add((dataset_ref, RDF.type, DCATAPIT.Dataset))

        ### replace themes
        value = self._get_dict_value(dataset_dict, 'theme')
        if value:
            for theme in value.split(','):
                self.g.remove((dataset_ref, DCAT.theme, URIRef(theme)))
                theme = theme.replace('{', '').replace('}', '')
                self.g.add(
                    (dataset_ref, DCAT.theme, URIRef(THEME_BASE_URI + theme)))
                self._add_concept(THEME_CONCEPTS, theme)
        else:
            self.g.add((dataset_ref, DCAT.theme,
                        URIRef(THEME_BASE_URI + DEFAULT_THEME_KEY)))
            self._add_concept(THEME_CONCEPTS, DEFAULT_THEME_KEY)

        ### replace languages
        value = self._get_dict_value(dataset_dict, 'language')
        if value:
            for lang in value.split(','):
                self.g.remove((dataset_ref, DCT.language, Literal(lang)))
                lang = lang.replace('{', '').replace('}', '')
                self.g.add(
                    (dataset_ref, DCT.language, URIRef(LANG_BASE_URI + lang)))
                # self._add_concept(LANG_CONCEPTS, lang)

        ### add spatial (EU URI)
        value = self._get_dict_value(dataset_dict, 'geographical_name')
        if value:
            for gname in value.split(','):
                gname = gname.replace('{', '').replace('}', '')

                dct_location = BNode()
                self.g.add((dataset_ref, DCT.spatial, dct_location))

                self.g.add((dct_location, RDF['type'], DCT.Location))

                # Try and add a Concept from the spatial vocabulary
                if self._add_concept(GEO_CONCEPTS, gname):
                    self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                                Literal(GEO_BASE_URI + gname)))

                    # a geo concept is not strictly required, but may be a useful addition
                    self.g.add((dct_location, LOCN.geographicalName,
                                URIRef(GEO_BASE_URI + gname)))
                else:
                    # The dataset field is not a controlled tag, let's create a Concept out of the label we have
                    concept = BNode()
                    self.g.add((concept, RDF['type'], SKOS.Concept))
                    self.g.add((concept, SKOS.prefLabel, Literal(gname)))
                    self.g.add((dct_location, LOCN.geographicalName, concept))

        ### add spatial (GeoNames)
        value = self._get_dict_value(dataset_dict, 'geographical_geonames_url')
        if value:
            dct_location = BNode()
            self.g.add((dataset_ref, DCT.spatial, dct_location))

            self.g.add((dct_location, RDF['type'], DCT.Location))
            self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                        Literal(value)))

        ### replace periodicity
        self._remove_node(dataset_dict, dataset_ref,
                          ('frequency', DCT.accrualPeriodicity, None, Literal))
        self._add_uri_node(
            dataset_dict, dataset_ref,
            ('frequency', DCT.accrualPeriodicity, DEFAULT_FREQ_CODE, URIRef),
            FREQ_BASE_URI)
        # self._add_concept(FREQ_CONCEPTS, dataset_dict.get('frequency', DEFAULT_VOCABULARY_KEY))

        ### replace landing page
        self._remove_node(dataset_dict, dataset_ref,
                          ('url', DCAT.landingPage, None, URIRef))
        landing_page_uri = None
        if dataset_dict.get('name'):
            landing_page_uri = '{0}/dataset/{1}'.format(
                catalog_uri().rstrip('/'), dataset_dict['name'])
        else:
            landing_page_uri = dataset_uri(
                dataset_dict)  # TODO: preserve original URI if harvested

        self.g.add((dataset_ref, DCAT.landingPage, URIRef(landing_page_uri)))

        ### conformsTo
        self.g.remove((dataset_ref, DCT.conformsTo, None))
        value = self._get_dict_value(dataset_dict, 'conforms_to')
        if value:
            for item in value.split(','):

                standard = BNode()
                self.g.add((dataset_ref, DCT.conformsTo, standard))

                self.g.add((standard, RDF['type'], DCT.Standard))
                self.g.add((standard, RDF['type'], DCATAPIT.Standard))
                self.g.add((standard, DCT.identifier, Literal(item)))

        ### publisher

        # DCAT by default creates this node
        # <dct:publisher>
        #   <foaf:Organization rdf:about="http://10.10.100.75/organization/55535226-f82a-4cf7-903a-3e10afeaa79a">
        #     <foaf:name>orga2_test</foaf:name>
        #   </foaf:Organization>
        # </dct:publisher>

        for s, p, o in g.triples((dataset_ref, DCT.publisher, None)):
            #log.info("Removing publisher %r", o)
            g.remove((s, p, o))

        self._add_agent(dataset_dict, dataset_ref, 'publisher', DCT.publisher)

        ### Rights holder : Agent
        holder_ref = self._add_agent(dataset_dict, dataset_ref, 'holder',
                                     DCT.rightsHolder)

        ### Autore : Agent
        self._add_agent(dataset_dict, dataset_ref, 'creator', DCT.creator)

        ### Point of Contact

        # <dcat:contactPoint rdf:resource="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri"/>

        # <!-- http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri -->
        # <dcatapit:Organization rdf:about="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri">
        #    <rdf:type rdf:resource="&vcard;Kind"/>
        #    <rdf:type rdf:resource="&vcard;Organization"/>
        #    <vcard:hasEmail rdf:resource="mailto:[email protected]"/>
        #    <vcard:fn>Regione Liguria - Sportello Cartografico</vcard:fn>
        # </dcatapit:Organization>

        # TODO: preserve original info if harvested

        # retrieve the contactPoint added by the euro serializer
        euro_poc = g.value(subject=dataset_ref,
                           predicate=DCAT.contactPoint,
                           object=None,
                           any=False)

        # euro poc has this format:
        # <dcat:contactPoint>
        #    <vcard:Organization rdf:nodeID="Nfcd06f452bcd41f48f33c45b0c95979e">
        #       <vcard:fn>THE ORGANIZATION NAME</vcard:fn>
        #       <vcard:hasEmail>THE ORGANIZATION EMAIL</vcard:hasEmail>
        #    </vcard:Organization>
        # </dcat:contactPoint>

        if euro_poc:
            g.remove((dataset_ref, DCAT.contactPoint, euro_poc))

        org_id = dataset_dict.get('organization', {}).get('id')

        # get orga info
        org_show = logic.get_action('organization_show')

        try:
            org_dict = org_show({}, {
                'id': org_id,
                'include_datasets': False,
                'include_tags': False,
                'include_users': False,
                'include_groups': False,
                'include_extras': True,
                'include_followers': False
            })
        except Exception:
            org_dict = {}

        org_uri = organization_uri(org_dict)

        poc = URIRef(org_uri)
        g.add((dataset_ref, DCAT.contactPoint, poc))
        g.add((poc, RDF.type, DCATAPIT.Organization))
        g.add((poc, RDF.type, VCARD.Kind))
        g.add((poc, RDF.type, VCARD.Organization))

        g.add((poc, VCARD.fn, Literal(org_dict.get('name'))))

        # email is mandatory for dcatapit, but it may not have been filled
        # in for imported datasets
        if 'email' in org_dict:
            g.add((poc, VCARD.hasEmail, URIRef(org_dict.get('email'))))
        if 'telephone' in org_dict:
            g.add(
                (poc, VCARD.hasTelephone, Literal(org_dict.get('telephone'))))
        if 'site' in org_dict:
            g.add((poc, VCARD.hasURL, Literal(org_dict.get('site'))))

        ### Multilingual
        # Add localized entries in dataset
        # TODO: should we remove the non-localized nodes?

        loc_dict = interfaces.get_for_package(dataset_dict['id'])
        #  The multilang fields
        loc_package_mapping = {
            'title': (dataset_ref, DCT.title),
            'notes': (dataset_ref, DCT.description),
            'holder_name': (holder_ref, FOAF.name)
        }

        self._add_multilang_values(loc_dict, loc_package_mapping)

        ### Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(
                resource_dict))  # TODO: preserve original info if harvested

            # Add the DCATAPIT type
            g.add((distribution, RDF.type, DCATAPIT.Distribution))

            ### format
            self._remove_node(resource_dict, distribution,
                              ('format', DCT['format'], None, Literal))
            if not self._add_uri_node(resource_dict, distribution,
                                      ('distribution_format', DCT['format'],
                                       None, URIRef), FORMAT_BASE_URI):
                guessed_format = guess_format(resource_dict)
                if guessed_format:
                    self.g.add((distribution, DCT['format'],
                                URIRef(FORMAT_BASE_URI + guessed_format)))
                else:
                    log.warn('No format for resource: %s / %s',
                             dataset_dict.get('title', 'N/A'),
                             resource_dict.get('description', 'N/A'))
                    self.g.add((distribution, DCT['format'],
                                URIRef(FORMAT_BASE_URI + DEFAULT_FORMAT_CODE)))

            ### license
            # <dct:license rdf:resource="http://creativecommons.org/licenses/by/3.0/it/"/>
            #
            # <dcatapit:LicenseDocument rdf:about="http://creativecommons.org/licenses/by/3.0/it/">
            #    <rdf:type rdf:resource="&dct;LicenseDocument"/>
            #    <owl:versionInfo>3.0 ITA</owl:versionInfo>
            #    <foaf:name>CC BY</foaf:name>
            #    <dct:type rdf:resource="http://purl.org/adms/licencetype/Attribution"/>
            # </dcatapit:LicenseDocument>

            # "license_id" : "cc-zero"
            # "license_title" : "Creative Commons CCZero",
            # "license_url" : "http://www.opendefinition.org/licenses/cc-zero",

            license_url = dataset_dict.get('license_url', '')
            license_id = dataset_dict.get('license_id', '')
            license_title = dataset_dict.get('license_title', '')

            if license_url:
                license = URIRef(license_url)
                g.add((license, RDF['type'], DCATAPIT.LicenseDocument))
                g.add((license, RDF['type'], DCT.LicenseDocument))
                g.add((license, DCT['type'],
                       URIRef('http://purl.org/adms/licencetype/Attribution')
                       ))  # TODO: infer from CKAN license

                g.add((distribution, DCT.license, license))

                if license_id:
                    # log.debug('Adding license id: %s', license_id)
                    g.add((license, FOAF.name, Literal(license_id)))
                elif license_title:
                    # log.debug('Adding license title: %s', license_title)
                    g.add((license, FOAF.name, Literal(license_title)))
                else:
                    g.add((license, FOAF.name, Literal('unknown')))
                    log.warn('License not found for dataset: %s', title)

            ### Multilingual
            # Add localized entries in resource
            # TODO: should we remove the non-localized nodes?

            loc_dict = interfaces.get_for_resource(resource_dict['id'])

            #  The multilang fields
            loc_resource_mapping = {
                'name': (distribution, DCT.title),
                'description': (distribution, DCT.description),
            }
            self._add_multilang_values(loc_dict, loc_resource_mapping)
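`_remove_node` and `_add_uri_node` are called above but not reproduced in this listing. From the call sites, plausible sketches look like this (signatures inferred, not the actual ckanext-dcatapit code):

from rdflib import URIRef

def remove_node(g, data_dict, ref, item):
    # Sketch: drop the triple the base serializer added for this key.
    key, predicate, _, _type = item
    value = data_dict.get(key)
    if value:
        g.remove((ref, predicate, _type(value)))

def add_uri_node(g, data_dict, ref, item, base_uri=''):
    # Sketch: add the value as a URIRef under base_uri, falling back to
    # a default code; return True only when a triple was added.
    key, predicate, default, _type = item
    value = data_dict.get(key) or default
    if not value:
        return False
    g.add((ref, predicate, _type(base_uri + value)))
    return True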
Example #27
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('title', DCT.title, None),
            ('notes', DCT.description, None),
            ('url', DCAT.landingPage, None),
            ('identifier', DCT.identifier, ['guid', 'id']),
            ('version', OWL.versionInfo, ['dcat_version']),
            ('version_notes', ADMS.versionNotes, None),
            ('frequency', DCT.accrualPeriodicity, None),
            ('access_rights', DCT.accessRights, None),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created']),
            ('modified', DCT.modified, ['metadata_modified']),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        #  Lists
        items = [
            ('language', DCT.language, None),
            ('theme', DCAT.theme, None),
            ('conforms_to', DCT.conformsTo, None),
            ('alternate_identifier', ADMS.identifier, None),
            ('documentation', FOAF.page, None),
            ('related_resource', DCT.relation, None),
            ('has_version', DCT.hasVersion, None),
            ('is_version_of', DCT.isVersionOf, None),
            ('source', DCT.source, None),
            ('sample', ADMS.sample, None),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Contact details
        if any([
            self._get_dataset_value(dataset_dict, 'contact_uri'),
            self._get_dataset_value(dataset_dict, 'contact_name'),
            self._get_dataset_value(dataset_dict, 'contact_email'),
            self._get_dataset_value(dataset_dict, 'maintainer'),
            self._get_dataset_value(dataset_dict, 'maintainer_email'),
            self._get_dataset_value(dataset_dict, 'author'),
            self._get_dataset_value(dataset_dict, 'author_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(contact_uri)
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            items = [
                ('contact_name', VCARD.fn, ['maintainer', 'author']),
                ('contact_email', VCARD.hasEmail, ['maintainer_email',
                                                   'author_email']),
            ]

            self._add_triples_from_dict(dataset_dict, contact_details, items)

        # Publisher
        if any([
            self._get_dataset_value(dataset_dict, 'publisher_uri'),
            self._get_dataset_value(dataset_dict, 'publisher_name'),
            dataset_dict.get('organization'),
        ]):

            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
            if publisher_uri:
                publisher_details = URIRef(publisher_uri)
            else:
                # Neither organization nor publisher_uri is set
                publisher_details = BNode()

            g.add((publisher_details, RDF.type, FOAF.Organization))
            g.add((dataset_ref, DCT.publisher, publisher_details))

            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
            if not publisher_name and dataset_dict.get('organization'):
                publisher_name = dataset_dict['organization']['title']

            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
            # TODO: It would make sense to fallback these to organization
            # fields but they are not in the default schema and the
            # `organization` object in the dataset_dict does not include
            # custom fields
            items = [
                ('publisher_email', FOAF.mbox, None),
                ('publisher_url', FOAF.homepage, None),
                ('publisher_type', DCT.type, None),
            ]

            self._add_triples_from_dict(dataset_dict, publisher_details, items)

        # Temporal
        start = self._get_dataset_value(dataset_dict, 'temporal_start')
        end = self._get_dataset_value(dataset_dict, 'temporal_end')
        if start or end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Spatial
        spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
        spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
        spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

        if spatial_uri or spatial_text or spatial_geom:
            if spatial_uri:
                spatial_ref = URIRef(spatial_uri)
            else:
                spatial_ref = BNode()

            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            if spatial_text:
                g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

            if spatial_geom:
                # GeoJSON
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(spatial_geom, datatype=GEOJSON_IMT)))
                # WKT, because GeoDCAT-AP says so
                try:
                    g.add((spatial_ref,
                           LOCN.geometry,
                           Literal(wkt.dumps(json.loads(spatial_geom),
                                             decimals=4),
                                   datatype=GSP.wktLiteral)))
                except (TypeError, ValueError, InvalidGeoJSONException):
                    pass

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('name', DCT.title, None),
                ('description', DCT.description, None),
                ('status', ADMS.status, None),
                ('rights', DCT.rights, None),
                ('license', DCT.license, None),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            #  Lists
            items = [
                ('documentation', FOAF.page, None),
                ('language', DCT.language, None),
                ('conforms_to', DCT.conformsTo, None),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution, items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, Literal(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, Literal(url)))

            # Dates
            items = [
                ('issued', DCT.issued, None),
                ('modified', DCT.modified, None),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution, items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
            # Checksum
            if resource_dict.get('hash'):
                checksum = BNode()
                g.add((checksum, SPDX.checksumValue,
                       Literal(resource_dict['hash'],
                               datatype=XSD.hexBinary)))

                if resource_dict.get('hash_algorithm'):
                    if resource_dict['hash_algorithm'].startswith('http'):
                        g.add((checksum, SPDX.algorithm,
                               URIRef(resource_dict['hash_algorithm'])))
                    else:
                        g.add((checksum, SPDX.algorithm,
                               Literal(resource_dict['hash_algorithm'])))
                g.add((distribution, SPDX.checksum, checksum))
Example #28
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        log.debug("ODMDCATBasicProfileDataset graph_from_dataset")

        g = self.g

        namespaces = odm_rdf_helper.get_namespaces_by_dataset_type(
            dataset_dict['type'])

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, DCT.identifier, Literal(dataset_dict.get('id'))))
        g.add((dataset_ref, DCT.type,
               Literal(dataset_dict.get('type', 'dataset'))))
        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        items = [(dataset_ref, DCT.title, dataset_dict.get('title_translated')
                  or dataset_dict.get('title')),
                 (dataset_ref, DCT.description,
                  dataset_dict.get('notes_translated')
                  or dataset_dict.get('notes'))]

        raw_triples = odm_rdf_helper.get_triples_by_dataset_type(
            dataset_ref, dataset_dict, dataset_dict['type'])
        raw_triples.extend(items)

        for raw_triple in raw_triples:
            triples = odm_rdf_helper.split_multilingual_object_into_triples(
                raw_triple)
            for triple in triples:
                g.add(triple)

        #Organization
        organization = dataset_dict.get('organization')
        g.add((dataset_ref, FOAF.organization,
               URIRef(
                   config.get('ckan.site_url') + "organization/" +
                   organization['name'])))

        #license
        license_url = dataset_dict.get('license_url')
        if license_url:
            license = URIRef(license_url)
            g.add((license, DCT.title,
                   Literal(dataset_dict.get('license_title'))))
            g.add((dataset_ref, DCT.license, license))

        # odm_spatial_range
        for item in dataset_dict.get('odm_spatial_range', []):
            iso3_code = odm_rdf_helper.map_country_code_iso2_iso3(item.upper())
            g.add((dataset_ref, GN.countrycode,
                   URIRef("http://data.landportal.info/geo/" + iso3_code)))

        #taxonomy
        for term in dataset_dict.get('taxonomy', []):
            matches = odm_rdf_helper.map_internal_to_standard_taxonomic_term(
                term)

            if isinstance(matches, basestring):
                g.add((dataset_ref, FOAF.topic, Literal(matches)))
            else:
                node = BNode()
                if 'exact_match' in matches:
                    node = URIRef(matches['exact_match'])
                if 'broad_matches' in matches:
                    for broad_match in matches['broad_matches']:
                        g.add((node, SKOS.broadMatch, URIRef(broad_match)))
                        g.add((node, DCT.title, Literal(term)))

                g.add((dataset_ref, FOAF.topic, node))

        #  Language
        for item in dataset_dict.get('odm_language', []):
            g.add((dataset_ref, DC.language, Literal(item.upper())))

        # Dates
        try:
            items = odm_rdf_helper.get_date_fields_by_dataset_type(
                dataset_dict['type'])
            self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)
        except ValueError:
            log.debug("Error adding date triples for dataset " +
                      dataset_dict['id'])

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))
            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))

            items = [(distribution, DCT.title,
                      resource_dict.get('name_translated')
                      or resource_dict.get('name')),
                     (distribution, DCT.description,
                      resource_dict.get('description_translated')
                      or resource_dict.get('description'))]
            for item in items:
                triples = odm_rdf_helper.split_multilingual_object_into_triples(
                    item)
                for triple in triples:
                    g.add(triple)

            try:
                self._add_triples_from_dict(resource_dict, distribution, items)
            except ValueError:
                log.debug("Error adding triples for dataset " +
                          dataset_dict['id'])

            #  Language
            for item in resource_dict.get('odm_language', []):
                g.add((distribution, DC.language, Literal(item.upper())))

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            if (url and not download_url) or (url and url != download_url):
                g.add((distribution, DCAT.accessURL, URIRef(url)))
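`odm_rdf_helper.split_multilingual_object_into_triples` is used here but not shown. Judging by the (subject, predicate, value) tuples fed to it, a sketch could be:

from rdflib import Literal

def split_multilingual_object_into_triples(raw_triple):
    # Sketch: fan a translated {'lang': 'text'} dict out into
    # language-tagged Literals; plain strings become one untagged triple.
    subject, predicate, value = raw_triple
    if isinstance(value, dict):
        return [(subject, predicate, Literal(text, lang=lang))
                for lang, text in value.items() if text]
    return [(subject, predicate, Literal(value))] if value else []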
Example #29
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        """ Transforms DCAT-AP.de-Data to CKAN-Dictionary """

        # Simple additional fields
        for key, predicate in (
               ('qualityProcessURI', DCATDE.qualityProcessURI),
               ('metadata_original_html', DCAT.landingPage),
               ('politicalGeocodingLevelURI', DCATDE.politicalGeocodingLevelURI),
               ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                ds_utils.insert_new_extras_field(dataset_dict, key, value)

        # List fields
        for key, predicate, in (
               ('contributorID', DCATDE.contributorID),
               ('politicalGeocodingURI', DCATDE.politicalGeocodingURI),
               ('legalbasisText', DCATDE.legalbasisText),
               ('geocodingText', DCATDE.geocodingText),
               ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                ds_utils.insert_new_extras_field(dataset_dict, key, json.dumps(values))

        self._parse_contact(dataset_dict, dataset_ref, DCATDE.originator, 'originator', True)
        self._parse_contact(dataset_dict, dataset_ref, DCATDE.maintainer, 'maintainer', False)
        self._parse_contact(dataset_dict, dataset_ref, DCT.contributor, 'contributor', True)
        self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author', False)

        # dcat:contactPoint
        # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
        contact = self._object(dataset_ref, DCAT.contactPoint)
        self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

        contact_tel = self._object_value(contact, VCARD.hasTelephone)
        if contact_tel:
            ds_utils.insert(dataset_dict, 'maintainer_tel', self._without_tel(contact_tel), True)

        self._add_maintainer_field(dataset_dict, contact, 'street', VCARD.hasStreetAddress)
        self._add_maintainer_field(dataset_dict, contact, 'city', VCARD.hasLocality)
        self._add_maintainer_field(dataset_dict, contact, 'zip', VCARD.hasPostalCode)
        self._add_maintainer_field(dataset_dict, contact, 'country', VCARD.hasCountryName)

        # Groups
        groups = self._get_dataset_value(dataset_dict, 'groups')

        if not groups:
            groups = []

        for obj in self.g.objects(dataset_ref, DCAT.theme):
            current_theme = unicode(obj)

            if current_theme.startswith(dcat_theme_prefix):
                group = current_theme.replace(dcat_theme_prefix, '').lower()
                groups.append({'id': group, 'name': group})

        dataset_dict['groups'] = groups

        # Add additional distribution fields
        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            for resource_dict in dataset_dict.get('resources', []):
                # Match distribution in graph and distribution in ckan-dict
                if unicode(distribution) == resource_uri(resource_dict):
                    for key, predicate in (
                            ('licenseAttributionByText', DCATDE.licenseAttributionByText),
                            ('plannedAvailability', DCATDE.plannedAvailability)
                    ):
                        value = self._object_value(distribution, predicate)
                        if value:
                            ds_utils.insert_resource_extra(resource_dict, key, value)

        return dataset_dict
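`ds_utils.insert_new_extras_field` and `self._without_tel` come from ckanext-dcatde and are not reproduced here; minimal sketches consistent with how they are called (assumptions, not the upstream code):

def insert_new_extras_field(dataset_dict, key, value):
    # Sketch: record a harvested value as a new extras entry.
    dataset_dict.setdefault('extras', []).append({'key': key, 'value': value})

def without_tel(value):
    # Sketch: strip a leading 'tel:' scheme from vCard telephone values,
    # e.g. 'tel:+49-30-123456' -> '+49-30-123456'
    return value.replace('tel:', '') if value else value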
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Dataset

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        ## Simple values
        items = [
            ("title", DCTERMS.title, None, Literal),
            ("name", DCTERMS.identifier, None, Literal),
            ("author", DC.creator, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Description
        dataset_desc = dataset_dict.get("notes")
        if dataset_desc:
            dataset_desc_value = markdown_extract(dataset_desc,
                                                  extract_length=0)
            g.add((dataset_ref, DCTERMS.description,
                   Literal(dataset_desc_value)))

        ## Language
        langs = dataset_dict.get("language")
        if langs:
            for lang in langs:
                language_uri = LANG_PREFIX + lang
                g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

        ## Tags
        for tag in dataset_dict.get("tags", []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

        ## Wikidata keywords
        for keyword in dataset_dict.get("keywords", []):
            g.add((dataset_ref, DCAT.theme, WD[keyword]))

        ## Data Type
        data_types = dataset_dict.get("data_type")
        if data_types:
            for data_type in data_types:
                g.add((dataset_ref, DCTERMS.type,
                       URIRef(DATA_TYPE_PREFIX + data_type)))

        ## Temporal Resolution
        temp_res = dataset_dict.get("temp_res")
        temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
        if temp_res:
            temp_res_value = temp_res_mapping[temp_res]
            g.add((dataset_ref, DCAT.temporalResolution,
                   Literal(temp_res_value, datatype=XSD.duration)))

        ## Start Time, End Time, and Created Time
        items = [("start_time", SCHEMA.startDate, None, Literal),
                 ("end_time", SCHEMA.endDate, None, Literal),
                 ("created_time", DCTERMS.issued, None, Literal)]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Spatial Coverage
        spatial = dataset_dict.get("spatial")
        x_min = dataset_dict.get("x_min")
        x_max = dataset_dict.get("x_max")
        y_min = dataset_dict.get("y_min")
        y_max = dataset_dict.get("y_max")

        if any([spatial, x_min, x_max, y_min, y_max]):
            spatial_ref = BNode()
            g.add((spatial_ref, RDF.type, DCTERMS.Location))
            g.add((dataset_ref, DCTERMS.spatial, spatial_ref))

            if spatial:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial, datatype=GEOJSON_IMT)))

            if x_min and x_max and y_min and y_max:
                box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
                box_ref = BNode()
                g.add((box_ref, RDF.type, SCHEMA.GeoShape))
                g.add((box_ref, SCHEMA.box, Literal(box_value)))
                g.add((spatial_ref, LOCN.geometry, box_ref))

        ## Spatial Resolution
        spatial_res = dataset_dict.get("spatial_res")

        if spatial_res:
            g.add((dataset_ref, DCAT.spatialResolutionInMeters,
                   Literal(spatial_res, datatype=XSD.decimal)))

        ## Process Step
        proc_step = dataset_dict.get("process_step")

        if proc_step:
            proc_step_value = markdown_extract(proc_step, extract_length=0)
            proc_ref = BNode()
            g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
            g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
            g.add((dataset_ref, DCTERMS.provenance, proc_ref))

        ## Project details
        project = dataset_dict.get("organization")

        if project:
            project["description"] = markdown_extract(project["description"],
                                                      extract_length=0)
            project_details = BNode()
            g.add((project_details, RDF.type, ORG.Organization))
            g.add((dataset_ref, DCTERMS.publisher, project_details))
            items = [("title", FOAF.name, None, Literal),
                     ("description", ORG.purpose, None, Literal)]

            self._add_triples_from_dict(project, project_details, items)

        ## Contact details
        contact_person = dataset_dict.get("contact_person")
        contact_email = dataset_dict.get("contact_email")

        if any([contact_person, contact_email]):
            contact_details = BNode()
            g.add((contact_details, RDF.type, VCARD.Individual))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            self._add_triple_from_dict(dataset_dict, contact_details, VCARD.fn,
                                       "contact_person")

            self._add_triple_from_dict(dataset_dict,
                                       contact_details,
                                       VCARD.hasEmail,
                                       "contact_email",
                                       _type=URIRef,
                                       value_modifier=self._add_mailto)
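            # _add_mailto (a helper defined elsewhere in the profile) is
            # assumed to prefix the address with "mailto:" so it can be
            # emitted as a URIRef, which vcard:hasEmail expects.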

        ## Theme
        themes = dataset_dict.get("groups")

        if themes:
            for theme in themes:
                theme_details = BNode()
                g.add((theme_details, RDF.type, SKOS.Concept))
                g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
                g.add((dataset_ref, DCAT.theme, theme_details))
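                # Each group becomes an anonymous skos:Concept that carries
                # only its human-readable title.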

        # Resources

        ## Depositar defines the license at the dataset level
        license = dataset_dict.get("license_url")

        for resource_dict in dataset_dict.get("resources", []):
            distribution = CleanedURIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            ## Simple values
            items = [
                ("name", DCTERMS.title, None, Literal),
                ("description", DCTERMS.description, None, Literal),
                ("encoding", CNT.characterEncoding, None, Literal),
                ("url", DCAT.downloadURL, None, URIRef),
            ]
            self._add_triples_from_dict(resource_dict, distribution, items)

            ## License
            if license:
                g.add((distribution, DCTERMS.license, URIRef(license)))

            ## Coordinate Systems
            crs = resource_dict.get("resource_crs")

            if crs:
                crs_value = EPSG_PREFIX + str(crs)
                g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))
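                # Assuming EPSG_PREFIX resolves to the OGC EPSG register
                # (e.g. "http://www.opengis.net/def/crs/EPSG/0/"), a crs of
                # 3826 yields <http://www.opengis.net/def/crs/EPSG/0/3826>.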

            ## Format (mimetype)
            mimetype = resource_dict.get("mimetype")

            if mimetype:
                mimetype_value = IMT_PREFIX + mimetype
                g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))
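                # Assuming IMT_PREFIX points at the IANA media-type registry,
                # this publishes dcat:mediaType as a dereferenceable URI
                # rather than a plain literal.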
    def test_distribution_fields(self):
        resource = {
            'id': 'c041c635-054f-4431-b647-f9186926d021',
            'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'CSV file',
            'description': 'A CSV file',
            'url': 'http://example.com/data/file.csv',
            'status': 'http://purl.org/adms/status/Completed',
            'rights': 'Some statement about rights',
            'license': 'http://creativecommons.org/licenses/by/3.0/',
            'issued': '2015-06-26T15:21:09.034694',
            'modified': '2015-06-26T15:21:09.075774',
            'size': 1234,
            'documentation': '[\"http://dataset.info.org/distribution1/doc1\", \"http://dataset.info.org/distribution1/doc2\"]',
            'language': '[\"en\", \"es\", \"ca\"]',
            'conforms_to': '[\"Standard 1\", \"Standard 2\"]',
            'hash': '4304cf2e751e6053c90b1804c89c0ebb758f395a',
            'hash_algorithm': 'http://spdx.org/rdf/terms#checksumAlgorithm_sha1',
        }

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'resources': [
                resource
            ]
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

        # URI
        distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
        eq_(unicode(distribution), utils.resource_uri(resource))

        # Basic fields
        assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
        assert self._triple(g, distribution, DCT.title, resource['name'])
        assert self._triple(g, distribution, DCT.description, resource['description'])
        assert self._triple(g, distribution, DCT.rights, resource['rights'])
        assert self._triple(g, distribution, DCT.license, resource['license'])
        assert self._triple(g, distribution, ADMS.status, resource['status'])

        # List
        for item in [
            ('documentation', FOAF.page),
            ('language', DCT.language),
            ('conforms_to', DCT.conformsTo),
        ]:
            values = json.loads(resource[item[0]])
            eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, distribution, item[1], value)

        # Dates
        assert self._triple(g, distribution, DCT.issued, resource['issued'], XSD.dateTime)
        assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime)

        # Numbers
        assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal)

        # Checksum
        checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
        assert checksum
        assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
        assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource['hash_algorithm']))
Example #32
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)
        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # dct:title
        titles = (t for t in self._get_dataset_value(dataset_dict, 'title_translated').values() if t)

        for title in titles:
            g.add((dataset_ref, DCT.title, Literal(title)))

        # dct:description
        descriptions = (d for d in self._get_dataset_value(dataset_dict, 'notes_translated').values() if d)

        for description in descriptions:
            g.add((dataset_ref, DCT.description, Literal(description)))

        # dct:contactPoint
        contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        self._add_triple_from_dict(dataset_dict, contact_details, VCARD.fn, 'maintainer')
        self._add_triple_from_dict(dataset_dict, contact_details, VCARD.hasEmail, 'maintainer_email',
                                   _type=URIRef, value_modifier=self._add_mailto)

        # Note: the vCard ontology property is hasURL, not hasUrl
        self._add_triple_from_dict(dataset_dict, contact_details, VCARD.hasURL, 'maintainer_website', _type=URIRef)

        # dcat:distribution
        for resource_dict in self._get_dataset_value(dataset_dict, 'resources'):
            distribution = BNode()
            g.add((distribution, RDF.type, DCAT.Distribution))
            g.add((dataset_ref, DCAT.distribution, distribution))

            titles = (t for t in set(resource_dict.get('name_translated', {}).values()) if t)
            for title in titles:
                g.add((distribution, DCT.title, Literal(title)))

            descriptions = (d for d in set(resource_dict.get('description_translated', {}).values()) if d)
            for description in descriptions:
                g.add((distribution, DCT.description, Literal(description)))

            # The DCAT property is accessURL; rdflib namespace lookups are
            # case-sensitive, so DCAT.accessUrl would mint a non-existent term
            g.add((distribution, DCAT.accessURL, URIRef(resource_uri(resource_dict))))

            resource_url = resource_dict.get('url')
            if resource_url:
                g.add((distribution, DCAT.downloadURL, URIRef(resource_url)))
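            # DCAT semantics: dcat:accessURL is a page through which the
            # distribution can be reached; dcat:downloadURL is a direct
            # link to the file itself.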

        # dcat:keyword
        keywords = set(
                keyword
                for keyword_language in dataset_dict.get('keywords', {}).values()
                for keyword in keyword_language)

        for keyword in keywords:
            g.add((dataset_ref, DCAT.keyword, Literal(keyword)))

        # dct:publisher
        context = {'user': p.c.user}
        organization = p.get_action('organization_show')(context, data_dict={'id': dataset_dict['owner_org']})

        publisher = URIRef(p.url_for(controller='organization', action='read', id=organization['id'], qualified=True))
        g.add((publisher, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher))
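        # The publisher URI is the organization's own CKAN page, so all
        # datasets of one organization share a single publisher node.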

        organization_titles = (t for t in organization.get('title_translated', {}).values() if t)

        for title in organization_titles:
            g.add((publisher, FOAF.name, Literal(title)))

        self._add_triple_from_dict(organization, publisher, FOAF.homepage, 'homepage')

        # dcat:theme
        groups = dataset_dict.get('groups', [])

        for group_item in groups:
            group_dict = p.get_action('group_show')(context, data_dict={'id': group_item['id']})
            theme = URIRef(p.url_for(controller='group', action='read', id=group_dict['id'], qualified=True))
            g.add((theme, RDF.type, SKOS.Concept))
            g.add((dataset_ref, DCAT.theme, theme))

            group_titles = (t for t in group_dict.get('title_translated', {}).values() if t)
            for title in group_titles:
                g.add((theme, SKOS.prefLabel, Literal(title)))