def graph_add_resources(self, dataset_uri, dataset_dict):

    g = self.g

    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_uri, DCAT.distribution, distribution))
        # As we don't allow direct download of the data, we need to add a
        # landing page to the dataset - see
        # http://www.w3.org/TR/vocab-dcat/#example-landing-page
        g.add((dataset_uri, DCAT.landingPage, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DC.title, None),
            ('description', DC.description, None),
            ('status', ADMS.status, None),
            ('rights', DC.rights, None),
            ('license', DC.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DC['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        g.set((distribution, DCAT.accessURL, distribution))

        # Dates
        items = [
            ('issued', DC.issued, None),
            ('modified', DC.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))
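The profile methods in this section lean on helpers inherited from ckanext-dcat's RDFProfile base class (_add_triples_from_dict, _add_date_triples_from_dict, _get_dataset_value, ...). The snippets pass (key, predicate, fallback_keys) tuples; a minimal standalone sketch of that lookup convention, assuming everything is emitted as a plain literal (the real helper also supports a fourth _type element, as later snippets show):

from rdflib import Literal

def add_triples_from_dict(g, _dict, subject, items):
    for key, predicate, fallbacks in items:
        # Look up the primary key, then try the fallback keys in order.
        value = _dict.get(key)
        if not value and fallbacks:
            for fallback in fallbacks:
                value = _dict.get(fallback)
                if value:
                    break
        if value:
            g.add((subject, predicate, Literal(value)))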
def _resources_graph(self, dataset_ref, dataset_dict):
    g = self.g
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))
        g.add((dataset_ref, SCHEMA.distribution, distribution))
        g.add((distribution, RDF.type, SCHEMA.DataDownload))

        self._distribution_graph(distribution, resource_dict)
def test_distribution_fields(self):

    resource = {
        'id': 'c041c635-054f-4431-b647-f9186926d021',
        'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'CSV file',
        'description': 'A CSV file',
        'url': 'http://example.com/data/file.csv',
        'status': 'http://purl.org/adms/status/Completed',
        'rights': 'Some statement about rights',
        'license': 'http://creativecommons.org/licenses/by/3.0/',
        'issued': '2015-06-26T15:21:09.034694',
        'modified': '2015-06-26T15:21:09.075774',
        'size': 1234,
        'language': '["en", "es", "ca"]',
    }

    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Test DCAT dataset',
        'resources': [
            resource
        ]
    }

    s = RDFSerializer(profiles=['schemaorg'])
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, SCHEMA.distribution, None))]), 1)

    # URI
    distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2]
    eq_(unicode(distribution), utils.resource_uri(resource))

    # Basic fields
    assert self._triple(g, distribution, RDF.type, SCHEMA.DataDownload)
    assert self._triple(g, distribution, SCHEMA.name, resource['name'])
    assert self._triple(g, distribution, SCHEMA.description, resource['description'])
    assert self._triple(g, distribution, SCHEMA.license, resource['license'])

    # List
    for item in [
        ('language', SCHEMA.inLanguage),
    ]:
        values = json.loads(resource[item[0]])
        eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
        for value in values:
            assert self._triple(g, distribution, item[1], value)

    # Dates
    assert self._triple(g, distribution, SCHEMA.datePublished, resource['issued'])
    assert self._triple(g, distribution, SCHEMA.dateModified, resource['modified'])

    # Numbers
    assert self._triple(g, distribution, SCHEMA.contentSize, resource['size'])
def test_distribution_fields(self):

    resource = {
        'id': 'c041c635-054f-4431-b647-f9186926d021',
        'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'CSV file',
        'description': 'A CSV file',
        'url': 'http://example.com/data/file.csv',
        'status': 'http://purl.org/adms/status/Completed',
        'rights': 'Some statement about rights',
        'license': 'http://creativecommons.org/licenses/by/3.0/',
        'issued': '2015-06-26T15:21:09.034694',
        'modified': '2015-06-26T15:21:09.075774',
        'size': 1234,
    }

    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Test DCAT dataset',
        'resources': [resource]
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

    # URI
    distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
    eq_(unicode(distribution), utils.resource_uri(resource))

    # Basic fields
    assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
    assert self._triple(g, distribution, DCT.title, resource['name'])
    assert self._triple(g, distribution, DCT.description, resource['description'])
    assert self._triple(g, distribution, DCT.rights, resource['rights'])
    assert self._triple(g, distribution, DCT.license, resource['license'])
    assert self._triple(g, distribution, ADMS.status, resource['status'])

    # Dates
    assert self._triple(g, distribution, DCT.issued, resource['issued'], XSD.dateTime)
    assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime)

    # Numbers
    assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal)
def test_distributions(self):

    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Test DCAT dataset',
        'resources': [
            {
                'id': 'c041c635-054f-4431-b647-f9186926d021',
                'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                'name': 'CSV file'
            },
            {
                'id': '8bceeda9-0084-477f-aa33-dad6148900d5',
                'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                'name': 'XLS file'
            },
            {
                'id': 'da73d939-0f11-45a1-9733-5de108383133',
                'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
                'name': 'PDF file'
            },
        ]
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 3)

    for resource in dataset['resources']:
        distribution = self._triple(g,
                                    dataset_ref,
                                    DCAT.distribution,
                                    URIRef(utils.resource_uri(resource)))[2]
        assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
        assert self._triple(g, distribution, DCT.title, resource['name'])
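The test snippets above and below rely on a _triple helper defined on the test base class (hence the self._triple(g, ...) calls). A plausible minimal version, assuming it returns the first matching triple or None so it can double as an assertion value:

from rdflib import BNode, Literal, URIRef

def _triple(g, subject, predicate, _object, data_type=None):
    # Wrap plain strings/numbers in a Literal (optionally typed) so callers
    # can pass raw values; URIRef/BNode/None (wildcard) pass through as-is.
    if _object is not None and not isinstance(_object, (URIRef, BNode, Literal)):
        if data_type:
            _object = Literal(_object, datatype=data_type)
        else:
            _object = Literal(_object)
    triples = [t for t in g.triples((subject, predicate, _object))]
    return triples[0] if triples else None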
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('identifier', DCT.identifier, ['guid', 'id']),
        ('alternate_identifier', ADMS.identifier, None),
        ('title', DCT.title, None),
        ('notes', DCT.description, None),
        ('url', DCAT.landingPage, None),
        ('version', OWL.versionInfo, None),
        # ('accrual-periodicity', DCT.accrualPeriodicity, None),
        # ('temporal', DCT.temporal, None),
        # ('language', DCT.language, None),
        # ('dcat-category-id', DCAT.theme, None),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created']),
        ('modified', DCT.modified, ['metadata_modified']),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Publisher
    publisher_uri = self._get_dataset_value(dataset_dict, 'spc')
    if publisher_uri:
        publisher_details = URIRef(publisher_uri)
        rightsHolder_details = URIRef(publisher_uri)
    else:
        # No publisher_uri
        publisher_details = BNode()
        rightsHolder_details = BNode()

    g.add((publisher_details, RDF.type, FOAF.Organization))
    # the organization entry may be missing, so guard the chained lookup
    publisher_name = (dataset_dict.get('organization') or {}).get('title')
    if publisher_name:
        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
    g.add((dataset_ref, DCT.publisher, publisher_details))

    # DCAT-AP_IT new properties

    # subTheme
    # g.add((dataset_ref, DCATAPIT.subTheme, ))

    # rightsHolder
    g.add((rightsHolder_details, RDF.type, DCATAPIT.Agent))
    g.add((rightsHolder_details, DCT.identifier, Literal(publisher_uri)))
    g.add((rightsHolder_details, FOAF.name, Literal(publisher_name)))
    g.add((dataset_ref, DCT.rightsHolder, rightsHolder_details))

    # creator
    g.add((dataset_ref, DCT.creator, ...))

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer', 'author']),
            ('contact_email', VCARD.hasEmail, ['maintainer_email', 'author_email']),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None),
            ('description', DCT.description, None),
            ('status', ADMS.status, None),
            ('rights', DCT.rights, None),
            ('license', DCT.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, Literal(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None),
            ('modified', DCT.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None),
        ('notes', DCT.description, None),
        ('url', DCAT.landingPage, None),
        ('identifier', DCT.identifier, ['guid', 'id']),
        ('version', OWL.versionInfo, ['dcat_version']),
        ('alternate_identifier', ADMS.identifier, None),
        ('version_notes', ADMS.versionNotes, None),
        ('frequency', DCT.accrualPeriodicity, None),
        ('accrualPeriodicity', DCT.accrualPeriodicity, None),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created']),
        ('modified', DCT.modified, ['metadata_modified']),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None),
        ('theme-primary', DCAT.theme, None),
        ('theme-secondary', DCAT.theme, None),
        ('conforms-to', DCAT.conformsTo, None),
        ('lineage', DCT.provenance, None),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer', 'author']),
            ('contact_email', VCARD.hasEmail, ['maintainer_email', 'author_email']),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    license_id = self._get_dataset_value(dataset_dict, 'license_id')
    if license_id == 'cc-by':
        g.add((dataset_ref, DCT.license,
               Literal('https://creativecommons.org/licenses/by/4.0/')))
    else:
        g.add((dataset_ref, DCT.license, Literal(license_id)))

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None),
            ('publisher_url', FOAF.homepage, None),
            ('publisher_type', DCT.type, None),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Update Frequency
    # accrualPeriodicity
    update_freq = self._get_dataset_value(dataset_dict, 'update_frequency')
    if update_freq:
        has_uri = False
        # check if there exists a URI for the update_frequency value
        from ckanext.dgu.forms.dataset_form import update_frequency_uri
        for freq_name, freq_uri in update_frequency_uri:
            if freq_name.lower() == update_freq.lower():
                has_uri = True
                break
        g.add((dataset_ref, DCT.accrualPeriodicity,
               URIRef(freq_uri) if has_uri else Literal(update_freq)))

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom), decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None),
            ('description', DCT.description, None),
            ('status', ADMS.status, None),
            ('rights', DCT.rights, None),
            ('license', DCT.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Format
        _format = resource_dict['format']
        if _format:
            if '/' in _format:
                # add dct:format
                dctFormat = _format.strip().replace("/", ".").replace(" ", "").lower()
                g.add((distribution, DCT['format'], Literal(dctFormat)))
            else:
                g.add((distribution, DCT['format'], Literal(_format.lower())))

            # add dcat:mediaType
            fmt = formats.Formats.match(_format.strip().lower())
            mime_types = fmt['mime_types'] if fmt else None
            if mime_types:
                g.add((distribution, DCAT.mediaType, Literal(mime_types)))

        license_id = self._get_dataset_value(dataset_dict, 'license_id')
        if license_id == 'cc-by':
            g.add((distribution, DCT.license,
                   Literal('https://creativecommons.org/licenses/by/4.0/')))
        else:
            g.add((distribution, DCT.license, Literal(license_id)))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, Literal(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None),
            ('modified', DCT.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))
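update_frequency_uri, imported inside the function above, is assumed here to be an iterable of (name, uri) pairs matched case-insensitively. Something shaped like the following, with URIs from the Dublin Core frequency vocabulary; this is purely illustrative, the real list lives in ckanext.dgu.forms.dataset_form:

update_frequency_uri = [
    ('annual', 'http://purl.org/cld/freq/annual'),
    ('monthly', 'http://purl.org/cld/freq/monthly'),
    ('daily', 'http://purl.org/cld/freq/daily'),
]

Note that if this list were empty, freq_uri would be unbound after the loop; the code above relies on it always having at least one entry.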
def export_resource_to_rdf(resource_dict, dataset_dict, _format='xml'):
    """Export the resource in RDF format.

    Builds an RDF Graph containing only the selected resource and exports
    it to the selected format (default ``xml``).

    :param dict resource_dict: resource metadata.
    :param dict dataset_dict: dataset metadata.
    :param str _format: export format. Default is ``xml``.

    :returns: the serialized RDF graph of the resource.
    :rtype: str
    """
    g = Graph()

    distribution = URIRef(resource_uri(resource_dict))

    g.add((distribution, RDF.type, DCAT.Distribution))

    if 'license' not in resource_dict and 'license_id' in dataset_dict:
        lr = LicenseRegister()
        _license = lr.get(dataset_dict['license_id'])
        if _license:  # guard against unknown license ids
            resource_dict['license'] = _license.url

    # Simple values
    items = [
        ('name', DCT.title, None, Literal),
        ('description', DCT.description, None, Literal),
        ('status', ADMS.status, None, Literal),
        ('rights', DCT.rights, None, Literal),
        ('license', DCT.license, None, URIRef),
    ]
    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    # Lists
    items = [
        ('documentation', FOAF.page, None, URIRef),
        ('language', DCT.language, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, URIRef),
    ]
    # self._add_list_triples_from_dict(resource_dict, distribution, items)
    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            if isinstance(value, list):
                for val in value:
                    g.add((distribution, rdf_prop, rdf_type(val)))
            else:
                g.add((distribution, rdf_prop, rdf_type(value)))

    # Format
    if '/' in resource_dict.get('format', ''):
        g.add((distribution, DCAT.mediaType,
               Literal(resource_dict['format'])))
    else:
        if resource_dict.get('format'):
            g.add((distribution, DCT['format'],
                   Literal(resource_dict['format'])))
        if resource_dict.get('mimetype'):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['mimetype'])))

    # URL
    url = resource_dict.get('url')
    download_url = resource_dict.get('download_url')
    if download_url:
        g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
    if (url and not download_url) or (url and url != download_url):
        g.add((distribution, DCAT.accessURL, URIRef(url)))

    # Dates
    items = [
        ('issued', DCT.issued, None, Literal),
        ('modified', DCT.modified, None, Literal),
    ]
    # self._add_date_triples_from_dict(resource_dict, distribution, items)
    for itm in items:
        key, rdf_prop, def_value, rdf_type = itm
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    # Numbers
    if resource_dict.get('size'):
        try:
            g.add((distribution, DCAT.byteSize,
                   Literal(float(resource_dict['size']),
                           datatype=XSD.decimal)))
        except (ValueError, TypeError):
            g.add((distribution, DCAT.byteSize,
                   Literal(resource_dict['size'])))

    # Checksum
    if resource_dict.get('hash'):
        checksum = BNode()
        g.add((checksum, SPDX.checksumValue,
               Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

        if resource_dict.get('hash_algorithm'):
            if resource_dict['hash_algorithm'].startswith('http'):
                g.add((checksum, SPDX.algorithm,
                       URIRef(resource_dict['hash_algorithm'])))
            else:
                g.add((checksum, SPDX.algorithm,
                       Literal(resource_dict['hash_algorithm'])))

        g.add((distribution, SPDX.checksum, checksum))

    return g.serialize(format=_format)
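For reference, a call might look like this (field values are illustrative; 'license' is set directly so the example does not hit the LicenseRegister branch, which needs a configured CKAN environment):

resource = {
    'id': 'c041c635-054f-4431-b647-f9186926d021',
    'name': 'CSV file',
    'url': 'http://example.com/data/file.csv',
    'format': 'text/csv',
    'license': 'http://creativecommons.org/licenses/by/3.0/',
    'size': 1234,
}
dataset = {'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6'}

rdf_xml = export_resource_to_rdf(resource, dataset, _format='xml')
print(rdf_xml)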
def graph_from_dataset(self, dataset_dict, dataset_ref):
    log.debug("ODMDCATBasicProfileDataset graph_from_dataset")

    g = self.g

    namespaces = odm_rdf_helper.get_namespaces_by_dataset_type(dataset_dict['type'])
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, DCT.identifier, Literal(dataset_dict.get('id'))))
    g.add((dataset_ref, DCT.type, Literal(dataset_dict.get('type', 'dataset'))))
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    items = [
        (dataset_ref, DCT.title,
         dataset_dict.get('title_translated') or dataset_dict.get('title')),
        (dataset_ref, DCT.description,
         dataset_dict.get('notes_translated') or dataset_dict.get('notes'))
    ]
    raw_triples = odm_rdf_helper.get_triples_by_dataset_type(dataset_ref,
                                                             dataset_dict,
                                                             dataset_dict['type'])
    raw_triples.extend(items)
    for raw_triple in raw_triples:
        triples = odm_rdf_helper.split_multilingual_object_into_triples(raw_triple)
        for triple in triples:
            g.add(triple)

    # Organization
    organization = dataset_dict.get('organization')
    g.add((dataset_ref, FOAF.organization,
           URIRef(config.get('ckan.site_url') + "organization/" + organization['name'])))

    # license
    license = URIRef(dataset_dict.get('license_url'))
    g.add((license, DCT.title, Literal(dataset_dict.get('license_title'))))
    g.add((dataset_ref, DCT.license, license))

    # odm_spatial_range
    for item in dataset_dict.get('odm_spatial_range', []):
        iso3_code = odm_rdf_helper.map_country_code_iso2_iso3(item.upper())
        g.add((dataset_ref, GN.countrycode,
               URIRef("http://data.landportal.info/geo/" + iso3_code)))

    # taxonomy
    for term in dataset_dict.get('taxonomy', []):
        matches = odm_rdf_helper.map_internal_to_standard_taxonomic_term(term)
        if isinstance(matches, basestring):
            g.add((dataset_ref, FOAF.topic, Literal(matches)))
        else:
            node = BNode()
            if 'exact_match' in matches:
                node = URIRef(matches['exact_match'])
            if 'broad_matches' in matches:
                for broad_match in matches['broad_matches']:
                    g.add((node, SKOS.broadMatch, URIRef(broad_match)))
            g.add((node, DCT.title, Literal(term)))
            g.add((dataset_ref, FOAF.topic, node))

    # Language
    for item in dataset_dict.get('odm_language', []):
        g.add((dataset_ref, DC.language, Literal(item.upper())))

    # Dates
    try:
        items = odm_rdf_helper.get_date_fields_by_dataset_type(dataset_dict['type'])
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)
    except ValueError:
        log.debug("Error adding date triples for dataset " + dataset_dict['id'])

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        # dcat:distribution is the (lowercase) property; DCAT.Distribution
        # is the class used for the rdf:type triple below
        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        items = [
            (distribution, DCT.title,
             resource_dict.get('name_translated') or resource_dict.get('name')),
            (distribution, DCT.description,
             resource_dict.get('description_translated') or resource_dict.get('description'))
        ]
        for item in items:
            triples = odm_rdf_helper.split_multilingual_object_into_triples(item)
            for triple in triples:
                g.add(triple)

        try:
            self._add_triples_from_dict(resource_dict, distribution, items)
        except ValueError:
            log.debug("Error adding triples for dataset " + dataset_dict['id'])

        # Language
        for item in resource_dict.get('odm_language', []):
            g.add((distribution, DC.language, Literal(item.upper())))

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            # a plain URL that is not a direct download belongs on accessURL
            g.add((distribution, DCAT.accessURL, URIRef(url)))
def parse_dataset(self, dataset_dict, dataset_ref):
    """Transforms DCAT-AP.de data into a CKAN dictionary."""
    # Manage different versions of DCATDE namespaces first.
    # Ensure that they are ordered from oldest to newest version, such that
    # older values get overwritten in case of multiple definitions
    dcatde_versions = [DCATDE_1_0, DCATDE]

    # geocodingText and legalbasisText got renamed, so handle them separately
    for key, predicate in (
            ('legalbasisText', DCATDE_1_0.legalbasisText),
            ('geocodingText', DCATDE_1_0.geocodingText),
            ('legalbasisText', DCATDE.legalBasis),
            ('geocodingText', DCATDE.geocodingDescription),
    ):
        values = self._object_value_list(dataset_ref, predicate)
        if values:
            ds_utils.set_extras_field(dataset_dict, key, json.dumps(values))

    # iterate over all namespaces to import as much as possible
    for dcatde_namespace in dcatde_versions:
        # Simple additional fields
        for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI',
                 dcatde_namespace.politicalGeocodingLevelURI),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                ds_utils.set_extras_field(dataset_dict, key, value)

        # List fields
        for key, predicate in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI',
                 dcatde_namespace.politicalGeocodingURI),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                ds_utils.set_extras_field(dataset_dict, key, json.dumps(values))

        self._parse_contact(dataset_dict, dataset_ref,
                            dcatde_namespace.originator, 'originator', True)
        self._parse_contact(dataset_dict, dataset_ref,
                            dcatde_namespace.maintainer, 'maintainer', False)

        # Add additional distribution fields
        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            for resource_dict in dataset_dict.get('resources', []):
                # Match distribution in graph and distribution in ckan-dict
                if unicode(distribution) == resource_uri(resource_dict):
                    for key, predicate in (
                            ('licenseAttributionByText',
                             dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability',
                             dcatde_namespace.plannedAvailability),
                    ):
                        value = self._object_value(distribution, predicate)
                        if value:
                            ds_utils.insert_resource_extra(resource_dict,
                                                           key, value)
    # -- end loop over dcatde namespaces --

    # additions in other namespaces than DCATDE
    self._parse_contact(dataset_dict, dataset_ref, DCT.contributor,
                        'contributor', True)
    self._parse_contact(dataset_dict, dataset_ref, DCT.creator,
                        'author', False)

    # dcat:landingPage
    landing_page = self._object_value(dataset_ref, DCAT.landingPage)
    if landing_page:
        ds_utils.set_extras_field(dataset_dict, 'metadata_original_html',
                                  landing_page)

    # dcat:contactPoint
    # TODO: dcat-ap adds the values to extras.contact_... . Maybe better
    # than maintainer?
    contact = self._object(dataset_ref, DCAT.contactPoint)
    self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

    contact_tel = self._object_value(contact, VCARD.hasTelephone)
    if contact_tel:
        ds_utils.insert(dataset_dict, 'maintainer_tel',
                        self._without_tel(contact_tel), True)

    self._add_maintainer_field(dataset_dict, contact, 'street',
                               VCARD.hasStreetAddress)
    self._add_maintainer_field(dataset_dict, contact, 'city',
                               VCARD.hasLocality)
    self._add_maintainer_field(dataset_dict, contact, 'zip',
                               VCARD.hasPostalCode)
    self._add_maintainer_field(dataset_dict, contact, 'country',
                               VCARD.hasCountryName)

    # Groups
    groups = self._get_dataset_value(dataset_dict, 'groups')
    if not groups:
        groups = []

    for obj in self.g.objects(dataset_ref, DCAT.theme):
        current_theme = unicode(obj)
        if current_theme.startswith(dcat_theme_prefix):
            group = current_theme.replace(dcat_theme_prefix, '').lower()
            groups.append({'id': group, 'name': group})

    dataset_dict['groups'] = groups

    return dataset_dict
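_without_tel is not shown in this section; given that vcard:hasTelephone values are commonly tel: URIs, a minimal sketch would simply strip the scheme (an assumption about the helper, not the upstream implementation, which defines it as a method on the profile class):

def _without_tel(value):
    # 'tel:+49-123-456' -> '+49-123-456'; plain numbers pass through unchanged.
    return unicode(value).replace('tel:', '', 1)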
        'notes': (dataset_ref, DCT.description),
        'publisher_name': (publisher_ref, FOAF.name),
    }

    if holder_use_dataset and holder_ref:
        loc_package_mapping['holder_name'] = (holder_ref, FOAF.name)

    self._add_multilang_values(loc_dict, loc_package_mapping)

    if not holder_use_dataset and holder_ref:
        loc_dict = interfaces.get_for_group_or_organization(org_dict['id'])
        loc_package_mapping = {'name': (holder_ref, FOAF.name)}
        self._add_multilang_values(loc_dict, loc_package_mapping)

    ### Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))  # TODO: preserve original info if harvested

        # Add the DCATAPIT type
        g.add((distribution, RDF.type, DCATAPIT.Distribution))

        ### format
        self._remove_node(resource_dict, distribution,
                          ('format', DCT['format'], None, Literal))
        if not self._add_uri_node(resource_dict, distribution,
                                  ('distribution_format', DCT['format'], None, URIRef),
                                  FORMAT_BASE_URI):
            guessed_format = guess_format(resource_dict)
            if guessed_format:
                self.g.add((distribution, DCT['format'],
                            URIRef(FORMAT_BASE_URI + guessed_format)))
            else:
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g
    dist_additons = {}

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # dcat:contactPoint
    for contactPoint_ref in g.objects(dataset_ref, DCAT.contactPoint):
        for email in g.objects(contactPoint_ref, VCARD.hasEmail):
            g.remove((contactPoint_ref, VCARD.hasEmail, Literal(email)))
            g.add((contactPoint_ref, VCARD.hasEmail,
                   URIRef('mailto:' + email)))

    # dcat:theme
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group in groups:
        mdrtheme_groups = self.category_mapping[group['name']]
        if mdrtheme_groups:
            for mdrtheme_group in mdrtheme_groups:
                g.add((dataset_ref, DCAT.theme,
                       URIRef(MDRTHEME + mdrtheme_group)))

    # dcatde:contributorID
    contributor_id = config.get('ckanext.hro_dcatapde.contributorid')
    if contributor_id:
        g.add((dataset_ref, DCATDE.contributorID,
               URIRef('http://dcat-ap.de/def/contributors/' + contributor_id)))

    # dcatde:geocodingDescription
    # dcatde:politicalGeocodingLevelURI
    # dcatde:politicalGeocodingURI
    # dct:spatial
    geocoding = self._get_dataset_value(dataset_dict, 'spatial')
    if geocoding:
        for spatial_ref in g.objects(dataset_ref, DCT.spatial):
            g.remove((spatial_ref, LOCN.geometry,
                      Literal(geocoding, datatype=GEOJSON)))
            if 'multipolygon' in geocoding:
                geocoding = geocoding.replace('multipolygon', 'MultiPolygon')
            elif 'polygon' in geocoding:
                geocoding = geocoding.replace('polygon', 'Polygon')
            g.add((spatial_ref, LOCN.geometry,
                   Literal(geocoding, datatype=GEOJSON)))

    geocoding_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    if geocoding_text:
        for spatial_ref in g.objects(dataset_ref, DCT.spatial):
            g.remove((spatial_ref, SKOS.prefLabel, Literal(geocoding_text)))
        g.add((dataset_ref, DCATDE.geocodingDescription,
               Literal(geocoding_text)))
        if geocoding_text in self.geocoding_mapping:
            geocoding_object = self.geocoding_mapping[geocoding_text]
            if 'politicalGeocodingLevelURI' in geocoding_object:
                g.add((dataset_ref, DCATDE.politicalGeocodingLevelURI,
                       URIRef(geocoding_object['politicalGeocodingLevelURI'])))
            if 'politicalGeocodingURI' in geocoding_object:
                g.add((dataset_ref, DCATDE.politicalGeocodingURI,
                       URIRef(geocoding_object['politicalGeocodingURI'])))

    # dcatde:maintainer
    maintainer = self._get_dataset_value(dataset_dict, 'maintainer')
    maintainer_email = self._get_dataset_value(dataset_dict, 'maintainer_email')
    if maintainer or maintainer_email:
        maintainer_details = BNode()
        g.add((maintainer_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCATDE.maintainer, maintainer_details))
        if maintainer:
            g.add((maintainer_details, FOAF.name, Literal(maintainer)))
        if maintainer_email:
            g.add((maintainer_details, FOAF.mbox, Literal(maintainer_email)))

    # dct:accessRights
    g.add((dataset_ref, DCT.accessRights, Literal('public')))

    # dct:conformsTo
    g.add((dataset_ref, DCT.conformsTo, URIRef(DCATDE)))

    # dct:creator
    creator = self._get_dataset_value(dataset_dict, 'author')
    creator_email = self._get_dataset_value(dataset_dict, 'author_email')
    if creator or creator_email:
        creator_details = BNode()
        g.add((creator_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.creator, creator_details))
        if creator:
            g.add((creator_details, FOAF.name, Literal(creator)))
        if creator_email:
            g.add((creator_details, FOAF.mbox, Literal(creator_email)))

    # dct:language
    language = config.get('ckan.locale_default', 'en')
    if language in self.language_mapping:
        mdrlang_language = self.language_mapping[language]
        g.add((dataset_ref, DCT.language,
               Literal(getattr(MDRLANG, mdrlang_language))))

    # dct:temporal
    start_date = self._get_dataset_value(dataset_dict, 'temporal_coverage_from')
    end_date = self._get_dataset_value(dataset_dict, 'temporal_coverage_to')
    if start_date or end_date:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start_date:
            self._add_date_triple(temporal_extent, DCAT.startDate, start_date)
        if end_date:
            self._add_date_triple(temporal_extent, DCAT.endDate, end_date)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # attribution for resources (distributions) enhancement
    terms_of_use = json.loads(self._get_dataset_value(dataset_dict, 'terms_of_use'))
    if terms_of_use:
        if 'attribution_text' in terms_of_use:
            dist_additons['attribution_text'] = \
                terms_of_use['attribution_text'].encode('utf-8')

    # license mapping for resources (distributions) enhancement
    license_id = self._get_dataset_value(dataset_dict, 'license_id')
    if license_id in self.license_mapping:
        dist_additons['license_id'] = \
            self.license_mapping[license_id]['dcatde-id']

    # resources (distributions) enhancement
    for resource_dict in dataset_dict.get('resources', []):
        for distribution in g.objects(dataset_ref, DCAT.distribution):
            if str(distribution) == resource_uri(resource_dict):
                self.enhance_resource(g, distribution, resource_dict,
                                      dist_additons)
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None, Literal),
        ('notes', DCT.description, None, Literal),
        ('url', DCAT.landingPage, None, URIRef),
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, URIRef),
        ('access_rights', DCT.accessRights, None, Literal),
        ('dcat_type', DCT.type, None, Literal),
        ('provenance', DCT.provenance, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None, Literal),
        ('theme', DCAT.theme, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, Literal),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, URIRef),
        ('related_resource', DCT.relation, None, URIRef),
        ('has_version', DCT.hasVersion, None, URIRef),
        ('is_version_of', DCT.isVersionOf, None, URIRef),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(self._removeWhitespaces(contact_uri))
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        self._add_triple_from_dict(
            dataset_dict, contact_details,
            VCARD.fn, 'contact_name', ['maintainer', 'author']
        )
        # Add mail address as URIRef, and ensure it has a mailto: prefix
        self._add_triple_from_dict(
            dataset_dict, contact_details,
            VCARD.hasEmail, 'contact_email',
            ['maintainer_email', 'author_email'],
            _type=URIRef, value_modifier=self._add_mailto
        )

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(self._removeWhitespaces(publisher_uri))
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None, Literal),
            ('publisher_url', FOAF.homepage, None, URIRef),
            ('publisher_type', DCT.type, None, URIRef),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(self._removeWhitespaces(spatial_uri))
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom), decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(
            self._removeWhitespaces(resource_uri(resource_dict)))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None, Literal),
            ('description', DCT.description, None, Literal),
            ('status', ADMS.status, None, URIRef),
            ('rights', DCT.rights, None, URIRef),
            ('license', DCT.license, None, URIRef),
            ('access_url', DCAT.accessURL, None, URIRef),
            ('download_url', DCAT.downloadURL, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None, URIRef),
            ('language', DCT.language, None, Literal),
            ('conforms_to', DCT.conformsTo, None, Literal),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        mimetype = resource_dict.get('mimetype')
        fmt = resource_dict.get('format')

        # IANA media types (either URI or Literal) should be mapped as
        # mediaType. In case format is available and mimetype is not set or
        # identical to format, check which type is appropriate.
        if fmt and (not mimetype or mimetype == fmt):
            if ('iana.org/assignments/media-types' in fmt
                    or not fmt.startswith('http') and '/' in fmt):
                # output format value as dcat:mediaType instead of dct:format
                mimetype = fmt
                fmt = None
            else:
                # Use dct:format
                mimetype = None

        if mimetype:
            if mimetype.startswith('http'):
                g.add((distribution, DCAT.mediaType,
                       URIRef(self._removeWhitespaces(mimetype))))
            else:
                g.add((distribution, DCAT.mediaType, Literal(mimetype)))

        if fmt:
            if fmt.startswith('http'):
                g.add((distribution, DCT['format'],
                       URIRef(self._removeWhitespaces(fmt))))
            else:
                g.add((distribution, DCT['format'], Literal(fmt)))

        # URL fallback and old behavior
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        access_url = resource_dict.get('access_url')
        # Use url as fallback for access_url if access_url is not set and
        # download_url is not equal
        if (url and ((not (access_url or download_url))
                     or ((not access_url)
                         and (download_url and url != download_url)))):
            self._add_triple_from_dict(resource_dict, distribution,
                                       DCAT.accessURL, 'url', _type=URIRef)

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, RDF.type, SPDX.Checksum))
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(self._removeWhitespaces(
                               resource_dict['hash_algorithm']))))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))

            g.add((distribution, SPDX.checksum, checksum))
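The mediaType/format branching above is easier to audit as a table of outcomes. The following summary is derived from the conditionals themselves; the example values are illustrative:

# format='text/csv',  mimetype=None       -> dcat:mediaType "text/csv" (slash, non-URI)
# format='CSV',       mimetype=None       -> dct:format "CSV"
# format='http://www.iana.org/assignments/media-types/text/csv', mimetype=None
#                                          -> dcat:mediaType <.../text/csv> (IANA URI)
# format='CSV',       mimetype='text/csv' -> dct:format "CSV" and dcat:mediaType "text/csv"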
def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, Literal),
        ('access_rights', DCT.accessRights, None, Literal),
        ('dcat_type', DCT.type, None, Literal),
        ('provenance', DCT.provenance, None, Literal),
        ('spatial', DCT.spatial, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    self._add_multilang_value(dataset_ref, DCT.description, 'description', dataset_dict)  # noqa
    self._add_multilang_value(dataset_ref, DCT.title, 'title', dataset_dict)  # noqa

    # LandingPage
    g.add((dataset_ref, DCAT.landingPage, Literal(dataset_dict['url'])))

    # Keywords
    self._add_multilang_value(dataset_ref, DCAT.keyword, 'keywords', dataset_dict)  # noqa

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Update Interval
    accrual_periodicity = dataset_dict.get('accrual_periodicity')
    if accrual_periodicity:
        g.add((dataset_ref, DCT.accrualPeriodicity,
               URIRef(accrual_periodicity)))

    # Lists
    items = [
        ('language', DCT.language, None, Literal),
        ('theme', DCAT.theme, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, Literal),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, Literal),
        ('has_version', DCT.hasVersion, None, Literal),
        ('is_version_of', DCT.isVersionOf, None, Literal),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Relations
    if dataset_dict.get('relations'):
        relations = dataset_dict.get('relations')
        for relation in relations:
            relation_name = relation['label']
            relation_url = relation['url']

            relation = URIRef(relation_url)
            g.add((relation, RDFS.label, Literal(relation_name)))
            g.add((dataset_ref, DCT.relation, relation))

    # References
    if dataset_dict.get('see_alsos'):
        references = dataset_dict.get('see_alsos')
        for reference in references:
            reference_identifier = reference['dataset_identifier']
            g.add((dataset_ref, RDFS.seeAlso,
                   Literal(reference_identifier)))

    # Contact details
    if dataset_dict.get('contact_points'):
        contact_points = self._get_dataset_value(dataset_dict, 'contact_points')  # noqa
        for contact_point in contact_points:
            contact_details = BNode()
            contact_point_email = contact_point['email']
            contact_point_name = contact_point['name']

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email)))  # noqa
            g.add((contact_details, VCARD.fn, Literal(contact_point_name)))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

    # Publisher
    if dataset_dict.get('publishers'):
        publishers = dataset_dict.get('publishers')
        for publisher in publishers:
            publisher_name = publisher['label']

            publisher_details = BNode()
            g.add((publisher_details, RDF.type, RDF.Description))
            g.add((publisher_details, RDFS.label, Literal(publisher_name)))
            g.add((dataset_ref, DCT.publisher, publisher_details))

    # Temporals
    temporals = dataset_dict.get('temporals')
    if temporals:
        for temporal in temporals:
            start = temporal['start_date']
            end = temporal['end_date']
            if start or end:
                temporal_extent = BNode()
                g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                if start:
                    self._add_date_triple(temporal_extent, SCHEMA.startDate, start)  # noqa
                if end:
                    self._add_date_triple(temporal_extent, SCHEMA.endDate, end)  # noqa
                g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Themes
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group_name in groups:
        g.add((dataset_ref, DCAT.theme,
               URIRef(ogd_theme_base_url + group_name.get('name'))))

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('status', ADMS.status, None, Literal),
            ('rights', DCT.rights, None, Literal),
            ('license', DCT.license, None, Literal),
            ('identifier', DCT.identifier, None, Literal),
            ('media_type', DCAT.mediaType, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        self._add_multilang_value(distribution, DCT.title, 'display_name', dataset_dict)  # noqa
        self._add_multilang_value(distribution, DCT.description, 'description', dataset_dict)  # noqa

        # Lists
        items = [
            ('documentation', FOAF.page, None, Literal),
            ('language', DCT.language, None, Literal),
            ('conforms_to', DCT.conformsTo, None, Literal),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            g.add((distribution, DCAT.accessURL, URIRef(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, URIRef(url)))

        # Format from Download-Url
        if download_url:
            format_value = str(download_url).rsplit('.', 1)[1]
            mapped_format = map_to_valid_format(format_value)
            g.add((distribution, DCT['format'], Literal(mapped_format)))

        # Mime-Type
        if resource_dict.get('mimetype'):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['mimetype'])))

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        # the guard checks 'byte_size', so read the same key below
        if resource_dict.get('byte_size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['byte_size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['byte_size'])))
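Note that rsplit('.', 1)[1] in the "Format from Download-Url" block raises an IndexError when the URL contains no dot. A defensive variant (an assumption, not the upstream fix) would only map the format when an extension is actually present:

def format_from_url(download_url):
    # Return the lower-cased file extension of the URL, or None when there is none.
    parts = str(download_url).rsplit('.', 1)
    return parts[1].lower() if len(parts) == 2 else None

The caller would then only invoke map_to_valid_format on a non-None result.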
def graph_from_dataset(self, dataset_dict, dataset_ref):
    log.debug("dataset: {}".format(dataset_dict['name']))

    g = self.g
    dist_additons = {}

    # bind namespaces to have readable names in RDF Document
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    # TEMPORARY: fix whitespace in 'url':
    url = dataset_dict['url']
    if url:
        g.remove((dataset_ref, DCAT.landingPage, URIRef(url)))
        url = url.replace(" ", "+")
        g.add((dataset_ref, DCAT.landingPage, URIRef(url)))

    # Nr. 40 - Contributor
    contributorId = pylons.config.get('ckanext.dcatde.contributorid')
    if contributorId:
        g.add((dataset_ref, DCATDE.contributorID,
               URIRef("{}{}".format(DCATDE_CONTRIBUTORS, contributorId))))

    # Nr. 41 - Contact Point
    # If a maintainer name is given, set this to be the name of the
    # contact point. If not, use the name of the author/publishing body
    # (ckanext-dcat default).
    for contactPoint_ref in g.objects(dataset_ref, DCAT.contactPoint):
        for email in g.objects(contactPoint_ref, VCARD.hasEmail):
            g.remove((contactPoint_ref, VCARD.hasEmail, Literal(email)))
            g.add((contactPoint_ref, VCARD.hasEmail,
                   URIRef("mailto:" + email)))

    # Nr. 44 - Publisher
    publisher_ref = BNode()
    publisher_name = self._get_dataset_value(dataset_dict, 'author')
    publisher_url = self._get_dataset_value(dataset_dict, 'url')

    # first, remove the publishers added by the generic RDF profile, as they
    # are based on the CKAN Organization
    for publisher in g.objects(dataset_ref, DCT.publisher):
        g.remove((dataset_ref, DCT.publisher, publisher))

    g.add((publisher_ref, RDF.type, FOAF.Organization))
    g.add((publisher_ref, FOAF.name, Literal(publisher_name)))
    # if publisher_url:
    #     g.add((publisher_ref, FOAF.homepage, URIRef(publisher_url)))
    g.add((dataset_ref, DCT.publisher, publisher_ref))

    # Nr. 45 - Category
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group in groups:
        dcat_groups = self.category_mapping[group['name']]
        if dcat_groups is not None:
            for dcat_group in dcat_groups:
                g.add((dataset_ref, DCAT.theme, MDRTHEME[dcat_group]))
                # MDRTHEME.xyz is not dereferencable, so we add some
                # additional triples that link to the downloadable source:
                g.add((MDRTHEME[dcat_group], RDFS.isDefinedBy,
                       URIRef(MDRTHEME)))
                g.add((URIRef(MDRTHEME), RDFS.seeAlso, URIRef(
                    "http://publications.europa.eu/mdr/resource/authority/data-theme/skos-ap-eu/data-theme-skos-ap-act.rdf"
                )))

    # Nr. 48 - conformsTo (application profile of the metadata)
    dcatapde_version = pylons.config.get('ckanext.dcatde.version')
    g.add((dataset_ref, DCT.conformsTo,
           URIRef("{}{}/".format(DCATDE, dcatapde_version))))

    # Nr. 49 - 52 (Urheber, Verwalter, Bearbeiter, Autor) - we don't know this

    # Nr. 59 - Language
    g.add((dataset_ref, DCT.language, MDRLANG.DEU))
    # MDRLANG.DEU is not dereferencable, so we add some additional
    # triples that link to the downloadable source:
    g.add((MDRLANG.DEU, RDFS.isDefinedBy, URIRef(MDRLANG)))
    g.add((URIRef(MDRLANG), RDFS.seeAlso, URIRef(
        "http://publications.europa.eu/mdr/resource/authority/language/skos-ap-eu/languages-skos-ap-act.rdf"
    )))

    # Nr. 61 - Provenance
    # TODO: flag harvested datasets?

    # Nr. 66 - dct:spatial via geonames reference
    # Nr. 72 - dcatde:politicalGeocodingLevelURI
    # Nr. 73 - dcatde:politicalGeocodingURI
    # unfortunately this only partially fits Berlin (only federal, state,
    # administrativeDistrict)
    geographical_coverage = self._get_dataset_value(dataset_dict,
                                                    'geographical_coverage')
    if geographical_coverage in self.geo_coverage:
        coverage_object = self.geo_coverage[geographical_coverage]
        if 'geonames' in coverage_object:
            g.add((dataset_ref, DCT.spatial,
                   URIRef(coverage_object['geonames'])))
        if 'politicalGeocodingURI' in coverage_object:
            g.add((dataset_ref, DCATDE.politicalGeocodingURI,
                   URIRef(coverage_object['politicalGeocodingURI'])))
        if 'politicalGeocodingLevelURI' in coverage_object:
            g.add((dataset_ref, DCATDE.politicalGeocodingLevelURI,
                   URIRef(coverage_object['politicalGeocodingLevelURI'])))

    # Nr. 75 - dcatde:legalbasisText
    legalbasisText = self.legalBasis['default']
    org = dataset_dict.get('organization', {})
    if org and org['name'] in self.legalBasis['mapping']:
        legalbasisText = self.legalBasis['mapping'][org['name']]
    g.add((dataset_ref, DCATDE.legalbasisText, Literal(legalbasisText)))

    # Enhance Distributions

    ## License
    if 'license_id' in dataset_dict:
        ogd_license_code = dataset_dict['license_id']
        if ogd_license_code in self.license_mapping:
            dist_additons['license_id'] = \
                self.license_mapping[ogd_license_code]['dcatde-id']

    ## Attribution Text
    if 'attribution_text' in dataset_dict:
        dist_additons['attribution_text'] = \
            dataset_dict.get('attribution_text').encode('utf-8')

    for resource_dict in dataset_dict.get('resources', []):
        for distribution in g.objects(dataset_ref, DCAT.distribution):
            # Match distribution in graph and resource in ckan-dict
            if unicode(distribution) == resource_uri(resource_dict):
                self.enhance_distribution_resource(g, distribution,
                                                   resource_dict,
                                                   dist_additons)

    # custom:
    # add information about the technical source of this dataset
    # (webform, simplesearch, harvester, etc.)
    source = self._get_dataset_value(dataset_dict, 'berlin_source')
    if source:
        g.add((dataset_ref, DCT.accrualMethod, ACCRUAL_METHODS[source]))
def test_distribution_fields(self):
    resource = {
        'id': 'c041c635-054f-4431-b647-f9186926d021',
        'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'CSV file',
        'description': 'A CSV file',
        'url': 'http://example.com/data/file.csv',
        'status': 'http://purl.org/adms/status/Completed',
        'rights': 'Some statement about rights',
        'license': 'http://creativecommons.org/licenses/by/3.0/',
        'issued': '2015-06-26T15:21:09.034694',
        'modified': '2015-06-26T15:21:09.075774',
        'size': 1234,
        'documentation': '["http://dataset.info.org/distribution1/doc1", "http://dataset.info.org/distribution1/doc2"]',
        'language': '["en", "es", "http://publications.europa.eu/resource/authority/language/ITA"]',
        'conforms_to': '["Standard 1", "Standard 2"]',
        'hash': '4304cf2e751e6053c90b1804c89c0ebb758f395a',
        'hash_algorithm': 'http://spdx.org/rdf/terms#checksumAlgorithm_sha1',
    }
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Test DCAT dataset',
        'resources': [resource]
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

    # URI
    distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
    eq_(unicode(distribution), utils.resource_uri(resource))

    # Basic fields
    assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
    assert self._triple(g, distribution, DCT.title, resource['name'])
    assert self._triple(g, distribution, DCT.description, resource['description'])
    assert self._triple(g, distribution, DCT.rights, resource['rights'])
    assert self._triple(g, distribution, DCT.license, URIRef(resource['license']))
    assert self._triple(g, distribution, ADMS.status, URIRef(resource['status']))

    # Lists
    for item in [
        ('documentation', FOAF.page, URIRef),
        ('language', DCT.language, [Literal, Literal, URIRef]),
        ('conforms_to', DCT.conformsTo, Literal),
    ]:
        values = json.loads(resource[item[0]])
        eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
        for num, value in enumerate(values):
            _type = item[2]
            if isinstance(item[2], list):
                eq_(len(item[2]), len(values))
                _type = item[2][num]
            assert self._triple(g, distribution, item[1], _type(value))

    # Dates
    assert self._triple(g, distribution, DCT.issued, resource['issued'], XSD.dateTime)
    assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime)

    # Numbers
    assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal)

    # Checksum
    checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
    assert checksum
    assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
    assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'],
                        data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
    assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource['hash_algorithm']))
def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa
    log.debug("Create graph from dataset '%s'" % dataset_dict['name'])

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, Literal),
        ('access_rights', DCT.accessRights, None, Literal),
        ('dcat_type', DCT.type, None, Literal),
        ('provenance', DCT.provenance, None, Literal),
        ('spatial', DCT.spatial, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    self._add_multilang_value(dataset_ref, DCT.description, 'description', dataset_dict)
    self._add_multilang_value(dataset_ref, DCT.title, 'title', dataset_dict)

    # LandingPage
    try:
        landing_page = uri_to_iri(dataset_dict['url'])
    except ValueError:
        landing_page = ''
    g.add((dataset_ref, DCAT.landingPage, Literal(landing_page)))

    # Keywords
    self._add_multilang_value(dataset_ref, DCAT.keyword, 'keywords', dataset_dict)

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Update Interval
    accrual_periodicity = dataset_dict.get('accrual_periodicity')
    if accrual_periodicity:
        g.add((dataset_ref, DCT.accrualPeriodicity, URIRef(accrual_periodicity)))

    # Lists
    items = [
        ('language', DCT.language, None, Literal),
        ('theme', DCAT.theme, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, Literal),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, Literal),
        ('has_version', DCT.hasVersion, None, Literal),
        ('is_version_of', DCT.isVersionOf, None, Literal),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Relations
    if dataset_dict.get('relations'):
        relations = dataset_dict.get('relations')
        for relation in relations:
            relation_name = relation['label']
            try:
                relation_url = uri_to_iri(relation['url'])
            except ValueError:
                # skip this relation if the URL is invalid
                continue

            relation = URIRef(relation_url)
            g.add((relation, RDFS.label, Literal(relation_name)))
            g.add((dataset_ref, DCT.relation, relation))

    # References
    if dataset_dict.get('see_alsos'):
        references = dataset_dict.get('see_alsos')
        for reference in references:
            # we only expect dicts here
            if not isinstance(reference, dict):
                continue
            reference_identifier = reference.get('dataset_identifier')
            if reference_identifier:
                g.add((dataset_ref, RDFS.seeAlso, Literal(reference_identifier)))

    # Contact details
    if dataset_dict.get('contact_points'):
        contact_points = self._get_dataset_value(dataset_dict, 'contact_points')  # noqa
        for contact_point in contact_points:
            contact_details = BNode()
            contact_point_email = contact_point['email']
            contact_point_name = contact_point['name']

            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email)))  # noqa
            g.add((contact_details, VCARD.fn, Literal(contact_point_name)))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

    # Publisher
    if dataset_dict.get('publishers'):
        publishers = dataset_dict.get('publishers')
        for publisher in publishers:
            publisher_name = publisher['label']

            publisher_details = BNode()
            g.add((publisher_details, RDF.type, RDF.Description))
            g.add((publisher_details, RDFS.label, Literal(publisher_name)))
            g.add((dataset_ref, DCT.publisher, publisher_details))

    # Temporals
    temporals = dataset_dict.get('temporals')
    if temporals:
        for temporal in temporals:
            start = temporal['start_date']
            end = temporal['end_date']
            if start or end:
                temporal_extent = BNode()
                g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                if start:
                    self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
                if end:
                    self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
                g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Themes
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group_name in groups:
        g.add((dataset_ref, DCAT.theme,
               URIRef(ogd_theme_base_url + group_name.get('name'))))

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('status', ADMS.status, None, Literal),
            ('rights', DCT.rights, None, Literal),
            ('license', DCT.license, None, Literal),
            ('identifier', DCT.identifier, None, Literal),
            ('media_type', DCAT.mediaType, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        self._add_multilang_value(distribution, DCT.title, 'display_name', resource_dict)  # noqa
        self._add_multilang_value(distribution, DCT.description, 'description', resource_dict)  # noqa

        # Lists
        items = [
            ('documentation', FOAF.page, None, Literal),
            ('language', DCT.language, None, Literal),
            ('conforms_to', DCT.conformsTo, None, Literal),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Download URL & Access URL
        download_url = resource_dict.get('download_url')
        if download_url:
            try:
                download_url = uri_to_iri(download_url)
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            except ValueError:
                # only add valid URL
                pass

        url = resource_dict.get('url')
        if (url and not download_url) or (url and url != download_url):
            try:
                url = uri_to_iri(url)
                g.add((distribution, DCAT.accessURL, URIRef(url)))
            except ValueError:
                # only add valid URL
                pass
        elif download_url:
            g.add((distribution, DCAT.accessURL, URIRef(download_url)))

        # Format
        if resource_dict.get('format'):
            g.add((distribution, DCT['format'], Literal(resource_dict['format'])))

        # Mime-Type
        if resource_dict.get('mimetype'):
            g.add((distribution, DCAT.mediaType, Literal(resource_dict['mimetype'])))

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # ByteSize
        if resource_dict.get('byte_size'):
            g.add((distribution, DCAT.byteSize, Literal(resource_dict['byte_size'])))
def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa: C90
    try:
        g = self.g

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        # Basic fields
        basic_items = [
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, basic_items)

        # landingPage is the original portal page
        site_url = pylons.config.get('ckan.site_url', '')
        g.add((
            dataset_ref,
            DCAT.landingPage,
            Literal(site_url + '/dataset/' + dataset_dict['name'])
        ))

        # Language
        g.add((dataset_ref, DCT.language, Literal(ckan_locale_default)))

        # Basic date fields
        date_items = [
            ('dateLastUpdated', DCT.modified, 'metadata_modified', Literal),  # noqa
            ('dateFirstPublished', DCT.issued, 'metadata_created', Literal),  # noqa
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, date_items)

        # Organization
        organization_id = pylons.config.get(
            'ckanext.stadtzh-theme.dcat_ap_organization_slug',
            '',
        )
        id = self._get_dataset_value(dataset_dict, 'id')
        title = self._get_dataset_value(dataset_dict, 'title')
        description = self._get_dataset_value(dataset_dict, 'notes')
        g.add((
            dataset_ref,
            DCT.identifier,
            Literal(id + '@' + organization_id)
        ))
        g.add((
            dataset_ref,
            DCT.title,
            Literal(title, lang=ckan_locale_default)
        ))
        g.add((
            dataset_ref,
            DCT.description,
            Literal(description, lang=ckan_locale_default)
        ))

        # Update Interval
        try:
            update_interval = self._get_dataset_value(
                dataset_dict, 'updateInterval'
            )
            accrualPeriodicity = mapping_accrualPeriodicity.get(
                update_interval[0]
            )
        except IndexError:
            accrualPeriodicity = None
        if accrualPeriodicity:
            g.add((
                dataset_ref,
                DCT.accrualPeriodicity,
                URIRef(accrualPeriodicity)
            ))

        # Temporal
        time_range = self._time_interval_from_dataset(dataset_dict)
        if time_range is not None and time_range.get('start_date') and time_range.get('end_date'):  # noqa
            start = time_range.get('start_date')
            end = time_range.get('end_date')

            temporal_extent = BNode()
            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            g.add((
                temporal_extent,
                SCHEMA.startDate,
                Literal(start, datatype=XSD.date)
            ))
            g.add((
                temporal_extent,
                SCHEMA.endDate,
                Literal(end, datatype=XSD.date)
            ))
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Themes
        groups = self._get_dataset_value(dataset_dict, 'groups')
        try:
            theme_names = set(itertools.chain.from_iterable(
                [self._themes(group.get('name')) for group in groups]))
            if any(tag['name'] == 'geodaten' for tag in dataset_dict.get('tags', [])):
                theme_names.add('geography')
            for theme_name in theme_names:
                g.add((
                    dataset_ref,
                    DCAT.theme,
                    URIRef(ogd_theme_base_url + theme_name)
                ))
        except IndexError:
            pass

        # Legal Information
        legal_information = self._get_dataset_value(
            dataset_dict, 'legalInformation'
        )
        g.add((dataset_ref, DCT.accessRights, Literal(legal_information)))

        # Contact details
        if any([
            self._get_dataset_value(dataset_dict, 'contact_uri'),
            self._get_dataset_value(dataset_dict, 'contact_name'),
            self._get_dataset_value(dataset_dict, 'contact_email'),
            self._get_dataset_value(dataset_dict, 'maintainer'),
            self._get_dataset_value(dataset_dict, 'maintainer_email'),
            self._get_dataset_value(dataset_dict, 'author'),
            self._get_dataset_value(dataset_dict, 'author_email'),
        ]):
            contact_details = BNode()
            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            maintainer_email = self._get_dataset_value(
                dataset_dict, 'maintainer_email'
            )
            g.add((contact_details, VCARD.hasEmail, URIRef(maintainer_email)))  # noqa

            items = [
                ('contact_name', VCARD.fn, ['maintainer', 'author'], Literal),  # noqa
            ]
            self._add_triples_from_dict(dataset_dict, contact_details, items)  # noqa

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((
                dataset_ref,
                DCAT.keyword,
                Literal(tag['name'], lang=ckan_locale_default)
            ))

        # Resources
        for resource_dict in dataset_dict.get('resources', []):
            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))
            g.add((distribution, DCT.language, Literal(ckan_locale_default)))  # noqa

            # Simple values
            items = [
                ('id', DCT.identifier, None, Literal),
                ('name', DCT.title, None, Literal),
                ('description', DCT.description, None, Literal),
                ('state', ADMS.status, None, Literal),
            ]
            self._add_triples_from_dict(resource_dict, distribution, items)

            license_id = self._get_dataset_value(dataset_dict, 'license_id')  # noqa
            license_title = self._rights(license_id)
            g.add((distribution, DCT.rights, Literal(license_title)))
            g.add((distribution, DCT.license, Literal(license_title)))

            # Lists
            items = [
                ('conforms_to', DCT.conformsTo, None, Literal),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution, items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))
                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URLs
            url = resource_dict.get('url')
            if url:
                g.add((distribution, DCAT.accessURL, Literal(url)))

            # if the resource has one of the following formats, the
            # distribution is a service and therefore doesn't need a
            # downloadURL
            format = resource_dict.get('format', '').lower()
            if format not in ['xml', 'wms', 'wmts', 'wfs']:
                download_url = resource_dict.get('url')
                if download_url:
                    g.add((
                        distribution,
                        DCAT.downloadURL,
                        Literal(download_url)
                    ))

            # Dates
            items = [
                ('created', DCT.issued, None, Literal),
                ('last_modified', DCT.modified, None, Literal),
            ]
            self._add_date_triples_from_dict(
                resource_dict, distribution, items
            )

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))

            # Checksum
            if resource_dict.get('hash'):
                checksum = BNode()
                g.add((checksum, SPDX.checksumValue,
                       Literal(resource_dict['hash'], datatype=XSD.hexBinary)))
                if resource_dict.get('hash_algorithm'):
                    if resource_dict['hash_algorithm'].startswith('http'):
                        g.add((checksum, SPDX.algorithm,
                               URIRef(resource_dict['hash_algorithm'])))
                    else:
                        g.add((checksum, SPDX.algorithm,
                               Literal(resource_dict['hash_algorithm'])))
                g.add((distribution, SPDX.checksum, checksum))

        # Publisher
        if dataset_dict.get('organization'):
            publisher_name = dataset_dict.get('author')
            publisher_details = BNode()
            g.add((publisher_details, RDF.type, RDF.Description))
            g.add((publisher_details, RDFS.label, Literal(publisher_name)))
            g.add((dataset_ref, DCT.publisher, publisher_details))

    except Exception, e:
        log.exception(
            "Something went wrong: %s / %s" % (e, traceback.format_exc())
        )
        raise
def graph_from_dataset(self, dataset_dict, dataset_ref):
    """ Transforms CKAN-Dictionary to DCAT-AP.de-Data """
    g = self.g

    # bind namespaces to have readable names in RDF Document
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    # Simple additional fields
    items = [
        ('qualityProcessURI', DCATDE.qualityProcessURI, None, URIRef),
        ('metadata_original_html', DCAT.landingPage, None, URIRef),
        ('politicalGeocodingLevelURI', DCATDE.politicalGeocodingLevelURI, None, URIRef),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Additional Lists
    items = [
        ('contributorID', DCATDE.contributorID, None, Literal),
        ('politicalGeocodingURI', DCATDE.politicalGeocodingURI, None, URIRef),
        ('legalbasisText', DCATDE.legalBasis, None, Literal),
        ('geocodingText', DCATDE.geocodingDescription, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Add adminUnitL2 for every politicalGeocodingURI value. Compatibility.
    if self._get_dataset_value(dataset_dict, 'politicalGeocodingURI'):
        spatial_ref = BNode()
        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        items = [('politicalGeocodingURI', LOCN.adminUnitL2, None, URIRef)]
        self._add_list_triples_from_dict(dataset_dict, spatial_ref, items)

    # Contacts
    self._add_contact(dataset_dict, dataset_ref, DCATDE.originator, 'originator')
    self._add_contact(dataset_dict, dataset_ref, DCATDE.maintainer, 'maintainer')
    self._add_contact(dataset_dict, dataset_ref, DCT.contributor, 'contributor')
    self._add_contact(dataset_dict, dataset_ref, DCT.creator, 'author')

    # Add maintainer_url to contact_point
    maintainer_url = self._get_dataset_value(dataset_dict, 'maintainer_url')
    if maintainer_url:
        contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
        self._add_triple_from_dict(dataset_dict, contact_point,
                                   VCARD.hasURL, 'maintainer_url',
                                   _type=URIRef)

    # add maintainer_tel to contact_point
    maintainer_tel = self._get_dataset_value(dataset_dict, 'maintainer_tel')
    if maintainer_tel:
        contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
        self._add_triple_from_dict(dataset_dict, contact_point,
                                   VCARD.hasTelephone, 'maintainer_tel',
                                   _type=URIRef, value_modifier=self._add_tel)

    # add maintainer postal data to contact_point if available
    vcard_mapping = {
        'street': VCARD.hasStreetAddress,
        'city': VCARD.hasLocality,
        'zip': VCARD.hasPostalCode,
        'country': VCARD.hasCountryName
    }
    for vc_name in vcard_mapping:
        vcard_fld = self._get_dataset_value(dataset_dict, 'maintainer_' + vc_name)
        if vcard_fld:
            contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
            g.add((contact_point, vcard_mapping[vc_name], Literal(vcard_fld)))

    # Groups
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group in groups:
        group_name_in_dict = group['name']
        if group_name_in_dict:
            value_to_add = self._removeWhitespaces(group_name_in_dict)
            if value_to_add:
                g.add((dataset_ref, DCAT.theme,
                       URIRef(dcat_theme_prefix + value_to_add.upper())))

    # used_datasets
    items = [
        ('used_datasets', DCT.relation, None, URIRef),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Enhance Distributions
    for resource_dict in dataset_dict.get('resources', []):
        for distribution in g.objects(dataset_ref, DCAT.distribution):
            # Match distribution in graph and distribution in ckan-dict
            if unicode(distribution) == resource_uri(resource_dict):
                items = [
                    ('licenseAttributionByText', DCATDE.licenseAttributionByText, None, Literal),
                    ('plannedAvailability', DCATDE.plannedAvailability, None, URIRef),
                ]
                self._add_triples_from_dict(resource_dict, distribution, items)
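The `_get_or_create_contact_point` helper used above is not shown in this document. A minimal sketch of what such a helper could look like, assuming the generic euro_dcat_ap profile may already have attached a dcat:contactPoint node that should be reused rather than duplicated:

# Minimal sketch (assumption, not the extension's actual implementation):
# reuse the first dcat:contactPoint node already in the graph, otherwise
# create a new vcard:Organization blank node and link it to the dataset.
def _get_or_create_contact_point(self, dataset_dict, dataset_ref):
    for contact_point in self.g.objects(dataset_ref, DCAT.contactPoint):
        return contact_point  # reuse the existing node
    contact_point = BNode()
    self.g.add((contact_point, RDF.type, VCARD.Organization))
    self.g.add((dataset_ref, DCAT.contactPoint, contact_point))
    return contact_point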
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    # -- start
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None, Literal),
        ('notes', DCT.description, None, Literal),
        ('url', DCAT.landingPage, None, URIRef),
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, URIRef),
        ('subject', DCT.subject, None, URIRef),
        # Mentioned in the vocabulary
        ('provenance', DCT.provenance, None, URIRef),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None, URIRef),
        ('theme', DCAT.theme, None, URIRef),
        ('spatial_uri', DCT.spatial, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, URIRef),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, URIRef),
        ('access_rights', DCT.accessRights, None, URIRef),
        ('related_resource', DCT.relation, None, URIRef),
        ('has_version', DCT.hasVersion, None, Literal),
        ('is_version_of', DCT.isVersionOf, None, Literal),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Kind))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer'], Literal),
            ('contact_email', VCARD.hasEmail, ['maintainer_email'], Literal),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        self._get_dataset_value(dataset_dict, 'publisher_identifier'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']
        g.add((publisher_details, FOAF.name, Literal(publisher_name)))

        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None, Literal),
            ('publisher_identifier', DCT.identifier, None, Literal),
            ('publisher_url', FOAF.homepage, None, URIRef),
            ('publisher_type', DCT.type, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # parts - has part/is part of
    if any([
        self._get_dataset_value(dataset_dict, 'has_part'),
        self._get_dataset_value(dataset_dict, 'is_part_of'),
    ]):
        items = [
            ('has_part', DCT.hasPart, None, URIRef),
            ('is_part_of', DCT.isPartOf, None, URIRef),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri:
        spatial_uri = get_spatial_uri(spatial_uri)  # map from code to URI

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom), decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        if 'license' not in resource_dict and 'license_id' in dataset_dict:
            lr = LicenseRegister()
            _license = lr.get(dataset_dict['license_id'])
            if _license:
                resource_dict['license'] = _license.url

        # Simple values
        items = [
            ('name', DCT.title, None, Literal),
            ('description', DCT.description, None, Literal),
            ('status', ADMS.status, None, Literal),
            ('rights', DCT.rights, None, Literal),
            ('license', DCT.license, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None, URIRef),
            ('language', DCT.language, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, URIRef),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType, Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'], Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType, Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, URIRef(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']), datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize, Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))
            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(resource_dict['hash_algorithm'])))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))
            g.add((distribution, SPDX.checksum, checksum))
def test_distribution_fields(self):
    resource = {
        "id": "c041c635-054f-4431-b647-f9186926d021",
        "package_id": "4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6",
        "name": "CSV file",
        "description": "A CSV file",
        "url": "http://example.com/data/file.csv",
        "status": "http://purl.org/adms/status/Completed",
        "rights": "Some statement about rights",
        "license": "http://creativecommons.org/licenses/by/3.0/",
        "issued": "2015-06-26T15:21:09.034694",
        "modified": "2015-06-26T15:21:09.075774",
        "size": 1234,
        "documentation": '["http://dataset.info.org/distribution1/doc1", "http://dataset.info.org/distribution1/doc2"]',
        "language": '["en", "es", "ca"]',
        "conforms_to": '["Standard 1", "Standard 2"]',
        "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a",
        "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1",
    }
    dataset = {
        "id": "4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6",
        "name": "test-dataset",
        "title": "Test DCAT dataset",
        "resources": [resource],
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

    # URI
    distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
    eq_(unicode(distribution), utils.resource_uri(resource))

    # Basic fields
    assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
    assert self._triple(g, distribution, DCT.title, resource["name"])
    assert self._triple(g, distribution, DCT.description, resource["description"])
    assert self._triple(g, distribution, DCT.rights, resource["rights"])
    assert self._triple(g, distribution, DCT.license, resource["license"])
    assert self._triple(g, distribution, ADMS.status, resource["status"])

    # Lists
    for item in [("documentation", FOAF.page),
                 ("language", DCT.language),
                 ("conforms_to", DCT.conformsTo)]:
        values = json.loads(resource[item[0]])
        eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values))
        for value in values:
            assert self._triple(g, distribution, item[1], value)

    # Dates
    assert self._triple(g, distribution, DCT.issued, resource["issued"], XSD.dateTime)
    assert self._triple(g, distribution, DCT.modified, resource["modified"], XSD.dateTime)

    # Numbers
    assert self._triple(g, distribution, DCAT.byteSize, float(resource["size"]), XSD.decimal)

    # Checksum
    checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
    assert checksum
    assert self._triple(
        g, checksum, SPDX.checksumValue, resource["hash"],
        data_type="http://www.w3.org/2001/XMLSchema#hexBinary"
    )
    assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource["hash_algorithm"]))
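The `_triple` helper these tests rely on is not defined anywhere in this document. A plausible minimal sketch, assuming it wraps non-node values in a Literal (with an optional datatype) and returns the first matching triple or None:

# Minimal sketch (assumption, not the test suite's actual helper).
def _triple(self, graph, subject, predicate, _object, data_type=None):
    # Wrap plain Python values so they match Literal objects in the graph.
    if _object is not None and not isinstance(_object, (URIRef, BNode)):
        if data_type:
            _object = Literal(_object, datatype=URIRef(data_type))
        else:
            _object = Literal(_object)
    triples = [t for t in graph.triples((subject, predicate, _object))]
    return triples[0] if triples else None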
class ItalianDCATAPProfile(RDFProfile):
    '''
    An RDF profile for the Italian DCAT-AP recommendation for data portals

    It requires the European DCAT-AP profile (`euro_dcat_ap`)
    '''

    def parse_dataset(self, dataset_dict, dataset_ref):

        # check the dataset type
        if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
            # not a DCATAPIT dataset
            return dataset_dict

        # date info
        for predicate, key, logf in (
            (DCT.issued, 'issued', log.debug),
            (DCT.modified, 'modified', log.warn),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)
                value = helpers.format(value, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..1 predicates
        for predicate, key, logf in (
            (DCT.identifier, 'identifier', log.warn),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..n predicates list
        for predicate, key, logf in (
            (ADMS.identifier, 'alternate_identifier', log.debug),
            (DCT.isVersionOf, 'is_version_of', log.debug),
        ):
            valueList = self._object_value_list(dataset_ref, predicate)
            if valueList:
                self._remove_from_extra(dataset_dict, key)
                value = ','.join(valueList)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # conformsTo
        self._remove_from_extra(dataset_dict, 'conforms_to')
        conform_list = []
        for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
            conform_list.append(self._object_value(conforms_to, DCT.identifier))
        if conform_list:
            value = ','.join(conform_list)
            dataset_dict['conforms_to'] = value
        else:
            log.debug('No DCT.conformsTo found for dataset "%s"',
                      dataset_dict.get('title', '---'))

        # Temporal
        start, end = self._time_interval(dataset_ref, DCT.temporal)
        for v, key, logf in (
            (start, 'temporal_start', log.debug),
            (end, 'temporal_end', log.debug),
        ):
            if v:
                self._remove_from_extra(dataset_dict, key)
                value = helpers.format(v, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                log.warn('No %s Date found for dataset "%s"', key,
                         dataset_dict.get('title', '---'))

        # URI 0..1
        for predicate, key, base_uri in (
            (DCT.accrualPeriodicity, 'frequency', FREQ_BASE_URI),
        ):
            valueRef = self._object_value(dataset_ref, predicate)
            if valueRef:
                self._remove_from_extra(dataset_dict, key)
                value = self._strip_uri(valueRef, base_uri)
                dataset_dict[key] = value
            else:
                log.warn('No %s found for dataset "%s"', predicate,
                         dataset_dict.get('title', '---'))

        # URI lists
        for predicate, key, base_uri in (
            (DCT.language, 'language', LANG_BASE_URI),
            (DCAT.theme, 'theme', THEME_BASE_URI),
        ):
            self._remove_from_extra(dataset_dict, key)
            valueRefList = self._object_value_list(dataset_ref, predicate)
            valueList = [self._strip_uri(valueRef, base_uri)
                         for valueRef in valueRefList]
            value = ','.join(valueList)
            if len(valueList) > 1:
                value = '{' + value + '}'
            dataset_dict[key] = value

        # Spatial
        spatial_tags = []
        geonames_url = None

        for spatial in self.g.objects(dataset_ref, DCT.spatial):
            for spatial_literal in self.g.objects(
                    spatial, DCATAPIT.geographicalIdentifier):
                spatial_value = spatial_literal.value
                if GEO_BASE_URI in spatial_value:
                    spatial_tags.append(
                        self._strip_uri(spatial_value, GEO_BASE_URI))
                else:
                    if geonames_url:
                        log.warn("GeoName URL is already set to %s, value %s will not be imported",
                                 geonames_url, spatial_value)
                    else:
                        geonames_url = spatial_value

        if len(spatial_tags) > 0:
            value = ','.join(spatial_tags)
            if len(spatial_tags) > 1:
                value = '{' + value + '}'
            dataset_dict['geographical_name'] = value

        if geonames_url:
            dataset_dict['geographical_geonames_url'] = geonames_url

        ### Collect strings from multilang fields
        # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...}
        localized_dict = {}

        for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
        ):
            self._collect_multilang_strings(dataset_dict, key, dataset_ref,
                                            predicate, localized_dict)

        # Agents
        for predicate, basekey in (
            (DCT.publisher, 'publisher'),
            (DCT.rightsHolder, 'holder'),
            (DCT.creator, 'creator'),
        ):
            agent_dict, agent_loc_dict = self._parse_agent(
                dataset_ref, predicate, basekey)
            for key, v in agent_dict.iteritems():
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = v
            localized_dict.update(agent_loc_dict)

        # when all localized data have been parsed, check if there really is
        # any and add it to the dict
        if len(localized_dict) > 0:
            log.debug('Found multilang metadata')
            dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict

        ### Resources

        resources_loc_dict = {}

        # In ckan, the license is a dataset property, not the resource's.
        # We'll collect all of the resources' licenses, then we will
        # postprocess them.
        licenses = []  # contains tuples (url, name)

        for resource_dict in dataset_dict.get('resources', []):
            resource_uri = resource_dict['uri']
            if not resource_uri:
                log.warn("URI not defined for resource %s", resource_dict['name'])
                continue

            distribution = URIRef(resource_uri)
            if not (dataset_ref, DCAT.distribution, distribution) in self.g:
                log.warn("Distribution not found in dataset %s", resource_uri)
                continue

            # URI 0..1
            for predicate, key, base_uri in (
                (DCT['format'], 'format', FORMAT_BASE_URI),  # Format
            ):
                valueRef = self._object_value(distribution, predicate)
                if valueRef:
                    value = self._strip_uri(valueRef, base_uri)
                    resource_dict[key] = value
                else:
                    log.warn('No %s found for resource "%s"::"%s"', predicate,
                             dataset_dict.get('title', '---'),
                             resource_dict.get('name', '---'))

            # License
            license = self._object(distribution, DCT.license)
            if license:
                # just add this info in the resource extras
                resource_dict['license_url'] = str(license)
                license_name = self._object_value(license, FOAF.name)  # may be either the title or the id
                if license_name:
                    # just add this info in the resource extras
                    resource_dict['license_name'] = license_name
                else:
                    license_name = "unknown"
                licenses.append((str(license), license_name))
            else:
                log.warn('No license found for resource "%s"::"%s"',
                         dataset_dict.get('title', '---'),
                         resource_dict.get('name', '---'))

            # Multilang
            loc_dict = {}

            for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
            ):
                self._collect_multilang_strings(resource_dict, key, distribution,
                                                predicate, loc_dict)

            if len(loc_dict) > 0:
                log.debug('Found multilang metadata in resource %s',
                          resource_dict['name'])
                resources_loc_dict[resource_uri] = loc_dict

        if len(resources_loc_dict) > 0:
            log.debug('Found multilang metadata in resources')
            dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict

        # postprocess licenses
        # license_ids = {id for url, id in licenses}  # does not work in python 2.6
        license_ids = set()
        for url, id in licenses:
            license_ids.add(id)

        if license_ids:
            if len(license_ids) > 1:
                log.warn('More than one license found for dataset "%s"',
                         dataset_dict.get('title', '---'))
            dataset_dict['license_id'] = license_ids.pop()  # take a random one

        return dataset_dict

    def _collect_multilang_strings(self, base_dict, key, subj, pred, loc_dict):
        '''
        Search for multilang Literals matching (subj, pred).
        - Non-localized Literals will be stored as base_dict[key]
          -- possibly replacing the value set by the EURO parser
        - Localized Literals will be stored into loc_dict[key][lang]
        '''
        for obj in self.g.objects(subj, pred):
            value = obj.value
            lang = obj.language
            if not lang:
                # force default value in dataset
                base_dict[key] = value
            else:
                # add localized string
                lang_dict = loc_dict.setdefault(key, {})
                lang_dict[lang_mapping_xmllang_to_ckan.get(lang)] = value

    def _remove_from_extra(self, dataset_dict, key):
        # search and remove
        for extra in dataset_dict.get('extras', []):
            if extra['key'] == key:
                dataset_dict['extras'].pop(dataset_dict['extras'].index(extra))
                return

    def _add_or_replace_extra(self, dataset_dict, key, value):
        # search and replace
        for extra in dataset_dict.get('extras', []):
            if extra['key'] == key:
                extra['value'] = value
                return
        # add if not found
        dataset_dict['extras'].append({'key': key, 'value': value})

    def _parse_agent(self, subject, predicate, base_name):
        agent_dict = {}
        loc_dict = {}

        for agent in self.g.objects(subject, predicate):
            agent_dict[base_name + '_identifier'] = self._object_value(
                agent, DCT.identifier)
            self._collect_multilang_strings(agent_dict, base_name + '_name',
                                            agent, FOAF.name, loc_dict)

        return agent_dict, loc_dict

    def _strip_uri(self, value, base_uri):
        return value.replace(base_uri, '')

    def graph_from_dataset(self, dataset_dict, dataset_ref):

        title = dataset_dict.get('title')

        g = self.g

        for prefix, namespace in it_namespaces.iteritems():
            g.bind(prefix, namespace)

        ### add a further type for the Dataset node
        g.add((dataset_ref, RDF.type, DCATAPIT.Dataset))

        ### replace themes
        value = self._get_dict_value(dataset_dict, 'theme')
        if value:
            for theme in value.split(','):
                self.g.remove((dataset_ref, DCAT.theme, URIRef(theme)))
                theme = theme.replace('{', '').replace('}', '')
                self.g.add((dataset_ref, DCAT.theme,
                            URIRef(THEME_BASE_URI + theme)))
                self._add_concept(THEME_CONCEPTS, theme)
        else:
            self.g.add((dataset_ref, DCAT.theme,
                        URIRef(THEME_BASE_URI + DEFAULT_THEME_KEY)))
            self._add_concept(THEME_CONCEPTS, DEFAULT_THEME_KEY)

        ### replace languages
        value = self._get_dict_value(dataset_dict, 'language')
        if value:
            for lang in value.split(','):
                self.g.remove((dataset_ref, DCT.language, Literal(lang)))
                lang = lang.replace('{', '').replace('}', '')
                self.g.add((dataset_ref, DCT.language,
                            URIRef(LANG_BASE_URI + lang)))
                # self._add_concept(LANG_CONCEPTS, lang)

        ### add spatial (EU URI)
        value = self._get_dict_value(dataset_dict, 'geographical_name')
        if value:
            for gname in value.split(','):
                gname = gname.replace('{', '').replace('}', '')

                dct_location = BNode()
                self.g.add((dataset_ref, DCT.spatial, dct_location))

                self.g.add((dct_location, RDF['type'], DCT.Location))

                # Try and add a Concept from the spatial vocabulary
                if self._add_concept(GEO_CONCEPTS, gname):
                    self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                                Literal(GEO_BASE_URI + gname)))
                    # geo concept is not really required, but may be a useful addition
                    self.g.add((dct_location, LOCN.geographicalName,
                                URIRef(GEO_BASE_URI + gname)))
                else:
                    # The dataset field is not a controlled tag, let's create
                    # a Concept out of the label we have
                    concept = BNode()
                    self.g.add((concept, RDF['type'], SKOS.Concept))
                    self.g.add((concept, SKOS.prefLabel, Literal(gname)))
                    self.g.add((dct_location, LOCN.geographicalName, concept))

        ### add spatial (GeoNames)
        value = self._get_dict_value(dataset_dict, 'geographical_geonames_url')
        if value:
            dct_location = BNode()
            self.g.add((dataset_ref, DCT.spatial, dct_location))

            self.g.add((dct_location, RDF['type'], DCT.Location))
            self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                        Literal(value)))

        ### replace periodicity
        self._remove_node(dataset_dict, dataset_ref,
                          ('frequency', DCT.accrualPeriodicity, None, Literal))
        self._add_uri_node(dataset_dict, dataset_ref,
                           ('frequency', DCT.accrualPeriodicity, DEFAULT_FREQ_CODE, URIRef),
                           FREQ_BASE_URI)
        # self._add_concept(FREQ_CONCEPTS, dataset_dict.get('frequency', DEFAULT_VOCABULARY_KEY))

        ### replace landing page
        self._remove_node(dataset_dict, dataset_ref,
                          ('url', DCAT.landingPage, None, URIRef))
        landing_page_uri = None
        if dataset_dict.get('name'):
            landing_page_uri = '{0}/dataset/{1}'.format(
                catalog_uri().rstrip('/'), dataset_dict['name'])
        else:
            landing_page_uri = dataset_uri(dataset_dict)  # TODO: preserve original URI if harvested
        self.g.add((dataset_ref, DCAT.landingPage, URIRef(landing_page_uri)))

        ### conformsTo
        self.g.remove((dataset_ref, DCT.conformsTo, None))
        value = self._get_dict_value(dataset_dict, 'conforms_to')
        if value:
            for item in value.split(','):
                standard = BNode()
                self.g.add((dataset_ref, DCT.conformsTo, standard))
                self.g.add((standard, RDF['type'], DCT.Standard))
                self.g.add((standard, RDF['type'], DCATAPIT.Standard))
                self.g.add((standard, DCT.identifier, Literal(item)))

        ### publisher
        # DCAT by default creates this node
        # <dct:publisher>
        #   <foaf:Organization rdf:about="http://10.10.100.75/organization/55535226-f82a-4cf7-903a-3e10afeaa79a">
        #     <foaf:name>orga2_test</foaf:name>
        #   </foaf:Organization>
        # </dct:publisher>
        for s, p, o in g.triples((dataset_ref, DCT.publisher, None)):
            # log.info("Removing publisher %r", o)
            g.remove((s, p, o))

        self._add_agent(dataset_dict, dataset_ref, 'publisher', DCT.publisher)

        ### Rights holder : Agent
        holder_ref = self._add_agent(dataset_dict, dataset_ref, 'holder',
                                     DCT.rightsHolder)

        ### Author : Agent
        self._add_agent(dataset_dict, dataset_ref, 'creator', DCT.creator)

        ### Point of Contact
        # <dcat:contactPoint rdf:resource="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri"/>
        # <!-- http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri -->
        # <dcatapit:Organization rdf:about="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri">
        #   <rdf:type rdf:resource="&vcard;Kind"/>
        #   <rdf:type rdf:resource="&vcard;Organization"/>
        #   <vcard:hasEmail rdf:resource="mailto:[email protected]"/>
        #   <vcard:fn>Regione Liguria - Sportello Cartografico</vcard:fn>
        # </dcatapit:Organization>

        # TODO: preserve original info if harvested

        # retrieve the contactPoint added by the euro serializer
        euro_poc = g.value(subject=dataset_ref, predicate=DCAT.contactPoint,
                           object=None, any=False)

        # euro poc has this format:
        # <dcat:contactPoint>
        #   <vcard:Organization rdf:nodeID="Nfcd06f452bcd41f48f33c45b0c95979e">
        #     <vcard:fn>THE ORGANIZATION NAME</vcard:fn>
        #     <vcard:hasEmail>THE ORGANIZATION EMAIL</vcard:hasEmail>
        #   </vcard:Organization>
        # </dcat:contactPoint>

        if euro_poc:
            g.remove((dataset_ref, DCAT.contactPoint, euro_poc))

        org_id = dataset_dict.get('organization', {}).get('id')

        # get orga info
        org_show = logic.get_action('organization_show')
        try:
            org_dict = org_show({}, {
                'id': org_id,
                'include_datasets': False,
                'include_tags': False,
                'include_users': False,
                'include_groups': False,
                'include_extras': True,
                'include_followers': False
            })
        except Exception, e:
            org_dict = {}

        org_uri = organization_uri(org_dict)

        poc = URIRef(org_uri)
        g.add((dataset_ref, DCAT.contactPoint, poc))
        g.add((poc, RDF.type, DCATAPIT.Organization))
        g.add((poc, RDF.type, VCARD.Kind))
        g.add((poc, RDF.type, VCARD.Organization))

        g.add((poc, VCARD.fn, Literal(org_dict.get('name'))))

        if 'email' in org_dict.keys():
            # this element is mandatory for dcatapit, but it may not have
            # been filled for imported datasets
            g.add((poc, VCARD.hasEmail, URIRef(org_dict.get('email'))))
        if 'telephone' in org_dict.keys():
            g.add((poc, VCARD.hasTelephone, Literal(org_dict.get('telephone'))))
        if 'site' in org_dict.keys():
            g.add((poc, VCARD.hasURL, Literal(org_dict.get('site'))))

        ### Multilingual
        # Add localized entries in dataset
        # TODO: should we remove the non-localized nodes?

        loc_dict = interfaces.get_for_package(dataset_dict['id'])

        # The multilang fields
        loc_package_mapping = {
            'title': (dataset_ref, DCT.title),
            'notes': (dataset_ref, DCT.description),
            'holder_name': (holder_ref, FOAF.name)
        }

        self._add_multilang_values(loc_dict, loc_package_mapping)

        ### Resources
        for resource_dict in dataset_dict.get('resources', []):
            distribution = URIRef(resource_uri(resource_dict))  # TODO: preserve original info if harvested

            # Add the DCATAPIT type
            g.add((distribution, RDF.type, DCATAPIT.Distribution))

            ### format
            self._remove_node(resource_dict, distribution,
                              ('format', DCT['format'], None, Literal))
            if not self._add_uri_node(resource_dict, distribution,
                                      ('distribution_format', DCT['format'], None, URIRef),
                                      FORMAT_BASE_URI):
                guessed_format = guess_format(resource_dict)
                if guessed_format:
                    self.g.add((distribution, DCT['format'],
                                URIRef(FORMAT_BASE_URI + guessed_format)))
                else:
                    log.warn('No format for resource: %s / %s',
                             dataset_dict.get('title', 'N/A'),
                             resource_dict.get('description', 'N/A'))
                    self.g.add((distribution, DCT['format'],
                                URIRef(FORMAT_BASE_URI + DEFAULT_FORMAT_CODE)))

            ### license
            # <dct:license rdf:resource="http://creativecommons.org/licenses/by/3.0/it/"/>
            #
            # <dcatapit:LicenseDocument rdf:about="http://creativecommons.org/licenses/by/3.0/it/">
            #   <rdf:type rdf:resource="&dct;LicenseDocument"/>
            #   <owl:versionInfo>3.0 ITA</owl:versionInfo>
            #   <foaf:name>CC BY</foaf:name>
            #   <dct:type rdf:resource="http://purl.org/adms/licencetype/Attribution"/>
            # </dcatapit:LicenseDocument>

            # "license_id" : "cc-zero"
            # "license_title" : "Creative Commons CCZero",
            # "license_url" : "http://www.opendefinition.org/licenses/cc-zero",

            license_url = dataset_dict.get('license_url', '')
            license_id = dataset_dict.get('license_id', '')
            license_title = dataset_dict.get('license_title', '')

            if license_url:
                license = URIRef(license_url)
                g.add((license, RDF['type'], DCATAPIT.LicenseDocument))
                g.add((license, RDF['type'], DCT.LicenseDocument))
                g.add((license, DCT['type'],
                       URIRef('http://purl.org/adms/licencetype/Attribution')))  # TODO: infer from CKAN license
                g.add((distribution, DCT.license, license))

                if license_id:
                    # log.debug('Adding license id: %s', license_id)
                    g.add((license, FOAF.name, Literal(license_id)))
                elif license_title:
                    # log.debug('Adding license title: %s', license_title)
                    g.add((license, FOAF.name, Literal(license_title)))
                else:
                    g.add((license, FOAF.name, Literal('unknown')))
                    log.warn('License not found for dataset: %s', title)

            ### Multilingual
            # Add localized entries in resource
            # TODO: should we remove the non-localized nodes?

            loc_dict = interfaces.get_for_resource(resource_dict['id'])

            # The multilang fields
            loc_resource_mapping = {
                'name': (distribution, DCT.title),
                'description': (distribution, DCT.description),
            }
            self._add_multilang_values(loc_dict, loc_resource_mapping)
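For reference, the `{a,b}` brace encoding that `parse_dataset` writes and `graph_from_dataset` later splits apart round-trips like this (a small sketch; the values are illustrative, only `LANG_BASE_URI` follows the EU vocabulary convention used above):

# Illustrative round-trip of the multi-value encoding used by this profile.
LANG_BASE_URI = 'http://publications.europa.eu/resource/authority/language/'

refs = [LANG_BASE_URI + 'ITA', LANG_BASE_URI + 'DEU']
codes = [ref.replace(LANG_BASE_URI, '') for ref in refs]  # what _strip_uri does

value = ','.join(codes)
if len(codes) > 1:
    value = '{' + value + '}'  # multi-valued fields are wrapped in braces

assert value == '{ITA,DEU}'
# graph_from_dataset later splits on ',' and strips '{' / '}' before
# prepending the base URI again.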
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None),
        ('notes', DCT.description, None),
        ('url', DCAT.landingPage, None),
        ('identifier', DCT.identifier, ['guid', 'id']),
        ('version', OWL.versionInfo, ['dcat_version']),
        ('version_notes', ADMS.versionNotes, None),
        ('frequency', DCT.accrualPeriodicity, None),
        ('access_rights', DCT.accessRights, None),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created']),
        ('modified', DCT.modified, ['metadata_modified']),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None),
        ('theme', DCAT.theme, None),
        ('conforms_to', DCT.conformsTo, None),
        ('alternate_identifier', ADMS.identifier, None),
        ('documentation', FOAF.page, None),
        ('related_resource', DCT.relation, None),
        ('has_version', DCT.hasVersion, None),
        ('is_version_of', DCT.isVersionOf, None),
        ('source', DCT.source, None),
        ('sample', ADMS.sample, None),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer', 'author']),
            ('contact_email', VCARD.hasEmail, ['maintainer_email', 'author_email']),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']
        g.add((publisher_details, FOAF.name, Literal(publisher_name)))

        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None),
            ('publisher_url', FOAF.homepage, None),
            ('publisher_type', DCT.type, None),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom), decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None),
            ('description', DCT.description, None),
            ('status', ADMS.status, None),
            ('rights', DCT.rights, None),
            ('license', DCT.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None),
            ('language', DCT.language, None),
            ('conforms_to', DCT.conformsTo, None),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType, Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'], Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType, Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, Literal(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None),
            ('modified', DCT.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']), datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize, Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))
            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(resource_dict['hash_algorithm'])))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))
            g.add((distribution, SPDX.checksum, checksum))
def graph_from_dataset(self, dataset_dict, dataset_ref):
    log.debug("ODMDCATBasicProfileDataset graph_from_dataset")

    g = self.g

    namespaces = odm_rdf_helper.get_namespaces_by_dataset_type(
        dataset_dict['type'])
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, DCT.identifier, Literal(dataset_dict.get('id'))))
    g.add((dataset_ref, DCT.type,
           Literal(dataset_dict.get('type', 'dataset'))))
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    items = [(dataset_ref, DCT.title,
              dataset_dict.get('title_translated')
              or dataset_dict.get('title')),
             (dataset_ref, DCT.description,
              dataset_dict.get('notes_translated')
              or dataset_dict.get('notes'))]
    raw_triples = odm_rdf_helper.get_triples_by_dataset_type(
        dataset_ref, dataset_dict, dataset_dict['type'])
    raw_triples.extend(items)

    for raw_triple in raw_triples:
        triples = odm_rdf_helper.split_multilingual_object_into_triples(
            raw_triple)
        for triple in triples:
            g.add(triple)

    # Organization
    organization = dataset_dict.get('organization')
    g.add((dataset_ref, FOAF.organization,
           URIRef(config.get('ckan.site_url') + "organization/"
                  + organization['name'])))

    # License
    license = URIRef(dataset_dict.get('license_url'))
    g.add((license, DCT.title, Literal(dataset_dict.get('license_title'))))
    g.add((dataset_ref, DCT.license, license))

    # odm_spatial_range
    for item in dataset_dict.get('odm_spatial_range', []):
        iso3_code = odm_rdf_helper.map_country_code_iso2_iso3(item.upper())
        g.add((dataset_ref, GN.countrycode,
               URIRef("http://data.landportal.info/geo/" + iso3_code)))

    # Taxonomy
    for term in dataset_dict.get('taxonomy', []):
        matches = odm_rdf_helper.map_internal_to_standard_taxonomic_term(term)

        if isinstance(matches, basestring):
            g.add((dataset_ref, FOAF.topic, Literal(matches)))
        else:
            node = BNode()
            if 'exact_match' in matches:
                node = URIRef(matches['exact_match'])
            if 'broad_matches' in matches:
                for broad_match in matches['broad_matches']:
                    g.add((node, SKOS.broadMatch, URIRef(broad_match)))
            g.add((node, DCT.title, Literal(term)))
            g.add((dataset_ref, FOAF.topic, node))

    # Language
    for item in dataset_dict.get('odm_language', []):
        g.add((dataset_ref, DC.language, Literal(item.upper())))

    # Dates
    try:
        items = odm_rdf_helper.get_date_fields_by_dataset_type(
            dataset_dict['type'])
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)
    except ValueError:
        log.debug("Error adding date triples for dataset "
                  + dataset_dict['id'])

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        # Use the lowercase dcat:distribution property here; dcat:Distribution
        # is the class, asserted via rdf:type on the next line.
        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        items = [(distribution, DCT.title,
                  resource_dict.get('name_translated')
                  or resource_dict.get('name')),
                 (distribution, DCT.description,
                  resource_dict.get('description_translated')
                  or resource_dict.get('description'))]
        for item in items:
            triples = odm_rdf_helper.split_multilingual_object_into_triples(item)
            for triple in triples:
                g.add(triple)

        try:
            self._add_triples_from_dict(resource_dict, distribution, items)
        except ValueError:
            log.debug("Error adding triples for dataset "
                      + dataset_dict['id'])

        # Language
        for item in resource_dict.get('odm_language', []):
            g.add((distribution, DC.language, Literal(item.upper())))

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        # A plain url that differs from download_url is the access URL,
        # not a second download URL.
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, URIRef(url)))
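# The odm_rdf_helper.split_multilingual_object_into_triples helper used above
# is not shown in this document. A purely hypothetical sketch of the assumed
# behavior: a raw triple whose object is a {lang: text} dict is expanded into
# one language-tagged Literal per language, and plain values pass through.
from rdflib import Literal

def split_multilingual_object_into_triples(raw_triple):
    # Assumption: raw_triple is (subject, predicate, object), where object
    # may be a multilingual dict or a plain string.
    subject, predicate, obj = raw_triple
    if isinstance(obj, dict):
        return [(subject, predicate, Literal(text, lang=lang))
                for lang, text in obj.items() if text]
    return [(subject, predicate, Literal(obj))] if obj else []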
def parse_dataset(self, dataset_dict, dataset_ref):
    """Transforms DCAT-AP.de data into a CKAN dataset dict."""

    # Simple additional fields
    for key, predicate in (
            ('qualityProcessURI', DCATDE.qualityProcessURI),
            ('metadata_original_html', DCAT.landingPage),
            ('politicalGeocodingLevelURI', DCATDE.politicalGeocodingLevelURI),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            ds_utils.insert_new_extras_field(dataset_dict, key, value)

    # List fields
    for key, predicate in (
            ('contributorID', DCATDE.contributorID),
            ('politicalGeocodingURI', DCATDE.politicalGeocodingURI),
            ('legalbasisText', DCATDE.legalbasisText),
            ('geocodingText', DCATDE.geocodingText),
    ):
        values = self._object_value_list(dataset_ref, predicate)
        if values:
            ds_utils.insert_new_extras_field(dataset_dict, key,
                                             json.dumps(values))

    self._parse_contact(dataset_dict, dataset_ref, DCATDE.originator,
                        'originator', True)
    self._parse_contact(dataset_dict, dataset_ref, DCATDE.maintainer,
                        'maintainer', False)
    self._parse_contact(dataset_dict, dataset_ref, DCT.contributor,
                        'contributor', True)
    self._parse_contact(dataset_dict, dataset_ref, DCT.creator,
                        'author', False)

    # dcat:contactPoint
    # TODO: dcat-ap adds the values to extras.contact_... .
    # Maybe better than maintainer?
    contact = self._object(dataset_ref, DCAT.contactPoint)
    self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

    contact_tel = self._object_value(contact, VCARD.hasTelephone)
    if contact_tel:
        ds_utils.insert(dataset_dict, 'maintainer_tel',
                        self._without_tel(contact_tel), True)

    self._add_maintainer_field(dataset_dict, contact, 'street',
                               VCARD.hasStreetAddress)
    self._add_maintainer_field(dataset_dict, contact, 'city',
                               VCARD.hasLocality)
    self._add_maintainer_field(dataset_dict, contact, 'zip',
                               VCARD.hasPostalCode)
    self._add_maintainer_field(dataset_dict, contact, 'country',
                               VCARD.hasCountryName)

    # Groups
    groups = self._get_dataset_value(dataset_dict, 'groups')
    if not groups:
        groups = []

    for obj in self.g.objects(dataset_ref, DCAT.theme):
        current_theme = unicode(obj)
        if current_theme.startswith(dcat_theme_prefix):
            group = current_theme.replace(dcat_theme_prefix, '').lower()
            groups.append({'id': group, 'name': group})

    dataset_dict['groups'] = groups

    # Add additional distribution fields
    for distribution in self.g.objects(dataset_ref, DCAT.distribution):
        for resource_dict in dataset_dict.get('resources', []):
            # Match the distribution in the graph with the distribution
            # in the CKAN dict
            if unicode(distribution) == resource_uri(resource_dict):
                for key, predicate in (
                        ('licenseAttributionByText',
                         DCATDE.licenseAttributionByText),
                        ('plannedAvailability',
                         DCATDE.plannedAvailability),
                ):
                    value = self._object_value(distribution, predicate)
                    if value:
                        ds_utils.insert_resource_extra(resource_dict,
                                                       key, value)

    return dataset_dict
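# A minimal sketch of driving this parse_dataset through ckanext-dcat's
# RDFParser. The profile name 'dcatap_de' and the file name are assumptions;
# use whatever name the plugin actually registers.
from ckanext.dcat.processors import RDFParser

with open('catalog.ttl') as f:
    parser = RDFParser(profiles=['dcatap_de'])
    parser.parse(f.read(), _format='ttl')
    for dataset_dict in parser.datasets():
        # Each yielded dict carries the extras inserted above,
        # e.g. contributorID or politicalGeocodingURI.
        print(dataset_dict.get('extras', []))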
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # Dataset
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    ## Simple values
    items = [
        ("title", DCTERMS.title, None, Literal),
        ("name", DCTERMS.identifier, None, Literal),
        ("author", DC.creator, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Description
    dataset_desc = dataset_dict.get("notes")
    if dataset_desc:
        dataset_desc_value = markdown_extract(dataset_desc,
                                              extract_length=0)
        # Use the extracted plain text, not the raw markdown
        g.add((dataset_ref, DCTERMS.description,
               Literal(dataset_desc_value)))

    ## Language
    langs = dataset_dict.get("language")
    if langs:
        for lang in langs:
            language_uri = LANG_PREFIX + lang
            g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

    ## Tags
    for tag in dataset_dict.get("tags", []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

    ## Wikidata keywords
    for keyword in dataset_dict.get("keywords", []):
        g.add((dataset_ref, DCAT.theme, WD[keyword]))

    ## Data Type
    data_types = dataset_dict.get("data_type")
    if data_types:
        for data_type in data_types:
            g.add((dataset_ref, DCTERMS.type,
                   URIRef(DATA_TYPE_PREFIX + data_type)))

    ## Temporal Resolution
    temp_res = dataset_dict.get("temp_res")
    temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
    if temp_res:
        temp_res_value = temp_res_mapping[temp_res]
        g.add((dataset_ref, DCAT.temporalResolution,
               Literal(temp_res_value, datatype=XSD.duration)))

    ## Start Time, End Time, and Created Time
    items = [
        ("start_time", SCHEMA.startDate, None, Literal),
        ("end_time", SCHEMA.endDate, None, Literal),
        ("created_time", DCTERMS.issued, None, Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Spatial Coverage
    spatial = dataset_dict.get("spatial")
    x_min = dataset_dict.get("x_min")
    x_max = dataset_dict.get("x_max")
    y_min = dataset_dict.get("y_min")
    y_max = dataset_dict.get("y_max")

    if any([spatial, x_min, x_max, y_min, y_max]):
        spatial_ref = BNode()
        g.add((spatial_ref, RDF.type, DCTERMS.Location))
        g.add((dataset_ref, DCTERMS.spatial, spatial_ref))

        if spatial:
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial, datatype=GEOJSON_IMT)))

        if x_min and x_max and y_min and y_max:
            box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
            box_ref = BNode()
            g.add((box_ref, RDF.type, SCHEMA.GeoShape))
            g.add((box_ref, SCHEMA.box, Literal(box_value)))
            g.add((spatial_ref, LOCN.geometry, box_ref))

    ## Spatial Resolution
    spatial_res = dataset_dict.get("spatial_res")
    if spatial_res:
        g.add((dataset_ref, DCAT.spatialResolutionInMeters,
               Literal(spatial_res, datatype=XSD.decimal)))

    ## Process Step
    proc_step = dataset_dict.get("process_step")
    if proc_step:
        proc_step_value = markdown_extract(proc_step, extract_length=0)
        proc_ref = BNode()
        g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
        g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
        g.add((dataset_ref, DCTERMS.provenance, proc_ref))

    ## Project details
    project = dataset_dict.get("organization")
    if project:
        project["description"] = markdown_extract(project["description"],
                                                  extract_length=0)
        project_details = BNode()
        g.add((project_details, RDF.type, ORG.Organization))
        g.add((dataset_ref, DCTERMS.publisher, project_details))
        items = [
            ("title", FOAF.name, None, Literal),
            ("description", ORG.purpose, None, Literal),
        ]
        self._add_triples_from_dict(project, project_details, items)

    ## Contact details
    contact_person = dataset_dict.get("contact_person")
    contact_email = dataset_dict.get("contact_email")
    if any([contact_person, contact_email]):
        contact_details = BNode()
        g.add((contact_details, RDF.type, VCARD.Individual))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.fn, "contact_person")
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.hasEmail, "contact_email",
                                   _type=URIRef,
                                   value_modifier=self._add_mailto)

    ## Theme
    themes = dataset_dict.get("groups")
    if themes:
        for theme in themes:
            theme_details = BNode()
            g.add((theme_details, RDF.type, SKOS.Concept))
            g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
            g.add((dataset_ref, DCAT.theme, theme_details))

    # Resources
    ## Depositar defines the license at the dataset level
    license = dataset_dict.get("license_url")

    for resource_dict in dataset_dict.get("resources", []):
        distribution = CleanedURIRef(resource_uri(resource_dict))
        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        ## Simple values
        items = [
            ("name", DCTERMS.title, None, Literal),
            ("description", DCTERMS.description, None, Literal),
            ("encoding", CNT.characterEncoding, None, Literal),
            ("url", DCAT.downloadURL, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        ## License
        if license:
            g.add((distribution, DCTERMS.license, URIRef(license)))

        ## Coordinate Systems
        crs = resource_dict.get("resource_crs")
        if crs:
            crs_value = EPSG_PREFIX + str(crs)
            g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))

        ## Format (mimetype)
        mimetype = resource_dict.get("mimetype")
        if mimetype:
            mimetype_value = IMT_PREFIX + mimetype
            g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))
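# The profile above references several module-level prefix constants
# (LANG_PREFIX, DATA_TYPE_PREFIX, EPSG_PREFIX, IMT_PREFIX) defined outside
# this excerpt. Plausible definitions for illustration only; the URIs the
# extension actually uses may differ.
LANG_PREFIX = 'http://id.loc.gov/vocabulary/iso639-1/'        # e.g. ...iso639-1/en
DATA_TYPE_PREFIX = 'https://www.wikidata.org/wiki/'           # assumed project vocabulary
EPSG_PREFIX = 'http://www.opengis.net/def/crs/EPSG/0/'        # e.g. .../EPSG/0/4326
IMT_PREFIX = 'https://www.iana.org/assignments/media-types/'  # e.g. .../text/csv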
def test_distribution_fields(self):

    resource = {
        'id': 'c041c635-054f-4431-b647-f9186926d021',
        'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'CSV file',
        'description': 'A CSV file',
        'url': 'http://example.com/data/file.csv',
        'status': 'http://purl.org/adms/status/Completed',
        'rights': 'Some statement about rights',
        'license': 'http://creativecommons.org/licenses/by/3.0/',
        'issued': '2015-06-26T15:21:09.034694',
        'modified': '2015-06-26T15:21:09.075774',
        'size': 1234,
        'documentation': '["http://dataset.info.org/distribution1/doc1", "http://dataset.info.org/distribution1/doc2"]',
        'language': '["en", "es", "ca"]',
        'conforms_to': '["Standard 1", "Standard 2"]',
        'hash': '4304cf2e751e6053c90b1804c89c0ebb758f395a',
        'hash_algorithm': 'http://spdx.org/rdf/terms#checksumAlgorithm_sha1',
    }

    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Test DCAT dataset',
        'resources': [
            resource
        ]
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1)

    # URI
    distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
    eq_(unicode(distribution), utils.resource_uri(resource))

    # Basic fields
    assert self._triple(g, distribution, RDF.type, DCAT.Distribution)
    assert self._triple(g, distribution, DCT.title, resource['name'])
    assert self._triple(g, distribution, DCT.description, resource['description'])
    assert self._triple(g, distribution, DCT.rights, resource['rights'])
    assert self._triple(g, distribution, DCT.license, resource['license'])
    assert self._triple(g, distribution, ADMS.status, resource['status'])

    # Lists
    for item in [
        ('documentation', FOAF.page),
        ('language', DCT.language),
        ('conforms_to', DCT.conformsTo),
    ]:
        values = json.loads(resource[item[0]])
        eq_(len([t for t in g.triples((distribution, item[1], None))]),
            len(values))
        for value in values:
            assert self._triple(g, distribution, item[1], value)

    # Dates
    assert self._triple(g, distribution, DCT.issued,
                        resource['issued'], XSD.dateTime)
    assert self._triple(g, distribution, DCT.modified,
                        resource['modified'], XSD.dateTime)

    # Numbers
    assert self._triple(g, distribution, DCAT.byteSize,
                        float(resource['size']), XSD.decimal)

    # Checksum
    checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
    assert checksum
    assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'],
                        data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
    assert self._triple(g, checksum, SPDX.algorithm,
                        URIRef(resource['hash_algorithm']))
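# The assertions above rely on a _triple helper from the test base class,
# which is not shown in this document. A minimal sketch of the assumed
# behavior: return the first matching triple, or None, optionally coercing
# a plain Python value into a Literal with the given datatype.
from rdflib import Literal, URIRef

def _triple(self, graph, subject, predicate, _object, data_type=None):
    # Assumption: plain values are wrapped as Literals before matching;
    # None means "match any object".
    if _object is not None and not isinstance(_object, (URIRef, Literal)):
        datatype = URIRef(data_type) if data_type else None
        _object = Literal(_object, datatype=datatype)
    triples = list(graph.triples((subject, predicate, _object)))
    return triples[0] if triples else None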
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # dct:title
    titles = (t for t in self._get_dataset_value(
        dataset_dict, 'title_translated').values() if t)
    for title in titles:
        g.add((dataset_ref, DCT.title, Literal(title)))

    # dct:description
    descriptions = (d for d in self._get_dataset_value(
        dataset_dict, 'notes_translated').values() if d)
    for description in descriptions:
        g.add((dataset_ref, DCT.description, Literal(description)))

    # dcat:contactPoint
    contact_details = BNode()
    g.add((contact_details, RDF.type, VCARD.Organization))
    g.add((dataset_ref, DCAT.contactPoint, contact_details))
    self._add_triple_from_dict(dataset_dict, contact_details,
                               VCARD.fn, 'maintainer')
    self._add_triple_from_dict(dataset_dict, contact_details,
                               VCARD.hasEmail, 'maintainer_email',
                               _type=URIRef,
                               value_modifier=self._add_mailto)
    # Note: the vCard property is hasURL (upper-case URL)
    self._add_triple_from_dict(dataset_dict, contact_details,
                               VCARD.hasURL, 'maintainer_website',
                               _type=URIRef)

    # dcat:distribution
    for resource_dict in self._get_dataset_value(dataset_dict, 'resources'):
        distribution = BNode()
        g.add((distribution, RDF.type, DCAT.Distribution))
        g.add((dataset_ref, DCAT.distribution, distribution))

        titles = (t for t in set(
            resource_dict.get('name_translated').values()) if t)
        for title in titles:
            g.add((distribution, DCT.title, Literal(title)))

        descriptions = (d for d in set(
            resource_dict.get('description_translated').values()) if d)
        for description in descriptions:
            g.add((distribution, DCT.description, Literal(description)))

        # The DCAT properties are accessURL and downloadURL
        # (upper-case URL)
        g.add((distribution, DCAT.accessURL,
               URIRef(resource_uri(resource_dict))))
        resource_url = resource_dict.get('url')
        if resource_url:
            g.add((distribution, DCAT.downloadURL, URIRef(resource_url)))

    # dcat:keyword
    keywords = set(
        keyword
        for keyword_language in dataset_dict.get('keywords', {}).values()
        for keyword in keyword_language)
    for keyword in keywords:
        g.add((dataset_ref, DCAT.keyword, Literal(keyword)))

    # dct:publisher
    context = {'user': p.c.user}
    organization = p.get_action('organization_show')(
        context, data_dict={'id': dataset_dict['owner_org']})
    publisher = URIRef(p.url_for(controller='organization',
                                 action='read',
                                 id=organization['id'],
                                 qualified=True))
    g.add((publisher, RDF.type, FOAF.Organization))
    g.add((dataset_ref, DCT.publisher, publisher))
    organization_titles = (t for t in organization.get(
        'title_translated', {}).values() if t)
    for title in organization_titles:
        g.add((publisher, FOAF.name, Literal(title)))
    self._add_triple_from_dict(organization, publisher,
                               FOAF.homepage, 'homepage')

    # dcat:theme
    groups = dataset_dict.get('groups', [])
    for group_item in groups:
        group_dict = p.get_action('group_show')(
            context, data_dict={'id': group_item['id']})
        theme = URIRef(p.url_for(controller='group',
                                 action='read',
                                 id=group_dict['id'],
                                 qualified=True))
        g.add((theme, RDF.type, SKOS.Concept))
        g.add((dataset_ref, DCAT.theme, theme))
        group_titles = (t for t in group_dict.get(
            'title_translated', {}).values() if t)
        for title in group_titles:
            g.add((theme, SKOS.prefLabel, Literal(title)))
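# Two of the contact blocks above pass value_modifier=self._add_mailto when
# adding vcard:hasEmail. A similar helper exists in ckanext-dcat, but its
# exact signature is not shown here; a minimal sketch under the assumption
# that it simply prefixes the address with the mailto: scheme if missing.
def _add_mailto(self, mail_addr):
    # Assumed behavior: idempotent mailto: prefixing
    if unicode(mail_addr).startswith(u'mailto:'):
        return mail_addr
    return u'mailto:' + mail_addr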