def _publisher_graph(self, dataset_ref, dataset_dict):
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
        self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))
        self.g.add((dataset_ref, SCHEMA.sourceOrganization, publisher_details))  # noqa

        publisher_name = self._get_dataset_value(dataset_dict,
                                                 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']
            self._add_multilang_value(publisher_details, SCHEMA.name,
                                      multilang_values=publisher_name)
        else:
            self.g.add((publisher_details, SCHEMA.name,
                        Literal(publisher_name)))  # noqa

        contact_point = BNode()
        self.g.add((publisher_details, SCHEMA.contactPoint, contact_point))
        self.g.add((contact_point, SCHEMA.contactType,
                    Literal('customer service')))  # noqa

        publisher_url = self._get_dataset_value(dataset_dict,
                                                'publisher_url')  # noqa
        if not publisher_url and dataset_dict.get('organization'):
            publisher_url = dataset_dict['organization'].get(
                'url') or config.get('ckan.site_url', '')  # noqa

        self.g.add((contact_point, SCHEMA.url, Literal(publisher_url)))

        items = [
            ('publisher_email', SCHEMA.email,
             ['contact_email', 'maintainer_email', 'author_email'], Literal),  # noqa
            ('publisher_name', SCHEMA.name,
             ['contact_name', 'maintainer', 'author'], Literal),  # noqa
        ]

        self._add_triples_from_dict(dataset_dict, contact_point, items)
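# For orientation, a rough sketch (not actual output) of the triples the
# method above produces for a dataset whose organization supplies the
# publisher details. The input values are hypothetical, and the exact
# publisher node (URIRef vs. blank node) depends on
# publisher_uri_from_dataset_dict:
#
#     dataset_dict = {
#         'organization': {'title': 'Example Office',
#                          'url': 'https://example.org'},
#         'publisher_email': 'info@example.org',
#     }
#
# yields, approximately, in Turtle:
#
#     <dataset> schema:publisher ?publisher ;
#               schema:sourceOrganization ?publisher .
#     ?publisher a schema:Organization ;
#                schema:name "Example Office" ;
#                schema:contactPoint [
#                    schema:contactType "customer service" ;
#                    schema:url "https://example.org" ;
#                    schema:email "info@example.org" ] .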
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # -- start
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None, Literal),
        ('notes', DCT.description, None, Literal),
        ('url', DCAT.landingPage, None, URIRef),
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, URIRef),
        ('subject', DCT.subject, None, URIRef),
        # Mentioned in the vocabulary
        ('provenance', DCT.provenance, None, URIRef),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None, URIRef),
        ('theme', DCAT.theme, None, URIRef),
        ('spatial_uri', DCT.spatial, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, URIRef),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, URIRef),
        ('access_rights', DCT.accessRights, None, URIRef),
        ('related_resource', DCT.relation, None, URIRef),
        ('has_version', DCT.hasVersion, None, Literal),
        ('is_version_of', DCT.isVersionOf, None, Literal),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Kind))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer'], Literal),
            ('contact_email', VCARD.hasEmail, ['maintainer_email'],
             Literal),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        self._get_dataset_value(dataset_dict, 'publisher_identifier'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict,
                                                 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None, Literal),
            ('publisher_identifier', DCT.identifier, None, Literal),
            ('publisher_url', FOAF.homepage, None, URIRef),
            ('publisher_type', DCT.type, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()

        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # parts - has part/is part of
    if any([
        self._get_dataset_value(dataset_dict, 'has_part'),
        self._get_dataset_value(dataset_dict, 'is_part_of'),
    ]):
        items = [
            ('has_part', DCT.hasPart, None, URIRef),
            ('is_part_of', DCT.isPartOf, None, URIRef),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri:
        spatial_uri = get_spatial_uri(spatial_uri)  # map from code to URI

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref,
                   LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom),
                                         decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):

        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        if 'license' not in resource_dict and 'license_id' in dataset_dict:
            lr = LicenseRegister()
            _license = lr.get(dataset_dict['license_id'])
            if _license:
                resource_dict['license'] = _license.url

        # Simple values
        items = [
            ('name', DCT.title, None, Literal),
            ('description', DCT.description, None, Literal),
            ('status', ADMS.status, None, Literal),
            ('rights', DCT.rights, None, Literal),
            ('license', DCT.license, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None, URIRef),
            ('language', DCT.language, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, URIRef),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, URIRef(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(resource_dict['hash_algorithm'])))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))

            g.add((distribution, SPDX.checksum, checksum))
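# A minimal sketch of how a profile's graph_from_dataset is typically
# exercised end to end via ckanext-dcat's serializer. The profile name
# 'euro_dcat_ap' is an assumption here; it depends on how the profile class
# above is registered:
#
#     from ckanext.dcat.processors import RDFSerializer
#
#     serializer = RDFSerializer(profiles=['euro_dcat_ap'])
#     ttl = serializer.serialize_dataset(dataset_dict, _format='ttl')
#     print(ttl)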
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None),
        ('notes', DCT.description, None),
        ('url', DCAT.landingPage, None),
        ('identifier', DCT.identifier, ['guid', 'id']),
        ('version', OWL.versionInfo, ['dcat_version']),
        ('version_notes', ADMS.versionNotes, None),
        ('frequency', DCT.accrualPeriodicity, None),
        ('access_rights', DCT.accessRights, None),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created']),
        ('modified', DCT.modified, ['metadata_modified']),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None),
        ('theme', DCAT.theme, None),
        ('conforms_to', DCT.conformsTo, None),
        ('alternate_identifier', ADMS.identifier, None),
        ('documentation', FOAF.page, None),
        ('related_resource', DCT.relation, None),
        ('has_version', DCT.hasVersion, None),
        ('is_version_of', DCT.isVersionOf, None),
        ('source', DCT.source, None),
        ('sample', ADMS.sample, None),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer', 'author']),
            ('contact_email', VCARD.hasEmail, ['maintainer_email',
                                               'author_email']),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict,
                                                 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None),
            ('publisher_url', FOAF.homepage, None),
            ('publisher_type', DCT.type, None),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()

        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref,
                   LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom),
                                         decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):

        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None),
            ('description', DCT.description, None),
            ('status', ADMS.status, None),
            ('rights', DCT.rights, None),
            ('license', DCT.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None),
            ('language', DCT.language, None),
            ('conforms_to', DCT.conformsTo, None),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        if '/' in resource_dict.get('format', ''):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['format'])))
        else:
            if resource_dict.get('format'):
                g.add((distribution, DCT['format'],
                       Literal(resource_dict['format'])))
            if resource_dict.get('mimetype'):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['mimetype'])))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, Literal(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None),
            ('modified', DCT.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(resource_dict['hash_algorithm'])))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))

            g.add((distribution, SPDX.checksum, checksum))
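# Note that the item tuples in this variant carry no explicit node type, so
# every value is emitted as a Literal. A rough sketch of the helper shape
# this assumes (the real implementation lives on ckanext-dcat's RDFProfile
# base class; the details below are illustrative, not the actual code):
#
#     def _add_triples_from_dict(self, _dict, subject, items):
#         for item in items:
#             key, predicate, fallbacks = item[0], item[1], item[2]
#             _type = item[3] if len(item) > 3 else Literal
#             self._add_triple_from_dict(_dict, subject, predicate, key,
#                                        fallbacks=fallbacks, _type=_type)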
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None),
        ('notes', DCT.description, None),
        ('url', DCAT.landingPage, None),
        ('identifier', DCT.identifier, ['guid', 'id']),
        ('version', OWL.versionInfo, ['dcat_version']),
        ('alternate_identifier', ADMS.identifier, None),
        ('version_notes', ADMS.versionNotes, None),
        ('frequency', DCT.accrualPeriodicity, None),
        ('accrualPeriodicity', DCT.accrualPeriodicity, None),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created']),
        ('modified', DCT.modified, ['metadata_modified']),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None),
        ('theme-primary', DCAT.theme, None),
        ('theme-secondary', DCAT.theme, None),
        ('conforms-to', DCAT.conformsTo, None),
        ('lineage', DCT.provenance, None),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(contact_uri)
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        items = [
            ('contact_name', VCARD.fn, ['maintainer', 'author']),
            ('contact_email', VCARD.hasEmail, ['maintainer_email',
                                               'author_email']),
        ]
        self._add_triples_from_dict(dataset_dict, contact_details, items)

    license_id = self._get_dataset_value(dataset_dict, 'license_id')
    if license_id == 'cc-by':
        g.add((dataset_ref, DCT.license,
               Literal('https://creativecommons.org/licenses/by/4.0/')))
    else:
        g.add((dataset_ref, DCT.license, Literal(license_id)))

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict,
                                                 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None),
            ('publisher_url', FOAF.homepage, None),
            ('publisher_type', DCT.type, None),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Update Frequency (dct:accrualPeriodicity)
    update_freq = self._get_dataset_value(dataset_dict, 'update_frequency')
    if update_freq:
        has_uri = False
        # check if there exists a URI for the update_frequency value
        from ckanext.dgu.forms.dataset_form import update_frequency_uri
        for freq_name, freq_uri in update_frequency_uri:
            if freq_name.lower() == update_freq.lower():
                has_uri = True
                break
        g.add((dataset_ref, DCT.accrualPeriodicity,
               URIRef(freq_uri) if has_uri else Literal(update_freq)))

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()

        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref,
                   LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom),
                                         decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):

        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None),
            ('description', DCT.description, None),
            ('status', ADMS.status, None),
            ('rights', DCT.rights, None),
            ('license', DCT.license, None),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Format
        _format = resource_dict.get('format')
        if _format:
            if '/' in _format:
                # add dct:format
                dct_format = (_format.strip()
                              .replace("/", ".")
                              .replace(" ", "")
                              .lower())
                g.add((distribution, DCT['format'], Literal(dct_format)))
            else:
                g.add((distribution, DCT['format'],
                       Literal(_format.lower())))

            # add dcat:mediaType
            fmt = formats.Formats.match(_format.strip().lower())
            mime_types = fmt['mime_types'] if fmt else None
            if mime_types:
                g.add((distribution, DCAT.mediaType, Literal(mime_types)))

        license_id = self._get_dataset_value(dataset_dict, 'license_id')
        if license_id == 'cc-by':
            g.add((distribution, DCT.license,
                   Literal('https://creativecommons.org/licenses/by/4.0/')))
        else:
            g.add((distribution, DCT.license, Literal(license_id)))

        # URL
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, Literal(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, Literal(url)))

        # Dates
        items = [
            ('issued', DCT.issued, None),
            ('modified', DCT.modified, None),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))
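# The update-frequency lookup above assumes update_frequency_uri is an
# iterable of (name, uri) pairs. Hypothetical values, not the actual DGU
# list:
#
#     update_frequency_uri = [
#         ('annual', 'http://purl.org/cld/freq/annual'),
#         ('monthly', 'http://purl.org/cld/freq/monthly'),
#     ]
#
# With that mapping, a dataset whose update_frequency is 'Monthly' would
# emit
#
#     <dataset> dct:accrualPeriodicity <http://purl.org/cld/freq/monthly> .
#
# while an unmapped value such as 'fortnightly' falls back to a plain
# literal.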
def graph_from_dataset(self, dataset_dict, dataset_ref):

    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('title', DCT.title, None, Literal),
        ('notes', DCT.description, None, Literal),
        ('url', DCAT.landingPage, None, URIRef),
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, URIRef),
        ('access_rights', DCT.accessRights, None, Literal),
        ('dcat_type', DCT.type, None, Literal),
        ('provenance', DCT.provenance, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Tags
    for tag in dataset_dict.get('tags', []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Lists
    items = [
        ('language', DCT.language, None, Literal),
        ('theme', DCAT.theme, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, Literal),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, URIRef),
        ('related_resource', DCT.relation, None, URIRef),
        ('has_version', DCT.hasVersion, None, URIRef),
        ('is_version_of', DCT.isVersionOf, None, URIRef),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Contact details
    if any([
        self._get_dataset_value(dataset_dict, 'contact_uri'),
        self._get_dataset_value(dataset_dict, 'contact_name'),
        self._get_dataset_value(dataset_dict, 'contact_email'),
        self._get_dataset_value(dataset_dict, 'maintainer'),
        self._get_dataset_value(dataset_dict, 'maintainer_email'),
        self._get_dataset_value(dataset_dict, 'author'),
        self._get_dataset_value(dataset_dict, 'author_email'),
    ]):
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = URIRef(self._removeWhitespaces(contact_uri))
        else:
            contact_details = BNode()

        g.add((contact_details, RDF.type, VCARD.Organization))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))

        self._add_triple_from_dict(
            dataset_dict, contact_details,
            VCARD.fn, 'contact_name', ['maintainer', 'author']
        )
        # Add mail address as URIRef, and ensure it has a mailto: prefix
        self._add_triple_from_dict(
            dataset_dict, contact_details,
            VCARD.hasEmail, 'contact_email',
            ['maintainer_email', 'author_email'],
            _type=URIRef, value_modifier=self._add_mailto
        )

    # Publisher
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
        dataset_dict.get('organization'),
    ]):
        publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
        if publisher_uri:
            publisher_details = URIRef(
                self._removeWhitespaces(publisher_uri))
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        g.add((publisher_details, RDF.type, FOAF.Organization))
        g.add((dataset_ref, DCT.publisher, publisher_details))

        publisher_name = self._get_dataset_value(dataset_dict,
                                                 'publisher_name')
        if not publisher_name and dataset_dict.get('organization'):
            publisher_name = dataset_dict['organization']['title']

        g.add((publisher_details, FOAF.name, Literal(publisher_name)))
        # TODO: It would make sense to fallback these to organization
        # fields but they are not in the default schema and the
        # `organization` object in the dataset_dict does not include
        # custom fields
        items = [
            ('publisher_email', FOAF.mbox, None, Literal),
            ('publisher_url', FOAF.homepage, None, URIRef),
            ('publisher_type', DCT.type, None, URIRef),
        ]
        self._add_triples_from_dict(dataset_dict, publisher_details, items)

    # Temporal
    start = self._get_dataset_value(dataset_dict, 'temporal_start')
    end = self._get_dataset_value(dataset_dict, 'temporal_end')
    if start or end:
        temporal_extent = BNode()

        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if start:
            self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
        if end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Spatial
    spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
    spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
    spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = URIRef(self._removeWhitespaces(spatial_uri))
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            # GeoJSON
            g.add((spatial_ref,
                   LOCN.geometry,
                   Literal(spatial_geom, datatype=GEOJSON_IMT)))
            # WKT, because GeoDCAT-AP says so
            try:
                g.add((spatial_ref,
                       LOCN.geometry,
                       Literal(wkt.dumps(json.loads(spatial_geom),
                                         decimals=4),
                               datatype=GSP.wktLiteral)))
            except (TypeError, ValueError, InvalidGeoJSONException):
                pass

    # Resources
    for resource_dict in dataset_dict.get('resources', []):

        distribution = URIRef(
            self._removeWhitespaces(resource_uri(resource_dict)))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('name', DCT.title, None, Literal),
            ('description', DCT.description, None, Literal),
            ('status', ADMS.status, None, URIRef),
            ('rights', DCT.rights, None, URIRef),
            ('license', DCT.license, None, URIRef),
            ('access_url', DCAT.accessURL, None, URIRef),
            ('download_url', DCAT.downloadURL, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # Lists
        items = [
            ('documentation', FOAF.page, None, URIRef),
            ('language', DCT.language, None, Literal),
            ('conforms_to', DCT.conformsTo, None, Literal),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # Format
        mimetype = resource_dict.get('mimetype')
        fmt = resource_dict.get('format')

        # IANA media types (either URI or Literal) should be mapped as
        # mediaType. In case format is available and mimetype is not set
        # or identical to format, check which type is appropriate.
        if fmt and (not mimetype or mimetype == fmt):
            if ('iana.org/assignments/media-types' in fmt
                    or not fmt.startswith('http') and '/' in fmt):
                # output format value as dcat:mediaType instead of
                # dct:format
                mimetype = fmt
                fmt = None
            else:
                # Use dct:format
                mimetype = None

        if mimetype:
            if mimetype.startswith('http'):
                g.add((distribution, DCAT.mediaType,
                       URIRef(self._removeWhitespaces(mimetype))))
            else:
                g.add((distribution, DCAT.mediaType, Literal(mimetype)))

        if fmt:
            if fmt.startswith('http'):
                g.add((distribution, DCT['format'],
                       URIRef(self._removeWhitespaces(fmt))))
            else:
                g.add((distribution, DCT['format'], Literal(fmt)))

        # URL fallback and old behavior
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        access_url = resource_dict.get('access_url')
        # Use url as fallback for access_url if access_url is not set
        # and download_url is not equal
        if (url and ((not (access_url or download_url)) or
                     ((not access_url) and
                      (download_url and url != download_url)))):
            self._add_triple_from_dict(resource_dict, distribution,
                                       DCAT.accessURL, 'url',
                                       _type=URIRef)

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers
        if resource_dict.get('size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['size'])))

        # Checksum
        if resource_dict.get('hash'):
            checksum = BNode()
            g.add((checksum, RDF.type, SPDX.Checksum))
            g.add((checksum, SPDX.checksumValue,
                   Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

            if resource_dict.get('hash_algorithm'):
                if resource_dict['hash_algorithm'].startswith('http'):
                    g.add((checksum, SPDX.algorithm,
                           URIRef(self._removeWhitespaces(
                               resource_dict['hash_algorithm']))))
                else:
                    g.add((checksum, SPDX.algorithm,
                           Literal(resource_dict['hash_algorithm'])))

            g.add((distribution, SPDX.checksum, checksum))
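# Worked examples of the format/mediaType branching in the resource loop
# above, derived directly from its conditions (illustrative values):
#
#     format='text/csv', mimetype unset
#         -> dcat:mediaType "text/csv"      (slash but not http: media type)
#     format='https://www.iana.org/assignments/media-types/text/csv'
#         -> dcat:mediaType <...text/csv>   (IANA URI, emitted as URIRef)
#     format='CSV', mimetype unset
#         -> dct:format "CSV"
#     format='CSV', mimetype='text/csv'
#         -> dct:format "CSV" and dcat:mediaType "text/csv"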