def test_with_valid_items(self):
    testUriPart = "://www.w3.org/ns/dcat#"

    for prefix in ['http', 'https']:
        assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + testUriPart)
        # leading and trailing whitespace should be removed
        assert CleanedURIRef(' ' + prefix + testUriPart + ' ') == URIRef(prefix + testUriPart)

    testNonHttpUri = "mailto:[email protected]"
    assert CleanedURIRef(testNonHttpUri) == URIRef(testNonHttpUri)
    # leading and trailing whitespace should be removed again
    assert CleanedURIRef(' ' + testNonHttpUri + ' ') == URIRef(testNonHttpUri)
def test_with_invalid_items(self):
    testUriPart = "://www.w3.org/ns/!dcat #"
    expectedUriPart = "://www.w3.org/ns/%21dcat%20#"

    for prefix in ['http', 'https']:
        assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + expectedUriPart)
        # applying on escaped data should have no effect
        assert CleanedURIRef(prefix + expectedUriPart) == URIRef(prefix + expectedUriPart)

    # leading and trailing space should not be escaped
    testNonHttpUri = " mailto:with [email protected] "
    expectedNonHttpUri = "mailto:with%20space%[email protected]"
    assert CleanedURIRef(testNonHttpUri) == URIRef(expectedNonHttpUri)
    # applying on escaped data should have no effect
    assert CleanedURIRef(expectedNonHttpUri) == URIRef(expectedNonHttpUri)
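# The tests above pin down CleanedURIRef's contract: strip surrounding
# whitespace, percent-encode characters that are invalid in URIs, and
# stay idempotent on already-escaped input. Below is a minimal sketch of
# such a helper (a hypothetical illustration, not the actual ckanext-dcat
# implementation), named differently so it cannot shadow the real one.
from urllib.parse import quote

from rdflib import URIRef


def _cleaned_uri_ref_sketch(value):
    value = value.strip()
    # Escape only a conservative set of invalid characters. Existing
    # percent-escapes ("%21", "%20", ...) contain none of them, so
    # applying the function twice changes nothing.
    for char in ' !"$\'()*,;<>[]{|}\\^`':
        value = value.replace(char, quote(char))
    return URIRef(value)


# For example:
#   _cleaned_uri_ref_sketch(' http://www.w3.org/ns/!dcat #')
#   == URIRef('http://www.w3.org/ns/%21dcat%20#')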
def _get_or_create_contact_point(self, dataset_dict, dataset_ref):
    """
    Returns the contact point object in the graph, or a newly created
    object if none is found in the given graph.
    """
    contact_point_objects = self.g.objects(dataset_ref, DCAT.contactPoint)
    contact_object_list = list(contact_point_objects)
    if len(contact_object_list) == 0:
        contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
        if contact_uri:
            contact_details = CleanedURIRef(contact_uri)
        else:
            contact_details = BNode()
        self.g.add((contact_details, RDF.type, VCARD.Organization))
        self.g.add((dataset_ref, DCAT.contactPoint, contact_details))
        return contact_details
    return next(iter(contact_object_list))
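# Usage sketch for the helper above (assumes a profile instance with a
# populated graph): the first call creates the vcard:Organization node,
# and later calls return the existing node instead of adding duplicates,
# so callers can attach further vCard properties safely.
#
#   contact_point = profile._get_or_create_contact_point(dataset_dict, dataset_ref)
#   assert contact_point == profile._get_or_create_contact_point(dataset_dict, dataset_ref)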
def _publisher_graph(self, dataset_ref, dataset_dict):
    if any([
        self._get_dataset_value(dataset_dict, 'publisher_uri'),
        self._get_dataset_value(dataset_dict, 'publisher_name'),
    ]):
        publisher_uri = dataset_dict.get('publisher_uri')
        if publisher_uri:
            publisher_details = CleanedURIRef(publisher_uri)
        else:
            # No organization nor publisher_uri
            publisher_details = BNode()

        # Remove all contact points previously set by the base profile,
        # as they are not wanted here.
        for s, p, o in self.g.triples((None, SCHEMA.contactType, Literal('customer service'))):
            self.g.remove((s, None, None))
        self.g.remove((dataset_ref, SCHEMA.publisher, None))
        self.g.remove((publisher_details, SCHEMA.name, None))
        self.g.remove((publisher_details, SCHEMA.contactPoint, None))

        # Add the publisher and its contact point
        self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
        self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))

        publisher_name = dataset_dict.get('publisher_name')
        self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))

        contact_point = BNode()
        self.g.add((contact_point, RDF.type, SCHEMA.ContactPoint))
        self.g.add((publisher_details, SCHEMA.contactPoint, contact_point))
        self.g.add((contact_point, SCHEMA.contactType, Literal('Publisher')))

        publisher_url = dataset_dict.get('publisher_url')
        self.g.add((contact_point, SCHEMA.url, Literal(publisher_url)))
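# Illustration only (hypothetical values): for a dataset_dict with
# publisher_name "ACME" and publisher_url "https://acme.example", the
# method above is expected to emit triples equivalent to this Turtle:
#
#   <dataset> schema:publisher _:pub .
#   _:pub a schema:Organization ;
#       schema:name "ACME" ;
#       schema:contactPoint _:cp .
#   _:cp a schema:ContactPoint ;
#       schema:contactType "Publisher" ;
#       schema:url "https://acme.example" .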
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    # Creators
    for responsible_party in load_json(dataset_dict['cited-responsible-party']):
        if 'publisher' in responsible_party['role']:
            continue

        name = responsible_party.get('individual-name')
        org = responsible_party.get('organisation-name')
        email = responsible_party.get('contact-info_email')
        url = responsible_party.get('contact-info_online-resource_url')

        ind_identifier = responsible_party.get('individual-uri', {})
        if isinstance(ind_identifier, str):
            ind_uri = ind_identifier
        else:
            code = ind_identifier.get('code')
            codeSpace = ind_identifier.get('code-space')
            authority = ind_identifier.get('authority')
            version = ind_identifier.get('version')
            if code:
                id_list = [authority, codeSpace, code, version]
                ind_uri = '/'.join(x.strip() for x in id_list if x and x.strip())
            else:
                ind_uri = ''

        org_identifier = responsible_party.get('organisation-uri', {})
        if isinstance(org_identifier, str):
            org_uri = org_identifier
        else:
            code = org_identifier.get('code')
            codeSpace = org_identifier.get('code-space')
            authority = org_identifier.get('authority')
            version = org_identifier.get('version')
            if code:
                id_list = [authority, codeSpace, code, version]
                org_uri = '/'.join(x.strip() for x in id_list if x and x.strip())
            else:
                org_uri = ''

        if ind_uri:
            creator_details = CleanedURIRef(ind_uri)
        elif org_uri:
            creator_details = CleanedURIRef(org_uri)
        else:
            creator_details = BNode()

        if name:
            ind_names = name.split(' ')
            self.g.add((creator_details, RDF.type, SCHEMA.Person))
            self.g.add((creator_details, SCHEMA.name, Literal(name)))
            self.g.add((creator_details, SCHEMA.sameAs, Literal(ind_uri)))
            self.g.add((creator_details, SCHEMA.givenName, Literal(ind_names[0])))
            self.g.add((creator_details, SCHEMA.additionalName, Literal(','.join(ind_names[1:-1]))))
            self.g.add((creator_details, SCHEMA.familyName, Literal(ind_names[-1])))
            self.g.add((creator_details, SCHEMA.affiliation, Literal(org)))
        elif org:
            self.g.add((creator_details, RDF.type, SCHEMA.Organization))
            self.g.add((creator_details, SCHEMA.name, Literal(org)))
            self.g.add((creator_details, SCHEMA.sameAs, Literal(org_uri)))

        self.g.add((dataset_ref, SCHEMA.creator, creator_details))

    # Change license over to "use-limitations"
    use_limitations_str = dataset_dict.get('use-limitations', '[]')
    dataset_name = dataset_dict.get('name')
    try:
        use_limitations = json.loads(use_limitations_str)
        if use_limitations:
            for use_limitation in use_limitations:
                creative_work = BNode()
                g.add((creative_work, RDF.type, SCHEMA.CreativeWork))
                license_str = "License text for {}".format(dataset_name)
                g.add((creative_work, SCHEMA.text, Literal(use_limitation)))
                g.add((creative_work, SCHEMA.name, Literal(license_str)))
                g.add((dataset_ref, SCHEMA.license, creative_work))
    # NB: in Python 3 JSON parsing raises json.JSONDecodeError, which is
    # a subclass of ValueError, so this clause works on Python 2 and 3.
    except ValueError:
        pass

    std_names = None
    try:
        std_names = dataset_dict.get('cf_standard_names')
    except Exception:
        # TODO: add logging, etc.
        pass

    if std_names is not None and hasattr(std_names, '__iter__'):
        for standard_name in sorted(std_names):
            g.add((dataset_ref, SCHEMA.variableMeasured, Literal(standard_name)))

    spatial_uri = dataset_dict.get('spatial_uri')
    spatial_text = dataset_dict.get('spatial_text')
    if spatial_uri:
        spatial_ref = URIRef(spatial_uri)
    else:
        spatial_ref = BNode()
    if spatial_text:
        g.add((dataset_ref, DCT.spatial, spatial_ref))
        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((spatial_ref, RDFS.label, Literal(spatial_text)))

    spatial_uri = dataset_dict.get('spatial_uri')
    spatial_text = dataset_dict.get('spatial_text')
    spatial_geom = dataset_dict.get('spatial')

    if spatial_uri or spatial_text or spatial_geom:
        if spatial_uri:
            spatial_ref = CleanedURIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        g.add((spatial_ref, RDF.type, SCHEMA.Place))
        g.add((dataset_ref, SCHEMA.spatialCoverage, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            try:
                gj = load_json(spatial_geom)
                bounds = shape(gj).bounds
                # Reorder (minx, miny, maxx, maxy) to (miny, minx, maxy, maxx)
                bbox = [str(bound) for bound in bounds[1::-1] + bounds[:1:-1]]
            except Exception:
                pass
            else:
                bbox_str = ' '.join(bbox)
                geo_shape = BNode()
                g.add((geo_shape, RDF.type, SCHEMA.GeoShape))
                g.add((geo_shape, SCHEMA.box, Literal(bbox_str)))
                # Add bounding box element
                g.add((spatial_ref, SCHEMA.geo, geo_shape))

    # Basic fields
    self._basic_fields_graph(dataset_ref, dataset_dict)

    # Catalog
    self._catalog_graph(dataset_ref, dataset_dict)

    # Publisher
    self.infer_publisher(dataset_dict)
    self._publisher_graph(dataset_ref, dataset_dict)

    # Add contentUrl to Distribution
    for s, p, o in self.g.triples((None, RDF.type, SCHEMA.DataDownload)):
        url = self.g.value(s, SCHEMA.url, None)
        g.add((s, SCHEMA.contentUrl, Literal(url)))

    # Identifier
    unique_identifiers = dataset_dict.get('unique-resource-identifier-full', {})
    if unique_identifiers:
        self.g.remove((dataset_ref, SCHEMA.identifier, None))
        for unique_identifier in unique_identifiers:
            if 'doi.org' in unique_identifier.get('authority', '') or not unique_identifier.get('authority'):
                # Strip https://doi.org/ and the like
                doi = re.sub(r'^http.*doi\.org/', '', unique_identifier['code'], flags=re.IGNORECASE)
                if doi and re.match(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', doi, re.IGNORECASE):
                    identifier = BNode()
                    g.add((dataset_ref, SCHEMA.identifier, identifier))
                    self.g.add((identifier, RDF.type, SCHEMA.PropertyValue))
                    self.g.add((identifier, SCHEMA.propertyID, Literal("https://registry.identifiers.org/registry/doi")))
                    self.g.add((identifier, SCHEMA.name, Literal("DOI: %s" % doi)))
                    self.g.add((identifier, SCHEMA.value, Literal("doi:%s" % doi)))
                    self.g.add((identifier, SCHEMA.url, Literal("https://doi.org/%s" % doi)))

    # Temporal
    temporal_extent = load_json(dataset_dict.get('temporal-extent', {}))
    if isinstance(temporal_extent, list):
        temporal_extent = temporal_extent[0]
    start = temporal_extent.get('begin')
    end = temporal_extent.get('end')
    if start or end:
        if start and end:
            self.g.add((dataset_ref, SCHEMA.temporalCoverage, Literal('%s/%s' % (start, end))))
        elif start:
            self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start)
        elif end:
            self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, end)
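# Quick, self-contained check of the DOI normalisation used above
# (hypothetical identifier values): the prefix stripping and the match
# pattern accept a bare DOI as well as a resolver URL.
import re

raw_code = "https://doi.org/10.5066/F7VX0DMQ"
doi = re.sub(r'^http.*doi\.org/', '', raw_code, flags=re.IGNORECASE)
assert doi == "10.5066/F7VX0DMQ"
assert re.match(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', doi, re.IGNORECASE)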
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # Dataset
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    ## Simple values
    items = [
        ("title", DCTERMS.title, None, Literal),
        ("name", DCTERMS.identifier, None, Literal),
        ("author", DC.creator, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Description
    dataset_desc = dataset_dict.get("notes")
    if dataset_desc:
        dataset_desc_value = markdown_extract(dataset_desc, extract_length=0)
        g.add((dataset_ref, DCTERMS.description, Literal(dataset_desc_value)))

    ## Language
    langs = dataset_dict.get("language")
    if langs:
        for lang in langs:
            language_uri = LANG_PREFIX + lang
            g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

    ## Tags
    for tag in dataset_dict.get("tags", []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

    ## Wikidata keywords
    for keyword in dataset_dict.get("keywords", []):
        g.add((dataset_ref, DCAT.theme, WD[keyword]))

    ## Data Type
    data_types = dataset_dict.get("data_type")
    if data_types:
        for data_type in data_types:
            g.add((dataset_ref, DCTERMS.type, URIRef(DATA_TYPE_PREFIX + data_type)))

    ## Temporal Resolution
    temp_res = dataset_dict.get("temp_res")
    temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
    if temp_res:
        temp_res_value = temp_res_mapping[temp_res]
        g.add((dataset_ref, DCAT.temporalResolution,
               Literal(temp_res_value, datatype=XSD.duration)))

    ## Start Time, End Time, and Created Time
    items = [
        ("start_time", SCHEMA.startDate, None, Literal),
        ("end_time", SCHEMA.endDate, None, Literal),
        ("created_time", DCTERMS.issued, None, Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Spatial Coverage
    spatial = dataset_dict.get("spatial")
    x_min = dataset_dict.get("x_min")
    x_max = dataset_dict.get("x_max")
    y_min = dataset_dict.get("y_min")
    y_max = dataset_dict.get("y_max")

    if any([spatial, x_min, x_max, y_min, y_max]):
        spatial_ref = BNode()
        g.add((spatial_ref, RDF.type, DCTERMS.Location))
        g.add((dataset_ref, DCTERMS.spatial, spatial_ref))
        if spatial:
            g.add((spatial_ref, LOCN.geometry, Literal(spatial, datatype=GEOJSON_IMT)))
        if x_min and x_max and y_min and y_max:
            box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
            box_ref = BNode()
            g.add((box_ref, RDF.type, SCHEMA.GeoShape))
            g.add((box_ref, SCHEMA.box, Literal(box_value)))
            g.add((spatial_ref, LOCN.geometry, box_ref))

    ## Spatial Resolution
    spatial_res = dataset_dict.get("spatial_res")
    if spatial_res:
        g.add((dataset_ref, DCAT.spatialResolutionInMeters,
               Literal(spatial_res, datatype=XSD.decimal)))

    ## Process Step
    proc_step = dataset_dict.get("process_step")
    if proc_step:
        proc_step_value = markdown_extract(proc_step, extract_length=0)
        proc_ref = BNode()
        g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
        g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
        g.add((dataset_ref, DCTERMS.provenance, proc_ref))

    ## Project details
    project = dataset_dict.get("organization")
    if project:
        project["description"] = markdown_extract(project["description"], extract_length=0)
        project_details = BNode()
        g.add((project_details, RDF.type, ORG.Organization))
        g.add((dataset_ref, DCTERMS.publisher, project_details))
        items = [
            ("title", FOAF.name, None, Literal),
            ("description", ORG.purpose, None, Literal),
        ]
        self._add_triples_from_dict(project, project_details, items)

    ## Contact details
    contact_person = dataset_dict.get("contact_person")
    contact_email = dataset_dict.get("contact_email")
    if any([contact_person, contact_email]):
        contact_details = BNode()
        g.add((contact_details, RDF.type, VCARD.Individual))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.fn, "contact_person")
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.hasEmail, "contact_email",
                                   _type=URIRef, value_modifier=self._add_mailto)

    ## Theme
    themes = dataset_dict.get("groups")
    if themes:
        for theme in themes:
            theme_details = BNode()
            g.add((theme_details, RDF.type, SKOS.Concept))
            g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
            g.add((dataset_ref, DCAT.theme, theme_details))

    # Resources
    ## depositar defines the license at the dataset level
    license = dataset_dict.get("license_url")

    for resource_dict in dataset_dict.get("resources", []):
        distribution = CleanedURIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        ## Simple values
        items = [
            ("name", DCTERMS.title, None, Literal),
            ("description", DCTERMS.description, None, Literal),
            ("encoding", CNT.characterEncoding, None, Literal),
            ("url", DCAT.downloadURL, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        ## License
        if license:
            g.add((distribution, DCTERMS.license, URIRef(license)))

        ## Coordinate Systems
        crs = resource_dict.get("resource_crs")
        if crs:
            crs_value = EPSG_PREFIX + str(crs)
            g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))

        ## Format (mimetype)
        mimetype = resource_dict.get("mimetype")
        if mimetype:
            mimetype_value = IMT_PREFIX + mimetype
            g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))
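# Two quick illustrations of the mappings used above (hypothetical
# values). The temporal resolution strings are ISO 8601 durations, and
# the schema:box literal is ordered "south west north east":
temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
assert temp_res_mapping["monthly"] == "P1M"

y_min, x_min, y_max, x_max = 21.9, 120.0, 25.3, 122.0
assert "%s %s %s %s" % (y_min, x_min, y_max, x_max) == "21.9 120.0 25.3 122.0"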
def graph_from_dataset(self, dataset_dict, dataset_ref):
    """
    Transforms a CKAN dataset dict into DCAT-AP.de data.
    """
    g = self.g

    # Bind namespaces to have readable names in the RDF document
    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # Simple additional fields
    items = [
        ('qualityProcessURI', DCATDE.qualityProcessURI, None, URIRef),
        ('metadata_original_html', DCAT.landingPage, None, URIRef),
        ('politicalGeocodingLevelURI', DCATDE.politicalGeocodingLevelURI, None, URIRef),
        ('granularity', DCAT.granularity, None, URIRefOrLiteral),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    # Additional lists
    items = [
        ('contributorID', DCATDE.contributorID, None, URIRefOrLiteral),
        ('politicalGeocodingURI', DCATDE.politicalGeocodingURI, None, URIRef),
        ('legalbasisText', DCATDE.legalBasis, None, Literal),
        ('geocodingText', DCATDE.geocodingDescription, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Add adminUnitL2 for every politicalGeocodingURI value (for compatibility)
    if self._get_dataset_value(dataset_dict, 'politicalGeocodingURI'):
        spatial_ref = BNode()
        g.add((spatial_ref, RDF.type, DCT.Location))
        g.add((dataset_ref, DCT.spatial, spatial_ref))
        items = [('politicalGeocodingURI', LOCN.adminUnitL2, None, URIRef)]
        self._add_list_triples_from_dict(dataset_dict, spatial_ref, items)

    # Contacts
    self._add_contact(dataset_dict, dataset_ref, DCATDE.originator, 'originator')
    self._add_contact(dataset_dict, dataset_ref, DCATDE.maintainer, 'maintainer')
    self._add_contact(dataset_dict, dataset_ref, DCT.contributor, 'contributor')
    self._add_contact(dataset_dict, dataset_ref, DCT.creator, 'author')

    # Add maintainer_url to contact_point
    maintainer_url = self._get_dataset_value(dataset_dict, 'maintainer_url')
    if maintainer_url:
        contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
        self._add_triple_from_dict(dataset_dict, contact_point, VCARD.hasURL,
                                   'maintainer_url', _type=URIRef)

    # Add maintainer_tel to contact_point
    maintainer_tel = self._get_dataset_value(dataset_dict, 'maintainer_tel')
    if maintainer_tel:
        contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
        self._add_triple_from_dict(dataset_dict, contact_point, VCARD.hasTelephone,
                                   'maintainer_tel', _type=URIRef,
                                   value_modifier=self._add_tel)

    # Add maintainer postal data to contact_point if available
    vcard_mapping = {
        'street': VCARD.hasStreetAddress,
        'city': VCARD.hasLocality,
        'zip': VCARD.hasPostalCode,
        'country': VCARD.hasCountryName,
    }
    for vc_name in vcard_mapping:
        vcard_fld = self._get_dataset_value(dataset_dict, 'maintainer_' + vc_name)
        if vcard_fld:
            contact_point = self._get_or_create_contact_point(dataset_dict, dataset_ref)
            g.add((contact_point, vcard_mapping[vc_name], Literal(vcard_fld)))

    # Groups
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group in groups:
        group_name_in_dict = group['name']
        if group_name_in_dict:
            g.add((dataset_ref, DCAT.theme,
                   CleanedURIRef(dcat_theme_prefix + group_name_in_dict.upper())))

    # used_datasets
    items = [
        ('used_datasets', DCT.relation, None, URIRef),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Enhance distributions
    for resource_dict in dataset_dict.get('resources', []):
        for distribution in g.objects(dataset_ref, DCAT.distribution):
            # Match the distribution in the graph with the distribution
            # in the CKAN dict
            if str(distribution) == resource_uri(resource_dict):
                items = [
                    ('licenseAttributionByText', DCATDE.licenseAttributionByText, None, Literal),
                    ('plannedAvailability', DCATDE.plannedAvailability, None, URIRef),
                ]
                self._add_triples_from_dict(resource_dict, distribution, items)
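# Illustration of the group-to-theme mapping above. The actual
# dcat_theme_prefix is defined elsewhere in the extension; the EU data
# theme authority URI is assumed here purely for the example:
dcat_theme_prefix = "http://publications.europa.eu/resource/authority/data-theme/"
group_name = "soci"
assert dcat_theme_prefix + group_name.upper() == (
    "http://publications.europa.eu/resource/authority/data-theme/SOCI")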