def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain object.

    `graph_or_distrib` is either an RDF resource for the distribution
    itself, or a graph from which the first `dcat:Distribution` node is
    extracted. When `dataset` is given, an existing resource with the
    same URL is updated in place; otherwise a new Resource is created
    (and appended to the dataset when one is provided).

    Returns the Resource, or None when no URL could be extracted.
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # Prefer the direct download URL over the generic access URL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)

    # we shouldn't create resources without URLs
    if not url:
        log.warning(f'Resource without url: {distrib}')
        return

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.format)
    if fmt:
        resource.format = fmt.lower()

    checksum = distrib.value(SPDX.checksum)
    if checksum:
        # Fix: spdx:algorithm may be absent on malformed metadata;
        # guard before dereferencing `.identifier` to avoid an
        # AttributeError on None.
        algorithm_node = checksum.value(SPDX.algorithm)
        algorithm = None
        if algorithm_node is not None:
            algorithm = CHECKSUM_ALGORITHMS.get(algorithm_node.identifier)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Third argument is the fallback kept when the RDF value is missing
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        # Only keep a URI when the node is a real URIRef (not a blank node)
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain object.

    `graph_or_distrib` is either an RDF resource for the distribution
    itself, or a graph from which the first `dcat:Distribution` node is
    extracted. When `dataset` is given, an existing resource with the
    same URL is updated in place; otherwise a new Resource is created
    (and appended to the dataset when one is provided).

    Returns the Resource, or None when no URL could be extracted.
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)

    # Fix: we shouldn't create resources without URLs — bail out early
    # (also avoids `get_by(dataset.resources, 'url', None)` matching an
    # unrelated URL-less resource).
    if not url:
        log.warning(f'Resource without url: {distrib}')
        return

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    # DCT.term('format') avoids relying on attribute access for a name
    # that collides with the `format` builtin
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()

    checksum = distrib.value(SPDX.checksum)
    if checksum:
        # Fix: spdx:algorithm may be absent on malformed metadata;
        # guard before dereferencing `.identifier` to avoid an
        # AttributeError on None.
        algorithm_node = checksum.value(SPDX.algorithm)
        algorithm = None
        if algorithm_node is not None:
            algorithm = CHECKSUM_ALGORITHMS.get(algorithm_node.identifier)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Third argument is the fallback kept when the RDF value is missing
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        # Only keep a URI when the node is a real URIRef (not a blank node)
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
def dataset_from_rdf(graph, dataset=None, node=None):
    '''Create or update a dataset from a RDF/DCAT graph'''
    dataset = dataset or Dataset()

    if node is None:
        # Assume first match is the only match
        node = graph.value(predicate=RDF.type, object=DCAT.Dataset)
    rdf_dataset = graph.resource(node)

    dataset.title = rdf_value(rdf_dataset, DCT.title)
    # Support dct:abstract if dct:description is missing (sometimes used instead)
    description = (rdf_dataset.value(DCT.description)
                   or rdf_dataset.value(DCT.abstract))
    dataset.description = sanitize_html(description)
    dataset.frequency = frequency_from_rdf(
        rdf_dataset.value(DCT.accrualPeriodicity))
    dataset.created_at = rdf_value(rdf_dataset, DCT.issued,
                                   dataset.created_at)
    dataset.last_modified = rdf_value(rdf_dataset, DCT.modified,
                                      dataset.last_modified)

    acronym = rdf_value(rdf_dataset, SKOS.altLabel)
    if acronym:
        dataset.acronym = acronym

    # Keywords plus literal themes, deduplicated
    tags = {keyword.toPython()
            for keyword in rdf_dataset.objects(DCAT.keyword)}
    for theme in rdf_dataset.objects(DCAT.theme):
        if not isinstance(theme, RdfResource):
            tags.add(theme.toPython())
    dataset.tags = list(tags)

    identifier = rdf_value(rdf_dataset, DCT.identifier)
    if identifier:
        dataset.extras['dct:identifier'] = identifier
    if isinstance(rdf_dataset.identifier, URIRef):
        dataset.extras['uri'] = rdf_dataset.identifier.toPython()

    landing_page = url_from_rdf(rdf_dataset, DCAT.landingPage)
    if landing_page:
        # Only keep the landing page when it is a valid URI
        try:
            uris.validate(landing_page)
        except uris.ValidationError:
            pass
        else:
            dataset.extras['remote_url'] = landing_page

    dataset.temporal_coverage = temporal_from_rdf(
        rdf_dataset.value(DCT.temporal))

    # Map distributions to resources and collect their license candidates
    licenses = set()
    for distrib in rdf_dataset.objects(DCAT.distribution | DCAT.distributions):
        resource_from_rdf(distrib, dataset)
        for predicate in (DCT.license, DCT.rights):
            value = distrib.value(predicate)
            if isinstance(value, (URIRef, Literal)):
                licenses.add(value.toPython())
            elif isinstance(value, RdfResource):
                licenses.add(value.identifier.toPython())

    # Guess the license from the dataset-level value plus the
    # distribution-level candidates, keeping the current one as default
    default_license = dataset.license or License.default()
    dataset_license = rdf_value(rdf_dataset, DCT.license)
    dataset.license = License.guess(dataset_license, *licenses,
                                    default=default_license)

    return dataset
def parse_graph(self, url, fmt):
    '''
    Fetch and parse a (possibly paginated) DCAT graph starting at `url`.

    Each page is parsed into its own graph, its datasets are registered
    through `self.add_item`, then the next page URL (if any) is looked
    up from the known pagination vocabularies.

    Fix: the original implementation recursed once per page, which can
    exhaust the stack on large catalogs and loops forever when a page
    links back to itself. Pagination is now followed iteratively with a
    visited-URL guard; per-page processing order is unchanged.
    '''
    visited = set()
    while url and url not in visited:
        visited.add(url)

        graph = Graph(namespace_manager=namespace_manager)
        graph.parse(data=requests.get(url).text, format=fmt)

        for dataset_id, data in self.dcat_datasets(graph):
            self.add_item(dataset_id, graph=data)

        # Look for a next-page link in any known pagination vocabulary
        url = None
        for cls, prop in KNOWN_PAGINATION:
            if (None, RDF.type, cls) in graph:
                pagination = graph.value(predicate=RDF.type, object=cls)
                pagination = graph.resource(pagination)
                url = url_from_rdf(pagination, prop)
                break
def parse_graph(self, url, fmt):
    '''
    Fetch and parse a (possibly paginated) DCAT graph starting at `url`.

    All pages are accumulated into a single graph; datasets are then
    registered through `self.add_item` with their node identifier so
    they can be looked up again later.

    Fix: added a visited-URL guard — without it a pagination node
    pointing back to an already-fetched page made the loop run forever.
    Also renamed the local `id` which shadowed the builtin.

    Returns the accumulated graph.
    '''
    graph = Graph(namespace_manager=namespace_manager)
    visited = set()
    while url and url not in visited:
        visited.add(url)

        subgraph = Graph(namespace_manager=namespace_manager)
        subgraph.parse(data=requests.get(url).text, format=fmt)

        # Look for a next-page link in any known pagination vocabulary
        url = None
        for cls, prop in KNOWN_PAGINATION:
            if (None, RDF.type, cls) in subgraph:
                pagination = subgraph.value(predicate=RDF.type, object=cls)
                pagination = subgraph.resource(pagination)
                url = url_from_rdf(pagination, prop)
                break

        graph += subgraph

    for node in graph.subjects(RDF.type, DCAT.Dataset):
        dataset_id = graph.value(node, DCT.identifier)
        # Remember the node id and whether it was a URIRef or a blank
        # node so the dataset can be retrieved from the graph later
        kwargs = {'nid': str(node)}
        kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
        self.add_item(dataset_id, **kwargs)

    return graph