Ejemplo n.º 1
0
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Create or update a Resource from a DCAT/RDF distribution.

    `graph_or_distrib` is either an `RdfResource` already wrapping the
    distribution node, or a graph from which a node typed
    `dcat:Distribution` is picked.

    When `dataset` is given, an existing entry of `dataset.resources` is
    looked up through `get_by(..., 'url', url)` and updated in place; when no
    entry is found a new `Resource` is created and appended to
    `dataset.resources`. Without a dataset, a new `Resource` is always created.

    Returns the resource, or `None` (after logging a warning) when the
    distribution provides neither a download URL nor an access URL.
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # The download URL takes precedence over the access URL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    # we shouldn't create resources without URLs
    if not url:
        log.warning(f'Resource without url: {distrib}')
        return None

    resource = get_by(dataset.resources, 'url', url) if dataset else None
    if not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)

    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    # dcat:byteSize is the property name defined by the DCAT vocabulary; the
    # `bytesSize` spelling is kept as a fallback so feeds relying on it
    # still work.
    legacy_filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.filesize = rdf_value(distrib, DCAT.byteSize) or legacy_filesize
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    # Item access rather than `DCT.format`: on a str-based rdflib Namespace,
    # attribute access resolves to `str.format` instead of the dct:format term.
    fmt = rdf_value(distrib, DCT['format'])
    if fmt:
        resource.format = fmt.lower()

    checksum = distrib.value(SPDX.checksum)
    # Only a node can carry spdx:algorithm / spdx:checksumValue; a literal
    # checksum cannot be traversed, so it is ignored.
    if isinstance(checksum, RdfResource):
        algorithm_node = checksum.value(SPDX.algorithm)
        # A missing algorithm is None and a literal one has no `.identifier`;
        # fall back to the raw value instead of raising AttributeError.
        algorithm_id = getattr(algorithm_node, 'identifier', algorithm_node)
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm_id)
        # Checksums using an algorithm absent from CHECKSUM_ALGORITHMS are skipped
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Current values are passed as the third argument of rdf_value
    # (presumably the fallback when the predicate is absent)
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    # Blank nodes have no stable URI worth storing
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Ejemplo n.º 2
0
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Create or update a Resource from a DCAT/RDF distribution.

    `graph_or_distrib` is either an `RdfResource` already wrapping the
    distribution node, or a graph from which a node typed
    `dcat:Distribution` is picked.

    When `dataset` is given, an existing entry of `dataset.resources` is
    looked up through `get_by(..., 'url', url)` and updated in place; when no
    entry is found a new `Resource` is created and appended to
    `dataset.resources`. Without a dataset, a new `Resource` is always created.

    Returns the created or updated resource.
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # The download URL takes precedence over the access URL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    # NOTE(review): nothing guards against a distribution without any URL;
    # the resource is still created (and attached to the dataset) in that
    # case -- confirm this is intended.

    resource = get_by(dataset.resources, 'url', url) if dataset else None
    if not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)

    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    # dcat:byteSize is the property name defined by the DCAT vocabulary; the
    # `bytesSize` spelling is kept as a fallback so feeds relying on it
    # still work.
    legacy_filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.filesize = rdf_value(distrib, DCAT.byteSize) or legacy_filesize
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    # `term('format')` rather than `DCT.format`: on a str-based rdflib
    # Namespace the attribute would resolve to `str.format`.
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()

    checksum = distrib.value(SPDX.checksum)
    # Only a node can carry spdx:algorithm / spdx:checksumValue; a literal
    # checksum cannot be traversed, so it is ignored.
    if isinstance(checksum, RdfResource):
        algorithm_node = checksum.value(SPDX.algorithm)
        # A missing algorithm is None and a literal one has no `.identifier`;
        # fall back to the raw value instead of raising AttributeError.
        algorithm_id = getattr(algorithm_node, 'identifier', algorithm_node)
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm_id)
        # Checksums using an algorithm absent from CHECKSUM_ALGORITHMS are skipped
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Current values are passed as the third argument of rdf_value
    # (presumably the fallback when the predicate is absent)
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    # Blank nodes have no stable URI worth storing
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Ejemplo n.º 3
0
def dataset_from_rdf(graph, dataset=None, node=None):
    '''
    Build a new dataset, or refresh the given one, from a RDF/DCAT graph.

    `node` selects the dataset node to read; when it is `None` the node is
    looked up in the graph by its `dcat:Dataset` type. Every distribution of
    the node is handed to `resource_from_rdf()` together with the dataset, and
    the licenses/rights found on the distributions are passed to
    `License.guess()` along with the dataset-level license.

    Returns the populated dataset.
    '''
    if not dataset:
        dataset = Dataset()

    if node is None:
        # No explicit node: the first match is assumed to be the only one
        node = graph.value(predicate=RDF.type, object=DCAT.Dataset)

    rdf_dataset = graph.resource(node)

    dataset.title = rdf_value(rdf_dataset, DCT.title)

    # dct:abstract stands in for a missing dct:description
    raw_description = rdf_dataset.value(DCT.description)
    if not raw_description:
        raw_description = rdf_dataset.value(DCT.abstract)
    dataset.description = sanitize_html(raw_description)

    periodicity = rdf_dataset.value(DCT.accrualPeriodicity)
    dataset.frequency = frequency_from_rdf(periodicity)
    dataset.created_at = rdf_value(rdf_dataset, DCT.issued,
                                   dataset.created_at)
    dataset.last_modified = rdf_value(rdf_dataset, DCT.modified,
                                      dataset.last_modified)

    alt_label = rdf_value(rdf_dataset, SKOS.altLabel)
    if alt_label:
        dataset.acronym = alt_label

    # Keywords first, then the themes that are not wrapped as RDF resources;
    # duplicates are dropped through a set.
    labels = [keyword.toPython()
              for keyword in rdf_dataset.objects(DCAT.keyword)]
    for theme in rdf_dataset.objects(DCAT.theme):
        if isinstance(theme, RdfResource):
            continue
        labels.append(theme.toPython())
    dataset.tags = list(set(labels))

    dct_identifier = rdf_value(rdf_dataset, DCT.identifier)
    if dct_identifier:
        dataset.extras['dct:identifier'] = dct_identifier

    node_id = rdf_dataset.identifier
    if isinstance(node_id, URIRef):
        dataset.extras['uri'] = node_id.toPython()

    remote_url = url_from_rdf(rdf_dataset, DCAT.landingPage)
    if remote_url:
        try:
            uris.validate(remote_url)
            dataset.extras['remote_url'] = remote_url
        except uris.ValidationError:
            # An invalid landing page is simply not recorded
            pass

    dataset.temporal_coverage = temporal_from_rdf(
        rdf_dataset.value(DCT.temporal))

    # Import each distribution as a resource and collect its license hints
    license_hints = set()
    distributions = rdf_dataset.objects(
        DCAT.distribution | DCAT.distributions)
    for distrib in distributions:
        resource_from_rdf(distrib, dataset)
        for predicate in (DCT.license, DCT.rights):
            hint = distrib.value(predicate)
            if isinstance(hint, (URIRef, Literal)):
                license_hints.add(hint.toPython())
            elif isinstance(hint, RdfResource):
                license_hints.add(hint.identifier.toPython())

    # The current license (or the default one) is passed as `default`
    fallback_license = dataset.license or License.default()
    declared_license = rdf_value(rdf_dataset, DCT.license)
    dataset.license = License.guess(declared_license, *license_hints,
                                    default=fallback_license)

    return dataset
Ejemplo n.º 4
0
    def parse_graph(self, url, fmt):
        '''
        Fetch a (possibly paginated) catalog and register its datasets.

        Each page is downloaded from `url` and parsed with the rdflib format
        `fmt`; every `(id, data)` pair yielded by `self.dcat_datasets()` for
        that page is passed to `self.add_item()`. The URL of the next page is
        read from the first `KNOWN_PAGINATION` class present in the page.

        Pages are walked in a loop rather than by recursion so that long
        catalogs cannot exhaust the call stack, and the walk stops as soon as
        the pagination points to a page that was already fetched.
        '''
        seen_urls = set()
        while True:
            # Compared as plain strings so a str and a URIRef for the same
            # address are recognised as the same page
            seen_urls.add(str(url))

            graph = Graph(namespace_manager=namespace_manager)
            # NOTE(review): the request has no timeout and its HTTP status is
            # not checked before parsing the body.
            graph.parse(data=requests.get(url).text, format=fmt)
            for item_id, data in self.dcat_datasets(graph):
                self.add_item(item_id, graph=data)

            next_url = None
            for cls, prop in KNOWN_PAGINATION:
                if (None, RDF.type, cls) in graph:
                    pagination = graph.value(predicate=RDF.type, object=cls)
                    pagination = graph.resource(pagination)
                    next_url = url_from_rdf(pagination, prop)
                    # Only the first pagination scheme found is honoured
                    break

            if not next_url or str(next_url) in seen_urls:
                break
            url = next_url
Ejemplo n.º 5
0
    def parse_graph(self, url, fmt):
        '''
        Load every page of a (possibly paginated) catalog into one graph and
        register each `dcat:Dataset` node found in it.

        Each page is downloaded from `url` and parsed with the rdflib format
        `fmt`; the URL of the next page is read from the first
        `KNOWN_PAGINATION` class present in the page. The walk stops when no
        next page is advertised or when the pagination points to a page that
        was already fetched, so self-referencing pagination cannot loop
        forever.

        For every dataset node, `self.add_item()` is called with the node's
        `dct:identifier` plus the keyword arguments `nid` (the node as a
        string) and `type` (`'uriref'` or `'blank'`).

        Returns the merged graph.
        '''
        graph = Graph(namespace_manager=namespace_manager)
        seen_urls = set()
        # Compared as plain strings so a str and a URIRef for the same
        # address are recognised as the same page
        while url and str(url) not in seen_urls:
            seen_urls.add(str(url))

            subgraph = Graph(namespace_manager=namespace_manager)
            # NOTE(review): the request has no timeout and its HTTP status is
            # not checked before parsing the body.
            subgraph.parse(data=requests.get(url).text, format=fmt)

            url = None
            for cls, prop in KNOWN_PAGINATION:
                if (None, RDF.type, cls) in subgraph:
                    pagination = subgraph.value(predicate=RDF.type, object=cls)
                    pagination = subgraph.resource(pagination)
                    url = url_from_rdf(pagination, prop)
                    # Only the first pagination scheme found is honoured
                    break

            graph += subgraph

        for node in graph.subjects(RDF.type, DCAT.Dataset):
            # NOTE(review): identifier is None when the node carries no
            # dct:identifier -- confirm add_item() copes with that.
            identifier = graph.value(node, DCT.identifier)
            kwargs = {'nid': str(node)}
            kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
            self.add_item(identifier, **kwargs)

        return graph