def test_with_valid_items(self):
        testUriPart = "://www.w3.org/ns/dcat#"

        for prefix in ['http', 'https']:
            assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + testUriPart)
            # leading and trailing whitespace should be removed
            assert CleanedURIRef(' ' + prefix + testUriPart + ' ') == URIRef(prefix + testUriPart)

        testNonHttpUri = "mailto:[email protected]"
        assert CleanedURIRef(testNonHttpUri) == URIRef(testNonHttpUri)
        # leading and trailing whitespace should be removed again
        assert CleanedURIRef(' ' + testNonHttpUri + ' ') == URIRef(testNonHttpUri)

    def test_with_invalid_items(self):
        testUriPart = "://www.w3.org/ns/!dcat #"
        expectedUriPart = "://www.w3.org/ns/%21dcat%20#"

        for prefix in ['http', 'https']:
            assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + expectedUriPart)
            # applying on escaped data should have no effect
            assert CleanedURIRef(prefix + expectedUriPart) == URIRef(prefix + expectedUriPart)

        # leading and trailing space should not be escaped
        testNonHttpUri = " mailto:with [email protected] "
        expectedNonHttpUri = "mailto:with%20space%[email protected]"

        assert CleanedURIRef(testNonHttpUri) == URIRef(expectedNonHttpUri)
        # applying on escaped data should have no effect
        assert CleanedURIRef(expectedNonHttpUri) == URIRef(expectedNonHttpUri)
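
# A minimal sketch of the behaviour the tests above exercise, assuming
# CleanedURIRef strips surrounding whitespace and percent-encodes disallowed
# characters while leaving existing %-escapes intact. Illustration only, not
# the actual ckanext-dcat implementation; cleaned_uriref_sketch is a
# hypothetical name.
from urllib.parse import quote

from rdflib import URIRef


def cleaned_uriref_sketch(value):
    # Keep RFC 3986 gen-delims (:/?#[]@) and '%', so already-escaped
    # sequences such as '%21' pass through unchanged; anything else unsafe
    # (e.g. '!' or ' ') is percent-encoded.
    return URIRef(quote(value.strip(), safe=":/?#[]@%"))
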
    def _get_or_create_contact_point(self, dataset_dict, dataset_ref):
        """
        Returns the contact point object in the graph, or a newly created
        object if none is found in the given graph.
        """
        contact_point_objects = self.g.objects(dataset_ref, DCAT.contactPoint)
        contact_object_list = list(contact_point_objects)

        if len(contact_object_list) == 0:
            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = CleanedURIRef(contact_uri)
            else:
                contact_details = BNode()

            self.g.add((contact_details, RDF.type, VCARD.Organization))
            self.g.add((dataset_ref, DCAT.contactPoint, contact_details))
            return contact_details

        return next(iter(contact_object_list))
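
    # Hedged usage sketch: within a profile's graph_from_dataset, the helper
    # above lets several fields share a single contact point (the names below
    # are illustrative, not taken from the snippet):
    #
    #   contact = self._get_or_create_contact_point(dataset_dict, dataset_ref)
    #   self.g.add((contact, VCARD.hasEmail, URIRef('mailto:[email protected]')))
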
    def _publisher_graph(self, dataset_ref, dataset_dict):
        if any([
            self._get_dataset_value(dataset_dict, 'publisher_uri'),
            self._get_dataset_value(dataset_dict, 'publisher_name'),
        ]):

            publisher_uri = dataset_dict.get('publisher_uri')

            if publisher_uri:
                publisher_details = CleanedURIRef(publisher_uri)
            else:
                # no publisher_uri available, fall back to a blank node
                publisher_details = BNode()

            # remove all contact points set by the base profile, as they are
            # not wanted here
            for s, p, o in self.g.triples((None, SCHEMA.contactType, Literal('customer service'))):
                self.g.remove((s, None, None))

            self.g.remove((dataset_ref, SCHEMA.publisher, None))
            self.g.remove((publisher_details, SCHEMA.name, None))
            self.g.remove((publisher_details, SCHEMA.contactPoint, None))

            # add publisher
            self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
            self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))

            publisher_name = dataset_dict.get('publisher_name')
            if publisher_name:
                self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))

            contact_point = BNode()
            self.g.add((contact_point, RDF.type, SCHEMA.ContactPoint))

            self.g.add((publisher_details, SCHEMA.contactPoint, contact_point))

            self.g.add((contact_point, SCHEMA.contactType, Literal('Publisher')))

            publisher_url = dataset_dict.get('publisher_url')
            if publisher_url:
                self.g.add((contact_point, SCHEMA.url, Literal(publisher_url)))

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        g = self.g

        # Creators
        for responsible_party in load_json(dataset_dict['cited-responsible-party']):
            if 'publisher' in responsible_party['role']:
                continue

            name = responsible_party.get('individual-name')
            org = responsible_party.get('organisation-name')
            email = responsible_party.get('contact-info_email')
            url = responsible_party.get('contact-info_online-resource_url')
            ind_identifier = responsible_party.get('individual-uri', {})
            if isinstance(ind_identifier, str):
                ind_uri = ind_identifier
            else:
                code = ind_identifier.get('code')
                codeSpace = ind_identifier.get('code-space')
                authority = ind_identifier.get('authority')
                version = ind_identifier.get('version')
                if code:
                    id_list = [authority, codeSpace, code, version]
                    ind_uri = '/'.join(x.strip() for x in id_list if x and x.strip())
                else:
                    ind_uri = ''
            org_identifier = responsible_party.get('organisation-uri', {})
            if isinstance(org_identifier, str):
                org_uri = org_identifier
            else:
                code = org_identifier.get('code')
                codeSpace = org_identifier.get('code-space')
                authority = org_identifier.get('authority')
                version = org_identifier.get('version')
                if code:
                    id_list = [authority, codeSpace, code, version]
                    org_uri = '/'.join(x.strip() for x in id_list if x and x.strip())
                else:
                    org_uri = ''
            if ind_uri:
                creator_details = CleanedURIRef(ind_uri)
            elif org_uri:
                creator_details = CleanedURIRef(org_uri)
            else:
                creator_details = BNode()
            if name:
                ind_names = name.split(' ')
                self.g.add((creator_details, RDF.type, SCHEMA.Person))
                self.g.add((creator_details, SCHEMA.name, Literal(name)))
                if ind_uri:
                    self.g.add((creator_details, SCHEMA.sameAs, Literal(ind_uri)))
                self.g.add((creator_details, SCHEMA.givenName, Literal(ind_names[0])))
                # middle names only exist when the name has three or more parts
                if len(ind_names) > 2:
                    self.g.add((creator_details, SCHEMA.additionalName,
                                Literal(','.join(ind_names[1:-1]))))
                self.g.add((creator_details, SCHEMA.familyName, Literal(ind_names[-1])))
                if org:
                    self.g.add((creator_details, SCHEMA.affiliation, Literal(org)))
            elif org:
                self.g.add((creator_details, RDF.type, SCHEMA.Organization))
                self.g.add((creator_details, SCHEMA.name, Literal(org)))
                if org_uri:
                    self.g.add((creator_details, SCHEMA.sameAs, Literal(org_uri)))

            self.g.add((dataset_ref, SCHEMA.creator, creator_details))

        # change license over to "use-limitations"
        use_limitations_str = dataset_dict.get('use-limitations', '[]')
        dataset_name = dataset_dict.get('name')
        try:
            use_limitations = json.loads(use_limitations_str)
            if use_limitations:
                for use_limitation in use_limitations:
                    creative_work = BNode()
                    g.add((creative_work, RDF.type, SCHEMA.CreativeWork))
                    license_str = "License text for {}".format(dataset_name)
                    g.add((creative_work, SCHEMA.text, Literal(use_limitation)))
                    g.add((creative_work, SCHEMA.name, Literal(license_str)))
                    g.add((dataset_ref, SCHEMA.license, creative_work))
        # NB: json.JSONDecodeError (Python 3) is a subclass of ValueError,
        #     so catching ValueError works on both Python 2 and Python 3
        except ValueError:
            pass

        std_names = dataset_dict.get('cf_standard_names')

        # guard against strings, which are also iterable
        if (std_names is not None and not isinstance(std_names, str)
                and hasattr(std_names, '__iter__')):
            for standard_name in sorted(std_names):
                g.add((dataset_ref, SCHEMA.variableMeasured,
                      Literal(standard_name)))

        spatial_uri = dataset_dict.get('spatial_uri')
        spatial_text = dataset_dict.get('spatial_text')
        spatial_geom = dataset_dict.get('spatial')

        if spatial_uri:
            spatial_ref = CleanedURIRef(spatial_uri)
        else:
            spatial_ref = BNode()

        if spatial_text:
            g.add((dataset_ref, DCT.spatial, spatial_ref))
            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((spatial_ref, RDFS.label, Literal(spatial_text)))

        if spatial_uri or spatial_text or spatial_geom:
            g.add((spatial_ref, RDF.type, SCHEMA.Place))
            g.add((dataset_ref, SCHEMA.spatialCoverage, spatial_ref))

        if spatial_text:
            g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

        if spatial_geom:
            try:
                gj = load_json(spatial_geom)
                bounds = shape(gj).bounds
                # reorder shapely bounds (minx, miny, maxx, maxy) into
                # schema:box order: miny minx maxy maxx
                bbox = [str(bound) for bound in bounds[1::-1] + bounds[:1:-1]]
            except Exception:
                pass
            else:
                bbox_str = ' '.join(bbox)
                geo_shape = BNode()
                g.add((geo_shape, RDF.type, SCHEMA.GeoShape))
                g.add((geo_shape, SCHEMA.box, Literal(bbox_str)))
                # Add bounding box element
                g.add((spatial_ref, SCHEMA.geo, geo_shape))

        # Basic fields
        self._basic_fields_graph(dataset_ref, dataset_dict)

        # Catalog
        self._catalog_graph(dataset_ref, dataset_dict)

        # Publisher
        self.infer_publisher(dataset_dict)
        self._publisher_graph(dataset_ref, dataset_dict)

        # Add contentUrl to Distribution
        for s, p, o in self.g.triples((None, RDF.type, SCHEMA.DataDownload)):
            url = self.g.value(s, SCHEMA.url, None)
            if url:
                g.add((s, SCHEMA.contentUrl, Literal(url)))

        # Identifier
        unique_identifiers = dataset_dict.get('unique-resource-identifier-full', {})
        if unique_identifiers:
            self.g.remove((dataset_ref, SCHEMA.identifier, None))
            for unique_identifier in unique_identifiers:
                if 'doi.org' in unique_identifier.get('authority', '') or not unique_identifier.get('authority'):
                    doi = re.sub(r'^http.*doi\.org/', '', unique_identifier['code'], flags=re.IGNORECASE)  # strip https://doi.org/ and the like
                    if doi and re.match(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', doi, re.IGNORECASE):
                        identifier = BNode()
                        g.add((dataset_ref, SCHEMA.identifier, identifier))
                        self.g.add((identifier, RDF.type, SCHEMA.PropertyValue))
                        self.g.add((identifier, SCHEMA.propertyID, Literal("https://registry.identifiers.org/registry/doi")))
                        self.g.add((identifier, SCHEMA.name, Literal("DOI: %s" % doi)))
                        self.g.add((identifier, SCHEMA.value, Literal("doi:%s" % doi)))
                        self.g.add((identifier, SCHEMA.url, Literal("https://doi.org/%s" % doi)))

        # Temporal
        temporal_extent = load_json(dataset_dict.get('temporal-extent', {}))
        if isinstance(temporal_extent, list):
            temporal_extent = temporal_extent[0] if temporal_extent else {}
        start = temporal_extent.get('begin')
        end = temporal_extent.get('end')
        if start or end:
            if start and end:
                self.g.add((dataset_ref, SCHEMA.temporalCoverage, Literal('%s/%s' % (start, end))))
            elif start:
                self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start)
            elif end:
                self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, end)
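
# Hedged sketch: the DOI handling in the Identifier block above reduces to a
# small, self-contained helper. extract_doi is a hypothetical name used for
# illustration; it is not part of the profile code.
import re


def extract_doi(code):
    """Strip an http(s)://doi.org/ prefix and return the bare DOI, or None."""
    doi = re.sub(r'^http.*doi\.org/', '', code, flags=re.IGNORECASE)
    if re.match(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', doi, re.IGNORECASE):
        return doi
    return None

# e.g. extract_doi('https://doi.org/10.5281/zenodo.123456')
#      -> '10.5281/zenodo.123456'
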
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Dataset

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        ## Simple values
        items = [
            ("title", DCTERMS.title, None, Literal),
            ("name", DCTERMS.identifier, None, Literal),
            ("author", DC.creator, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Description
        dataset_desc = dataset_dict.get("notes")
        if dataset_desc:
            dataset_desc_value = markdown_extract(dataset_desc,
                                                  extract_length=0)
            g.add((dataset_ref, DCTERMS.description,
                   Literal(dataset_desc_value)))

        ## Language
        langs = dataset_dict.get("language")
        if langs:
            for lang in langs:
                language_uri = LANG_PREFIX + lang
                g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

        ## Tags
        for tag in dataset_dict.get("tags", []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

        ## Wikidata keywords
        for keyword in dataset_dict.get("keywords", []):
            g.add((dataset_ref, DCAT.theme, WD[keyword]))

        ## Data Type
        data_types = dataset_dict.get("data_type")
        if data_types:
            for data_type in data_types:
                g.add((dataset_ref, DCTERMS.type,
                       URIRef(DATA_TYPE_PREFIX + data_type)))

        ## Temporal Resolution
        temp_res = dataset_dict.get("temp_res")
        temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
        if temp_res:
            # unknown values are skipped instead of raising a KeyError
            temp_res_value = temp_res_mapping.get(temp_res)
            if temp_res_value:
                g.add((dataset_ref, DCAT.temporalResolution,
                       Literal(temp_res_value, datatype=XSD.duration)))

        ## Start Time, End Time, and Created Time
        items = [("start_time", SCHEMA.startDate, None, Literal),
                 ("end_time", SCHEMA.endDate, None, Literal),
                 ("created_time", DCTERMS.issued, None, Literal)]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Spatial Coverage
        spatial = dataset_dict.get("spatial")
        x_min = dataset_dict.get("x_min")
        x_max = dataset_dict.get("x_max")
        y_min = dataset_dict.get("y_min")
        y_max = dataset_dict.get("y_max")

        if any([spatial, x_min, x_max, y_min, y_max]):
            spatial_ref = BNode()
            g.add((spatial_ref, RDF.type, DCTERMS.Location))
            g.add((dataset_ref, DCTERMS.spatial, spatial_ref))

            if spatial:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial, datatype=GEOJSON_IMT)))

            if all(v is not None for v in (x_min, x_max, y_min, y_max)):
                # schema:box order: south-west lat lon, then north-east lat lon
                box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
                box_ref = BNode()
                g.add((box_ref, RDF.type, SCHEMA.GeoShape))
                g.add((box_ref, SCHEMA.box, Literal(box_value)))
                g.add((spatial_ref, LOCN.geometry, box_ref))

        ## Spatial Resolution
        spatial_res = dataset_dict.get("spatial_res")

        if spatial_res:
            g.add((dataset_ref, DCAT.spatialResolutionInMeters,
                   Literal(spatial_res, datatype=XSD.decimal)))

        ## Process Step
        proc_step = dataset_dict.get("process_step")

        if proc_step:
            proc_step_value = markdown_extract(proc_step, extract_length=0)
            proc_ref = BNode()
            g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
            g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
            g.add((dataset_ref, DCTERMS.provenance, proc_ref))

        ## Project details
        project = dataset_dict.get("organization")

        if project:
            project["description"] = markdown_extract(project["description"],
                                                      extract_length=0)
            project_details = BNode()
            g.add((project_details, RDF.type, ORG.Organization))
            g.add((dataset_ref, DCTERMS.publisher, project_details))
            items = [("title", FOAF.name, None, Literal),
                     ("description", ORG.purpose, None, Literal)]

            self._add_triples_from_dict(project, project_details, items)

        ## Contact details
        contact_person = dataset_dict.get("contact_person")
        contact_email = dataset_dict.get("contact_email")

        if any([contact_person, contact_email]):
            contact_details = BNode()
            g.add((contact_details, RDF.type, VCARD.Individual))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            self._add_triple_from_dict(dataset_dict, contact_details, VCARD.fn,
                                       "contact_person")

            self._add_triple_from_dict(dataset_dict,
                                       contact_details,
                                       VCARD.hasEmail,
                                       "contact_email",
                                       _type=URIRef,
                                       value_modifier=self._add_mailto)

        ## Theme
        themes = dataset_dict.get("groups")

        if themes:
            for theme in themes:
                theme_details = BNode()
                g.add((theme_details, RDF.type, SKOS.Concept))
                g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
                g.add((dataset_ref, DCAT.theme, theme_details))

        # Resources

        ## Depositar defines license in the dataset level
        license = dataset_dict.get("license_url")

        for resource_dict in dataset_dict.get("resources", []):
            distribution = CleanedURIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            ## Simple values
            items = [
                ("name", DCTERMS.title, None, Literal),
                ("description", DCTERMS.description, None, Literal),
                ("encoding", CNT.characterEncoding, None, Literal),
                ("url", DCAT.downloadURL, None, URIRef),
            ]
            self._add_triples_from_dict(resource_dict, distribution, items)

            ## License
            if license:
                g.add((distribution, DCTERMS.license, URIRef(license)))

            ## Coordinate Systems
            crs = resource_dict.get("resource_crs")

            if crs:
                crs_value = EPSG_PREFIX + str(crs)
                g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))

            ## Format (mimetype)
            mimetype = resource_dict.get("mimetype")

            if mimetype:
                mimetype_value = IMT_PREFIX + mimetype
                g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """ Transforms a CKAN dataset dict into DCAT-AP.de data """
        g = self.g

        # bind namespaces to have readable names in RDF Document
        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Simple additional fields
        items = [('qualityProcessURI', DCATDE.qualityProcessURI, None, URIRef),
                 ('metadata_original_html', DCAT.landingPage, None, URIRef),
                 ('politicalGeocodingLevelURI',
                  DCATDE.politicalGeocodingLevelURI, None, URIRef),
                 ('granularity', DCAT.granularity, None, URIRefOrLiteral)]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Additional Lists
        items = [('contributorID', DCATDE.contributorID, None,
                  URIRefOrLiteral),
                 ('politicalGeocodingURI', DCATDE.politicalGeocodingURI,
                  None, URIRef),
                 ('legalbasisText', DCATDE.legalBasis, None, Literal),
                 ('geocodingText', DCATDE.geocodingDescription, None, Literal)]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Add adminUnitL2 for every politicalGeocodingURI value. Compatibility.
        if self._get_dataset_value(dataset_dict, 'politicalGeocodingURI'):
            spatial_ref = BNode()
            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            items = [('politicalGeocodingURI', LOCN.adminUnitL2, None, URIRef)]
            self._add_list_triples_from_dict(dataset_dict, spatial_ref, items)

        # Contacts
        self._add_contact(dataset_dict, dataset_ref, DCATDE.originator,
                          'originator')
        self._add_contact(dataset_dict, dataset_ref, DCATDE.maintainer,
                          'maintainer')
        self._add_contact(dataset_dict, dataset_ref, DCT.contributor,
                          'contributor')
        self._add_contact(dataset_dict, dataset_ref, DCT.creator, 'author')

        # Add maintainer_url to contact_point
        maintainer_url = self._get_dataset_value(dataset_dict,
                                                 'maintainer_url')
        if maintainer_url:
            contact_point = self._get_or_create_contact_point(
                dataset_dict, dataset_ref)
            self._add_triple_from_dict(dataset_dict,
                                       contact_point,
                                       VCARD.hasURL,
                                       'maintainer_url',
                                       _type=URIRef)

        # add maintainer_tel to contact_point
        maintainer_tel = self._get_dataset_value(dataset_dict,
                                                 'maintainer_tel')
        if maintainer_tel:
            contact_point = self._get_or_create_contact_point(
                dataset_dict, dataset_ref)
            self._add_triple_from_dict(dataset_dict,
                                       contact_point,
                                       VCARD.hasTelephone,
                                       'maintainer_tel',
                                       _type=URIRef,
                                       value_modifier=self._add_tel)

        # add maintainer postal data to contact_point if available
        vcard_mapping = {
            'street': VCARD.hasStreetAddress,
            'city': VCARD.hasLocality,
            'zip': VCARD.hasPostalCode,
            'country': VCARD.hasCountryName
        }
        for vc_name in vcard_mapping:
            vcard_fld = self._get_dataset_value(dataset_dict,
                                                'maintainer_' + vc_name)
            if vcard_fld:
                contact_point = self._get_or_create_contact_point(
                    dataset_dict, dataset_ref)
                g.add((contact_point, vcard_mapping[vc_name],
                       Literal(vcard_fld)))

        # Groups
        groups = self._get_dataset_value(dataset_dict, 'groups')
        for group in groups:
            group_name_in_dict = group['name']
            if group_name_in_dict:
                g.add((dataset_ref, DCAT.theme,
                       CleanedURIRef(dcat_theme_prefix +
                                     group_name_in_dict.upper())))

        # used_datasets
        items = [
            ('used_datasets', DCT.relation, None, URIRef),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Enhance Distributions
        for resource_dict in dataset_dict.get('resources', []):
            for distribution in g.objects(dataset_ref, DCAT.distribution):
                # Match distribution in graph and distribution in ckan-dict
                if str(distribution) == resource_uri(resource_dict):
                    items = [('licenseAttributionByText',
                              DCATDE.licenseAttributionByText, None, Literal),
                             ('plannedAvailability',
                              DCATDE.plannedAvailability, None, URIRef)]
                    self._add_triples_from_dict(resource_dict, distribution,
                                                items)