Example #1
def dataontosearch_dataset_delete(context, data_dict):
    '''
    Remove all existing associations between the specified dataset and concepts.

    This will also remove the dataset from DataOntoSearch's data store.

    :param id: Name or ID of the dataset to remove from DataOntoSearch
    :type id: string
    :return: True if the dataset was removed, or False if the dataset was not
        found.
    :rtype: bool
    '''
    toolkit.check_access(u'dataontosearch_dataset_delete', context, data_dict)

    # Extract parameters from data_dict
    dataset_id_or_name = toolkit.get_or_bust(data_dict, u'id')

    # What dataset is specified?
    dataset = toolkit.get_action(u'package_show')(None, {
        u'id': dataset_id_or_name,
    })
    dataset_rdf_uri = dataset_uri(dataset)

    # Make the request
    r = make_tagger_delete_request(u'/dataset', {
        u'dataset_id': dataset_rdf_uri,
    })
    r.raise_for_status()
    data = r.json()

    return data[u'success']
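# Usage sketch (not part of the example above): invoking the action through
# CKAN's plugins toolkit, mirroring the `package_show` call inside it. The
# context and dataset name below are assumed values for illustration.
import ckan.plugins.toolkit as toolkit

removed = toolkit.get_action(u'dataontosearch_dataset_delete')(
    {},                           # context; CKAN fills in defaults
    {u'id': u'example-dataset'},  # name or ID of the dataset (assumed)
)
# `removed` is True if the dataset was deleted, False if it was not found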
Example #2
    def graph_from_dataset(self, dataset_dict):
        '''
        Given a CKAN dataset dict, creates a graph using the loaded profiles

        The class RDFLib graph (accessible via `serializer.g`) will be updated
        by the loaded profiles.

        Returns the reference to the dataset, which will be an rdflib URIRef.
        '''

        # Look for an explicit 'uri' value, first at the top level and then in
        # the extras (dataset_uri() below performs this same lookup)
        uri_value = dataset_dict.get('uri')
        if not uri_value:
            for extra in dataset_dict.get('extras', []):
                if extra['key'] == 'uri':
                    uri_value = extra['value']
                    break

        dataset_ref = URIRef(dataset_uri(dataset_dict))

        for profile_class in self._profiles:
            profile = profile_class(self.g, self.compatibility_mode)
            if hasattr(self, 'validation_mode'):
                profile.validation_mode = self.validation_mode
            profile.graph_from_dataset(dataset_dict, dataset_ref)

        return dataset_ref
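# Usage sketch (assumed setup): serializing a minimal dataset dict with the
# method above. The import path follows ckanext-dcat's processors module; the
# dataset values are stand-ins.
from ckanext.dcat.processors import RDFSerializer

s = RDFSerializer()
dataset_ref = s.graph_from_dataset({
    'id': '0d53419f-2e58-41ef-a19e-5d9c2bb044f5',  # assumed example values
    'name': 'example-dataset',
    'title': 'Example dataset',
})
print(s.g.serialize(format='turtle'))  # s.g now carries the dataset triples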
Example #3
def dataontosearch_tag_delete(context, data_dict):
    '''
    Remove an existing association between the specified dataset and concept.

    :param dataset: Name or ID of the dataset to disassociate with a concept
    :type dataset: string
    :param concept: RDF URI or human-readable label for the concept to no longer
        associate with the dataset
    :type concept: string
    :return: True
    :rtype: bool
    '''
    toolkit.check_access(u'dataontosearch_tag_delete', context, data_dict)

    # Extract parameters from data_dict
    dataset_id_or_name = toolkit.get_or_bust(data_dict, u'dataset')
    concept_url_or_label = toolkit.get_or_bust(data_dict, u'concept')

    # What dataset is specified?
    dataset = toolkit.get_action(u'package_show')(None, {
        u'id': dataset_id_or_name,
    })
    dataset_rdf_uri = dataset_uri(dataset)

    # Make the request
    r = make_tagger_delete_request(u'/tag', {
        u'dataset_id': dataset_rdf_uri,
        u'concept': concept_url_or_label,
    })
    r.raise_for_status()
    data = r.json()

    return data[u'success']
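# Usage sketch (assumed values): removing a single dataset/concept
# association via the action above.
import ckan.plugins.toolkit as toolkit

toolkit.get_action(u'dataontosearch_tag_delete')(
    {},
    {
        u'dataset': u'example-dataset',                  # name or ID (assumed)
        u'concept': u'http://example.com/onto#Vehicle',  # URI or label (assumed)
    },
)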
Example #4
    def _csc_dataset_uri(self, dataset_dict):
        '''
        Returns a URI for the dataset
    
        This will be used to uniquely reference the dataset on the RDF
        serializations.
    
        The value will be the first found of:
    
            1. `catalog_uri()` + '/catalogo/' + `name` field
            2. The value of the `uri` field
            3. The value of an extra with key `uri`
            4. `catalog_uri()` + '/catalogo/' + `id` field
    
        Check the documentation for `catalog_uri()` for the recommended ways of
        setting it.
    
        Returns a string with the dataset URI.
        '''

        uri = None
        if dataset_dict.get('name'):
            uri = '{0}/catalogo/{1}'.format(catalog_uri().rstrip('/'),
                                            dataset_dict['name'])
        if not uri:
            uri = dataset_uri(dataset_dict)
        return uri
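    # Illustration of the fallback order above (hypothetical values): a
    # dataset with a `name` gets the catalogue-based URI, anything else
    # falls back to dataset_uri(), which checks the `uri` field, the `uri`
    # extra, and finally builds one from the `id`:
    #
    #   self._csc_dataset_uri({'name': 'my-ds'})
    #   -> 'http://example.com/catalogo/my-ds'   (catalog_uri() assumed)
    #
    #   self._csc_dataset_uri({'id': '1234', 'name': ''})
    #   -> dataset_uri({'id': '1234', 'name': ''})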
Example #5
    def test_graph_from_dataset(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'notes': 'Lorem ipsum',
            'url': 'http://example.com/ds1',
            'version': '1.0b',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'extras': [
                {'key': 'alternate_identifier', 'value': 'xyz'},
                {'key': 'version_notes', 'value': 'This is a beta version'},
                {'key': 'frequency', 'value': 'monthly'},
                {'key': 'language', 'value': '[\"en\"]'},
                {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'},
                {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'},

            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, OWL.versionInfo, dataset['version'])
        assert self._triple(g, dataset_ref, ADMS.versionNotes, extras['version_notes'])
        assert self._triple(g, dataset_ref, ADMS.identifier, extras['alternate_identifier'])
        assert self._triple(g, dataset_ref, DCT.accrualPeriodicity, extras['frequency'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])

        # Dates
        assert self._triple(g, dataset_ref, DCT.issued, dataset['metadata_created'], XSD.dateTime)
        assert self._triple(g, dataset_ref, DCT.modified, dataset['metadata_modified'], XSD.dateTime)

        # List
        for item in [
            ('language', DCT.language),
            ('theme', DCAT.theme),
            ('conforms_to', DCAT.conformsTo),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], value)
Example #6
def dataontosearch_tag_list(context, data_dict):
    '''
    List concepts associated with the specified dataset.

    :param id: id or name of the dataset to fetch tags for
    :type id: string
    :rtype: list of concepts. Each concept is a dict, with 'label' being its
        human-readable label and 'uri' being the URI identifying the concept
    '''
    toolkit.check_access(u'dataontosearch_tag_list', context, data_dict)

    # What dataset is specified?
    dataset_id_or_name = toolkit.get_or_bust(data_dict, u'id')
    dataset = toolkit.get_action(u'package_show')(None, {
        u'id': dataset_id_or_name
    })

    # Generate the RDF URI for this dataset, using the very same code used by
    # ckanext-dcat. We need this to be consistent with what DataOntoSearch found
    # when it retrieved the dataset RDF, thus this use of the internal DCAT API.
    dataset_rdf_uri = dataset_uri(dataset)

    r = make_tagger_get_request(u'/tag', {u'dataset_id': dataset_rdf_uri})
    r.raise_for_status()

    data = r.json()

    if data is None:
        return []
    else:
        return data[u'concepts']
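# Usage sketch (assumed dataset name): listing the concepts for a dataset via
# the action above; each returned concept carries a 'label' and a 'uri'.
import ckan.plugins.toolkit as toolkit

concepts = toolkit.get_action(u'dataontosearch_tag_list')(
    {},
    {u'id': u'example-dataset'},
)
for concept in concepts:
    print(concept[u'label'], concept[u'uri'])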
Example #7
    def test_graph_from_dataset(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{
                'name': 'Tag 1'
            }, {
                'name': 'Tag 2'
            }],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01',
            'temporal_end': '2016-11-30',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': '{ECON,ENVI}',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of':
            'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'conforms_to': '{CONF1,CONF2,CONF3}'
        }

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCATAPIT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, DCT.identifier,
                            dataset['identifier'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])
Example #8
    def _extras(self):
        '''
        Yields some additional triples that don't really fit under any of the other existing method
        groupings.

        :return: yields triples
        '''
        yield (self.record_ref, self.namespaces.aiiso.Department,
               Literal(get_department(self.record[u'collectionCode'])))
        yield self.record_ref, self.namespaces.aiiso.Division, self._get_value(u'subDepartment')

        yield (self.record_ref, self.namespaces.void.inDataset,
               URIRef(dataset_uri({u'id': self.resource.get_package_id()}) + u'#dataset'))
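    # Consumption sketch (assumed caller): the generator above yields plain
    # (subject, predicate, object) tuples, so a caller can feed them straight
    # into an rdflib graph:
    #
    #   for triple in self._extras():
    #       graph.add(triple)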
Example #9
    def graph_from_dataset(self, dataset_dict):
        '''
        Given a CKAN dataset dict, creates a graph using the loaded profiles

        The class RDFLib graph (accessible via `serializer.g`) will be updated
        by the loaded profiles.

        Returns the reference to the dataset, which will be an rdflib URIRef.
        '''

        # Look for an explicit 'uri' value, first at the top level and then in
        # the extras (dataset_uri() below performs this same lookup)
        uri_value = dataset_dict.get('uri')
        if not uri_value:
            for extra in dataset_dict.get('extras', []):
                if extra['key'] == 'uri':
                    uri_value = extra['value']
                    break

        dataset_ref = URIRef(dataset_uri(dataset_dict))

        for profile_class in self._profiles:
            profile = profile_class(self.g, self.compatibility_mode)
            profile.graph_from_dataset(dataset_dict, dataset_ref)

        return dataset_ref
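    # Usage sketch: the profiles iterated above are selected when the
    # serializer is constructed, e.g. (profile name assumed, matching
    # ckanext-dcat's default):
    #
    #   s = RDFSerializer(profiles=['euro_dcat_ap'])
    #   dataset_ref = s.graph_from_dataset(dataset_dict)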
Example #10
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        title = dataset_dict.get('title')

        g = self.g

        for prefix, namespace in it_namespaces.iteritems():
            g.bind(prefix, namespace)

        ### add a further type for the Dataset node
        g.add((dataset_ref, RDF.type, DCATAPIT.Dataset))

        ### replace themes
        value = self._get_dict_value(dataset_dict, 'theme')
        self._add_themes(dataset_ref, value)

        ### replace languages
        value = self._get_dict_value(dataset_dict, 'language')
        if value:
            for lang in value.split(','):
                self.g.remove((dataset_ref, DCT.language, Literal(lang)))
                lang = lang.replace('{', '').replace('}', '')
                self.g.add(
                    (dataset_ref, DCT.language, URIRef(LANG_BASE_URI + lang)))
                # self._add_concept(LANG_CONCEPTS, lang)

        ### add spatial (EU URI)
        value = self._get_dict_value(dataset_dict, 'geographical_name')
        if value:
            for gname in value.split(','):
                gname = gname.replace('{', '').replace('}', '')

                dct_location = BNode()
                self.g.add((dataset_ref, DCT.spatial, dct_location))

                self.g.add((dct_location, RDF['type'], DCT.Location))

                # Try and add a Concept from the spatial vocabulary
                if self._add_concept(GEO_CONCEPTS, gname):
                    self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                                Literal(GEO_BASE_URI + gname)))

                    # geo concept is not really required, but may be a useful addition
                    self.g.add((dct_location, LOCN.geographicalName,
                                URIRef(GEO_BASE_URI + gname)))
                else:
                    # The dataset field is not a controlled tag, let's create a Concept out of the label we have
                    concept = BNode()
                    self.g.add((concept, RDF['type'], SKOS.Concept))
                    self.g.add((concept, SKOS.prefLabel, Literal(gname)))
                    self.g.add((dct_location, LOCN.geographicalName, concept))

        ### add spatial (GeoNames)
        value = self._get_dict_value(dataset_dict, 'geographical_geonames_url')
        if value:
            dct_location = BNode()
            self.g.add((dataset_ref, DCT.spatial, dct_location))

            self.g.add((dct_location, RDF['type'], DCT.Location))
            self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                        Literal(value)))

        ### replace periodicity
        self._remove_node(dataset_dict, dataset_ref,
                          ('frequency', DCT.accrualPeriodicity, None, Literal))
        self._add_uri_node(
            dataset_dict, dataset_ref,
            ('frequency', DCT.accrualPeriodicity, DEFAULT_FREQ_CODE, URIRef),
            FREQ_BASE_URI)
        # self._add_concept(FREQ_CONCEPTS, dataset_dict.get('frequency', DEFAULT_VOCABULARY_KEY))

        ### replace landing page
        self._remove_node(dataset_dict, dataset_ref,
                          ('url', DCAT.landingPage, None, URIRef))
        landing_page_uri = None
        if dataset_dict.get('name'):
            landing_page_uri = '{0}/dataset/{1}'.format(
                catalog_uri().rstrip('/'), dataset_dict['name'])
        else:
            landing_page_uri = dataset_uri(
                dataset_dict)  # TODO: preserve original URI if harvested

        self.g.add((dataset_ref, DCAT.landingPage, URIRef(landing_page_uri)))

        ### conformsTo
        self.g.remove((dataset_ref, DCT.conformsTo, None))
        value = self._get_dict_value(dataset_dict, 'conforms_to')
        if value:
            try:
                conforms_to = json.loads(value)
            except (
                    TypeError,
                    ValueError,
            ):
                log.warn("Cannot deserialize DCATAPIT:conformsTo value: %s",
                         value)
                conforms_to = []

            for item in conforms_to:
                standard = URIRef(item['uri']) if item.get('uri') else BNode()

                self.g.add((dataset_ref, DCT.conformsTo, standard))
                self.g.add((standard, RDF['type'], DCT.Standard))
                self.g.add((standard, RDF['type'], DCATAPIT.Standard))

                self.g.add(
                    (standard, DCT.identifier, Literal(item['identifier'])))

                for lang, val in (item.get('title') or {}).items():
                    if lang in OFFERED_LANGS:
                        self.g.add(
                            (standard, DCT.title,
                             Literal(val,
                                     lang=lang_mapping_ckan_to_xmllang.get(
                                         lang, lang))))

                for lang, val in (item.get('description') or {}).items():
                    if lang in OFFERED_LANGS:
                        self.g.add(
                            (standard, DCT.description,
                             Literal(val,
                                     lang=lang_mapping_ckan_to_xmllang.get(
                                         lang, lang))))

                for reference_document in (item.get('referenceDocumentation')
                                           or []):
                    self.g.add((standard, DCATAPIT.referenceDocumentation,
                                URIRef(reference_document)))

        ### ADMS:identifier alternative identifiers
        self.g.remove((
            dataset_ref,
            ADMS.identifier,
            None,
        ))
        try:
            alt_ids = json.loads(dataset_dict['alternate_identifier'])
        except (
                KeyError,
                TypeError,
                ValueError,
        ):
            alt_ids = []

        for alt_identifier in alt_ids:
            node = BNode()
            self.g.add((dataset_ref, ADMS.identifier, node))

            identifier = Literal(alt_identifier['identifier'])
            self.g.add((node, SKOS.notation, identifier))

            if alt_identifier.get('agent'):
                adata = alt_identifier['agent']
                agent = BNode()

                self.g.add((agent, RDF['type'], DCATAPIT.Agent))
                self.g.add((agent, RDF['type'], FOAF.Agent))
                self.g.add((node, DCT.creator, agent))
                if adata.get('agent_name'):
                    for alang, aname in adata['agent_name'].items():
                        self.g.add((agent, FOAF.name, Literal(aname,
                                                              lang=alang)))

                if adata.get('agent_identifier'):
                    self.g.add((agent, DCT.identifier,
                                Literal(adata['agent_identifier'])))

        self._set_temporal_coverage(self.g, dataset_dict, dataset_ref)

        ### publisher

        # DCAT by default creates this node
        # <dct:publisher>
        #   <foaf:Organization rdf:about="http://10.10.100.75/organization/55535226-f82a-4cf7-903a-3e10afeaa79a">
        #     <foaf:name>orga2_test</foaf:name>
        #   </foaf:Organization>
        # </dct:publisher>

        for s, p, o in g.triples((dataset_ref, DCT.publisher, None)):
            #log.info("Removing publisher %r", o)
            g.remove((s, p, o))

        publisher_ref = self._add_agent(dataset_dict,
                                        dataset_ref,
                                        'publisher',
                                        DCT.publisher,
                                        use_default_lang=True)

        ### Autore : Agent
        self._add_creators(dataset_dict, dataset_ref)

        ### Point of Contact

        # <dcat:contactPoint rdf:resource="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri"/>

        # <!-- http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri -->
        # <dcatapit:Organization rdf:about="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri">
        #    <rdf:type rdf:resource="&vcard;Kind"/>
        #    <rdf:type rdf:resource="&vcard;Organization"/>
        #    <vcard:hasEmail rdf:resource="mailto:[email protected]"/>
        #    <vcard:fn>Regione Liguria - Sportello Cartografico</vcard:fn>
        # </dcatapit:Organization>

        # TODO: preserve original info if harvested

        # retrieve the contactPoint added by the euro serializer
        euro_poc = g.value(subject=dataset_ref,
                           predicate=DCAT.contactPoint,
                           object=None,
                           any=False)

        # euro poc has this format:
        # <dcat:contactPoint>
        #    <vcard:Organization rdf:nodeID="Nfcd06f452bcd41f48f33c45b0c95979e">
        #       <vcard:fn>THE ORGANIZATION NAME</vcard:fn>
        #       <vcard:hasEmail>THE ORGANIZATION EMAIL</vcard:hasEmail>
        #    </vcard:Organization>
        # </dcat:contactPoint>

        if euro_poc:
            g.remove((dataset_ref, DCAT.contactPoint, euro_poc))

        org_id = dataset_dict.get('owner_org')

        # get orga info
        org_show = logic.get_action('organization_show')

        org_dict = {}
        if org_id:
            try:
                org_dict = org_show({'ignore_auth': True}, {
                    'id': org_id,
                    'include_datasets': False,
                    'include_tags': False,
                    'include_users': False,
                    'include_groups': False,
                    'include_extras': True,
                    'include_followers': False
                })
            except Exception as err:
                log.warning("Cannot get org for %s: %s",
                            org_id,
                            err,
                            exc_info=err)
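    # Usage sketch: like any DCAT profile method, the one above is not called
    # directly; the serializer invokes it with the dataset dict and its
    # URIRef, roughly like this (class name assumed):
    #
    #   profile = ItalianDCATAPProfile(g, compatibility_mode=False)
    #   profile.graph_from_dataset(dataset_dict, URIRef(dataset_uri(dataset_dict)))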
Example #11
    def test_graph_from_dataset(self):

        dataset = {
            "id": "4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6",
            "name": "test-dataset",
            "title": "Test DCAT dataset",
            "notes": "Lorem ipsum",
            "url": "http://example.com/ds1",
            "version": "1.0b",
            "metadata_created": "2015-06-26T15:21:09.034694",
            "metadata_modified": "2015-06-26T15:21:09.075774",
            "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}],
            "extras": [
                {"key": "alternate_identifier", "value": '["xyz", "abc"]'},
                {"key": "version_notes", "value": "This is a beta version"},
                {"key": "frequency", "value": "monthly"},
                {"key": "language", "value": '["en"]'},
                {"key": "theme", "value": '["http://eurovoc.europa.eu/100142", "http://eurovoc.europa.eu/100152"]'},
                {"key": "conforms_to", "value": '["Standard 1", "Standard 2"]'},
                {"key": "access_rights", "value": "public"},
                {"key": "documentation", "value": '["http://dataset.info.org/doc1", "http://dataset.info.org/doc2"]'},
                {"key": "provenance", "value": "Some statement about provenance"},
                {"key": "dcat_type", "value": "test-type"},
                {
                    "key": "related_resource",
                    "value": '["http://dataset.info.org/related1", "http://dataset.info.org/related2"]',
                },
                {
                    "key": "has_version",
                    "value": '["https://data.some.org/catalog/datasets/derived-dataset-1", "https://data.some.org/catalog/datasets/derived-dataset-2"]',
                },
                {"key": "is_version_of", "value": '["https://data.some.org/catalog/datasets/original-dataset"]'},
                {
                    "key": "source",
                    "value": '["https://data.some.org/catalog/datasets/source-dataset-1", "https://data.some.org/catalog/datasets/source-dataset-2"]',
                },
                {
                    "key": "sample",
                    "value": '["https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample"]',
                },
            ],
        }
        extras = self._extras(dataset)

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset["title"])
        assert self._triple(g, dataset_ref, DCT.description, dataset["notes"])

        assert self._triple(g, dataset_ref, OWL.versionInfo, dataset["version"])
        assert self._triple(g, dataset_ref, ADMS.versionNotes, extras["version_notes"])
        assert self._triple(g, dataset_ref, DCT.accrualPeriodicity, extras["frequency"])
        assert self._triple(g, dataset_ref, DCT.accessRights, extras["access_rights"])
        assert self._triple(g, dataset_ref, DCT.provenance, extras["provenance"])
        assert self._triple(g, dataset_ref, DCT.type, extras["dcat_type"])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset["tags"]:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag["name"])

        # Dates
        assert self._triple(g, dataset_ref, DCT.issued, dataset["metadata_created"], XSD.dateTime)
        assert self._triple(g, dataset_ref, DCT.modified, dataset["metadata_modified"], XSD.dateTime)

        # List
        for item in [
            ("language", DCT.language, Literal),
            ("theme", DCAT.theme, URIRef),
            ("conforms_to", DCT.conformsTo, Literal),
            ("alternate_identifier", ADMS.identifier, Literal),
            ("documentation", FOAF.page, Literal),
            ("related_resource", DCT.relation, Literal),
            ("has_version", DCT.hasVersion, Literal),
            ("is_version_of", DCT.isVersionOf, Literal),
            ("source", DCT.source, Literal),
            ("sample", ADMS.sample, Literal),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], item[2](value))
Example #12
    def test_graph_from_dataset(self):

        dataset = {
            'id':
            '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name':
            'test-dataset',
            'title':
            'Test DCAT dataset',
            'url':
            'http://example.com/ds1',
            'version':
            '1.0b',
            'metadata_created':
            '2015-06-26T15:21:09.034694',
            'metadata_modified':
            '2015-06-26T15:21:09.075774',
            'keywords': {
                'fr': [],
                'de': ['alter', 'sozialhilfe'],
                'en': ['age'],
                'it': []
            },
            'groups': [{
                'display_name': {
                    'fr': 'Economie nationale',
                    'de': 'Volkswirtschaft',
                    'en': 'National economy',
                    'it': 'Economia'
                },
                'description': {
                    'fr': '',
                    'de': '',
                    'en': 'some description',
                    'it': ''
                },
                'image_display_url': '',
                'title': {
                    'fr': 'Economie nationale',
                    'de': 'Volkswirtschaft',
                    'en': 'National economy',
                    'it': 'Economia'
                },
                'id': '5389c3f2-2f64-436b-9fac-2d1fc342f7b5',
                'name': 'national-economy'
            }, {
                'display_name': {
                    'fr': 'Education, science',
                    'de': 'Bildung, Wissenschaft',
                    'en': 'Education and science',
                    'it': 'Formazione e scienza'
                },
                'description': {
                    'fr': '',
                    'de': '',
                    'en': '',
                    'it': ''
                },
                'image_display_url': '',
                'title': {
                    'fr': 'Education, science',
                    'de': 'Bildung, Wissenschaft',
                    'en': 'Education and science',
                    'it': 'Formazione e scienza'
                },
                'id': 'afcb4a2a-b4b0-4d7c-984a-9078e964be49',
                'name': 'education'
            }, {
                'display_name': {
                    'fr': 'Finances',
                    'de': 'Finanzen',
                    'en': 'Finances',
                    'it': 'Finanze'
                },
                'description': {
                    'fr': '',
                    'de': '',
                    'en': '',
                    'it': ''
                },
                'image_display_url': '',
                'title': {
                    'fr': 'Finances',
                    'de': 'Finanzen',
                    'en': 'Finances',
                    'it': 'Finanze'
                },
                'id': '79cbe120-e9c6-4249-b934-58ca980606d7',
                'name': 'finances'
            }],
            'description': {
                'fr': '',
                'de': 'Deutsche Beschreibung',
                'en': 'English Description',
                'it': ''
            },
            'extras': [
                {
                    'key': 'alternate_identifier',
                    'value': '[\"xyz\", \"abc\"]'
                },
                {
                    'key': 'identifier',
                    'value': '26be5452-fc5c-11e7-8450-fea9aa178066'
                },
                {
                    'key': 'version_notes',
                    'value': 'This is a beta version'
                },
                {
                    'key': 'frequency',
                    'value': 'monthly'
                },
                {
                    'key': 'language',
                    'value': '[\"en\"]'
                },
                {
                    'key':
                    'theme',
                    'value':
                    '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'
                },
                {
                    'key': 'conforms_to',
                    'value': '[\"Standard 1\", \"Standard 2\"]'
                },
                {
                    'key': 'access_rights',
                    'value': 'public'
                },
                {
                    'key':
                    'documentation',
                    'value':
                    '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'
                },
                {
                    'key': 'provenance',
                    'value': 'Some statement about provenance'
                },
                {
                    'key': 'dcat_type',
                    'value': 'test-type'
                },
                {
                    'key':
                    'related_resource',
                    'value':
                    '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'
                },
                {
                    'key':
                    'has_version',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'
                },
                {
                    'key':
                    'is_version_of',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/original-dataset\"]'
                },
                {
                    'key':
                    'source',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'
                },
                {
                    'key':
                    'sample',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'
                },
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer(profiles=['swiss_schemaorg'])
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, SCHEMA.Dataset)
        assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title'])
        assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version'])
        assert self._triple(g, dataset_ref, SCHEMA.identifier,
                            extras['identifier'])

        # Dates
        assert self._triple(g, dataset_ref, SCHEMA.datePublished,
                            dataset['metadata_created'])
        assert self._triple(g, dataset_ref, SCHEMA.dateModified,
                            dataset['metadata_modified'])

        for key, value in dataset['description'].iteritems():
            if dataset['description'].get(key):
                assert self._triple(g, dataset_ref, SCHEMA.description,
                                    Literal(value, lang=key))
        eq_(
            len([
                t for t in g.triples((dataset_ref, SCHEMA.description, None))
            ]), 2)

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, SCHEMA.keywords, None))]),
            3)
        for key, keywords in dataset['keywords'].iteritems():
            if dataset['keywords'].get(key):
                for keyword in keywords:
                    assert self._triple(g, dataset_ref, SCHEMA.keywords,
                                        Literal(keyword, lang=key))

        # List
        for item in [
            ('language', SCHEMA.inLanguage, Literal),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]),
                len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], item[2](value))
Example #13
    def test_graph_from_dataset(self):

        dataset = {
            'id':
            '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name':
            'test-dataset',
            'title':
            'Test DCAT dataset',
            'notes':
            'Lorem ipsum',
            'url':
            'http://example.com/ds1',
            'version':
            '1.0b',
            'metadata_created':
            '2015-06-26T15:21:09.034694',
            'metadata_modified':
            '2015-06-26T15:21:09.075774',
            'tags': [{
                'name': 'Tag 1'
            }, {
                'name': 'Tag 2'
            }],
            'extras': [
                {
                    'key': 'alternate_identifier',
                    'value': 'xyz'
                },
                {
                    'key': 'version_notes',
                    'value': 'This is a beta version'
                },
                {
                    'key': 'frequency',
                    'value': 'monthly'
                },
                {
                    'key': 'language',
                    'value': '[\"en\"]'
                },
                {
                    'key':
                    'theme',
                    'value':
                    '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'
                },
                {
                    'key': 'conforms_to',
                    'value': '[\"Standard 1\", \"Standard 2\"]'
                },
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, OWL.versionInfo,
                            dataset['version'])
        assert self._triple(g, dataset_ref, ADMS.versionNotes,
                            extras['version_notes'])
        assert self._triple(g, dataset_ref, ADMS.identifier,
                            extras['alternate_identifier'])
        assert self._triple(g, dataset_ref, DCT.accrualPeriodicity,
                            extras['frequency'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])

        # Dates
        assert self._triple(g, dataset_ref, DCT.issued,
                            dataset['metadata_created'], XSD.dateTime)
        assert self._triple(g, dataset_ref, DCT.modified,
                            dataset['metadata_modified'], XSD.dateTime)

        # List
        for item in [
            ('language', DCT.language),
            ('theme', DCAT.theme),
            ('conforms_to', DCAT.conformsTo),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]),
                len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], value)
Example #14
    def test_graph_from_dataset(self):

        dataset = {
            'id':
            '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name':
            'test-dataset',
            'title':
            'Test DCAT dataset',
            'notes':
            'Lorem ipsum',
            'url':
            'http://example.com/ds1',
            'version':
            '1.0b',
            'metadata_created':
            '2015-06-26T15:21:09.034694',
            'metadata_modified':
            '2015-06-26T15:21:09.075774',
            'tags': [{
                'name': 'Tag 1'
            }, {
                'name': 'Tag 2'
            }],
            'extras': [
                {
                    'key': 'alternate_identifier',
                    'value': '[\"xyz\", \"abc\"]'
                },
                {
                    'key': 'version_notes',
                    'value': 'This is a beta version'
                },
                {
                    'key': 'frequency',
                    'value': 'monthly'
                },
                {
                    'key':
                    'language',
                    'value':
                    '[\"en\", \"http://publications.europa.eu/resource/authority/language/ITA\"]'
                },
                {
                    'key':
                    'theme',
                    'value':
                    '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'
                },
                {
                    'key': 'conforms_to',
                    'value': '[\"Standard 1\", \"Standard 2\"]'
                },
                {
                    'key': 'access_rights',
                    'value': 'public'
                },
                {
                    'key':
                    'documentation',
                    'value':
                    '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'
                },
                {
                    'key': 'provenance',
                    'value': 'Some statement about provenance'
                },
                {
                    'key': 'dcat_type',
                    'value': 'test-type'
                },
                {
                    'key':
                    'related_resource',
                    'value':
                    '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'
                },
                {
                    'key':
                    'has_version',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'
                },
                {
                    'key':
                    'is_version_of',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/original-dataset\"]'
                },
                {
                    'key':
                    'source',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'
                },
                {
                    'key':
                    'sample',
                    'value':
                    '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'
                },
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, OWL.versionInfo,
                            dataset['version'])
        assert self._triple(g, dataset_ref, ADMS.versionNotes,
                            extras['version_notes'])
        assert self._triple(g, dataset_ref, DCT.accrualPeriodicity,
                            extras['frequency'])
        assert self._triple(g, dataset_ref, DCT.accessRights,
                            extras['access_rights'])
        assert self._triple(g, dataset_ref, DCT.provenance,
                            extras['provenance'])
        assert self._triple(g, dataset_ref, DCT.type, extras['dcat_type'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])

        # Dates
        assert self._triple(g, dataset_ref, DCT.issued,
                            dataset['metadata_created'], XSD.dateTime)
        assert self._triple(g, dataset_ref, DCT.modified,
                            dataset['metadata_modified'], XSD.dateTime)

        # List
        for item in [
            ('language', DCT.language, [Literal, URIRef]),
            ('theme', DCAT.theme, URIRef),
            ('conforms_to', DCT.conformsTo, Literal),
            ('alternate_identifier', ADMS.identifier, Literal),
            ('documentation', FOAF.page, URIRef),
            ('related_resource', DCT.relation, URIRef),
            ('has_version', DCT.hasVersion, URIRef),
            ('is_version_of', DCT.isVersionOf, URIRef),
            ('source', DCT.source, Literal),
            ('sample', ADMS.sample, Literal),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]),
                len(values))
            for num, value in enumerate(values):
                _type = item[2]
                if isinstance(item[2], list):
                    eq_(len(item[2]), len(values))
                    _type = item[2][num]
                assert self._triple(g, dataset_ref, item[1], _type(value))
Example #15
    def test_graph_from_dataset(self):

        conforms_to_in = [{'identifier': 'CONF1',
                           'uri': 'conf01',
                           'title': {'en': 'title', 'it': 'title'},
                           'referenceDocumentation': ['http://abc.efg/']},
                          {'identifier': 'CONF2',
                           'title': {'en': 'title', 'it': 'title'},
                           'description': {'en': 'descen', 'it': 'descit'},
                           'referenceDocumentation': ['http://abc.efg/']}]

        alternate_identifiers = [{'identifier': 'aaaabc',
                                 'agent': {'agent_identifier': 'agent01',
                                           'agent_name': {'en': 'Agent en 01', 'it': 'Agent it 01'}},
                                 },
                                 {'identifier': 'other identifier', 'agent': {}}]
        creators = [{'creator_name': {'en': 'abc'}, 'creator_identifier': "ABC"},
                    {'creator_name': {'en': 'cde'}, 'creator_identifier': "CDE"},
                    ]

        temporal_coverage = [{'temporal_start': '2001-01-01', 'temporal_end': '2001-02-01 10:11:12'},
                             {'temporal_start': '2001-01-01', 'temporal_end': '2001-02-01 11:12:13'},
                            ]

        subthemes = [{'theme': 'AGRI', 'subthemes': ['http://eurovoc.europa.eu/100253',
                                                     'http://eurovoc.europa.eu/100258']},
                     {'theme': 'ENVI', 'subthemes': []}]

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01',
            'temporal_end': '2016-11-30',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': json.dumps(alternate_identifiers),
            'temporal_coverage': json.dumps(temporal_coverage),
            # 'theme': 'ECON',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'conforms_to': json.dumps(conforms_to_in),
            'creator': json.dumps(creators),
            'theme': json.dumps(subthemes),
        }
        
        pkg_id = dataset['id']
        
        pub_names = {'it': 'IT publisher',
                     'en': 'EN publisher'}
        holder_names = {'it': 'IT holder name',
                        'en': 'EN holder name'}

        multilang_fields = [('publisher_name', 'package', k, v) for k, v in pub_names.items()] +\
                           [('holder_name', 'package', k, v) for k, v in holder_names.items()]
        
        pkg = helpers.call_action('package_create', {'defer_commit': True}, **dataset)
        rev = getattr(Session, 'revision', repo.new_revision())
        Session.flush()
        Session.revision = rev
        pkg_id = pkg['id']

        for field_name, field_type, lang, text in multilang_fields:
            interfaces.upsert_package_multilang(pkg_id, field_name, field_type, lang, text)

        loc_dict = interfaces.get_for_package(pkg_id)
        #assert loc_dict['publisher_name'] == pub_names
        #assert loc_dict['holder_name'] == holder_names


        # temporary bug for compatibility with interfaces.get_language(),
        # which will return lang[0]
        pub_names.update({DEFAULT_LANG: dataset['publisher_name']})
        # pub_names.update({DEFAULT_LANG[0]: dataset['publisher_name']})
        holder_names.update({DEFAULT_LANG: dataset['holder_name']})
        # holder_names.update({DEFAULT_LANG[0]: dataset['holder_name']})
        
        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(unicode(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCATAPIT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, DCT.identifier, dataset['identifier'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])
        
        # conformsTo
        conforms_to = list(g.triples((None, DCT.conformsTo, None)))
        assert conforms_to

        conforms_to_dict = dict((d['identifier'], d) for d in conforms_to_in)
        for conf in conforms_to:
            conf_id = conf[-1]

            identifier = g.value(conf_id, DCT.identifier)
            titles = list(g.objects(conf_id, DCT.title))
            descs = list(g.objects(conf_id, DCT.description))
            references = list(g.objects(conf_id, DCATAPIT.referenceDocumentation))
            
            check = conforms_to_dict.get(str(identifier))
            
            assert isinstance(check, dict)

            if check.get('uri'):
                assert check['uri'] == str(conf_id)
            assert len(titles), "missing titles"
            
            assert (len(descs) > 0) == bool(check.get('description')), "missing descriptions"

            for title in titles:
                tlang = title.language
                tval = str(title)
                assert tval == check['title'][tlang], (tlang, tval, check['title'])

            for desc in descs:
                tlang = desc.language
                tval = str(desc)
                assert tval == check['description'][tlang], (tlang, str(tval), check['description'])
            
            ref_docs = check.get('referenceDocumentation')
            assert len(references) == len(ref_docs), "missing reference documentation"
            
            for dref in references:
                assert str(dref) in ref_docs, "{} not in {}".format(dref, ref_docs)
                                                                
            for ref in ref_docs:
                assert URIRef(ref) in references

        # alternate identifiers
        alt_ids = [a[-1] for a in g.triples((None, ADMS.identifier, None))]
        alt_ids_dict = dict((a['identifier'], a) for a in alternate_identifiers)

        for alt_id in alt_ids:
            identifier = g.value(alt_id, SKOS.notation)
            check = alt_ids_dict[str(identifier)]
            assert str(identifier) == check['identifier']
            if check.get('agent'):
                agent_ref = g.value(alt_id, DCT.creator)
                assert agent_ref is not None

                agent_identifier = g.value(agent_ref, DCT.identifier)

                agent_name = dict((v.language, str(v)) for v in g.objects(agent_ref, FOAF.name))
                
                assert set(agent_name.items()) == set(check['agent']['agent_name'].items()),\
                    "expected {}, got {} for {}".format(check['agent']['agent_name'], agent_name, agent_ref)

                assert str(agent_identifier) == check['agent']['agent_identifier'],\
                    "expected {}, got {}".format(check['agent']['agent_identifier'], agent_identifier)
        # creators
        creators.append({'creator_name': {'en': 'test'},
                         'creator_identifier': '412946129'})
        creators_in = list(g.objects(dataset_ref, DCT.creator))
        assert len(creators) == len(creators_in)

        for cref in creators_in:
            cnames = dict((str(c.language) if c.language else DEFAULT_LANG, str(c)) for c in g.objects(cref, FOAF.name))
            c_identifier = g.value(cref, DCT.identifier)
            c_dict = {'creator_name': cnames,
                      'creator_identifier': str(c_identifier)}
            assert c_dict in creators, "no {} in {}".format(c_dict, creators)

        # temporal coverage
        temporal_coverage.append({'temporal_start': dataset['temporal_start'],
                                  'temporal_end': dataset['temporal_end']})
        temp_exts = list(g.triples((dataset_ref, DCT.temporal, None)))
        assert len(temp_exts) == len(temporal_coverage)
        
        # normalize values
        for item in temporal_coverage:
            for k, v in item.items():
                item[k] = pdate(v)

        temp_ext = []
        for interval_t in temp_exts:
            interval = interval_t[-1]
            start = g.value(interval, SCHEMA.startDate)
            end = g.value(interval, SCHEMA.endDate)
            assert start is not None
            assert end is not None
            temp_ext.append({'temporal_start': pdate(str(start)),
                             'temporal_end': pdate(str(end))})

        set1 = set([tuple(d.items()) for d in temp_ext])
        set2 = set([tuple(d.items()) for d in temporal_coverage])
        assert set1 == set2, "Got different temporal coverage sets: \n{}\n vs\n {}".format(set1, set2)

        for pub_ref in g.objects(dataset_ref, DCT.publisher):
            _pub_names = list(g.objects(pub_ref, FOAF.name))

            assert len(_pub_names) 

            for pub_name in _pub_names:
                if pub_name.language:
                    assert str(pub_name.language) in pub_names, "no {} in {}".format(pub_name.language, pub_names)
                    assert pub_names[str(pub_name.language)] == str(pub_name), "{} vs {}".format(pub_name, pub_names)

        for holder_ref in g.objects(dataset_ref, DCT.rightsHolder):
            _holder_names = list(g.objects(holder_ref, FOAF.name))

            assert len(_holder_names) 

            for holder_name in _holder_names:
                if holder_name.language:
                    assert str(holder_name.language) in holder_names, "no {} in {}".format(holder_name.language, holder_names)
                    assert holder_names[str(holder_name.language)] == str(holder_name), "{} vs {}".format(holder_name, holder_names)
Example #16
    def test_graph_from_dataset(self):

        src_conforms_to = [
            {
                'identifier': 'CONF1',
                'uri': 'conf01',
                'title': {
                    'en': 'title1EN',
                    'it': 'title1IT'
                },
                'referenceDocumentation': ['http://abc.efg/'],
            },
            {
                'identifier': 'CONF2',
                'title': {
                    'en': 'title2EN',
                    'it': 'title2IT'
                },
                'description': {
                    'en': 'desc2EN',
                    'it': 'desc2IT'
                },
                'referenceDocumentation': ['http://abc.efg/'],
            },
        ]

        src_alt_identifiers = [{
            'identifier': 'aaaabc',
            'agent': {
                'agent_identifier': 'agent01',
                'agent_name': {
                    'en': 'Agent en 01',
                    'it': 'Agent it 01'
                }
            },
        }, {
            'identifier': 'other identifier',
            'agent': {}
        }]
        src_creators = [
            {
                'creator_name': {
                    'en': 'abcEN',
                    'it': 'abcIT'
                },
                'creator_identifier': 'ABC'
            },
            {
                'creator_name': {
                    'en': 'cde'
                },
                'creator_identifier': 'CDE'
            },
        ]

        src_temporal_coverage = [
            {
                'temporal_start': '2001-01-01',
                'temporal_end': '2001-02-01 10:11:12'
            },
            {
                'temporal_start': '2001-01-01',
                'temporal_end': '2001-02-01 11:12:13'
            },
        ]

        subthemes = [{
            'theme':
            'AGRI',
            'subthemes': [
                'http://eurovoc.europa.eu/100253',
                'http://eurovoc.europa.eu/100258'
            ]
        }, {
            'theme': 'ENVI',
            'subthemes': []
        }]

        pub_it = 'IT publisher'
        holder_it = 'IT holder'

        org = factories.Organization(identifier=uuid.uuid4(),
                                     is_org=True,
                                     name=uuid.uuid4())
        src_dataset = {
            # 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'owner_org': org['id'],
            'name': str(uuid.uuid4()),
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{
                'name': 'Tag 1'
            }, {
                'name': 'Tag 2'
            }],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': str(uuid.uuid4()),
            'temporal_start': '2016-11-01',
            'temporal_end': '2016-11-30',
            'frequency': 'UPDATE_CONT',
            'publisher_name': pub_it,
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': holder_it,
            'holder_identifier': '234234234',
            'alternate_identifier': json.dumps(src_alt_identifiers),
            'temporal_coverage': json.dumps(src_temporal_coverage),
            # 'theme':'ECON',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of':
            'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'conforms_to': json.dumps(src_conforms_to),
            'creator': json.dumps(src_creators),
            FIELD_THEMES_AGGREGATE: json.dumps(subthemes),
            'theme': theme_aggr_to_theme_uris(subthemes),
        }

        src_pub_names = {'it': pub_it, 'en': 'EN publisher'}
        src_holder_names = {'it': holder_it, 'en': 'EN holder name'}

        multilang_fields = [('publisher_name', 'package', k, v) for k, v in src_pub_names.items()] +\
                           [('holder_name', 'package', k, v) for k, v in src_holder_names.items()]

        pkg = helpers.call_action('package_create', {'defer_commit': True},
                                  **src_dataset)
        Session.flush()
        pkg_id = pkg['id']
        src_dataset['id'] = pkg_id

        for field_name, field_type, lang, text in multilang_fields:
            interfaces.upsert_package_multilang(pkg_id, field_name, field_type,
                                                lang, text)

        # loc_dict = interfaces.get_for_package(pkg_id)
        #assert loc_dict['publisher_name'] == pub_names
        #assert loc_dict['holder_name'] == holder_names

        # LEGACY: temporary bug for compatibility with interfaces.get_language(),
        # which will return lang[0]
        # pub_names.update({DEFAULT_LANG: src_dataset['publisher_name']})
        # pub_names.update({DEFAULT_LANG[0]: dataset['publisher_name']})
        # holder_names.update({DEFAULT_LANG: src_dataset['holder_name']})
        # holder_names.update({DEFAULT_LANG[0]: dataset['holder_name']})

        s = RDFSerializer()
        g = s.g

        dataset_graph = s.graph_from_dataset(pkg)

        self.assertEqual(str(dataset_graph),
                         str(utils.dataset_uri(src_dataset)),
                         'Dataset URI mismatch')

        # Basic fields
        self.assertIsNotNone(
            self._triple(g, dataset_graph, RDF.type, DCATAPIT.Dataset))
        self.assertIsNotNone(
            self._triple(g, dataset_graph, DCT.title, src_dataset['title']))
        self.assertIsNotNone(
            self._triple(g, dataset_graph, DCT.description,
                         src_dataset['notes']))

        self.assertIsNotNone(
            self._triple(g, dataset_graph, DCT.identifier,
                         src_dataset['identifier']))

        # Tags
        self.assertEqual(
            2,
            len([t for t in g.triples((dataset_graph, DCAT.keyword, None))]))
        for tag in src_dataset['tags']:
            self.assertIsNotNone(
                self._triple(g, dataset_graph, DCAT.keyword, tag['name']))

        # conformsTo
        conforms_to_nodes = list(g.objects(dataset_graph, DCT.conformsTo))
        self.assertEqual(2, len(conforms_to_nodes))

        src_conforms_dict = {d['identifier']: d for d in src_conforms_to}
        for conf_node in conforms_to_nodes:
            conf_id = str(conf_node)

            identifier = g.value(conf_node, DCT.identifier)
            titles = list(g.objects(conf_node, DCT.title))
            descs = list(g.objects(conf_node, DCT.description))
            references = list(
                g.objects(conf_node, DCATAPIT.referenceDocumentation))

            src_conforms = src_conforms_dict.get(str(identifier))

            assert isinstance(src_conforms, dict)

            if src_conforms.get('uri'):
                assert src_conforms['uri'] == str(conf_node)
            assert len(titles), 'missing titles'

            assert (len(descs) > 0) == bool(
                src_conforms.get('description')), 'missing descriptions'

            titles_dict = {title.language: str(title) for title in titles}
            # loop over the source items because the graph info may have been augmented
            for lang, src_value in src_conforms['title'].items():
                self.assertEqual(src_value, titles_dict[lang],
                                 f'Titles do not match for lang:{lang}')

            descr_dict = {descr.language: str(descr) for descr in descs}
            # loop over the source items because the graph info may have been augmented
            for lang, src_value in src_conforms.get('description', {}).items():
                self.assertEqual(src_value, descr_dict[lang],
                                 f'Descriptions do not match for lang:{lang}')

            ref_docs = src_conforms.get('referenceDocumentation')
            assert len(references) == len(
                ref_docs), 'missing reference documentation'

            for dref in references:
                assert str(dref) in ref_docs, '{} not in {}'.format(
                    dref, ref_docs)

            for ref in ref_docs:
                assert URIRef(ref) in references

        # alternate identifiers
        alt_ids = [a[-1] for a in g.triples((None, ADMS.identifier, None))]
        alt_ids_dict = dict((a['identifier'], a) for a in src_alt_identifiers)

        for alt_id in alt_ids:
            identifier = g.value(alt_id, SKOS.notation)
            src_alt = alt_ids_dict[str(identifier)]
            assert str(identifier) == src_alt['identifier']
            if src_alt.get('agent'):
                agent_ref = g.value(alt_id, DCT.creator)
                assert agent_ref is not None

                agent_name = {
                    v.language: str(v)
                    for v in g.objects(agent_ref, FOAF.name)
                }

                for a in set(src_alt['agent']['agent_name'].items()):
                    self.assertIn(a, set(agent_name.items()),
                                  'Agent name not found')

                self.assertEqual(src_alt['agent']['agent_identifier'],
                                 str(g.value(agent_ref, DCT.identifier)),
                                 'Agent identifier mismatch')
        # creators
        creators_in = list(g.objects(dataset_graph, DCT.creator))
        assert len(src_creators) == len(creators_in)

        for cref in creators_in:
            c_identifier = str(g.value(cref, DCT.identifier))
            cnames = dict(
                (str(c.language) if c.language else DEFAULT_LANG, str(c))
                for c in g.objects(cref, FOAF.name))
            src_creator = [
                x for x in src_creators
                if x['creator_identifier'] == c_identifier
            ]
            self.assertEqual(1, len(src_creator))
            for lang, name in src_creator[0]['creator_name'].items():
                self.assertEqual(name, cnames[lang])

            # c_dict = {'creator_name': cnames,
            #           'creator_identifier': str(c_identifier)}
            # assert c_dict in src_creators, 'no {} in {}'.format(c_dict, src_creators)

        # temporal coverage
        temp_exts = list(g.triples((dataset_graph, DCT.temporal, None)))
        assert len(temp_exts) == len(src_temporal_coverage)

        # normalize values
        for item in src_temporal_coverage:
            for k, v in item.items():
                item[k] = pdate(v)

        temp_ext = []
        for interval_t in temp_exts:
            interval = interval_t[-1]
            start = g.value(interval, SCHEMA.startDate)
            end = g.value(interval, SCHEMA.endDate)
            assert start is not None
            assert end is not None
            temp_ext.append({
                'temporal_start': pdate(str(start)),
                'temporal_end': pdate(str(end))
            })

        set1 = set([tuple(d.items()) for d in temp_ext])
        set2 = set([tuple(d.items()) for d in src_temporal_coverage])
        assert set1 == set2, 'Got different temporal coverage sets: \n{}\n vs\n {}'.format(
            set1, set2)

        for pub_ref in g.objects(dataset_graph, DCT.publisher):
            _pub_names = list(g.objects(pub_ref, FOAF.name))

            assert len(_pub_names)

            for pub_name in _pub_names:
                if pub_name.language:
                    self.assertIn(
                        str(pub_name.language), src_pub_names.keys(),
                        f'Missing publisher lang:{pub_name.language}')
                    self.assertEqual(
                        src_pub_names[str(pub_name.language)], str(pub_name),
                        f'Mismatching publisher name lang:{pub_name.language}')
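The assertions in this test go through a `_triple` lookup helper that is not shown here. A plausible sketch of such a helper, assuming rdflib types (the suite's actual implementation may differ):

from rdflib.term import BNode, Literal, URIRef

def _triple(self, graph, subject, predicate, _object, data_type=None):
    # Wrap plain values into Literals so they can be matched in the graph
    if not isinstance(_object, (URIRef, BNode)) and _object is not None:
        _object = Literal(_object, datatype=data_type)
    # Return the first matching triple, or None when no match exists,
    # so callers can use assertIsNotNone() / plain assert
    triples = list(graph.triples((subject, predicate, _object)))
    return triples[0] if triples else None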
Example #17
0
    def graph_from_record(self, record_dict, resource, record_ref):
        """
        RDF for an individual record - currently this is a specimen record

        Similar approach to: curl -L -H "Accept: application/rdf+ttl" http://data.rbge.org.uk/herb/E00321910

        :param record_dict:
        :param resource:
        :param record_ref:
        :return:
        """
        context = self.get_context()
        namespaces = {
            'dc': DC,
            'dcat': DCAT,
            'dwc': DWC,
            'sdwc': SDWC,
            'void': VOID,
            'cc': CC,
            'foaf': FOAF,
            'dqv': DQV,
            'aiiso': AIISO,
            'tdwgi': TDWGI,
            'owl': OWL
        }

        g = self.g

        # Add some more namespaces
        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Get the GBIF record if it exists
        occurrence_id = record_dict.get('occurrenceID')

        package_id = resource.get_package_id()

        # URI for the specimen object itself, as distinct from the metadata record
        object_uri = URIRef(record_ref + '#object')

        # Add publisher - as per BBC we don't need the full org description here
        nhm_uri = URIRef('http://nhm.ac.uk')

        # Add object description - the metadata and license
        g.add((record_ref, RDF.type, FOAF.Document))
        g.add((record_ref, CC.license, URIRef(METADATA_LICENCE)))
        # This metadata describes #dataset
        g.add((record_ref, FOAF.primaryTopic, object_uri))
        # Add the de-referenced link to record
        record_link = url_for('record', action='view', package_name=package_id, resource_id=resource.id, record_id=record_dict['_id'], qualified=True)
        g.add((record_ref, DC.hasVersion, URIRef(record_link)))
        # Add institution properties
        g.add((record_ref, FOAF.organization, nhm_uri))
        g.add((record_ref, AIISO.Department, Literal(get_department(record_dict['collectionCode']))))

        try:
            sub_dept = record_dict.pop('subDepartment')
        except KeyError:
            pass
        else:
            g.add((record_ref, AIISO.Division, Literal(sub_dept)))

        # Created and modified belong to the metadata record, not the specimen
        for term in ['created', 'modified']:
            try:
                # use indexing (not .get) so a missing term actually raises KeyError
                value = record_dict[term]
            except KeyError:
                pass
            else:
                # Parse into a date, and add as an xsd:dateTime literal
                _date = parse_date(value)
                g.add((record_ref, getattr(DWC, term), Literal(_date.isoformat(), datatype=XSD.dateTime)))

        try:
            gbif_record = toolkit.get_action('gbif_record_show')(context, {
                'occurrence_id': occurrence_id
            })
        except NotFound:
            gbif_record = {}
        else:
            # Assert equivalence with the GBIF record
            gbif_uri = os.path.join('http://www.gbif.org/occurrence', gbif_record['gbifID'])
            g.add((object_uri, OWL.sameAs, URIRef(gbif_uri)))
            # If we have a GBIF country code, add it
            # Annoyingly, this seems to be the only geographic element on GBIF with URI
            country_code = gbif_record.get('gbifCountryCode')
            if country_code:
                g.add((object_uri, DWC.countryCode, URIRef(os.path.join('http://www.gbif.org/country', country_code))))

        # Now, create the specimen object
        # Remove nulls and hidden fields from record_dict
        record_dict = {k: v for k, v in record_dict.items() if v}

        # Now add the actual specimen object
        g.add((object_uri, RDF.type, FOAF.Document))
        g.add((object_uri, RDF.type, SDWC.SimpleDarwinRecordSet))

        # Make sure decimal latitude and longitude are strings
        for d in ['decimalLatitude', 'decimalLongitude']:
            try:
                record_dict[d] = str(record_dict[d])
            except KeyError:
                pass

        # Adding images as JSON is rubbish! So let's try and do it properly
        try:
            associated_media = record_dict.pop('associatedMedia')
        except KeyError:
            pass
        else:
            images = json.loads(associated_media)
            for image in images:
                image_uri = URIRef(image['identifier'])
                g.set((image_uri, RDF.type, FOAF.Image))
                title = image.get('title', None)
                if title:
                    g.set((image_uri, DC.title, Literal(title)))
                g.set((image_uri, CC.license, URIRef(image['license'])))
                g.set((image_uri, DC.RightsStatement, Literal(image['rightsHolder'])))
                g.set((image_uri, DC.Format, Literal(image['format'])))
                # Add link from image to object...
                g.set((image_uri, FOAF.depicts, object_uri))
                # And object to image
                g.add((object_uri, FOAF.depiction, image_uri))

        # This record belongs in X dataset
        dataset_ref = URIRef(dataset_uri({'id': package_id}) + '#dataset')
        g.add((object_uri, VOID.inDataset, dataset_ref))

        dwc_terms_dict = dwc_terms(record_dict.keys())

        # Handle dynamic properties separately
        dynamic_properties = dwc_terms_dict.pop('dynamicProperties')

        for group, terms in dwc_terms_dict.items():
            for uri, term in terms.items():
                # Do we have a GBIF key value?
                # Uppercase first letter of term, and convert to GBIF key format => gbifGenusKey
                uc_term = term[0].upper() + term[1:]
                gbif_term_key = 'gbif%sKey' % uc_term
                gbif_key = gbif_record.get(gbif_term_key)

                # Do we have a GBIF key value? If we do, we can provide a URI to GBIF
                if gbif_key:
                    gbif_uri = URIRef(os.path.join('http://www.gbif.org/species', gbif_key))
                    # Add the GBIF species URI with label
                    g.add((gbif_uri, RDFS.label, Literal(record_dict.get(term))))
                    # And associate our specimen object's DWC term with the GBIF URI
                    g.add((object_uri, getattr(DWC, term), gbif_uri))
                else:
                    # We do not have a GBIF key, so no URI: Add the term value as a literal
                    g.add((object_uri, getattr(DWC, term), Literal(record_dict.get(term))))

        g.add((object_uri, DC.identifier, Literal(record_dict.get('uuid'))))

        dynamic_properties_dict = {}
        for properties in dynamic_properties.values():
            for prop in properties:
                dynamic_properties_dict[prop] = record_dict.get(prop)
        if dynamic_properties_dict:
            g.add((object_uri, DWC.dynamicProperties, Literal(json.dumps(dynamic_properties_dict))))
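The image block above mixes `g.set` and `g.add`; the difference matters when the same image URI appears on several records. A small illustration of the two, using plain rdflib (the image URI here is made up):

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import DC

g = Graph()
image_uri = URIRef('http://example.org/image/1')  # hypothetical URI

# add() accumulates triples; set() first removes any existing (s, p, *) triple
g.add((image_uri, DC.title, Literal('first title')))
g.set((image_uri, DC.title, Literal('second title')))
assert len(list(g.objects(image_uri, DC.title))) == 1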
Example #18
0
    def test_graph_from_dataset(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'notes': 'Lorem ipsum',
            'url': 'http://example.com/ds1',
            'version': '1.0b',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'extras': [
                {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'},
                {'key': 'identifier', 'value': '26be5452-fc5c-11e7-8450-fea9aa178066'},
                {'key': 'version_notes', 'value': 'This is a beta version'},
                {'key': 'frequency', 'value': 'monthly'},
                {'key': 'language', 'value': '[\"en\"]'},
                {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'},
                {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'},
                {'key': 'access_rights', 'value': 'public'},
                {'key': 'documentation', 'value': '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'},
                {'key': 'provenance', 'value': 'Some statement about provenance'},
                {'key': 'dcat_type', 'value': 'test-type'},
                {'key': 'related_resource', 'value': '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'},
                {'key': 'has_version', 'value': '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'},
                {'key': 'is_version_of', 'value': '[\"https://data.some.org/catalog/datasets/original-dataset\"]'},
                {'key': 'source', 'value': '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'},
                {'key': 'sample', 'value': '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'},
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer(profiles=['schemaorg'])
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(str(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, SCHEMA.Dataset)
        assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title'])
        assert self._triple(g, dataset_ref, SCHEMA.description, dataset['notes'])
        assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version'])
        assert self._triple(g, dataset_ref, SCHEMA.identifier, extras['identifier'])

        # Dates
        assert self._triple(g, dataset_ref, SCHEMA.datePublished, dataset['metadata_created'])
        assert self._triple(g, dataset_ref, SCHEMA.dateModified, dataset['metadata_modified'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, SCHEMA.keywords, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, SCHEMA.keywords, tag['name'])

        # List
        for item in [
            ('language', SCHEMA.inLanguage, Literal),
            ('theme', SCHEMA.about, URIRef),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], item[2](value))
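These tests read dataset extras through an `_extras` helper. A plausible sketch of it, assuming it simply flattens CKAN's extras list into a dict (the real helper may differ):

def _extras(self, dataset):
    # Flatten CKAN's list-of-dicts extras into a {key: value} mapping
    extras = {}
    for extra in dataset.get('extras', []):
        extras[extra['key']] = extra['value']
    return extras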
Example #19
0
    def test_graph_from_dataset(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'notes': 'Lorem ipsum',
            'url': 'http://example.com/ds1',
            'version': '1.0b',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'license_title': 'CC-BY 3.0',
            'license_url': 'http://creativecommons.org/licenses/by/3.0/',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'extras': [
                {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'},
                {'key': 'identifier', 'value': '26be5452-fc5c-11e7-8450-fea9aa178066'},
                {'key': 'version_notes', 'value': 'This is a beta version'},
                {'key': 'frequency', 'value': 'monthly'},
                {'key': 'language', 'value': '[\"en\"]'},
                {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'},
                {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'},
                {'key': 'access_rights', 'value': 'public'},
                {'key': 'documentation', 'value': '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'},
                {'key': 'provenance', 'value': 'Some statement about provenance'},
                {'key': 'dcat_type', 'value': 'test-type'},
                {'key': 'related_resource', 'value': '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'},
                {'key': 'has_version', 'value': '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'},
                {'key': 'is_version_of', 'value': '[\"https://data.some.org/catalog/datasets/original-dataset\"]'},
                {'key': 'source', 'value': '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'},
                {'key': 'sample', 'value': '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'},
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer(profiles=['schemaorg'])
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(str(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, SCHEMA.Dataset)
        assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title'])
        assert self._triple(g, dataset_ref, SCHEMA.description, dataset['notes'])
        assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version'])
        assert self._triple(g, dataset_ref, SCHEMA.license, dataset['license_url'])
        assert self._triple(g, dataset_ref, SCHEMA.identifier, extras['identifier'])
        url = self._triple(g, dataset_ref, SCHEMA.url, None)[2]
        assert url
        eq_(url, Literal('http://test.ckan.net/dataset/%s' % dataset['name']))

        # Dates
        assert self._triple(g, dataset_ref, SCHEMA.datePublished, dataset['metadata_created'])
        assert self._triple(g, dataset_ref, SCHEMA.dateModified, dataset['metadata_modified'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, SCHEMA.keywords, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, SCHEMA.keywords, tag['name'])

        # List
        for item in [
            ('language', SCHEMA.inLanguage, Literal),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], item[2](value))
Example #20
0
    def test_graph_from_dataset(self):

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Test DCAT dataset',
            'notes': 'Lorem ipsum',
            'url': 'http://example.com/ds1',
            'version': '1.0b',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'extras': [
                {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'},
                {'key': 'version_notes', 'value': 'This is a beta version'},
                {'key': 'frequency', 'value': 'monthly'},
                {'key': 'language', 'value': '[\"en\"]'},
                {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'},
                {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'},
                {'key': 'access_rights', 'value': 'public'},
                {'key': 'documentation', 'value': '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'},
                {'key': 'provenance', 'value': 'Some statement about provenance'},
                {'key': 'dcat_type', 'value': 'test-type'},
                {'key': 'related_resource', 'value': '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'},
                {'key': 'has_version', 'value': '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'},
                {'key': 'is_version_of', 'value': '[\"https://data.some.org/catalog/datasets/original-dataset\"]'},
                {'key': 'source', 'value': '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'},
                {'key': 'sample', 'value': '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'},
            ]
        }
        extras = self._extras(dataset)

        s = RDFSerializer()
        g = s.g

        dataset_ref = s.graph_from_dataset(dataset)

        eq_(str(dataset_ref), utils.dataset_uri(dataset))

        # Basic fields
        assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset)
        assert self._triple(g, dataset_ref, DCT.title, dataset['title'])
        assert self._triple(g, dataset_ref, DCT.description, dataset['notes'])

        assert self._triple(g, dataset_ref, OWL.versionInfo, dataset['version'])
        assert self._triple(g, dataset_ref, ADMS.versionNotes, extras['version_notes'])
        assert self._triple(g, dataset_ref, DCT.accrualPeriodicity, extras['frequency'])
        assert self._triple(g, dataset_ref, DCT.accessRights, extras['access_rights'])
        assert self._triple(g, dataset_ref, DCT.provenance, extras['provenance'])
        assert self._triple(g, dataset_ref, DCT.type, extras['dcat_type'])

        # Tags
        eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2)
        for tag in dataset['tags']:
            assert self._triple(g, dataset_ref, DCAT.keyword, tag['name'])

        # Dates
        assert self._triple(g, dataset_ref, DCT.issued, dataset['metadata_created'], XSD.dateTime)
        assert self._triple(g, dataset_ref, DCT.modified, dataset['metadata_modified'], XSD.dateTime)

        # List
        for item in [
            ('language', DCT.language, Literal),
            ('theme', DCAT.theme, URIRef),
            ('conforms_to', DCT.conformsTo, Literal),
            ('alternate_identifier', ADMS.identifier, Literal),
            ('documentation', FOAF.page, Literal),
            ('related_resource', DCT.relation, Literal),
            ('has_version', DCT.hasVersion, Literal),
            ('is_version_of', DCT.isVersionOf, Literal),
            ('source', DCT.source, Literal),
            ('sample', ADMS.sample, Literal),
        ]:
            values = json.loads(extras[item[0]])
            eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
            for value in values:
                assert self._triple(g, dataset_ref, item[1], item[2](value))
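The date assertions above pass an explicit `XSD.dateTime` datatype because, in rdflib, a typed literal only matches a pattern with the same datatype. A small self-contained example (the dataset URI is hypothetical):

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import DCTERMS, XSD

g = Graph()
ds = URIRef('http://example.org/dataset/test')
g.add((ds, DCTERMS.issued,
       Literal('2015-06-26T15:21:09.034694', datatype=XSD.dateTime)))

# The typed literal matches; the plain (untyped) literal does not
assert (ds, DCTERMS.issued,
        Literal('2015-06-26T15:21:09.034694', datatype=XSD.dateTime)) in g
assert (ds, DCTERMS.issued,
        Literal('2015-06-26T15:21:09.034694')) not in g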
Example #21
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        title = dataset_dict.get('title')

        g = self.g

        for prefix, namespace in it_namespaces.items():
            g.bind(prefix, namespace)

        ### add a further type for the Dataset node
        g.add((dataset_ref, RDF.type, DCATAPIT.Dataset))

        ### replace themes
        value = self._get_dict_value(dataset_dict, 'theme')
        if value:
            for theme in value.split(','):
                self.g.remove((dataset_ref, DCAT.theme, URIRef(theme)))
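                # multi-valued fields may be stored as '{A,B}' (cf. the
                # '{DEU,ENG,ITA}' language value in the test data above),
                # so strip the braces before building the theme URI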
                theme = theme.replace('{', '').replace('}', '')
                self.g.add(
                    (dataset_ref, DCAT.theme, URIRef(THEME_BASE_URI + theme)))
                self._add_concept(THEME_CONCEPTS, theme)
        else:
            self.g.add((dataset_ref, DCAT.theme,
                        URIRef(THEME_BASE_URI + DEFAULT_THEME_KEY)))
            self._add_concept(THEME_CONCEPTS, DEFAULT_THEME_KEY)

        ### replace languages
        value = self._get_dict_value(dataset_dict, 'language')
        if value:
            for lang in value.split(','):
                self.g.remove((dataset_ref, DCT.language, Literal(lang)))
                lang = lang.replace('{', '').replace('}', '')
                self.g.add(
                    (dataset_ref, DCT.language, URIRef(LANG_BASE_URI + lang)))
                # self._add_concept(LANG_CONCEPTS, lang)

        ### add spatial (EU URI)
        value = self._get_dict_value(dataset_dict, 'geographical_name')
        if value:
            for gname in value.split(','):
                gname = gname.replace('{', '').replace('}', '')

                dct_location = BNode()
                self.g.add((dataset_ref, DCT.spatial, dct_location))

                self.g.add((dct_location, RDF['type'], DCT.Location))

                # Try and add a Concept from the spatial vocabulary
                if self._add_concept(GEO_CONCEPTS, gname):
                    self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                                Literal(GEO_BASE_URI + gname)))

                    # the geo concept is not strictly required, but it may be a useful addition
                    self.g.add((dct_location, LOCN.geographicalName,
                                URIRef(GEO_BASE_URI + gname)))
                else:
                    # The dataset field is not a controlled tag, let's create a Concept out of the label we have
                    concept = BNode()
                    self.g.add((concept, RDF['type'], SKOS.Concept))
                    self.g.add((concept, SKOS.prefLabel, Literal(gname)))
                    self.g.add((dct_location, LOCN.geographicalName, concept))

        ### add spatial (GeoNames)
        value = self._get_dict_value(dataset_dict, 'geographical_geonames_url')
        if value:
            dct_location = BNode()
            self.g.add((dataset_ref, DCT.spatial, dct_location))

            self.g.add((dct_location, RDF['type'], DCT.Location))
            self.g.add((dct_location, DCATAPIT.geographicalIdentifier,
                        Literal(value)))

        ### replace periodicity
        self._remove_node(dataset_dict, dataset_ref,
                          ('frequency', DCT.accrualPeriodicity, None, Literal))
        self._add_uri_node(
            dataset_dict, dataset_ref,
            ('frequency', DCT.accrualPeriodicity, DEFAULT_FREQ_CODE, URIRef),
            FREQ_BASE_URI)
        # self._add_concept(FREQ_CONCEPTS, dataset_dict.get('frequency', DEFAULT_VOCABULARY_KEY))

        ### replace landing page
        self._remove_node(dataset_dict, dataset_ref,
                          ('url', DCAT.landingPage, None, URIRef))
        landing_page_uri = None
        if dataset_dict.get('name'):
            landing_page_uri = '{0}/dataset/{1}'.format(
                catalog_uri().rstrip('/'), dataset_dict['name'])
        else:
            landing_page_uri = dataset_uri(
                dataset_dict)  # TODO: preserve original URI if harvested

        self.g.add((dataset_ref, DCAT.landingPage, URIRef(landing_page_uri)))

        ### conformsTo
        self.g.remove((dataset_ref, DCT.conformsTo, None))
        value = self._get_dict_value(dataset_dict, 'conforms_to')
        if value:
            for item in value.split(','):

                standard = BNode()
                self.g.add((dataset_ref, DCT.conformsTo, standard))

                self.g.add((standard, RDF['type'], DCT.Standard))
                self.g.add((standard, RDF['type'], DCATAPIT.Standard))
                self.g.add((standard, DCT.identifier, Literal(item)))

        ### publisher

        # DCAT by default creates this node
        # <dct:publisher>
        #   <foaf:Organization rdf:about="http://10.10.100.75/organization/55535226-f82a-4cf7-903a-3e10afeaa79a">
        #     <foaf:name>orga2_test</foaf:name>
        #   </foaf:Organization>
        # </dct:publisher>

        for s, p, o in g.triples((dataset_ref, DCT.publisher, None)):
            #log.info("Removing publisher %r", o)
            g.remove((s, p, o))

        self._add_agent(dataset_dict, dataset_ref, 'publisher', DCT.publisher)

        ### Rights holder : Agent
        holder_ref = self._add_agent(dataset_dict, dataset_ref, 'holder',
                                     DCT.rightsHolder)

        ### Autore : Agent
        self._add_agent(dataset_dict, dataset_ref, 'creator', DCT.creator)

        ### Point of Contact

        # <dcat:contactPoint rdf:resource="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri"/>

        # <!-- http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri -->
        # <dcatapit:Organization rdf:about="http://dati.gov.it/resource/PuntoContatto/contactPointRegione_r_liguri">
        #    <rdf:type rdf:resource="&vcard;Kind"/>
        #    <rdf:type rdf:resource="&vcard;Organization"/>
        #    <vcard:hasEmail rdf:resource="mailto:[email protected]"/>
        #    <vcard:fn>Regione Liguria - Sportello Cartografico</vcard:fn>
        # </dcatapit:Organization>

        # TODO: preserve original info if harvested

        # retrieve the contactPoint added by the euro serializer
        euro_poc = g.value(subject=dataset_ref,
                           predicate=DCAT.contactPoint,
                           object=None,
                           any=False)

        # euro poc has this format:
        # <dcat:contactPoint>
        #    <vcard:Organization rdf:nodeID="Nfcd06f452bcd41f48f33c45b0c95979e">
        #       <vcard:fn>THE ORGANIZATION NAME</vcard:fn>
        #       <vcard:hasEmail>THE ORGANIZATION EMAIL</vcard:hasEmail>
        #    </vcard:Organization>
        # </dcat:contactPoint>

        if euro_poc:
            g.remove((dataset_ref, DCAT.contactPoint, euro_poc))

        org_id = dataset_dict.get('organization', {}).get('id')

        # get orga info
        org_show = logic.get_action('organization_show')

        try:
            org_dict = org_show({}, {
                'id': org_id,
                'include_datasets': False,
                'include_tags': False,
                'include_users': False,
                'include_groups': False,
                'include_extras': True,
                'include_followers': False
            })
        except Exception as e:
            org_dict = {}