Esempio n. 1
0
    def parse(self, data, _format=None):
        '''
        Parses an RDF graph serialization into the class graph

        It calls the rdflib parse function with the provided data and format.

        Data is a string with the serialized RDF graph (eg RDF/XML, N3
        ... ). By default RDF/XML is expected. The optional parameter _format
        can be used to tell rdflib otherwise.

        It raises a ``RDFParserException`` if there was some error during
        the parsing.

        Returns nothing.
        '''

        _format = url_to_rdflib_format(_format)
        # 'pretty-xml' is mapped to the plain 'xml' format before parsing
        if _format == 'pretty-xml':
            _format = 'xml'

        try:
            self.g.parse(data=data, format=_format)
        # Apparently there is no single way of catching exceptions from all
        # rdflib parsers at once, so if you use a new one and the parsing
        # exceptions are not caught, add them here.
        # PluginException indicates that an unknown format was passed.
        # NOTE: ``except ... as e`` (rather than ``except ..., e``) is valid
        # on both Python 2.6+ and Python 3.
        except (SyntaxError, xml.sax.SAXParseException,
                rdflib.plugin.PluginException, TypeError) as e:

            raise RDFParserException(e)
Esempio n. 2
0
    def serialize_catalog(self,
                          catalog_dict=None,
                          dataset_dicts=None,
                          _format='xml',
                          pagination_info=None):
        '''
        Serializes the whole catalog as RDF and returns the result

        `catalog_dict` is handed to `graph_from_catalog()` to build the
        catalog node of the graph.

        `dataset_dicts`, when provided and not empty, is iterated and every
        dataset dict is added to the graph and linked to the catalog with
        `dcat:dataset`. Before each dataset is processed, the resource
        formats, publishers and themes collected so far are stored in the
        dataset dict (so the passed dicts are modified); the formats and
        publishers are read back afterwards and carried over to the next
        dataset. The number of processed datasets is added to the catalog
        as `dct:extent`.
        **Note:** no limit on the number of datasets is enforced here.

        `_format` is the serialization format, translated with
        `url_to_rdflib_format()` before being given to RDFLib. Defaults to
        `xml`.

        `pagination_info`, when provided and not empty, is forwarded to
        `_add_pagination_triples()`.

        Returns the output of the graph `serialize()` call
        '''

        catalog_ref = self.graph_from_catalog(catalog_dict)

        if dataset_dicts:
            total = 0
            available_publishers = {}
            available_formats = {}
            available_themes = dhh.dge_harvest_dict_theme_option_label()

            for total, dataset_dict in enumerate(dataset_dicts, 1):
                # Expose what has been collected so far to the dataset
                # graph builder through the dataset dict itself
                dataset_dict[dhc.EXPORT_AVAILABLE_RESOURCE_FORMATS] = \
                    available_formats
                dataset_dict[dhc.EXPORT_AVAILABLE_PUBLISHERS] = \
                    available_publishers
                dataset_dict[dhc.EXPORT_AVAILABLE_THEMES] = available_themes

                dataset_ref = self.graph_from_dataset(dataset_dict)

                # Pick up whatever is stored under those keys now, so it is
                # carried over to the next dataset
                available_publishers = dataset_dict.get(
                    dhc.EXPORT_AVAILABLE_PUBLISHERS, {})
                available_formats = dataset_dict.get(
                    dhc.EXPORT_AVAILABLE_RESOURCE_FORMATS, {})

                self.g.add((catalog_ref, DCAT.dataset, dataset_ref))

            log.debug("[processors] serialize_catalog Total datasets i=%s",
                      total)
            extent = Literal(total, datatype=XSD.nonNegativeInteger)
            self.g.add((catalog_ref, DCT.extent, extent))

        if pagination_info:
            self._add_pagination_triples(pagination_info)

        rdflib_format = url_to_rdflib_format(_format)
        return self.g.serialize(format=rdflib_format)
Esempio n. 3
0
    def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                          _format='xml', pagination_info=None):
        '''
        Serializes the whole catalog as RDF and returns the result

        `catalog_dict` is handed to `graph_from_catalog()` to build the
        catalog node of the graph.

        `dataset_dicts`, when provided and not empty, is iterated and every
        dataset dict is added to the graph. Each dataset is offered to
        `_add_source_catalog()`; only when that call returns a falsy value
        is the dataset linked directly to the catalog with `dcat:dataset`.
        **Note:** no limit on the number of datasets is enforced here.

        `_format` is the serialization format, translated with
        `url_to_rdflib_format()` before being given to RDFLib. An empty
        value falls back to `xml`, which is also the default.

        `pagination_info`, when provided and not empty, is forwarded to
        `_add_pagination_triples()`.

        Returns the output of the graph `serialize()` call
        '''

        catalog_ref = self.graph_from_catalog(catalog_dict)

        for dataset_dict in dataset_dicts or ():
            dataset_ref = self.graph_from_dataset(dataset_dict)

            source_catalog_ref = self._add_source_catalog(
                catalog_ref, dataset_dict, dataset_ref)
            if source_catalog_ref:
                # Skip the direct link to the main catalog for this dataset
                continue

            self.g.add((catalog_ref, DCAT.dataset, dataset_ref))

        if pagination_info:
            self._add_pagination_triples(pagination_info)

        rdflib_format = url_to_rdflib_format(_format or 'xml')
        return self.g.serialize(format=rdflib_format)
Esempio n. 4
0
    def serialize_dataset(self, dataset_dict, _format='xml'):
        '''
        Adds a CKAN dataset dict to the graph and returns its serialization

        `_format` is the serialization format, translated with
        `url_to_rdflib_format()` before being given to RDFLib. Defaults to
        `xml`. For `json-ld` the serializer is additionally called with
        `auto_compact=True`.

        Returns the output of the graph `serialize()` call
        '''

        self.graph_from_dataset(dataset_dict)

        rdflib_format = url_to_rdflib_format(_format)

        serializer_options = {'format': rdflib_format}
        if rdflib_format == 'json-ld':
            serializer_options['auto_compact'] = True

        return self.g.serialize(**serializer_options)
    def serialize_record(self, record_dict, resource_dict, _format="xml"):
        """
        Adds a record to the graph and returns its serialization

        `record_dict` and `resource_dict` are passed as they are to
        `graph_from_record()`, which is expected to populate the graph.

        `_format` is the serialization format, translated with
        `url_to_rdflib_format()` before being given to RDFLib. Defaults to
        `xml`. For `json-ld` the serializer is additionally called with
        `auto_compact=True`.

        Returns the output of the graph `serialize()` call
        """

        self.graph_from_record(record_dict, resource_dict)

        rdflib_format = url_to_rdflib_format(_format)

        # Only the JSON-LD serializer gets the extra option
        extra_options = (
            {"auto_compact": True} if rdflib_format == "json-ld" else {}
        )
        return self.g.serialize(format=rdflib_format, **extra_options)
Esempio n. 6
0
    def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                          _format='xml', pagination_info=None):
        '''
        Builds the catalog graph and returns its RDF serialization

        `catalog_dict` is passed to `graph_from_catalog()` to create the
        catalog node.

        Every dict in `dataset_dicts` (if provided and not empty) is added
        to the graph with `graph_from_dataset()` and then offered to
        `_add_source_catalog()`. When that call returns a falsy value the
        dataset is attached directly to the catalog through `dcat:dataset`.
        **Note:** the number of datasets is not limited at this level.

        `_format` selects the serialization format. It is translated with
        `url_to_rdflib_format()`; an empty value is treated as `xml`, which
        is also the default.

        `pagination_info` (if provided and not empty) is handed to
        `_add_pagination_triples()`.

        Returns the output of the graph `serialize()` call
        '''

        catalog_ref = self.graph_from_catalog(catalog_dict)

        if dataset_dicts:
            for dataset_dict in dataset_dicts:
                dataset_ref = self.graph_from_dataset(dataset_dict)
                linked_to_source = self._add_source_catalog(
                    catalog_ref, dataset_dict, dataset_ref)

                if linked_to_source:
                    # No direct link to the main catalog for this dataset
                    continue

                triple = (catalog_ref, DCAT.dataset, dataset_ref)
                self.g.add(triple)

        if pagination_info:
            self._add_pagination_triples(pagination_info)

        chosen_format = _format if _format else 'xml'
        return self.g.serialize(format=url_to_rdflib_format(chosen_format))
    def serialize_record(self, record, resource, output_format=u'xml', version=None):
        '''
        Adds the triples describing a record to the graph and returns its serialization.

        The triples are produced by a `RecordGraphBuilder` created from the record, the resource,
        the graph namespaces, the requested `output_format` and the `version`.

        The serialization format is chosen with the `output_format` parameter, which is translated
        with `url_to_rdflib_format()` before being given to RDFLib, defaults to `xml`. Any extra
        serializer options registered in `self.serializer_kwargs` for the translated format are
        passed to the serializer too.

        Returns the output of the graph `serialize()` call
        '''
        rdflib_format = url_to_rdflib_format(output_format)
        namespaces = Namespaces(self.g)
        triples = RecordGraphBuilder(record, resource, namespaces, output_format, version)

        # The builder may yield the same triple more than once. Plain add() is used so that the
        # triplestore deals with the duplicates itself, which it does fine. Using set() instead
        # would be too aggressive: it replaces every previously added triple sharing the same
        # subject-predicate pair (i.e. set(x, y, a) followed by set(x, y, b) leaves only
        # (x, y, b) in the triplestore).
        for triple in triples:
            self.g.add(triple)

        serializer_options = self.serializer_kwargs.get(rdflib_format, {})
        return self.g.serialize(format=rdflib_format, **serializer_options)