def parse(self, data, _format=None):
    '''
    Parses an RDF graph serialization into the class graph

    It calls the rdflib parse function with the provided data and format.

    Data is a string with the serialized RDF graph (eg RDF/XML, N3
    ... ). By default RDF/XML is expected. The optional parameter _format
    can be used to tell rdflib otherwise.

    It raises a ``RDFParserException`` if there was some error during
    the parsing.

    Returns nothing.
    '''
    _format = url_to_rdflib_format(_format)
    # A request for 'pretty-xml' is parsed with the plain 'xml' parser.
    if _format == 'pretty-xml':
        _format = 'xml'

    try:
        self.g.parse(data=data, format=_format)
    # Apparently there is no single way of catching exceptions from all
    # rdflib parsers at once, so if you use a new one and the parsing
    # exceptions are not caught, add them here.
    # PluginException indicates that an unknown format was passed.
    except (SyntaxError, xml.sax.SAXParseException,
            rdflib.plugin.PluginException, TypeError) as e:
        # `as e` (instead of the Python 2 only `, e`) keeps this valid on
        # both Python 2.6+ and Python 3.
        raise RDFParserException(e)
def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                      _format='xml', pagination_info=None):
    '''
    Returns an RDF serialization of the whole catalog

    `catalog_dict` can contain literal values for the dcat:Catalog class
    like `title`, `homepage`, etc. If not provided these would get default
    values from the CKAN config (eg from `ckan.site_title`).

    If passed a list of CKAN dataset dicts, these will be also serialized
    as part of the catalog, and the number of datasets is added to the
    catalog as a `dct:extent` literal.
    **Note:** There is no hard limit on the number of datasets at this
    level, this should be handled upstream.
    **Note:** Each dataset dict is modified in place: the export helper
    keys (available resource formats, publishers and themes) are set on
    it before it is handed to `graph_from_dataset()`.

    The serialization format can be defined using the `_format` parameter.
    It must be one of the ones supported by RDFLib, defaults to `xml`.
    An empty or `None` value also falls back to `xml`.

    `pagination_info` may be a dict containing keys describing the results
    pagination. See the `_add_pagination_triples()` method for details.

    Returns a string with the serialized catalog
    '''
    catalog_ref = self.graph_from_catalog(catalog_dict)

    if dataset_dicts:
        total = 0
        publishers = {}
        formats = {}
        themes = dhh.dge_harvest_dict_theme_option_label()

        for dataset_dict in dataset_dicts:
            # Add available resource formats in catalog and publishers
            dataset_dict[dhc.EXPORT_AVAILABLE_RESOURCE_FORMATS] = formats
            dataset_dict[dhc.EXPORT_AVAILABLE_PUBLISHERS] = publishers
            dataset_dict[dhc.EXPORT_AVAILABLE_THEMES] = themes

            dataset_ref = self.graph_from_dataset(dataset_dict)

            # Read the lookups back so whatever the dataset dict holds
            # after graph_from_dataset() is carried over to the next
            # dataset (presumably they are updated there - TODO confirm).
            publishers = dataset_dict.get(dhc.EXPORT_AVAILABLE_PUBLISHERS, {})
            formats = dataset_dict.get(
                dhc.EXPORT_AVAILABLE_RESOURCE_FORMATS, {})

            total += 1
            self.g.add((catalog_ref, DCAT.dataset, dataset_ref))

        log.debug("[processors] serialize_catalog Total datasets i=%s", total)
        self.g.add((catalog_ref, DCT.extent,
                    Literal(total, datatype=XSD.nonNegativeInteger)))

    if pagination_info:
        self._add_pagination_triples(pagination_info)

    # Honour the documented default even when the caller explicitly
    # passes None or an empty string.
    if not _format:
        _format = 'xml'
    _format = url_to_rdflib_format(_format)

    output = self.g.serialize(format=_format)
    return output
def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                      _format='xml', pagination_info=None):
    '''
    Serializes the whole catalog as RDF

    `catalog_dict` may carry literal values for the dcat:Catalog class
    (`title`, `homepage`, etc). It is passed as is to
    `graph_from_catalog()`.

    Every dict in `dataset_dicts` (if any) is added to the graph with
    `graph_from_dataset()`. A dataset is linked to the catalog with
    `dcat:dataset` only when `_add_source_catalog()` returns a falsy
    value for it.
    **Note:** There is no hard limit on the number of datasets at this
    level, this should be handled upstream.

    `_format` selects the serialization format. It must be one of the
    ones supported by RDFLib; an empty value is treated as `xml`.

    `pagination_info` may be a dict containing keys describing the results
    pagination. See the `_add_pagination_triples()` method for details.

    Returns a string with the serialized catalog
    '''
    catalog_ref = self.graph_from_catalog(catalog_dict)

    for dataset_dict in dataset_dicts or []:
        dataset_ref = self.graph_from_dataset(dataset_dict)

        source_catalog = self._add_source_catalog(
            catalog_ref, dataset_dict, dataset_ref)
        if source_catalog:
            continue

        self.g.add((catalog_ref, DCAT.dataset, dataset_ref))

    if pagination_info:
        self._add_pagination_triples(pagination_info)

    rdflib_format = url_to_rdflib_format(_format or 'xml')
    return self.g.serialize(format=rdflib_format)
def serialize_dataset(self, dataset_dict, _format='xml'):
    '''
    Serializes a single CKAN dataset dict as RDF

    The dataset is first added to the graph with `graph_from_dataset()`.

    `_format` selects the serialization format. It must be one of the
    ones supported by RDFLib, defaults to `xml`. For `json-ld` the
    serializer is additionally called with `auto_compact=True`.

    Returns a string with the serialized dataset
    '''
    self.graph_from_dataset(dataset_dict)

    rdflib_format = url_to_rdflib_format(_format)

    # Only the JSON-LD serializer gets the extra option.
    if rdflib_format == 'json-ld':
        extra_options = {'auto_compact': True}
    else:
        extra_options = {}

    return self.g.serialize(format=rdflib_format, **extra_options)
def serialize_record(self, record_dict, resource_dict, _format="xml"):
    """
    Serializes a record as RDF

    The record and its resource are first added to the graph with
    `graph_from_record()`.

    `_format` selects the serialization format. It must be one of the
    ones supported by RDFLib, defaults to `xml`. For `json-ld` the
    serializer is additionally called with `auto_compact=True`.

    Returns a string with the serialized record
    """
    self.graph_from_record(record_dict, resource_dict)

    rdflib_format = url_to_rdflib_format(_format)

    # JSON-LD is the only format serialized with an extra option.
    if rdflib_format == "json-ld":
        return self.g.serialize(format=rdflib_format, auto_compact=True)

    return self.g.serialize(format=rdflib_format)
def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                      _format='xml', pagination_info=None):
    '''
    Builds the catalog graph and returns its RDF serialization

    `catalog_dict` can hold literal values for the dcat:Catalog class
    such as `title` or `homepage`. It is forwarded unchanged to
    `graph_from_catalog()`.

    When `dataset_dicts` is provided, each dataset dict is added to the
    graph through `graph_from_dataset()`. If `_add_source_catalog()`
    returns a falsy value for a dataset, that dataset is attached to the
    catalog with a `dcat:dataset` triple.
    **Note:** There is no hard limit on the number of datasets at this
    level, this should be handled upstream.

    The serialization format is given by `_format`, which must be one of
    the ones supported by RDFLib. A missing or empty value means `xml`.

    `pagination_info` may be a dict containing keys describing the results
    pagination. See the `_add_pagination_triples()` method for details.

    Returns a string with the serialized catalog
    '''
    catalog_ref = self.graph_from_catalog(catalog_dict)

    datasets = dataset_dicts if dataset_dicts else ()
    for current in datasets:
        current_ref = self.graph_from_dataset(current)
        linked_elsewhere = self._add_source_catalog(
            catalog_ref, current, current_ref)
        if not linked_elsewhere:
            self.g.add((catalog_ref, DCAT.dataset, current_ref))

    if pagination_info:
        self._add_pagination_triples(pagination_info)

    chosen_format = _format if _format else 'xml'
    chosen_format = url_to_rdflib_format(chosen_format)

    return self.g.serialize(format=chosen_format)
def serialize_record(self, record, resource, output_format=u'xml', version=None):
    '''
    Given a record dict, returns an RDF serialization.

    The triples are produced by a `RecordGraphBuilder` created from the
    record, the resource, the graph namespaces, `output_format` and
    `version`, and are added to the class graph before serializing.

    The serialization format can be defined using the `output_format`
    parameter. It must be one of the ones supported by RDFLib, defaults to
    `xml`. Any keyword arguments registered in `self.serializer_kwargs`
    for the resulting rdflib format are passed on to the serializer.

    Returns a string with the serialized record
    '''
    target_format = url_to_rdflib_format(output_format)

    namespaces = Namespaces(self.g)
    triples = RecordGraphBuilder(record, resource, namespaces, output_format, version)

    # The builder may yield the same triple more than once. Plain add() is
    # used on purpose: duplicates are left for the triplestore to deal
    # with, which it handles fine. set() would de-duplicate eagerly, but it
    # replaces every previously added triple sharing the subject-predicate
    # pair (set(x, y, a) followed by set(x, y, b) keeps only (x, y, b)),
    # which is too aggressive here.
    for triple in triples:
        self.g.add(triple)

    extra_kwargs = self.serializer_kwargs.get(target_format, {})
    return self.g.serialize(format=target_format, **extra_kwargs)