class pyRdfa:
    """Main processing class for the distiller

    @ivar options: an instance of the L{Options} class
    @ivar media_type: the preferred default media type, possibly set at initialization
    @ivar base: the base value, possibly set at initialization
    @ivar required_base: the base given at initialization, or None; when None it is
    filled in by L{_get_input} from the URI or file name that is opened
    @ivar charset: character set of the source, set by L{_get_input} (None if unknown)
    @ivar rdfa_version: the RDFa version requested at initialization (or None); it is
    overwritten by L{graph_from_DOM} with the version held by the execution context
    """

    def __init__(self, options=None, base="", media_type="", rdfa_version=None):
        """
        @keyword options: Options for the distiller
        @type options: L{Options}
        @keyword base: URI for the default "base" value (usually the URI of the file to be processed)
        @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
        @keyword rdfa_version: the RDFa version that should be used. If not set, the value of the
        global L{rdfa_current_version} variable is used
        """
        self.base = base
        # An empty base means "not set explicitly"; _get_input may fill it in later.
        if base == "":
            self.required_base = None
        else:
            self.required_base = base
        self.charset = None

        # predefined content type
        self.media_type = media_type

        if options is None:
            self.options = Options()
        else:
            self.options = options

        if media_type != "":
            self.options.set_host_language(self.media_type)

        self.rdfa_version = rdfa_version

    @staticmethod
    def _is_string(value):
        """Tell whether C{value} is a (byte or unicode) string, on Python 2 as well as on Python 3."""
        try:
            # Python 2 branch
            return isinstance(value, basestring)
        except NameError:
            # Python 3 branch: 'basestring' does not exist any more
            return isinstance(value, str)

    @staticmethod
    def _close_quietly(stream):
        """Close C{stream}, ignoring any failure: closing is only a best-effort clean-up."""
        try:
            stream.close()
        except Exception:
            pass

    @staticmethod
    def _copy_graph(tog, fromg):
        """Add all triples and namespace bindings of C{fromg} to C{tog}."""
        for t in fromg:
            tog.add(t)
        for k, ns in fromg.namespaces():
            tog.bind(k, ns)

    def _errors_as_graph(self, graph):
        """
        Copy the processor graph (if the options ask for it) into C{graph}, reset the
        processor graph, and return the target graph.

        @param graph: target graph; if None, a new one is created
        @return: the target graph
        """
        if graph is None:
            graph = Graph()
        if self.options.output_processor_graph:
            self._copy_graph(graph, self.options.processor_graph.graph)
        self.options.reset_processor_graph()
        return graph

    def _get_input(self, name):
        """
        Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this
        source accordingly, returning a file-like object. If name is none of these, it returns the
        input argument (that should be, supposedly, a file-like object already).

        If the media type has not been set explicitly at initialization of this instance,
        the method also sets the media_type based on the HTTP GET response or the suffix of the file.
        See L{utils.preferred_suffixes} for the suffix to media type mapping.

        @param name: identifier of the input source
        @type name: string or a file-like object
        @return: a file like object if opening "name" is possible and successful, "name" otherwise
        @raise FailedSource: if anything goes wrong while opening the source; the original
        exception is handed over to the L{FailedSource} instance
        """
        if not self._is_string(name):
            return name

        try:
            # check if this is a URI, ie, if there is a valid 'scheme' part
            # otherwise it is considered to be a simple file
            if urlparse.urlparse(name)[0] != "":
                url_request = URIOpener(name)
                self.base = url_request.location
                if self.media_type == "":
                    if url_request.content_type in content_to_host_language:
                        self.media_type = url_request.content_type
                    else:
                        self.media_type = MediaTypes.xml
                    self.options.set_host_language(self.media_type)
                self.charset = url_request.charset
                if self.required_base is None:
                    self.required_base = name
                return url_request.data

            self.base = name
            # Creating a File URI for this thing
            if self.required_base is None:
                self.required_base = "file://" + os.path.join(os.getcwd(), name)
            if self.media_type == "":
                self.media_type = MediaTypes.xml
                # see if the default should be overwritten
                for suffix in preferred_suffixes:
                    if name.endswith(suffix):
                        self.media_type = preferred_suffixes[suffix]
                        self.charset = 'utf-8'
                        break
                self.options.set_host_language(self.media_type)
            from py3compat import PY3
            if PY3:
                return open(name, 'rb')
            else:
                return open(name, 'r')
        except Exception:
            # 'Exception' rather than a bare except: a KeyboardInterrupt or SystemExit
            # must not be disguised as a failing source
            raise FailedSource(sys.exc_info()[1])

    ####################################################################################################################
    # Externally used methods
    #
    def graph_from_DOM(self, dom, graph=None, pgraph=None):
        """
        Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other
        methods get down to this one, eventually (e.g., after opening a URI and parsing it into a DOM).

        @param dom: a DOM object whose C{documentElement} is used as the top level entry
        node to initiate processing
        @keyword graph: an RDF Graph (if None, then a new one is created)
        @type graph: rdflib Graph instance. If None, a new one is created.
        @keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the
        error/warning triples are to be generated, they will be added to the returned graph.
        Otherwise they are stored in this graph.
        @type pgraph: rdflib Graph instance or None
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        if graph is None:
            # Create the RDF Graph, that will contain the return triples...
            graph = Graph()

        # this will collect the content, the 'default graph', as called in the RDFa spec
        default_graph = Graph()

        # get the DOM tree
        topElement = dom.documentElement

        # Perform the built-in and external transformations on the HTML tree.
        for trans in self.options.transformers + builtInTransformers:
            trans(topElement, self.options)

        # Create the initial state. This takes care of things
        # like base, top level namespace settings, etc.
        state = ExecutionContext(topElement, default_graph, base=self.base,
                                 options=self.options, rdfa_version=self.rdfa_version)

        # This may have changed if the state setting detected an explicit version information:
        self.rdfa_version = state.rdfa_version

        # this function is the real workhorse
        parse_one_node(topElement, default_graph, None, state, [])

        # If the RDFS expansion has to be made, here is the place...
        if self.options.vocab_expansion:
            from pyRdfa.rdfs.process import process_rdfa_sem
            process_rdfa_sem(default_graph, self.options)

        # What should be returned depends on the way the options have been set up
        if self.options.output_default_graph:
            self._copy_graph(graph, default_graph)
        if self.options.output_processor_graph:
            # the processor graph goes to its own graph if the caller provided one
            if pgraph is not None:
                self._copy_graph(pgraph, self.options.processor_graph.graph)
            else:
                self._copy_graph(graph, self.options.processor_graph.graph)

        # this is necessary if several DOM trees are handled in a row...
        self.options.reset_processor_graph()

        return graph

    def graph_from_source(self, name, graph=None, rdfOutput=False, pgraph=None):
        """
        Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
        returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

        Exceptions other than L{FailedSource} are reported through C{sys.excepthook} and are
        not re-raised, whatever the value of C{rdfOutput} is; the (possibly empty) error graph
        is returned instead.

        @param name: a URI, a file name, or a file-like object
        @param graph: rdflib Graph instance. If None, a new one is created.
        @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        @param rdfOutput: whether a L{FailedSource} exception should be turned into RDF and returned as part of the processor graph
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        @raise FailedSource: if the source cannot be opened and C{rdfOutput} is False
        """
        try:
            # First, open the source...
            source = self._get_input(name)
            try:
                if self.options.host_language == HostLanguage.html:
                    import warnings
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    import html5lib
                    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                    if self.charset:
                        # This means the HTTP header has provided a charset, or the
                        # file is a local file when we suppose it to be a utf-8
                        dom = parser.parse(source, encoding=self.charset)
                    else:
                        # No charset set. The HTMLLib parser tries to sniff into
                        # the file to find a meta header for the charset; if that
                        # works, fine, otherwise it falls back on window-...
                        dom = parser.parse(source)
                else:
                    # in other cases an XML parser has to be used
                    dom = xml.dom.minidom.parse(source)
            finally:
                # Only a stream opened here (ie, "name" was a URI or a file name) is closed;
                # a file-like object handed over by the caller remains the caller's business.
                if self._is_string(name):
                    self._close_quietly(source)
            return self.graph_from_DOM(dom, graph, pgraph)
        except FailedSource:
            f = sys.exc_info()[1]
            if not rdfOutput:
                raise f
            self.options.add_error(f.msg, FileReferenceError, name)
            return self._errors_as_graph(graph)
        except Exception:
            (exc_type, exc_value, exc_tb) = sys.exc_info()
            sys.excepthook(exc_type, exc_value, exc_tb)
            return self._errors_as_graph(graph)
class pyRdfa:
    """Main processing class for the distiller

    @ivar options: an instance of the L{Options} class
    @ivar media_type: the preferred default media type, possibly set at initialization
    @ivar base: the base value, possibly set at initialization
    @ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200,
    may be modified by exception handlers
    @ivar required_base: the base given at initialization, or None; when None it is
    filled in by L{_get_input} from the URI or file name that is opened
    @ivar charset: character set of the source, set by L{_get_input} (None if unknown)
    @ivar rdfa_version: the RDFa version requested at initialization (or None); it may be
    adjusted while the source is parsed and processed
    """

    def __init__(self, options=None, base="", media_type="", rdfa_version=None):
        """
        @keyword options: Options for the distiller
        @type options: L{Options}
        @keyword base: URI for the default "base" value (usually the URI of the file to be processed)
        @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
        @keyword rdfa_version: the RDFa version that should be used. If not set, the value of the
        global L{rdfa_current_version} variable is used
        """
        self.http_status = 200

        self.base = base
        # An empty base means "not set explicitly"; _get_input may fill it in later.
        if base == "":
            self.required_base = None
        else:
            self.required_base = base
        self.charset = None

        # predefined content type
        self.media_type = media_type

        if options is None:
            self.options = Options()
        else:
            self.options = options

        if media_type != "":
            self.options.set_host_language(self.media_type)

        self.rdfa_version = rdfa_version

    @staticmethod
    def _is_string(value):
        """Tell whether C{value} is a (byte or unicode) string, on Python 2 as well as on Python 3."""
        try:
            # Python 2 branch
            return isinstance(value, basestring)
        except NameError:
            # Python 3 branch: 'basestring' does not exist any more
            return isinstance(value, str)

    @staticmethod
    def _close_quietly(stream):
        """Close C{stream}, ignoring any failure: closing is only a best-effort clean-up."""
        try:
            stream.close()
        except Exception:
            pass

    @staticmethod
    def _copy_graph(tog, fromg):
        """Add all triples and namespace bindings of C{fromg} to C{tog}."""
        for t in fromg:
            tog.add(t)
        for k, ns in fromg.namespaces():
            tog.bind(k, ns)

    def _errors_as_graph(self, graph):
        """
        Copy the processor graph (if the options ask for it) into C{graph}, reset the
        processor graph, and return the target graph.

        @param graph: target graph; if None, a new one is created
        @return: the target graph
        """
        if graph is None:
            graph = Graph()
        if self.options.output_processor_graph:
            self._copy_graph(graph, self.options.processor_graph.graph)
        self.options.reset_processor_graph()
        return graph

    def _get_input(self, name):
        """
        Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this
        source accordingly, returning a file-like object. If name is none of these, it returns the
        input argument (that should be, supposedly, a file-like object already).

        If the media type has not been set explicitly at initialization of this instance,
        the method also sets the media_type based on the HTTP GET response or the suffix of the file.
        See L{host.preferred_suffixes} for the suffix to media type mapping.

        @param name: identifier of the input source
        @type name: string or a file-like object
        @return: a file like object if opening "name" is possible and successful, "name" otherwise
        @raise HTTPError: passed through unchanged if raised while opening the source
        @raise FailedSource: if anything else goes wrong while opening the source; the original
        exception is handed over to the L{FailedSource} instance
        """
        if not self._is_string(name):
            return name

        try:
            # check if this is a URI, ie, if there is a valid 'scheme' part
            # otherwise it is considered to be a simple file
            if urlparse(name)[0] != "":
                url_request = URIOpener(name)
                self.base = url_request.location
                if self.media_type == "":
                    if url_request.content_type in content_to_host_language:
                        self.media_type = url_request.content_type
                    else:
                        self.media_type = MediaTypes.xml
                    self.options.set_host_language(self.media_type)
                self.charset = url_request.charset
                if self.required_base is None:
                    self.required_base = name
                return url_request.data

            self.base = name
            # Creating a File URI for this thing
            if self.required_base is None:
                self.required_base = "file://" + os.path.join(os.getcwd(), name)
            if self.media_type == "":
                self.media_type = MediaTypes.xml
                # see if the default should be overwritten
                for suffix in preferred_suffixes:
                    if name.endswith(suffix):
                        self.media_type = preferred_suffixes[suffix]
                        self.charset = 'utf-8'
                        break
                self.options.set_host_language(self.media_type)
            # 'open' instead of the Python 2 only 'file' builtin. Binary mode, because the
            # parsers do the decoding themselves (an explicit charset may be handed to the
            # HTML5 parser, which makes sense for a byte stream only).
            return open(name, 'rb')
        except HTTPError:
            raise
        except Exception:
            # 'Exception' rather than a bare except: a KeyboardInterrupt or SystemExit
            # must not be disguised as a failing source
            raise FailedSource(sys.exc_info()[1])

    ####################################################################################################################
    # Externally used methods
    #
    def graph_from_DOM(self, dom, graph=None, pgraph=None):
        """
        Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other
        methods get down to this one, eventually (e.g., after opening a URI and parsing it into a DOM).

        @param dom: a DOM Node element, the top level entry node for the whole tree (i.e., the
        C{dom.documentElement} is used to initiate processing down the node hierarchy)
        @keyword graph: an RDF Graph (if None, then a new one is created)
        @type graph: rdflib Graph instance.
        @keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the
        error/warning triples are to be generated, they will be added to the returned graph.
        Otherwise they are stored in this graph.
        @type pgraph: rdflib Graph instance
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        if graph is None:
            # Create the RDF Graph, that will contain the return triples...
            graph = Graph()

        # this will collect the content, the 'default graph', as called in the RDFa spec
        default_graph = Graph()

        # get the DOM tree
        topElement = dom.documentElement

        # Create the initial state. This takes care of things
        # like base, top level namespace settings, etc.
        state = ExecutionContext(topElement, default_graph, base=self.base,
                                 options=self.options, rdfa_version=self.rdfa_version)

        # Perform the built-in and external transformations on the HTML tree.
        for trans in self.options.transformers + builtInTransformers:
            trans(topElement, self.options, state)

        # This may have changed if the state setting detected an explicit version information:
        self.rdfa_version = state.rdfa_version

        # this function is the real workhorse
        parse_one_node(topElement, default_graph, None, state, [])

        # If the RDFS expansion has to be made, here is the place...
        if self.options.vocab_expansion:
            from pyRdfa.rdfs.process import process_rdfa_sem
            process_rdfa_sem(default_graph, self.options)

        # What should be returned depends on the way the options have been set up
        if self.options.output_default_graph:
            self._copy_graph(graph, default_graph)
        if self.options.output_processor_graph:
            # the processor graph goes to its own graph if the caller provided one
            if pgraph is not None:
                self._copy_graph(pgraph, self.options.processor_graph.graph)
            else:
                self._copy_graph(graph, self.options.processor_graph.graph)

        # this is necessary if several DOM trees are handled in a row...
        self.options.reset_processor_graph()

        return graph

    def graph_from_source(self, name, graph=None, rdfOutput=False, pgraph=None):
        """
        Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
        returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

        The C{http_status} attribute is set by the exception handlers (400, the HTTP error
        code, or 500; None in case of an C{ImportError}).

        @param name: a URI, a file name, or a file-like object
        @param graph: rdflib Graph instance. If None, a new one is created.
        @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        @param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        # Separating this for a forward Python 3 compatibility
        isstring = self._is_string(name)

        try:
            # First, open the source... Possible HTTP errors are returned as error triples
            #
            # NOTE(review): an exception re-raised by one of the three handlers below (ie, when
            # rdfOutput is False) goes through the outermost handler, too, which overwrites
            # the http_status set here; confirm whether this is intended.
            try:
                source = self._get_input(name)
            except FailedSource:
                f = sys.exc_info()[1]
                self.http_status = 400
                if not rdfOutput:
                    raise f
                err = self.options.add_error(f.msg, FileReferenceError, name)
                self.options.processor_graph.add_http_context(err, 400)
                return self._errors_as_graph(graph)
            except HTTPError:
                h = sys.exc_info()[1]
                self.http_status = h.http_code
                if not rdfOutput:
                    raise h
                err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code, h.msg), HTError, name)
                self.options.processor_graph.add_http_context(err, h.http_code)
                return self._errors_as_graph(graph)
            except Exception:
                e = sys.exc_info()[1]
                self.http_status = 500
                # Something nasty happened:-(
                if not rdfOutput:
                    raise e
                err = self.options.add_error(str(e), context=name)
                self.options.processor_graph.add_http_context(err, 500)
                return self._errors_as_graph(graph)

            try:
                if self.options.host_language == HostLanguage.html5:
                    import warnings
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    import html5lib
                    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                    if self.charset:
                        # This means the HTTP header has provided a charset, or the
                        # file is a local file when we suppose it to be a utf-8
                        dom = parser.parse(source, encoding=self.charset)
                    else:
                        # No charset set. The HTMLLib parser tries to sniff into
                        # the file to find a meta header for the charset; if that
                        # works, fine, otherwise it falls back on window-...
                        dom = parser.parse(source)

                    # The source is read a second time to adjust the RDFa version
                    try:
                        if isstring:
                            source.close()
                            source = self._get_input(name)
                        else:
                            source.seek(0)
                        from pyRdfa.host import adjust_html_version
                        self.rdfa_version = adjust_html_version(source, self.rdfa_version)
                    except Exception:
                        # if anything goes wrong, it is not really important; rdfa version stays what it was...
                        pass
                else:
                    # in other cases an XML parser has to be used
                    from pyRdfa.host import adjust_xhtml_and_version
                    dom = xml.dom.minidom.parse(source)
                    (adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
                    self.options.host_language = adjusted_host_language
                    self.rdfa_version = version
            except ImportError:
                msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
                raise ImportError(msg)
            except Exception:
                e = sys.exc_info()[1]
                # These are various parsing exception. Per spec, this is a case when
                # error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
                # return page or a graph with error triples) does not apply
                err = self.options.add_error(str(e), context=name)
                self.http_status = 400
                self.options.processor_graph.add_http_context(err, 400)
                return self._errors_as_graph(graph)
            finally:
                # Only a stream opened here (ie, "name" was a URI or a file name) is closed;
                # a file-like object handed over by the caller remains the caller's business.
                if isstring:
                    self._close_quietly(source)

            # If we got here, we have a DOM tree to operate on...
            return self.graph_from_DOM(dom, graph, pgraph)
        except Exception:
            # Something nasty happened during the generation of the graph...
            (exc_type, exc_value, exc_tb) = sys.exc_info()
            sys.excepthook(exc_type, exc_value, exc_tb)
            if isinstance(exc_value, ImportError):
                self.http_status = None
            else:
                self.http_status = 500
            if not rdfOutput:
                raise exc_value
            err = self.options.add_error(str(exc_value), context=name)
            self.options.processor_graph.add_http_context(err, 500)
            return self._errors_as_graph(graph)

    def rdf_from_sources(self, names, outputFormat="turtle", rdfOutput=False):
        """
        Extract an RDF graph from a list of RDFa sources and serialize them in one graph. The sources are parsed, the RDF
        extracted, and serialization is done in the specified format.

        @param names: list of sources, each can be a URI, a file name, or a file-like object
        @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
        @keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
        @type rdfOutput: boolean
        @return: a serialized RDF Graph
        @rtype: string
        """
        # This is better because it gives access to the various, non-standard serializations
        # If it does not work because the extra are not installed, fall back to the standard
        # rdflib distribution...
        try:
            from pyRdfaExtras import MyGraph
            graph = MyGraph()
        except Exception:
            graph = Graph()

        graph.bind("xsd", Namespace('http://www.w3.org/2001/XMLSchema#'))

        # the value of rdfOutput determines the reaction on exceptions...
        for name in names:
            self.graph_from_source(name, graph, rdfOutput)
        return graph.serialize(format=outputFormat)

    def rdf_from_source(self, name, outputFormat="turtle", rdfOutput=False):
        """
        Extract an RDF graph from an RDFa source and serialize it in one graph. The source is parsed, the RDF
        extracted, and serialization is done in the specified format.

        @param name: a URI, a file name, or a file-like object
        @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
        @keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
        @type rdfOutput: boolean
        @return: a serialized RDF Graph
        @rtype: string
        """
        return self.rdf_from_sources([name], outputFormat, rdfOutput)