# file is a local file when we suppose it to be a utf-8 dom = parser.parse(input, encoding=self.charset) else : # No charset set. The HTMLLib parser tries to sniff into the # the file to find a meta header for the charset; if that # works, fine, otherwise it falls back on window-... dom = parser.parse(input) try : if isinstance(name, basestring) : input.close() input = self._get_input(name) else : input.seek(0) from pyRdfa.host import adjust_html_version self.rdfa_version = adjust_html_version(input, self.rdfa_version) except : # if anyting goes wrong, it is not really important; rdfa version stays what it was... pass else : # in other cases an XML parser has to be used from pyRdfa.host import adjust_xhtml_and_version parse = xml.dom.minidom.parse dom = parse(input) (adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version) self.options.host_language = adjusted_host_language self.rdfa_version = version except Exception, e : # These are various parsing exception. Per spec, this is a case when # error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
def graph_from_source(self, name, graph=None, rdfOutput=False, pgraph=None):
    """
    Extract an RDF graph from an RDFa source. The source is opened, parsed into a DOM tree, and the
    DOM tree is handed over to L{pyRdfa.graph_from_DOM}, whose result is returned.

    Error handling happens at three levels:
     - errors while opening the source are re-raised unless C{rdfOutput} is true, in which case they
       are turned into error triples;
     - errors while parsing the source into a DOM tree are always turned into error triples
       (except for an C{ImportError}, which is re-raised with an explanatory message);
     - anything that escapes the steps above (including the re-raised exceptions) is printed through
       C{sys.excepthook} and then re-raised unless C{rdfOutput} is true.
    C{self.http_status} is updated whenever one of these failures occurs.

    @param name: a URI, a file name, or a file-like object
    @param graph: rdflib Graph instance. If None, a new one is created.
    @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
    @param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph
    @return: an RDF Graph
    @rtype: rdflib Graph instance
    """
    def copyErrors(tog, options):
        # Copy the triples and namespace bindings collected in the processor graph into
        # 'tog' (creating it if necessary), then reset the processor graph.
        if tog is None:
            tog = Graph()
        if options.output_processor_graph:
            for t in options.processor_graph.graph:
                tog.add(t)
            for k, ns in options.processor_graph.graph.namespaces():
                tog.bind(k, ns)
        options.reset_processor_graph()
        return tog

    # Separating this for a forward Python 3 compatibility
    try:
        # Python 2 branch
        isstring = isinstance(name, basestring)
    except NameError:
        # Python 3 branch: 'basestring' does not exist there
        isstring = isinstance(name, str)

    try:
        # First, open the source... Possible HTTP errors are returned as error triples
        #
        # NOTE(review): when rdfOutput is false, the exceptions re-raised in the three handlers
        # below are caught again by the outermost handler of this method (assuming they derive
        # from Exception), which overwrites self.http_status with 500 -- confirm that this is intended.
        stream = None
        try:
            stream = self._get_input(name)
        except FailedSource:
            f = sys.exc_info()[1]
            self.http_status = 400
            if not rdfOutput:
                raise f
            err = self.options.add_error(f.msg, FileReferenceError, name)
            self.options.processor_graph.add_http_context(err, 400)
            return copyErrors(graph, self.options)
        except HTTPError:
            h = sys.exc_info()[1]
            self.http_status = h.http_code
            if not rdfOutput:
                raise h
            err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code, h.msg), HTError, name)
            self.options.processor_graph.add_http_context(err, h.http_code)
            return copyErrors(graph, self.options)
        except Exception:
            e = sys.exc_info()[1]
            self.http_status = 500
            # Something nasty happened:-(
            if not rdfOutput:
                raise e
            err = self.options.add_error(str(e), context=name)
            self.options.processor_graph.add_http_context(err, 500)
            return copyErrors(graph, self.options)

        dom = None
        try:
            if self.options.host_language == HostLanguage.html5:
                import warnings
                # Note that this installs an 'ignore' filter for DeprecationWarning globally
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                import html5lib
                parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                if self.charset:
                    # This means the HTTP header has provided a charset, or the
                    # file is a local file when we suppose it to be a utf-8
                    dom = parser.parse(stream, encoding=self.charset)
                else:
                    # No charset set. The HTMLLib parser tries to sniff into the
                    # the file to find a meta header for the charset; if that
                    # works, fine, otherwise it falls back on window-...
                    dom = parser.parse(stream)

                # The parser has consumed the stream; it has to be rewound (or re-opened, if it was
                # opened here) before it can be inspected again for the RDFa version.
                try:
                    if isstring:
                        stream.close()
                        stream = self._get_input(name)
                    else:
                        stream.seek(0)
                    from pyRdfa.host import adjust_html_version
                    self.rdfa_version = adjust_html_version(stream, self.rdfa_version)
                except Exception:
                    # if anything goes wrong, it is not really important; rdfa version stays what it was...
                    pass
            else:
                # in other cases an XML parser has to be used
                from pyRdfa.host import adjust_xhtml_and_version
                parse = xml.dom.minidom.parse
                dom = parse(stream)
                (adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
                self.options.host_language = adjusted_host_language
                self.rdfa_version = version
        except ImportError:
            msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
            raise ImportError(msg)
        except Exception:
            e = sys.exc_info()[1]
            # These are various parsing exception. Per spec, this is a case when
            # error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
            # return page or a graph with error triples) does not apply
            err = self.options.add_error(str(e), context=name)
            self.http_status = 400
            self.options.processor_graph.add_http_context(err, 400)
            return copyErrors(graph, self.options)
        finally:
            # The DOM (if any) has been built, the stream is not needed any more. Only a stream
            # opened by this method (i.e., 'name' was a string) is closed; a file-like object
            # handed over by the caller stays under the caller's control.
            if isstring and stream is not None:
                try:
                    stream.close()
                except Exception:
                    # best-effort cleanup only; a failing close must not mask the real outcome
                    pass

        # If we got here, we have a DOM tree to operate on...
        return self.graph_from_DOM(dom, graph, pgraph)
    except Exception:
        # Something nasty happened during the generation of the graph...
        (a, b, c) = sys.exc_info()
        # Print the traceback even if the exception is going to be turned into triples
        sys.excepthook(a, b, c)
        if isinstance(b, ImportError):
            self.http_status = None
        else:
            self.http_status = 500
        if not rdfOutput:
            raise b
        err = self.options.add_error(str(b), context=name)
        self.options.processor_graph.add_http_context(err, 500)
        return copyErrors(graph, self.options)