Esempio n. 1
0
						# file is a local file when we suppose it to be a utf-8
						dom = parser.parse(input, encoding=self.charset)
					else :
						# No charset set. The HTMLLib parser tries to sniff into the
						# the file to find a meta header for the charset; if that
						# works, fine, otherwise it falls back on window-...
						dom = parser.parse(input)
						
					try :
						if isinstance(name, basestring) :
							input.close()
							input = self._get_input(name)
						else :
							input.seek(0)
						from pyRdfa.host import adjust_html_version
						self.rdfa_version = adjust_html_version(input, self.rdfa_version)
					except :
						# if anyting goes wrong, it is not really important; rdfa version stays what it was...
						pass
					
				else :
					# in other cases an XML parser has to be used
					from pyRdfa.host import adjust_xhtml_and_version
					parse = xml.dom.minidom.parse
					dom = parse(input)
					(adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
					self.options.host_language = adjusted_host_language
					self.rdfa_version          = version
			except Exception, e :
				# These are various parsing exception. Per spec, this is a case when
				# error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
Esempio n. 2
0
	def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
		"""
		Extract an RDF graph from an RDFa source. The source is opened through C{self._get_input}, parsed
		into a DOM tree (with html5lib when the host language is HTML5, with C{xml.dom.minidom} otherwise),
		and the tree is handed over to the L{pyRdfa.graph_from_DOM} method, whose result is returned.

		Side effects: C{self.http_status} is set whenever an error is handled here; C{self.rdfa_version}
		and, for non-HTML5 sources, C{self.options.host_language} may be adjusted from the content of the
		source. A stream opened by this method for a string C{name} is closed once parsing is over; a
		file-like object supplied by the caller is left open.

		Error handling:
		 - if the source cannot be opened and C{rdfOutput} is False, the exception is re-raised; if
		   C{rdfOutput} is True, an error triple is recorded and the processor graph is copied into
		   C{graph} (a new Graph if C{graph} is None), which is returned;
		 - DOM parsing errors (other than import errors) are always returned as error triples, regardless
		   of C{rdfOutput};
		 - every exception that leaves the inner handlers is passed to C{sys.excepthook} before being
		   either re-raised or turned into an error triple.

		@param name: a URI, a file name, or a file-like object
		@param graph: rdflib Graph instance. If None, a new one is created.
		@param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		@raise ImportError: if an import fails while setting up the parser and C{rdfOutput} is False
		@raise Exception: any other exception raised while opening the source or generating the graph, if C{rdfOutput} is False
		"""
		def copyErrors(tog, options) :
			# Copy the triples and the namespace bindings of the processor graph into the
			# target graph (created on demand), then reset the processor graph.
			# Identity test on purpose: '==' would be dispatched to the graph's own comparison.
			if tog is None :
				tog = Graph()
			if options.output_processor_graph :
				for t in options.processor_graph.graph :
					tog.add(t)
				for k,ns in options.processor_graph.graph.namespaces() :
					tog.bind(k,ns)
			options.reset_processor_graph()
			return tog

		def closeInput(stream) :
			# Best-effort release of a stream opened by this method; a failure to close
			# must never mask the real outcome of the parsing.
			if stream is None :
				return
			try :
				stream.close()
			except Exception :
				pass

		# Separating this for a forward Python 3 compatibility
		try :
			# Python 2 branch
			isstring = isinstance(name, basestring)
		except NameError :
			# Python 3 branch: 'basestring' does not exist there
			isstring = isinstance(name, str)

		try :
			# First, open the source... Possible HTTP errors are returned as error triples
			input = None
			try :
				input = self._get_input(name)
			except FailedSource :
				f = sys.exc_info()[1]
				self.http_status = 400
				# A bare raise keeps the original traceback
				if not rdfOutput : raise
				err = self.options.add_error(f.msg, FileReferenceError, name)
				self.options.processor_graph.add_http_context(err, 400)
				return copyErrors(graph, self.options)
			except HTTPError :
				h = sys.exc_info()[1]
				self.http_status = h.http_code
				if not rdfOutput : raise
				err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code,h.msg), HTError, name)
				self.options.processor_graph.add_http_context(err, h.http_code)
				return copyErrors(graph, self.options)
			except Exception :
				e = sys.exc_info()[1]
				self.http_status = 500
				# Something nasty happened:-(
				if not rdfOutput : raise
				err = self.options.add_error(str(e), context = name)
				self.options.processor_graph.add_http_context(err, 500)
				return copyErrors(graph, self.options)

			dom = None
			try :
				if self.options.host_language == HostLanguage.html5 :
					import warnings
					# Note that this installs a process-wide filter, it is not limited to this call
					warnings.filterwarnings("ignore", category=DeprecationWarning)
					import html5lib
					parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
					if self.charset :
						# This means the HTTP header has provided a charset, or the
						# file is a local file when we suppose it to be a utf-8
						dom = parser.parse(input, encoding=self.charset)
					else :
						# No charset set. The html5lib parser tries to sniff into the
						# file to find a meta header for the charset; if that does not
						# work, it falls back on its own default (presumably windows-1252)
						dom = parser.parse(input)

					# The parser has consumed the stream: get a fresh one (or rewind the caller's
					# object) to sniff the RDFa version out of the raw content
					try :
						if isstring :
							input.close()
							input = self._get_input(name)
						else :
							input.seek(0)
						from pyRdfa.host import adjust_html_version
						self.rdfa_version = adjust_html_version(input, self.rdfa_version)
					except Exception :
						# if anything goes wrong, it is not really important; rdfa version stays what it was...
						pass

				else :
					# in other cases an XML parser has to be used
					from pyRdfa.host import adjust_xhtml_and_version
					parse = xml.dom.minidom.parse
					dom = parse(input)
					(adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
					self.options.host_language = adjusted_host_language
					self.rdfa_version          = version
			except ImportError :
				msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
				raise ImportError(msg)
			except Exception :
				e = sys.exc_info()[1]
				# These are various parsing exception. Per spec, this is a case when
				# error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
				# return page or a graph with error triples) does not apply
				err = self.options.add_error(str(e), context = name)
				self.http_status = 400
				self.options.processor_graph.add_http_context(err, 400)
				return copyErrors(graph, self.options)
			finally :
				# The DOM tree (if any) is complete at this point, the stream is not needed any more.
				# Only streams opened here (ie, for a string name) are closed; a file-like object
				# handed in by the caller remains the caller's responsibility.
				if isstring :
					closeInput(input)

			# If we got here, we have a DOM tree to operate on...
			return self.graph_from_DOM(dom, graph, pgraph)
		except Exception :
			# Something nasty happened during the generation of the graph...
			(a,b,c) = sys.exc_info()
			sys.excepthook(a,b,c)
			# NOTE(review): this also overwrites the status set by the handlers above when they
			# re-raise (eg, the code of an HTTPError becomes 500) - confirm that this is intended
			if isinstance(b, ImportError) :
				self.http_status = None
			else :
				self.http_status = 500
			if not rdfOutput : raise
			err = self.options.add_error(str(b), context = name)
			self.options.processor_graph.add_http_context(err, 500)
			return copyErrors(graph, self.options)