Esempio n. 1
0
class pyRdfa :
	"""Main processing class for the distiller.

	@ivar options: an instance of the L{Options} class
	@ivar media_type: the preferred default media type, possibly set at initialization
	@ivar base: the base value, possibly set at initialization; overwritten by L{_get_input} when a source is opened by name
	@ivar required_base: the base given at initialization, or C{None} if none was given; filled in by L{_get_input} on first use
	@ivar charset: character set of the source as set by L{_get_input}, C{None} if unknown
	@ivar rdfa_version: the RDFa version to use; C{None} means "let the execution context decide"
	"""
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		@keyword options: Options for the distiller
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
		"""
		self.base = base
		# An empty base means "not set"; this is remembered as None so that
		# _get_input can fill it in from the source name later.
		self.required_base = base if base != "" else None
		self.charset = None

		# predefined content type
		self.media_type = media_type

		self.options = Options() if options is None else options

		if media_type != "" :
			self.options.set_host_language(self.media_type)

		self.rdfa_version = rdfa_version

	@staticmethod
	def _is_string(value) :
		"""Tell whether C{value} is a text string on both Python 2 and Python 3."""
		try :
			# Python 2: covers both str and unicode
			return isinstance(value, basestring)
		except NameError :
			# Python 3: basestring does not exist
			return isinstance(value, str)

	@staticmethod
	def _copy_graph(target, source) :
		"""Add all triples and namespace bindings of C{source} to C{target}."""
		for triple in source :
			target.add(triple)
		for prefix, namespace in source.namespaces() :
			target.bind(prefix, namespace)

	@staticmethod
	def _close_if_owned(stream, name) :
		"""Close C{stream} unless it is the caller's own object (ie, C{name} itself) or C{None}."""
		if stream is None or stream is name :
			return
		close = getattr(stream, "close", None)
		if close is not None :
			close()

	def _error_graph(self, graph) :
		"""
		Return the graph that should be handed back after a failure: C{graph} (or a fresh Graph if it is C{None}),
		extended with the processor graph content when the options ask for it. The processor graph is reset afterwards.
		"""
		if graph is None :
			graph = Graph()
		if self.options.output_processor_graph :
			self._copy_graph(graph, self.options.processor_graph.graph)
		self.options.reset_processor_graph()
		return graph

	def _get_input(self, name) :
		"""
		Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
		returning a file-like object. If name is none of these, it returns the input argument (that should
		be, supposedly, a file-like object already).

		If the media type has not been set explicitly at initialization of this instance,
		the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
		L{utils.preferred_suffixes} for the suffix to media type mapping.

		As a side effect, C{self.base}, C{self.required_base} (if not yet set) and C{self.charset} are updated.

		@param name: identifier of the input source
		@type name: string or a file-like object
		@return: a file like object if opening "name" is possible and successful, "name" otherwise
		@raise FailedSource: if the source cannot be opened; the original exception is passed as its argument
		"""
		if not self._is_string(name) :
			# Not a name at all: supposedly an already opened, file-like object
			return name

		try :
			# check if this is a URI, ie, if there is a valid 'scheme' part
			# otherwise it is considered to be a simple file
			if urlparse.urlparse(name)[0] != "" :
				url_request = URIOpener(name)
				self.base = url_request.location
				if self.media_type == "" :
					if url_request.content_type in content_to_host_language :
						self.media_type = url_request.content_type
					else :
						self.media_type = MediaTypes.xml
					self.options.set_host_language(self.media_type)
				self.charset = url_request.charset
				if self.required_base is None :
					self.required_base = name
				return url_request.data

			self.base = name
			# Creating a File URI for this thing
			if self.required_base is None :
				self.required_base = "file://" + os.path.join(os.getcwd(),name)
			if self.media_type == "" :
				self.media_type = MediaTypes.xml
				# see if the default should be overwritten
				for suffix in preferred_suffixes :
					if name.endswith(suffix) :
						self.media_type = preferred_suffixes[suffix]
						self.charset = 'utf-8'
						break
				self.options.set_host_language(self.media_type)
			# On Python 3 the stream is opened in binary mode; Python 2 keeps
			# the historical text mode.
			if sys.version_info[0] >= 3 :
				return open(name, 'rb')
			else :
				return open(name, 'r')
		except Exception as e :
			# Any failure while locating/opening the source is reported uniformly
			raise FailedSource(e)

	####################################################################################################################
	# Externally used methods
	#
	def graph_from_DOM(self, dom, graph = None, pgraph = None) :
		"""
		Extract the RDF Graph from a DOM tree. This is where the real meat happens. All other methods get down to this
		one, eventually (eg, after opening a URI and parsing it into a DOM)
		@param dom: a DOM Node element, the top level entry node for the whole tree (to make it clear, a dom.documentElement is used to initiate processing)
		@keyword graph: an RDF Graph (if None, then a new one is created)
		@type graph: rdflib Graph instance. If None, a new one is created.
		@keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@type pgraph: rdflib Graph instance or None
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		"""
		if graph is None :
			# Create the RDF Graph, that will contain the return triples...
			graph = Graph()

		# this will collect the content, the 'default graph', as called in the RDFa spec
		default_graph = Graph()

		# get the DOM tree
		topElement = dom.documentElement

		# Perform the built-in and external transformations on the HTML tree.
		for trans in self.options.transformers + builtInTransformers :
			trans(topElement, self.options)

		# Create the initial state. This takes care of things
		# like base, top level namespace settings, etc.
		state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

		# This may have changed if the state setting detected an explicit version information:
		self.rdfa_version = state.rdfa_version

		# this function is the real workhorse
		parse_one_node(topElement, default_graph, None, state, [])

		# If the RDFS expansion has to be made, here is the place...
		if self.options.vocab_expansion :
			from pyRdfa.rdfs.process import process_rdfa_sem
			process_rdfa_sem(default_graph, self.options)

		# What should be returned depends on the way the options have been set up
		if self.options.output_default_graph :
			self._copy_graph(graph, default_graph)
		if self.options.output_processor_graph :
			# the processor graph goes into its own graph if the caller gave one,
			# otherwise it is merged into the returned graph
			target = pgraph if pgraph is not None else graph
			self._copy_graph(target, self.options.processor_graph.graph)

		# this is necessary if several DOM trees are handled in a row...
		self.options.reset_processor_graph()

		return graph

	def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
		"""
		Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
		returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

		A stream opened by this method (ie, when C{name} is a URI or a file name) is closed before returning;
		a file-like object passed in by the caller is left open.

		@param name: a URI, a file name, or a file-like object
		@param graph: rdflib Graph instance. If None, a new one is created.
		@param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@param rdfOutput: whether a failure to open the source should be turned into RDF and returned as part of the processor graph (instead of raising L{FailedSource})
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		@raise FailedSource: if the source cannot be opened and C{rdfOutput} is false
		"""
		source = None
		try :
			# First, open the source...
			source = self._get_input(name)
			if self.options.host_language == HostLanguage.html :
				import warnings
				warnings.filterwarnings("ignore", category=DeprecationWarning)
				import html5lib
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
				if self.charset :
					# This means the HTTP header has provided a charset, or the
					# file is a local file when we suppose it to be a utf-8
					dom = parser.parse(source, encoding=self.charset)
				else :
					# No charset set. The HTMLLib parser tries to sniff into the
					# the file to find a meta header for the charset; if that
					# works, fine, otherwise it falls back on window-...
					dom = parser.parse(source)
			else :
				# in other cases an XML parser has to be used
				dom = xml.dom.minidom.parse(source)
			return self.graph_from_DOM(dom, graph, pgraph)
		except FailedSource as f :
			if not rdfOutput :
				# bare raise keeps the original traceback
				raise
			self.options.add_error(f.msg, FileReferenceError, name)
			return self._error_graph(graph)
		except Exception :
			# Any other problem is reported on the standard exception hook and
			# deliberately not propagated: the caller gets whatever is in the
			# processor graph.
			sys.excepthook(*sys.exc_info())
			return self._error_graph(graph)
		finally :
			self._close_if_owned(source, name)
Esempio n. 2
0
class pyRdfa :
	"""Main processing class for the distiller.

	@ivar options: an instance of the L{Options} class
	@ivar media_type: the preferred default media type, possibly set at initialization
	@ivar base: the base value, possibly set at initialization; overwritten by L{_get_input} when a source is opened by name
	@ivar required_base: the base given at initialization, or C{None} if none was given; filled in by L{_get_input} on first use
	@ivar charset: character set of the source as set by L{_get_input}, C{None} if unknown
	@ivar rdfa_version: the RDFa version to use; may be adjusted while a source is processed
	@ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers
	"""
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		@keyword options: Options for the distiller
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
		"""
		self.http_status = 200

		self.base = base
		# An empty base means "not set"; this is remembered as None so that
		# _get_input can fill it in from the source name later.
		self.required_base = base if base != "" else None
		self.charset = None

		# predefined content type
		self.media_type = media_type

		self.options = Options() if options is None else options

		if media_type != "" :
			self.options.set_host_language(self.media_type)

		self.rdfa_version = rdfa_version

	@staticmethod
	def _is_string(value) :
		"""Tell whether C{value} is a text string on both Python 2 and Python 3."""
		try :
			# Python 2 branch: covers both str and unicode
			return isinstance(value, basestring)
		except NameError :
			# Python 3 branch: basestring does not exist
			return isinstance(value, str)

	@staticmethod
	def _copy_graph(target, source) :
		"""Add all triples and namespace bindings of C{source} to C{target}."""
		for triple in source :
			target.add(triple)
		for prefix, namespace in source.namespaces() :
			target.bind(prefix, namespace)

	@staticmethod
	def _close_if_owned(stream, name) :
		"""Close C{stream} unless it is the caller's own object (ie, C{name} itself) or C{None}."""
		if stream is None or stream is name :
			return
		close = getattr(stream, "close", None)
		if close is not None :
			close()

	def _error_graph(self, graph) :
		"""
		Return the graph that should be handed back after a failure: C{graph} (or a fresh Graph if it is C{None}),
		extended with the processor graph content when the options ask for it. The processor graph is reset afterwards.
		"""
		if graph is None :
			graph = Graph()
		if self.options.output_processor_graph :
			self._copy_graph(graph, self.options.processor_graph.graph)
		self.options.reset_processor_graph()
		return graph

	def _get_input(self, name) :
		"""
		Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
		returning a file-like object. If name is none of these, it returns the input argument (that should
		be, supposedly, a file-like object already).

		If the media type has not been set explicitly at initialization of this instance,
		the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
		L{host.preferred_suffixes} for the suffix to media type mapping.

		As a side effect, C{self.base}, C{self.required_base} (if not yet set) and C{self.charset} are updated.

		@param name: identifier of the input source
		@type name: string or a file-like object
		@return: a file like object if opening "name" is possible and successful, "name" otherwise
		@raise HTTPError: propagated unchanged when raised while opening the source
		@raise FailedSource: for any other failure to open the source; the original exception is passed as its argument
		"""
		if not self._is_string(name) :
			# Not a name at all: supposedly an already opened, file-like object
			return name

		try :
			# check if this is a URI, ie, if there is a valid 'scheme' part
			# otherwise it is considered to be a simple file
			if urlparse(name)[0] != "" :
				url_request = URIOpener(name)
				self.base = url_request.location
				if self.media_type == "" :
					if url_request.content_type in content_to_host_language :
						self.media_type = url_request.content_type
					else :
						self.media_type = MediaTypes.xml
					self.options.set_host_language(self.media_type)
				self.charset = url_request.charset
				if self.required_base is None :
					self.required_base = name
				return url_request.data

			self.base = name
			# Creating a File URI for this thing
			if self.required_base is None :
				self.required_base = "file://" + os.path.join(os.getcwd(),name)
			if self.media_type == "" :
				self.media_type = MediaTypes.xml
				# see if the default should be overwritten
				for suffix in preferred_suffixes :
					if name.endswith(suffix) :
						self.media_type = preferred_suffixes[suffix]
						self.charset = 'utf-8'
						break
				self.options.set_host_language(self.media_type)
			# The 'file' builtin exists on Python 2 only; 'open' works everywhere.
			# On Python 3 the stream is opened in binary mode; Python 2 keeps
			# the text mode 'file(name)' used to give.
			if sys.version_info[0] >= 3 :
				return open(name, 'rb')
			else :
				return open(name, 'r')
		except HTTPError :
			# HTTP errors carry their own status: let the caller see them as they are
			raise
		except Exception as e :
			raise FailedSource(e)

	####################################################################################################################
	# Externally used methods
	#
	def graph_from_DOM(self, dom, graph = None, pgraph = None) :
		"""
		Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other methods get down to this
		one, eventually (e.g., after opening a URI and parsing it into a DOM).
		@param dom: a DOM Node element, the top level entry node for the whole tree (i.e., the C{dom.documentElement} is used to initiate processing down the node hierarchy)
		@keyword graph: an RDF Graph (if None, then a new one is created)
		@type graph: rdflib Graph instance.
		@keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@type pgraph: rdflib Graph instance
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		"""
		if graph is None :
			# Create the RDF Graph, that will contain the return triples...
			graph = Graph()

		# this will collect the content, the 'default graph', as called in the RDFa spec
		default_graph = Graph()

		# get the DOM tree
		topElement = dom.documentElement

		# Create the initial state. This takes care of things
		# like base, top level namespace settings, etc.
		state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

		# Perform the built-in and external transformations on the HTML tree.
		# (The transformers receive the state, hence they run after its creation.)
		for trans in self.options.transformers + builtInTransformers :
			trans(topElement, self.options, state)

		# This may have changed if the state setting detected an explicit version information:
		self.rdfa_version = state.rdfa_version

		# this function is the real workhorse
		parse_one_node(topElement, default_graph, None, state, [])

		# If the RDFS expansion has to be made, here is the place...
		if self.options.vocab_expansion :
			from pyRdfa.rdfs.process import process_rdfa_sem
			process_rdfa_sem(default_graph, self.options)

		# What should be returned depends on the way the options have been set up
		if self.options.output_default_graph :
			self._copy_graph(graph, default_graph)
		if self.options.output_processor_graph :
			# the processor graph goes into its own graph if the caller gave one,
			# otherwise it is merged into the returned graph
			target = pgraph if pgraph is not None else graph
			self._copy_graph(target, self.options.processor_graph.graph)

		# this is necessary if several DOM trees are handled in a row...
		self.options.reset_processor_graph()

		return graph

	def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
		"""
		Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
		returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

		C{self.http_status} is set to 400 when the source cannot be opened or parsed, to the HTTP code of an
		L{HTTPError}, to C{None} for an C{ImportError} and to 500 for any other failure. A status assigned for a
		failure to open the source is kept when that exception is re-raised to the caller.

		A stream opened by this method (ie, when C{name} is a URI or a file name) is closed before returning;
		a file-like object passed in by the caller is left open.

		@param name: a URI, a file name, or a file-like object
		@param graph: rdflib Graph instance. If None, a new one is created.
		@param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph. Parsing errors are always returned as error triples, regardless of this flag.
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		"""
		isstring = self._is_string(name)
		source = None
		# Exception re-raised by one of the source-opening handlers below; its
		# http_status has been assigned already and must not be overwritten by
		# the catch-all handler at the end.
		classified = None

		try :
			# First, open the source... Possible HTTP errors are returned as error triples
			try :
				source = self._get_input(name)
			except FailedSource as f :
				self.http_status = 400
				if not rdfOutput :
					classified = f
					raise
				err = self.options.add_error(f.msg, FileReferenceError, name)
				self.options.processor_graph.add_http_context(err, 400)
				return self._error_graph(graph)
			except HTTPError as h :
				self.http_status = h.http_code
				if not rdfOutput :
					classified = h
					raise
				err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code,h.msg), HTError, name)
				self.options.processor_graph.add_http_context(err, h.http_code)
				return self._error_graph(graph)
			except Exception as e :
				self.http_status = 500
				# Something nasty happened:-(
				if not rdfOutput :
					classified = e
					raise
				err = self.options.add_error(str(e), context = name)
				self.options.processor_graph.add_http_context(err, 500)
				return self._error_graph(graph)

			try :
				if self.options.host_language == HostLanguage.html5 :
					import warnings
					warnings.filterwarnings("ignore", category=DeprecationWarning)
					import html5lib
					parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
					if self.charset :
						# This means the HTTP header has provided a charset, or the
						# file is a local file when we suppose it to be a utf-8
						dom = parser.parse(source, encoding=self.charset)
					else :
						# No charset set. The HTMLLib parser tries to sniff into the
						# the file to find a meta header for the charset; if that
						# works, fine, otherwise it falls back on window-...
						dom = parser.parse(source)

					# The source has been consumed by the parser; get it again
					# (reopen it by name, or rewind the caller's stream) to look
					# for the version information.
					try :
						if isstring :
							source.close()
							source = self._get_input(name)
						else :
							source.seek(0)
						from pyRdfa.host import adjust_html_version
						self.rdfa_version = adjust_html_version(source, self.rdfa_version)
					except Exception :
						# if anything goes wrong, it is not really important; rdfa version stays what it was...
						pass

				else :
					# in other cases an XML parser has to be used
					from pyRdfa.host import adjust_xhtml_and_version
					dom = xml.dom.minidom.parse(source)
					(adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
					self.options.host_language = adjusted_host_language
					self.rdfa_version          = version
			except ImportError :
				msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
				raise ImportError(msg)
			except Exception as e :
				# These are various parsing exception. Per spec, this is a case when
				# error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
				# return page or a graph with error triples) does not apply
				err = self.options.add_error(str(e), context = name)
				self.http_status = 400
				self.options.processor_graph.add_http_context(err, 400)
				return self._error_graph(graph)

			# If we got here, we have a DOM tree to operate on...
			return self.graph_from_DOM(dom, graph, pgraph)
		except Exception :
			# Something nasty happened during the generation of the graph...
			(exc_type, exc_value, exc_tb) = sys.exc_info()
			sys.excepthook(exc_type, exc_value, exc_tb)
			if exc_value is classified :
				# Already handled above: keep the more specific http_status
				raise
			if isinstance(exc_value, ImportError) :
				self.http_status = None
			else :
				self.http_status = 500
			if not rdfOutput :
				# bare raise keeps the original traceback
				raise
			err = self.options.add_error(str(exc_value), context = name)
			self.options.processor_graph.add_http_context(err, 500)
			return self._error_graph(graph)
		finally :
			self._close_if_owned(source, name)

	def rdf_from_sources(self, names, outputFormat = "turtle", rdfOutput = False) :
		"""
		Extract an RDF graph from a list of RDFa sources and serialize them in one graph. The sources are parsed, the RDF
		extracted, and serialization is done in the specified format.
		@param names: list of sources, each can be a URI, a file name, or a file-like object
		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
		@type rdfOutput: boolean
		@return: a serialized RDF Graph
		@rtype: string
		"""
		# This is better because it gives access to the various, non-standard serializations
		# If it does not work because the extra are not installed, fall back to the standard
		# rdflib distribution...
		try :
			from pyRdfaExtras import MyGraph
			graph = MyGraph()
		except Exception :
			graph = Graph()

		graph.bind("xsd", Namespace('http://www.w3.org/2001/XMLSchema#'))
		# the value of rdfOutput determines the reaction on exceptions...
		for name in names :
			self.graph_from_source(name, graph, rdfOutput)
		return graph.serialize(format=outputFormat)

	def rdf_from_source(self, name, outputFormat = "turtle", rdfOutput = False) :
		"""
		Extract an RDF graph from an RDFa source and serialize it in one graph. The source is parsed, the RDF
		extracted, and serialization is done in the specified format.
		@param name: a URI, a file name, or a file-like object
		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
		@type rdfOutput: boolean
		@return: a serialized RDF Graph
		@rtype: string
		"""
		return self.rdf_from_sources([name], outputFormat, rdfOutput)