Beispiel #1
0
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		Store the settings of the distiller; nothing is read or parsed at this point.

		@keyword options: Options for the distiller; a default L{Options} instance is created if none is given
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source.
		If not empty, it is also forwarded to C{options.set_host_language}
		@keyword rdfa_version: the RDFa version that should be used. The value is stored as is; if it is C{None},
		the choice of the version is left to the later processing steps
		"""
		self.http_status = 200

		self.base = base
		# an empty string means that the caller did not impose any base
		self.required_base = base if base != "" else None
		self.charset = None

		# predefined content type
		self.media_type = media_type

		# identity test on purpose: the decision must not depend on a possible __eq__ of the options object
		self.options = Options() if options is None else options

		if media_type != "" :
			self.options.set_host_language(self.media_type)

		self.rdfa_version = rdfa_version
Beispiel #2
0
	def convert( self ) :
		"""
		Top level entry to convert and generate all the triples. It finds the top level items,
		and generates triples for each of them.

		If vocabularies were used (C{self.vocabularies_used}), the graph is post-processed with the help of pyRdfa:
		the full vocabulary expansion is run when C{self.vocab_expansion} is set, the C{MiniOWL} closure otherwise.
		This post-processing is best effort: if pyRdfa cannot be imported, or the processing fails, the graph
		is left as it was after the triple generation.

		NOTE: the RDF list linking the base to the top level items is currently not generated.
		"""
		for top_level_item in self.get_top_level_items() :
			self.generate_triples(top_level_item, Evaluation_Context())

		if not self.vocabularies_used :
			return

		# This is the version with my current proposal: the basic expansion is always there;
		# the follow-your-nose inclusion of vocabulary is optional
		try :
			try :
				from ..pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
				from ..pyRdfa.options      import Options
			except (ImportError, ValueError, SystemError) :
				# Depending on the interpreter version a failing relative import surfaces as one of
				# these exceptions; fall back on a pyRdfa installed as a top level package
				from pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
				from pyRdfa.options      import Options

			if self.vocab_expansion :
				# This is the full deal
				options = Options(vocab_expansion = self.vocab_expansion, vocab_cache = self.vocab_cache)
				process_rdfa_sem(self.graph, options)
			else :
				MiniOWL(self.graph).closure()
		except Exception :
			# pyRdfa could not be imported, or the expansion failed. Too bad, but life should go on:
			# the failure is only recorded for debugging, the triples generated above remain valid
			import logging
			logging.getLogger(__name__).debug("Vocabulary post-processing skipped", exc_info = True)
Beispiel #3
0
def extrair_rdfa(url):
    """Extract the RDFa content found at *url* and return it as an RDFLib graph.

    The source is distilled by pyRdfa (with the embedded RDF option switched on) into
    an RDF/XML serialization, which is then parsed into a fresh ``Graph``.
    """
    options = Options(embedded_rdf=True)
    serialized = pyRdfa(options=options).rdf_from_source(
        url, outputFormat='pretty-xml')

    g = Graph()
    # ``data`` takes the serialization whether it comes back as str or as bytes.
    # The format is stated explicitly: there is no file name or media type the
    # parser could guess it from, and the serialization is known to be RDF/XML.
    g.parse(data=serialized, format='xml')
    return g
Beispiel #4
0
def check_term(conn, term, predicates):
    """Compare the RDFa of the tree page built for an OBI term with a stored Turtle file.

    The page is produced by ``gizmos.tree.build_tree``, parsed into a DOM, and its RDFa is
    collected into an RDFLib graph by the pyRdfa node parser. The result is handed to
    ``compare_graphs`` together with the graph parsed from ``tests/resources``.

    :param conn: database connection; only its ``cursor()`` is used here
    :param term: term identifier, also part of the name of the expected Turtle file
    :param predicates: predicate ids forwarded to ``build_tree``; if truthy, the
        ``-predicates`` variant of the expected file is used
    """
    page = gizmos.tree.build_tree(conn.cursor(),
                                  "obi",
                                  term,
                                  predicate_ids=predicates)

    # Turn the page into a DOM document and pick its root element
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    root = html5lib.HTMLParser(tree=dom_builder).parse(page).documentElement

    # Initial pyRdfa state for the root element
    actual = Graph()
    settings = dict(
        output_default_graph=True,
        output_processor_graph=True,
        space_preserve=True,
        transformers=[],
        embedded_rdf=True,
        vocab_expansion=False,
        vocab_cache=True,
        vocab_cache_report=False,
        refresh_vocab_cache=False,
        check_lite=False,
        experimental_features=True,
    )
    state = ExecutionContext(root,
                             actual,
                             base="http://purl.obolibrary.org/obo/",
                             options=Options(**settings),
                             rdfa_version="1.1")

    # Walk the DOM (recursively) and add the RDFa content to the graph
    parse_one_node(root, actual, None, state, [])

    # Load what is expected for this term
    if predicates:
        expected_path = f"tests/resources/obi-tree-{term}-predicates.ttl"
    else:
        expected_path = f"tests/resources/obi-tree-{term}.ttl"
    expected = Graph()
    expected.parse(expected_path, format="turtle")

    compare_graphs(actual, expected)
Beispiel #5
0
    def parse(self):
        """
        Run pyRdfa on C{self.uri}; the default and the processor graphs are collected in
        C{self.default_graph} and C{self.processor_graph}, respectively.

        If C{self.rdfa_lite} is set, the C{lite_prune} transformer of pyRdfa is added to the processing.
        The processor instance itself is kept in C{self.processor} for later use.
        """
        if self.rdfa_lite:
            from pyRdfa.transform.lite import lite_prune
            transformers = [lite_prune]
        else:
            transformers = []

        processor = pyRdfa(
            options=Options(
                output_default_graph=True,
                output_processor_graph=True,
                transformers=transformers,
                vocab_expansion=self.vocab_expansion,
                embedded_rdf=self.embedded_rdf,
                add_informational_messages=True,
            ),
            base=self.base,
            media_type=self.media_type,
        )
        processor.graph_from_source(
            self.uri,
            graph=self.default_graph,
            pgraph=self.processor_graph,
            rdfOutput=True,
        )

        # The processor is kept: some of its parameters are needed for the error messages
        self.processor = processor
Beispiel #6
0
class ExecutionContext:
    """State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
	the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
	URI references for RDFLib.
	
	@ivar options: reference to the overall options
	@type options: L{Options}
	@ivar base: the 'base' URI
	@ivar parsedBase: the parsed version of base, as produced by urlparse.urlsplit
	@ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
	@ivar lang: language tag (possibly None)
	@ivar term_or_curie: vocabulary management class instance
	@type term_or_curie: L{termorcurie.TermOrCurie}
	@ivar list_mapping: dictionary of arrays, containing a list of URIs key-ed via properties for lists
	@ivar node: the node to which this state belongs
	@type node: DOM node instance
	@ivar rdfa_version: RDFa version of the content
	@type rdfa_version: String
	@ivar supress_lang: in some cases, the effect of the lang attribute should be supressed for the given node, although it should be inherited down below (example: @value attribute of the data element in HTML5)
	@type supress_lang: Boolean
	@cvar _list: list of attributes that allow for lists of values and should be treated as such
	@cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s). Is initialized at first instantiation.
	"""

    # list of attributes that allow for lists of values and should be treated as such
    _list = ["rel", "rev", "property", "typeof", "role"]
    # mapping table from attribute name to the exact method to retrieve the URI(s).
    _resource_type = {}

    def __init__(self,
                 node,
                 graph,
                 inherited_state=None,
                 base="",
                 options=None,
                 rdfa_version=None):
        """
        Set up the state for C{node}: either on top of an inherited state, or from scratch at the top level.

        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node.
        @type inherited_state: L{state.ExecutionContext}
        @keyword base: string denoting the base URI. It is used at the top level only, and only if the
        content itself does not set a base (through the HTML C{<base>} element or through C{@xml:base}
        for the host languages that accept it)
        @keyword options: invocation options, and references to warning graphs. Used at the top level only;
        an inherited state brings its own options along
        @type options: L{Options<pyRdfa.options>}
        @keyword rdfa_version: RDFa version to be used at the top level. If not set, C{rdfa_current_version}
        of pyRdfa is used; in both cases a C{@version} attribute on the node may overwrite the value
        """
        def remove_frag_id(uri):
            """
            The fragment ID for self.base must be removed. If the URI cannot be parsed, it is returned unchanged.
            """
            try:
                # To be on the safe side:-)
                parts = urlparse(uri)
                return urlunparse(
                    (parts[0], parts[1], parts[2], parts[3], parts[4], ""))
            except Exception:
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
        if not ExecutionContext._resource_type:
            ExecutionContext._resource_type = {
                "href": ExecutionContext._URI,
                "src": ExecutionContext._URI,
                "vocab": ExecutionContext._URI,
                "about": ExecutionContext._CURIEorURI,
                "resource": ExecutionContext._CURIEorURI,
                "rel": ExecutionContext._TERMorCURIEorAbsURI,
                "rev": ExecutionContext._TERMorCURIEorAbsURI,
                "datatype": ExecutionContext._TERMorCURIEorAbsURI,
                "typeof": ExecutionContext._TERMorCURIEorAbsURI,
                "property": ExecutionContext._TERMorCURIEorAbsURI,
                "role": ExecutionContext._TERMorCURIEorAbsURI,
            }

        # the HTML family of host languages gets a special treatment both for the base and for the language
        html_hosts = [
            HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5
        ]

        #-----------------------------------------------------------------
        self.node = node

        #-----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for (for the HTML cases, that is)
        if inherited_state:
            self.rdfa_version = inherited_state.rdfa_version
            self.base = inherited_state.base
            self.options = inherited_state.options

            # the list structure is shared with the parent
            self.list_mapping = inherited_state.list_mapping
            self.new_list = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else:
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list = True

            if rdfa_version is not None:
                self.rdfa_version = rdfa_version
            else:
                from pyRdfa import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version"):
                top_version = node.getAttribute("version")
                if "RDFa 1.0" in top_version or "RDFa1.0" in top_version:
                    self.rdfa_version = "1.0"
                elif "RDFa 1.1" in top_version or "RDFa1.1" in top_version:
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options is None:
                from pyRdfa import Options
                self.options = Options()
            else:
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in html_hosts:
                for base_element in node.getElementsByTagName("base"):
                    if base_element.hasAttribute("href"):
                        self.base = remove_frag_id(
                            base_element.getAttribute("href"))
                        # HTML takes the first <base> element with an @href; later ones are ignored
                        break
            elif self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "":
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes:
                prefixes = beautifying_prefixes[self.options.host_language]
                for prefix in prefixes:
                    graph.bind(prefix, prefixes[prefix])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
                self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        #-----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlsplit(self.base)

        #-----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        #-----------------------------------------------------------------
        # Settling the language tags
        # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
        # first get the inherited state's language, if any
        self.lang = inherited_state.lang if inherited_state else None

        self.supress_lang = False

        if self.options.host_language in html_hosts:
            # we may have lang and xml:lang; if both are present, @xml:lang has the priority
            if node.hasAttribute("xml:lang"):
                declared = node.getAttribute("xml:lang").lower()
            elif node.hasAttribute("lang"):
                declared = node.getAttribute("lang").lower()
            else:
                declared = None
            # An empty attribute value resets the language; a missing attribute keeps the inherited one
            if declared is not None:
                self.lang = declared if len(declared) != 0 else None
            # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
            # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
            # error situations are simply swallowed...

        elif self.options.host_language in accept_xml_lang and node.hasAttribute(
                "xml:lang"):
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0:
                self.lang = None

        #-----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None

    # end __init__

    def _URI(self, val):
        """Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also
        checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an
        uninterpreted CURIE...
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance
        """
        def create_URIRef(uri, check=True):
            """
            Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created. In case
            there is something unusual, a warning is generated (though the URIRef is created nevertheless)
            @param uri: (absolute) URI string
            @param check: whether the scheme of the URI should be checked at all
            @return: an RDFLib URIRef instance
            """
            from pyRdfa import uri_schemes
            stripped = uri.strip()
            if check and urlsplit(stripped)[0] not in uri_schemes:
                self.options.add_warning(err_URI_scheme % stripped,
                                         node=self.node.nodeName)
            return URIRef(stripped)

        def join(base, v, check=True):
            """
            Mini helping function: it makes a urljoin for the paths. Based on the python library, but
            that one has a bug: in some cases it
            swallows the '#' or '?' character at the end. This is clearly a problem with
            Semantic Web URI-s, so this is checked, too
            @param base: base URI string
            @param v: local part
            @param check: whether the URI should be checked against the list of 'existing' URI schemes
            @return: an RDFLib URIRef instance
            """
            # UGLY!!! There is a bug for a corner case in python version <= 2.5.X
            if len(v) > 0 and v[0] == '?' and (py_v_major < 3
                                               and py_v_minor <= 5):
                return create_URIRef(base + v, check)

            joined = urljoin(base, v)
            # Put the trailing '#' or '?' back if the join has dropped it. The emptiness of both
            # strings is tested explicitly, so that no exception has to be caught for the indexing.
            if v and joined and v[-1] != joined[-1] and v[-1] in ("#", "?"):
                joined = joined + v[-1]
            return create_URIRef(joined, check)

        if val == "":
            # The empty value refers to the base itself
            return URIRef(self.base)

        # fall back on good old traditional URI-s.
        # To be on the safe side, let us use the Python libraries
        if self.parsedBase[0] != "":
            # Trust the python library...
            # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
            # swallows the '#' or '?' character at the end. This is clearly a problem with
            # Semantic Web URI-s; this is handled in join
            return join(self.base, val)

        # base is, in fact, a local file name
        # The following check is just to be sure that some pathological cases when
        # the ':' _does_ appear in the URI but not in a scheme position is taken
        # care of properly...
        if urlsplit(val)[0] == "":
            # relative URI, to be combined with local file name:
            return join(self.base, val, check=False)
        return create_URIRef(val)

    # end _URI

    def _CURIEorURI(self, val):
        """Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
		is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
		@param val: attribute value to be interpreted
		@type val: string
		@return: an RDFLib URIRef instance or None
		"""
        if val == "":
            return URIRef(self.base)

        safe_curie = False
        if val[0] == '[':
            # If a safe CURIE is asked for, a pure URI is not acceptable.
            # Is checked below, and that is why the safe_curie flag is necessary
            if val[-1] != ']':
                # that is certainly forbidden: an incomplete safe CURIE
                self.options.add_warning(err_illegal_safe_CURIE % val,
                                         UnresolvablePrefix,
                                         node=self.node.nodeName)
                return None
            else:
                val = val[1:-1]
                safe_curie = True
        # There is a branch here depending on whether we are in 1.1 or 1.0 mode
        if self.rdfa_version >= "1.1":
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval == None:
                # the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
                # The rule says that then the whole value should be considered as a URI
                # except if it was part of a safe CURIE. In that case it should be ignored...
                if safe_curie:
                    self.options.add_warning(err_no_CURIE_in_safe_CURIE % val,
                                             UnresolvablePrefix,
                                             node=self.node.nodeName)
                    return None
                else:
                    return self._URI(val)
            else:
                # there is an unlikely case where the retval is actually a URIRef with a relative URI. Better filter that one out
                if isinstance(retval, BNode) == False and urlsplit(
                        str(retval))[0] == "":
                    # yep, there is something wrong, a new URIRef has to be created:
                    return URIRef(self.base + str(retval))
                else:
                    return retval
        else:
            # in 1.0 mode a CURIE can be considered only in case of a safe CURIE
            if safe_curie:
                return self.term_or_curie.CURIE_to_URI(val)
            else:
                return self._URI(val)

    # end _CURIEorURI

    def _TERMorCURIEorAbsURI(self, val):
        """Returns a URI either for a term or for a CURIE. The value must be an NCNAME to be handled as a term; otherwise
		the method falls back on a CURIE or an absolute URI.
		@param val: attribute value to be interpreted
		@type val: string
		@return: an RDFLib URIRef instance or None
		"""
        from pyRdfa import uri_schemes
        # This case excludes the pure base, ie, the empty value
        if val == "":
            return None

        from pyRdfa.termorcurie import ncname, termname
        if termname.match(val):
            # This is a term, must be handled as such...
            retval = self.term_or_curie.term_to_URI(val)
            if not retval:
                self.options.add_warning(err_undefined_terms % val,
                                         UnresolvableTerm,
                                         node=self.node.nodeName,
                                         buggy_value=val)
                return None
            else:
                return retval
        else:
            # try a CURIE
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval:
                return retval
            elif self.rdfa_version >= "1.1":
                # See if it is an absolute URI
                scheme = urlsplit(val)[0]
                if scheme == "":
                    # bug; there should be no relative URIs here
                    self.options.add_warning(err_non_legal_CURIE_ref % val,
                                             UnresolvablePrefix,
                                             node=self.node.nodeName)
                    return None
                else:
                    if scheme not in uri_schemes:
                        self.options.add_warning(err_URI_scheme % val.strip(),
                                                 node=self.node.nodeName)
                    return URIRef(val)
            else:
                # rdfa 1.0 case
                self.options.add_warning(err_undefined_CURIE % val.strip(),
                                         UnresolvablePrefix,
                                         node=self.node.nodeName)
                return None

    # end _TERMorCURIEorAbsURI

    # -----------------------------------------------------------------------------------------------

    def getURI(self, attr):
        """Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
		a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
		using the L{ExecutionContext._resource_type} table.
		@param attr: attribute name
		@type attr: string
		@return: an RDFLib URIRef instance (or None) or a list of those
		"""
        if self.node.hasAttribute(attr):
            val = self.node.getAttribute(attr)
        else:
            if attr in ExecutionContext._list:
                return []
            else:
                return None

        # This may raise an exception if the attr has no key. This, actually,
        # should not happen if the code is correct, but it does not harm having it here...
        try:
            func = ExecutionContext._resource_type[attr]
        except:
            # Actually, this should not happen...
            func = ExecutionContext._URI

        if attr in ExecutionContext._list:
            # Allows for a list
            resources = [
                func(self, v.strip()) for v in val.strip().split() if v != None
            ]
            retval = [r for r in resources if r != None]
        else:
            retval = func(self, val.strip())
        return retval

    # end getURI

    def getResource(self, *args):
        """Get single resources from several different attributes. The first one that returns a valid URI wins.
		@param args: variable list of attribute names, or a single attribute being a list itself.
		@return: an RDFLib URIRef instance (or None) :
		"""
        if len(args) == 0:
            return None
        if isinstance(args[0], TupleType) or isinstance(args[0], ListType):
            rargs = args[0]
        else:
            rargs = args

        for resource in rargs:
            uri = self.getURI(resource)
            if uri != None: return uri
        return None

    # -----------------------------------------------------------------------------------------------
    def reset_list_mapping(self, origin=None):
        """
        Replace the list mapping with a fresh L{ListStructure} and mark the state as holding a new list.
        @param origin: if set (and not false), it is recorded as the origin of the new structure
        """
        self.list_mapping = ListStructure()
        self.new_list = True
        if origin:
            self.set_list_origin(origin)

    def list_empty(self):
        """
		Checks whether the list is empty.
		@return: Boolean
		"""
        return len(self.list_mapping.mapping) == 0

    def get_list_props(self):
        """
		Return the list of property values in the list structure
		@return: list of URIRef
		"""
        return list(self.list_mapping.mapping.keys())

    def get_list_value(self, prop):
        """
		Return the list of values in the list structure for a specific property
		@return: list of RDF nodes
		"""
        return self.list_mapping.mapping[prop]

    def set_list_origin(self, origin):
        """
		Set the origin of the list, ie, the subject to attach the final list(s) to
		@param origin: URIRef
		"""
        self.list_mapping.origin = origin

    def get_list_origin(self):
        """
		Return the origin of the list, ie, the subject to attach the final list(s) to
		@return: URIRef
		"""
        return self.list_mapping.origin

    def add_to_list_mapping(self, property, resource):
        """Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
		if the array does not exist yet, it will be created on the fly.
		
		@param property: the property URI, used as a key in the dictionary
		@param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
		placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not
		an empty list has to be generated.
		"""
        if property in self.list_mapping.mapping:
            if resource != None:
                # indeed, if it is None, than it should not override anything
                if self.list_mapping.mapping[property] == None:
                    # replacing a dummy with real content
                    self.list_mapping.mapping[property] = [resource]
                else:
                    self.list_mapping.mapping[property].append(resource)
        else:
            if resource != None:
                self.list_mapping.mapping[property] = [resource]
            else:
                self.list_mapping.mapping[property] = None
Beispiel #7
0
    def __init__(self,
                 node,
                 graph,
                 inherited_state=None,
                 base="",
                 options=None,
                 rdfa_version=None):
        """
		@param node: the current DOM Node
		@param graph: the RDFLib Graph
		@keyword inherited_state: the state as inherited
		from upper layers. This inherited_state is mixed with the state information
		retrieved from the current node.
		@type inherited_state: L{state.ExecutionContext}
		@keyword base: string denoting the base URI for the specific node. This overrides the possible
		base inherited from the upper layers. The 
		current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
		necessary for SVG (and other possible XML dialects that accept C{@xml:base})
		@keyword options: invocation options, and references to warning graphs
		@type options: L{Options<pyRdfa.options>}
		"""
        def remove_frag_id(uri):
            """
			The fragment ID for self.base must be removed
			"""
            try:
                # To be on the safe side:-)
                t = urlparse(uri)
                return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
            except:
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
        if len(ExecutionContext._resource_type) == 0:
            ExecutionContext._resource_type = {
                "href": ExecutionContext._URI,
                "src": ExecutionContext._URI,
                "vocab": ExecutionContext._URI,
                "about": ExecutionContext._CURIEorURI,
                "resource": ExecutionContext._CURIEorURI,
                "rel": ExecutionContext._TERMorCURIEorAbsURI,
                "rev": ExecutionContext._TERMorCURIEorAbsURI,
                "datatype": ExecutionContext._TERMorCURIEorAbsURI,
                "typeof": ExecutionContext._TERMorCURIEorAbsURI,
                "property": ExecutionContext._TERMorCURIEorAbsURI,
                "role": ExecutionContext._TERMorCURIEorAbsURI,
            }
        #-----------------------------------------------------------------
        self.node = node

        #-----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for (for the HTML cases, that is)
        if inherited_state:
            self.rdfa_version = inherited_state.rdfa_version
            self.base = inherited_state.base
            self.options = inherited_state.options

            self.list_mapping = inherited_state.list_mapping
            self.new_list = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else:
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list = True

            if rdfa_version is not None:
                self.rdfa_version = rdfa_version
            else:
                from pyRdfa import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version"):
                top_version = node.getAttribute("version")
                if top_version.find("RDFa 1.0") != -1 or top_version.find(
                        "RDFa1.0") != -1:
                    self.rdfa_version = "1.0"
                elif top_version.find("RDFa 1.1") != -1 or top_version.find(
                        "RDFa1.1") != -1:
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options == None:
                from pyRdfa import Options
                self.options = Options()
            else:
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in [
                    HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5
            ]:
                for bases in node.getElementsByTagName("base"):
                    if bases.hasAttribute("href"):
                        self.base = remove_frag_id(bases.getAttribute("href"))
                        continue
            elif self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "":
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes:
                dict = beautifying_prefixes[self.options.host_language]
                for key in dict:
                    graph.bind(key, dict[key])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
                self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        #-----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlsplit(self.base)

        #-----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        #-----------------------------------------------------------------
        # Settling the language tags
        # @lang has priority over @xml:lang
        # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
        # first get the inherited state's language, if any
        if inherited_state:
            self.lang = inherited_state.lang
        else:
            self.lang = None

        self.supress_lang = False

        if self.options.host_language in [
                HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5
        ]:
            # we may have lang and xml:lang
            if node.hasAttribute("lang"):
                lang = node.getAttribute("lang").lower()
            else:
                lang = None
            if node.hasAttribute("xml:lang"):
                xmllang = node.getAttribute("xml:lang").lower()
            else:
                xmllang = None
            # First of all, set the value, if any
            if xmllang != None:
                # this has priority
                if len(xmllang) != 0:
                    self.lang = xmllang
                else:
                    self.lang = None
            elif lang != None:
                if len(lang) != 0:
                    self.lang = lang
                else:
                    self.lang = None
            # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
            # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
            # error situations are simply swallowed...

        elif self.options.host_language in accept_xml_lang and node.hasAttribute(
                "xml:lang"):
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0: self.lang = None

        #-----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS != None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None
Beispiel #8
0
def return_graph(uri, options, newCache=False):
    """Dereference a vocabulary URI and parse the result into an RDFLib Graph.

    The content type of the response decides which parser is used: one of
    RDFLib's parsers (for the Turtle, RDF/XML, and N Triple cases) or a
    separate RDFa processing for (X)HTML and XML content.

    The Accept header of the HTTP request gives a preference to Turtle,
    followed by RDF/XML and then HTML (RDFa), in case content negotiation
    is used.

    This function is used to retrieve the vocabulary file and turn it into
    an RDFLib graph.

    @param uri: URI for the graph
    @param options: used as a place where warnings can be sent
    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
    @return: a tuple consisting of an RDFLib Graph instance and an expiration date. Both are None if the URI could not be dereferenced; the graph alone is None if the content could not be parsed or its media type is not recognised.
    """
    def return_to_cache(msg):
        # 'msg' (the exception value) is accepted but is not part of the
        # warning text; only the URI is reported.
        if newCache:
            options.add_warning(err_unreachable_vocab % uri,
                                warning_type=VocabReferenceError)
        else:
            options.add_warning(err_outdated_cache % uri,
                                warning_type=VocabReferenceError)

    retval = None

    try:
        content = URIOpener(
            uri, {
                'Accept':
                'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'
            })
    except (HTTPError, RDFaError, Exception):
        # All three failure classes are handled the same way: warn and give up.
        return_to_cache(sys.exc_info()[1])
        return (None, None)

    # Store the expiration date of the newly accessed data
    expiration_date = content.expiration_date

    if content.content_type == MediaTypes.turtle:
        try:
            retval = Graph()
            retval.parse(content.data, format="n3")
        except Exception:
            # Do not hand back a partially filled graph as if parsing worked
            retval = None
            options.add_warning(err_unparsable_Turtle_vocab %
                                (uri, sys.exc_info()[1]))
    elif content.content_type == MediaTypes.rdfxml:
        try:
            retval = Graph()
            retval.parse(content.data)
        except Exception:
            retval = None
            # NOTE(review): the Turtle message template is reused for RDF/XML
            # content here; switch to a dedicated template if one exists.
            options.add_warning(err_unparsable_Turtle_vocab %
                                (uri, sys.exc_info()[1]))
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except Exception:
            retval = None
            options.add_warning(err_unparsable_ntriples_vocab %
                                (uri, sys.exc_info()[1]))
    elif content.content_type in [
            MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml
    ] or xml_application_media_type.match(content.content_type) is not None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            # The nested RDFa run gets its own Options instance. The caller's
            # 'options' must stay bound to this name, otherwise the warning
            # below would be recorded on the throwaway instance and be lost.
            retval = pyRdfa(Options()).graph_from_source(content.data)
        except Exception:
            retval = None
            options.add_warning(err_unparsable_rdfa_vocab %
                                (uri, sys.exc_info()[1]))
    else:
        options.add_warning(err_unrecognised_vocab_type %
                            (uri, content.content_type))

    return (retval, expiration_date)
Beispiel #9
0
class pyRdfa :
	"""Main processing class for the distiller
	
	@ivar options: an instance of the L{Options} class
	@ivar media_type: the preferred default media type, possibly set at initialization
	@ivar base: the base value, possibly set at initialization
	@ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers
	"""
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		@keyword options: Options for the distiller
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the the RDFa source
		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
		"""
		self.http_status = 200
		
		self.base = base
		if base == "" :
			self.required_base = None
		else :
			self.required_base	= base
		self.charset 		= None

		# predefined content type
		self.media_type = media_type

		if options == None :
			self.options = Options()
		else :
			self.options = options

		if media_type != "" :
			self.options.set_host_language(self.media_type)
			
		if rdfa_version is not None :
			self.rdfa_version = rdfa_version
		else :
			self.rdfa_version = None
		
	def _get_input(self, name) :
		"""
		Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
		returning a file-like object. If name is none of these, it returns the input argument (that should
		be, supposedly, a file-like object already).
		
		If the media type has not been set explicitly at initialization of this instance,
		the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
		L{host.preferred_suffixes} for the suffix to media type mapping. 
		
		@param name: identifier of the input source
		@type name: string or a file-like object
		@return: a file like object if opening "name" is possible and successful, "name" otherwise
		"""
		try :
			if isinstance(name, basestring) :
				# check if this is a URI, ie, if there is a valid 'scheme' part
				# otherwise it is considered to be a simple file
				if urlparse.urlparse(name)[0] != "" :
					url_request 	  = URIOpener(name)
					self.base 		  = url_request.location
					if self.media_type == "" :
						if url_request.content_type in content_to_host_language :
							self.media_type = url_request.content_type
						else :
							self.media_type = MediaTypes.xml
						self.options.set_host_language(self.media_type)
					self.charset = url_request.charset
					if self.required_base == None :
						self.required_base = name
					return url_request.data
				else :
					self.base = name
					# Creating a File URI for this thing
					if self.required_base == None :
						self.required_base = "file://" + os.path.join(os.getcwd(),name)
					if self.media_type == "" :
						self.media_type = MediaTypes.xml
						# see if the default should be overwritten
						for suffix in preferred_suffixes :
							if name.endswith(suffix) :
								self.media_type = preferred_suffixes[suffix]
								self.charset = 'utf-8'
								break
						self.options.set_host_language(self.media_type)
					return file(name)
			else :
				return name
		except HTTPError, h :
			raise h
		except :
Beispiel #10
0
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_Turtle_vocab % (uri, value))
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_ntriples_vocab % (uri, value))
    elif content.content_type in [
            MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml
    ] or xml_application_media_type.match(content.content_type) != None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            options = Options()
            retval = pyRdfa(options).graph_from_source(content.data)
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_rdfa_vocab % (uri, value))
    else:
        options.add_warning(err_unrecognised_vocab_type %
                            (uri, content.content_type))

    return (retval, expiration_date)


############################################################################################
# Module-level shorthands for terms looked up in ns_rdf and ns_rdfs (both
# defined outside this excerpt).
# NOTE(review): the name 'type' shadows the builtin of the same name at module level.
type = ns_rdf["type"]
Property = ns_rdf["Property"]
Class = ns_rdfs["Class"]
Beispiel #11
0
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_ntriples_vocab % (uri, value))
    elif (
        content.content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml]
        or xml_application_media_type.match(content.content_type) != None
    ):
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options

            options = Options()
            retval = pyRdfa(options).graph_from_source(content.data)
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_rdfa_vocab % (uri, value))
    else:
        options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type))

    return (retval, expiration_date)


############################################################################################
# Module-level shorthands for terms looked up in ns_rdf and ns_rdfs (both
# defined outside this excerpt).
# NOTE(review): the name 'type' shadows the builtin of the same name at module level.
type = ns_rdf["type"]
Property = ns_rdf["Property"]
Class = ns_rdfs["Class"]
subClassOf = ns_rdfs["subClassOf"]
Beispiel #12
0
                output_processor_graph = True
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

# Bundle the settings gathered above into a single Options instance
options = Options(
    output_default_graph=output_default_graph,
    output_processor_graph=output_processor_graph,
    space_preserve=space_preserve,
    vocab_cache_report=vocab_cache_report,
    bypass_vocab_cache=bypass_vocab_cache,
    transformers=extras,
    vocab_expansion=vocab_expansion,
    vocab_cache=vocab_cache,
    hturtle=hturtle,
)

processor = pyRdfa(options, base)
if len(value) < 1:
    # no source was listed: the content is read from the standard input
    retval = processor.rdf_from_source(
        sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
else:
    # one or more sources: all of them are serialized into one graph
    retval = processor.rdf_from_sources(
        value, outputFormat=format, rdfOutput=rdfOutput)
Beispiel #13
0
def return_graph(uri, options, newCache = False) :
	"""Dereference a vocabulary URI and parse the result into an RDFLib Graph.

	The content type of the response decides which parser is used: one of
	RDFLib's parsers (for the Turtle, RDF/XML, and N Triple cases) or a
	separate RDFa processing for (X)HTML and XML content.

	The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then HTML (RDFa), in case content negotiation is used.

	This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.

	@param uri: URI for the graph
	@param options: used as a place where warnings can be sent
	@param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
	@return: a tuple consisting of an RDFLib Graph instance and an expiration date. Both are None if the URI could not be dereferenced; the graph alone is None if the content could not be parsed or its media type is not recognised.
	"""
	def return_to_cache(msg) :
		# 'msg' (the exception value) is accepted but is not part of the
		# warning text; only the URI is reported.
		if newCache :
			options.add_warning(err_unreachable_vocab % uri, warning_type=VocabReferenceError)
		else :
			options.add_warning(err_outdated_cache % uri, warning_type=VocabReferenceError)

	retval = None

	try :
		content = URIOpener(uri,
							{'Accept' : 'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'})
	except (HTTPError, RDFaError, Exception) :
		# All three failure classes are handled the same way: warn and give up.
		return_to_cache(sys.exc_info()[1])
		return (None,None)

	# Store the expiration date of the newly accessed data
	expiration_date = content.expiration_date

	if content.content_type == MediaTypes.turtle :
		try :
			retval = Graph()
			retval.parse(content.data, format="n3")
		except Exception :
			# Do not hand back a partially filled graph as if parsing worked
			retval = None
			options.add_warning(err_unparsable_Turtle_vocab % (uri, sys.exc_info()[1]))
	elif content.content_type == MediaTypes.rdfxml :
		try :
			retval = Graph()
			retval.parse(content.data)
		except Exception :
			retval = None
			# NOTE(review): the Turtle message template is reused for RDF/XML
			# content here; switch to a dedicated template if one exists.
			options.add_warning(err_unparsable_Turtle_vocab % (uri, sys.exc_info()[1]))
	elif content.content_type == MediaTypes.nt :
		try :
			retval = Graph()
			retval.parse(content.data, format="nt")
		except Exception :
			retval = None
			options.add_warning(err_unparsable_ntriples_vocab % (uri, sys.exc_info()[1]))
	elif content.content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml] or xml_application_media_type.match(content.content_type) is not None :
		try :
			from pyRdfa import pyRdfa
			from pyRdfa.options	import Options
			# The nested RDFa run gets its own Options instance. The caller's
			# 'options' must stay bound to this name, otherwise the warning
			# below would be recorded on the throwaway instance and be lost.
			retval = pyRdfa(Options()).graph_from_source(content.data)
		except Exception :
			retval = None
			options.add_warning(err_unparsable_rdfa_vocab % (uri, sys.exc_info()[1]))
	else :
		options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type))

	return (retval, expiration_date)
Beispiel #14
0
class pyRdfa :
	"""Main processing class for the distiller
	
	@ivar options: an instance of the L{Options} class
	@ivar media_type: the preferred default media type, possibly set at initialization
	@ivar base: the base value, possibly set at initialization
	@ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers
	"""
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		@keyword options: Options for the distiller
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the the RDFa source
		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
		"""
		self.http_status = 200
		
		self.base = base
		if base == "" :
			self.required_base = None
		else :
			self.required_base	= base
		self.charset 		= None

		# predefined content type
		self.media_type = media_type

		if options == None :
			self.options = Options()
		else :
			self.options = options

		if media_type != "" :
			self.options.set_host_language(self.media_type)
			
		if rdfa_version is not None :
			self.rdfa_version = rdfa_version
		else :
			self.rdfa_version = None
		
	def _get_input(self, name) :
		"""
		Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
		returning a file-like object. If name is none of these, it returns the input argument (that should
		be, supposedly, a file-like object already).
		
		If the media type has not been set explicitly at initialization of this instance,
		the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
		L{host.preferred_suffixes} for the suffix to media type mapping. 
		
		@param name: identifier of the input source
		@type name: string or a file-like object
		@return: a file like object if opening "name" is possible and successful, "name" otherwise
		"""
		try :
			# Python 2 branch
			isstring = isinstance(name, basestring)
		except :
			# Python 3 branch
			isstring = isinstance(name, str)

		try :
			if isstring :
				# check if this is a URI, ie, if there is a valid 'scheme' part
				# otherwise it is considered to be a simple file
				if urlparse(name)[0] != "" :
					url_request 	  = URIOpener(name)
					self.base 		  = url_request.location
					if self.media_type == "" :
						if url_request.content_type in content_to_host_language :
							self.media_type = url_request.content_type
						else :
							self.media_type = MediaTypes.xml
						self.options.set_host_language(self.media_type)
					self.charset = url_request.charset
					if self.required_base == None :
						self.required_base = name
					return url_request.data
				else :
					self.base = name
					# Creating a File URI for this thing
					if self.required_base == None :
						self.required_base = "file://" + os.path.join(os.getcwd(),name)
					if self.media_type == "" :
						self.media_type = MediaTypes.xml
						# see if the default should be overwritten
						for suffix in preferred_suffixes :
							if name.endswith(suffix) :
								self.media_type = preferred_suffixes[suffix]
								self.charset = 'utf-8'
								break
						self.options.set_host_language(self.media_type)
					return file(name)
			else :
				return name
		except HTTPError :
			raise sys.exc_info()[1]
		except :
			(type, value, traceback) = sys.exc_info()
			raise FailedSource(value)
	
	####################################################################################################################
	# Externally used methods
	#
	def graph_from_DOM(self, dom, graph = None, pgraph = None) :
		"""
		Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other methods get down to this
		one, eventually (e.g., after opening a URI and parsing it into a DOM).
		
		The options decide what is copied into the returned graph: the default graph, the processor graph, both, or neither.
		The processor graph of the options is reset before returning.
		
		@param dom: a DOM Node element, the top level entry node for the whole tree (i.e., the C{dom.documentElement} is used to initiate processing down the node hierarchy)
		@keyword graph: an RDF Graph (if None, then a new one is created)
		@type graph: rdflib Graph instance.
		@keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@type pgraph: rdflib Graph instance
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		"""
		def merge_into(target, source) :
			# the triples first, the namespace bindings afterwards
			for triple in source :
				target.add(triple)
			for prefix, namespace in source.namespaces() :
				target.bind(prefix, namespace)

		# The graph that is handed back to the caller; created on demand
		if graph == None :
			graph = Graph()

		# The 'default graph', as called in the RDFa spec: collects the content
		default_graph = Graph()

		# The top of the DOM tree
		root = dom.documentElement

		# The initial state takes care of things like base, top level
		# namespace settings, etc.
		state = ExecutionContext(root, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

		# External transformations come first, the built-in ones follow
		all_transformers = self.options.transformers + builtInTransformers
		for transform in all_transformers :
			transform(root, self.options, state)

		# The state setting may have detected an explicit version information
		self.rdfa_version = state.rdfa_version

		# The recursion starts at the top element, with no incoming object
		parse_one_node(root, default_graph, None, state, [])

		# The RDFS expansion, if requested, works on the default graph
		if self.options.vocab_expansion :
			from pyRdfa.rdfs.process import process_rdfa_sem
			process_rdfa_sem(default_graph, self.options)

		# What is returned depends on the way the options have been set up
		if self.options.output_default_graph :
			merge_into(graph, default_graph)
		if self.options.output_processor_graph :
			# a separate processor graph, if provided, has priority
			if pgraph != None :
				destination = pgraph
			else :
				destination = graph
			merge_into(destination, self.options.processor_graph.graph)

		# this is necessary if several DOM trees are handled in a row...
		self.options.reset_processor_graph()

		return graph
	
	def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
		"""
		Extract an RDF graph from an RDFa source. The source is opened, parsed into a DOM tree, and the
		tree is handed over to L{pyRdfa.graph_from_DOM}, whose result is returned.
		
		The C{http_status} attribute is updated on failure: 400 if the source cannot be opened (L{FailedSource})
		or cannot be parsed, the HTTP code of an L{HTTPError}, 500 for any other exception, and None if
		an ImportError reaches the outermost handler.
				
		@param name: a URI, a file name, or a file-like object
		@param graph: rdflib Graph instance. If None, a new one is created.
		@param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph. Parsing errors are always returned as error triples, regardless of this flag.
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		@raise FailedSource: if the source cannot be opened and rdfOutput is False
		@raise HTTPError: if opening the source fails with it and rdfOutput is False
		@raise Exception: any other exception is re-raised if rdfOutput is False
		"""
		# Helper for all the error exits: returns 'tog' (a new Graph if it is None),
		# with the triples and namespace bindings of the processor graph added if
		# the options ask for the processor graph. The processor graph is reset
		# in all cases.
		def copyErrors(tog, options) :
			if tog == None :
				tog = Graph()
			if options.output_processor_graph :
				for t in options.processor_graph.graph :
					tog.add(t)
				for k,ns in options.processor_graph.graph.namespaces() :
					tog.bind(k,ns)
			options.reset_processor_graph()
			return tog

		# Separating this for a forward Python 3 compatibility
		try :
			# Python 2 branch
			isstring = isinstance(name, basestring)
		except :
			# Python 3 branch
			isstring = isinstance(name, str)
		
		try :
			# First, open the source... Possible HTTP errors are returned as error triples
			# (or re-raised, if rdfOutput is False)
			input = None
			try :
				input = self._get_input(name)
			except FailedSource :
				# the source could not be opened at all
				f = sys.exc_info()[1]
				self.http_status = 400
				if not rdfOutput : raise f
				err = self.options.add_error(f.msg, FileReferenceError, name)
				self.options.processor_graph.add_http_context(err, 400)
				return copyErrors(graph, self.options)
			except HTTPError :
				# the HTTP code of the error becomes the status of this run
				h = sys.exc_info()[1]
				self.http_status = h.http_code
				if not rdfOutput : raise h
				err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code,h.msg), HTError, name)
				self.options.processor_graph.add_http_context(err, h.http_code)
				return copyErrors(graph, self.options)
			except Exception :
				e = sys.exc_info()[1]
				self.http_status = 500
				# Something nasty happened:-(
				if not rdfOutput : raise e
				err = self.options.add_error(str(e), context = name)
				self.options.processor_graph.add_http_context(err, 500)
				return copyErrors(graph, self.options)

			# Second, parse the source into a DOM tree; the parser depends on the host language
			dom = None
			try :
				msg = ""
				parser = None
				if self.options.host_language == HostLanguage.html5 :
					import warnings
					warnings.filterwarnings("ignore", category=DeprecationWarning)
					import html5lib
					parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
					if self.charset :
						# This means the HTTP header has provided a charset, or the
						# file is a local file when we suppose it to be a utf-8
						dom = parser.parse(input, encoding=self.charset)
					else :
						# No charset set. The HTMLLib parser tries to sniff into the
						# the file to find a meta header for the charset; if that
						# works, fine, otherwise it falls back on window-...
						dom = parser.parse(input)
						
					# The source is read a second time for adjust_html_version: a named
					# source is closed and re-opened, a file-like object is rewound
					try :
						if isstring :
							input.close()
							input = self._get_input(name)
						else :
							input.seek(0)
						from pyRdfa.host import adjust_html_version
						self.rdfa_version = adjust_html_version(input, self.rdfa_version)
					except :
						# if anything goes wrong, it is not really important; rdfa version stays what it was...
						pass
					
				else :
					# in other cases an XML parser has to be used
					from pyRdfa.host import adjust_xhtml_and_version
					parse = xml.dom.minidom.parse
					dom = parse(input)
					# both the host language and the RDFa version may be adjusted based on the parsed DOM
					(adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
					self.options.host_language = adjusted_host_language
					self.rdfa_version          = version
			except ImportError :
				# NOTE(review): this handler also covers an ImportError coming from the
				# pyRdfa.host import of the XML branch, although the message only mentions html5lib
				msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
				raise ImportError(msg)
			except Exception :
				e = sys.exc_info()[1]
				# These are various parsing exception. Per spec, this is a case when
				# error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
				# return page or a graph with error triples) does not apply
				err = self.options.add_error(str(e), context = name)
				self.http_status = 400
				self.options.processor_graph.add_http_context(err, 400)
				return copyErrors(graph, self.options)

			# If we got here, we have a DOM tree to operate on...	
			return self.graph_from_DOM(dom, graph, pgraph)
		except Exception :
			# Something nasty happened during the generation of the graph...
			# (exceptions re-raised by the handlers above end up here, too)
			(a,b,c) = sys.exc_info()
			# report the exception through the interpreter's exception hook before handling it
			sys.excepthook(a,b,c)
			if isinstance(b, ImportError) :
				self.http_status = None
			else :
				self.http_status = 500
			if not rdfOutput : raise b
			err = self.options.add_error(str(b), context = name)
			self.options.processor_graph.add_http_context(err, 500)
			return copyErrors(graph, self.options)
	
	def rdf_from_sources(self, names, outputFormat = "turtle", rdfOutput = False) :
		"""
		Extract an RDF graph from a list of RDFa sources and serialize them in one graph. The sources are parsed, the RDF
		extracted, and serialization is done in the specified format.
		@param names: list of sources, each can be a URI, a file name, or a file-like object
		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
		@type rdfOutput: boolean
		@return: a serialized RDF Graph
		@rtype: string
		"""
		# This is better because it gives access to the various, non-standard serializations
		# If it does not work because the extras are not installed, fall back to the standard
		# rdflib distribution...
		try :
			from pyRdfaExtras import MyGraph
			graph = MyGraph()
		except Exception :
			# any failure of the optional package leads to the fallback; interpreter
			# exits and keyboard interrupts are not swallowed any more
			graph = Graph()

		graph.bind("xsd", Namespace('http://www.w3.org/2001/XMLSchema#'))
		# the value of rdfOutput determines the reaction on exceptions...
		# all the sources are accumulated in the same graph
		for name in names :
			self.graph_from_source(name, graph, rdfOutput)
		return graph.serialize(format=outputFormat)

	def rdf_from_source(self, name, outputFormat = "turtle", rdfOutput = False) :
		"""
		Extract and RDF graph from an RDFa source and serialize it in one graph. The source is parsed, the RDF
		extracted, and serialization is done in the specified format.
		@param name: a URI, a file name, or a file-like object
		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt". "xml", "pretty-xml", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
		@type rdfOutput: boolean
		@return: a serialized RDF Graph
		@rtype: string
		"""
		return self.rdf_from_sources([name], outputFormat, rdfOutput)
Beispiel #15
0
class ExecutionContext :
	"""State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
	the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
	URI references for RDFLib.
	
	@ivar options: reference to the overall options
	@type options: L{Options}
	@ivar base: the 'base' URI
	@ivar parsedBase: the parsed version of base, as produced by urlparse.urlsplit
	@ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
	@ivar lang: language tag (possibly None)
	@ivar term_or_curie: vocabulary management class instance
	@type term_or_curie: L{termorcurie.TermOrCurie}
	@ivar list_mapping: dictionary of arrays, containing a list of URIs key-ed via properties for lists
	@ivar node: the node to which this state belongs
	@type node: DOM node instance
	@ivar rdfa_version: RDFa version of the content
	@type rdfa_version: String
	@ivar supress_lang: in some cases, the effect of the lang attribute should be supressed for the given node, although it should be inherited down below (example: @value attribute of the data element in HTML5)
	@type supress_lang: Boolean
	@cvar _list: list of attributes that allow for lists of values and should be treated as such
	@cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s). Is initialized at first instantiation.
	"""

	# list of attributes that allow for lists of values and should be treated as such	
	_list = [ "rel", "rev", "property", "typeof", "role" ]
	# mapping table from attribute name to the exact method to retrieve the URI(s).
	_resource_type = {}
	
	def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version = None) :
		"""
		Build the processing state that belongs to one DOM node. If C{inherited_state} is given, the RDFa version,
		base, options and list mapping are copied from it; otherwise (the call from the very top of the tree) they are
		set up from the call arguments and from the node itself. After that the parsed base, the term/CURIE handler,
		the language and the default XML namespace are settled, in that order.

		@param node: the current DOM Node
		@param graph: the RDFLib Graph; it is handed over to L{TermOrCurie} and, at the top level only, used to bind the
		"beautifying" prefixes of the host language
		@keyword inherited_state: the state as inherited
		from upper layers. This inherited_state is mixed with the state information
		retrieved from the current node.
		@type inherited_state: L{state.ExecutionContext}
		@keyword base: string denoting the base URI. It is used only at the top level (ie, when there is no inherited state),
		and only if the node itself does not set a base via a C{<base href>} element (XHTML, HTML5, XHTML5) or via
		C{@xml:base} (host languages listed in C{accept_xml_base}). The
		current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
		necessary for SVG (and other possible XML dialects that accept C{@xml:base})
		@keyword options: invocation options, and references to warning graphs. Used only at the top level; if None, a
		default C{Options} instance is created
		@type options: L{Options<pyRdfa.options>}
		@keyword rdfa_version: RDFa version to be used at the top level; if None, the value of C{pyRdfa.rdfa_current_version}
		is taken. A C{@version} attribute on the node containing "RDFa 1.0" or "RDFa 1.1" overrides both.
		"""
		def remove_frag_id(uri) :
			"""
			Return the URI with its fragment identifier removed (the fragment ID for self.base must be removed).
			If anything goes wrong while parsing, the URI is returned unchanged.
			"""
			try :
				# To be on the safe side:-)
				t = urlparse.urlparse(uri)
				# re-assemble the first five components, with an empty fragment
				return urlparse.urlunparse((t[0],t[1],t[2],t[3],t[4],""))
			except :
				return uri
			
		# This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
		# The table is filled only once: the check on its size makes later instantiations skip this step.
		if len(	ExecutionContext._resource_type ) == 0 :	
			ExecutionContext._resource_type = {
				"href"		:	ExecutionContext._URI,
				"src"		:	ExecutionContext._URI,
				"vocab"	    :   ExecutionContext._URI,
			
				"about"		:	ExecutionContext._CURIEorURI, 
				"resource"	:	ExecutionContext._CURIEorURI, 
			
				"rel"		:	ExecutionContext._TERMorCURIEorAbsURI,
				"rev"		:	ExecutionContext._TERMorCURIEorAbsURI,
				"datatype"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"typeof"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"property"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"role"		:	ExecutionContext._TERMorCURIEorAbsURI,
			}	
		#-----------------------------------------------------------------
		self.node = node
		
		#-----------------------------------------------------------------
		# Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
		# case in, say, XHTML...)
		# At the moment, it is invoked with a 'None' at the top level of parsing, that is
		# when the <base> element is looked for (for the HTML cases, that is)
		if inherited_state :
			# version, base and options are simply taken over from the parent
			self.rdfa_version		= inherited_state.rdfa_version
			self.base				= inherited_state.base
			self.options			= inherited_state.options
						
			# the list mapping object is shared with the parent (same reference, not a copy)
			self.list_mapping 		= inherited_state.list_mapping
			self.new_list			= False
			
			# for generic XML versions the xml:base attribute should be handled
			if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
				self.base = remove_frag_id(node.getAttribute("xml:base"))
		else :
			# this is the branch called from the very top			
			self.list_mapping = ListStructure()
			self.new_list	  = True
			
			if rdfa_version is not None :
				self.rdfa_version = rdfa_version
			else :
				# imported here, at run time, rather than at module level
				from pyRdfa import rdfa_current_version				
				self.rdfa_version = rdfa_current_version

			# This value can be overwritten by a @version attribute
			# (values mentioning neither "RDFa 1.0" nor "RDFa 1.1" leave the version untouched)
			if node.hasAttribute("version") :
				top_version = node.getAttribute("version")
				if top_version.find("RDFa 1.0") != -1 :
					self.rdfa_version = "1.0"
				elif top_version.find("RDFa 1.1") != -1 :
					self.rdfa_version = "1.1"						
			
			# this is just to play safe. I believe this should actually not happen...
			if options == None :
				from pyRdfa import Options
				self.options = Options()
			else :
				self.options = options

			self.base = ""
			# handle the base element case for HTML
			if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5  ] :
				for bases in node.getElementsByTagName("base") :
					if bases.hasAttribute("href") :
						self.base = remove_frag_id(bases.getAttribute("href"))
						# NOTE(review): this 'continue' is the last statement of the loop body, so it has no effect;
						# every <base> element with an @href is visited and the last one wins. If the first one
						# was meant to win, this should be a 'break' -- to be confirmed.
						continue
			elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
				self.base = remove_frag_id(node.getAttribute("xml:base"))
				
			# If no local setting for base occurs, the input argument has it
			if self.base == "" :
				self.base = base
				
			# Perform an extra beautification in RDFLib
			# (NOTE: the local variable below is called 'dict' and hides the builtin of the same name within this method)
			if self.options.host_language in beautifying_prefixes :
				dict = beautifying_prefixes[self.options.host_language]
				for key in dict :
					graph.bind(key,dict[key])
					
			# record the final settings as an informational message in the options
			input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
			self.options.add_info(input_info)

								
		#-----------------------------------------------------------------
		# this will be used repeatedly, better store it once and for all...		
		self.parsedBase = urlparse.urlsplit(self.base)

		#-----------------------------------------------------------------
		# generate and store the local CURIE handling class instance
		self.term_or_curie = TermOrCurie(self, graph, inherited_state)

		#-----------------------------------------------------------------
		# Settling the language tags
		# @xml:lang has priority over @lang (see the tests below); an attribute that is present but empty
		# resets the language to None
		# it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
		# first get the inherited state's language, if any
		if inherited_state :
			self.lang = inherited_state.lang
		else :
			self.lang = None
			
		# always starts as False here; this method never sets it to True
		self.supress_lang = False
			
			
		if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] :
			# we may have lang and xml:lang
			# (both values are lower-cased before being stored)
			if node.hasAttribute("lang") :
				lang = node.getAttribute("lang").lower()
			else :
				lang = None
			if node.hasAttribute("xml:lang") :
				xmllang = node.getAttribute("xml:lang").lower()
			else :
				xmllang = None
			# First of all, set the value, if any
			if xmllang != None :
				# this has priority
				if len(xmllang) != 0 :
					self.lang = xmllang
				else :
					self.lang = None
			elif lang != None :
				if len(lang) != 0 :
					self.lang = lang
				else :
					self.lang = None					
			# Ideally, a warning should be generated if lang and xmllang are both present with different values. But
			# the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
			# error situations are simply swallowed...
				
		elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang") :
				# other host languages: only xml:lang is looked at
				self.lang = node.getAttribute("xml:lang").lower()
				if len(self.lang) == 0 : self.lang = None
			
		#-----------------------------------------------------------------
		# Set the default namespace. Used when generating XML Literals
		# Order of preference: the node's own @xmlns, then the inherited value, then None
		if node.hasAttribute("xmlns") :
			self.defaultNS = node.getAttribute("xmlns")
		elif inherited_state and inherited_state.defaultNS != None :
			self.defaultNS = inherited_state.defaultNS
		else :
			self.defaultNS = None
	# end __init__

	def _URI(self, val) :
		"""Turn a 'pure' URI value (ie, not a CURIE) into a URI reference, resolving relative values
		against the base of this state. A warning is added to the options if the resulting URI uses a
		scheme that is not listed in C{pyRdfa.uri_schemes}; this may be the sign of an uninterpreted CURIE.
		@param val: attribute value to be interpreted
		@type val: string
		@return: an RDFLib URIRef instance
		"""
		def make_ref(uri, check = True) :
			"""
			Create the URIRef for a URI string (stripped from surrounding white space). If C{check} is set and
			the scheme is not one of the usual ones, a warning is generated; the URIRef is created nevertheless.
			@param uri: (absolute) URI string
			@return: an RDFLib URIRef instance
			"""
			from pyRdfa	import uri_schemes
			val = uri.strip()
			if check and urlparse.urlsplit(val)[0] not in uri_schemes :
				self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
			return URIRef(val)

		def resolve(base, v, check = True) :
			"""
			Combine a base and a local part via the python library's urljoin. That function may
			swallow a '#' or '?' character at the very end of the local part, which is a problem for
			Semantic Web URI-s; such a character is therefore put back when it got lost.
			@param base: base URI string
			@param v: local part
			@return: an RDFLib URIRef instance
			"""
			# UGLY!!! Corner case for python version <= 2.5.X: plain concatenation is used for query-only references
			if len(v) > 0 and v[0] == '?' and py_v_minor <= 5 :
				return make_ref(base+v, check)

			combined = urlparse.urljoin(base, v)
			try :
				tail = v[-1]
				if tail != combined[-1] and tail in ("#", "?") :
					return make_ref(combined + tail, check)
				return make_ref(combined, check)
			except :
				# whatever went wrong above, fall back on the plain result of urljoin
				return make_ref(combined, check)

		# An empty value stands for the base itself
		if val == "" :
			return URIRef(self.base)

		if self.parsedBase[0] != "" :
			# The base has a scheme: trust the python library (with the correction for
			# the trailing '#' or '?' done in resolve)
			return resolve(self.base, val)

		# The base has no scheme; it is, in fact, a local file name.
		# Splitting the value takes care of the pathological cases when
		# the ':' _does_ appear in the URI but not in a scheme position
		if urlparse.urlsplit(val)[0] == "" :
			# relative URI, to be combined with local file name; no scheme check in this case
			return resolve(self.base, val, check = False)
		return make_ref(val)
	# end _URI

	def _CURIEorURI(self, val) :
		"""Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
		is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
		@param val: attribute value to be interpreted
		@type val: string
		@return: an RDFLib URIRef instance or None
		"""
		if val == "" :
			return URIRef(self.base)

		safe_curie = False
		if val[0] == '[' :
			# If a safe CURIE is asked for, a pure URI is not acceptable.
			# Is checked below, and that is why the safe_curie flag is necessary
			if val[-1] != ']' :
				# that is certainly forbidden: an incomplete safe CURIE
				self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
				return None
			else :
				val = val[1:-1]
				safe_curie = True
		# There is a branch here depending on whether we are in 1.1 or 1.0 mode
		if self.rdfa_version >= "1.1" :
			retval = self.term_or_curie.CURIE_to_URI(val)
			if retval == None :
				# the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
				# The rule says that then the whole value should be considered as a URI
				# except if it was part of a safe CURIE. In that case it should be ignored...
				if safe_curie :
					self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
					return None
				else :
					return self._URI(val)
			else :
				# there is an unlikely case where the retval is actually a URIRef with a relative URI. Better filter that one out
				if isinstance(retval, BNode) == False and urlparse.urlsplit(str(retval))[0] == "" :
					# yep, there is something wrong, a new URIRef has to be created:
					return URIRef(self.base+str(retval))
				else :
					return retval
		else :
			# in 1.0 mode a CURIE can be considered only in case of a safe CURIE
			if safe_curie :
				return self.term_or_curie.CURIE_to_URI(val)
			else :
				return self._URI(val)
	# end _CURIEorURI

	def _TERMorCURIEorAbsURI(self, val) :
		"""Interpret a value as a term, a CURIE or, in RDFa 1.1 mode, an absolute URI. A value matching
		the C{termname} pattern is handled as a term only; any other value is tried as a CURIE first.
		Whenever the value cannot be interpreted, a warning is added to the options and None is returned.
		@param val: attribute value to be interpreted
		@type val: string
		@return: an RDFLib URIRef instance or None
		"""
		from pyRdfa	import uri_schemes
		# This case excludes the pure base, ie, the empty value
		if val == "" :
			return None

		from termorcurie import ncname, termname
		if termname.match(val) :
			# This is a term, must be handled as such...
			term_uri = self.term_or_curie.term_to_URI(val)
			if term_uri :
				return term_uri
			self.options.add_warning(err_undefined_terms % val, UnresolvableTerm, node=self.node.nodeName, buggy_value = val)
			return None

		# Not a term: try a CURIE
		curie_uri = self.term_or_curie.CURIE_to_URI(val)
		if curie_uri :
			return curie_uri

		if self.rdfa_version >= "1.1" :
			# See if it is an absolute URI
			scheme = urlparse.urlsplit(val)[0]
			if scheme == "" :
				# bug; there should be no relative URIs here
				self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix, node=self.node.nodeName)
				return None
			if scheme not in uri_schemes :
				# unusual scheme: warn, but the URIRef is created nevertheless
				self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
			return URIRef(val)

		# rdfa 1.0 case
		self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix, node=self.node.nodeName)
		return None
	# end _TERMorCURIEorAbsURI

	# -----------------------------------------------------------------------------------------------

	def getURI(self, attr) :
		"""Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
		a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
		using the L{ExecutionContext._resource_type} table.
		@param attr: attribute name
		@type attr: string
		@return: an RDFLib URIRef instance (or None) or a list of those
		"""
		if self.node.hasAttribute(attr) :
			val = self.node.getAttribute(attr)
		else :
			if attr in ExecutionContext._list :
				return []
			else :
				return None
		
		# This may raise an exception if the attr has no key. This, actually,
		# should not happen if the code is correct, but it does not harm having it here...
		try :
			func = ExecutionContext._resource_type[attr]
		except :
			# Actually, this should not happen...
			func = ExecutionContext._URI
		
		if attr in ExecutionContext._list :
			# Allows for a list
			resources = [ func(self, v.strip()) for v in val.strip().split() if v != None ]
			retval = [ r for r in resources if r != None ]
		else :
			retval = func(self, val.strip())
		return retval
	# end getURI
	
	def getResource(self, *args) :
		"""Get single resources from several different attributes. The first one that returns a valid URI wins.
		@param args: variable list of attribute names, or a single attribute being a list itself.
		@return: an RDFLib URIRef instance (or None) :
		"""
		if len(args) == 0 :
			return None
		if isinstance(args[0], TupleType) or isinstance(args[0], ListType) :
			rargs = args[0]
		else :
			rargs = args
			
		for resource in rargs :
			uri = self.getURI(resource)
			if uri != None : return uri
		return None
	
	# -----------------------------------------------------------------------------------------------
	def reset_list_mapping(self, origin=None) :
		"""
		Reset the list mapping, ie, replace it by a new, empty L{ListStructure} and flag it as a new list.
		@keyword origin: if set (ie, not None and not otherwise false), it is stored as the origin of the new list structure
		"""
		self.list_mapping = ListStructure()
		self.new_list = True
		if origin :
			self.set_list_origin(origin)

	def list_empty(self) :
		"""
		Checks whether the list is empty.
		@return: Boolean
		"""
		return len(self.list_mapping.mapping) == 0
		
	def get_list_props(self) :
		"""
		Return the list of property values in the list structure
		@return: list of URIRef
		"""
		return self.list_mapping.mapping.keys()
		
	def get_list_value(self,prop) :
		"""
		Return the list of values in the list structure for a specific property
		@return: list of RDF nodes
		"""
		return self.list_mapping.mapping[prop]
		
	def set_list_origin(self, origin) :
		"""
		Set the origin of the list, ie, the subject to attach the final list(s) to
		@param origin: URIRef
		"""		
		self.list_mapping.origin = origin
		
	def get_list_origin(self) :
		"""
		Return the origin of the list, ie, the subject to attach the final list(s) to
		@return: URIRef
		"""		
		return self.list_mapping.origin
		
	def add_to_list_mapping(self, property, resource) :
		"""Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
		if the array does not exist yet, it will be created on the fly.
		
		@param property: the property URI, used as a key in the dictionary
		@param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
		placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not
		an empty list has to be generated.
		"""
		if property in self.list_mapping.mapping :
			if resource != None :
				# indeed, if it is None, than it should not override anything
				if self.list_mapping.mapping[property] == None :
					# replacing a dummy with real content
					self.list_mapping.mapping[property] = [ resource ]
				else :			
					self.list_mapping.mapping[property].append(resource)
		else :
			if resource != None :
				self.list_mapping.mapping[property] = [ resource ]
			else :
				self.list_mapping.mapping[property] = None
Beispiel #16
0
	def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version = None) :
		"""
		Build the processing state that belongs to one DOM node. If C{inherited_state} is given, the RDFa version,
		base, options and list mapping are copied from it; otherwise (the call from the very top of the tree) they are
		set up from the call arguments and from the node itself. After that the parsed base, the term/CURIE handler,
		the language and the default XML namespace are settled, in that order.

		@param node: the current DOM Node
		@param graph: the RDFLib Graph; it is handed over to L{TermOrCurie} and, at the top level only, used to bind the
		"beautifying" prefixes of the host language
		@keyword inherited_state: the state as inherited
		from upper layers. This inherited_state is mixed with the state information
		retrieved from the current node.
		@type inherited_state: L{state.ExecutionContext}
		@keyword base: string denoting the base URI. It is used only at the top level (ie, when there is no inherited state),
		and only if the node itself does not set a base via a C{<base href>} element (XHTML, HTML5, XHTML5) or via
		C{@xml:base} (host languages listed in C{accept_xml_base}). The
		current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
		necessary for SVG (and other possible XML dialects that accept C{@xml:base})
		@keyword options: invocation options, and references to warning graphs. Used only at the top level; if None, a
		default C{Options} instance is created
		@type options: L{Options<pyRdfa.options>}
		@keyword rdfa_version: RDFa version to be used at the top level; if None, the value of C{pyRdfa.rdfa_current_version}
		is taken. A C{@version} attribute on the node containing "RDFa 1.0" or "RDFa 1.1" overrides both.
		"""
		def remove_frag_id(uri) :
			"""
			Return the URI with its fragment identifier removed (the fragment ID for self.base must be removed).
			If anything goes wrong while parsing, the URI is returned unchanged.
			"""
			try :
				# To be on the safe side:-)
				t = urlparse.urlparse(uri)
				# re-assemble the first five components, with an empty fragment
				return urlparse.urlunparse((t[0],t[1],t[2],t[3],t[4],""))
			except :
				return uri
			
		# This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
		# The table is filled only once: the check on its size makes later instantiations skip this step.
		if len(	ExecutionContext._resource_type ) == 0 :	
			ExecutionContext._resource_type = {
				"href"		:	ExecutionContext._URI,
				"src"		:	ExecutionContext._URI,
				"vocab"	    :   ExecutionContext._URI,
			
				"about"		:	ExecutionContext._CURIEorURI, 
				"resource"	:	ExecutionContext._CURIEorURI, 
			
				"rel"		:	ExecutionContext._TERMorCURIEorAbsURI,
				"rev"		:	ExecutionContext._TERMorCURIEorAbsURI,
				"datatype"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"typeof"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"property"	:	ExecutionContext._TERMorCURIEorAbsURI,
				"role"		:	ExecutionContext._TERMorCURIEorAbsURI,
			}	
		#-----------------------------------------------------------------
		self.node = node
		
		#-----------------------------------------------------------------
		# Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
		# case in, say, XHTML...)
		# At the moment, it is invoked with a 'None' at the top level of parsing, that is
		# when the <base> element is looked for (for the HTML cases, that is)
		if inherited_state :
			# version, base and options are simply taken over from the parent
			self.rdfa_version		= inherited_state.rdfa_version
			self.base				= inherited_state.base
			self.options			= inherited_state.options
						
			# the list mapping object is shared with the parent (same reference, not a copy)
			self.list_mapping 		= inherited_state.list_mapping
			self.new_list			= False
			
			# for generic XML versions the xml:base attribute should be handled
			if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
				self.base = remove_frag_id(node.getAttribute("xml:base"))
		else :
			# this is the branch called from the very top			
			self.list_mapping = ListStructure()
			self.new_list	  = True
			
			if rdfa_version is not None :
				self.rdfa_version = rdfa_version
			else :
				# imported here, at run time, rather than at module level
				from pyRdfa import rdfa_current_version				
				self.rdfa_version = rdfa_current_version

			# This value can be overwritten by a @version attribute
			# (values mentioning neither "RDFa 1.0" nor "RDFa 1.1" leave the version untouched)
			if node.hasAttribute("version") :
				top_version = node.getAttribute("version")
				if top_version.find("RDFa 1.0") != -1 :
					self.rdfa_version = "1.0"
				elif top_version.find("RDFa 1.1") != -1 :
					self.rdfa_version = "1.1"						
			
			# this is just to play safe. I believe this should actually not happen...
			if options == None :
				from pyRdfa import Options
				self.options = Options()
			else :
				self.options = options

			self.base = ""
			# handle the base element case for HTML
			if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5  ] :
				for bases in node.getElementsByTagName("base") :
					if bases.hasAttribute("href") :
						self.base = remove_frag_id(bases.getAttribute("href"))
						# NOTE(review): this 'continue' is the last statement of the loop body, so it has no effect;
						# every <base> element with an @href is visited and the last one wins. If the first one
						# was meant to win, this should be a 'break' -- to be confirmed.
						continue
			elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
				self.base = remove_frag_id(node.getAttribute("xml:base"))
				
			# If no local setting for base occurs, the input argument has it
			if self.base == "" :
				self.base = base
				
			# Perform an extra beautification in RDFLib
			# (NOTE: the local variable below is called 'dict' and hides the builtin of the same name within this method)
			if self.options.host_language in beautifying_prefixes :
				dict = beautifying_prefixes[self.options.host_language]
				for key in dict :
					graph.bind(key,dict[key])
					
			# record the final settings as an informational message in the options
			input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
			self.options.add_info(input_info)

								
		#-----------------------------------------------------------------
		# this will be used repeatedly, better store it once and for all...		
		self.parsedBase = urlparse.urlsplit(self.base)

		#-----------------------------------------------------------------
		# generate and store the local CURIE handling class instance
		self.term_or_curie = TermOrCurie(self, graph, inherited_state)

		#-----------------------------------------------------------------
		# Settling the language tags
		# @xml:lang has priority over @lang (see the tests below); an attribute that is present but empty
		# resets the language to None
		# it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
		# first get the inherited state's language, if any
		if inherited_state :
			self.lang = inherited_state.lang
		else :
			self.lang = None
			
		# always starts as False here; this method never sets it to True
		self.supress_lang = False
			
			
		if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] :
			# we may have lang and xml:lang
			# (both values are lower-cased before being stored)
			if node.hasAttribute("lang") :
				lang = node.getAttribute("lang").lower()
			else :
				lang = None
			if node.hasAttribute("xml:lang") :
				xmllang = node.getAttribute("xml:lang").lower()
			else :
				xmllang = None
			# First of all, set the value, if any
			if xmllang != None :
				# this has priority
				if len(xmllang) != 0 :
					self.lang = xmllang
				else :
					self.lang = None
			elif lang != None :
				if len(lang) != 0 :
					self.lang = lang
				else :
					self.lang = None					
			# Ideally, a warning should be generated if lang and xmllang are both present with different values. But
			# the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
			# error situations are simply swallowed...
				
		elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang") :
				# other host languages: only xml:lang is looked at
				self.lang = node.getAttribute("xml:lang").lower()
				if len(self.lang) == 0 : self.lang = None
			
		#-----------------------------------------------------------------
		# Set the default namespace. Used when generating XML Literals
		# Order of preference: the node's own @xmlns, then the inherited value, then None
		if node.hasAttribute("xmlns") :
			self.defaultNS = node.getAttribute("xmlns")
		elif inherited_state and inherited_state.defaultNS != None :
			self.defaultNS = inherited_state.defaultNS
		else :
			self.defaultNS = None
Beispiel #17
0
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

options = Options(output_default_graph=output_default_graph,
                  output_processor_graph=output_processor_graph,
                  space_preserve=space_preserve,
                  transformers=extras,
                  embedded_rdf=embedded_rdf,
                  vocab_expansion=vocab_expansion,
                  vocab_cache=vocab_cache,
                  vocab_cache_report=vocab_cache_report,
                  refresh_vocab_cache=refresh_vocab_cache,
                  check_lite=check_lite,
                  experimental_features=True)

processor = pyRdfa(options, base)
if len(value) >= 1:
    print processor.rdf_from_sources(value,
                                     outputFormat=format,
                                     rdfOutput=rdfOutput)
else:
    print processor.rdf_from_source(sys.stdin,
                                    outputFormat=format,
                                    rdfOutput=rdfOutput)
Beispiel #18
0
                output_processor_graph = True
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

options = Options(output_default_graph=output_default_graph,
                  output_processor_graph=output_processor_graph,
                  space_preserve=space_preserve,
                  transformers=extras,
                  embedded_rdf=embedded_rdf,
                  vocab_expansion=vocab_expansion,
                  vocab_cache=vocab_cache,
                  vocab_cache_report=vocab_cache_report,
                  refresh_vocab_cache=refresh_vocab_cache)

processor = pyRdfa(options, base)
if len(value) >= 1:
    print processor.rdf_from_sources(value,
                                     outputFormat=format,
                                     rdfOutput=rdfOutput)
else:
    print processor.rdf_from_source(sys.stdin,
                                    outputFormat=format,
                                    rdfOutput=rdfOutput)
Beispiel #19
0
class pyRdfa :
	"""Main processing class for the distiller
	
	@ivar options: an instance of the L{Options} class
	@ivar media_type: the preferred default media type, possibly set at initialization
	@ivar base: the base value, possibly set at initialization
	"""
	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
		"""
		@keyword options: Options for the distiller
		@type options: L{Options}
		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the the RDFa source
		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
		"""
		self.base = base
		if base == "" :
			self.required_base = None
		else :
			self.required_base	= base
		self.charset 		= None

		# predefined content type
		self.media_type = media_type

		if options == None :
			self.options = Options()
		else :
			self.options = options

		if media_type != "" :
			self.options.set_host_language(self.media_type)
			
		if rdfa_version is not None :
			self.rdfa_version = rdfa_version
		else :
			self.rdfa_version = None
		
	def _get_input(self, name) :
		"""
		Guess whether "name" is a URI or a file name and open the source accordingly, returning a file-like
		object. If "name" is not a string at all, it is returned unchanged (it is supposed to be a
		file-like object already).

		As a side effect, C{self.base} is set, as well as C{self.required_base} if no base was given at
		initialization. If the media type has not been set explicitly at initialization of this instance,
		the method also sets C{self.media_type} (and forwards it to the options) based on the content type
		reported by L{URIOpener} or on the suffix of the file, with XML as a fallback in both cases. See
		L{utils.preferred_suffixes} for the suffix to media type mapping. C{self.charset} is set to the
		charset reported by L{URIOpener} for a URI, and to utf-8 for a local file whose media type was
		derived from a known suffix.

		@param name: identifier of the input source
		@type name: string or a file-like object
		@return: a file like object if opening "name" is possible and successful, "name" itself if it is not a string
		@raise FailedSource: if any error occurs while the source is identified or opened; the original exception is passed on as the argument
		"""
		try :
			if not isinstance(name, basestring) :
				# Not a string: hand the argument back as is
				return name

			# check if this is a URI, ie, if there is a valid 'scheme' part
			# otherwise it is considered to be a simple file
			if urlparse.urlparse(name)[0] != "" :
				url_request = URIOpener(name)
				self.base   = url_request.location
				if self.media_type == "" :
					if url_request.content_type in content_to_host_language :
						self.media_type = url_request.content_type
					else :
						self.media_type = MediaTypes.xml
					self.options.set_host_language(self.media_type)
				self.charset = url_request.charset
				if self.required_base is None :
					self.required_base = name
				return url_request.data

			# From here on "name" is treated as the name of a local file
			self.base = name
			if self.required_base is None :
				# Creating a File URI for this thing
				self.required_base = "file://" + os.path.join(os.getcwd(),name)
			if self.media_type == "" :
				self.media_type = MediaTypes.xml
				# see if the default should be overwritten
				for suffix in preferred_suffixes :
					if name.endswith(suffix) :
						self.media_type = preferred_suffixes[suffix]
						self.charset = 'utf-8'
						break
				self.options.set_host_language(self.media_type)
			from py3compat import PY3
			if PY3:
				return open(name, 'rb')
			else:
				return open(name, 'r')
		except Exception :
			# Only genuine errors are turned into FailedSource; KeyboardInterrupt and
			# SystemExit are not subclasses of Exception and therefore propagate untouched.
			# The traceback object is deliberately not bound to a local variable.
			raise FailedSource(sys.exc_info()[1])
	
	####################################################################################################################
	# Externally used methods
	#
	def graph_from_DOM(self, dom, graph = None, pgraph = None) :
		"""
		Extract the RDF Graph from a DOM tree. This is where the real meat happens. All other methods get down to this
		one, eventually (eg, after opening a URI and parsing it into a DOM).

		The triples are first collected in a separate, internal 'default graph'; what is copied into the
		returned graph depends on the C{output_default_graph} and C{output_processor_graph} settings of the options.
		The processor graph of the options is reset before returning, and C{self.rdfa_version} is updated
		to the version held by the execution context.

		@param dom: a DOM Node element, the top level entry node for the whole tree (to make it clear, a dom.documentElement is used to initiate processing)
		@keyword graph: an RDF Graph (if None, then a new one is created)
		@type graph: rdflib Graph instance. If None, a new one is created.
		@keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@type pgraph: rdflib Graph instance or None
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		"""
		def copyGraph(tog, fromg) :
			# Copy the triples as well as the namespace bindings
			for t in fromg :
				tog.add(t)
			for k,ns in fromg.namespaces() :
				tog.bind(k,ns)

		# Identity tests are used for None all along: the graph objects are never asked to compare themselves
		if graph is None :
			# Create the RDF Graph, that will contain the return triples...
			graph = Graph()

		# this will collect the content, the 'default graph', as called in the RDFa spec
		default_graph = Graph()

		# get the DOM tree
		topElement = dom.documentElement

		# Perform the built-in and external transformations on the HTML tree.
		for trans in self.options.transformers + builtInTransformers :
			trans(topElement, self.options)

		# Create the initial state. This takes care of things
		# like base, top level namespace settings, etc.
		state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

		# This may have changed if the state setting detected an explicit version information:
		self.rdfa_version = state.rdfa_version

		# this function is the real workhorse
		parse_one_node(topElement, default_graph, None, state, [])

		# If the RDFS expansion has to be made, here is the place...
		if self.options.vocab_expansion :
			from pyRdfa.rdfs.process import process_rdfa_sem
			process_rdfa_sem(default_graph, self.options)

		# What should be returned depends on the way the options have been set up. The two
		# settings are independent of one another: the default graph always goes to the
		# returned graph, the processor graph goes to pgraph if the caller provided one.
		if self.options.output_default_graph :
			copyGraph(graph, default_graph)
		if self.options.output_processor_graph :
			if pgraph is not None :
				copyGraph(pgraph, self.options.processor_graph.graph)
			else :
				copyGraph(graph, self.options.processor_graph.graph)

		# this is necessary if several DOM trees are handled in a row...
		self.options.reset_processor_graph()

		return graph
	
	def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
		"""
		Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
		returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

		An html5lib parser is used if the host language of the options is HTML, the minidom XML parser otherwise.

		If the source cannot be opened and C{rdfOutput} is True, the error is added to the processor graph.
		For any other exception the traceback is reported through C{sys.excepthook}, whatever the value of
		C{rdfOutput} is. In both cases the content of the processor graph is copied into the returned graph
		if the options require the processor graph to be output (note that C{pgraph} is not used on these error paths).

		@param name: a URI, a file name, or a file-like object
		@param graph: rdflib Graph instance. If None, a new one is created.
		@param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
		@param rdfOutput: whether a failure to open the source should be turned into RDF and returned as part of the processor graph, instead of being raised
		@return: an RDF Graph
		@rtype: rdflib Graph instance
		@raise FailedSource: if the source cannot be opened and C{rdfOutput} is False
		"""
		def copyErrors(tog, options) :
			# Copy the processor graph (if it is to be output) into tog, and reset it for the next run
			if tog is None :
				tog = Graph()
			if options.output_processor_graph :
				for t in options.processor_graph.graph :
					tog.add(t)
				for k,ns in options.processor_graph.graph.namespaces() :
					tog.bind(k,ns)
			options.reset_processor_graph()
			return tog

		# The except clauses below do not bind the exception to a name; the instance is retrieved
		# through sys.exc_info() instead, which is accepted by Python 2 and Python 3 alike
		try :
			# First, open the source...
			source = self._get_input(name)
			if self.options.host_language == HostLanguage.html :
				import warnings
				warnings.filterwarnings("ignore", category=DeprecationWarning)
				import html5lib
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
				if self.charset :
					# This means the HTTP header has provided a charset, or the
					# file is a local file when we suppose it to be a utf-8
					dom = parser.parse(source, encoding=self.charset)
				else :
					# No charset set. The parser is left on its own to find out
					# the encoding of the content
					dom = parser.parse(source)
			else :
				# in other cases an XML parser has to be used
				dom = xml.dom.minidom.parse(source)
			return self.graph_from_DOM(dom, graph, pgraph)
		except FailedSource :
			if not rdfOutput :
				# bare raise: the very same exception goes on, with its original traceback
				raise
			failure = sys.exc_info()[1]
			self.options.add_error(failure.msg, FileReferenceError, name)
			return copyErrors(graph, self.options)
		except Exception :
			# Report the traceback but do not raise. Note that no error triple is added here.
			(a,b,c) = sys.exc_info()
			sys.excepthook(a,b,c)
			return copyErrors(graph, self.options)