def __init__(self, options=None, base="", media_type="", rdfa_version=None):
    """
    Initialise the distiller state: base URI, media type, options and RDFa version.

    @keyword options: Options for the distiller; a fresh L{Options} instance is created when omitted
    @type options: L{Options}
    @keyword base: URI for the default "base" value (usually the URI of the file to be processed)
    @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
    @keyword rdfa_version: the RDFa version that should be used. It is stored unchanged (possibly C{None});
    according to the original documentation the global L{rdfa_current_version} is used when it is not set
    """
    self.http_status = 200
    self.base = base
    # An empty base means that the caller did not impose one
    self.required_base = base if base != "" else None
    self.charset = None
    # predefined content type
    self.media_type = media_type
    # Identity test on purpose: an options object with a custom __eq__ must not be taken for "missing"
    self.options = Options() if options is None else options
    if media_type != "":
        self.options.set_host_language(self.media_type)
    # None is a legal value here, so a plain assignment covers both cases
    self.rdfa_version = rdfa_version
def convert(self):
    """
    Top level entry to convert and generate all the triples.

    It finds the top level items and generates triples for each of them. If vocabularies were used,
    a vocabulary expansion step is attempted afterwards on a best-effort basis: the basic expansion
    is always run, the follow-your-nose inclusion of vocabularies only if C{self.vocab_expansion} is set.

    NOTE(review): an earlier version also linked the items from the base through an RDF list
    (C{generate_RDF_collection}); that code was commented out, so no such list is produced.
    """
    # The values returned by generate_triples were only needed by the disabled RDF list code
    for top_level_item in self.get_top_level_items():
        self.generate_triples(top_level_item, Evaluation_Context())

    if not self.vocabularies_used:
        return

    try:
        try:
            from ..pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
            from ..pyRdfa.options import Options
        except (ImportError, ValueError, SystemError):
            # A relative import that cannot be resolved is reported with different exception
            # types depending on the Python version; fall back on the absolute package name
            from pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
            from pyRdfa.options import Options

        if self.vocab_expansion:
            # This is the full deal
            options = Options(vocab_expansion=self.vocab_expansion, vocab_cache=self.vocab_cache)
            process_rdfa_sem(self.graph, options)
        else:
            MiniOWL(self.graph).closure()
    except Exception:
        # Deliberately best effort: if pyRdfa cannot be imported, or the expansion fails, the
        # triples generated above are still valid. Only regular errors are swallowed, so that
        # KeyboardInterrupt and SystemExit keep propagating.
        pass
def extrair_rdfa(url):
    """
    Extract the RDFa content of a page and return it as an RDFLib graph.

    The page is distilled by pyRdfa (with embedded RDF enabled) into an RDF/XML serialization,
    which is then parsed back into a fresh C{Graph}.

    @param url: location of the RDFa source, handed over unchanged to C{rdf_from_source}
    @return: an RDFLib C{Graph} holding the extracted triples
    """
    options = Options(embedded_rdf=True)
    serialized = pyRdfa(options=options).rdf_from_source(url, outputFormat='pretty-xml')
    # Depending on the library versions the serialization may come back as text rather than
    # bytes (assumption, to be confirmed against the installed pyRdfa); BytesIO needs bytes
    if not isinstance(serialized, bytes):
        serialized = serialized.encode("utf-8")
    g = Graph()
    # State the format explicitly: nothing in an in-memory stream lets RDFLib guess it, and
    # the fallback format is not the same in all RDFLib versions
    g.parse(io.BytesIO(serialized), format="xml")
    return g
def check_term(conn, term, predicates):
    """
    Build the tree page of an OBI term, extract its RDFa and compare it with the stored fixture.

    :param conn: database connection; only its ``cursor()`` method is used here
    :param term: term identifier, passed to ``build_tree`` and used in the fixture file name
    :param predicates: predicate IDs passed to ``build_tree``; when truthy, the
        ``-predicates`` variant of the fixture is used
    """
    cursor = conn.cursor()
    markup = gizmos.tree.build_tree(cursor, "obi", term, predicate_ids=predicates)

    # Turn the HTML into a DOM tree and take its root element
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=dom_builder).parse(markup)
    root = document.documentElement

    # Initial processing state (from pyRdfa)
    actual = Graph()
    rdfa_settings = {
        "output_default_graph": True,
        "output_processor_graph": True,
        "space_preserve": True,
        "transformers": [],
        "embedded_rdf": True,
        "vocab_expansion": False,
        "vocab_cache": True,
        "vocab_cache_report": False,
        "refresh_vocab_cache": False,
        "check_lite": False,
        "experimental_features": True,
    }
    state = ExecutionContext(
        root,
        actual,
        base="http://purl.obolibrary.org/obo/",
        options=Options(**rdfa_settings),
        rdfa_version="1.1",
    )

    # Recursively add the RDFa of the page to the RDFLib graph
    parse_one_node(root, actual, None, state, [])

    expected = Graph()
    fixture = (
        f"tests/resources/obi-tree-{term}-predicates.ttl"
        if predicates
        else f"tests/resources/obi-tree-{term}.ttl"
    )
    expected.parse(fixture, format="turtle")

    compare_graphs(actual, expected)
def parse(self):
    """
    Run the RDFa processor on ``self.uri``.

    The triples are stored in ``self.default_graph`` and ``self.processor_graph``, and the
    processor instance is kept in ``self.processor`` (used for the error messages).
    """
    if self.rdfa_lite:
        # Imported only on demand: the pruning transformer is needed for RDFa Lite only
        from pyRdfa.transform.lite import lite_prune
        pruning = [lite_prune]
    else:
        pruning = []

    distiller = pyRdfa(
        options=Options(
            output_default_graph=True,
            output_processor_graph=True,
            transformers=pruning,
            vocab_expansion=self.vocab_expansion,
            embedded_rdf=self.embedded_rdf,
            add_informational_messages=True,
        ),
        base=self.base,
        media_type=self.media_type,
    )
    distiller.graph_from_source(
        self.uri,
        graph=self.default_graph,
        pgraph=self.processor_graph,
        rdfOutput=True,
    )

    # Kept for extracting some parameters for the error messages
    self.processor = distiller
class ExecutionContext:
    """State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
    the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
    URI references for RDFLib.

    @ivar options: reference to the overall options
    @type options: L{Options}
    @ivar base: the 'base' URI
    @ivar parsedBase: the parsed version of base, as produced by urlsplit
    @ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
    @ivar lang: language tag (possibly None)
    @ivar term_or_curie: vocabulary management class instance
    @type term_or_curie: L{termorcurie.TermOrCurie}
    @ivar list_mapping: dictionary of arrays, containing a list of URIs key-ed via properties for lists
    @ivar new_list: whether the list structure was created for this state (True) or inherited (False)
    @ivar node: the node to which this state belongs
    @type node: DOM node instance
    @ivar rdfa_version: RDFa version of the content
    @type rdfa_version: String
    @ivar supress_lang: in some cases, the effect of the lang attribute should be suppressed for the given node,
    although it should be inherited down below (example: @value attribute of the data element in HTML5)
    @type supress_lang: Boolean
    @cvar _list: list of attributes that allow for lists of values and should be treated as such
    @cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s).
    Is initialized at first instantiation.
    """

    # list of attributes that allow for lists of values and should be treated as such
    _list = ["rel", "rev", "property", "typeof", "role"]
    # mapping table from attribute name to the exact method to retrieve the URI(s).
    _resource_type = {}

    def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version=None):
        """
        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited from upper layers. This inherited_state is mixed
        with the state information retrieved from the current node.
        @type inherited_state: L{state.ExecutionContext}
        @keyword base: string denoting the base URI for the specific node. This overrides the possible
        base inherited from the upper layers. The current XHTML+RDFa syntax does not allow the usage
        of C{@xml:base}, but SVG1.2 does, so this is necessary for SVG (and other possible XML dialects
        that accept C{@xml:base})
        @keyword options: invocation options, and references to warning graphs
        @type options: L{Options<pyRdfa.options>}
        @keyword rdfa_version: RDFa version to be used at the top level; if C{None}, the package-wide
        C{rdfa_current_version} is used. A C{@version} attribute on the node may override it.
        """
        def remove_frag_id(uri):
            """
            The fragment ID for self.base must be removed
            """
            try:
                # To be on the safe side:-)
                t = urlparse(uri)
                return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
            except Exception:
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time,
        # otherwise import errors show up
        if len(ExecutionContext._resource_type) == 0:
            ExecutionContext._resource_type = {
                "href": ExecutionContext._URI,
                "src": ExecutionContext._URI,
                "vocab": ExecutionContext._URI,
                "about": ExecutionContext._CURIEorURI,
                "resource": ExecutionContext._CURIEorURI,
                "rel": ExecutionContext._TERMorCURIEorAbsURI,
                "rev": ExecutionContext._TERMorCURIEorAbsURI,
                "datatype": ExecutionContext._TERMorCURIEorAbsURI,
                "typeof": ExecutionContext._TERMorCURIEorAbsURI,
                "property": ExecutionContext._TERMorCURIEorAbsURI,
                "role": ExecutionContext._TERMorCURIEorAbsURI,
            }

        #-----------------------------------------------------------------
        self.node = node

        #-----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for (for the HTML cases, that is)
        if inherited_state:
            self.rdfa_version = inherited_state.rdfa_version
            self.base = inherited_state.base
            self.options = inherited_state.options

            self.list_mapping = inherited_state.list_mapping
            self.new_list = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else:
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list = True

            if rdfa_version is not None:
                self.rdfa_version = rdfa_version
            else:
                from pyRdfa import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version"):
                top_version = node.getAttribute("version")
                if "RDFa 1.0" in top_version or "RDFa1.0" in top_version:
                    self.rdfa_version = "1.0"
                elif "RDFa 1.1" in top_version or "RDFa1.1" in top_version:
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options is None:
                from pyRdfa import Options
                self.options = Options()
            else:
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in [HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5]:
                # NOTE(review): every <base href> overwrites the previous one, ie, the last one wins
                # (the original code had a no-op 'continue' here, not a 'break'); confirm that this
                # is the intended behaviour
                for bases in node.getElementsByTagName("base"):
                    if bases.hasAttribute("href"):
                        self.base = remove_frag_id(bases.getAttribute("href"))
            elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "":
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes:
                prefixes = beautifying_prefixes[self.options.host_language]
                for key in prefixes:
                    graph.bind(key, prefixes[key])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
                self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        #-----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlsplit(self.base)

        #-----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        #-----------------------------------------------------------------
        # Settling the language tags
        # @xml:lang has priority over @lang
        # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
        # first get the inherited state's language, if any
        if inherited_state:
            self.lang = inherited_state.lang
        else:
            self.lang = None

        self.supress_lang = False

        if self.options.host_language in [HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5]:
            # we may have lang and xml:lang
            if node.hasAttribute("lang"):
                lang = node.getAttribute("lang").lower()
            else:
                lang = None
            if node.hasAttribute("xml:lang"):
                xmllang = node.getAttribute("xml:lang").lower()
            else:
                xmllang = None

            # First of all, set the value, if any; an empty attribute value resets the language
            if xmllang is not None:
                # this has priority
                self.lang = xmllang if len(xmllang) != 0 else None
            elif lang is not None:
                self.lang = lang if len(lang) != 0 else None
            # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
            # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
            # error situations are simply swallowed...
        elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang"):
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0:
                self.lang = None

        #-----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None
    # end __init__

    def _URI(self, val):
        """Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also
        checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an
        uninterpreted CURIE...
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance
        """
        def create_URIRef(uri, check=True):
            """
            Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created. In case
            there is something unusual, a warning is generated (though the URIRef is created nevertheless)
            @param uri: (absolute) URI string
            @param check: whether the scheme should be checked at all
            @return: an RDFLib URIRef instance
            """
            from pyRdfa import uri_schemes
            cleaned = uri.strip()
            if check and urlsplit(cleaned)[0] not in uri_schemes:
                self.options.add_warning(err_URI_scheme % cleaned, node=self.node.nodeName)
            return URIRef(cleaned)

        def join(base, v, check=True):
            """
            Mini helping function: it makes a urljoin for the paths. Based on the python library, but
            that one has a bug: in some cases it swallows the '#' or '?' character at the end. This is clearly
            a problem with Semantic Web URI-s, so this is checked, too
            @param base: base URI string
            @param v: local part
            @param check: whether the URI should be checked against the list of 'existing' URI schemes
            @return: an RDFLib URIRef instance
            """
            # UGLY!!! There is a bug for a corner case in python version <= 2.5.X
            if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5):
                return create_URIRef(base + v, check)

            joined = urljoin(base, v)
            try:
                swallowed = v[-1] != joined[-1] and v[-1] in ("#", "?")
            except IndexError:
                # one of the strings is empty, ie, there is no trailing character to restore
                swallowed = False
            if swallowed:
                return create_URIRef(joined + v[-1], check)
            return create_URIRef(joined, check)

        if val == "":
            # The fragment ID must be removed...
            return URIRef(self.base)

        # fall back on good old traditional URI-s.
        # To be on the safe side, let us use the Python libraries
        if self.parsedBase[0] == "":
            # base is, in fact, a local file name
            # The following call is just to be sure that some pathological cases when
            # the ':' _does_ appear in the URI but not in a scheme position is taken
            # care of properly...
            key = urlsplit(val)[0]
            if key == "":
                # relative URI, to be combined with local file name:
                return join(self.base, val, check=False)
            return create_URIRef(val)

        # Trust the python library...
        # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
        # swallows the '#' or '?' character at the end. This is clearly a problem with
        # Semantic Web URI-s
        return join(self.base, val)
    # end _URI

    def _CURIEorURI(self, val):
        """Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
        is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        if val == "":
            return URIRef(self.base)

        safe_curie = False
        if val[0] == '[':
            # If a safe CURIE is asked for, a pure URI is not acceptable.
            # Is checked below, and that is why the safe_curie flag is necessary
            if val[-1] != ']':
                # that is certainly forbidden: an incomplete safe CURIE
                self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix,
                                         node=self.node.nodeName)
                return None
            val = val[1:-1]
            safe_curie = True

        # There is a branch here depending on whether we are in 1.1 or 1.0 mode
        if self.rdfa_version >= "1.1":
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval is None:
                # the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
                # The rule says that then the whole value should be considered as a URI
                # except if it was part of a safe CURIE. In that case it should be ignored...
                if safe_curie:
                    self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix,
                                             node=self.node.nodeName)
                    return None
                return self._URI(val)

            # there is an unlikely case where the retval is actually a URIRef with a relative URI.
            # Better filter that one out
            if not isinstance(retval, BNode) and urlsplit(str(retval))[0] == "":
                # yep, there is something wrong, a new URIRef has to be created:
                return URIRef(self.base + str(retval))
            return retval

        # in 1.0 mode a CURIE can be considered only in case of a safe CURIE
        if safe_curie:
            return self.term_or_curie.CURIE_to_URI(val)
        return self._URI(val)
    # end _CURIEorURI

    def _TERMorCURIEorAbsURI(self, val):
        """Returns a URI either for a term or for a CURIE. The value must be an NCNAME to be handled as a term; otherwise
        the method falls back on a CURIE or an absolute URI.
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        from pyRdfa import uri_schemes
        # This case excludes the pure base, ie, the empty value
        if val == "":
            return None

        from pyRdfa.termorcurie import termname
        if termname.match(val):
            # This is a term, must be handled as such...
            retval = self.term_or_curie.term_to_URI(val)
            if not retval:
                self.options.add_warning(err_undefined_terms % val, UnresolvableTerm,
                                         node=self.node.nodeName, buggy_value=val)
                return None
            return retval

        # try a CURIE
        retval = self.term_or_curie.CURIE_to_URI(val)
        if retval:
            return retval

        if self.rdfa_version >= "1.1":
            # See if it is an absolute URI
            scheme = urlsplit(val)[0]
            if scheme == "":
                # bug; there should be no relative URIs here
                self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix,
                                         node=self.node.nodeName)
                return None
            if scheme not in uri_schemes:
                self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
            return URIRef(val)

        # rdfa 1.0 case
        self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix,
                                 node=self.node.nodeName)
        return None
    # end _TERMorCURIEorAbsURI

    # -----------------------------------------------------------------------------------------------

    def getURI(self, attr):
        """Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
        a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
        using the L{ExecutionContext._resource_type} table.
        @param attr: attribute name
        @type attr: string
        @return: an RDFLib URIRef instance (or None) or a list of those
        """
        multi_valued = attr in ExecutionContext._list

        if not self.node.hasAttribute(attr):
            return [] if multi_valued else None
        val = self.node.getAttribute(attr)

        # An attribute without an entry in the table should not happen if the code is correct,
        # but it does not harm to fall back on a pure URI...
        func = ExecutionContext._resource_type.get(attr, ExecutionContext._URI)

        if multi_valued:
            # Allows for a list; split() without argument already strips the individual tokens
            resources = [func(self, token) for token in val.split()]
            return [r for r in resources if r is not None]
        return func(self, val.strip())
    # end getURI

    def getResource(self, *args):
        """Get single resources from several different attributes. The first one that returns a valid URI wins.
        @param args: variable list of attribute names, or a single attribute being a list itself.
        @return: an RDFLib URIRef instance (or None)
        """
        if len(args) == 0:
            return None
        if isinstance(args[0], (tuple, list)):
            rargs = args[0]
        else:
            rargs = args

        for resource in rargs:
            uri = self.getURI(resource)
            if uri is not None:
                return uri
        return None

    # -----------------------------------------------------------------------------------------------

    def reset_list_mapping(self, origin=None):
        """
        Reset, ie, create a new empty dictionary for the list mapping.
        @keyword origin: if set, it becomes the origin of the new list structure
        """
        self.list_mapping = ListStructure()
        if origin:
            self.set_list_origin(origin)
        self.new_list = True

    def list_empty(self):
        """
        Checks whether the list is empty.
        @return: Boolean
        """
        return len(self.list_mapping.mapping) == 0

    def get_list_props(self):
        """
        Return the list of property values in the list structure
        @return: list of URIRef
        """
        return list(self.list_mapping.mapping.keys())

    def get_list_value(self, prop):
        """
        Return the list of values in the list structure for a specific property
        @return: list of RDF nodes
        """
        return self.list_mapping.mapping[prop]

    def set_list_origin(self, origin):
        """
        Set the origin of the list, ie, the subject to attach the final list(s) to
        @param origin: URIRef
        """
        self.list_mapping.origin = origin

    def get_list_origin(self):
        """
        Return the origin of the list, ie, the subject to attach the final list(s) to
        @return: URIRef
        """
        return self.list_mapping.origin

    def add_to_list_mapping(self, property, resource):
        """Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
        if the array does not exist yet, it will be created on the fly.

        @param property: the property URI, used as a key in the dictionary
        @param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
        placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not
        an empty list has to be generated.
        """
        mapping = self.list_mapping.mapping
        if property in mapping:
            if resource is not None:
                # indeed, if it is None, then it should not override anything
                if mapping[property] is None:
                    # replacing a dummy with real content
                    mapping[property] = [resource]
                else:
                    mapping[property].append(resource)
        elif resource is not None:
            mapping[property] = [resource]
        else:
            # dummy placeholder, see above
            mapping[property] = None
def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version=None):
    """
    @param node: the current DOM Node
    @param graph: the RDFLib Graph
    @keyword inherited_state: the state as inherited from upper layers. This inherited_state is mixed
    with the state information retrieved from the current node.
    @type inherited_state: L{state.ExecutionContext}
    @keyword base: string denoting the base URI for the specific node. This overrides the possible
    base inherited from the upper layers. The current XHTML+RDFa syntax does not allow the usage
    of C{@xml:base}, but SVG1.2 does, so this is necessary for SVG (and other possible XML dialects
    that accept C{@xml:base})
    @keyword options: invocation options, and references to warning graphs
    @type options: L{Options<pyRdfa.options>}
    @keyword rdfa_version: RDFa version to be used at the top level; if C{None}, the package-wide
    C{rdfa_current_version} is used. A C{@version} attribute on the node may override it.
    """
    def remove_frag_id(uri):
        """
        The fragment ID for self.base must be removed
        """
        try:
            # To be on the safe side:-)
            t = urlparse(uri)
            return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
        except Exception:
            return uri

    # This is, conceptually, an additional class initialization, but it must be done run time,
    # otherwise import errors show up
    if len(ExecutionContext._resource_type) == 0:
        ExecutionContext._resource_type = {
            "href": ExecutionContext._URI,
            "src": ExecutionContext._URI,
            "vocab": ExecutionContext._URI,
            "about": ExecutionContext._CURIEorURI,
            "resource": ExecutionContext._CURIEorURI,
            "rel": ExecutionContext._TERMorCURIEorAbsURI,
            "rev": ExecutionContext._TERMorCURIEorAbsURI,
            "datatype": ExecutionContext._TERMorCURIEorAbsURI,
            "typeof": ExecutionContext._TERMorCURIEorAbsURI,
            "property": ExecutionContext._TERMorCURIEorAbsURI,
            "role": ExecutionContext._TERMorCURIEorAbsURI,
        }

    #-----------------------------------------------------------------
    self.node = node

    #-----------------------------------------------------------------
    # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
    # case in, say, XHTML...)
    # At the moment, it is invoked with a 'None' at the top level of parsing, that is
    # when the <base> element is looked for (for the HTML cases, that is)
    if inherited_state:
        self.rdfa_version = inherited_state.rdfa_version
        self.base = inherited_state.base
        self.options = inherited_state.options

        self.list_mapping = inherited_state.list_mapping
        self.new_list = False

        # for generic XML versions the xml:base attribute should be handled
        if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))
    else:
        # this is the branch called from the very top
        self.list_mapping = ListStructure()
        self.new_list = True

        if rdfa_version is not None:
            self.rdfa_version = rdfa_version
        else:
            from pyRdfa import rdfa_current_version
            self.rdfa_version = rdfa_current_version

        # This value can be overwritten by a @version attribute
        if node.hasAttribute("version"):
            top_version = node.getAttribute("version")
            if "RDFa 1.0" in top_version or "RDFa1.0" in top_version:
                self.rdfa_version = "1.0"
            elif "RDFa 1.1" in top_version or "RDFa1.1" in top_version:
                self.rdfa_version = "1.1"

        # this is just to play safe. I believe this should actually not happen...
        if options is None:
            from pyRdfa import Options
            self.options = Options()
        else:
            self.options = options

        self.base = ""
        # handle the base element case for HTML
        if self.options.host_language in [HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5]:
            # NOTE(review): every <base href> overwrites the previous one, ie, the last one wins
            # (the original code had a no-op 'continue' here, not a 'break'); confirm that this
            # is the intended behaviour
            for bases in node.getElementsByTagName("base"):
                if bases.hasAttribute("href"):
                    self.base = remove_frag_id(bases.getAttribute("href"))
        elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))

        # If no local setting for base occurs, the input argument has it
        if self.base == "":
            self.base = base

        # Perform an extra beautification in RDFLib
        if self.options.host_language in beautifying_prefixes:
            prefixes = beautifying_prefixes[self.options.host_language]
            for key in prefixes:
                graph.bind(key, prefixes[key])

        input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
            self.options.host_language, self.rdfa_version, self.base)
        self.options.add_info(input_info)

    #-----------------------------------------------------------------
    # this will be used repeatedly, better store it once and for all...
    self.parsedBase = urlsplit(self.base)

    #-----------------------------------------------------------------
    # generate and store the local CURIE handling class instance
    self.term_or_curie = TermOrCurie(self, graph, inherited_state)

    #-----------------------------------------------------------------
    # Settling the language tags
    # @xml:lang has priority over @lang
    # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
    # first get the inherited state's language, if any
    if inherited_state:
        self.lang = inherited_state.lang
    else:
        self.lang = None

    self.supress_lang = False

    if self.options.host_language in [HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5]:
        # we may have lang and xml:lang
        if node.hasAttribute("lang"):
            lang = node.getAttribute("lang").lower()
        else:
            lang = None
        if node.hasAttribute("xml:lang"):
            xmllang = node.getAttribute("xml:lang").lower()
        else:
            xmllang = None

        # First of all, set the value, if any; an empty attribute value resets the language
        if xmllang is not None:
            # this has priority
            self.lang = xmllang if len(xmllang) != 0 else None
        elif lang is not None:
            self.lang = lang if len(lang) != 0 else None
        # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
        # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
        # error situations are simply swallowed...
    elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang"):
        self.lang = node.getAttribute("xml:lang").lower()
        if len(self.lang) == 0:
            self.lang = None

    #-----------------------------------------------------------------
    # Set the default namespace. Used when generating XML Literals
    if node.hasAttribute("xmlns"):
        self.defaultNS = node.getAttribute("xmlns")
    elif inherited_state and inherited_state.defaultNS is not None:
        self.defaultNS = inherited_state.defaultNS
    else:
        self.defaultNS = None
def return_graph(uri, options, newCache=False):
    """Parse a file, and return an RDFLib Graph. The URI's content type is checked and either one of
    RDFLib's parsers is invoked (for the Turtle, RDF/XML, and N Triple cases) or a separate RDFa processing is invoked
    on the RDFa content.

    The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then HTML (RDFa), in case
    content negotiation is used.

    This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.

    @param uri: URI for the graph
    @param options: used as a place where warnings can be sent
    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
    @return: A tuple consisting of an RDFLib Graph instance and an expiration date. The tuple is C{(None, None)} if
    the dereferencing was unsuccessful; the graph is C{None} if the media type was not recognised.
    """
    def return_to_cache(msg):
        # NOTE(review): msg (the exception that was caught) is not part of the warning text
        if newCache:
            options.add_warning(err_unreachable_vocab % uri, warning_type=VocabReferenceError)
        else:
            options.add_warning(err_outdated_cache % uri, warning_type=VocabReferenceError)

    retval = None
    try:
        content = URIOpener(
            uri,
            {'Accept': 'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'})
    except (HTTPError, RDFaError, Exception) as exc:
        # All failures of the dereferencing are handled the same way
        return_to_cache(exc)
        return (None, None)

    # Store the expiration date of the newly accessed data
    expiration_date = content.expiration_date

    # NOTE(review): in the three RDFLib branches the Graph is assigned before parsing, ie, a failed
    # parse returns the (possibly partially filled) graph and not None; confirm that callers expect this
    if content.content_type == MediaTypes.turtle:
        try:
            retval = Graph()
            retval.parse(content.data, format="n3")
        except Exception as exc:
            options.add_warning(err_unparsable_Turtle_vocab % (uri, exc))
    elif content.content_type == MediaTypes.rdfxml:
        try:
            retval = Graph()
            retval.parse(content.data)
        except Exception as exc:
            # NOTE(review): the Turtle message is used for an RDF/XML failure; looks like a
            # copy-paste, a dedicated message is not visible from here
            options.add_warning(err_unparsable_Turtle_vocab % (uri, exc))
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except Exception as exc:
            options.add_warning(err_unparsable_ntriples_vocab % (uri, exc))
    elif content.content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml] \
            or xml_application_media_type.match(content.content_type) is not None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            # A separate name is essential: rebinding 'options' would send the warning below to
            # this throw-away object instead of the caller's options
            rdfa_options = Options()
            retval = pyRdfa(rdfa_options).graph_from_source(content.data)
        except Exception as exc:
            options.add_warning(err_unparsable_rdfa_vocab % (uri, exc))
    else:
        options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type))

    return (retval, expiration_date)
class pyRdfa : """Main processing class for the distiller @ivar options: an instance of the L{Options} class @ivar media_type: the preferred default media type, possibly set at initialization @ivar base: the base value, possibly set at initialization @ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers """ def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) : """ @keyword options: Options for the distiller @type options: L{Options} @keyword base: URI for the default "base" value (usually the URI of the file to be processed) @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the the RDFa source @keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used """ self.http_status = 200 self.base = base if base == "" : self.required_base = None else : self.required_base = base self.charset = None # predefined content type self.media_type = media_type if options == None : self.options = Options() else : self.options = options if media_type != "" : self.options.set_host_language(self.media_type) if rdfa_version is not None : self.rdfa_version = rdfa_version else : self.rdfa_version = None def _get_input(self, name) : """ Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly, returning a file-like object. If name is none of these, it returns the input argument (that should be, supposedly, a file-like object already). If the media type has not been set explicitly at initialization of this instance, the method also sets the media_type based on the HTTP GET response or the suffix of the file. See L{host.preferred_suffixes} for the suffix to media type mapping. 
@param name: identifier of the input source @type name: string or a file-like object @return: a file like object if opening "name" is possible and successful, "name" otherwise """ try : if isinstance(name, basestring) : # check if this is a URI, ie, if there is a valid 'scheme' part # otherwise it is considered to be a simple file if urlparse.urlparse(name)[0] != "" : url_request = URIOpener(name) self.base = url_request.location if self.media_type == "" : if url_request.content_type in content_to_host_language : self.media_type = url_request.content_type else : self.media_type = MediaTypes.xml self.options.set_host_language(self.media_type) self.charset = url_request.charset if self.required_base == None : self.required_base = name return url_request.data else : self.base = name # Creating a File URI for this thing if self.required_base == None : self.required_base = "file://" + os.path.join(os.getcwd(),name) if self.media_type == "" : self.media_type = MediaTypes.xml # see if the default should be overwritten for suffix in preferred_suffixes : if name.endswith(suffix) : self.media_type = preferred_suffixes[suffix] self.charset = 'utf-8' break self.options.set_host_language(self.media_type) return file(name) else : return name except HTTPError, h : raise h except :
(type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_Turtle_vocab % (uri, value)) elif content.content_type == MediaTypes.nt: try: retval = Graph() retval.parse(content.data, format="nt") except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_ntriples_vocab % (uri, value)) elif content.content_type in [ MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml ] or xml_application_media_type.match(content.content_type) != None: try: from pyRdfa import pyRdfa from pyRdfa.options import Options options = Options() retval = pyRdfa(options).graph_from_source(content.data) except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_rdfa_vocab % (uri, value)) else: options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type)) return (retval, expiration_date) ############################################################################################ type = ns_rdf["type"] Property = ns_rdf["Property"] Class = ns_rdfs["Class"]
elif content.content_type == MediaTypes.nt: try: retval = Graph() retval.parse(content.data, format="nt") except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_ntriples_vocab % (uri, value)) elif ( content.content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml] or xml_application_media_type.match(content.content_type) != None ): try: from pyRdfa import pyRdfa from pyRdfa.options import Options options = Options() retval = pyRdfa(options).graph_from_source(content.data) except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_rdfa_vocab % (uri, value)) else: options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type)) return (retval, expiration_date) ############################################################################################ type = ns_rdf["type"] Property = ns_rdf["Property"] Class = ns_rdfs["Class"] subClassOf = ns_rdfs["subClassOf"]
output_processor_graph = True elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, vocab_cache_report=vocab_cache_report, bypass_vocab_cache=bypass_vocab_cache, transformers=extras, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, hturtle=hturtle) processor = pyRdfa(options, base) if len(value) >= 1: retval = processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: retval = processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
def return_graph(uri, options, newCache = False) :
    """Parse a file, and return an RDFLib Graph. The URI's content type is checked and either one of
    RDFLib's parsers is invoked (for the Turtle, RDF/XML, and N Triple cases) or a separate RDFa
    processing is invoked on the RDFa content.

    The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then
    HTML (RDFa), in case content negotiation is used.

    This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.

    @param uri: URI for the graph
    @param options: used as a place where warnings can be sent
    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
    @return: A tuple consisting of an RDFLib Graph instance and an expiration date. The tuple is C{(None, None)}
    if the URI could not be dereferenced; the graph element is None if the content could not be parsed or if
    its media type is not recognised.
    """
    def return_to_cache(msg) :
        # "msg" (the exception value) is accepted but not used: the warning text only depends on the URI
        if newCache :
            options.add_warning(err_unreachable_vocab % uri, warning_type=VocabReferenceError)
        else :
            options.add_warning(err_outdated_cache % uri, warning_type=VocabReferenceError)

    def parse_with_rdflib(warning_template, **parse_args) :
        # Parse content.data with RDFLib; on failure a warning is added and None is returned,
        # so that a half-filled graph never reaches the caller
        try :
            graph = Graph()
            graph.parse(content.data, **parse_args)
            return graph
        except Exception :
            options.add_warning(warning_template % (uri, sys.exc_info()[1]))
            return None

    try :
        content = URIOpener(uri,
                            {'Accept' : 'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'})
    except (HTTPError, RDFaError, Exception) :
        # All failures to dereference are handled the same way: warn and give up
        return_to_cache(sys.exc_info()[1])
        return (None, None)

    # Store the expiration date of the newly accessed data
    expiration_date = content.expiration_date
    content_type = content.content_type
    retval = None

    if content_type == MediaTypes.turtle :
        retval = parse_with_rdflib(err_unparsable_Turtle_vocab, format="n3")
    elif content_type == MediaTypes.rdfxml :
        # NOTE(review): the Turtle message template is (re)used for RDF/XML parsing errors; if the
        # package defines an RDF/XML specific template, it should be used here instead -- to be confirmed
        retval = parse_with_rdflib(err_unparsable_Turtle_vocab)
    elif content_type == MediaTypes.nt :
        retval = parse_with_rdflib(err_unparsable_ntriples_vocab, format="nt")
    elif content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml] or xml_application_media_type.match(content_type) is not None :
        try :
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            # The nested RDFa run gets its own Options instance. It is bound to a separate name so that
            # the warning below still goes to the caller's "options" and is not lost
            vocab_options = Options()
            retval = pyRdfa(vocab_options).graph_from_source(content.data)
        except Exception :
            retval = None
            options.add_warning(err_unparsable_rdfa_vocab % (uri, sys.exc_info()[1]))
    else :
        options.add_warning(err_unrecognised_vocab_type % (uri, content_type))

    return (retval, expiration_date)
class pyRdfa :
    """Main processing class for the distiller

    @ivar options: an instance of the L{Options} class
    @ivar media_type: the preferred default media type, possibly set at initialization
    @ivar base: the base value, possibly set at initialization
    @ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers
    """
    def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
        """
        @keyword options: Options for the distiller
        @type options: L{Options}
        @keyword base: URI for the default "base" value (usually the URI of the file to be processed)
        @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
        @keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
        """
        self.http_status = 200

        self.base = base
        # An empty base means "not set"; _get_input fills it in from the source later
        if base == "" :
            self.required_base = None
        else :
            self.required_base = base
        self.charset = None

        # predefined content type
        self.media_type = media_type

        if options is None :
            self.options = Options()
        else :
            self.options = options

        if media_type != "" :
            self.options.set_host_language(self.media_type)

        # None is kept as is; ExecutionContext falls back on the global default in that case
        self.rdfa_version = rdfa_version

    @staticmethod
    def _is_string(name) :
        """
        Check whether the argument is a string, in a way that works both on Python 2 and Python 3.

        @param name: any object
        @return: True if "name" is a string
        @rtype: Boolean
        """
        try :
            # Python 2 branch
            return isinstance(name, basestring)
        except NameError :
            # Python 3 branch: "basestring" does not exist
            return isinstance(name, str)

    def _get_input(self, name) :
        """
        Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
        returning a file-like object. If name is none of these, it returns the input argument (that should
        be, supposedly, a file-like object already).

        If the media type has not been set explicitly at initialization of this instance,
        the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
        L{host.preferred_suffixes} for the suffix to media type mapping.

        @param name: identifier of the input source
        @type name: string or a file-like object
        @return: a file like object if opening "name" is possible and successful, "name" otherwise
        @raise HTTPError: re-raised as is when raised while opening the source
        @raise FailedSource: wraps any other exception raised while opening the source
        """
        if not self._is_string(name) :
            # Not a string: supposedly a file-like object already
            return name

        try :
            # check if this is a URI, ie, if there is a valid 'scheme' part
            # otherwise it is considered to be a simple file
            if urlparse(name)[0] != "" :
                url_request = URIOpener(name)
                self.base = url_request.location
                if self.media_type == "" :
                    if url_request.content_type in content_to_host_language :
                        self.media_type = url_request.content_type
                    else :
                        self.media_type = MediaTypes.xml
                    self.options.set_host_language(self.media_type)
                self.charset = url_request.charset
                if self.required_base is None :
                    self.required_base = name
                return url_request.data
            else :
                self.base = name
                # Creating a File URI for this thing
                if self.required_base is None :
                    self.required_base = "file://" + os.path.join(os.getcwd(),name)
                if self.media_type == "" :
                    self.media_type = MediaTypes.xml
                    # see if the default should be overwritten
                    for suffix in preferred_suffixes :
                        if name.endswith(suffix) :
                            self.media_type = preferred_suffixes[suffix]
                            self.charset = 'utf-8'
                            break
                    self.options.set_host_language(self.media_type)
                # open() instead of file(): the latter does not exist in Python 3
                return open(name)
        except HTTPError :
            raise
        except Exception :
            raise FailedSource(sys.exc_info()[1])

    ####################################################################################################################
    # Externally used methods
    #
    def graph_from_DOM(self, dom, graph = None, pgraph = None) :
        """
        Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other methods get down to this
        one, eventually (e.g., after opening a URI and parsing it into a DOM).

        @param dom: a DOM Node element, the top level entry node for the whole tree (i.e., the C{dom.documentElement} is used to initiate processing down the node hierarchy)
        @keyword graph: an RDF Graph (if None, then a new one is created)
        @type graph: rdflib Graph instance.
        @keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        @type pgraph: rdflib Graph instance
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        def copyGraph(tog, fromg) :
            # copy the triples as well as the namespace bindings
            for t in fromg :
                tog.add(t)
            for k,ns in fromg.namespaces() :
                tog.bind(k,ns)

        if graph is None :
            # Create the RDF Graph, that will contain the return triples...
            graph = Graph()

        # this will collect the content, the 'default graph', as called in the RDFa spec
        default_graph = Graph()

        # get the DOM tree
        topElement = dom.documentElement

        # Create the initial state. This takes care of things
        # like base, top level namespace settings, etc.
        state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

        # Perform the built-in and external transformations on the HTML tree.
        for trans in self.options.transformers + builtInTransformers :
            trans(topElement, self.options, state)

        # This may have changed if the state setting detected an explicit version information:
        self.rdfa_version = state.rdfa_version

        # The top level subject starts with the current document; this
        # is used by the recursion
        # this function is the real workhorse
        parse_one_node(topElement, default_graph, None, state, [])

        # If the RDFS expansion has to be made, here is the place...
        if self.options.vocab_expansion :
            from pyRdfa.rdfs.process import process_rdfa_sem
            process_rdfa_sem(default_graph, self.options)

        # What should be returned depends on the way the options have been set up
        if self.options.output_default_graph :
            copyGraph(graph, default_graph)
        if self.options.output_processor_graph :
            # the processor graph goes to its dedicated graph if there is one, to the returned graph otherwise
            if pgraph is not None :
                copyGraph(pgraph, self.options.processor_graph.graph)
            else :
                copyGraph(graph, self.options.processor_graph.graph)

        # this is necessary if several DOM trees are handled in a row...
        self.options.reset_processor_graph()

        return graph

    def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
        """
        Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
        returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

        @param name: a URI, a file name, or a file-like object
        @param graph: rdflib Graph instance. If None, a new one is created.
        @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        @param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        def copyErrors(tog, options) :
            # Return "tog" (or a new graph), extended with the processor graph if that is to be output
            if tog is None :
                tog = Graph()
            if options.output_processor_graph :
                for t in options.processor_graph.graph :
                    tog.add(t)
                for k,ns in options.processor_graph.graph.namespaces() :
                    tog.bind(k,ns)
            options.reset_processor_graph()
            return tog

        isstring = self._is_string(name)

        try :
            # First, open the source... Possible HTTP errors are returned as error triples
            # NOTE: exceptions re-raised in the handlers below are caught again by the outermost
            # handler of this method, which sets http_status once more before raising them further
            source = None
            try :
                source = self._get_input(name)
            except FailedSource :
                f = sys.exc_info()[1]
                self.http_status = 400
                if not rdfOutput : raise f
                err = self.options.add_error(f.msg, FileReferenceError, name)
                self.options.processor_graph.add_http_context(err, 400)
                return copyErrors(graph, self.options)
            except HTTPError :
                h = sys.exc_info()[1]
                self.http_status = h.http_code
                if not rdfOutput : raise h
                err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code,h.msg), HTError, name)
                self.options.processor_graph.add_http_context(err, h.http_code)
                return copyErrors(graph, self.options)
            except Exception :
                e = sys.exc_info()[1]
                self.http_status = 500
                # Something nasty happened:-(
                if not rdfOutput : raise e
                err = self.options.add_error(str(e), context = name)
                self.options.processor_graph.add_http_context(err, 500)
                return copyErrors(graph, self.options)

            dom = None
            try :
                if self.options.host_language == HostLanguage.html5 :
                    import warnings
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    import html5lib
                    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                    if self.charset :
                        # This means the HTTP header has provided a charset, or the
                        # file is a local file when we suppose it to be a utf-8
                        dom = parser.parse(source, encoding=self.charset)
                    else :
                        # No charset set. The HTMLLib parser tries to sniff into
                        # the file to find a meta header for the charset; if that
                        # works, fine, otherwise it falls back on window-...
                        dom = parser.parse(source)

                    try :
                        # The source has been consumed by the parser: reopen (or rewind) it
                        # to look for the version information
                        if isstring :
                            source.close()
                            source = self._get_input(name)
                        else :
                            source.seek(0)
                        from pyRdfa.host import adjust_html_version
                        self.rdfa_version = adjust_html_version(source, self.rdfa_version)
                    except Exception :
                        # if anything goes wrong, it is not really important; rdfa version stays what it was...
                        pass
                else :
                    # in other cases an XML parser has to be used
                    from pyRdfa.host import adjust_xhtml_and_version
                    dom = xml.dom.minidom.parse(source)
                    (adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version)
                    self.options.host_language = adjusted_host_language
                    self.rdfa_version = version
            except ImportError :
                msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>"
                raise ImportError(msg)
            except Exception :
                e = sys.exc_info()[1]
                # These are various parsing exception. Per spec, this is a case when
                # error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted
                # return page or a graph with error triples) does not apply
                err = self.options.add_error(str(e), context = name)
                self.http_status = 400
                self.options.processor_graph.add_http_context(err, 400)
                return copyErrors(graph, self.options)

            # If we got here, we have a DOM tree to operate on...
            return self.graph_from_DOM(dom, graph, pgraph)
        except Exception :
            # Something nasty happened during the generation of the graph...
            (a,b,c) = sys.exc_info()
            sys.excepthook(a,b,c)
            if isinstance(b, ImportError) :
                self.http_status = None
            else :
                self.http_status = 500
            if not rdfOutput : raise b
            err = self.options.add_error(str(b), context = name)
            self.options.processor_graph.add_http_context(err, 500)
            return copyErrors(graph, self.options)

    def rdf_from_sources(self, names, outputFormat = "turtle", rdfOutput = False) :
        """
        Extract an RDF graph from a list of RDFa sources and serialize them in one graph. The sources are parsed, the RDF
        extracted, and serialization is done in the specified format.

        @param names: list of sources, each can be a URI, a file name, or a file-like object
        @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
        @keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
        @type rdfOutput: boolean
        @return: a serialized RDF Graph
        @rtype: string
        """
        # This is better because it gives access to the various, non-standard serializations
        # If it does not work because the extra are not installed, fall back to the standard
        # rdflib distribution...
        try :
            from pyRdfaExtras import MyGraph
            graph = MyGraph()
        except Exception :
            graph = Graph()

        graph.bind("xsd", Namespace('http://www.w3.org/2001/XMLSchema#'))
        # the value of rdfOutput determines the reaction on exceptions...
        for name in names :
            self.graph_from_source(name, graph, rdfOutput)
        return graph.serialize(format=outputFormat)

    def rdf_from_source(self, name, outputFormat = "turtle", rdfOutput = False) :
        """
        Extract an RDF graph from an RDFa source and serialize it in one graph. The source is parsed, the RDF
        extracted, and serialization is done in the specified format.

        @param name: a URI, a file name, or a file-like object
        @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json" or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
        @keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible handling it; otherwise a graph is returned with an error message included in the processor graph
        @type rdfOutput: boolean
        @return: a serialized RDF Graph
        @rtype: string
        """
        return self.rdf_from_sources([name], outputFormat, rdfOutput)
class ExecutionContext :
    """State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
    the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
    URI references for RDFLib.

    @ivar options: reference to the overall options
    @type options: L{Options}
    @ivar base: the 'base' URI
    @ivar parsedBase: the parsed version of base, as produced by urlparse.urlsplit
    @ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
    @ivar lang: language tag (possibly None)
    @ivar term_or_curie: vocabulary management class instance
    @type term_or_curie: L{termorcurie.TermOrCurie}
    @ivar list_mapping: dictionary of arrays, containing a list of URIs key-ed via properties for lists
    @ivar node: the node to which this state belongs
    @type node: DOM node instance
    @ivar rdfa_version: RDFa version of the content
    @type rdfa_version: String
    @ivar supress_lang: in some cases, the effect of the lang attribute should be suppressed for the given node, although it should be inherited down below (example: @value attribute of the data element in HTML5)
    @type supress_lang: Boolean
    @cvar _list: list of attributes that allow for lists of values and should be treated as such
    @cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s). Is initialized at first instantiation.
    """

    # list of attributes that allow for lists of values and should be treated as such
    _list = [ "rel", "rev", "property", "typeof", "role" ]
    # mapping table from attribute name to the exact method to retrieve the URI(s).
    _resource_type = {}

    def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version = None) :
        """
        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node.
        @type inherited_state: L{state.ExecutionContext}
        @keyword base: string denoting the base URI for the specific node. This overrides the possible
        base inherited from the upper layers. The
        current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
        necessary for SVG (and other possible XML dialects that accept C{@xml:base})
        @keyword options: invocation options, and references to warning graphs
        @type options: L{Options<pyRdfa.options>}
        @keyword rdfa_version: RDFa version to be used at the top level; if None, the global C{rdfa_current_version} is used
        """
        def remove_frag_id(uri) :
            """
            The fragment ID for self.base must be removed
            """
            try :
                # To be on the safe side:-)
                t = urlparse.urlparse(uri)
                return urlparse.urlunparse((t[0],t[1],t[2],t[3],t[4],""))
            except Exception :
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
        if len( ExecutionContext._resource_type ) == 0 :
            ExecutionContext._resource_type = {
                "href"     : ExecutionContext._URI,
                "src"      : ExecutionContext._URI,
                "vocab"    : ExecutionContext._URI,

                "about"    : ExecutionContext._CURIEorURI,
                "resource" : ExecutionContext._CURIEorURI,

                "rel"      : ExecutionContext._TERMorCURIEorAbsURI,
                "rev"      : ExecutionContext._TERMorCURIEorAbsURI,
                "datatype" : ExecutionContext._TERMorCURIEorAbsURI,
                "typeof"   : ExecutionContext._TERMorCURIEorAbsURI,
                "property" : ExecutionContext._TERMorCURIEorAbsURI,
                "role"     : ExecutionContext._TERMorCURIEorAbsURI,
            }
        #-----------------------------------------------------------------
        self.node = node

        #-----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for (for the HTML cases, that is)
        if inherited_state :
            self.rdfa_version = inherited_state.rdfa_version
            self.base         = inherited_state.base
            self.options      = inherited_state.options

            self.list_mapping = inherited_state.list_mapping
            self.new_list     = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else :
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list     = True

            if rdfa_version is not None :
                self.rdfa_version = rdfa_version
            else :
                from pyRdfa import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version") :
                top_version = node.getAttribute("version")
                if "RDFa 1.0" in top_version :
                    self.rdfa_version = "1.0"
                elif "RDFa 1.1" in top_version :
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options is None :
                from pyRdfa import Options
                self.options = Options()
            else :
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] :
                # if several <base> elements carry an @href, the last one in document order is retained
                for bases in node.getElementsByTagName("base") :
                    if bases.hasAttribute("href") :
                        self.base = remove_frag_id(bases.getAttribute("href"))
            elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "" :
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes :
                prefixes = beautifying_prefixes[self.options.host_language]
                for key in prefixes :
                    graph.bind(key, prefixes[key])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        #-----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlparse.urlsplit(self.base)

        #-----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        #-----------------------------------------------------------------
        # Settling the language tags
        # @lang has priority over @xml:lang
        # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
        # first get the inherited state's language, if any
        if inherited_state :
            self.lang = inherited_state.lang
        else :
            self.lang = None

        self.supress_lang = False

        if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] :
            # we may have lang and xml:lang
            if node.hasAttribute("lang") :
                lang = node.getAttribute("lang").lower()
            else :
                lang = None
            if node.hasAttribute("xml:lang") :
                xmllang = node.getAttribute("xml:lang").lower()
            else :
                xmllang = None
            # First of all, set the value, if any
            if xmllang is not None :
                # this has priority; an empty value resets the language
                if len(xmllang) != 0 :
                    self.lang = xmllang
                else :
                    self.lang = None
            elif lang is not None :
                if len(lang) != 0 :
                    self.lang = lang
                else :
                    self.lang = None
            # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
            # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
            # error situations are simply swallowed...
        elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang") :
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0 : self.lang = None

        #-----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns") :
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None :
            self.defaultNS = inherited_state.defaultNS
        else :
            self.defaultNS = None
    # end __init__

    def _URI(self, val) :
        """Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also
        checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an
        uninterpreted CURIE...
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance
        """
        def create_URIRef(uri, check = True) :
            """
            Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created. In case
            there is something unusual, a warning is generated (though the URIRef is created nevertheless)
            @param uri: (absolute) URI string
            @keyword check: whether the scheme of the URI should be checked
            @return: an RDFLib URIRef instance
            """
            from pyRdfa import uri_schemes
            stripped = uri.strip()
            if check and urlparse.urlsplit(stripped)[0] not in uri_schemes :
                self.options.add_warning(err_URI_scheme % stripped, node=self.node.nodeName)
            return URIRef(stripped)

        def join(base, v, check = True) :
            """
            Mini helping function: it makes a urljoin for the paths. Based on the python library, but
            that one has a bug: in some cases it
            swallows the '#' or '?' character at the end. This is clearly a problem with
            Semantic Web URI-s, so this is checked, too
            @param base: base URI string
            @param v: local part
            @keyword check: whether the scheme of the resulting URI should be checked
            @return: an RDFLib URIRef instance
            """
            # UGLY!!! There is a bug for a corner case in python version <= 2.5.X
            if len(v) > 0 and v[0] == '?' and py_v_minor <= 5 :
                return create_URIRef(base+v, check)

            joined = urlparse.urljoin(base, v)
            # Restore a trailing '#' or '?' swallowed by urljoin; if either string is empty
            # there is no trailing character to compare
            if v and joined and v[-1] in ("#", "?") and v[-1] != joined[-1] :
                joined = joined + v[-1]
            return create_URIRef(joined, check)

        if val == "" :
            # The fragment ID must be removed...
            return URIRef(self.base)

        # fall back on good old traditional URI-s.
        # To be on the safe side, let us use the Python libraries
        if self.parsedBase[0] == "" :
            # base is, in fact, a local file name
            # The following call is just to be sure that some pathological cases when
            # the ':' _does_ appear in the URI but not in a scheme position is taken
            # care of properly...
            key = urlparse.urlsplit(val)[0]
            if key == "" :
                # relative URI, to be combined with local file name:
                return join(self.base, val, check = False)
            else :
                return create_URIRef(val)
        else :
            # Trust the python library...
            # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
            # swallows the '#' or '?' character at the end. This is clearly a problem with
            # Semantic Web URI-s
            return join(self.base, val)
    # end _URI

    def _CURIEorURI(self, val) :
        """Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
        is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        if val == "" :
            return URIRef(self.base)

        safe_curie = False
        if val[0] == '[' :
            # If a safe CURIE is asked for, a pure URI is not acceptable.
            # Is checked below, and that is why the safe_curie flag is necessary
            if val[-1] != ']' :
                # that is certainly forbidden: an incomplete safe CURIE
                self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
                return None
            else :
                val = val[1:-1]
                safe_curie = True

        # There is a branch here depending on whether we are in 1.1 or 1.0 mode
        # (the version strings are compared lexically)
        if self.rdfa_version >= "1.1" :
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval is None :
                # the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
                # The rule says that then the whole value should be considered as a URI
                # except if it was part of a safe CURIE. In that case it should be ignored...
                if safe_curie :
                    self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
                    return None
                else :
                    return self._URI(val)
            else :
                # there is an unlikely case where the retval is actually a URIRef with a relative URI. Better filter that one out
                if not isinstance(retval, BNode) and urlparse.urlsplit(str(retval))[0] == "" :
                    # yep, there is something wrong, a new URIRef has to be created:
                    return URIRef(self.base+str(retval))
                else :
                    return retval
        else :
            # in 1.0 mode a CURIE can be considered only in case of a safe CURIE
            if safe_curie :
                return self.term_or_curie.CURIE_to_URI(val)
            else :
                return self._URI(val)
    # end _CURIEorURI

    def _TERMorCURIEorAbsURI(self, val) :
        """Returns a URI either for a term or for a CURIE. The value must be an NCNAME to be handled as a term; otherwise
        the method falls back on a CURIE or an absolute URI.
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        from pyRdfa import uri_schemes
        # This case excludes the pure base, ie, the empty value
        if val == "" :
            return None

        from termorcurie import ncname, termname
        if termname.match(val) :
            # This is a term, must be handled as such...
            retval = self.term_or_curie.term_to_URI(val)
            if not retval :
                self.options.add_warning(err_undefined_terms % val, UnresolvableTerm, node=self.node.nodeName, buggy_value = val)
                return None
            else :
                return retval
        else :
            # try a CURIE
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval :
                return retval
            elif self.rdfa_version >= "1.1" :
                # See if it is an absolute URI
                scheme = urlparse.urlsplit(val)[0]
                if scheme == "" :
                    # bug; there should be no relative URIs here
                    self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix, node=self.node.nodeName)
                    return None
                else :
                    if scheme not in uri_schemes :
                        self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
                    return URIRef(val)
            else :
                # rdfa 1.0 case
                self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix, node=self.node.nodeName)
                return None
    # end _TERMorCURIEorAbsURI

    # -----------------------------------------------------------------------------------------------

    def getURI(self, attr) :
        """Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
        a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
        using the L{ExecutionContext._resource_type} table.
        @param attr: attribute name
        @type attr: string
        @return: an RDFLib URIRef instance (or None) or a list of those
        """
        if not self.node.hasAttribute(attr) :
            # a missing attribute yields an empty list for the list-valued attributes, None otherwise
            if attr in ExecutionContext._list :
                return []
            else :
                return None

        val = self.node.getAttribute(attr)

        # An attribute without an entry in the table should not happen if the code is correct,
        # but it does not harm to fall back on the pure URI case...
        func = ExecutionContext._resource_type.get(attr, ExecutionContext._URI)

        if attr in ExecutionContext._list :
            # Allows for a list; split() without argument never yields empty items
            resources = [ func(self, v) for v in val.split() ]
            return [ r for r in resources if r is not None ]
        else :
            return func(self, val.strip())
    # end getURI

    def getResource(self, *args) :
        """Get single resources from several different attributes. The first one that returns a valid URI wins.
        @param args: variable list of attribute names, or a single attribute being a list itself.
        @return: an RDFLib URIRef instance (or None)
        """
        if len(args) == 0 :
            return None
        # isinstance with the built-in types works on Python 2 and 3 alike (types.TupleType/ListType are Python 2 only)
        if isinstance(args[0], (tuple, list)) :
            rargs = args[0]
        else :
            rargs = args

        for resource in rargs :
            uri = self.getURI(resource)
            if uri is not None : return uri
        return None

    # -----------------------------------------------------------------------------------------------
    def reset_list_mapping(self, origin=None) :
        """
        Reset, ie, create a new empty dictionary for the list mapping.
        @keyword origin: if set, the origin (ie, the subject) of the new list structure
        """
        self.list_mapping = ListStructure()
        if origin: self.set_list_origin(origin)
        self.new_list = True

    def list_empty(self) :
        """
        Checks whether the list is empty.
        @return: Boolean
        """
        return len(self.list_mapping.mapping) == 0

    def get_list_props(self) :
        """
        Return the list of property values in the list structure
        @return: list of URIRef
        """
        # list(): a real list (and not a dictionary view) is returned on Python 3, too
        return list(self.list_mapping.mapping.keys())

    def get_list_value(self,prop) :
        """
        Return the list of values in the list structure for a specific property
        @param prop: the property URI, a key in the list structure
        @return: list of RDF nodes
        """
        return self.list_mapping.mapping[prop]

    def set_list_origin(self, origin) :
        """
        Set the origin of the list, ie, the subject to attach the final list(s) to
        @param origin: URIRef
        """
        self.list_mapping.origin = origin

    def get_list_origin(self) :
        """
        Return the origin of the list, ie, the subject to attach the final list(s) to
        @return: URIRef
        """
        return self.list_mapping.origin

    def add_to_list_mapping(self, property, resource) :
        """Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
        if the array does not exist yet, it will be created on the fly.

        @param property: the property URI, used as a key in the dictionary
        @param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
        placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not
        an empty list has to be generated.
        """
        mapping = self.list_mapping.mapping
        if property not in mapping :
            # first occurrence: either real content or the dummy placeholder
            if resource is not None :
                mapping[property] = [ resource ]
            else :
                mapping[property] = None
        elif resource is not None :
            # indeed, if it is None, than it should not override anything
            if mapping[property] is None :
                # replacing a dummy with real content
                mapping[property] = [ resource ]
            else :
                mapping[property].append(resource)
def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version=None):
    """
    Set up the state for C{node}: RDFa version, base, options, list mapping, the term/CURIE
    handler, the language and the default namespace. With an inherited state most values are
    taken over from it; without one this is the top level call and they are initialized here.

    @param node: the current DOM Node
    @param graph: the RDFLib Graph
    @keyword inherited_state: the state as inherited
    from upper layers. This inherited_state is mixed with the state information
    retrieved from the current node.
    @type inherited_state: L{state.ExecutionContext}
    @keyword base: string denoting the base URI for the specific node. It is used at the top level only,
    and only if the node itself does not set a base (via a C{<base>} element or C{@xml:base}).
    The current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
    necessary for SVG (and other possible XML dialects that accept C{@xml:base})
    @keyword options: invocation options, and references to warning graphs. Used at the top level only;
    otherwise the options of the inherited state are used.
    @type options: L{Options<pyRdfa.options>}
    @keyword rdfa_version: the RDFa version to be used at the top level; if None, the value of
    C{pyRdfa.rdfa_current_version} is used. A C{@version} attribute on the top node containing
    "RDFa 1.0" or "RDFa 1.1" overrides it. Ignored when there is an inherited state.
    """
    def remove_frag_id(uri):
        """
        The fragment ID for self.base must be removed; if the URI cannot be parsed it is returned unchanged
        """
        try:
            # To be on the safe side:-)
            t = urlparse.urlparse(uri)
            return urlparse.urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
        except Exception:
            return uri

    # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
    if len(ExecutionContext._resource_type) == 0:
        ExecutionContext._resource_type = {
            "href"     : ExecutionContext._URI,
            "src"      : ExecutionContext._URI,
            "vocab"    : ExecutionContext._URI,

            "about"    : ExecutionContext._CURIEorURI,
            "resource" : ExecutionContext._CURIEorURI,

            "rel"      : ExecutionContext._TERMorCURIEorAbsURI,
            "rev"      : ExecutionContext._TERMorCURIEorAbsURI,
            "datatype" : ExecutionContext._TERMorCURIEorAbsURI,
            "typeof"   : ExecutionContext._TERMorCURIEorAbsURI,
            "property" : ExecutionContext._TERMorCURIEorAbsURI,
            "role"     : ExecutionContext._TERMorCURIEorAbsURI,
        }

    #-----------------------------------------------------------------
    self.node = node

    #-----------------------------------------------------------------
    # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
    # case in, say, XHTML...)
    # At the moment, it is invoked with a 'None' at the top level of parsing, that is
    # when the <base> element is looked for (for the HTML cases, that is)
    if inherited_state:
        self.rdfa_version = inherited_state.rdfa_version
        self.base         = inherited_state.base
        self.options      = inherited_state.options

        # the list structure is shared with the parent, it is not a new one
        self.list_mapping = inherited_state.list_mapping
        self.new_list     = False

        # for generic XML versions the xml:base attribute should be handled
        if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))
    else:
        # this is the branch called from the very top
        self.list_mapping = ListStructure()
        self.new_list     = True

        if rdfa_version is not None:
            self.rdfa_version = rdfa_version
        else:
            from pyRdfa import rdfa_current_version
            self.rdfa_version = rdfa_current_version

        # This value can be overwritten by a @version attribute
        if node.hasAttribute("version"):
            top_version = node.getAttribute("version")
            if "RDFa 1.0" in top_version:
                self.rdfa_version = "1.0"
            elif "RDFa 1.1" in top_version:
                self.rdfa_version = "1.1"

        # this is just to play safe. I believe this should actually not happen...
        if options is None:
            from pyRdfa import Options
            self.options = Options()
        else:
            self.options = options

        self.base = ""
        # handle the base element case for HTML
        if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ]:
            for base_element in node.getElementsByTagName("base"):
                if base_element.hasAttribute("href"):
                    self.base = remove_frag_id(base_element.getAttribute("href"))
                    # Only the first <base> element with an @href counts; stop looking
                    # (this used to be a no-op 'continue', which let the last one win)
                    break
        elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))

        # If no local setting for base occurs, the input argument has it
        if self.base == "":
            self.base = base

        # Perform an extra beautification in RDFLib
        if self.options.host_language in beautifying_prefixes:
            prefixes = beautifying_prefixes[self.options.host_language]
            for key in prefixes:
                graph.bind(key, prefixes[key])

        input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
        self.options.add_info(input_info)

    #-----------------------------------------------------------------
    # this will be used repeatedly, better store it once and for all...
    self.parsedBase = urlparse.urlsplit(self.base)

    #-----------------------------------------------------------------
    # generate and store the local CURIE handling class instance
    self.term_or_curie = TermOrCurie(self, graph, inherited_state)

    #-----------------------------------------------------------------
    # Settling the language tags
    # @xml:lang has priority over @lang
    # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
    # first get the inherited state's language, if any
    if inherited_state:
        self.lang = inherited_state.lang
    else:
        self.lang = None

    self.supress_lang = False

    if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ]:
        # we may have lang and xml:lang
        if node.hasAttribute("xml:lang"):
            # this has priority
            declared_lang = node.getAttribute("xml:lang").lower()
        elif node.hasAttribute("lang"):
            declared_lang = node.getAttribute("lang").lower()
        else:
            declared_lang = None

        # Set the value, if any; an attribute with an empty value resets the inherited language
        if declared_lang is not None:
            if len(declared_lang) != 0:
                self.lang = declared_lang
            else:
                self.lang = None
        # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
        # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
        # error situations are simply swallowed...
    elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang"):
        self.lang = node.getAttribute("xml:lang").lower()
        if len(self.lang) == 0:
            self.lang = None

    #-----------------------------------------------------------------
    # Set the default namespace. Used when generating XML Literals
    if node.hasAttribute("xmlns"):
        self.defaultNS = node.getAttribute("xmlns")
    elif inherited_state and inherited_state.defaultNS is not None:
        self.defaultNS = inherited_state.defaultNS
    else:
        self.defaultNS = None
elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, transformers=extras, embedded_rdf=embedded_rdf, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, vocab_cache_report=vocab_cache_report, refresh_vocab_cache=refresh_vocab_cache, check_lite=check_lite, experimental_features=True) processor = pyRdfa(options, base) if len(value) >= 1: print processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: print processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
output_processor_graph = True elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, transformers=extras, embedded_rdf=embedded_rdf, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, vocab_cache_report=vocab_cache_report, refresh_vocab_cache=refresh_vocab_cache) processor = pyRdfa(options, base) if len(value) >= 1: print processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: print processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
class pyRdfa :
    """Main processing class for the distiller

    @ivar options: an instance of the L{Options} class
    @ivar media_type: the preferred default media type, possibly set at initialization
    @ivar base: the base value, possibly set at initialization
    """
    def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
        """
        @keyword options: Options for the distiller; if None, a default L{Options} instance is created
        @type options: L{Options}
        @keyword base: URI for the default "base" value (usually the URI of the file to be processed)
        @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source.
        If set, the host language of the options is set from it right away.
        @keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
        """
        self.base = base
        # An empty base means that no base was required by the caller
        if base == "" :
            self.required_base = None
        else :
            self.required_base = base
        self.charset = None

        # predefined content type
        self.media_type = media_type

        if options is None :
            self.options = Options()
        else :
            self.options = options

        if media_type != "" :
            self.options.set_host_language(self.media_type)

        # None is kept as is; the version is settled when the initial state is created in graph_from_DOM
        self.rdfa_version = rdfa_version

    def _get_input(self, name) :
        """
        Trying to guess whether "name" is a URI or a string (for a file); it then tries to open this source accordingly,
        returning a file-like object. If name is none of these, it returns the input argument (that should
        be, supposedly, a file-like object already).

        If the media type has not been set explicitly at initialization of this instance,
        the method also sets the media_type based on the HTTP GET response or the suffix of the file. See
        L{utils.preferred_suffixes} for the suffix to media type mapping. The base, the required base (unless
        it is already set) and the charset are set as well.

        @param name: identifier of the input source
        @type name: string or a file-like object
        @return: a file like object if opening "name" is possible and successful, "name" otherwise
        @raise FailedSource: if anything goes wrong while identifying or opening the source; the original
        exception is passed to the L{FailedSource} constructor
        """
        try :
            if not isinstance(name, basestring) :
                return name

            # check if this is a URI, ie, if there is a valid 'scheme' part
            # otherwise it is considered to be a simple file
            if urlparse.urlparse(name)[0] != "" :
                url_request = URIOpener(name)
                self.base = url_request.location
                if self.media_type == "" :
                    if url_request.content_type in content_to_host_language :
                        self.media_type = url_request.content_type
                    else :
                        self.media_type = MediaTypes.xml
                    self.options.set_host_language(self.media_type)
                self.charset = url_request.charset
                if self.required_base is None :
                    self.required_base = name
                return url_request.data

            self.base = name
            # Creating a File URI for this thing
            if self.required_base is None :
                self.required_base = "file://" + os.path.join(os.getcwd(),name)
            if self.media_type == "" :
                self.media_type = MediaTypes.xml
                # see if the default should be overwritten
                for suffix in preferred_suffixes :
                    if name.endswith(suffix) :
                        self.media_type = preferred_suffixes[suffix]
                        self.charset = 'utf-8'
                        break
                self.options.set_host_language(self.media_type)
            from py3compat import PY3
            if PY3 :
                return open(name, 'rb')
            else :
                return open(name, 'r')
        except Exception :
            # Only real errors are turned into a FailedSource; a keyboard interrupt or
            # a system exit goes through untouched
            raise FailedSource(sys.exc_info()[1])

    ####################################################################################################################
    # Externally used methods
    #
    def graph_from_DOM(self, dom, graph = None, pgraph = None) :
        """
        Extract the RDF Graph from a DOM tree. This is where the real meat happens. All other methods get down to this
        one, eventually (eg, after opening a URI and parsing it into a DOM)
        @param dom: a DOM Node element, the top level entry node for the whole tree (to make it clear, a dom.documentElement is used to initiate processing)
        @keyword graph: an RDF Graph (if None, than a new one is created)
        @type graph: rdflib Graph instance. If None, a new one is created.
        @keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        @type pgraph: rdflib Graph instance or None
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        """
        def copyGraph(tog, fromg) :
            """Copy all the triples and the namespace bindings of fromg into tog"""
            for t in fromg :
                tog.add(t)
            for k,ns in fromg.namespaces() :
                tog.bind(k,ns)

        if graph is None :
            # Create the RDF Graph, that will contain the return triples...
            graph = Graph()

        # this will collect the content, the 'default graph', as called in the RDFa spec
        default_graph = Graph()

        # get the DOM tree
        topElement = dom.documentElement

        # Perform the built-in and external transformations on the HTML tree.
        for trans in self.options.transformers + builtInTransformers :
            trans(topElement, self.options)

        # Create the initial state. This takes care of things
        # like base, top level namespace settings, etc.
        state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version)

        # This may have changed if the state setting detected an explicit version information:
        self.rdfa_version = state.rdfa_version

        # this function is the real workhorse
        parse_one_node(topElement, default_graph, None, state, [])

        # If the RDFS expansion has to be made, here is the place...
        if self.options.vocab_expansion :
            from pyRdfa.rdfs.process import process_rdfa_sem
            process_rdfa_sem(default_graph, self.options)

        # What should be returned depends on the way the options have been set up
        if self.options.output_default_graph :
            copyGraph(graph, default_graph)
        if self.options.output_processor_graph :
            # the processor graph has its own target if the caller provided one
            if pgraph is not None :
                copyGraph(pgraph, self.options.processor_graph.graph)
            else :
                copyGraph(graph, self.options.processor_graph.graph)

        # this is necessary if several DOM trees are handled in a row...
        self.options.reset_processor_graph()

        return graph

    def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) :
        """
        Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is
        returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method.

        @param name: a URI, a file name, or a file-like object
        @param graph: rdflib Graph instance. If None, a new one is created.
        @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph.
        On the error path (ie, when the source could not be opened or processed) the error triples are added to the
        returned graph in all cases, and to pgraph as well if it is set.
        @param rdfOutput: whether a L{FailedSource} exception should be turned into RDF and returned as part of the
        processor graph. All other exceptions are reported through C{sys.excepthook} and are never re-raised.
        @return: an RDF Graph
        @rtype: rdflib Graph instance
        @raise FailedSource: if the source cannot be opened and rdfOutput is False
        """
        def copyErrors(tog, options) :
            """Copy the processor graph (if it is to be output) into tog, and into pgraph if that is set"""
            if tog is None :
                tog = Graph()
            if options.output_processor_graph :
                for t in options.processor_graph.graph :
                    tog.add(t)
                    if pgraph is not None : pgraph.add(t)
                for k,ns in options.processor_graph.graph.namespaces() :
                    tog.bind(k,ns)
                    if pgraph is not None : pgraph.bind(k,ns)
            options.reset_processor_graph()
            return tog

        try :
            # First, open the source...
            source = self._get_input(name)
            try :
                if self.options.host_language == HostLanguage.html :
                    import warnings
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    import html5lib
                    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                    if self.charset :
                        # This means the HTTP header has provided a charset, or the
                        # file is a local file when we suppose it to be a utf-8
                        dom = parser.parse(source, encoding=self.charset)
                    else :
                        # No charset set. The HTMLLib parser tries to sniff into the
                        # the file to find a meta header for the charset; if that
                        # works, fine, otherwise it falls back on window-...
                        dom = parser.parse(source)
                else :
                    # in other cases an XML parser has to be used
                    dom = xml.dom.minidom.parse(source)
            finally :
                # The DOM is complete (or the parsing failed) at this point. Close what has been
                # opened here; a file-like object provided by the caller is left alone
                if source is not name and hasattr(source, "close") :
                    source.close()

            return self.graph_from_DOM(dom, graph, pgraph)
        except FailedSource :
            # sys.exc_info() is used instead of 'except ..., f' to stay valid both in Python 2 and 3
            f = sys.exc_info()[1]
            if not rdfOutput : raise f
            self.options.add_error(f.msg, FileReferenceError, name)
            return copyErrors(graph, self.options)
        except Exception :
            # Best effort: report the problem, and return whatever the processor graph has
            (a,b,c) = sys.exc_info()
            sys.excepthook(a,b,c)
            return copyErrors(graph, self.options)