def convert(self):
    """
    Top level entry to convert and generate all the triples.

    Triples are generated for every top level item returned by
    ``self.get_top_level_items()``. The generation of an RDF list linking
    the items from the base is currently disabled, so the per-item results
    of ``generate_triples`` are not collected.

    If any vocabulary was used (``self.vocabularies_used``), a vocabulary
    expansion step is run on ``self.graph`` afterwards: the full expansion
    when ``self.vocab_expansion`` is set, the basic ``MiniOWL`` closure
    otherwise. That step is best effort: if pyRdfa cannot be imported, or
    the expansion fails, the error is logged at debug level and the graph
    is left as it is.
    """
    import logging

    for top_level_item in self.get_top_level_items():
        self.generate_triples(top_level_item, Evaluation_Context())

    if not self.vocabularies_used:
        return

    # The basic expansion is always there; the follow-your-nose inclusion
    # of vocabularies is optional.
    try:
        try:
            from ..pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
            from ..pyRdfa.options import Options
        except Exception:
            # Not running as part of a package that bundles pyRdfa (the
            # relative import may fail with ImportError, ValueError or
            # SystemError depending on the Python version); try the
            # stand-alone installation instead.
            from pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
            from pyRdfa.options import Options

        if self.vocab_expansion:
            # This is the full deal
            options = Options(vocab_expansion=self.vocab_expansion,
                              vocab_cache=self.vocab_cache)
            process_rdfa_sem(self.graph, options)
        else:
            MiniOWL(self.graph).closure()
    except Exception:
        # Too bad, but life should go on: the expansion is optional.
        # Only "ordinary" errors are swallowed; KeyboardInterrupt and
        # SystemExit are deliberately left to propagate.
        logging.getLogger(__name__).debug(
            "Vocabulary expansion skipped", exc_info=True)
def extrair_rdfa(url):
    """
    Extract the RDFa found at ``url`` and return it as a ``Graph``.

    The source is run through pyRdfa (with ``embedded_rdf`` switched on) and
    serialized as 'pretty-xml'; that serialization is then wrapped in a
    ``BytesIO`` and parsed into a fresh ``Graph``, which is returned.
    """
    rdfa_options = Options(embedded_rdf=True)
    processor = pyRdfa(options=rdfa_options)
    serialized = processor.rdf_from_source(url, outputFormat='pretty-xml')

    graph = Graph()
    graph.parse(io.BytesIO(serialized))
    return graph
def check_term(conn, term, predicates):
    """
    Build the "obi" tree HTML for ``term``, extract its RDFa into a graph
    and hand it, together with the expected graph, to ``compare_graphs``.

    The expected graph is read from the Turtle fixture
    ``tests/resources/obi-tree-<term>-predicates.ttl`` when ``predicates``
    is truthy, and from ``tests/resources/obi-tree-<term>.ttl`` otherwise.

    :param conn: database connection; only its ``cursor()`` is used here
    :param term: term identifier passed to ``build_tree`` and used in the
        fixture file name
    :param predicates: passed to ``build_tree`` as ``predicate_ids``
    """
    cursor = conn.cursor()
    markup = gizmos.tree.build_tree(cursor, "obi", term, predicate_ids=predicates)

    # Turn the HTML into a DOM document and take its root element
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=dom_builder).parse(markup)
    root = document.documentElement

    # Initial pyRdfa state for the root element
    actual = Graph()
    rdfa_options = Options(
        output_default_graph=True,
        output_processor_graph=True,
        space_preserve=True,
        transformers=[],
        embedded_rdf=True,
        vocab_expansion=False,
        vocab_cache=True,
        vocab_cache_report=False,
        refresh_vocab_cache=False,
        check_lite=False,
        experimental_features=True,
    )
    context = ExecutionContext(
        root,
        actual,
        base="http://purl.obolibrary.org/obo/",
        options=rdfa_options,
        rdfa_version="1.1",
    )

    # Walk the DOM (recursively) and add the RDFa triples to `actual`
    parse_one_node(root, actual, None, context, [])

    expected = Graph()
    if predicates:
        fixture = f"tests/resources/obi-tree-{term}-predicates.ttl"
    else:
        fixture = f"tests/resources/obi-tree-{term}.ttl"
    expected.parse(fixture, format="turtle")

    compare_graphs(actual, expected)
def parse(self):
    """
    Run the RDFa processor on ``self.uri``.

    The triples go into ``self.default_graph`` and the processor messages
    into ``self.processor_graph``. When ``self.rdfa_lite`` is set, the
    ``lite_prune`` transformer is added to the processing options. The
    processor instance is kept in ``self.processor`` afterwards.
    """
    if self.rdfa_lite:
        from pyRdfa.transform.lite import lite_prune
        transformers = [lite_prune]
    else:
        transformers = []

    rdfa_options = Options(
        output_default_graph=True,
        output_processor_graph=True,
        transformers=transformers,
        vocab_expansion=self.vocab_expansion,
        embedded_rdf=self.embedded_rdf,
        add_informational_messages=True,
    )
    rdfa_processor = pyRdfa(
        options=rdfa_options,
        base=self.base,
        media_type=self.media_type,
    )
    rdfa_processor.graph_from_source(
        self.uri,
        graph=self.default_graph,
        pgraph=self.processor_graph,
        rdfOutput=True,
    )

    # Keep the processor around; some of its parameters are needed for the
    # error messages
    self.processor = rdfa_processor
output_processor_graph = True elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, transformers=extras, embedded_rdf=embedded_rdf, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, vocab_cache_report=vocab_cache_report, refresh_vocab_cache=refresh_vocab_cache) processor = pyRdfa(options, base) if len(value) >= 1: print processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: print processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version=None):
    """
    Set up the execution context (state) for one DOM node: RDFa version,
    base URI, options, list mapping, term/CURIE handler, language and
    default namespace. Values come from C{inherited_state} when it is given,
    otherwise (top level call) from the arguments and from the node itself.

    @param node: the current DOM Node
    @param graph: the RDFLib Graph
    @keyword inherited_state: the state as inherited from upper layers. This inherited_state is mixed with the state information
    retrieved from the current node. When it is missing, this call is treated as the top level one.
    @type inherited_state: L{state.ExecutionContext}
    @keyword base: string denoting the base URI for the specific node. It is only consulted at the top level (no
    C{inherited_state}), and only if the node does not yield a base of its own (through a C{<base href>} element for the
    HTML family, or C{@xml:base} for host languages accepting it). The current XHTML+RDFa syntax does not allow the usage
    of C{@xml:base}, but SVG1.2 does, so this is necessary for SVG (and other possible XML dialects that accept C{@xml:base})
    @keyword options: invocation options, and references to warning graphs. Only used at the top level; with an
    C{inherited_state} the options of that state are used instead. If C{None} at the top level, a default C{Options}
    instance is created.
    @type options: L{Options<pyRdfa.options>}
    @keyword rdfa_version: RDFa version to use at the top level; if C{None}, C{rdfa_current_version} of pyRdfa is used.
    In both cases a C{@version} attribute on the node mentioning RDFa 1.0 or RDFa 1.1 overrides the value.
    """
    def remove_frag_id(uri):
        """
        The fragment ID for self.base must be removed. Returns the URI without its fragment part or,
        if the URI cannot be taken apart and reassembled, the URI unchanged.
        """
        try:
            # To be on the safe side:-)
            t = urlparse(uri)
            return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
        except:
            return uri

    # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
    # The table is class level and is filled only once, by whichever instance is created while it is still empty.
    # It maps attribute names to one of the _URI, _CURIEorURI or _TERMorCURIEorAbsURI class members.
    if len(ExecutionContext._resource_type) == 0:
        ExecutionContext._resource_type = {
            "href": ExecutionContext._URI,
            "src": ExecutionContext._URI,
            "vocab": ExecutionContext._URI,
            "about": ExecutionContext._CURIEorURI,
            "resource": ExecutionContext._CURIEorURI,
            "rel": ExecutionContext._TERMorCURIEorAbsURI,
            "rev": ExecutionContext._TERMorCURIEorAbsURI,
            "datatype": ExecutionContext._TERMorCURIEorAbsURI,
            "typeof": ExecutionContext._TERMorCURIEorAbsURI,
            "property": ExecutionContext._TERMorCURIEorAbsURI,
            "role": ExecutionContext._TERMorCURIEorAbsURI,
        }

    #-----------------------------------------------------------------
    self.node = node

    #-----------------------------------------------------------------
    # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
    # case in, say, XHTML...)
    # At the moment, it is invoked with a 'None' at the top level of parsing, that is
    # when the <base> element is looked for (for the HTML cases, that is)
    if inherited_state:
        # Not the top level: version, base, options and list mapping are taken over from the parent state;
        # the 'base', 'options' and 'rdfa_version' arguments are not looked at in this branch
        self.rdfa_version = inherited_state.rdfa_version
        self.base = inherited_state.base
        self.options = inherited_state.options

        self.list_mapping = inherited_state.list_mapping
        self.new_list = False

        # for generic XML versions the xml:base attribute should be handled
        if self.options.host_language in accept_xml_base and node.hasAttribute(
                "xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))
    else:
        # this is the branch called from the very top
        self.list_mapping = ListStructure()
        self.new_list = True

        if rdfa_version is not None:
            self.rdfa_version = rdfa_version
        else:
            from pyRdfa import rdfa_current_version
            self.rdfa_version = rdfa_current_version

        # This value can be overwritten by a @version attribute
        # (a plain substring search; any other @version value leaves the version set above untouched)
        if node.hasAttribute("version"):
            top_version = node.getAttribute("version")
            if top_version.find("RDFa 1.0") != -1 or top_version.find(
                    "RDFa1.0") != -1:
                self.rdfa_version = "1.0"
            elif top_version.find("RDFa 1.1") != -1 or top_version.find(
                    "RDFa1.1") != -1:
                self.rdfa_version = "1.1"

        # this is just to play safe. I believe this should actually not happen...
        if options == None:
            from pyRdfa import Options
            self.options = Options()
        else:
            self.options = options

        # self.options must be set before this point: the host language decides where the base is looked for
        self.base = ""
        # handle the base element case for HTML
        if self.options.host_language in [
                HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5
        ]:
            for bases in node.getElementsByTagName("base"):
                if bases.hasAttribute("href"):
                    self.base = remove_frag_id(bases.getAttribute("href"))
                    # NOTE(review): this 'continue' is the last statement of the loop body, so it has no
                    # effect: all <base> elements are visited and the last one with an @href wins.
                    # Confirm whether a 'break' (first one wins) was intended.
                    continue
        elif self.options.host_language in accept_xml_base and node.hasAttribute(
                "xml:base"):
            self.base = remove_frag_id(node.getAttribute("xml:base"))

        # If no local setting for base occurs, the input argument has it
        if self.base == "":
            self.base = base

        # Perform an extra beautification in RDFLib
        # (binds the prefixes registered for this host language to the graph; note that the local
        # name 'dict' hides the builtin within this method)
        if self.options.host_language in beautifying_prefixes:
            dict = beautifying_prefixes[self.options.host_language]
            for key in dict:
                graph.bind(key, dict[key])

        # Record the input characteristics as an informational message; done at the top level only
        input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
            self.options.host_language, self.rdfa_version, self.base)
        self.options.add_info(input_info)

    #-----------------------------------------------------------------
    # this will be used repeatedly, better store it once and for all...
    self.parsedBase = urlsplit(self.base)

    #-----------------------------------------------------------------
    # generate and store the local CURIE handling class instance
    self.term_or_curie = TermOrCurie(self, graph, inherited_state)

    #-----------------------------------------------------------------
    # Settling the language tags
    # NOTE(review): the original remark here said that @lang has priority over @xml:lang, but the code
    # below does the opposite for the (X)HTML family: a present @xml:lang is used, and @lang is only
    # looked at when @xml:lang is absent.
    # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
    # first get the inherited state's language, if any
    if inherited_state:
        self.lang = inherited_state.lang
    else:
        self.lang = None

    self.supress_lang = False

    if self.options.host_language in [
            HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5
    ]:
        # we may have lang and xml:lang
        if node.hasAttribute("lang"):
            lang = node.getAttribute("lang").lower()
        else:
            lang = None
        if node.hasAttribute("xml:lang"):
            xmllang = node.getAttribute("xml:lang").lower()
        else:
            xmllang = None
        # First of all, set the value, if any
        # (an attribute that is present but empty resets the language to None instead of inheriting it)
        if xmllang != None:
            # this has priority
            if len(xmllang) != 0:
                self.lang = xmllang
            else:
                self.lang = None
        elif lang != None:
            if len(lang) != 0:
                self.lang = lang
            else:
                self.lang = None
        # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
        # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
        # error situations are simply swallowed...

    elif self.options.host_language in accept_xml_lang and node.hasAttribute(
            "xml:lang"):
        self.lang = node.getAttribute("xml:lang").lower()
        # an empty xml:lang means "no language"
        if len(self.lang) == 0:
            self.lang = None

    #-----------------------------------------------------------------
    # Set the default namespace. Used when generating XML Literals
    # The node's own @xmlns wins; otherwise the inherited value, if any, is kept
    if node.hasAttribute("xmlns"):
        self.defaultNS = node.getAttribute("xmlns")
    elif inherited_state and inherited_state.defaultNS != None:
        self.defaultNS = inherited_state.defaultNS
    else:
        self.defaultNS = None
def return_graph(uri, options, newCache=False):
    """Parse a file, and return an RDFLib Graph. The URI's content type is checked and either one of
    RDFLib's parsers is invoked (for the Turtle, RDF/XML, and N Triple cases) or a separate RDFa processing is invoked
    on the RDFa content.

    The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then HTML (RDFa), in case
    content negotiation is used.

    This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.

    @param uri: URI for the graph
    @param options: used as a place where warnings can be sent
    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
    @return: A tuple consisting of an RDFLib Graph instance and an expiration date. The tuple is C{(None, None)} if the
    URI could not be dereferenced. If the media type is not recognised the graph is C{None}. If the content could
    not be parsed a warning is added to C{options} and the graph is whatever had been created before the failure (a
    possibly incomplete Graph for Turtle, RDF/XML and N Triples; C{None} for RDFa).
    """
    def return_to_cache(msg):
        # 'msg' (the caught exception) is accepted but not part of the warning text; only the URI is reported
        if newCache:
            options.add_warning(err_unreachable_vocab % uri,
                                warning_type=VocabReferenceError)
        else:
            options.add_warning(err_outdated_cache % uri,
                                warning_type=VocabReferenceError)

    retval = None

    try:
        content = URIOpener(
            uri, {
                'Accept':
                'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'
            })
    except (HTTPError, RDFaError, Exception) as err:
        # Whatever went wrong while dereferencing is handled the same way: warn and give up
        return_to_cache(err)
        return (None, None)

    # Store the expiration date of the newly accessed data
    expiration_date = content.expiration_date
    media_type = content.content_type

    if media_type == MediaTypes.turtle:
        try:
            retval = Graph()
            retval.parse(content.data, format="n3")
        except Exception as err:
            options.add_warning(err_unparsable_Turtle_vocab % (uri, err))
    elif media_type == MediaTypes.rdfxml:
        try:
            retval = Graph()
            retval.parse(content.data)
        except Exception as err:
            # NOTE(review): this reports an RDF/XML failure with the Turtle message; kept as is because
            # no RDF/XML specific message is visible from here - confirm and replace if one exists
            options.add_warning(err_unparsable_Turtle_vocab % (uri, err))
    elif media_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except Exception as err:
            options.add_warning(err_unparsable_ntriples_vocab % (uri, err))
    elif media_type in [
            MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml
    ] or xml_application_media_type.match(media_type) is not None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            # The vocabulary document is processed with default options of its own. These must not be
            # bound to the name 'options': the warning below has to reach the caller's options object.
            retval = pyRdfa(Options()).graph_from_source(content.data)
        except Exception as err:
            options.add_warning(err_unparsable_rdfa_vocab % (uri, err))
    else:
        options.add_warning(err_unrecognised_vocab_type % (uri, media_type))

    return (retval, expiration_date)
elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, transformers=extras, embedded_rdf=embedded_rdf, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, vocab_cache_report=vocab_cache_report, refresh_vocab_cache=refresh_vocab_cache, check_lite=check_lite, experimental_features=True) processor = pyRdfa(options, base) if len(value) >= 1: print processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: print processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
output_processor_graph = True elif a == "default": output_default_graph = True output_processor_graph = False else: usage() sys.exit(1) except: usage() sys.exit(1) options = Options(output_default_graph=output_default_graph, output_processor_graph=output_processor_graph, space_preserve=space_preserve, vocab_cache_report=vocab_cache_report, bypass_vocab_cache=bypass_vocab_cache, transformers=extras, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache, hturtle=hturtle) processor = pyRdfa(options, base) if len(value) >= 1: retval = processor.rdf_from_sources(value, outputFormat=format, rdfOutput=rdfOutput) else: retval = processor.rdf_from_source(sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
(type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_Turtle_vocab % (uri, value)) elif content.content_type == MediaTypes.nt: try: retval = Graph() retval.parse(content.data, format="nt") except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_ntriples_vocab % (uri, value)) elif content.content_type in [ MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml ] or xml_application_media_type.match(content.content_type) != None: try: from pyRdfa import pyRdfa from pyRdfa.options import Options options = Options() retval = pyRdfa(options).graph_from_source(content.data) except: (type, value, traceback) = sys.exc_info() options.add_warning(err_unparsable_rdfa_vocab % (uri, value)) else: options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type)) return (retval, expiration_date) ############################################################################################ type = ns_rdf["type"] Property = ns_rdf["Property"] Class = ns_rdfs["Class"]