Example #1
0
def parse(filename):
    # parse file and return document root
    try:
        doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(filename))
        return doc.documentElement
    except Ft.FtException, e:
        raise Error(_("File '%s' has invalid XML: %s") % (filename, str(e)))
Example #2
0
def parse(filename):
    # parse file and return document root
    try:
        doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(filename))
        return doc.documentElement
    except Ft.FtException, e:
        raise Error(_("File '%s' has invalid XML: %s") % (filename, str(e)))
Example #3
0
 def readxml(self, uri, tmpDir='/tmp', sha1sum=False, compress=None, sign=None):
     uri = File.make_uri(uri)
     localpath = File.download(uri, tmpDir,sha1sum=sha1sum,compress=compress,sign=sign)
     try:
         self.doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(localpath))
         return self.doc.documentElement
     except Ft.FtException, e:
         raise Error(_("File '%s' has invalid XML: %s") % (localpath, str(e)) )
Example #4
0
 def readxml(self, uri, tmpDir="/tmp"):
     uri = File.make_uri(uri)
     localpath = File.download(uri, tmpDir)
     try:
         self.doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(localpath))
         return self.doc.documentElement
     except Ft.FtException, e:
         raise Error(_("File '%s' has invalid XML: %s") % (localpath, str(e)))
Example #5
0
 def readxml(self,
             uri,
             tmpDir='/tmp',
             sha1sum=False,
             compress=None,
             sign=None):
     uri = File.make_uri(uri)
     localpath = File.download(uri,
                               tmpDir,
                               sha1sum=sha1sum,
                               compress=compress,
                               sign=sign)
     try:
         self.doc = NoExtDtdReader.parseUri(
             Ft.Lib.Uri.OsPathToUri(localpath))
         return self.doc.documentElement
     except Ft.FtException, e:
         raise Error(
             _("File '%s' has invalid XML: %s") % (localpath, str(e)))
Example #6
0
def cdom(name):
    """Parse the file at OS path *name* with ``NoExtDtdReader.parseUri``.

    NOTE(review): the parsed document is only bound to a local and
    nothing is returned in the code visible here -- the snippet may be
    truncated; confirm against the full source.
    """
    source_uri = Ft.Lib.Uri.OsPathToUri(name)
    doc = NoExtDtdReader.parseUri(source_uri)
Example #7
0
    def load(self, webget):
        """
        
        >>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph())
        >>> g.load(WebMemo())
        >>> g.dom.documentElement.localName
        u'purchaseOrder'
        """
        if self.dom: return

        lastUri, (content, self.headers) = webget(self.url,
                                       (RDF_MT,
                                        XML_MT, XML_text_MT,
                                        XHTML_MT, XHTML_text_MT))
        if lastUri != self.url:
            ##We want the retrieval URL even in the face of a redirect
            self.url = lastUri             
        parsedAsRDF = False
        
        #Until we peak in for a base, use the one given or the retrieval URL
        initialBase = self.baseURI and self.baseURI or self.url
        
        #peek in response headers to determine content-type
        #NOTE we need to attempt to parse the source as RDF/XML regardless (by the base case rule):
        #If an information resource IR is represented by a conforming RDF/XML document[RDFX], then 
        #the RDF graph represented by that document is a GRDDL result of IR.
        if self.headers['content-type'].startswith(RDF_MT):
            try:
                self.graph.parse(StringIO(content),publicID=initialBase)
                parsedAsRDF = True
            except:
                pass
            self.dom = None

        try:
            if self.DEBUG:
                print >>sys.stderr, "Parsing XML content WRT baseURI of %s"%(initialBase)
            self.dom = XMLParser.parseString(content,
                                             initialBase,
                                             processIncludes=self.useXInclude)
            #WG consensus is to follow XML Base.  This bottoms out in using
            #the base URI of the root node once the parser has been given
            #the base that HTTP indicates via RFC 3986
            #Note: this interpretation is based off the assumption that the
            #encapsulating context for a GRDDL result is the root node of the
            #source document
            #See: http://4suite.org/docs/CoreManual.xml#base_URIs
            if self.baseURI is None:
                self.baseURI = self.dom.xpath(u'/*')[0].baseURI
                if self.DEBUG:
                    print >>sys.stderr,\
                     "Adopting the baseURI of the root node: %s"%(self.baseURI)
            
            #Note, if an XHTML Base is embedded, it needs to be respected also
            for htmlBase in self.dom.xpath(u'/xhtml:html/xhtml:head/xhtml:base/@href',{u'xhtml': XHTML_NS}):
                if self.DEBUG:
                    print >>sys.stderr, "Found an XHTML Base: %s"%(htmlBase.value)
                self.baseURI = htmlBase.value
                        
            #WG consensus is that we should peek into XML content for rdf:RDF
            #at the root, if we find it we need to attempt a parse as RDF/XML
            if not parsedAsRDF and self.dom.xpath(u'/rdf:RDF',{u'rdf':str(RDF.RDFNS)}):
                try:
                    self.graph.parse(StringIO(content), publicID=self.baseURI)
                except:
                    pass
                        
        except Exception, e: #@@ narrow exception
            if self.DEBUG:
                print >>sys.stderr, "Unable to parse ", self.baseURI, repr(e)
            #Unable to glean.  Fail gracefully..
            self.dom = None
Example #8
0
    def transform(self, transformURLs, webget):
        """
        Takes a space separated list of transform URLs and applies
        them against the pre-parsed DOM of the GRDDL source - making
        sure to avoid transformations already applied

        Each URL not yet listed in self.appliedTransforms is recorded
        there, resolved against self.baseURI, fetched through webget and
        run as an XSLT stylesheet over self.dom.  The result is parsed
        into self.graph as RDF/XML (output method 'xml') or Notation 3
        (output method 'text').

        transformURLs -- whitespace separated string of stylesheet URLs
        webget -- callable invoked as webget(url, acceptedMediaTypes); its
                  result is unpacked as (lastUri, (content, info))

        Raises Exception("unsupported output type") when the stylesheet's
        output method is neither 'xml' nor 'text'.  A text result that
        cannot be parsed as Notation 3 is ignored (reported on stderr
        when self.DEBUG is set).
        """
        for xformURL in transformURLs.split():
            if xformURL in self.appliedTransforms:
                #Already applied against this source - do not run it again
                if self.DEBUG:
                    print >>sys.stderr, \
                      "skipping already applied transformation %s" % (xformURL)
                continue
            if self.DEBUG:
                print >>sys.stderr, "applying transformation %s" % (xformURL)
            self.appliedTransforms.append(xformURL)
            #The transform url is resolved against the source URL (to
            #accommodate relative urls)
            stylesheetLoc = Absolutize(xformURL, self.baseURI)
            lastUri, (content, info) = webget(stylesheetLoc, (XSLT_MT,))
            stylesheet = InputSource.DefaultFactory.fromString(content,
                                                               stylesheetLoc)
            processor = Processor.Processor()
            processor.appendStylesheet(stylesheet)
            #see: http://www.w3.org/TR/grddl/#stylepi
            #Note, for the XSLT transform, the base URI of the source document
            #is passed in, instead of the base URI of the root node
            result = processor.runNode(self.dom, self.url, ignorePis=1)
            #get output method / media-type
            #Reference - the xsl:output top-level element:
            #    <xsl:output
            #      method = "xml" | "html" | "text" | qname-but-not-ncname
            #      version = nmtoken
            #      encoding = string
            #      omit-xml-declaration = "yes" | "no"
            #      standalone = "yes" | "no"
            #      doctype-public = string
            #      doctype-system = string
            #      cdata-section-elements = qnames
            #      indent = "yes" | "no"
            #      media-type = string />

            #How to accommodate @media-type?
            method = processor.outputParams.method[-1]
            #Graph size before parsing, used only for the debug triple count
            currLen = len(self.graph)
            if method == 'xml':
                self.graph.parse(StringIO(result),
                                 publicID=self.baseURI)
                if XMLParser.parseString(result, self.baseURI).xpath('//@xml:base'):
                    import warnings
                    warnings.warn(
                      "RDFLib 2.4.0 may not be resolving relative xml:base values")
                self._rebaseEmptySubjects()
                if self.DEBUG:
                    print >>sys.stderr,\
                     "Parsed %s triples (using baseURI: %s) as RDF/XML" % (
                        max(0,len(self.graph) - currLen),self.baseURI)
            elif method == 'text':
                #Attempt a Notation 3 parse (covers NTriples, and Turtle)
                try:
                    self.graph.parse(StringIO(result), format='n3',
                                     publicID=self.baseURI)
                    self._rebaseEmptySubjects()
                    if self.DEBUG:
                        print >>sys.stderr, \
                        "Parsed %s triples (using baseURI: %s) as Notation 3" % (
                            max(0,len(self.graph) - currLen),self.baseURI)
                except Exception:
                    #Best effort: an unparsable text result is skipped.  Only
                    #Exception is caught so that KeyboardInterrupt /
                    #SystemExit still propagate.
                    if self.DEBUG:
                        print >>sys.stderr, "Unknown text-based RDF serialization"
            else:
                #HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")

    def _rebaseEmptySubjects(self):
        """
        Re-adds every statement in self.graph whose subject is URIRef('')
        with URIRef(self.baseURI) as its subject instead.

        @@This is mostly as a workaround for RDFLib 2.4 which will
        force an empty URI string as the subject if xml:base = ''
        """
        #Collect the replacements first, before the graph is modified
        replace = [(URIRef(self.baseURI),p,o,self.graph) for s,p,o in
                       self.graph.triples((URIRef(''),None,None))]
        if replace:
            if self.DEBUG:
                print >>sys.stderr, \
                  "Replacing empty string URI ref with %s" % (
                    self.baseURI)
            self.graph.remove((URIRef(''),None,None))
            self.graph.addN(replace)
Example #9
0
def cdom(name):
    """Parse the file at OS path *name* with ``NoExtDtdReader.parseUri``.

    NOTE(review): the parsed document is only bound to a local and
    nothing is returned in the code visible here -- the snippet may be
    truncated; confirm against the full source.
    """
    document_uri = Ft.Lib.Uri.OsPathToUri(name)
    doc = NoExtDtdReader.parseUri(document_uri)