Exemple #1
0
    def startElementNS(self, name, qname, attrs):
        """Handle a namespace-aware start tag and advance the parser state.

        The current state is pushed on ``self._stack``, xml:base, xml:lang and
        rdf:datatype are picked up from the attributes, and the element is
        then dispatched on ``self._state``.

        name  -- (namespace URI, local name) pair of the element; the
                 namespace part may be empty or None.
        qname -- qualified name; here it is only forwarded to the XML-literal
                 handlers and quoted in error messages.
        attrs -- attribute mapping keyed by (namespace URI, local name) pairs;
                 must provide get() and items().

        Raises BadSyntax for a forbidden predicate URI, a nodeID that is not
        an NCName, or an element found where no value is expected; ValueError
        for an ID attribute on a property element; SyntaxError for an unknown
        parseType; RuntimeError when the parser state is not recognised.
        """

        if self._state != STATE_LITERAL:
            self.flush()
        self.bnode = None

        tagURI = ((name[0] or "") + name[1])

        if verbosity() > 80:
            indent = ". " * len(self._stack)
            if not attrs:
                progress(indent+'# State was', self._state, ', start tag: <' + tagURI + '>')
            else:
                # Dedicated loop names: rebinding `name` here used to corrupt
                # the element name passed to the literal handlers and quoted
                # in the STATE_NOVALUE error further down.
                msg = '# State =%s, start tag= <%s ' % (self._state, tagURI)
                for attrName, attrValue in attrs.items():
                    msg = msg + "  " + repr(attrName) + '=' + '"' + repr(attrValue) + '"'
                progress(indent + msg + '>')

        # Snapshot of the enclosing element's state; slots 0, 2 and 3 (state,
        # predicate, subject) are patched by some of the branches below.
        self._stack.append([self._state, self._context, self._predicate,
                            self._subject, self._delayedStatement, self._base])

        self._delayedStatement = None

        self._base = uripath.join(self._base, attrs.get((XML_NS_URI, "base"), self._base))
        hashPos = self._base.find("#")
        if hashPos >= 0:
            self._base = self._base[:hashPos]  # See rdf-tests/rdfcore/xmlbase/test013.rdf

        try:
            # If relative, make absolute.  Not needed for standard RDF/XML;
            # needed for portable RDF generated with --rdf=z
            tagURI = uripath.join(self._base, tagURI)
        except ValueError:
            pass

        self._language = attrs.get((XML_NS_URI, "lang"), None)

        datatypeURI = attrs.get((RDF_NS_URI, "datatype"), None)
        if datatypeURI is not None:
            self._datatype = self.sink.newSymbol(self.uriref(datatypeURI))
        else:
            self._datatype = None

        if self._state == STATE_OUTERMOST:
            if tagURI == RDF_NS_URI + "RDF":
                self._state = STATE_NO_SUBJECT
            else:
                if "R" not in self.flags:
                    self._state = STATE_NOT_RDF           # Ignore random XML without rdf:RDF
                else:
                    self._nodeElement(tagURI, attrs)    # Parse it as RDF.
                # http://www.w3.org/2000/10/rdf-tests/rdfcore/rdf-element-not-mandatory/test001.rdf

        elif self._state == STATE_NOT_RDF:
            if tagURI == RDF_NS_URI + "RDF" and "T" in self.flags:
                self._state = STATE_NO_SUBJECT
            else:
                pass                    # Ignore embedded RDF

        elif self._state == STATE_NO_SUBJECT:  #MS1.0 6.2 obj :: desription | container
            self._nodeElement(tagURI, attrs)

        elif self._state == STATE_DESCRIPTION:   # Expect predicate (property) PropertyElt
            #  propertyElt #MS1.0 6.12
            #  http://www.w3.org/2000/03/rdf-tracking/#rdf-containers-syntax-ambiguity
            if tagURI == RDF_NS_URI + "li":
                # rdf:li becomes rdf:_n, numbered per enclosing node element.
                item = self._items[-1] + 1
                self._predicate = self.sink.newSymbol("%s_%s" % (RDF_NS_URI, item))
                self._items[-1] = item
            else:
                if tagURI in propertyElementExceptions:
                    raise BadSyntax(sys.exc_info(), 'Invalid predicate URI: %s' % tagURI)
                self._predicate = self.sink.newSymbol(tagURI)

            self._state = STATE_VALUE  # May be looking for value but see parse type
            self.testdata = ""         # Flush value data

            properties = []     # (ns, local name, value) property attributes, applied last
            gotSubject = 0
            haveResource = 0
            haveParseType = 0
            haveExtras = 0
            for attrKey, value in attrs.items():
                ns, ln = attrKey
                if ln == "ID":
                    print("# Warning: ID=%s on statement ignored" % (value)) # I consider these a bug
                    raise ValueError("ID attribute?  Reification not supported.")
                elif ln == "parseType":
                    haveParseType = 1
                    if value == "Resource":
                        c = self._context
                        s = self._subject
                        self.idAboutAttr(attrs) #@@ not according to current syntax @@@@@@@@@@@
                        self.sink.makeStatement(( c, self._predicate, s, self._subject), why=self._reason2)
                        self._state = STATE_DESCRIPTION  # Nest description

                    elif value == "Quote":
                        c = self._context
                        s = self._subject
                        self.idAboutAttr(attrs)  # set subject and context for nested description
                        self._subject = self.sink.newFormula()  # Forget anonymous genid - context is subect
                        if self._predicate is self.merge: # magic :-(
                            self._stack[-1][3] = self._subject  # St C P S retrofit subject of outer level!
                            self._delayedStatement = 1 # flag
                        else:
                            self._delayedStatement = c, self._predicate, s, self._subject
                        self._context = self._subject
                        self._subject = None
                        self._state = STATE_NO_SUBJECT  # Inside quote, there is no subject

                    elif (value == "Collection" or
                          value[-11:] == ":collection"):  # Is this a daml:collection qname?
                        self._state = STATE_LIST  # Linked list of obj's

                    elif value == "Literal" or "S" in self.flags:  # Strictly, other types are literal SYN#7.2.20
                        self._state = STATE_LITERAL # That's an XML subtree not a string
                        self._litDepth = 1
                        self.LiteralNS = [{}]
                        self.testdata = '' #"@@sax2rdf.py bug@@" # buggy implementation
                        self._datatype = self.sink.newSymbol("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral")
                        if XMLLiteralsAsDomTrees:
                            self.domDocument = self.domImplementation.createDocument(
                                'http://www.w3.org/1999/02/22-rdf-syntax-ns', 'envelope', None)
                            self.domElement = self.domDocument.documentElement
                    else:
                        raise SyntaxError("Unknown parse type '%s'" % value )
                elif ln == "nodeID":
                    assert not gotSubject
                    if not isXML.isNCName(value):
                        raise  BadSyntax(sys.exc_info(), 'A nodeID must be a NCName %s' % value)
                    # Reuse the blank node already allocated for this nodeID, if any.
                    obj = self._nodeIDs.get(value, None)
                    if obj is None:
                        obj = self.newBlankNode()
                        self._nodeIDs[value] = obj
                    self.sink.makeStatement((self._context,
                                             self._predicate,
                                             self._subject,
                                             obj ), why=self._reason2)
                    self._state = STATE_NOVALUE  # NOT looking for value
                    self._subject = obj
                    gotSubject = 1
                elif ln == "resource":
                    haveResource = 1
                    assert not gotSubject
                    resourceTerm = self.sink.newSymbol(self.uriref(value))
                    self.sink.makeStatement((self._context,
                                             self._predicate,
                                             self._subject,
                                             resourceTerm ), why=self._reason2)
                    self._state = STATE_NOVALUE  # NOT looking for value
                    self._subject = resourceTerm
                    gotSubject = 1
                elif ln == "datatype":
                    pass # Already set
                elif ns == XML_NS_URI or ln[:3] == "xml":  #  Ignore (lang is already done)
                    pass # see rdf-tests/rdfcore/unrecognised-xml-attributes/test002.rdf
                else:
                    haveExtras = 1
                    properties.append((ns, ln, value)) # wait till subject is clear
                assert haveResource + haveParseType  <= 1
                assert haveParseType + haveExtras <= 1
            if not gotSubject and properties:
                # Property attributes without an explicit object: hang them
                # on a fresh blank node that becomes the object.
                obj = self.newBlankNode()
                self.sink.makeStatement((self._context,
                                         self._predicate,
                                         self._subject,
                                         obj ), why=self._reason2)
                self._state = STATE_NOVALUE  # NOT looking for value
                self._subject = obj

            for ns, ln, value in properties:
                self._propertyAttr(ns, ln, value)

        elif self._state == STATE_LIST:   # damlCollection :: objs - make list
            # Subject and predicate are set and dangling.
            c = self._context
            s = self._subject  # The tail of the list so far
            p = self._predicate
            pair = self.newBlankNode()        # The new pair
            self.sink.makeStatement(( c,   # Link in new pair
                                      p,
                                      s,
                                      pair ), why=self._reason2)
            self.idAboutAttr(attrs)  # set subject (the next item) and context
            if tagURI != RDF_NS_URI + "Description":
                # A typed node element: record its rdf:type.
                self.sink.makeStatement((c,
                                         self.sink.newSymbol(RDF_NS_URI
                                                             +"type"),
                                         self._subject,
                                         self.sink.newSymbol(tagURI) ),
                                        why=self._reason2)

            self.sink.makeStatement(( c,
                                      self.sink.newSymbol(List_NS + "first"),
                                      pair,
                                      self._subject), why=self._reason2) # new item
            if "S" in self.flags: # Strictly to spec
                # NOTE(review): unlike the other makeStatement calls here,
                # this puts List_NS+"List" in the subject slot and the item
                # in the object slot -- confirm whether `pair rdf:type List`
                # was intended.
                self.sink.makeStatement(( c,
                                        self.sink.newSymbol(RDF_NS_URI + "type"),
                                        self.sink.newSymbol(List_NS + "List"),
                                        self._subject), why=self._reason2) # new item

            self._stack[-1][2] = self.sink.newSymbol(List_NS + "rest")  # Leave dangling link   #@check
            self._stack[-1][3] = pair  # Underlying state tracks tail of growing list

        elif self._state == STATE_VALUE:   # Value :: Obj in this case #MS1.0 6.17  6.2
            c = self._context
            p = self._predicate
            s = self._subject
            self._nodeElement(tagURI, attrs)   # Parse the object thing's attributes
            self.sink.makeStatement((c, p, s, self._subject), why=self._reason2)

            self._stack[-1][0] = STATE_NOVALUE  # When we return, cannot have literal now

        elif self._state == STATE_NOVALUE:
            stackDump = "".join(repr(e) + "\n" for e in self._stack)
            raise BadSyntax(sys.exc_info(), """Expected no value, found name=%s; qname=%s, attrs=%s
            in nested context:\n%s""" %(name, qname, attrs, stackDump))

        elif self._state == STATE_LITERAL:
            self._litDepth = self._litDepth + 1
            if XMLLiteralsAsDomTrees:
                self.literal_element_start_DOM(name, qname, attrs)
            else:
                self.literal_element_start(name, qname, attrs)
            #@@ need to capture the literal
        else:
            raise RuntimeError("Unknown state in RDF parser", self._stack) # Unknown state
Exemple #2
0
    def idAboutAttr(self, attrs):  #MS1.0 6.5 also proprAttr 6.10
        """Set up the subject, and maybe the context, from node attributes.

        Resets ``self._subject``, switches the parser to STATE_DESCRIPTION and
        pushes a fresh counter on ``self._items``.  The subject is taken from
        rdf:ID, rdf:about or rdf:nodeID; when none is given a new blank node
        is used.  Remaining property attributes are then emitted as
        statements about that subject (rdf:type as a symbol, everything else
        as a literal).

        attrs -- attribute mapping keyed by (namespace URI, local name) pairs;
                 must provide items().

        Raises BadSyntax for malformed or duplicate IDs, more than one subject
        attribute, an unsupported aboutEachPrefix, a forbidden RDF attribute,
        or an un-namespaced property attribute when the "L" flag is not set.
        """
        self._subject = None
        self._state = STATE_DESCRIPTION
        self._items.append(0)
        properties = []     # (predicate URI, value) pairs, emitted once the subject is known

        for name, value in attrs.items():
            ns, ln = name

            # A former kludge that forced ID/about/bagID/type attributes from
            # other namespaces into the RDF namespace was removed
            # (timbl 2010-07-19) because it creaked with sioc:about.

            if ns == RDF_NS_URI or ns is None:   # Opinions vary sometimes none but RDF_NS is common :-(
                if ln == "ID":
                    if not isXML.isName(value):
                        raise  BadSyntax(sys.exc_info(), 'An ID must be a Name %s' % value)
                    # IDs must be unique per base URI.
                    if (self._base, value) in self._usedIDs:
                        raise BadSyntax(sys.exc_info(), "Two elements cannot have the same ID, %s" % value)
                    self._usedIDs.add((self._base, value))
                    if self._subject:
                        print("# oops - subject already %s" % (self._subject,))
                        raise BadSyntax(sys.exc_info(), ">1 subject")
                    self._subject = self.sink.newSymbol(self.uriref("#" + value))
                elif ln == "about":
                    if self._subject: raise BadSyntax(sys.exc_info(),
                        "Subject already defined to be %s, can't have attribute about='%s'" %
                        (repr(self._subject), value))
                    self._subject = self.sink.newSymbol(self.uriref(value))
                elif ln == "nodeID":
                    if self._subject: raise BadSyntax(sys.exc_info(),
                        "Subject already defined to be %s, can't have attribute nodeID='%s'" %
                        (repr(self._subject), value))
                    if not isXML.isNCName(value):
                        raise  BadSyntax(sys.exc_info(), 'A nodeID must be a NCName %s' % value)
                    # Reuse the blank node already allocated for this nodeID, if any.
                    s = self._nodeIDs.get(value, None)
                    if s is None:
                        s = self.newBlankNode()
                        self._nodeIDs[value] = s
                    self._subject = s
                elif ln == "aboutEachPrefix":
                    if value == " ":  # OK - a trick to make NO subject
                        self._subject = None
                    else:
                        # Can't do about each prefix yet.  This used to raise
                        # an undefined name, which surfaced as a NameError.
                        raise BadSyntax(sys.exc_info(),
                            "aboutEachPrefix='%s' is not supported" % value)
                elif ln == "bagID":
                    if not isXML.isName(value):
                        raise  BadSyntax(sys.exc_info(), 'A bagID must be a Name %s' % value)
                    #@@dwc: this is broken, no?  (the context becomes a plain tuple)
                    self._context = FORMULA, self.uriref("#" + value) #@@ non-ascii
                elif ln == "parseType":
                    pass  #later - object-related
                elif ln == "resource":
                    pass  #later
                elif ln == "datatype":
                    pass  #later
                elif RDF_NS_URI + ln in propertyAttributeExceptions:
                    # Parenthesised: "%" binds tighter than "+", so the local
                    # name used to be appended after the formatted message.
                    raise BadSyntax(sys.exc_info(), "%s is not a valid attribute named here" % (RDF_NS_URI + ln))
                else:
                    if not ns:
                        if "L" not in self.flags:  # assume local?
                            raise BadSyntax(sys.exc_info(), "No namespace on property attribute %s" % ln)
                        properties.append((self._thisDoc + "#" + ln, value))
                    else:
                        properties.append((RDF_NS_URI + ln, value))# If no uri, syntax error @@
            elif ns == XML_NS_URI:
                pass    # lang already done, others ignored

            else:  # Property attribute propAttr #MS1.0 6.10
                uri = ns + ln
                properties.append((uri, value))

        if self._subject is None:
            self._subject = self.newBlankNode()
        for pred, obj in properties:
            if pred == RDF_NS_URI + "type":
                # rdf:type values are URI references, not literals.
                self.sink.makeStatement(( self._context,
                                        self.sink.newSymbol(pred),
                                        self._subject,
                                        self.sink.newSymbol(self.uriref(obj)) ), why=self._reason2)
            else:
                # A datatyped literal carries no language tag.
                dt = self._datatype
                if dt is None: lang = self._language
                else: lang = None
                self.sink.makeStatement(( self._context,
                                        self.sink.newSymbol(pred),
                                        self._subject,
                                        self.sink.newLiteral(obj, dt, lang) ), why=self._reason2)