def runNamespace(): "Return a URI suitable as a namespace for run-local objects" # @@@ include hostname (privacy?) (hash it?) global runNamespaceValue if runNamespaceValue == None: try: runNamespaceValue = environ["CWM_RUN_NS"] except KeyError: runNamespaceValue = uripath.join( uripath.base(), ".run-" + `time()` + "p"+ `getpid()` +"#") runNamespaceValue = join(base(), runNamespaceValue) # absolutize return runNamespaceValue
def __init__(self, name, store=None):
    """Hold an absolute name; relative names are resolved against the base URI."""
    # No ':' means no URI scheme, i.e. a relative reference —
    # make it absolute before storing it.
    if ':' not in name:
        name = uripath.join(uripath.base(), name)
    self._name = name
    self.store = store
    self._seen = {}
def bind(self, pfx, ref):
    """Register a namespace prefix binding and forward it to the sink."""
    stripped = ref[1:-1]                      # drop the surrounding <...>
    addr = uripath.join(self._baseURI, stripped)
    #@@ check for pfx already bound?
    self._sink.bind(pfx, addr)
    self._prefixes[pfx] = addr
def __init__(self, sink, openFormula, thisDoc, baseURI=None, flags="", why=None):
    """Initialise RDF/XML parser state.

    sink:        receives triples via makeStatement(); may be falsy for a dry run
    openFormula: formula to parse into, or None to create one named thisDoc#_formula
    thisDoc:     URI of the document being parsed
    baseURI:     overrides thisDoc as the base for relative-URI resolution
    flags:       option characters ("D" assumes a default namespace declaration)
    why:         provenance reason, threaded through to generated triples

    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    self.testdata = ""
    if XMLLiteralsAsDomTrees:
        # DOM machinery is only needed when XML literals are kept as trees
        self.domImplementation = xml.dom.getDOMImplementation()
        self.domDocument = None
        self.domElement = None
    self.flags = flags
    self._stack = []                # Stack of states
    self._nsmap = []                # stack of namespace bindings
    self._prefixMap = []
    self.LiteralNS = None
    self._delayedStatement = None
    self.sink = sink
    self._thisDoc = thisDoc
    if baseURI != None:
        self._base = baseURI
    else:
        self._base = thisDoc
    self._state = STATE_OUTERMOST   # Maybe should ignore RDF outside <rdf:RDF>??
    if sink:
        if openFormula == None:
            self._context = sink.newFormula(thisDoc + "#_formula")
        else:
            self._context = openFormula
        self._formula = self._context   # Root formula
        self._genPrefix = uripath.join(thisDoc, "#_rdfxg")  # allow parameter override?
        self.sink.setGenPrefix(self._genPrefix)
        self.sink.startDoc()
        self.merge = self.sink.newSymbol(NODE_MERGE_URI)
    else:
        self._context = None
    self._reason = why              # Why the parser w
    self._reason2 = None            # Why these triples
    if diag.tracking:
        self._reason2 = BecauseOfData(sink.newSymbol(thisDoc), because=self._reason)
    self._subject = None
    self._predicate = None
    self._datatype = None
    self._language = None
    self._nodeIDs = {}
    self._items = []                # for <rdf:li> containers
    self._litDepth = 0
    version = "$Id: sax2rdf.py,v 1.52 2007/06/26 02:36:15 syosi Exp $"
    # self.sink.makeComment("RDF parsed by "+version[1:-1])
    if "D" in self.flags:           # Assume default namespace declaration
        if sink:
            self.sink.setDefaultNamespace(self._thisDoc + "#")
        self._nsmap = [{"": "#"}]
def __init__(self, sink, openFormula, thisDoc, baseURI=None, flags="", why=None):
    """Initialise RDF/XML parser state.

    sink:        receives triples via makeStatement(); may be falsy for a dry run
    openFormula: formula to parse into, or None to create one named thisDoc#_formula
    thisDoc:     URI of the document being parsed
    baseURI:     overrides thisDoc as the base for relative-URI resolution
    flags:       option characters ("D" assumes a default namespace declaration)
    why:         provenance reason, threaded through to generated triples

    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    self.testdata = ""
    if XMLLiteralsAsDomTrees:
        # DOM machinery is only needed when XML literals are kept as trees
        self.domImplementation = xml.dom.getDOMImplementation()
        self.domDocument = None
        self.domElement = None
    self.flags = flags
    self._stack = []                # Stack of states
    self._nsmap = []                # stack of namespace bindings
    self._prefixMap = []
    self.LiteralNS = None
    self._delayedStatement = None
    self.sink = sink
    self._thisDoc = thisDoc
    if baseURI != None:
        self._base = baseURI
    else:
        self._base = thisDoc
    self._state = STATE_OUTERMOST   # Maybe should ignore RDF outside <rdf:RDF>??
    if sink:
        if openFormula == None:
            self._context = sink.newFormula(thisDoc + "#_formula")
        else:
            self._context = openFormula
        self._formula = self._context   # Root formula
        self._genPrefix = uripath.join(thisDoc, "#_rdfxg")  # allow parameter override?
        self.sink.setGenPrefix(self._genPrefix)
        self.sink.startDoc()
        self.merge = self.sink.newSymbol(NODE_MERGE_URI)
    else:
        self._context = None
    self._reason = why              # Why the parser w
    self._reason2 = None            # Why these triples
    if diag.tracking:
        self._reason2 = BecauseOfData(sink.newSymbol(thisDoc), because=self._reason)
    self._subject = None
    self._predicate = None
    self._datatype = None
    self._language = None
    self._nodeIDs = {}
    self._items = []                # for <rdf:li> containers
    self._litDepth = 0
    version = "$Id: sax2rdf.py,v 1.52 2007/06/26 02:36:15 syosi Exp $"
    # self.sink.makeComment("RDF parsed by "+version[1:-1])
    if "D" in self.flags:           # Assume default namespace declaration
        if sink:
            self.sink.setDefaultNamespace(self._thisDoc + "#")
        self._nsmap = [{"": "#"}]
def load(self, uri, baseURI=''):
    """Feed the parser from uri (resolved against baseURI), or stdin if uri is falsy."""
    if not uri:
        import sys
        self._sink.makeComment("Taking input from standard input")
        self.startDoc()
        self.feed(sys.stdin.read())
        self.endDoc()
        return
    import urllib
    uri = uripath.join(baseURI, uri)
    self._sink.makeComment("Taking input from " + uri)
    self.startDoc()
    self.feed(urllib.urlopen(uri).read())
    self.endDoc()
def doCommand():
    """Command line RDF/N3 crawler.

        crawl <uriref>

    options: See http://www.w3.org/2000/10/swap/doc/cwm for more documentation.

    NOTE(review): indentation reconstructed from collapsed one-line source.
    Mutates module globals `agenda` and `already`; reads `successes`.
    """
    global agenda
    global already
    uriref = sys.argv[1]
    uri = join(base(), uriref)      # absolutize the command-line argument
    r = symbol(uri)
    diag.setVerbosity(0)
    print "@prefix : <http://www.w3.org/2000/10/swap/util/semweb#>."
    print "# Generated by crawl.py ", cvsRevision[1:-1]
    agenda = [r]
    # Breadth-ish traversal: crawl() is expected to append new symbols to agenda
    while agenda != []:
        r = agenda[0]
        agenda = agenda[1:]
        already.append(r)
        crawl(r)
    print "# ", len(already), "attempts,", successes, "successes."
else: print "Ok for predictive parsing" #print "Branch table:", branchTable print "Literal terminals:", literalTerminals.keys() print "Token regular expressions:" for r in tokenRegexps: print "\t%s matches %s" % (r, tokenRegexps[r].pattern) yacc = open(argv[1] + "-yacc.y", "w") yaccConvert(yacc, document, tokenRegexps) #while agenda: # x = agenda[0] # agenda = agenda[1:] # already.append(x) # yaccProduction(yacc, x, tokenRegexps) yacc.close() if len(argv) <= 3: exit(0) parseFile = argv[3] ip = webAccess.urlopenForRDF(uripath.join(uripath.base(), parseFile), None) str = ip.read() sink = g.newFormula() p = PredictiveParser(sink=sink, top=document, branchTable=branchTable, tokenRegexps=tokenRegexps) p.parse(str) #ends
def crawlFrom(self, addr, prefix, max):
    """Breadth-limited crawl starting at addr, staying under the given URI prefix.

    Fetches up to `max` pages, records content-type / label / link triples
    into self._fmla, and follows hrefs whose URI starts with `prefix`.

    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    fmla = self._fmla
    iter = 1
    queue = [addr]
    seen = []
    while queue:
        head = queue.pop()
        progress("crawling at: ", head, " iter ", iter, " of ", max)
        iter = iter + 1
        if iter > max:
            progress ("max limit reached.")
            break
        seen.append(head)
        try:
            rep = urllib2.urlopen(head)
            content = rep.read()
        except IOError:
            progress("can't GET", head)
            continue
            #@@ makeStatement(head type NoGood)
        # try to find a short label for a diagram or some such.
        # try the last path segment,
        # or the 2nd last in case of an empty last segment...
        slash = head[:-1].rfind('/')
        label = head[slash+1:]
        ct = rep.info().getheader('content-type')
        progress("... got content of type ", ct)
        isHTML = ct.find('text/html') == 0
        fmla.add(symbol(head), symbol(DC('type')), literal(ct))
        # note that we're not peeking into the URI
        # to find out if it's HTML; we're just
        # eliding the extension in the case we
        # know (from the HTTP headers) that it's HTML.
        if isHTML and label[-5:] == '.html':
            label = label[:-5]
        fmla.add(symbol(head), symbol(RDFS('label')), literal(label))
        if not isHTML:
            continue
        progress("... parsing text/html content")
        doc = libxml2.htmlParseDoc(content, 'us-ascii')
        try:
            titles = doc.xpathNewContext().xpathEval('//title')
            title = titles[0].getContent()
        except: #@@figure out the right exceptions
            pass
        else:
            progress("... found title:", title)
            fmla.add(symbol(head), symbol(DC('title')), literal(str(title)) )
        hrefs = doc.xpathNewContext().xpathEval('//a/@href')
        progress("... found ", len(hrefs), " links")
        for h in hrefs:
            h = h.getContent()
            progress("... found href", h)
            i = uripath.join(head, h)       # resolve relative hrefs
            i = uripath.splitFrag(i)[0]     # discard fragment part
            progress("... found link", head, ' -> ', i)
            fmla.add(symbol(head), symbol(DC('relation')), symbol(i))
            if i[:len(prefix)] == prefix and i not in seen:
                queue.append(i)
def uri_ref2(self, str, i, res):
    """Generate uri from n3 representation.

    Note that the RDF convention of directly concatenating
    NS and local name is now used though I prefer inserting a '#'
    to make the namesapces look more like what XML folks expect.

    Appends the resulting node to `res` and returns the index just past
    the token, or -1 if no URI reference starts at `i`.
    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    qn = []
    j = self.qname(str, i, qn)
    if j >= 0:
        # qname form: pfx:ln
        pairFudge = qn[0]
        pfx = pairFudge[0]
        ln = pairFudge[1]
        if pfx is None:
            assertFudge(0, "not used?")
            ns = self._baseURI + ADDED_HASH
        else:
            ns = self._bindings[pfx]
            if not ns:  # @@ pyjs should test undefined
                if pfx == "_":  # Magic prefix 2001/05/30, can be overridden
                    res.append(self.anonymousNode(ln))
                    return j
                raise BadSyntax(self._thisDoc, self.lines, str, i,
                                'Prefix ' + pfx + ' not bound.')
        symb = self._store.newSymbol(ns + ln)
        if symb in self._variables:
            res.append(self._variables[symb])
        else:
            res.append(symb)  # @@@ "#" CONVENTION
        return j

    i = self.skipSpace(str, i)
    if i < 0:
        return -1

    if str[i] == "?":
        # ?variable
        v = []
        j = self.variable(str, i, v)
        if j > 0:  #Forget varibles as a class, only in context.
            res.append(v[0])
            return j
        return -1
    elif str[i] == "<":
        # <uriref> — scan for the closing '>'
        i = i + 1
        st = i
        while i < len(str):
            if str[i] == ">":
                uref = str[st:i]  # the join should dealt with "":
                if self._baseURI:
                    uref = uripath.join(self._baseURI, uref)
                else:
                    assertFudge( ":" in uref, \
                        "With no base URI, cannot deal with relative URIs")
                if str[i - 1:i] == "#" and not uref[-1:] == "#":
                    uref = uref + "#"  # She meant it! Weirdness in urlparse?
                symb = self._store.newSymbol(uref)
                if symb in self._variables:
                    res.append(self._variables[symb])
                else:
                    res.append(symb)
                return i + 1
            i = i + 1
        raise BadSyntax(self._thisDoc, self.lines, str, j,
                        "unterminated URI reference")
    elif self.keywordsSet:
        # bare word allowed only when @keywords is in effect
        v = []
        j = self.bareWord(str, i, v)
        if j < 0:
            return -1  #Forget varibles as a class, only in context.
        if v[0] in self.keywords:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            'Keyword "' + v[0] + '" not allowed here.')
        res.append(self._store.newSymbol(self._bindings[""] + v[0]))
        return j
    else:
        return -1
def startElementNS(self, name, qname, attrs):
    """Handle start tag.

    SAX callback; dispatches on the parser state machine (STATE_*) to
    interpret the element as a node element, property element, list item,
    or XML-literal content.
    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    if self._state != STATE_LITERAL:
        self.flush()
    self.bnode = None
    tagURI = ((name[0] or "") + name[1])

    if verbosity() > 80:
        indent = ". " * len(self._stack)
        if not attrs:
            progress(indent+'# State was', self._state, ', start tag: <' + tagURI + '>')
        else:
            str = '# State =%s, start tag= <%s ' %( self._state, tagURI)
            for name, value in attrs.items():
                str = str + " " + `name` + '=' + '"' + `value` + '"'
            progress(indent + str + '>')

    # Push current state; restored on the matching end tag.
    self._stack.append([self._state, self._context, self._predicate,
                        self._subject, self._delayedStatement, self._base])
    self._delayedStatement = None

    # xml:base handling — fragment is stripped per xmlbase/test013.rdf
    self._base = uripath.join(self._base, attrs.get((XML_NS_URI, "base"), self._base))
    x = self._base.find("#")
    if x >= 0:
        self._base = self._base[:x]  # See rdf-tests/rdfcore/xmlbase/test013.rdf
    try:
        tagURI = uripath.join(self._base, tagURI)  # If relative, make absolute. Not needed for standard.
    except ValueError:
        pass  # Needed for portable RDF generated with --rdf=z

    self._language = attrs.get((XML_NS_URI, "lang"), None)
    value = attrs.get((RDF_NS_URI, "datatype"), None)
    if value != None:
        self._datatype = self.sink.newSymbol(self.uriref(value))
    else:
        self._datatype = None

    if self._state == STATE_OUTERMOST:
        if tagURI == RDF_NS_URI + "RDF":
            self._state = STATE_NO_SUBJECT
        else:
            if "R" not in self.flags:
                self._state = STATE_NOT_RDF  # Ignore random XML without rdf:RDF
            else:
                self._nodeElement(tagURI, attrs)  # Parse it as RDF.
        # http://www.w3.org/2000/10/rdf-tests/rdfcore/rdf-element-not-mandatory/test001.rdf

    elif self._state == STATE_NOT_RDF:
        if tagURI == RDF_NS_URI + "RDF" and "T" in self.flags:
            self._state = STATE_NO_SUBJECT
        else:
            pass  # Ignore embedded RDF

    elif self._state == STATE_NO_SUBJECT:  #MS1.0 6.2 obj :: desription | container
        self._nodeElement(tagURI, attrs)

    elif self._state == STATE_DESCRIPTION:  # Expect predicate (property) PropertyElt
        # propertyElt #MS1.0 6.12
        # http://www.w3.org/2000/03/rdf-tracking/#rdf-containers-syntax-ambiguity
        if tagURI == RDF_NS_URI + "li":
            item = self._items[-1] + 1   # auto-numbered container membership
            self._predicate = self.sink.newSymbol("%s_%s" % (RDF_NS_URI, item))
            self._items[-1] = item
        else:
            if tagURI in propertyElementExceptions:
                raise BadSyntax(sys.exc_info(), 'Invalid predicate URI: %s' % tagURI)
            self._predicate = self.sink.newSymbol(tagURI)

        self._state = STATE_VALUE  # May be looking for value but see parse type
        # self._datatype = None
        # self._language = None
        self.testdata = ""         # Flush value data
        # print "\n attributes:", `attrs`

        properties = []
        gotSubject = 0
        haveResource = 0
        haveParseType = 0
        haveExtras = 0
        for name, value in attrs.items():
            ns, name = name
            if name == "ID":
                print "# Warning: ID=%s on statement ignored" % (value)  # I consider these a bug
                raise ValueError("ID attribute? Reification not supported.")
            elif name == "parseType":
                haveParseType = 1
                # x = value.find(":")
                # if x>=0: pref = value[:x]
                # else: pref = ""
                # nsURI = self._nsmap[-1].get(pref, None)
                if value == "Resource":
                    c = self._context
                    s = self._subject
                    # self._subject = self.sink.newBlankNode(self._context, why=self._reason2)
                    self.idAboutAttr(attrs)  #@@ not according to current syntax @@@@@@@@@@@
                    self.sink.makeStatement(( c, self._predicate, s, self._subject), why=self._reason2)
                    self._state = STATE_DESCRIPTION  # Nest description
                elif value == "Quote":
                    c = self._context
                    s = self._subject
                    self.idAboutAttr(attrs)  # set subject and context for nested description
                    self._subject = self.sink.newFormula()  # Forget anonymous genid - context is subect
                    if self._predicate is self.merge:  # magic :-(
                        self._stack[-1][3] = self._subject  # St C P S retrofit subject of outer level!
                        self._delayedStatement = 1  # flag
                    else:
                        self._delayedStatement = c, self._predicate, s, self._subject
                    self._context = self._subject
                    self._subject = None
                    self._state = STATE_NO_SUBJECT  # Inside quote, there is no subject
                elif (value=="Collection" or
                      value[-11:] == ":collection"):  # Is this a daml:collection qname?
                    self._state = STATE_LIST  # Linked list of obj's
                elif value == "Literal" or "S" in self.flags:  # Strictly, other types are literal SYN#7.2.20
                    self._state = STATE_LITERAL  # That's an XML subtree not a string
                    self._litDepth = 1
                    self.LiteralNS = [{}]
                    self.testdata = ''  #"@@sax2rdf.py bug@@" # buggy implementation
                    self._datatype = self.sink.newSymbol("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral")
                    if XMLLiteralsAsDomTrees:
                        self.domDocument = self.domImplementation.createDocument(
                            'http://www.w3.org/1999/02/22-rdf-syntax-ns',
                            'envelope', None)
                        self.domElement = self.domDocument.documentElement
                else:
                    raise SyntaxError("Unknown parse type '%s'" % value )
            elif name == "nodeID":
                assert not gotSubject
                if not isXML.isNCName(value):
                    raise BadSyntax(sys.exc_info(), 'A nodeID must be a NCName %s' % value)
                obj = self._nodeIDs.get(value, None)
                if obj == None:
                    obj = self.newBlankNode()
                    self._nodeIDs[value] = obj
                self.sink.makeStatement((self._context, self._predicate, self._subject, obj ), why=self._reason2)
                self._state = STATE_NOVALUE  # NOT looking for value
                self._subject = obj
                gotSubject = 1
            elif name == "resource":
                haveResource = 1
                assert not gotSubject
                x = self.sink.newSymbol(self.uriref(value))
                self.sink.makeStatement((self._context, self._predicate, self._subject, x ), why=self._reason2)
                self._state = STATE_NOVALUE  # NOT looking for value
                self._subject = x
                gotSubject = 1
            elif name == "datatype":
                pass  # Already set
            elif ns == XML_NS_URI or name[:3] == "xml":  # Ignore (lang is already done)
                pass  # see rdf-tests/rdfcore/unrecognised-xml-attributes/test002.rdf
            else:
                haveExtras = 1
                properties.append((ns, name, value))  # wait till subject is clear
        assert haveResource + haveParseType <= 1
        assert haveParseType + haveExtras <= 1
        if not gotSubject and properties:
            obj = self.newBlankNode()
            self.sink.makeStatement((self._context, self._predicate, self._subject, obj ), why=self._reason2)
            self._state = STATE_NOVALUE  # NOT looking for value
            self._subject = obj
        for ns, name, value in properties:
            self._propertyAttr(ns, name, value)

    elif self._state == STATE_LIST:  # damlCollection :: objs - make list
        # Subject and predicate are set and dangling.
        c = self._context
        s = self._subject     # The tail of the list so far
        p = self._predicate
        pair = self.newBlankNode()  # The new pair
        self.sink.makeStatement(( c,  # Link in new pair
                                  p,
                                  s,
                                  pair ), why=self._reason2)
        self.idAboutAttr(attrs)  # set subject (the next item) and context
        if tagURI != RDF_NS_URI + "Description":
            self.sink.makeStatement((c,
                                     self.sink.newSymbol(RDF_NS_URI +"type"),
                                     self._subject,
                                     self.sink.newSymbol(tagURI) ), why=self._reason2)
        self.sink.makeStatement(( c,
                                  self.sink.newSymbol(List_NS + "first"),
                                  pair,
                                  self._subject), why=self._reason2)  # new item
        if "S" in self.flags:  # Strictly to spec
            self.sink.makeStatement(( c,
                                      self.sink.newSymbol(RDF_NS_URI + "type"),
                                      self.sink.newSymbol(List_NS + "List"),
                                      self._subject), why=self._reason2)  # new item
        self._stack[-1][2] = self.sink.newSymbol(List_NS + "rest")  # Leave dangling link #@check
        self._stack[-1][3] = pair  # Underlying state tracks tail of growing list

    elif self._state == STATE_VALUE:  # Value :: Obj in this case #MS1.0 6.17 6.2
        c = self._context
        p = self._predicate
        s = self._subject
        self._nodeElement(tagURI, attrs)  # Parse the object thing's attributes
        self.sink.makeStatement((c, p, s, self._subject), why=self._reason2)
        self._stack[-1][0] = STATE_NOVALUE  # When we return, cannot have literal now

    elif self._state == STATE_NOVALUE:
        str = ""
        for e in self._stack:
            str = str + `e`+"\n"
        raise BadSyntax(sys.exc_info(), """Expected no value, found name=%s; qname=%s, attrs=%s in nested context:\n%s""" %(name, qname, attrs, str))

    elif self._state == STATE_LITERAL:
        self._litDepth = self._litDepth + 1
        if XMLLiteralsAsDomTrees:
            # progress("@@@ XML literal name: ", name)
            self.literal_element_start_DOM(name, qname, attrs)
        else:
            self.literal_element_start(name, qname, attrs)
        #@@ need to capture the literal

    else:
        raise RuntimeError, ("Unknown state in RDF parser", self._stack)  # Unknown state
def load(store, uri=None, openFormula=None, asIfFrom=None, contentType=None, flags="", referer=None, why=None, topLevel=False): """Get and parse document. Guesses format if necessary. uri: if None, load from standard input. remember: if 1, store as metadata the relationship between this URI and this formula. Returns: top-level formula of the parsed document. Raises: IOError, SyntaxError, DocumentError This is an independent function, as it is fairly independent of the store. However, it is natural to call it as a method on the store. And a proliferation of APIs confuses. """ # if referer is None: # raise RuntimeError("We are trying to force things to include a referer header") try: baseURI = uripath.base() if uri != None: addr = uripath.join(baseURI, uri) # Make abs from relative if diag.chatty_flag > 40: progress("Taking input from " + addr) netStream = urlopenForRDF(addr, referer) if diag.chatty_flag > 60: progress(" Headers for %s: %s\n" %(addr, netStream.headers.items())) receivedContentType = netStream.headers.get(HTTP_Content_Type, None) else: if diag.chatty_flag > 40: progress("Taking input from standard input") addr = uripath.join(baseURI, "STDIN") # Make abs from relative netStream = sys.stdin receivedContentType = None # if diag.chatty_flag > 19: progress("HTTP Headers:" +`netStream.headers`) # @@How to get at all headers?? 
# @@ Get sensible net errors and produce dignostics guess = None if receivedContentType: if diag.chatty_flag > 9: progress("Recieved Content-type: " + `receivedContentType` + " for "+addr) if receivedContentType.find('xml') >= 0 or ( receivedContentType.find('rdf')>=0 and not (receivedContentType.find('n3')>=0) ): guess = "application/rdf+xml" elif receivedContentType.find('n3') >= 0: guess = "text/n3" if guess== None and contentType: if diag.chatty_flag > 9: progress("Given Content-type: " + `contentType` + " for "+addr) if contentType.find('xml') >= 0 or ( contentType.find('rdf') >= 0 and not (contentType.find('n3') >= 0 )): guess = "application/rdf+xml" elif contentType.find('n3') >= 0: guess = "text/n3" elif contentType.find('sparql') >= 0 or contentType.find('rq'): guess = "x-application/sparql" buffer = netStream.read() if guess == None: # can't be XML if it starts with these... if buffer[0:1] == "#" or buffer[0:7] == "@prefix": guess = 'text/n3' elif buffer[0:6] == 'PREFIX' or buffer[0:4] == 'BASE': guess = "x-application/sparql" elif buffer.find('xmlns="') >=0 or buffer.find('xmlns:') >=0: #" guess = 'application/rdf+xml' else: guess = 'text/n3' if diag.chatty_flag > 9: progress("Guessed ContentType:" + guess) except (IOError, OSError): raise DocumentAccessError(addr, sys.exc_info() ) if asIfFrom == None: asIfFrom = addr if openFormula != None: F = openFormula else: F = store.newFormula() if topLevel: newTopLevelFormula(F) import os if guess == "x-application/sparql": if diag.chatty_flag > 49: progress("Parsing as SPARQL") from sparql import sparql_parser import sparql2cwm convertor = sparql2cwm.FromSparql(store, F, why=why) import StringIO p = sparql_parser.N3Parser(StringIO.StringIO(buffer), sparql_parser.branches, convertor) F = p.parse(sparql_parser.start).close() elif guess == 'application/rdf+xml': if diag.chatty_flag > 49: progress("Parsing as RDF") # import sax2rdf, xml.sax._exceptions # p = sax2rdf.RDFXMLParser(store, F, thisDoc=asIfFrom, 
flags=flags) if flags == 'rdflib' or int(os.environ.get("CWM_RDFLIB", 0)): parser = 'rdflib' flags = '' else: parser = os.environ.get("CWM_RDF_PARSER", "sax2rdf") import rdfxml p = rdfxml.rdfxmlparser(store, F, thisDoc=asIfFrom, flags=flags, parser=parser, why=why) p.feed(buffer) F = p.close() else: assert guess == 'text/n3' if diag.chatty_flag > 49: progress("Parsing as N3") if os.environ.get("CWM_N3_PARSER", 0) == 'n3p': import n3p_tm import triple_maker tm = triple_maker.TripleMaker(formula=F, store=store) p = n3p_tm.n3p_tm(asIfFrom, tm) else: p = notation3.SinkParser(store, F, thisDoc=asIfFrom,flags=flags, why=why) try: p.startDoc() p.feed(buffer) p.endDoc() except: progress("Failed to parse %s" % uri or buffer) raise if not openFormula: F = F.close() return F
def bind(self, prefix, ns):
    """Bind prefix to ns (resolved against the base URI) and tell the sink."""
    absolute = uripath.join(self._baseURI, ns)
    self._bindings[prefix] = absolute
    self._sink.bind(prefix, (URI, absolute))
def load(self, uri, baseURI=""):
    """Parse the document at uri, resolved against baseURI; no-op when uri is falsy."""
    if not uri:
        return
    absolute = uripath.join(baseURI, uri)   # Make abs from relative
    self.parse(URLInputSource(absolute))
def absolutize(uri, baseURI):
    """Return uri resolved against baseURI as an absolute URI."""
    resolved = uripath.join(baseURI, uri)
    return resolved
def __getitem__(self, other):
    """Allow obj[other]: delegates to join(self, other)."""
    joined = join(self, other)
    return joined
def crawlFrom(self, addr, prefix, max):
    """Breadth-limited crawl starting at addr, staying under the given URI prefix.

    Fetches up to `max` pages, records content-type / label / link triples
    into self._fmla, and follows hrefs whose URI starts with `prefix`.

    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    fmla = self._fmla
    iter = 1
    queue = [addr]
    seen = []
    while queue:
        head = queue.pop()
        progress("crawling at: ", head, " iter ", iter, " of ", max)
        iter = iter + 1
        if iter > max:
            progress("max limit reached.")
            break
        seen.append(head)
        try:
            rep = urllib2.urlopen(head)
            content = rep.read()
        except IOError:
            progress("can't GET", head)
            continue
            #@@ makeStatement(head type NoGood)
        # try to find a short label for a diagram or some such.
        # try the last path segment,
        # or the 2nd last in case of an empty last segment...
        slash = head[:-1].rfind('/')
        label = head[slash + 1:]
        ct = rep.info().getheader('content-type')
        progress("... got content of type ", ct)
        isHTML = ct.find('text/html') == 0
        fmla.add(symbol(head), symbol(DC('type')), literal(ct))
        # note that we're not peeking into the URI
        # to find out if it's HTML; we're just
        # eliding the extension in the case we
        # know (from the HTTP headers) that it's HTML.
        if isHTML and label[-5:] == '.html':
            label = label[:-5]
        fmla.add(symbol(head), symbol(RDFS('label')), literal(label))
        if not isHTML:
            continue
        progress("... parsing text/html content")
        doc = libxml2.htmlParseDoc(content, 'us-ascii')
        try:
            titles = doc.xpathNewContext().xpathEval('//title')
            title = titles[0].getContent()
        except: #@@figure out the right exceptions
            pass
        else:
            progress("... found title:", title)
            fmla.add(symbol(head), symbol(DC('title')), literal(str(title)))
        hrefs = doc.xpathNewContext().xpathEval('//a/@href')
        progress("... found ", len(hrefs), " links")
        for h in hrefs:
            h = h.getContent()
            progress("... found href", h)
            i = uripath.join(head, h)       # resolve relative hrefs
            i = uripath.splitFrag(i)[0]     # discard fragment part
            progress("... found link", head, ' -> ', i)
            fmla.add(symbol(head), symbol(DC('relation')), symbol(i))
            if i[:len(prefix)] == prefix and i not in seen:
                queue.append(i)
def uri_ref2(self, str, i, res):
    """Generate uri from n3 representation.

    Note that the RDF convention of directly concatenating
    NS and local name is now used though I prefer inserting a '#'
    to make the namesapces look more like what XML folks expect.

    Appends the resulting node to `res` and returns the index just past
    the token, or -1 if no URI reference starts at `i`.
    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    qn = []
    j = self.qname(str, i, qn)
    if j >= 0:
        # qname form: pfx:ln
        pairFudge = qn[0]
        pfx = pairFudge[0]
        ln = pairFudge[1]
        if pfx is None:
            assertFudge(0, "not used?")
            ns = self._baseURI + ADDED_HASH
        else:
            ns = self._bindings[pfx]
            if not ns:  # @@ pyjs should test undefined
                if pfx == "_":  # Magic prefix 2001/05/30, can be overridden
                    res.append(self.anonymousNode(ln))
                    return j
                raise BadSyntax(self._thisDoc, self.lines, str, i,
                                "Prefix " + pfx + " not bound.")
        symb = self._store.newSymbol(ns + ln)
        if symb in self._variables:
            res.append(self._variables[symb])
        else:
            res.append(symb)  # @@@ "#" CONVENTION
        return j

    i = self.skipSpace(str, i)
    if i < 0:
        return -1

    if str[i] == "?":
        # ?variable
        v = []
        j = self.variable(str, i, v)
        if j > 0:  # Forget varibles as a class, only in context.
            res.append(v[0])
            return j
        return -1
    elif str[i] == "<":
        # <uriref> — scan for the closing '>'
        i = i + 1
        st = i
        while i < len(str):
            if str[i] == ">":
                uref = str[st:i]  # the join should dealt with "":
                if self._baseURI:
                    uref = uripath.join(self._baseURI, uref)
                else:
                    assertFudge(":" in uref,
                                "With no base URI, cannot deal with relative URIs")
                if str[i - 1 : i] == "#" and not uref[-1:] == "#":
                    uref = uref + "#"  # She meant it! Weirdness in urlparse?
                symb = self._store.newSymbol(uref)
                if symb in self._variables:
                    res.append(self._variables[symb])
                else:
                    res.append(symb)
                return i + 1
            i = i + 1
        raise BadSyntax(self._thisDoc, self.lines, str, j,
                        "unterminated URI reference")
    elif self.keywordsSet:
        # bare word allowed only when @keywords is in effect
        v = []
        j = self.bareWord(str, i, v)
        if j < 0:
            return -1  # Forget varibles as a class, only in context.
        if v[0] in self.keywords:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            'Keyword "' + v[0] + '" not allowed here.')
        res.append(self._store.newSymbol(self._bindings[""] + v[0]))
        return j
    else:
        return -1
def directive(self, str, i):
    """Parse an N3 directive (@prefix, @base, @keywords, @forAll, @forSome).

    Returns the index just past the directive, or -1 when no directive
    starts at `i` (caller then tries other productions).
    NOTE(review): indentation reconstructed from collapsed one-line source.
    """
    j = self.skipSpace(str, i)
    if j<0:
        return j  # eof
    res = []

    j = self.tok('bind', str, i)        # implied "#". Obsolete.
    if j>0:
        raise BadSyntax(self._thisDoc, self.lines, str, i,
                        "keyword bind is obsolete: use @prefix")

    j = self.tok('keywords', str, i)
    if j>0:
        i = self.commaSeparatedList(str, j, res, self.bareWord)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "'@keywords' needs comma separated list of words")
        self.setKeywords(res[:])
        if diag.chatty_flag > 80: progress("Keywords ", self.keywords)
        return i

    j = self.tok('forAll', str, i)
    if j > 0:
        i = self.commaSeparatedList(str, j, res, self.uri_ref2)
        if i <0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "Bad variable list after @forAll")
        for x in res:
            #self._context.declareUniversal(x)
            # re-declare only if new in this scope or shadowing a parent variable
            if x not in self._variables or x in self._parentVariables:
                self._variables[x] = self._context.newUniversal(x)
        return i

    j = self.tok('forSome', str, i)
    if j > 0:
        i = self. commaSeparatedList(str, j, res, self.uri_ref2)
        if i <0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "Bad variable list after @forSome")
        for x in res:
            self._context.declareExistential(x)
        return i

    j=self.tok('prefix', str, i)   # no implied "#"
    if j>=0:
        t = []
        i = self.qname(str, j, t)
        if i<0:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                            "expected qname after @prefix")
        j = self.uri_ref2(str, i, t)
        if j<0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "expected <uriref> after @prefix _qname_")
        ns = self.uriOf(t[1])
        if self._baseURI:
            ns = join(self._baseURI, ns)
        elif ":" not in ns:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                "With no base URI, cannot use relative URI in @prefix <"+ns+">")
        assert ':' in ns  # must be absolute
        self._bindings[t[0][0]] = ns
        self.bind(t[0][0], hexify(ns))
        return j

    j=self.tok('base', str, i)      # Added 2007/7/7
    if j >= 0:
        t = []
        i = self.uri_ref2(str, j, t)
        if i<0:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                            "expected <uri> after @base ")
        ns = self.uriOf(t[0])
        if self._baseURI:
            ns = join(self._baseURI, ns)
        elif ':' not in ns:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                "With no previous base URI, cannot use relative URI in @base <"+ns+">")
        assert ':' in ns  # must be absolute
        self._baseURI = ns
        return i

    return -1  # Not a directive, could be something else.
def uriref(self, str):
    """Strip the surrounding <...> and resolve against the base URI."""
    inner = str[1:-1]
    return Symbol(uripath.join(self._baseURI, inner))
for s in errors: print "\t%s" % s exit(-2) else: print "Ok for predictive parsing" #print "Branch table:", branchTable print "Literal terminals:", literalTerminals.keys() print "Token regular expressions:" for r in tokenRegexps: print "\t%s matches %s" %(r, tokenRegexps[r].pattern) yacc=open(argv[1]+"-yacc.y", "w") yaccConvert(yacc, document, tokenRegexps) #while agenda: # x = agenda[0] # agenda = agenda[1:] # already.append(x) # yaccProduction(yacc, x, tokenRegexps) yacc.close() if len(argv) <= 3: exit(0) parseFile = argv[3] ip = webAccess.urlopenForRDF(uripath.join(uripath.base(), parseFile), None) str = ip.read() sink = g.newFormula() p = PredictiveParser(sink=sink, top=document, branchTable= branchTable, tokenRegexps= tokenRegexps) p.parse(str) #ends
def __init__(self, mainURL, mergeFunction, provMergeFunction=None):
    """Connect to a DProp cell group (main / provenance / data) over DBus.

    mainURL may actually name the main, provenance, or data cell; the
    type reported by the remote side (self.initURLType, set by the
    typeChecker signal handler) decides how to find the other two.
    NOTE(review): indentation reconstructed from collapsed one-line source;
    busy-wait loops below assume a python-gobject main loop.
    """
    # Try to be smart. We might have accidentally been passed the
    # provCell or dataCell.
    bus = dbus.SystemBus()
    propman = bus.get_object('edu.mit.csail.dig.DPropMan', '/DPropMan')
    self.initURLType = None
    self.url = mainURL
    self.uuid = str(propman.registerRemoteCell(self.url, dbus_interface='edu.mit.csail.dig.DPropMan'))
    self.dbusCell = bus.get_object('edu.mit.csail.dig.DPropMan', '/Cells/%s' % (self.uuid))
    self.dbusCell.connect_to_signal('UpdateSignal',
                                    lambda raw_data, peer: self.typeChecker(str(raw_data), str(peer)),
                                    dbus_interface='edu.mit.csail.dig.DPropMan.Cell')
    self.dbusCell.connectToRemote(self.url, dbus_interface='edu.mit.csail.dig.DPropMan.Cell')

    # Wait for the typeChecker to have gotten something.
    # WARNING: We assume python-gobject is being used.
    while self.initURLType is None:
        gobject.MainLoop().get_context().iteration(True)

    self.provCell = None
    self.dataCell = None
    if self.initURLType == 'dpropProvCell':
        print "Actually passed a provenance cell!"
        provURL = mainURL
        self.provCell = RemoteCell(provURL,
                                   lambda provCell, raw_data, peer: self.provCellMerge(provMergeFunction, str(raw_data), str(peer)))
        self.provCellMerge(provMergeFunction, str(self.initRawData), str(self.initPeer))
        provData = dpropjson.loads(self.provCell.data())
        if isinstance(provData, dict):
            if 'type' not in provData or 'mainCell' not in provData or 'data' not in provData:
                raise DPropException("Provenance cell doesn't match expected cell type!")
            elif provData['type'] != 'dpropProvCell':
                raise DPropException("Provenance cell wasn't a dpropProvCell!")
        else:
            raise DPropException("Provenance cell didn't contain a dictionary!")
        # redirect to the real main cell named by the provenance cell
        mainURL = uripath.join(provURL, provData['provCell'])
    elif self.initURLType == 'dpropDataCell':
        print "Actually passed a data cell!"
        dataURL = mainURL
        self.dataCell = RemoteCell(dataURL,
                                   lambda dataCell, raw_data, peer: self.dataCellMerge(mergeFunction, str(raw_data), str(peer)))
        self.dataCellMerge(mergeFunction, str(self.initRawData), str(self.initPeer))
        dataData = dpropjson.loads(self.dataCell.data())
        print dataData
        if isinstance(dataData, dict):
            if 'type' not in dataData or 'mainCell' not in dataData or 'data' not in dataData:
                raise DPropException("Data cell doesn't match expected cell type!")
            elif dataData['type'] != 'dpropDataCell':
                raise DPropException("Data cell wasn't a dpropDataCell!")
        else:
            raise DPropException("Data cell didn't contain a dictionary!")
        # redirect to the real main cell named by the data cell
        mainURL = uripath.join(dataURL, dataData['mainCell'])
    print mainURL
    self.mainCell = RemoteCell(mainURL,
                               lambda mainCell, raw_data, peer: self.mainCellMerge(str(raw_data), str(peer)))
    self.uuid = self.mainCell.uuid
    if self.initURLType == 'dpropMainCell':
        print "Passed a main cell"
        self.mainCellMerge(str(self.initRawData), str(self.initPeer))
    mainData = dpropjson.loads(self.mainCell.data())
    while isinstance(mainData, dpropjson.Nothing):
        # Gotta wait a few cycles (I really don't like this...)
        gobject.MainLoop().get_context().iteration(True)
        mainData = dpropjson.loads(self.mainCell.data())
    if isinstance(mainData, dict):
        if 'type' not in mainData or 'provCell' not in mainData or 'dataCell' not in mainData:
            raise DPropException("Main cell doesn't match expected cell type!")
        elif mainData['type'] != 'dpropMainCell':
            raise DPropException("Main cell wasn't a dpropMainCell!")
    else:
        raise DPropException("Main cell didn't contain a dictionary!")

    # URL for the provCell is relative to mainCell.
    if self.provCell == None:
        provURL = uripath.join(mainURL, mainData['provCell'])
        self.provCell = RemoteCell(provURL,
                                   lambda provCell, raw_data, peer: self.provCellMerge(provMergeFunction, str(raw_data), str(peer)))
    if self.dataCell == None:
        dataURL = uripath.join(mainURL, mainData['dataCell'])
        self.dataCell = RemoteCell(dataURL,
                                   lambda dataCell, raw_data, peer: self.dataCellMerge(mergeFunction, str(raw_data), str(peer)))
    provData = dpropjson.loads(self.provCell.data())
    while isinstance(provData, dpropjson.Nothing):
        # Gotta wait a few cycles (I really don't like this...)
        gobject.MainLoop().get_context().iteration(True)
        provData = dpropjson.loads(self.provCell.data())
    if isinstance(provData, dict):
        if 'type' not in provData or 'mainCell' not in provData or 'data' not in provData:
            raise DPropException("Provenance cell doesn't match expected cell type!")
        elif provData['type'] != 'dpropProvCell':
            raise DPropException("Provenance cell wasn't a dpropProvCell!")
        elif provData['mainCell'] != self.mainCell.uuid:
            raise DPropException("Provenance cell's mainCell doesn't match expected UUID!")
    else:
        raise DPropException("Provenance cell didn't contain a dictionary!")
    dataData = dpropjson.loads(self.dataCell.data())
    while isinstance(dataData, dpropjson.Nothing):
        # Gotta wait a few cycles (I really don't like this...)
        gobject.MainLoop().get_context().iteration(True)
        dataData = dpropjson.loads(self.dataCell.data())
    if isinstance(dataData, dict):
        if 'type' not in dataData or 'mainCell' not in dataData or 'data' not in dataData:
            raise DPropException("Data cell doesn't match expected cell type!")
        elif dataData['type'] != 'dpropDataCell':
            raise DPropException("Data cell wasn't a dpropDataCell!")
        elif dataData['mainCell'] != self.mainCell.uuid:
            raise DPropException("Data cell's mainCell doesn't match expected UUID!")
    else:
        raise DPropException("Data cell didn't contain a dictionary!")
def evaluateObject(self, subj_py):
    """Resolve a (relative-URI, base-URI) pair into an absolute URI.

    subj_py is unpacked as (there, base); the relative reference is
    joined against the base with uripath.join.
    """
    import uripath
    relative, base = subj_py
    return uripath.join(base, relative)
def uriref(self, s):
    """Return a (URI, absolute-uri) token for an angle-bracketed reference.

    s arrives wrapped in <...>; the inner text is resolved against
    this parser's base URI.
    """
    inner = s[1:-1]  # drop the surrounding <>'s
    return (URI, uripath.join(self._baseURI, inner))
def uriref(self, str):
    """Resolve a URI reference against this document's base URI.

    unicode strings OK.
    """
    # NOTE: parameter name shadows the builtin `str`; kept for
    # interface compatibility with existing callers.
    absolute = uripath.join(self._base, str)
    return absolute
def load(store, uri=None, openFormula=None, asIfFrom=None, contentType=None, flags="", referer=None, why=None, topLevel=False): """Get and parse document. Guesses format if necessary. uri: if None, load from standard input. remember: if 1, store as metadata the relationship between this URI and this formula. Returns: top-level formula of the parsed document. Raises: IOError, SyntaxError, DocumentError This is an independent function, as it is fairly independent of the store. However, it is natural to call it as a method on the store. And a proliferation of APIs confuses. """ # if referer is None: # raise RuntimeError("We are trying to force things to include a referer header") try: baseURI = uripath.base() if uri != None: addr = uripath.join(baseURI, uri) # Make abs from relative if diag.chatty_flag > 40: progress("Taking input from " + addr) netStream = urlopenForRDF(addr, referer) if diag.chatty_flag > 60: progress(" Headers for %s: %s\n" % (addr, netStream.headers.items())) receivedContentType = netStream.headers.get( HTTP_Content_Type, None) else: if diag.chatty_flag > 40: progress("Taking input from standard input") addr = uripath.join(baseURI, "STDIN") # Make abs from relative netStream = sys.stdin receivedContentType = None # if diag.chatty_flag > 19: progress("HTTP Headers:" +`netStream.headers`) # @@How to get at all headers?? 
# @@ Get sensible net errors and produce dignostics guess = None if receivedContentType: if diag.chatty_flag > 9: progress("Recieved Content-type: " + ` receivedContentType ` + " for " + addr) if receivedContentType.find('xml') >= 0 or ( receivedContentType.find('rdf') >= 0 and not (receivedContentType.find('n3') >= 0)): guess = "application/rdf+xml" elif receivedContentType.find('n3') >= 0: guess = "text/rdf+n3" if guess == None and contentType: if diag.chatty_flag > 9: progress("Given Content-type: " + ` contentType ` + " for " + addr) if contentType.find('xml') >= 0 or ( contentType.find('rdf') >= 0 and not (contentType.find('n3') >= 0)): guess = "application/rdf+xml" elif contentType.find('n3') >= 0: guess = "text/rdf+n3" elif contentType.find('sparql') >= 0 or contentType.find('rq'): guess = "x-application/sparql" buffer = netStream.read() if guess == None: # can't be XML if it starts with these... if buffer[0:1] == "#" or buffer[0:7] == "@prefix": guess = 'text/rdf+n3' elif buffer[0:6] == 'PREFIX' or buffer[0:4] == 'BASE': guess = "x-application/sparql" elif buffer.find('xmlns="') >= 0 or buffer.find('xmlns:') >= 0: #" guess = 'application/rdf+xml' else: guess = 'text/rdf+n3' if diag.chatty_flag > 9: progress("Guessed ContentType:" + guess) except (IOError, OSError): raise DocumentAccessError(addr, sys.exc_info()) if asIfFrom == None: asIfFrom = addr if openFormula != None: F = openFormula else: F = store.newFormula() if topLevel: newTopLevelFormula(F) import os if guess == "x-application/sparql": if diag.chatty_flag > 49: progress("Parsing as SPARQL") from sparql import sparql_parser import sparql2cwm convertor = sparql2cwm.FromSparql(store, F, why=why) import StringIO p = sparql_parser.N3Parser(StringIO.StringIO(buffer), sparql_parser.branches, convertor) F = p.parse(sparql_parser.start).close() elif guess == 'application/rdf+xml': if diag.chatty_flag > 49: progress("Parsing as RDF") # import sax2rdf, xml.sax._exceptions # p = sax2rdf.RDFXMLParser(store, F, 
thisDoc=asIfFrom, flags=flags) if flags == 'rdflib' or int(os.environ.get("CWM_RDFLIB", 0)): parser = 'rdflib' flags = '' else: parser = os.environ.get("CWM_RDF_PARSER", "sax2rdf") import rdfxml p = rdfxml.rdfxmlparser(store, F, thisDoc=asIfFrom, flags=flags, parser=parser, why=why) p.feed(buffer) F = p.close() else: assert guess == 'text/rdf+n3' if diag.chatty_flag > 49: progress("Parsing as N3") if os.environ.get("CWM_N3_PARSER", 0) == 'n3p': import n3p_tm import triple_maker tm = triple_maker.TripleMaker(formula=F, store=store) p = n3p_tm.n3p_tm(asIfFrom, tm) else: p = notation3.SinkParser(store, F, thisDoc=asIfFrom, flags=flags, why=why) try: p.startDoc() p.feed(buffer) p.endDoc() except: progress("Failed to parse %s" % uri or buffer) raise if not openFormula: F = F.close() return F
def directive(self, str, i):
    """Try to parse an N3 directive (@prefix, @base, @keywords, @forAll,
    @forSome, or the obsolete `bind`) starting at offset i in str.

    Returns the offset just past the directive on success, or -1 if
    no directive was found (the caller then tries other productions).
    Raises BadSyntax on a malformed directive.
    """
    j = self.skipSpace(str, i)
    if j < 0:
        return j # eof
    res = []

    j = self.tok('bind', str, i) # implied "#". Obsolete.
    if j > 0:
        raise BadSyntax(self._thisDoc, self.lines, str, i,
                        "keyword bind is obsolete: use @prefix")

    j = self.tok('keywords', str, i)
    if j > 0:
        i = self.commaSeparatedList(str, j, res, false)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "'@keywords' needs comma separated list of words")
        self.setKeywords(res[:])
        if diag.chatty_flag > 80:
            progress("Keywords ", self.keywords)
        return i

    j = self.tok('forAll', str, i)
    if j > 0:
        i = self.commaSeparatedList(str, j, res, true)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "Bad variable list after @forAll")
        for x in res:
            #self._context.declareUniversal(x)
            # Register a new universal only if not already a variable here,
            # or if it was declared in a parent scope.
            if x not in self._variables or x in self._parentVariables:
                self._variables[x] = self._context.newUniversal(x)
        return i

    j = self.tok('forSome', str, i)
    if j > 0:
        i = self.commaSeparatedList(str, j, res, self.uri_ref2)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "Bad variable list after @forSome")
        for x in res:
            self._context.declareExistential(x)
        return i

    j = self.tok('prefix', str, i) # no implied "#"
    if j >= 0:
        t = []
        i = self.qname(str, j, t)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                            "expected qname after @prefix")
        j = self.uri_ref2(str, i, t)
        if j < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                            "expected <uriref> after @prefix _qname_")
        ns = t[1].uri # pyjs was uriref()
        # Absolutize the namespace against the base URI when we have one.
        if self._baseURI:
            ns = join(self._baseURI, ns)
        else:
            assertFudge(":" in ns,
                        "With no base URI, cannot handle relative URI for NS")
        assertFudge(':' in ns) # must be absolute
        self._bindings[t[0][0]] = ns
        self.bind(t[0][0], hexify(ns))
        return j

    j = self.tok('base', str, i) # Added 2007/7/7
    if j >= 0:
        t = []
        i = self.uri_ref2(str, j, t)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                            "expected <uri> after @base ")
        ns = t[0].uri # pyjs was uriref()
        if self._baseURI:
            ns = join(self._baseURI, ns)
        else:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                            "With no previous base URI, cannot use relative URI in @base <" + ns + ">")
        assertFudge(':' in ns) # must be absolute
        self._baseURI = ns
        return i

    return -1 # Not a directive, could be something else.
def evaluateObject(self, subj_py): if verbosity() > 80: progress("os:baseAbsolute input:"+`subj_py`) if isString(subj_py): return uripath.join(uripath.base(), subj_py) progress("Warning: os:baseAbsolute input is not a string: "+`subj_py`)