def runNamespace():
    """Return a URI suitable as a namespace for run-local objects.

    The result is computed on the first call and kept in the module-level
    ``runNamespaceValue``; later calls just hand back that stored value.
    The ``CWM_RUN_NS`` environment variable supplies the value when it is
    set; otherwise a name is assembled from ``time()`` and ``getpid()`` and
    joined onto ``uripath.base()``.  In both cases the value is finally
    joined against ``base()`` so that it is absolute.
    """
    # @@@ include hostname (privacy?) (hash it?)
    global runNamespaceValue
    if runNamespaceValue is not None:
        return runNamespaceValue
    try:
        runNamespaceValue = environ["CWM_RUN_NS"]
    except KeyError:
        stamp = ".run-" + repr(time()) + "p" + repr(getpid()) + "#"
        runNamespaceValue = uripath.join(uripath.base(), stamp)
    runNamespaceValue = join(base(), runNamespaceValue)  # absolutize
    return runNamespaceValue
def __init__(self, name, store=None):
    """Initialise the instance under an absolute name.

    name  -- the name to record; when it contains no ':' it is treated as
             relative and joined onto uripath.base().
    store -- kept on the instance as ``self.store`` (default None).

    Also starts ``self._seen`` off as an empty dictionary.
    """
    is_relative = ':' not in name
    if is_relative:
        # must be absolute, so resolve it against the current base
        name = uripath.join(uripath.base(), name)
    self._name = name
    self.store = store
    self._seen = {}
def main(argv):
    """Crawl from a starting site and dump the resulting formula to stdout.

    argv[1] -- the site to start from (passed twice to Crawler.crawlFrom).
    argv[2] -- integer limit, converted with int() and passed to crawlFrom.

    The formula is closed after crawling and then written through a
    toXML.ToRDF sink attached to sys.stdout.
    """
    import sys

    site, limit_text = argv[1:3]
    limit = int(limit_text)

    f = formula()
    here = uripath.base()
    crawler = Crawler(f, here)
    crawler.crawlFrom(site, site, limit)
    f.close()

    sink = toXML.ToRDF(sys.stdout, here)
    for prefix, namespace in (('dc', DC), ('s', RDFS)):
        bind(prefix, namespace(''))
    myStore.store.dumpNested(f, sink)
def doCommand():
    """Command line RDF/N3 crawler

     crawl <uriref>

    options:

    See http://www.w3.org/2000/10/swap/doc/cwm for more documentation.

    The starting reference is read from sys.argv[1] and resolved against
    base().  crawl() is then called on each entry taken from the front of
    the module-level ``agenda`` until it is empty; every entry tried is
    appended to ``already``.  A header and a closing summary comment are
    printed to standard output.
    """
    global agenda
    global already

    start = symbol(join(base(), sys.argv[1]))
    diag.setVerbosity(0)
    print("@prefix : <http://www.w3.org/2000/10/swap/util/semweb#>.")
    # Two spaces before the revision: one from the literal, one that the
    # original comma-separated print statement inserted between items.
    print("# Generated by crawl.py  %s" % (cvsRevision[1:-1],))

    agenda = [start]
    while agenda:
        # Rebind (rather than mutate) the global list, as the original did.
        current, agenda = agenda[0], agenda[1:]
        already.append(current)
        crawl(current)

    print("#  %s attempts, %s successes." % (len(already), successes))
def evaluateObject(self, subj_py):
    """Return subj_py as a reference relative to the current base.

    String input is passed to uripath.refTo() with uripath.base() as the
    base and the result is returned.  Any other input produces a warning
    through progress() and the method returns None.
    """
    if verbosity() > 80:
        progress("os:baseRelative input:" + repr(subj_py))
    if not isString(subj_py):
        progress("Warning: os:baseRelative input is not a string: "
                 + repr(subj_py))
        return None
    return uripath.refTo(uripath.base(), subj_py)
def load(store, uri=None, openFormula=None, asIfFrom=None, contentType=None,
         flags="", referer=None, why=None, topLevel=False):
    """Get and parse document.  Guesses format if necessary.

    store:        used to create a new formula and handed to the parsers.
    uri:          if None, load from standard input; otherwise it is joined
                  onto uripath.base() and opened with urlopenForRDF().
    openFormula:  if not None, parse into this formula instead of a new one.
    asIfFrom:     document address given to the parsers; defaults to the
                  absolute address the data was read from.
    contentType:  content-type hint, consulted only when the received
                  Content-Type header does not settle the format.
    flags:        passed to the parser.  For RDF/XML the value 'rdflib'
                  selects the rdflib parser and is then cleared.
    referer:      passed to urlopenForRDF().
    why:          passed to the parsers.
    topLevel:     if true, newTopLevelFormula() is called on the formula.

    Environment variables read: CWM_RDFLIB and CWM_RDF_PARSER (choice of
    RDF/XML parser), CWM_N3_PARSER (the value 'n3p' selects n3p_tm).

    Returns:  top-level formula of the parsed document.
    Raises:   DocumentAccessError when fetching or reading fails with
              IOError/OSError; exceptions from the parsers are propagated.

    This is an independent function, as it is fairly independent
    of the store. However, it is natural to call it as a method on the store.
    And a proliferation of APIs confuses.
    """
#    if referer is None:
#        raise RuntimeError("We are trying to force things to include a referer header")
    # Bound before the try so the handler below can always report an
    # address, even if uripath.base() itself fails.
    addr = uri
    try:
        baseURI = uripath.base()
        if uri is not None:
            addr = uripath.join(baseURI, uri)  # Make abs from relative
            if diag.chatty_flag > 40:
                progress("Taking input from " + addr)
            netStream = urlopenForRDF(addr, referer)
            if diag.chatty_flag > 60:
                progress(" Headers for %s: %s\n"
                         % (addr, netStream.headers.items()))
            receivedContentType = netStream.headers.get(
                HTTP_Content_Type, None)
        else:
            if diag.chatty_flag > 40:
                progress("Taking input from standard input")
            addr = uripath.join(baseURI, "STDIN")  # Make abs from relative
            netStream = sys.stdin
            receivedContentType = None

        # @@How to get at all headers??
        # @@ Get sensible net errors and produce dignostics

        guess = None
        if receivedContentType:
            if diag.chatty_flag > 9:
                progress("Recieved Content-type: "
                         + repr(receivedContentType) + " for " + addr)
            if 'xml' in receivedContentType or (
                    'rdf' in receivedContentType
                    and 'n3' not in receivedContentType):
                guess = "application/rdf+xml"
            elif 'n3' in receivedContentType:
                guess = "text/n3"
        if guess is None and contentType:
            if diag.chatty_flag > 9:
                progress("Given Content-type: "
                         + repr(contentType) + " for " + addr)
            if 'xml' in contentType or (
                    'rdf' in contentType and 'n3' not in contentType):
                guess = "application/rdf+xml"
            elif 'n3' in contentType:
                guess = "text/n3"
            # The original tested the raw result of find('rq'), which is -1
            # (true) when 'rq' is absent, so nearly every unrecognised hint
            # was taken to be SPARQL.  Test for containment instead.
            elif 'sparql' in contentType or 'rq' in contentType:
                guess = "x-application/sparql"
        buffer = netStream.read()
        if guess is None:
            # can't be XML if it starts with these...
            if buffer.startswith("#") or buffer.startswith("@prefix"):
                guess = 'text/n3'
            elif buffer.startswith('PREFIX') or buffer.startswith('BASE'):
                guess = "x-application/sparql"
            elif 'xmlns="' in buffer or 'xmlns:' in buffer:
                guess = 'application/rdf+xml'
            else:
                guess = 'text/n3'
            if diag.chatty_flag > 9:
                progress("Guessed ContentType:" + guess)
    except (IOError, OSError):
        raise DocumentAccessError(addr, sys.exc_info())

    if asIfFrom is None:
        asIfFrom = addr
    if openFormula is not None:
        F = openFormula
    else:
        F = store.newFormula()
    if topLevel:
        newTopLevelFormula(F)
    import os
    if guess == "x-application/sparql":
        if diag.chatty_flag > 49:
            progress("Parsing as SPARQL")
        from sparql import sparql_parser
        import sparql2cwm
        convertor = sparql2cwm.FromSparql(store, F, why=why)
        import StringIO
        p = sparql_parser.N3Parser(StringIO.StringIO(buffer),
                                   sparql_parser.branches, convertor)
        F = p.parse(sparql_parser.start).close()
    elif guess == 'application/rdf+xml':
        if diag.chatty_flag > 49:
            progress("Parsing as RDF")
        if flags == 'rdflib' or int(os.environ.get("CWM_RDFLIB", 0)):
            parser = 'rdflib'
            flags = ''
        else:
            parser = os.environ.get("CWM_RDF_PARSER", "sax2rdf")
        import rdfxml
        p = rdfxml.rdfxmlparser(store, F, thisDoc=asIfFrom, flags=flags,
                                parser=parser, why=why)
        p.feed(buffer)
        F = p.close()
    else:
        assert guess == 'text/n3'
        if diag.chatty_flag > 49:
            progress("Parsing as N3")
        if os.environ.get("CWM_N3_PARSER", 0) == 'n3p':
            import n3p_tm
            import triple_maker
            tm = triple_maker.TripleMaker(formula=F, store=store)
            p = n3p_tm.n3p_tm(asIfFrom, tm)
        else:
            p = notation3.SinkParser(store, F, thisDoc=asIfFrom,
                                     flags=flags, why=why)

        try:
            p.startDoc()
            p.feed(buffer)
            p.endDoc()
        except:
            # Report and re-raise whatever the parser threw.  The
            # parentheses matter: '%' binds tighter than 'or', so without
            # them the buffer fallback for stdin input was never used.
            progress("Failed to parse %s" % (uri or buffer))
            raise

    # NOTE(review): this is a truthiness test, unlike the "is not None" test
    # used above to pick F; if a formula object can be false (e.g. when it
    # is empty) a caller-supplied formula would be closed here -- confirm.
    if not openFormula:
        F = F.close()
    return F
for s in errors: print "\t%s" % s exit(-2) else: print "Ok for predictive parsing" #print "Branch table:", branchTable print "Literal terminals:", literalTerminals.keys() print "Token regular expressions:" for r in tokenRegexps: print "\t%s matches %s" %(r, tokenRegexps[r].pattern) yacc=open(argv[1]+"-yacc.y", "w") yaccConvert(yacc, document, tokenRegexps) #while agenda: # x = agenda[0] # agenda = agenda[1:] # already.append(x) # yaccProduction(yacc, x, tokenRegexps) yacc.close() if len(argv) <= 3: exit(0) parseFile = argv[3] ip = webAccess.urlopenForRDF(uripath.join(uripath.base(), parseFile), None) str = ip.read() sink = g.newFormula() p = PredictiveParser(sink=sink, top=document, branchTable= branchTable, tokenRegexps= tokenRegexps) p.parse(str) #ends
else: print "Ok for predictive parsing" #print "Branch table:", branchTable print "Literal terminals:", literalTerminals.keys() print "Token regular expressions:" for r in tokenRegexps: print "\t%s matches %s" % (r, tokenRegexps[r].pattern) yacc = open(argv[1] + "-yacc.y", "w") yaccConvert(yacc, document, tokenRegexps) #while agenda: # x = agenda[0] # agenda = agenda[1:] # already.append(x) # yaccProduction(yacc, x, tokenRegexps) yacc.close() if len(argv) <= 3: exit(0) parseFile = argv[3] ip = webAccess.urlopenForRDF(uripath.join(uripath.base(), parseFile), None) str = ip.read() sink = g.newFormula() p = PredictiveParser(sink=sink, top=document, branchTable=branchTable, tokenRegexps=tokenRegexps) p.parse(str) #ends
def load(store, uri=None, openFormula=None, asIfFrom=None, contentType=None,
         flags="", referer=None, why=None, topLevel=False):
    """Get and parse document.  Guesses format if necessary.

    store:        used to create a new formula and handed to the parsers.
    uri:          if None, load from standard input; otherwise it is joined
                  onto uripath.base() and opened with urlopenForRDF().
    openFormula:  if not None, parse into this formula instead of a new one.
    asIfFrom:     document address given to the parsers; defaults to the
                  absolute address the data was read from.
    contentType:  content-type hint, consulted only when the received
                  Content-Type header does not settle the format.
    flags:        passed to the parser.  For RDF/XML the value 'rdflib'
                  selects the rdflib parser and is then cleared.
    referer:      passed to urlopenForRDF().
    why:          passed to the parsers.
    topLevel:     if true, newTopLevelFormula() is called on the formula.

    Environment variables read: CWM_RDFLIB and CWM_RDF_PARSER (choice of
    RDF/XML parser), CWM_N3_PARSER (the value 'n3p' selects n3p_tm).

    Returns:  top-level formula of the parsed document.
    Raises:   DocumentAccessError when fetching or reading fails with
              IOError/OSError; exceptions from the parsers are propagated.

    This is an independent function, as it is fairly independent
    of the store. However, it is natural to call it as a method on the store.
    And a proliferation of APIs confuses.
    """
#    if referer is None:
#        raise RuntimeError("We are trying to force things to include a referer header")
    # Bound before the try so the handler below can always report an
    # address, even if uripath.base() itself fails.
    addr = uri
    try:
        baseURI = uripath.base()
        if uri is not None:
            addr = uripath.join(baseURI, uri)  # Make abs from relative
            if diag.chatty_flag > 40:
                progress("Taking input from " + addr)
            netStream = urlopenForRDF(addr, referer)
            if diag.chatty_flag > 60:
                progress(" Headers for %s: %s\n"
                         % (addr, netStream.headers.items()))
            receivedContentType = netStream.headers.get(
                HTTP_Content_Type, None)
        else:
            if diag.chatty_flag > 40:
                progress("Taking input from standard input")
            addr = uripath.join(baseURI, "STDIN")  # Make abs from relative
            netStream = sys.stdin
            receivedContentType = None

        # @@How to get at all headers??
        # @@ Get sensible net errors and produce dignostics

        guess = None
        if receivedContentType:
            if diag.chatty_flag > 9:
                progress("Recieved Content-type: "
                         + repr(receivedContentType) + " for " + addr)
            if 'xml' in receivedContentType or (
                    'rdf' in receivedContentType
                    and 'n3' not in receivedContentType):
                guess = "application/rdf+xml"
            elif 'n3' in receivedContentType:
                guess = "text/rdf+n3"
        if guess is None and contentType:
            if diag.chatty_flag > 9:
                progress("Given Content-type: "
                         + repr(contentType) + " for " + addr)
            if 'xml' in contentType or (
                    'rdf' in contentType and 'n3' not in contentType):
                guess = "application/rdf+xml"
            elif 'n3' in contentType:
                guess = "text/rdf+n3"
            # The original tested the raw result of find('rq'), which is -1
            # (true) when 'rq' is absent, so nearly every unrecognised hint
            # was taken to be SPARQL.  Test for containment instead.
            elif 'sparql' in contentType or 'rq' in contentType:
                guess = "x-application/sparql"
        buffer = netStream.read()
        if guess is None:
            # can't be XML if it starts with these...
            if buffer.startswith("#") or buffer.startswith("@prefix"):
                guess = 'text/rdf+n3'
            elif buffer.startswith('PREFIX') or buffer.startswith('BASE'):
                guess = "x-application/sparql"
            elif 'xmlns="' in buffer or 'xmlns:' in buffer:
                guess = 'application/rdf+xml'
            else:
                guess = 'text/rdf+n3'
            if diag.chatty_flag > 9:
                progress("Guessed ContentType:" + guess)
    except (IOError, OSError):
        raise DocumentAccessError(addr, sys.exc_info())

    if asIfFrom is None:
        asIfFrom = addr
    if openFormula is not None:
        F = openFormula
    else:
        F = store.newFormula()
    if topLevel:
        newTopLevelFormula(F)
    import os
    if guess == "x-application/sparql":
        if diag.chatty_flag > 49:
            progress("Parsing as SPARQL")
        from sparql import sparql_parser
        import sparql2cwm
        convertor = sparql2cwm.FromSparql(store, F, why=why)
        import StringIO
        p = sparql_parser.N3Parser(StringIO.StringIO(buffer),
                                   sparql_parser.branches, convertor)
        F = p.parse(sparql_parser.start).close()
    elif guess == 'application/rdf+xml':
        if diag.chatty_flag > 49:
            progress("Parsing as RDF")
        if flags == 'rdflib' or int(os.environ.get("CWM_RDFLIB", 0)):
            parser = 'rdflib'
            flags = ''
        else:
            parser = os.environ.get("CWM_RDF_PARSER", "sax2rdf")
        import rdfxml
        p = rdfxml.rdfxmlparser(store, F, thisDoc=asIfFrom, flags=flags,
                                parser=parser, why=why)
        p.feed(buffer)
        F = p.close()
    else:
        assert guess == 'text/rdf+n3'
        if diag.chatty_flag > 49:
            progress("Parsing as N3")
        if os.environ.get("CWM_N3_PARSER", 0) == 'n3p':
            import n3p_tm
            import triple_maker
            tm = triple_maker.TripleMaker(formula=F, store=store)
            p = n3p_tm.n3p_tm(asIfFrom, tm)
        else:
            p = notation3.SinkParser(store, F, thisDoc=asIfFrom,
                                     flags=flags, why=why)

        try:
            p.startDoc()
            p.feed(buffer)
            p.endDoc()
        except:
            # Report and re-raise whatever the parser threw.  The
            # parentheses matter: '%' binds tighter than 'or', so without
            # them the buffer fallback for stdin input was never used.
            progress("Failed to parse %s" % (uri or buffer))
            raise

    # NOTE(review): this is a truthiness test, unlike the "is not None" test
    # used above to pick F; if a formula object can be false (e.g. when it
    # is empty) a caller-supplied formula would be closed here -- confirm.
    if not openFormula:
        F = F.close()
    return F