def doCommand():
    """Command line RDF/N3 crawler

    crawl <uriref>

    options:
        See http://www.w3.org/2000/10/swap/doc/cwm for more documentation.
    """
    global agenda
    global already
    uriref = sys.argv[1]
    uri = join(base(), uriref)
    r = symbol(uri)

    diag.setVerbosity(0)
    print "@prefix : <http://www.w3.org/2000/10/swap/util/semweb#>."
    print "# Generated by crawl.py ", cvsRevision[1:-1]

    agenda = [r]
    while agenda != []:
        r = agenda[0]
        agenda = agenda[1:]
        already.append(r)
        crawl(r)
    print "# ", len(already), "attempts,", successes, "successes."
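# Example invocation per the docstring above (the target URI is
# illustrative only; any dereferenceable document URI should do):
#
#   crawl http://www.w3.org/2000/10/swap/util/semweb
#
# Output is N3 on stdout, opening with the semweb# @prefix declared
# above and closing with the attempts/successes summary line.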
def crawlFrom(self, addr, prefix, max):
    fmla = self._fmla
    iter = 1
    queue = [addr]
    seen = []
    while queue:
        head = queue.pop()
        progress("crawling at: ", head, " iter ", iter, " of ", max)
        iter = iter + 1
        if iter > max:
            progress("max limit reached.")
            break
        seen.append(head)

        try:
            rep = urllib2.urlopen(head)
            content = rep.read()
        except IOError:
            progress("can't GET", head)
            continue  #@@ makeStatement(head type NoGood)

        # Try to find a short label, for a diagram or some such:
        # the last path segment, or the 2nd last in case of an
        # empty last segment...
        slash = head[:-1].rfind('/')
        label = head[slash + 1:]

        ct = rep.info().getheader('content-type')
        progress("... got content of type ", ct)
        isHTML = ct.find('text/html') == 0
        fmla.add(symbol(head), symbol(DC('type')), literal(ct))

        # Note that we're not peeking into the URI to find out if it's
        # HTML; we only elide the extension when we know (from the
        # HTTP headers) that it's HTML.
        if isHTML and label[-5:] == '.html':
            label = label[:-5]
        fmla.add(symbol(head), symbol(RDFS('label')), literal(label))

        if not isHTML:
            continue

        progress("... parsing text/html content")
        doc = libxml2.htmlParseDoc(content, 'us-ascii')

        try:
            titles = doc.xpathNewContext().xpathEval('//title')
            title = titles[0].getContent()
        except:  #@@ figure out the right exceptions
            pass
        else:
            progress("... found title:", title)
            fmla.add(symbol(head), symbol(DC('title')),
                     literal(str(title)))

        hrefs = doc.xpathNewContext().xpathEval('//a/@href')
        progress("... found ", len(hrefs), " links")
        for h in hrefs:
            h = h.getContent()
            progress("... found href", h)
            i = uripath.join(head, h)
            i = uripath.splitFrag(i)[0]
            progress("... found link", head, ' -> ', i)
            fmla.add(symbol(head), symbol(DC('relation')), symbol(i))
            if i[:len(prefix)] == prefix and i not in seen:
                queue.append(i)
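# A minimal usage sketch, assuming crawlFrom is a method of a crawler
# class that keeps an RDF formula in self._fmla (the class name
# `Crawler` and its constructor here are hypothetical):
#
#   c = Crawler(fmla)
#   c.crawlFrom('http://www.w3.org/2000/10/swap/util/',
#               'http://www.w3.org/2000/10/swap/', 100)
#
# The walk is depth-first (queue.pop() takes the most recently queued
# link), stays under `prefix`, stops after `max` fetches, and records
# dc:type, rdfs:label, dc:title, and dc:relation statements about each
# page in the formula.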