def crawl(site, trm , depth, linksfile): from java.net import URL from org.w3c.tidy import Tidy pattern = re.compile('href="/wiki/(.*?)"') f = open(linksfile, 'a+') #try: if depth < MAX_DEPTH: print 'crawling [%s]...' % trm, print >> f, '[%s]' % trm td = Tidy() td.setXmlOut(1) u = URL(site + trm) input = BufferedInputStream(u.openStream()) output = ByteArrayOutputStream() #tidy.setInputEncoding("UTF8") #tidy.setOutputEncoding("UTF8") td.parse(input, output) content = output.toString() hits = pattern.findall(content) for hit in hits: if hit.find(":") == -1: print >> f, hit print 'done.' print >> f, '' for hit in hits: if hit.find(":") == -1: crawl(site, hit, depth + 1, linksfile) #except: # print "wrong" f.close()
def __tidy(self, content):
    """Run *content* through JTidy and return clean XHTML.

    The output is body-only XHTML with numeric entities, no indentation,
    and no line wrapping, decoded back to a UTF-8 string.
    """
    cleaner = Tidy()
    # Output shape: body-only XHTML, numeric entities, no reflow.
    cleaner.setXHTML(True)
    cleaner.setPrintBodyOnly(True)
    cleaner.setNumEntities(True)
    cleaner.setWraplen(0)
    # Disable every indentation mode so the markup comes back flat.
    cleaner.setIndentAttributes(False)
    cleaner.setIndentContent(False)
    cleaner.setSmartIndent(False)
    # Keep the console clean: no warnings, no progress chatter.
    cleaner.setShowWarnings(False)
    cleaner.setQuiet(True)
    sink = ByteArrayOutputStream()
    cleaner.parse(IOUtils.toInputStream(content, "UTF-8"), sink)
    return sink.toString("UTF-8")