def crawl(site, trm , depth, linksfile): from java.net import URL from org.w3c.tidy import Tidy pattern = re.compile('href="/wiki/(.*?)"') f = open(linksfile, 'a+') #try: if depth < MAX_DEPTH: print 'crawling [%s]...' % trm, print >> f, '[%s]' % trm td = Tidy() td.setXmlOut(1) u = URL(site + trm) input = BufferedInputStream(u.openStream()) output = ByteArrayOutputStream() #tidy.setInputEncoding("UTF8") #tidy.setOutputEncoding("UTF8") td.parse(input, output) content = output.toString() hits = pattern.findall(content) for hit in hits: if hit.find(":") == -1: print >> f, hit print 'done.' print >> f, '' for hit in hits: if hit.find(":") == -1: crawl(site, hit, depth + 1, linksfile) #except: # print "wrong" f.close()
def __tidy(self, content):
    """Run ``content`` through Tidy and return what Tidy writes out.

    ``content`` is handed to Tidy as a UTF-8 input stream, and the bytes
    Tidy produces are decoded as UTF-8 for the return value.
    """
    cleaner = Tidy()

    # (setter, value) pairs, applied in the order listed.
    options = (
        (cleaner.setIndentAttributes, False),
        (cleaner.setIndentContent, False),
        (cleaner.setPrintBodyOnly, True),
        (cleaner.setSmartIndent, False),
        (cleaner.setWraplen, 0),
        (cleaner.setXHTML, True),
        (cleaner.setNumEntities, True),
        (cleaner.setShowWarnings, False),
        (cleaner.setQuiet, True),
    )
    for apply_option, value in options:
        apply_option(value)

    sink = ByteArrayOutputStream()
    source = IOUtils.toInputStream(content, "UTF-8")
    cleaner.parse(source, sink)
    return sink.toString("UTF-8")
def cleanData(data):
    """Parse ``data`` with Tidy and return the re-serialised markup.

    ``data`` is encoded to UTF-8 bytes and parsed with ``parseDOM``; the
    bytes Tidy writes to the output stream are decoded as UTF-8 and
    returned.  The DOM object returned by ``parseDOM`` is discarded.
    """
    from org.w3c.tidy import Tidy

    charset = "UTF-8"

    cleaner = Tidy()
    cleaner.setXHTML(True)
    # Tidy reads and writes with the same charset used for the byte
    # conversions below.
    cleaner.setInputEncoding(charset)
    cleaner.setOutputEncoding(charset)
    cleaner.setMakeClean(False)
    cleaner.setDropEmptyParas(False)
    cleaner.setPrintBodyOnly(True)
    cleaner.setQuoteAmpersand(True)
    cleaner.setTrimEmptyElements(False)

    encoded = String(data).getBytes(charset)
    source = ByteArrayInputStream(encoded)
    sink = ByteArrayOutputStream()
    cleaner.parseDOM(source, sink)
    return sink.toString(charset)
def __getContent(self, oid): slash = oid.rfind("/") pid = os.path.splitext(oid[slash+1:])[0] + ".htm" payload = Services.storage.getObject(oid).getPayload(pid) tidy = Tidy() tidy.setIndentAttributes(False) tidy.setIndentContent(False) tidy.setPrintBodyOnly(True) tidy.setSmartIndent(False) tidy.setWraplen(0) tidy.setXHTML(False) tidy.setNumEntities(True) out = ByteArrayOutputStream() try: doc = tidy.parseDOM(payload.getInputStream(), out) content = out.toString("UTF-8") content = self.__processMedia(oid, doc, content) #print "[\n%s\n]" % content except Exception, e: print " * blog.py: Failed to get content: %s" % e.getMessage()
def __tidy(self, content):
    """Parse ``content`` with Tidy; return ``(tidied_text, dom)``.

    The first element is the output Tidy wrote, decoded as UTF-8; the
    second is the object returned by ``parseDOM``.
    """
    cleaner = Tidy()

    # (setter, value) pairs, applied in the order listed.
    options = (
        (cleaner.setIndentAttributes, False),
        (cleaner.setIndentContent, False),
        (cleaner.setPrintBodyOnly, True),
        (cleaner.setSmartIndent, False),
        (cleaner.setWraplen, 0),
        (cleaner.setXHTML, False),
        (cleaner.setNumEntities, True),
    )
    for apply_option, value in options:
        apply_option(value)

    sink = ByteArrayOutputStream()
    # NOTE(review): getBytes() without a charset uses the JVM default
    # encoding, while the output below is decoded as UTF-8 -- confirm that
    # this asymmetry is intended.
    raw = String(content).getBytes()
    dom = cleaner.parseDOM(ByteArrayInputStream(raw), sink)
    tidied = sink.toString("UTF-8")
    return tidied, dom