Exemple #1
0
    def crawl(site, trm , depth, linksfile):
        from java.net import URL
        from org.w3c.tidy import Tidy
        pattern = re.compile('href="/wiki/(.*?)"')
        f = open(linksfile, 'a+')
        #try:
        if depth < MAX_DEPTH:
            print 'crawling [%s]...' % trm,
            print >> f, '[%s]' % trm

            td = Tidy()
            td.setXmlOut(1)

            u = URL(site + trm)

            input = BufferedInputStream(u.openStream())
            output = ByteArrayOutputStream()
            #tidy.setInputEncoding("UTF8")
            #tidy.setOutputEncoding("UTF8")

            td.parse(input, output)
            content = output.toString()
            hits = pattern.findall(content)

            for hit in hits:
                if hit.find(":") == -1:
                    print >> f, hit
            print 'done.'
            print >> f, ''
            for hit in hits:
                if hit.find(":") == -1:
                    crawl(site, hit, depth + 1, linksfile)
        #except:
        #    print "wrong"
        f.close()
Exemple #2
0
 def __tidy(self, content):
     """Run ``content`` through JTidy and return the tidied text.

     ``content`` is handed to Tidy as a UTF-8 input stream and Tidy's
     output bytes are decoded as UTF-8 for the return value.
     """
     cleaner = Tidy()

     # Output form: XHTML, body only, numeric entities.
     cleaner.setXHTML(True)
     cleaner.setPrintBodyOnly(True)
     cleaner.setNumEntities(True)

     # Layout: indentation options off, wrap length 0.
     cleaner.setWraplen(0)
     cleaner.setSmartIndent(False)
     cleaner.setIndentContent(False)
     cleaner.setIndentAttributes(False)

     # Diagnostics: quiet mode, warnings not shown.
     cleaner.setQuiet(True)
     cleaner.setShowWarnings(False)

     source = IOUtils.toInputStream(content, "UTF-8")
     sink = ByteArrayOutputStream()
     cleaner.parse(source, sink)
     return sink.toString("UTF-8")
Exemple #3
0
def cleanData(data):
    """Tidy ``data`` with JTidy and return the output as a string.

    ``data`` is encoded to UTF-8 bytes, parsed with UTF-8 configured as both
    the input and output encoding, and the bytes Tidy writes are decoded as
    UTF-8 for the return value.
    """
    from org.w3c.tidy import Tidy

    # Wrap the text in the Java streams Tidy reads from and writes to.
    raw = String(data).getBytes("UTF-8")
    source = ByteArrayInputStream(raw)
    sink = ByteArrayOutputStream()

    cleaner = Tidy()

    # Encoding on both sides of the parser.
    cleaner.setOutputEncoding("UTF-8")
    cleaner.setInputEncoding("UTF-8")

    # Output form.
    cleaner.setXHTML(True)
    cleaner.setPrintBodyOnly(True)
    cleaner.setQuoteAmpersand(True)

    # Clean-up options that are switched off.
    cleaner.setTrimEmptyElements(False)
    cleaner.setDropEmptyParas(False)
    cleaner.setMakeClean(False)

    # Only the serialized output is used; the DOM that parseDOM returns is
    # discarded.
    cleaner.parseDOM(source, sink)

    return sink.toString("UTF-8")
Exemple #4
0
 def __getContent(self, oid):
     slash = oid.rfind("/")
     pid = os.path.splitext(oid[slash+1:])[0] + ".htm"
     payload = Services.storage.getObject(oid).getPayload(pid)
     tidy = Tidy()
     tidy.setIndentAttributes(False)
     tidy.setIndentContent(False)
     tidy.setPrintBodyOnly(True)
     tidy.setSmartIndent(False)
     tidy.setWraplen(0)
     tidy.setXHTML(False)
     tidy.setNumEntities(True)
     out = ByteArrayOutputStream()
     try:
         doc = tidy.parseDOM(payload.getInputStream(), out)
         content = out.toString("UTF-8")
         content = self.__processMedia(oid, doc, content)
         #print "[\n%s\n]" % content
     except Exception, e:
         print " * blog.py: Failed to get content: %s" % e.getMessage()
Exemple #5
0
 def __tidy(self, content):
     """Tidy ``content`` and return ``(tidied_text, dom)``.

     ``tidied_text`` is Tidy's output decoded as UTF-8 and ``dom`` is the
     value returned by ``Tidy.parseDOM`` for the same input.
     """
     cleaner = Tidy()

     # Output form: not XHTML, body only, numeric entities.
     cleaner.setXHTML(False)
     cleaner.setPrintBodyOnly(True)
     cleaner.setNumEntities(True)

     # Layout: indentation options off, wrap length 0.
     cleaner.setWraplen(0)
     cleaner.setSmartIndent(False)
     cleaner.setIndentContent(False)
     cleaner.setIndentAttributes(False)

     # NOTE(review): getBytes() without a charset encodes with the platform
     # default, while the output below is decoded as UTF-8 -- confirm this
     # asymmetry is intended.
     raw = String(content).getBytes()
     sink = ByteArrayOutputStream()
     dom = cleaner.parseDOM(ByteArrayInputStream(raw), sink)
     return sink.toString("UTF-8"), dom