コード例 #1
0
    def testDistillTxt(self):
        """ Distill the plaintext.mlog fixture and check the written output.

            distillTxt is expected to return 0, and
            patterns_tester.checkPatterns is expected to return a false value
            for the listed phrases when run over the distilled buffer.
        """
        self.fp = rspreader.openlog(testdir + 'plaintext.mlog')
        result = distillML.distillTxt(self.fp, self.buf, {})
        self.assertEqual(0, result)

        # check content; rewind first so the checker reads the buffer from the start
        self.buf.seek(0)
        p = patterns_tester.checkPatterns(self.buf, ['Copyright', 'All rights reserved.', 'OF SUCH DAMAGE.'])
        # assertFalse is the supported spelling of the deprecated assert_(not ...)
        self.assertFalse(p, 'unexpected: %s' % p)
コード例 #2
0
    def testDistillTxt(self):
        """ Distill the plaintext.mlog fixture and check the written output.

            distillTxt is expected to return 0, and
            patterns_tester.checkStrings is expected to return a false value
            for the listed phrases when run over the distilled text.
        """
        self.fp = rspreader.openlog(testpath / "plaintext.mlog")
        result = distillML.distillTxt(self.fp, self.buf, {})
        self.assertEqual(0, result)

        # check content; rewind first so read() returns the whole buffer
        self.buf.seek(0)
        p = patterns_tester.checkStrings(self.buf.read(), ["Copyright", "All rights reserved.", "OF SUCH DAMAGE."])
        # assertFalse is the supported spelling of the deprecated assert_(not ...)
        self.assertFalse(p, "unexpected: %s" % p)
コード例 #3
0
    def transformDoc(self, inpath, outpath):
        """ Parse a message log file, filter out unwanted documents and
            transform the rest.

            The file named by outpath is only left in place when the
            transformation succeeds: it is removed again when the distiller
            returns a non-zero result or raises an exception.

            @param inpath path of the message log to read.
            @param outpath path the distilled document is written to.
            @return whether the document is transformed.
        """

        mtime = os.path.getmtime(inpath)
        dt = datetime.datetime.utcfromtimestamp(mtime)
        timestamp = _formatTimestamp(dt)

        rfile = open(inpath, 'rb')
        try:
            minfo = messagelog.MessageInfo.parseMessageLog(rfile)
            if minfo.discard:
                # these should be filtered in the logging phase, but double
                # check here, perhaps for logs collected from other sources.
                log.info('discard %s %s - %s', os.path.split(inpath)[1], minfo.flags, minfo.req_path)
                return False

            meta = _extract_meta(minfo, timestamp)

            # simple filtering: keep only 2xx responses of type html or txt
            if (minfo.status < 200) or (300 <= minfo.status):
                return False
            if minfo.ctype != 'html' and minfo.ctype != 'txt':
                return False

            # rewind so the content reader starts from the beginning of the log
            rfile.seek(0)
            contentFp = rspreader.ContentReader(rfile, inpath)

            discard = False
            distilled = False       # stays False if the distiller raises
            wfile = open(outpath, 'wb')
            try:
                try:
                    if minfo.ctype == 'html':
                        result = distillML.distill(contentFp, wfile, meta=meta)
                    else:
                        result = distillML.distillTxt(contentFp, wfile, meta=meta)
                finally:
                    wfile.close()
                distilled = True
            finally:
                if not distilled:
                    # An exception is propagating: do not leave a partially
                    # written file behind.
                    self._removePartialOutput(outpath)

            if result != 0:
                log.info('discard %s %s - %s', os.path.split(inpath)[1], str(result), minfo.req_path)
                discard = True

        finally:
            rfile.close()

        if discard:
            os.remove(outpath)      # remove unwanted output
            return False

        filename = os.path.split(outpath)[1]
        log.debug('transformed %s (%s) - %s', filename, meta.get('encoding','?'), minfo.req_path)
        return True

    def _removePartialOutput(self, outpath):
        """ Best-effort removal of an incomplete output file.

            Used while another exception is propagating, so a failure to
            remove the file is only logged and never raised.
        """
        try:
            os.remove(outpath)
        except OSError:
            log.debug('unable to remove partial output %s', outpath)
コード例 #4
0
    def test_big5_txt(self):
        """ Distill a text file declared as big5 in its content-type.

            Expects distillTxt to return 0, to record 'big5 [HTTP]' under
            meta['encoding'], and the expected content string to be found in
            the utf8-decoded output at an index greater than 0.
        """

        # open() is the documented way to open a file; file() is Python 2 only
        self.fp = open(testdir + 'ah_ying.txt', 'rb')
        # title is only referenced by the disabled assertion below
        title, content = self.test_data[19:21]

        self.meta['content-type'] = 'text/plain; charset=big5'

        result = distillML.distillTxt(self.fp, self.buf, self.meta)
        self.assertEqual(0, result)
        self.assertEqual(self.meta['encoding'], 'big5 [HTTP]')
        #self.assertEqual(self.meta['title'], title)

        s = self.buf.getvalue().decode('utf8')
        # assertTrue is the supported spelling of the deprecated assert_
        self.assertTrue(s.find(content) > 0)
コード例 #5
0
    def test_big5_txt(self):
        """ Distill a text file declared as big5 in its content-type.

            Expects distillTxt to return 0, to record "big5 [HTTP]" under
            meta["encoding"], and the expected content string to be found in
            the utf8-decoded output at an index greater than 0.
        """

        # open() is the documented way to open a file; file() is Python 2 only
        self.fp = open(testpath / "ah_ying.txt", "rb")
        # title is only referenced by the disabled assertion below
        title, content = self.test_data[19:21]

        self.meta["content-type"] = "text/plain; charset=big5"

        result = distillML.distillTxt(self.fp, self.buf, self.meta)
        self.assertEqual(0, result)
        self.assertEqual(self.meta["encoding"], "big5 [HTTP]")
        # self.assertEqual(self.meta['title'], title)

        s = self.buf.getvalue().decode("utf8")
        # assertTrue is the supported spelling of the deprecated assert_
        self.assertTrue(s.find(content) > 0)
コード例 #6
0
 def testMagicFilteredTxt(self):
    """ A log with the wrong media type text/plain must be filtered.

        distillTxt is expected to return the pair
        (distillML.NON_HTML, 'image/vnd.microsoft.icon') for this fixture.
    """
    logname = 'favicon.ico_text(nutch).mlog'
    self.fp = rspreader.openlog(testdir + logname)
    outcome = distillML.distillTxt(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, 'image/vnd.microsoft.icon')
    self.assertEqual(expected, outcome)
コード例 #7
0
 def testDomainFilteredTxt(self):
     """ A document whose uri is on googlesyndication.com must be filtered.

         With an empty input stream and that uri in the meta, distillTxt is
         expected to return (distillML.EXDOMAIN, '.googlesyndication.com').
     """
     self.fp = StringIO.StringIO()
     meta = {'uri':'http://x.googlesyndication.com/'}
     outcome = distillML.distillTxt(self.fp, self.buf, meta)
     expected = (distillML.EXDOMAIN, '.googlesyndication.com')
     self.assertEqual(expected, outcome)