def testDistill(self):
        # Distill basictags.html (a sample using every supported tag) and check
        # the text written to self.buf: which markup is removed, which is kept,
        # and how form controls, images and entities are rendered.

        self.fp = rspreader.openlog(testpath / "basictags.html")  # have all tags supported
        result = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(0, result)

        s = self.buf.getvalue()

        # these tags should be filtered
        filtered = (
            "<html",
            "<head",
            "<title",
            "<body",
            "<font",
            "<b>",
            "<em",
            "<pre>",
            "<blockquote>",
            "<div",
            "<span",
            "<table",
            "<tr",
            "<td",
            "<form",
            "<img",
            "<a",
            "</html>",
        )
        for tag in filtered:
            self.assertNotIn(tag, s)

        # these tags should be present; a membership test is used because a
        # `find(...) > 0` check would wrongly reject a match at offset 0
        kept = (
            "<h1>",
            "<h2>",
            "<h3>",
            "<h4>",
            "<h5>",
            "<h6>",
            "<p>",
            "<ul>",
            "<ol>",
            "<li>",
            "<br>",
            "<hr>",
        )
        for tag in kept:
            self.assertIn(tag, s)

        # these are some other transformed data
        transformed = (
            "h1-Sample HTML",
            "[fill your name]",  # <form>
            "[*]",
            "[ ]",
            "(*)",
            "( )",
            "[***]",
            "Lorem",  # <textarea>
            "[button]",
            "[submit]",
            "[reset]",
            "[go]",
            "[a picture]",  # <img>
            u'<&amp;,&lt;, ,",&gt;>',  # entities
        )
        for text in transformed:
            self.assertIn(text, s)
    def testDistill(self):
        # Distill basictags.html (a sample using every supported tag) and check
        # the text written to self.buf: which markup is removed, which is kept,
        # and how form controls, images and entities are rendered.

        self.fp = rspreader.openlog(testdir + 'basictags.html')  # have all tags supported
        result = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(0, result)

        s = self.buf.getvalue()

        # these tags should be filtered
        filtered = (
            '<html',
            '<head',
            '<title',
            '<body',
            '<font',
            '<b>',
            '<em',
            '<pre>',
            '<blockquote>',
            '<div',
            '<span',
            '<table',
            '<tr',
            '<td',
            '<form',
            '<img',
            '<a',
            '</html>',
        )
        for tag in filtered:
            self.assertNotIn(tag, s)

        # these tags should be present; a membership test is used because a
        # `find(...) > 0` check would wrongly reject a match at offset 0
        kept = (
            '<h1>',
            '<h2>',
            '<h3>',
            '<h4>',
            '<h5>',
            '<h6>',
            '<p>',
            '<ul>',
            '<ol>',
            '<li>',
            '<br>',
            '<hr>',
        )
        for tag in kept:
            self.assertIn(tag, s)

        # these are some other transformed data
        transformed = (
            'h1-Sample HTML',
            '[fill your name]',         # <form>
            '[*]',
            '[ ]',
            '(*)',
            '( )',
            '[***]',
            'Lorem',                    # <textarea>
            '[button]',
            '[submit]',
            '[reset]',
            '[go]',
            '[a picture]',              # <img>
            u'<&amp;,&lt;, ,",&gt;>',   # entities
        )
        for text in transformed:
            self.assertIn(text, s)
    def testDistillTxt(self):
        # Distill a plain-text message log with distillTxt(); status 0 is expected.
        self.fp = rspreader.openlog(testdir + 'plaintext.mlog')
        result = distillML.distillTxt(self.fp, self.buf, {})
        self.assertEqual(0, result)

        # check content: rewind the output buffer and look for the expected
        # phrases; the test treats any truthy return from checkPatterns as a failure
        self.buf.seek(0)
        p = patterns_tester.checkPatterns(self.buf, ['Copyright', 'All rights reserved.', 'OF SUCH DAMAGE.'])
        self.assertFalse(p, 'unexpected: %s' % p)
    def testDistillTxt(self):
        # Distill a plain-text message log with distillTxt(); status 0 is expected.
        self.fp = rspreader.openlog(testpath / "plaintext.mlog")
        result = distillML.distillTxt(self.fp, self.buf, {})
        self.assertEqual(0, result)

        # check content: rewind the output buffer and look for the expected
        # phrases; the test treats any truthy return from checkStrings as a failure
        self.buf.seek(0)
        p = patterns_tester.checkStrings(self.buf.read(), ["Copyright", "All rights reserved.", "OF SUCH DAMAGE."])
        self.assertFalse(p, "unexpected: %s" % p)
    def testMeta(self):

        # Check basic meta data parsing: distill() fills the dict passed as its
        # third argument with the document's title, description and keywords.

        self.fp = rspreader.openlog(testdir + 'basictags.html')
        meta = {}
        result = distillML.distill(self.fp, self.buf, meta)
        # Check the status first so a failed distill is reported as such rather
        # than as a KeyError on the lookups below.
        self.assertEqual(0, result)
        self.assertEqual(u'Basic HTML Sample Document', meta['title'])
        self.assertEqual(u'Description: this sample contains all basic HTML tags the converter understands', meta['description'])
        self.assertEqual(u'basic HTML, sample', meta['keywords'])
        # exactly four entries are expected: the three above plus one more
        self.assertEqual(4, len(meta))
    def testMetaVariations(self):

        # See meta_variations.html for variations of attributes formatting

        self.fp = rspreader.openlog(testdir + 'meta_variations.html')
        meta = {}
        distillML.distill(self.fp, self.buf, meta)
        self.assertEqual(u'word1 word2 word3', meta['title'])            # title span multiple lines
        self.assertEqual(u'word1 & word2 <word3>', meta['description'])  # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
        # `in`-based check: dict.has_key() does not exist on Python 3
        self.assertNotIn('keywords', meta)
        self.assertEqual(3, len(meta))
    def testParserError(self):
        # A log containing a malformed comment declaration must make distill()
        # return a result whose first element is distillML.PARSE_ERROR.

        PROBLEM_LINE = '<! -- this is bad -->'

        self.fp = rspreader.openlog(testdir + 'malformed_html.mlog')
        s = self.fp.read(1024)
        # make sure the PROBLEM_LINE is in the test data (within the first 1024
        # units read); membership also accepts a match at offset 0
        self.assertIn(PROBLEM_LINE, s)
        self.fp.seek(0)

        result = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(distillML.PARSE_ERROR, result[0])
    def testMetaVariations(self):

        # See meta_variations.html for variations of attributes formatting

        self.fp = rspreader.openlog(testpath / "meta_variations.html")
        meta = {}
        distillML.distill(self.fp, self.buf, meta)
        self.assertEqual(u"word1 word2 word3", meta["title"])  # title span multiple lines
        self.assertEqual(
            u"word1 & word2 <word3>", meta["description"]
        )  # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
        # `in`-based check: dict.has_key() does not exist on Python 3
        self.assertNotIn("keywords", meta)
        self.assertEqual(3, len(meta))
    def testMeta(self):

        # Check basic meta data parsing: distill() fills the dict passed as its
        # third argument with the document's title, description and keywords.

        self.fp = rspreader.openlog(testpath / "basictags.html")
        meta = {}
        result = distillML.distill(self.fp, self.buf, meta)
        # Check the status first so a failed distill is reported as such rather
        # than as a KeyError on the lookups below.
        self.assertEqual(0, result)
        self.assertEqual(u"Basic HTML Sample Document", meta["title"])
        self.assertEqual(
            u"Description: this sample contains all basic HTML tags the converter understands", meta["description"]
        )
        self.assertEqual(u"basic HTML, sample", meta["keywords"])
        # exactly four entries are expected: the three above plus one more
        self.assertEqual(4, len(meta))
def test_distill(fp, wfile, meta):
    """Distill ``fp`` into ``wfile``, seeding ``meta`` from a message log if possible.

    ``fp`` is first parsed with ``messagelog.MessageInfo.parseMessageLog``.
    If that succeeds, ``meta`` is cleared and refilled from the parsed
    ``rsp_headers``.  If it raises, ``meta`` is left untouched and the input
    is assumed to be a plain HTML file rather than a message log.  In both
    cases the stream is rewound, wrapped with ``rspreader.openlog`` and
    passed to ``distill``; that call's return value is returned unchanged.
    """

    # build meta from rsp_header
    try:
        minfo = messagelog.MessageInfo.parseMessageLog(fp)
    except Exception:
        # Best effort: assume it is html file (not mlog).  Only ordinary
        # errors are swallowed; KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        meta.clear()
        meta.update(minfo.rsp_headers)

    # read content from the start, whatever the parse attempt consumed
    fp.seek(0)
    content = rspreader.openlog(fp)

    return distill(content, wfile, meta)
    def testJavascript(self):
        # Each row pairs a fixture under js/ with the exact value distill() is
        # expected to return for it.  The first four come back tagged
        # distillML.JS with the listed snippet; the two small files come back
        # as (distillML.NON_HTML, 'unknown').  Rows run in order.
        cases = [
            ('js/doc_write_html.js', (distillML.JS, u'document.write(')),
            ('js/function.js', (distillML.JS, u'function YADopenWindow(x){')),
            ('js/ibHtml1=.js', (distillML.JS, u'ibHtml1="')),
            ('js/var_with_html.js', (distillML.JS, u'var pophtml =')),
            ('js/small1.js', (distillML.NON_HTML, 'unknown')),
            ('js/small2.js', (distillML.NON_HTML, 'unknown')),
        ]
        for js_file, expected in cases:
            self.fp = rspreader.openlog(testdir + js_file)
            outcome = distillML.distill(self.fp, self.buf, {})
            self.assertEqual(expected, outcome)
    def testJavascript(self):
        # Each row pairs a fixture under js/ with the exact value distill() is
        # expected to return for it.  The first four come back tagged
        # distillML.JS with the listed snippet; the two small files come back
        # as (distillML.NON_HTML, "unknown").  Rows run in order.
        cases = [
            ("js/doc_write_html.js", (distillML.JS, u"document.write(")),
            ("js/function.js", (distillML.JS, u"function YADopenWindow(x){")),
            ("js/ibHtml1=.js", (distillML.JS, u'ibHtml1="')),
            ("js/var_with_html.js", (distillML.JS, u"var pophtml =")),
            ("js/small1.js", (distillML.NON_HTML, "unknown")),
            ("js/small2.js", (distillML.NON_HTML, "unknown")),
        ]
        for js_file, expected in cases:
            self.fp = rspreader.openlog(testpath / js_file)
            outcome = distillML.distill(self.fp, self.buf, {})
            self.assertEqual(expected, outcome)
 def testLowvisible(self):
    # The doubleclick sample log must be classified LOWVISIBLE: only the
    # first element of distill()'s return value is compared.
    log_name = 'lowvisible(doubleclick).mlog'
    self.fp = rspreader.openlog(testdir + log_name)
    outcome = distillML.distill(self.fp, self.buf, {})
    status = outcome[0]
    self.assertEqual(distillML.LOWVISIBLE, status)
 def testCSS(self):
     # A stylesheet is not HTML: distill() must return exactly
     # (distillML.NON_HTML, 'unknown') for main.css.
     css_file = testdir + 'main.css'
     self.fp = rspreader.openlog(css_file)
     outcome = distillML.distill(self.fp, self.buf, {})
     expected = (distillML.NON_HTML, 'unknown')
     self.assertEqual(expected, outcome)
 def testFrameset(self):
     # frameset.html must be reported as FRAMESET: only the first element
     # of distill()'s return value is compared.
     page = testdir + 'frameset.html'
     self.fp = rspreader.openlog(page)
     outcome = distillML.distill(self.fp, self.buf, {})
     self.assertEqual(distillML.FRAMESET, outcome[0])
 def testMagicFilteredTxt(self):
    """ Wrong media type text/plain """
    # The fixture is a favicon logged as text; distillTxt() must return
    # exactly (distillML.NON_HTML, 'image/vnd.microsoft.icon') for it.
    log_name = 'favicon.ico_text(nutch).mlog'
    self.fp = rspreader.openlog(testdir + log_name)
    outcome = distillML.distillTxt(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, 'image/vnd.microsoft.icon')
    self.assertEqual(expected, outcome)
 def testMagicFiltered(self):
     # A logged GIF must be rejected: distill() returns exactly
     # (distillML.NON_HTML, 'image/gif') for gif.qlog.
     log_name = testdir + 'gif.qlog'
     self.fp = rspreader.openlog(log_name)
     outcome = distillML.distill(self.fp, self.buf, {})
     expected = (distillML.NON_HTML, 'image/gif')
     self.assertEqual(expected, outcome)
 def testMagicFiltered(self):
     # A logged GIF must be rejected: distill() returns exactly
     # (distillML.NON_HTML, "image/gif") for gif.qlog.
     log_path = testpath / "gif.qlog"
     self.fp = rspreader.openlog(log_path)
     outcome = distillML.distill(self.fp, self.buf, {})
     expected = (distillML.NON_HTML, "image/gif")
     self.assertEqual(expected, outcome)
 def testCSS(self):
     # A stylesheet is not HTML: distill() must return exactly
     # (distillML.NON_HTML, "unknown") for main.css.
     css_path = testpath / "main.css"
     self.fp = rspreader.openlog(css_path)
     outcome = distillML.distill(self.fp, self.buf, {})
     expected = (distillML.NON_HTML, "unknown")
     self.assertEqual(expected, outcome)