def testDistill(self):
    """Distill basictags.html and check which markup survives.

    The distilled output must contain none of the structural/inline tags
    listed first, must still contain the heading, paragraph, list and break
    tags listed second, and must contain the transformed text for form
    controls, images and entities listed last.
    """
    # check distilling basic HTML with all tags supported.
    self.fp = rspreader.openlog(testpath / "basictags.html")

    # have all tags supported
    result = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(0, result)

    s = self.buf.getvalue()

    # these tags should be filtered
    for tag in (
        "<html",
        "<head",
        "<title",
        "<body",
        "<font",
        "<b>",
        "<em",
        "<pre>",
        "<blockquote>",
        "<div",
        "<span",
        "<table",
        "<tr",
        "<td",
        "<form",
        "<img",
        "<a",
        "</html>",
    ):
        self.assertNotIn(tag, s)

    # these tags should be present; membership is used rather than
    # ``find(...) > 0`` so a match at offset 0 is not reported as missing
    for tag in (
        "<h1>",
        "<h2>",
        "<h3>",
        "<h4>",
        "<h5>",
        "<h6>",
        "<p>",
        "<ul>",
        "<ol>",
        "<li>",
        "<br>",
        "<hr>",
    ):
        self.assertIn(tag, s)

    # these are some other transformed data
    for text in (
        "h1-Sample HTML",
        "[fill your name]",  # <form>
        "[*]",
        "[ ]",
        "(*)",
        "( )",
        "[***]",
        "Lorem",  # <textarea>
        "[button]",
        "[submit]",
        "[reset]",
        "[go]",
        "[a picture]",  # <img>
        u'<&,<, ,",>>',  # entities
    ):
        self.assertIn(text, s)
def testDistill(self):
    """Distill basictags.html and check which markup survives.

    The distilled output must contain none of the structural/inline tags
    listed first, must still contain the heading, paragraph, list and break
    tags listed second, and must contain the transformed text for form
    controls, images and entities listed last.
    """
    # check distilling basic HTML with all tags supported.
    self.fp = rspreader.openlog(testdir + 'basictags.html')

    # have all tags supported
    result = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(0, result)

    s = self.buf.getvalue()

    # these tags should be filtered
    for tag in (
        '<html',
        '<head',
        '<title',
        '<body',
        '<font',
        '<b>',
        '<em',
        '<pre>',
        '<blockquote>',
        '<div',
        '<span',
        '<table',
        '<tr',
        '<td',
        '<form',
        '<img',
        '<a',
        '</html>',
    ):
        self.assertNotIn(tag, s)

    # these tags should be present; membership is used rather than
    # ``find(...) > 0`` so a match at offset 0 is not reported as missing
    for tag in (
        '<h1>',
        '<h2>',
        '<h3>',
        '<h4>',
        '<h5>',
        '<h6>',
        '<p>',
        '<ul>',
        '<ol>',
        '<li>',
        '<br>',
        '<hr>',
    ):
        self.assertIn(tag, s)

    # these are some other transformed data
    for text in (
        'h1-Sample HTML',
        '[fill your name]',  # <form>
        '[*]',
        '[ ]',
        '(*)',
        '( )',
        '[***]',
        'Lorem',  # <textarea>
        '[button]',
        '[submit]',
        '[reset]',
        '[go]',
        '[a picture]',  # <img>
        u'<&,<, ,",>>',  # entities
    ):
        self.assertIn(text, s)
def testDistillTxt(self):
    """Distill plaintext.mlog with distillTxt and check the output content.

    The call must return 0, and ``patterns_tester.checkPatterns`` must
    return a falsy value for the three expected text fragments.
    """
    self.fp = rspreader.openlog(testdir + 'plaintext.mlog')
    result = distillML.distillTxt(self.fp, self.buf, {})
    self.assertEqual(0, result)

    # check content
    self.buf.seek(0)
    p = patterns_tester.checkPatterns(
        self.buf,
        ['Copyright', 'All rights reserved.', 'OF SUCH DAMAGE.'],
    )
    # assertFalse replaces the deprecated assert_(not ...) alias
    self.assertFalse(p, 'unexpected: %s' % p)
def testDistillTxt(self):
    """Distill plaintext.mlog with distillTxt and check the output content.

    The call must return 0, and ``patterns_tester.checkStrings`` must
    return a falsy value for the three expected text fragments.
    """
    self.fp = rspreader.openlog(testpath / "plaintext.mlog")
    result = distillML.distillTxt(self.fp, self.buf, {})
    self.assertEqual(0, result)

    # check content
    self.buf.seek(0)
    p = patterns_tester.checkStrings(
        self.buf.read(),
        ["Copyright", "All rights reserved.", "OF SUCH DAMAGE."],
    )
    # assertFalse replaces the deprecated assert_(not ...) alias
    self.assertFalse(p, "unexpected: %s" % p)
def testMeta(self):
    """Check basic meta data parsing on basictags.html.

    After distilling, ``meta`` must hold the expected title, description
    and keywords, and contain exactly four entries.
    """
    # Check basic meta data parsing
    self.fp = rspreader.openlog(testdir + 'basictags.html')
    meta = {}
    result = distillML.distill(self.fp, self.buf, meta)
    # the status was previously ignored; testDistill expects 0 for this
    # same fixture, so a failed distill is reported here directly
    self.assertEqual(0, result)

    self.assertEqual(u'Basic HTML Sample Document', meta['title'])
    self.assertEqual(
        u'Description: this sample contains all basic HTML tags the converter understands',
        meta['description'],
    )
    self.assertEqual(u'basic HTML, sample', meta['keywords'])
    self.assertEqual(4, len(meta))
def testMetaVariations(self):
    """Check meta data parsing against differently formatted attributes.

    After distilling meta_variations.html, ``meta`` must hold the expected
    title and description, must have no 'keywords' entry, and must contain
    exactly three entries.
    """
    # See meta_variations.html for variations of attributes formatting
    self.fp = rspreader.openlog(testdir + 'meta_variations.html')
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    # title span multiple lines
    self.assertEqual(u'word1 word2 word3', meta['title'])
    # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
    self.assertEqual(u'word1 & word2 <word3>', meta['description'])
    # membership test instead of dict.has_key(), which Python 3 removed
    self.assertNotIn('keywords', meta)
    self.assertEqual(3, len(meta))
def testParserError(self):
    """Distilling malformed_html.mlog must report PARSE_ERROR.

    The first 1024 characters of the fixture are checked for the offending
    line before the stream is rewound and distilled.
    """
    PROBLEM_LINE = '<! -- this is bad -->'

    self.fp = rspreader.openlog(testdir + 'malformed_html.mlog')
    s = self.fp.read(1024)
    # make sure the PROBLEM_LINE is in the test data
    self.assertIn(PROBLEM_LINE, s)

    self.fp.seek(0)
    result = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(distillML.PARSE_ERROR, result[0])
def testMetaVariations(self):
    """Check meta data parsing against differently formatted attributes.

    After distilling meta_variations.html, ``meta`` must hold the expected
    title and description, must have no "keywords" entry, and must contain
    exactly three entries.
    """
    # See meta_variations.html for variations of attributes formatting
    self.fp = rspreader.openlog(testpath / "meta_variations.html")
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    # title span multiple lines
    self.assertEqual(u"word1 word2 word3", meta["title"])
    # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
    self.assertEqual(u"word1 & word2 <word3>", meta["description"])
    # membership test instead of dict.has_key(), which Python 3 removed
    self.assertNotIn("keywords", meta)
    self.assertEqual(3, len(meta))
def testMeta(self):
    """Check basic meta data parsing on basictags.html.

    After distilling, ``meta`` must hold the expected title, description
    and keywords, and contain exactly four entries.
    """
    # Check basic meta data parsing
    self.fp = rspreader.openlog(testpath / "basictags.html")
    meta = {}
    result = distillML.distill(self.fp, self.buf, meta)
    # the status was previously ignored; testDistill expects 0 for this
    # same fixture, so a failed distill is reported here directly
    self.assertEqual(0, result)

    self.assertEqual(u"Basic HTML Sample Document", meta["title"])
    self.assertEqual(
        u"Description: this sample contains all basic HTML tags the converter understands",
        meta["description"],
    )
    self.assertEqual(u"basic HTML, sample", meta["keywords"])
    self.assertEqual(4, len(meta))
def test_distill(fp, wfile, meta):
    """Distill ``fp`` into ``wfile``, seeding ``meta`` from message-log headers.

    ``fp`` is first parsed as a message log. On success ``meta`` is cleared
    and refilled from the parsed ``rsp_headers``. If parsing raises, the
    input is assumed to be a plain HTML file and ``meta`` is left untouched.
    In both cases ``fp`` is rewound, reopened through ``rspreader.openlog``
    and passed to ``distill``, whose result is returned.
    """
    # build meta from rsp_header
    try:
        minfo = messagelog.MessageInfo.parseMessageLog(fp)
    except Exception:
        # assume it is html file (not mlog). Exception is caught instead of
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        meta.clear()
        meta.update(minfo.rsp_headers)

    # read content
    fp.seek(0)
    fp = rspreader.openlog(fp)
    return distill(fp, wfile, meta)
def testJavascript(self):
    """Each js/ fixture must distill to its expected result tuple."""
    # (fixture name relative to testdir, expected distill() result)
    cases = [
        ('js/doc_write_html.js', (distillML.JS, u'document.write(')),
        ('js/function.js', (distillML.JS, u'function YADopenWindow(x){')),
        ('js/ibHtml1=.js', (distillML.JS, u'ibHtml1="')),
        ('js/var_with_html.js', (distillML.JS, u'var pophtml =')),
        ('js/small1.js', (distillML.NON_HTML, 'unknown')),
        ('js/small2.js', (distillML.NON_HTML, 'unknown')),
    ]
    for fixture, expected in cases:
        self.fp = rspreader.openlog(testdir + fixture)
        outcome = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(expected, outcome)
def testJavascript(self):
    """Each js/ fixture must distill to its expected result tuple."""
    # (fixture name relative to testpath, expected distill() result)
    cases = [
        ("js/doc_write_html.js", (distillML.JS, u"document.write(")),
        ("js/function.js", (distillML.JS, u"function YADopenWindow(x){")),
        ("js/ibHtml1=.js", (distillML.JS, u'ibHtml1="')),
        ("js/var_with_html.js", (distillML.JS, u"var pophtml =")),
        ("js/small1.js", (distillML.NON_HTML, "unknown")),
        ("js/small2.js", (distillML.NON_HTML, "unknown")),
    ]
    for fixture, expected in cases:
        self.fp = rspreader.openlog(testpath / fixture)
        outcome = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(expected, outcome)
def testLowvisible(self):
    """The lowvisible(doubleclick).mlog fixture must distill to LOWVISIBLE."""
    fixture = testdir + 'lowvisible(doubleclick).mlog'
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    # only the first element of the result is compared
    status = outcome[0]
    self.assertEqual(distillML.LOWVISIBLE, status)
def testCSS(self):
    """Distilling main.css must yield the (NON_HTML, 'unknown') result."""
    fixture = testdir + 'main.css'
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, 'unknown')
    self.assertEqual(expected, outcome)
def testFrameset(self):
    """The frameset.html fixture must distill to the FRAMESET status."""
    fixture = testdir + 'frameset.html'
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    # only the first element of the result is compared
    status = outcome[0]
    self.assertEqual(distillML.FRAMESET, status)
def testMagicFilteredTxt(self):
    """distillTxt on the favicon fixture must report NON_HTML with an icon type.

    Original note: wrong media type text/plain.
    """
    fixture = testdir + 'favicon.ico_text(nutch).mlog'
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distillTxt(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, 'image/vnd.microsoft.icon')
    self.assertEqual(expected, outcome)
def testMagicFiltered(self):
    """Distilling gif.qlog must yield the (NON_HTML, 'image/gif') result."""
    fixture = testdir + 'gif.qlog'
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, 'image/gif')
    self.assertEqual(expected, outcome)
def testMagicFiltered(self):
    """Distilling gif.qlog must yield the (NON_HTML, "image/gif") result."""
    fixture = testpath / "gif.qlog"
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, "image/gif")
    self.assertEqual(expected, outcome)
def testCSS(self):
    """Distilling main.css must yield the (NON_HTML, "unknown") result."""
    fixture = testpath / "main.css"
    self.fp = rspreader.openlog(fixture)
    outcome = distillML.distill(self.fp, self.buf, {})
    expected = (distillML.NON_HTML, "unknown")
    self.assertEqual(expected, outcome)