def testDistill(self):
    # Distill basictags.html, in which every tag is one the converter
    # supports, then check what is filtered, what is kept and what is
    # rewritten in the output.
    self.fp = rspreader.openlog(testpath / "basictags.html")

    status = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(0, status)
    output = self.buf.getvalue()

    # these tags should be filtered
    filtered = (
        "<html", "<head", "<title", "<body", "<font", "<b>", "<em",
        "<pre>", "<blockquote>", "<div", "<span", "<table", "<tr", "<td",
        "<form", "<img", "<a", "</html>",
    )
    for tag in filtered:
        self.assertEqual(-1, output.find(tag))

    # these tags should be present
    kept = (
        "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
        "<p>", "<ul>", "<ol>", "<li>", "<br>", "<hr>",
    )
    for tag in kept:
        self.assert_(output.find(tag) > 0)

    # these are some other transformed data
    transformed = (
        "h1-Sample HTML",
        "[fill your name]",     # <form>
        "[*]",
        "[ ]",
        "(*)",
        "( )",
        "[***]",
        "Lorem",                # <textarea>
        "[button]",
        "[submit]",
        "[reset]",
        "[go]",
        "[a picture]",          # <img>
        u'<&,<, ,",>>',         # entities
    )
    for text in transformed:
        self.assert_(output.find(text) > 0)
def testDistill(self):
    # Distill basictags.html, in which every tag is one the converter
    # supports, then check what is filtered, what is kept and what is
    # rewritten in the output.
    self.fp = rspreader.openlog(testdir + 'basictags.html')

    status = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(0, status)
    output = self.buf.getvalue()

    # these tags should be filtered
    filtered = (
        '<html', '<head', '<title', '<body', '<font', '<b>', '<em',
        '<pre>', '<blockquote>', '<div', '<span', '<table', '<tr', '<td',
        '<form', '<img', '<a', '</html>',
    )
    for tag in filtered:
        self.assertEqual(-1, output.find(tag))

    # these tags should be present
    kept = (
        '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>',
        '<p>', '<ul>', '<ol>', '<li>', '<br>', '<hr>',
    )
    for tag in kept:
        self.assert_(output.find(tag) > 0)

    # these are some other transformed data
    transformed = (
        'h1-Sample HTML',
        '[fill your name]',     # <form>
        '[*]',
        '[ ]',
        '(*)',
        '( )',
        '[***]',
        'Lorem',                # <textarea>
        '[button]',
        '[submit]',
        '[reset]',
        '[go]',
        '[a picture]',          # <img>
        u'<&,<, ,",>>',         # entities
    )
    for text in transformed:
        self.assert_(output.find(text) > 0)
def transformDoc(self, inpath, outpath):
    """
    Parse a message log file, filter out unwanted documents and write the
    distilled form of the remaining ones to outpath.

    A document is transformed only if it is not flagged for discard, its
    status is in the 2xx range and its content type is 'html' or 'txt'.
    The file specified by outpath is only left in place when this
    succeeds: if the distiller rejects the document, or raises, the
    partially written output is removed again.

    @param inpath: path of the message log to read.
    @param outpath: path the distilled document is written to.
    @return: whether the document is transformed.
    """
    mtime = os.path.getmtime(inpath)
    dt = datetime.datetime.utcfromtimestamp(mtime)
    timestamp = _formatTimestamp(dt)
    inname = os.path.basename(inpath)

    transformed = False
    rfile = open(inpath, 'rb')
    try:
        minfo = messagelog.MessageInfo.parseMessageLog(rfile)
        if minfo.discard:
            # these should be filtered in the logging phase, but double
            # check here, perhaps for logs collected from other sources.
            log.info('discard %s %s - %s', inname, minfo.flags, minfo.req_path)
            return False

        meta = _extract_meta(minfo, timestamp)

        # simple filtering
        if (minfo.status < 200) or (300 <= minfo.status):
            return False
        if minfo.ctype not in ('html', 'txt'):
            return False

        # the header has been consumed by the parser; rewind so that the
        # content reader starts from the beginning of the log
        rfile.seek(0)
        contentFp = rspreader.ContentReader(rfile, inpath)

        wfile = open(outpath, 'wb')
        try:
            if minfo.ctype == 'html':
                result = distillML.distill(contentFp, wfile, meta=meta)
            else:
                result = distillML.distillTxt(contentFp, wfile, meta=meta)
            if result == 0:
                transformed = True
            else:
                log.info('discard %s %s - %s', inname, str(result), minfo.req_path)
        finally:
            wfile.close()
            if not transformed:
                # remove unwanted output; this also runs when the distiller
                # raised, so that no partial file is left behind
                os.remove(outpath)
    finally:
        rfile.close()

    if not transformed:
        return False

    log.debug('transformed %s (%s) - %s',
              os.path.basename(outpath), meta.get('encoding','?'), minfo.req_path)
    return True
def testMeta(self):
    # Check basic meta data parsing: distill fills in the dict passed as meta
    self.fp = rspreader.openlog(testdir + 'basictags.html')
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    expected = (
        ('title', u'Basic HTML Sample Document'),
        ('description', u'Description: this sample contains all basic HTML tags the converter understands'),
        ('keywords', u'basic HTML, sample'),
    )
    for key, value in expected:
        self.assertEqual(value, meta[key])

    # meta holds one more entry than the three checked above
    self.assertEqual(4, len(meta))
def testParserError(self):
    # A document containing PROBLEM_LINE must be reported as PARSE_ERROR
    PROBLEM_LINE = '<! -- this is bad -->'
    self.fp = rspreader.openlog(testdir + 'malformed_html.mlog')

    # make sure the PROBLEM_LINE is in the test data
    head = self.fp.read(1024)
    self.assert_(head.find(PROBLEM_LINE) > 0)

    # rewind after the peek above so distill sees the whole document
    self.fp.seek(0)
    outcome = distillML.distill(self.fp, self.buf, {})
    code = outcome[0]
    self.assertEqual(distillML.PARSE_ERROR, code)
def testMetaVariations(self):
    # See meta_variations.html for variations of attributes formatting
    self.fp = rspreader.openlog(testdir + 'meta_variations.html')
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    # title span multiple lines
    self.assertEqual(u'word1 word2 word3', meta['title'])
    # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
    self.assertEqual(u'word1 & word2 <word3>', meta['description'])
    # `in` instead of the deprecated dict.has_key()
    self.assert_('keywords' not in meta)
    self.assertEqual(3, len(meta))
def testMetaVariations(self):
    # See meta_variations.html for variations of attributes formatting
    self.fp = rspreader.openlog(testpath / "meta_variations.html")
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    # title span multiple lines
    self.assertEqual(u"word1 word2 word3", meta["title"])
    # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
    self.assertEqual(u"word1 & word2 <word3>", meta["description"])
    # `in` instead of the deprecated dict.has_key()
    self.assert_("keywords" not in meta)
    self.assertEqual(3, len(meta))
def testAttrEncodingProblem(self):
    """ Bad HTML found in http://news.bbc.co.uk/ """
    # note: the <b> inside the quoted attribute value should have been
    # written entity-escaped (&lt;b&gt;). We choose not to work around
    # this right now; the test only requires that distill still succeeds.
    doc = """<html><body> <p>filler.filler.filler.filler.filler.filler.filler</p> <a onmouseover="ChangeText('<b>Back to previous</b>');">text</a> </body></html>"""
    source = StringIO.StringIO(doc)
    status = distillML.distill(source, self.buf, {})
    self.assertEqual(0, status)
    # the output is fetched but nothing is asserted about it
    distilled = self.buf.getvalue()
def testMeta(self):
    # Check basic meta data parsing: distill fills in the dict passed as meta
    self.fp = rspreader.openlog(testpath / "basictags.html")
    meta = {}
    distillML.distill(self.fp, self.buf, meta)

    expected = (
        ("title", u"Basic HTML Sample Document"),
        ("description", u"Description: this sample contains all basic HTML tags the converter understands"),
        ("keywords", u"basic HTML, sample"),
    )
    for key, value in expected:
        self.assertEqual(value, meta[key])

    # meta holds one more entry than the three checked above
    self.assertEqual(4, len(meta))
def testParseCrazyTitleProblem(self):
    # Test problem in parsing a missing <title>: the head below has a
    # closing </title> without an opening tag.
    doc = """<html><head>hello</title></head> <body> <p>filler.filler.filler.filler.filler.filler.filler</p> </body></html>"""
    meta = {}
    result = distillML.distill(StringIO.StringIO(doc), self.buf, meta)
    self.assertEqual(0, result)
    s = self.buf.getvalue()
    # no title; `in` instead of the deprecated dict.has_key()
    self.assert_('title' not in meta)
    # but sort of getting rest of data
    self.assert_(s.find('filler') >= 0)
def testParseEmptyTagProblem(self):
    """ Test problem in parsing <br/> """
    # The sgmllib.SGMLParser in various versions of Python has a problem
    # parsing <br/>. The suggested workaround is to write <br /> with a
    # space, but that is not an option for documents fetched from the web.
    doc = """<html><body> <p>filler.filler.filler.filler.filler.filler.filler</p> <p>abc<br/>def</p> </body></html>"""
    source = StringIO.StringIO(doc)
    status = distillML.distill(source, self.buf, {})
    self.assertEqual(0, status)

    output = self.buf.getvalue()
    self.assert_(output.find('abc<br>') > 0)
    # a stray '>' left over from the preceding <br/> is the symptom
    self.assert_(output.find('>def') < 0)
def _get_snapshot_content(self, item):
    """
    Return the distilled content of the snapshot stored for item.

    The snapshot is looked up under the 'weblibsnapshot' path as
    '<item.id>.mhtml', or '_.mhtml' when item.id is -1.

    @param item: object whose id selects the snapshot file.
    @return: the content parsed from the distilled snapshot, or '' when
        the snapshot file does not exist or its root URI cannot be fetched.
    """
    # TODO: refactor
    if item.id == -1:
        filename = '_.mhtml'
    else:
        filename = '%s.mhtml' % item.id
    spath = cfg.getpath('weblibsnapshot')/filename
    if not spath.exists():
        return ''

    fp = spath.open('rb')   # TODO: check file exist, move to weblib? getSnapshotFile()?
    try:
        lwa = mhtml.LoadedWebArchive(fp)
        resp = lwa.fetch_uri(lwa.root_uri)
        if not resp:
            return ''

        # TODO: lucene_logic: use to docid is confusing with lucene's internal docid?
        # TODO: mind content-type, encoding, framed objects??
        # NOTE: resp is handed to distill unread. Draining it beforehand
        # (the result was never used) would leave distill with nothing
        # to parse unless the stream were rewound.
        contentBuf = StringIO.StringIO()
        # the distill status is not checked; whatever got written is parsed
        distillML.distill(resp, contentBuf, meta={})
        contentBuf.seek(0)
        # TODO: what's the deal with writeHeader?
        meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
        return content
    finally:
        # release the snapshot file on every exit path
        fp.close()
def testJavascript(self):
    # Each sample under js/ is paired with the result distill is expected
    # to return for it.
    cases = (
        ('js/doc_write_html.js', (distillML.JS, u'document.write(')),
        ('js/function.js',       (distillML.JS, u'function YADopenWindow(x){')),
        ('js/ibHtml1=.js',       (distillML.JS, u'ibHtml1="')),
        ('js/var_with_html.js',  (distillML.JS, u'var pophtml =')),
        ('js/small1.js',         (distillML.NON_HTML, 'unknown')),
        ('js/small2.js',         (distillML.NON_HTML, 'unknown')),
    )
    for name, expected in cases:
        self.fp = rspreader.openlog(testdir + name)
        outcome = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(expected, outcome)
def testJavascript(self):
    # Each sample under js/ is paired with the result distill is expected
    # to return for it.
    cases = (
        ("js/doc_write_html.js", (distillML.JS, u"document.write(")),
        ("js/function.js",       (distillML.JS, u"function YADopenWindow(x){")),
        ("js/ibHtml1=.js",       (distillML.JS, u'ibHtml1="')),
        ("js/var_with_html.js",  (distillML.JS, u"var pophtml =")),
        ("js/small1.js",         (distillML.NON_HTML, "unknown")),
        ("js/small2.js",         (distillML.NON_HTML, "unknown")),
    )
    for name, expected in cases:
        self.fp = rspreader.openlog(testpath / name)
        outcome = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(expected, outcome)
def testLowvisible(self):
    # The logged doubleclick sample is expected to be reported as LOWVISIBLE
    self.fp = rspreader.openlog(testdir + 'lowvisible(doubleclick).mlog')
    outcome = distillML.distill(self.fp, self.buf, {})
    code = outcome[0]
    self.assertEqual(distillML.LOWVISIBLE, code)
def testCSS(self):
    # A stylesheet is expected to be reported as NON_HTML of unknown type
    self.fp = rspreader.openlog(testdir + 'main.css')
    expected = (distillML.NON_HTML, 'unknown')
    outcome = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(expected, outcome)
def testFrameset(self):
    # frameset.html is expected to be reported as FRAMESET
    self.fp = rspreader.openlog(testdir + 'frameset.html')
    outcome = distillML.distill(self.fp, self.buf, {})
    code = outcome[0]
    self.assertEqual(distillML.FRAMESET, code)
def testMagicFiltered(self):
    # The logged GIF sample is expected to be reported as NON_HTML with
    # type 'image/gif'
    self.fp = rspreader.openlog(testdir + 'gif.qlog')
    expected = (distillML.NON_HTML, 'image/gif')
    outcome = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(expected, outcome)
def testDomainFiltered(self):
    # With an empty document, the uri in meta alone is expected to get the
    # request reported as EXDOMAIN for '.googlesyndication.com'
    self.fp = StringIO.StringIO()
    meta = {"uri": "http://x.googlesyndication.com/"}
    expected = (distillML.EXDOMAIN, ".googlesyndication.com")
    outcome = distillML.distill(self.fp, self.buf, meta)
    self.assertEqual(expected, outcome)
def testMagicFiltered(self):
    # The logged GIF sample is expected to be reported as NON_HTML with
    # type "image/gif"
    self.fp = rspreader.openlog(testpath / "gif.qlog")
    expected = (distillML.NON_HTML, "image/gif")
    outcome = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(expected, outcome)
def testDomainFiltered(self):
    # With an empty document, the uri in meta alone is expected to get the
    # request reported as EXDOMAIN for '.googlesyndication.com'
    self.fp = StringIO.StringIO()
    meta = {'uri':'http://x.googlesyndication.com/'}
    expected = (distillML.EXDOMAIN, '.googlesyndication.com')
    outcome = distillML.distill(self.fp, self.buf, meta)
    self.assertEqual(expected, outcome)
def testCSS(self):
    # A stylesheet is expected to be reported as NON_HTML of unknown type
    self.fp = rspreader.openlog(testpath / "main.css")
    expected = (distillML.NON_HTML, "unknown")
    outcome = distillML.distill(self.fp, self.buf, {})
    self.assertEqual(expected, outcome)