def stats(self, query, result):
    """ Start the statistics table for the webmail message of this inode.

    The message is looked up in webmail_messages by self.lookup_id(),
    the mtime of the matching inode row is emitted as a "Timestamp"
    row, and the stored 'message' column is replaced by the string form
    of its parse tree, re-wrapped with textwrap.fill().

    NOTE(review): query is not used in the visible code, and only
    row['message'] is rewritten here - the code which emits the columns
    presumably follows this excerpt. Confirm against the full file.
    NOTE(review): both fetch() results are indexed without a check, so
    this presumably relies on the rows always existing - verify.
    """
    result.start_table(**{'class': 'GeneralTable'})
    dbh = DB.DBO(self.case)
    columns = [ "service", "type", "From", "To", "CC", "BCC", "sent", "subject", "message" ]
    dbh.execute("select * from webmail_messages where `inode_id`=%r", self.lookup_id())
    row = dbh.fetch()
    ## A second handle is used to fetch the inode row for its mtime:
    dbh2 = DB.DBO(self.case)
    dbh2.execute("select * from inode where inode_id = %r", row['inode_id'])
    row2 = dbh2.fetch()
    result.row("Timestamp", row2['mtime'])
    for c in columns:
        if c == 'message':
            ## Filter the message out here - the tag class is curried
            ## with this case and the parent inode of the message:
            parser = HTML.HTMLParser(tag_class = \
                        FlagFramework.Curry(HTML.ResolvingHTMLTag, case = self.case, inode_id = row['parent_inode_id']))
            #parser = HTML.HTMLParser(tag_class = HTML.TextTag)
            ## A NULL/empty message is parsed as the empty string:
            parser.feed(HTML.decode(row[c] or ""))
            parser.close()
            #tmp = result.__class__(result)
            #tmp.text(parser.root.innerHTML(), font='typewriter', wrap='full')
            #row[c] = tmp
            r = parser.root.__str__()
            r = textwrap.fill(r)
            row[c] = r
def sanitize_page(self, tag_class):
    """ Return the HTML of the parent page after parsing it with tag_class.

    The raw page is read from the parent inode, parsed, handed to
    self.fixup_page() for adjustment and serialised again. When the
    page has a <title>, the inode's mtime and inode are appended to it.
    """
    ## Fetch the original HTML file:
    filesystem = FileSystem.DBFS(self.case)
    page_fd = filesystem.open(inode_id=self.parent_inode_id)
    raw_page = page_fd.read()

    ## FIXME - This is a hack which only works because callers always
    ## pass a curried class; anything without kwargs is left alone.
    try:
        tag_class.kwargs['inode_id'] = self.parent_inode_id
    except AttributeError:
        pass

    ## Parse the page:
    page_parser = HTML.HTMLParser(tag_class=tag_class)
    page_parser.feed(raw_page)
    page_parser.close()

    ## Give ourselves the chance to fix the page up:
    tree = page_parser.root
    self.fixup_page(tree, tag_class)

    ## Stamp the title with the timestamp so that a printout of the
    ## page can be identified later:
    stat = fsfd_stat = filesystem.istat(inode_id=self.parent_inode_id)
    title = tree.find("title")
    if title:
        stamped_title = "%s %s %s" % (title.innerHTML(),
                                      stat['mtime'],
                                      stat['inode'])
        title.children = [stamped_title, ]

    return tree.innerHTML()
def scan(self, fd, scanners, type, mime, cookie, scores=None, **args):
    """ Hand the Javascript of a Gmail stream to self.process_js().

    Nothing is done unless scores carries a non zero 'GmailStreamMagic'
    entry. For a mime type containing "html" every <script> element is
    parsed on its own; for one containing "javascript" the whole file
    is parsed. Each resulting parse tree is passed to
    self.process_js(root, fd).

    scores may be None (its default), which is treated like a zero
    score.
    """
    ## scores defaults to None - bail out quietly rather than failing
    ## on None.get() when the caller supplied no scores at all.
    if not scores or scores.get('GmailStreamMagic', 0) == 0:
        return

    pyflaglog.log(pyflaglog.DEBUG,
                  "Opening %s for Gmail processing" % fd.inode_id)
    self.current_time = None
    self.current_box = 'Unknown'

    if "html" in mime:
        html_parser = HTML.HTMLParser()
        html_parser.parse_fd(fd)
        html_parser.close()

        ## Process all script segments
        for script_tag in html_parser.root.search("script"):
            script = script_tag.innerHTML()
            try:
                j = Javascript.JSParser()
                j.feed(script)
                j.close()
            except Exception:
                ## Best effort - a script we cannot parse is skipped,
                ## but interpreter exits and Ctrl-C still propagate.
                continue

            self.process_js(j.root, fd)

    elif "javascript" in mime:
        ## Make a new parser
        j = Javascript.JSParser()
        j.parse_fd(fd)
        j.close()
        self.process_js(j.root, fd)
def process_string(self, fd, string):
    """ Parse string as HTML and run the page handlers over the result.

    The closed parser is handed, together with fd, first to
    self.process_readmessage() and then to self.process_listing().
    """
    document = HTML.HTMLParser(verbose=0)
    document.feed(string)
    document.close()

    ## Both handlers get to look at the very same parsed document:
    self.process_readmessage(fd, document)
    self.process_listing(fd, document)
def display(self, value, row, result):
    """ Write value to result as fully wrapped typewriter text.

    value is first passed through an HTML parser which uses
    HTML.TextTag as its tag class; a false value is parsed as ''.
    """
    text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
    text_parser.feed(value or '')
    text_parser.close()

    rendered = text_parser.root.innerHTML()
    result.text(rendered, wrap='full', font='typewriter')
def generator():
    """ Yield the re-rendered HTML of fd as one single chunk.

    fd and self come from the enclosing scope. Only the first
    fd.read(1000000) worth of data is parsed.
    """
    resolving_tag = Curry(HTML.ResolvingHTMLTag,
                          inode_id=fd.lookup_id(),
                          case=self.case)
    page = HTML.HTMLParser(tag_class=resolving_tag)
    page.feed(fd.read(1000000))
    page.close()

    yield page.root.innerHTML()
def boring(self, metadata, data=''):
    """ Decide whether this file should be ignored by the scanner.

    Returns False (not boring) only when the base class does not find
    the file boring and data contains a '<title>' followed by
    whitespace and 'Windows Live'. In that case self.parser is created
    if it is not set yet. Returns True otherwise.
    """
    ## We dont think its boring if our base class does not:
    if Scanner.StoreAndScanType.boring(self, metadata, data):
        return True

    ## And the data contains the Windows Live title in the top. The
    ## pattern is a raw string so that \s reaches the regex engine
    ## without going through string escape processing.
    if not re.search(r"<title>\s+Windows Live", data):
        return True

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def render_html(self, inode_id, table_renderer):
    """ Return the content of inode_id parsed with HTML.SanitizingTag.

    The inode is opened through the case of table_renderer and read in
    one go using its size attribute.
    """
    ## Kept from the original - the name itself is not used below.
    import plugins.TableRenderers.HTMLBundle as HTMLBundle

    source = FileSystem.DBFS(table_renderer.case).open(inode_id=inode_id)

    sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
    sanitizer.feed(source.read(source.size))
    sanitizer.close()

    return sanitizer.root.innerHTML()
def boring(self, metadata, data=''):
    """ Decide whether this file should be ignored by the scanner.

    Returns False (not boring) only when the base class does not find
    the file boring and data matches '<title>[^<]+Yahoo! Mail'. In that
    case self.username is reset to None and self.parser is created if
    it is not set yet. Returns True otherwise.
    """
    ## We dont think its boring if our base class does not. The data
    ## we were given is handed on to the base class - previously a
    ## literal '' was passed, which hid the data from it.
    if Scanner.StoreAndScanType.boring(self, metadata, data=data):
        return True

    ## And the data has a title containing 'Yahoo! Mail' in the top.
    if not re.search("<title>[^<]+Yahoo! Mail", data):
        return True

    self.username = None

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def render_html(self, value, table_renderer):
    """ Return value as the string form of a wrapped typewriter text UI.

    value (or '' when it is false) is parsed with HTML.TextTag as the
    tag class and the result is placed in a fresh HTMLUI.
    """
    ## Kept from the original - the name itself is not used below.
    import plugins.TableRenderers.HTMLBundle as HTMLBundle

    text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
    text_parser.feed(value or '')
    text_parser.close()
    rendered = text_parser.root.innerHTML()

    ## Make sure its wrapped:
    wrapper = HTMLUI.HTMLUI(initial=True)
    wrapper.text(rendered, wrap='full', font='typewriter')

    return wrapper.__str__()
def fixup_page(self, root, tag_class):
    """ Inject self.message into the edit area of the page in root.

    The first of div.EditArea, div#MsgContainer and
    textarea#fMessageBody which is found gets its content pruned and
    replaced by the wrapped string form of the parsed message, and is
    renamed to 'div'. Pages without any of them are left untouched.
    """
    ## The places the message may live in, in order of preference:
    candidates = (
        ("div", {"class":"EditArea"}),
        ("div", {"id":"MsgContainer"}),
        ("textarea", {"id":"fMessageBody"}),
    )

    edit_area = None
    for tag_name, attributes in candidates:
        edit_area = root.find(tag_name, attributes)
        if edit_area:
            break

    if not edit_area:
        return

    ## Render the message using the same tag class as the page:
    message_parser = HTML.HTMLParser(tag_class=tag_class)
    message_parser.feed(HTML.decode(self.message))
    message_parser.close()
    wrapped = textwrap.fill(message_parser.root.__str__())

    ## Swap the old content for the rendered message:
    edit_area.prune()
    edit_area.add_child(wrapped)
    edit_area.name = 'div'
def boring(self, metadata, data=''):
    """ Decide whether this file should be ignored by the scanner.

    Returns False (not boring) only when the base class does not find
    the file boring and data contains one of the opening tags
    GetDisplayMessageResponse, ListMessagesResponse or
    SendMessageResponse. The name which matched is stored in
    self.context and self.parser is created if it is not set yet.
    Returns True otherwise.
    """
    ## Yahoo web 2.0 is very nice to work with- All
    ## responses are in nice XML.
    ## The data we were given is handed on to the base class -
    ## previously a literal '' was passed, which hid the data from it.
    if Scanner.StoreAndScanType.boring(self, metadata, data=data):
        return True

    m = re.search(
        "<(GetDisplayMessageResponse|ListMessagesResponse|SendMessageResponse)",
        data)
    if not m:
        return True

    self.context = m.group(1)

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def insert_message(self, result, inode_template="l%s"): ## We dont really want to touch the db in here - just print it out ## nicely: try: ## Try to render the html as text: message = unicode(result['Message']) p = HTML.HTMLParser(tag_class=HTML.TextTag) p.feed(message) p.close() result['Message'] = p.root.__str__() except KeyError: pass for k, v in result.items(): print " %s: %r" % (k, v) return True
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Parse a Yahoo Mail AJAX response and process read messages.

    Files whose type does not contain "Yahoo Mail AJAX" are ignored.
    Otherwise the whole of fd is fed to a fresh self.parser in
    1024 * 1024 sized reads, the first non empty read is kept in
    self.context, and self.process_readmessage(fd) is called when that
    first read mentions 'GetDisplayMessageResponse'.
    """
    if "Yahoo Mail AJAX" not in type:
        return

    self.parser = HTML.HTMLParser(verbose=0)
    pyflaglog.log(pyflaglog.DEBUG,
                  "Opening %s for YahooMail2.0 processing" % fd.inode_id)

    ## Read all the data into the parser
    self.context = None
    while True:
        data = fd.read(1024 * 1024)
        if not data:
            break

        ## Only the first chunk is remembered as the context:
        if not self.context:
            self.context = data

        self.parser.feed(data)

    self.parser.close()

    ## An empty file leaves self.context as None, and "x in None" is a
    ## TypeError - so make sure there is a context to look at first.
    if self.context and 'GetDisplayMessageResponse' in self.context:
        self.process_readmessage(fd)
def fixup_page(self, result, message, tag_class):
    """ Inject message into the edit area of the page held by self.parser.

    Returns None straight away when message is false. Otherwise the
    first of div.EditArea, div#MsgContainer and textarea#fMessageBody
    found under self.parser.root has its content pruned and replaced by
    the wrapped string form of the parsed message, and is renamed to
    'div'. The innerHTML() of self.parser.root is returned.

    The result argument is never read - the name is only reused as a
    local for the rendered message.

    NOTE(review): the final return is taken to be at function level,
    i.e. the page is returned even when no edit area was found -
    confirm against the original indentation.
    """
    if not message:
        return
    ## We have to inject the message into the edit area:
    edit_area = self.parser.root.find("div", {"class":"EditArea"}) or \
                self.parser.root.find("div",{"id":"MsgContainer"}) or \
                self.parser.root.find("textarea",{"id":"fMessageBody"})
    if edit_area:
        ## The message is parsed with the caller's tag class and its
        ## string form is re-wrapped before being inserted:
        parser = HTML.HTMLParser(tag_class=tag_class)
        parser.feed(HTML.decode(message))
        parser.close()
        result = parser.root.__str__()
        result = textwrap.fill(result)
        ## Replace whatever the edit area held with the message:
        edit_area.prune()
        edit_area.add_child(result)
        edit_area.name = 'div'
    return self.parser.root.innerHTML()
def process_readmessage(self, message):
    """ Extract the fields of a read-message page and insert them.

    message is parsed as HTML. The subject (td.ReadMsgSubject), the
    cells following the 'from:', 'to:' and 'sent:' labels, the body
    (div.ReadMsgContainer) and the 'mid' attribute of the first div
    carrying one are collected into a dict. A 'Sent' value is passed
    through Time.parse() when that succeeds.

    Returns whatever self.insert_message(result, inode_template='l%s')
    returns.
    """
    parser = HTML.HTMLParser(verbose=0)
    parser.feed(message)
    parser.close()
    root = parser.root

    result = {'type': 'Read', 'Message': ''}

    ## Find the subject
    sbj = root.find('td', {'class': 'ReadMsgSubject'})
    if sbj:
        result['Subject'] = HTML.decode_entity(sbj.innerHTML())

    ## A cell which starts with one of the labels announces that the
    ## next cell carries the value for that field.
    context = None
    for td in root.search('td'):
        data = td.innerHTML()
        if context:
            result[context] = HTML.decode_entity(data)
            context = None

        label = data.lower()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('sent:'):
            context = 'Sent'

    msg = root.find('div', {'class': 'ReadMsgContainer'})
    if msg:
        result['Message'] = msg.innerHTML()

    ## Try to detect the message ID
    tag = root.find('div', {'mid': '.'})
    if tag:
        result['message_id'] = tag['mid']

    ## Parse the timestamp. This used to index result[context], but
    ## once the loop is over context is None (or a dangling label), so
    ## the lookup always failed and 'Sent' was never parsed.
    if 'Sent' in result:
        try:
            result['Sent'] = Time.parse(result['Sent'])
        except Exception:
            ## Best effort - keep the raw string if it does not parse.
            pass

    return self.insert_message(result, inode_template='l%s')
def process_send_message(self, fd):
    """ Parse the body of a sent message and insert it as "webmail".

    The 'body' http parameter recorded for self.fd points (through its
    'indirect' column) at the inode holding the message. That inode is
    parsed into self.parser and its from/to addresses, text and
    subject are handed to self.insert_message(). Nothing happens when
    there is no such parameter or it has no indirect inode.

    NOTE(review): the query uses self.fd rather than the fd argument,
    and fd is rebound to the body inode below - confirm this is meant.
    """
    dbh = DB.DBO(self.case)
    dbh.execute(
        "select `key`,`value`,`indirect` from http_parameters where `key`='body' and inode_id = %r limit 1",
        self.fd.inode_id)
    parameter = dbh.fetch()
    if not parameter or not parameter['indirect']:
        return

    ## Need to parse the sent message
    fd = FileSystem.DBFS(self.case).open(inode_id=parameter['indirect'])
    self.parser = HTML.HTMLParser(verbose=0)
    self.parser.feed(fd.read())
    self.parser.close()
    root = self.parser.root

    result = {'type': 'Edit Sent'}
    result['From'] = self.parse_email_address(root, 'from')
    result['To'] = self.parse_email_address(root, 'to')

    ## Both of these are optional - a missing tag is simply skipped.
    ## (Sometimes they also give us the html version in an "html" tag,
    ## which is not used.)
    for field, tag_name in (('message', "text"), ('subject', "subject")):
        try:
            result[field] = root.find(tag_name).innerHTML()
        except:
            pass

    self.insert_message(result, "webmail")
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Parse a Windows Live page and run the Hotmail processors on it.

    Files whose type does not contain "HTML", or whose first 1024 bytes
    do not hold a '<title>' followed by whitespace and 'Windows Live',
    are ignored. Otherwise the file is decoded as utf8 (undecodable
    bytes are dropped), fed to a fresh self.parser, fully tokenised,
    and handed to process_send_message, process_editread,
    process_readmessage and process_mail_listing in that order.
    """
    if "HTML" not in type:
        return

    data = fd.read(1024)
    if not re.search(r"<title>\s+Windows Live", data):
        return

    ## Ok - we know its a Live page
    pyflaglog.log(pyflaglog.DEBUG,
                  "Opening (%s) %s for Hotmail processing" % (fd.inode_id,
                                                              fd.urn))

    ## The file is read in 1024 byte chunks, and a multi byte utf8
    ## character may straddle two of them. Decoding each chunk on its
    ## own with "ignore" silently lost such characters, so an
    ## incremental decoder carries the partial sequence over instead.
    import codecs
    decoder = codecs.getincrementaldecoder("utf8")("ignore")

    self.parser = HTML.HTMLParser(verbose=0)
    self.parser.feed(decoder.decode(data))
    while len(data) > 0:
        data = fd.read(1024)
        ## The empty read at end of file flushes the decoder:
        self.parser.feed(decoder.decode(data, not data))

    ## Get all the tokens
    while self.parser.next_token(True):
        pass

    ## Now we should be able to parse the data out:
    self.process_send_message(fd)
    self.process_editread(fd)
    self.process_readmessage(fd)
    self.process_mail_listing(fd)
def sanitize_data(self, data, value, result):
    """ Build an HTML parser whose tag class is a curried MessageTags.

    The tag class is curried with this object's case and with value as
    its inode_id.

    NOTE(review): in the visible code neither data nor result is used
    and nothing is fed to the parser - the excerpt presumably stops
    short of the rest of the method. Confirm against the full file.
    """
    parser = HTML.HTMLParser(tag_class = \
                    FlagFramework.Curry(MessageTags, case = self.case, inode_id = value))
def xxxdisplay(self, value, row, result):
    """ Return value after a round trip through a sanitizing parser.

    value is parsed with HTML.SanitizingTag as the tag class and the
    innerHTML() of the resulting tree is returned. row and result are
    not used.
    """
    sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
    sanitizer.feed(value)
    sanitizer.close()

    sanitized_tree = sanitizer.root
    return sanitized_tree.innerHTML()
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Rebuild the result table of a Google image search page.

    Files whose type does not contain "Google Image Search" are
    ignored. Otherwise every dyn.Img(...) call found in the page's
    scripts becomes an image cell and a text cell, laid out in rows of
    five. When at least one was found the table is added to the
    div#ImgContent of the page (if there is one) and the page is stored
    as "<fd.urn>/Gimage" through the cache manager.

    NOTE(review): storing the page is taken to happen whenever images
    were found, even if div#ImgContent is missing - confirm against the
    original indentation.
    """
    if "Google Image Search" not in type:
        return

    pyflaglog.log(pyflaglog.DEBUG,"Opening %s for Google image search processing" % fd.inode_id)

    ## Parse the file
    self.parser = HTML.HTMLParser()
    self.parser.feed(fd.read())
    self.parser.close()

    ## Pull out all the scripts and match the regex:
    result = ''
    image_text = ''
    text_text = ''
    count = 0
    total_count = 0
    ## Raw string so that \( and \) reach the regex engine untouched.
    regex = re.compile(r'dyn.Img(\(.+?\));')
    for script in self.parser.root.search("script"):
        data = script.innerHTML()
        for m in regex.finditer(data):
            ## SECURITY: this evaluates text lifted from an untrusted
            ## web page. Empty globals/locals do NOT remove access to
            ## the builtins, so a hostile page can run code here. The
            ## call is left as it was - consider ast.literal_eval()
            ## once it is confirmed that it accepts the same argument
            ## lists.
            row = eval(m.group(1),{},{})
            image_text += '''\n<td id="tDataImage%s" nowrap="" width="16%%" valign="bottom" align="center" style="padding-top: 0px;"> <a href="%s"> <img height="%s" width="%s" src="%s?q=tbn:%s%s" style="border: 1px solid ;"/> </a> </td>\n''' % (total_count, row[0], row[5], row[4], row[14], row[2], row[3])
            text_text += '''<td id="tDataText%s" width="16%%" valign="top" align="center"> <font face="arial,sans-serif" size="-1"> %s <br/> %s - %s <br/> <font color="#008000">%s</font> </font> </td>''' % (total_count, row[6], row[9], row[10], row[11])
            count += 1
            total_count += 1

            ## Five images make up one row of the table:
            if count >= 5:
                result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)
                image_text = ''
                text_text = ''
                count = 0

    ## Flush the last, incomplete row:
    if image_text:
        result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)

    if not result:
        return

    ## Prepare the new page
    tag = self.parser.root.find("div", {"id":"ImgContent"})
    if tag:
        result = "<table>%s</table>" % result
        tag.add_child(result)

    page = self.parser.root.innerHTML()
    page = page.encode("utf8","ignore")

    new_fd = CacheManager.AFF4_MANAGER.create_cache_data(
        fd.case, "%s/Gimage" % fd.urn,
        page,
        inherited = fd.urn)
    new_fd.close()