def process_edit_read(self):
    """ Extract the draft shown in an edit (compose) page sent by the server.

    The address and subject widgets and the PlainMsg input are looked up
    under self.parser.root. When at least one value was recovered the
    record is tagged 'Edit Read', attributed to self.username if that is
    set, and handed to self.insert_message, whose return value is returned.
    Returns None when nothing was found.
    """
    document = self.parser.root
    found = {}

    ## (result key, tag name, element id) for every header widget:
    header_widgets = (
        ('To', 'textarea', 'tofield'),
        ('CC', 'textarea', 'ccfield'),
        ('Bcc', 'textarea', 'bccfield'),
        ('Subject', 'input', 'subjectfield'),
    )
    for key, tag_name, element_id in header_widgets:
        widget = document.find(tag_name, {'id': element_id})
        if not widget:
            continue

        try:
            found[key] = HTML.decode_entity(widget.children[0])
        except IndexError:
            ## Nothing usable inside the widget - leave the key out.
            pass

    ## Find the message:
    body_input = document.find('input', {'name': 'PlainMsg'})
    if body_input:
        body = HTML.decode_entity(body_input['value'])
        if body:
            found['message'] = body

    if not found:
        return

    found['type'] = 'Edit Read'
    if self.username:
        found['From'] = self.username

    return self.insert_message(found, inode_template="y%s")
def process_edit_read(self):
    """ Extract the draft shown in an edit (compose) page sent by the server.

    The address and subject widgets and the PlainMsg input are looked up
    under self.parser.root. When at least one value was recovered the
    record is tagged 'Edit Read', attributed to self.username if that is
    set, and handed to self.insert_message, whose return value is returned.
    Returns None when nothing was found.
    """
    document = self.parser.root
    found = {}

    ## (result key, tag name, element id) for every header widget:
    header_widgets = (
        ('To', 'textarea', 'tofield'),
        ('CC', 'textarea', 'ccfield'),
        ('Bcc', 'textarea', 'bccfield'),
        ('Subject', 'input', 'subjectfield'),
    )
    for key, tag_name, element_id in header_widgets:
        widget = document.find(tag_name, {'id': element_id})
        if not widget:
            continue

        try:
            found[key] = HTML.decode_entity(widget.children[0])
        except IndexError:
            ## Nothing usable inside the widget - leave the key out.
            pass

    ## Find the message:
    body_input = document.find('input', {'name': 'PlainMsg'})
    if body_input:
        body = HTML.decode_entity(body_input['value'])
        if body:
            found['message'] = body

    if not found:
        return

    found['type'] = 'Edit Read'
    if self.username:
        found['From'] = self.username

    return self.insert_message(found, inode_template="y%s")
def process_readmessage(self):
    """ Record every <message> element found in the parsed response.

    A single response may carry several messages. For each one a fresh
    result dict is filled with the message id, the received date (when it
    can be parsed), the subject, the sender and recipient, and the first
    part of type 'text' as the body. Each dict is then passed to
    self.insert_message(result, "webmail").
    """
    root = self.parser.root

    ## We could get several messages in the same response:
    for message in root.search('message'):
        ## Build a new dict per message. With one shared dict the test
        ## `not result['message']` stayed false after the first body was
        ## stored, so later messages inherited the first message's body
        ## (and a stale 'sent' value when their own date failed to parse).
        result = {'type': 'Read', 'message': ''}
        result['message_id'] = message.find("mid").innerHTML()

        try:
            result['sent'] = Time.parse(
                message.find("receiveddate").innerHTML())
        except Exception:
            ## Best effort: a missing or unparseable date is left out.
            pass

        result['subject'] = message.find("subject").innerHTML()
        for tag, field in [('from', 'From'), ('to', 'To')]:
            result[field] = self.parse_email_address(message, tag)

        ## now iterate over all the parts:
        for part in message.search("part"):
            ## Usually text/html are the main body
            try:
                if not result['message'] and \
                       part.attributes['type'] == 'text':
                    text = part.find("text")
                    result['message'] = HTML.unquote(
                        HTML.decode_entity(text.innerHTML()))
            except KeyError:
                ## The part carries no 'type' attribute - skip it.
                pass

        self.insert_message(result, "webmail")
def process_readmessage(self):
    """ Record every <message> element found in the parsed response.

    A single response may carry several messages. For each one a fresh
    result dict is filled with the message id, the received date (when it
    can be parsed), the subject, the sender and recipient, and the first
    part of type 'text' as the body. Each dict is then passed to
    self.insert_message(result, "webmail").
    """
    root = self.parser.root

    ## We could get several messages in the same response:
    for message in root.search('message'):
        ## Build a new dict per message. With one shared dict the test
        ## `not result['message']` stayed false after the first body was
        ## stored, so later messages inherited the first message's body
        ## (and a stale 'sent' value when their own date failed to parse).
        result = {'type': 'Read', 'message': ''}
        result['message_id'] = message.find("mid").innerHTML()

        try:
            result['sent'] = Time.parse(
                message.find("receiveddate").innerHTML())
        except Exception:
            ## Best effort: a missing or unparseable date is left out.
            pass

        result['subject'] = message.find("subject").innerHTML()
        for tag, field in [('from', 'From'), ('to', 'To')]:
            result[field] = self.parse_email_address(message, tag)

        ## now iterate over all the parts:
        for part in message.search("part"):
            ## Usually text/html are the main body
            try:
                if not result['message'] and \
                       part.attributes['type'] == 'text':
                    text = part.find("text")
                    result['message'] = HTML.unquote(
                        HTML.decode_entity(text.innerHTML()))
            except KeyError:
                ## The part carries no 'type' attribute - skip it.
                pass

        self.insert_message(result, "webmail")
def stats(self, query, result):
    """ Render the stored webmail message for this inode into `result`.

    Looks up the webmail_messages row for self.lookup_id(), shows the
    mtime of the matching inode row as "Timestamp", then one table row per
    message column. The 'message' column is parsed with ResolvingHTMLTag
    and shown as wrapped text. Finally the URL of the parent inode from
    the http table is shown when one exists.

    `query` is not read. Nothing beyond the table header is rendered when
    no webmail row exists for the inode.
    """
    result.start_table(**{'class': 'GeneralTable'})
    dbh = DB.DBO(self.case)
    columns = ["service", "type", "From", "To", "CC", "BCC", "sent",
               "subject", "message"]

    dbh.execute("select * from webmail_messages where `inode_id`=%r",
                self.lookup_id())
    row = dbh.fetch()
    if not row:
        ## No webmail record for this inode - subscripting None would fail.
        return

    dbh2 = DB.DBO(self.case)
    dbh2.execute("select * from inode where inode_id = %r", row['inode_id'])
    row2 = dbh2.fetch()
    if row2:
        result.row("Timestamp", row2['mtime'])

    for c in columns:
        value = row[c]
        if c == 'message':
            ## Filter the message out here:
            parser = HTML.HTMLParser(
                tag_class=FlagFramework.Curry(
                    HTML.ResolvingHTMLTag,
                    case=self.case,
                    inode_id=row['parent_inode_id']))
            parser.feed(HTML.decode(value or ""))
            parser.close()
            value = textwrap.fill(parser.root.__str__())

        ## Without this call the computed values were never displayed.
        result.row(c, value)

    dbh.execute("select url from http where inode_id = %r",
                row['parent_inode_id'])
    row = dbh.fetch()
    if row:
        tmp = result.__class__(result)
        tmp.text(row['url'], font='typewriter', wrap='full')
        result.row("URL", tmp)
def parse_email_address(self, message, tag):
    """ Format the address stored in the `tag` child of `message`.

    Returns "name <email>", or None when `message` has no such child. If
    the name cannot be read for any reason it is replaced by an empty
    string.
    """
    address = message.find(tag)
    if not address:
        return None

    try:
        display_name = address.find("name").innerHTML()
    except:
        ## Any failure while reading the name falls back to no name.
        display_name = ''

    raw_email = address.find("email").innerHTML()
    email = HTML.unquote(HTML.decode_entity(raw_email))

    return "%s <%s>" % (display_name, email)
def parse_email_address(self, message, tag):
    """ Format the address stored in the `tag` child of `message`.

    Returns "name <email>", or None when `message` has no such child. If
    the name cannot be read for any reason it is replaced by an empty
    string.
    """
    address = message.find(tag)
    if not address:
        return None

    try:
        display_name = address.find("name").innerHTML()
    except:
        ## Any failure while reading the name falls back to no name.
        display_name = ''

    raw_email = address.find("email").innerHTML()
    email = HTML.unquote(HTML.decode_entity(raw_email))

    return "%s <%s>" % (display_name, email)
def process_readmessage(self, fd, parser):
    """ Store the message shown on a read-message page as a cache object.

    The subject, From, To and Sent values are pulled out of the parsed
    page. The message id comes from the first div carrying a `mid`
    attribute and falls back to fd.inode_id. When a ReadMsgContainer div
    exists and no object is stored yet under
    /WebMail/<service>/<message id>, the container's HTML is written to a
    new cache object and the collected fields are inserted into the
    webmail_messages table.
    """
    result = {'type': 'Read', 'service': self.service}

    ## Find the subject
    sbj = parser.root.find('div', {'class': 'ReadMsgSubject'})
    if sbj:
        result['subject'] = HTML.decode_entity(sbj.innerHTML())

    ## A cell starting with "from:", "to:" or "sent:" announces that the
    ## next cell holds the value for that field.
    context = None
    for td in parser.root.search('td'):
        data = td.innerHTML()
        if context:
            result[context] = HTML.decode_entity(data)
            context = None

        label = data.lower()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('sent:'):
            context = 'Sent'

    msg = parser.root.find('div', {'class': 'ReadMsgContainer'})

    ## Try to detect the message ID
    tag = parser.root.find('div', {'mid': '.'})
    if tag:
        result['message_id'] = tag['mid']
    else:
        result['message_id'] = fd.inode_id

    if 'Sent' in result:
        try:
            result['Sent'] = Time.parse(result['Sent'])
        except Exception:
            ## Best effort: keep the raw text when it cannot be parsed.
            pass

    if not msg:
        return

    ## The fallback id (fd.inode_id) may not be a string, so format it
    ## before replacing the path separator.
    message_urn = "/WebMail/%s/%s" % (
        self.service, ("%s" % result['message_id']).replace("/", "_"))

    ## Do not store the same message twice:
    fsfd = FileSystem.DBFS(fd.case)
    try:
        if fsfd.lookup(path=message_urn):
            return
    except RuntimeError:
        pass

    ## (A leftover pdb.set_trace() used to stop processing right here.)
    message_fd = CacheManager.AFF4_MANAGER.create_cache_data(
        fd.case, message_urn, inherited=fd.urn)
    message_fd.write(msg.innerHTML().encode("utf8"))
    message_fd.insert_to_table("webmail_messages", result)
    message_fd.close()
def process_readmessage(self, fd, parser):
    """ Store the message shown on a read-message page as a cache object.

    The subject, From, To and Sent values are pulled out of the parsed
    page. The message id comes from the first div carrying a `mid`
    attribute and falls back to fd.inode_id. When a ReadMsgContainer div
    exists and no object is stored yet under
    /WebMail/<service>/<message id>, the container's HTML is written to a
    new cache object and the collected fields are inserted into the
    webmail_messages table.
    """
    result = {'type': 'Read', 'service': self.service}

    ## Find the subject
    sbj = parser.root.find('div', {'class': 'ReadMsgSubject'})
    if sbj:
        result['subject'] = HTML.decode_entity(sbj.innerHTML())

    ## A cell starting with "from:", "to:" or "sent:" announces that the
    ## next cell holds the value for that field.
    context = None
    for td in parser.root.search('td'):
        data = td.innerHTML()
        if context:
            result[context] = HTML.decode_entity(data)
            context = None

        label = data.lower()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('sent:'):
            context = 'Sent'

    msg = parser.root.find('div', {'class': 'ReadMsgContainer'})

    ## Try to detect the message ID
    tag = parser.root.find('div', {'mid': '.'})
    if tag:
        result['message_id'] = tag['mid']
    else:
        result['message_id'] = fd.inode_id

    if 'Sent' in result:
        try:
            result['Sent'] = Time.parse(result['Sent'])
        except Exception:
            ## Best effort: keep the raw text when it cannot be parsed.
            pass

    if not msg:
        return

    ## The fallback id (fd.inode_id) may not be a string, so format it
    ## before replacing the path separator.
    message_urn = "/WebMail/%s/%s" % (
        self.service, ("%s" % result['message_id']).replace("/", "_"))

    ## Do not store the same message twice:
    fsfd = FileSystem.DBFS(fd.case)
    try:
        if fsfd.lookup(path=message_urn):
            return
    except RuntimeError:
        pass

    ## (A leftover pdb.set_trace() used to stop processing right here.)
    message_fd = CacheManager.AFF4_MANAGER.create_cache_data(
        fd.case, message_urn, inherited=fd.urn)
    message_fd.write(msg.innerHTML().encode("utf8"))
    message_fd.insert_to_table("webmail_messages", result)
    message_fd.close()
def fixup_page(self, root, tag_class):
    """ Put the stored message text into the page's message container.

    The first of div.EditArea, div#MsgContainer or textarea#fMessageBody
    found under `root` is emptied, given the wrapped string form of
    self.message (parsed with `tag_class`) as its only child, and renamed
    to 'div'. Nothing happens when none of them exists.
    """
    ## We have to inject the message into the edit area:
    candidates = (
        ("div", {"class": "EditArea"}),
        ("div", {"id": "MsgContainer"}),
        ("textarea", {"id": "fMessageBody"}),
    )
    edit_area = None
    for tag_name, attributes in candidates:
        edit_area = root.find(tag_name, attributes)
        if edit_area:
            break

    if not edit_area:
        return

    message_parser = HTML.HTMLParser(tag_class=tag_class)
    message_parser.feed(HTML.decode(self.message))
    message_parser.close()

    wrapped = textwrap.fill(message_parser.root.__str__())

    edit_area.prune()
    edit_area.add_child(wrapped)
    edit_area.name = 'div'
def scan(self, fd, scanners, type, mime, cookie, scores=None, **args):
    """ Feed the Javascript found in `fd` to self.process_js.

    Nothing is done unless `scores` reports a non-zero 'GmailStreamMagic'
    value. For mime types containing "html" every <script> element is
    parsed on its own and scripts that fail to parse are skipped. For mime
    types containing "javascript" the whole file is parsed as one script.

    self.current_time and self.current_box are reset before processing.
    """
    ## `scores` defaults to None, on which .get() would raise; a missing
    ## score table is treated like a zero score.
    if not scores or scores.get('GmailStreamMagic', 0) == 0:
        return

    pyflaglog.log(pyflaglog.DEBUG,
                  "Opening %s for Gmail processing" % fd.inode_id)
    self.current_time = None
    self.current_box = 'Unknown'

    if "html" in mime:
        html_parser = HTML.HTMLParser()
        html_parser.parse_fd(fd)
        html_parser.close()

        ## Process all script segments
        for script_tag in html_parser.root.search("script"):
            script = script_tag.innerHTML()
            try:
                j = Javascript.JSParser()
                j.feed(script)
                j.close()
            except Exception:
                ## Skip scripts the parser cannot handle.
                continue

            self.process_js(j.root, fd)

    elif "javascript" in mime:
        ## Make a new parser
        j = Javascript.JSParser()
        j.parse_fd(fd)
        j.close()
        self.process_js(j.root, fd)
def process_mail_listing(self):
    """ Search for a listing in this page.

    The folder name is read from the left-aligned cell whose first child
    contains "Current Folder". The first table that has the listing's
    layout attributes and whose first <b> element reads "From" is then
    recorded as a 'Listed' message through self.insert_message.

    Returns the value of self.insert_message, or None when no folder name
    or no listing table is found.
    """
    current_folder = None
    for td in self.parser.root.search("td"):
        if td.attributes.get('align') != 'left':
            continue

        ## Both the label child and the value child are indexed below, so
        ## cells with fewer than two children cannot be the folder cell.
        if len(td.children) < 2:
            continue

        if "Current Folder" in td.children[0]:
            current_folder = HTML.decode(td.children[1].innerHTML())
            break

    if not current_folder:
        return None

    ## I wish they would use css - it would make it easier to identify things:
    listing_layout = (
        ('cellpadding', '1'),
        ('cellspacing', '0'),
        ('border', '0'),
        ('align', 'center'),
        ('bgcolor', '#ffffcc'),
    )
    for table in self.parser.root.search("table"):
        attributes = table.attributes
        if not all(attributes.get(name) == value
                   for name, value in listing_layout):
            continue

        ## find() may come back empty for a table without any <b> element.
        b = table.find("b")
        if b and b.innerHTML() == "From":
            ## Ok we are pretty sure this is a listing now:
            result = {'type': 'Listed', 'Message': table,
                      'From': current_folder}
            return self.insert_message(result, inode_template="y%s")
def process_mail_listing(self):
    """ Search for a listing in this page.

    The folder name is read from the left-aligned cell whose first child
    contains "Current Folder". The first table that has the listing's
    layout attributes and whose first <b> element reads "From" is then
    recorded as a 'Listed' message through self.insert_message.

    Returns the value of self.insert_message, or None when no folder name
    or no listing table is found.
    """
    current_folder = None
    for td in self.parser.root.search("td"):
        if td.attributes.get("align") != "left":
            continue

        ## Both the label child and the value child are indexed below, so
        ## cells with fewer than two children cannot be the folder cell.
        if len(td.children) < 2:
            continue

        if "Current Folder" in td.children[0]:
            current_folder = HTML.decode(td.children[1].innerHTML())
            break

    if not current_folder:
        return None

    ## I wish they would use css - it would make it easier to identify things:
    listing_layout = (
        ("cellpadding", "1"),
        ("cellspacing", "0"),
        ("border", "0"),
        ("align", "center"),
        ("bgcolor", "#ffffcc"),
    )
    for table in self.parser.root.search("table"):
        attributes = table.attributes
        if not all(attributes.get(name) == value
                   for name, value in listing_layout):
            continue

        ## find() may come back empty for a table without any <b> element.
        b = table.find("b")
        if b and b.innerHTML() == "From":
            ## Ok we are pretty sure this is a listing now:
            result = {"type": "Listed", "Message": table,
                      "From": current_folder}
            return self.insert_message(result, inode_template="y%s")
def process_mail_listing(self):
    """ Search for a listing in this page.

    The folder name is read from the left-aligned cell whose first child
    contains "Current Folder". The first table that has the listing's
    layout attributes and whose first <b> element reads "From" is then
    recorded as a 'Listed' message through self.insert_message.

    Returns the value of self.insert_message, or None when no folder name
    or no listing table is found.
    """
    current_folder = None
    for td in self.parser.root.search("td"):
        if td.attributes.get('align') != 'left':
            continue

        ## Both the label child and the value child are indexed below, so
        ## cells with fewer than two children cannot be the folder cell.
        if len(td.children) < 2:
            continue

        if "Current Folder" in td.children[0]:
            current_folder = HTML.decode(td.children[1].innerHTML())
            break

    if not current_folder:
        return None

    ## I wish they would use css - it would make it easier to identify things:
    listing_layout = (
        ('cellpadding', '1'),
        ('cellspacing', '0'),
        ('border', '0'),
        ('align', 'center'),
        ('bgcolor', '#ffffcc'),
    )
    for table in self.parser.root.search("table"):
        attributes = table.attributes
        if not all(attributes.get(name) == value
                   for name, value in listing_layout):
            continue

        ## find() may come back empty for a table without any <b> element.
        b = table.find("b")
        if b and b.innerHTML() == "From":
            ## Ok we are pretty sure this is a listing now:
            result = {'type': 'Listed', 'Message': table,
                      'From': current_folder}
            return self.insert_message(result, inode_template="y%s")
def process_string(self, fd, string):
    """ Parse `string` as HTML and run the page handlers on the result.

    The parsed page is passed, together with `fd`, first to
    self.process_readmessage and then to self.process_listing.
    """
    html_parser = HTML.HTMLParser(verbose=0)
    html_parser.feed(string)
    html_parser.close()

    self.process_readmessage(fd, html_parser)
    self.process_listing(fd, html_parser)
def sanitize_page(self, tag_class):
    """ This produces a rendered version of the underlying page.

    The page stored under self.parent_inode_id is parsed with `tag_class`,
    passed through self.fixup_page, and has the inode's mtime and inode
    name appended to its <title> (when it has one). Returns the resulting
    HTML.
    """
    ## Get the original HTML File:
    dbfs = FileSystem.DBFS(self.case)
    page_fd = dbfs.open(inode_id=self.parent_inode_id)
    raw_page = page_fd.read()

    ## FIXME - This is a hack which works because we always send a
    ## curried class down:
    try:
        tag_class.kwargs['inode_id'] = self.parent_inode_id
    except AttributeError:
        pass

    ## Make a parser:
    page_parser = HTML.HTMLParser(tag_class=tag_class)
    page_parser.feed(raw_page)
    page_parser.close()

    ## Allow us to fix the html page
    root = page_parser.root
    self.fixup_page(root, tag_class)

    ## Add the timestamp to the title of the page - so when you
    ## print it out we can identify it:
    inode_stat = dbfs.istat(inode_id=self.parent_inode_id)
    title_tag = root.find("title")
    if title_tag:
        new_title = "%s %s %s" % (title_tag.innerHTML(),
                                  inode_stat['mtime'],
                                  inode_stat['inode'])
        title_tag.children = [new_title]

    return root.innerHTML()
def display(self, value, row, result):
    """ Show `value` as wrapped typewriter text in `result`.

    `value` (or '' when it is empty) is parsed with HTML.TextTag and the
    parsed tree's innerHTML is what gets displayed. `row` is not read.
    """
    text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
    text_parser.feed(value or '')
    text_parser.close()

    rendered = text_parser.root.innerHTML()
    result.text(rendered, wrap='full', font='typewriter')
def generator():
    """ Yield the page read from `fd`, re-rendered through ResolvingHTMLTag.

    At most 1000000 bytes are read and the whole page is produced as a
    single chunk. `fd` and `self` come from the enclosing scope.
    """
    resolving_tag = Curry(HTML.ResolvingHTMLTag,
                          inode_id=fd.lookup_id(),
                          case=self.case)
    page_parser = HTML.HTMLParser(tag_class=resolving_tag)

    page_parser.feed(fd.read(1000000))
    page_parser.close()

    yield page_parser.root.innerHTML()
def process_readmessage(self, fd):
    """ Extract the message shown on a read-message page.

    Works on self.parser.root and returns None when the page has no
    ReadMsgContainer div. Otherwise the subject, From, To and sent values
    are collected from the container's cells, the body is taken from the
    first script that assigns MsgContainer's innerHTML, and the result of
    self.insert_message(result) is returned. `fd` is not read.
    """
    root = self.parser.root
    container = root.find('div', {'class': 'ReadMsgContainer'})
    if not container:
        return

    result = {'type': 'Read', 'message': ''}

    ## Find the subject:
    subject_cell = container.find('td', {'class': 'ReadMsgSubject'})
    if subject_cell:
        result['subject'] = HTML.decode_entity(subject_cell.innerHTML())

    ## Fill in all the other fields. A cell starting with one of these
    ## labels means the next cell holds the value for that key:
    labels = (('from:', 'From'), ('to:', 'To'), ('sent:', 'sent'))
    pending = None
    for cell in container.search('td'):
        data = cell.innerHTML()
        if pending:
            result[pending] = HTML.decode_entity(data)
            pending = None

        lowered = data.lower()
        for prefix, key in labels:
            if lowered.startswith(prefix):
                pending = key
                break

    ## Now the message:
    ## On newer sites its injected using script:
    body_pattern = "document\.getElementById\(\"MsgContainer\"\)\.innerHTML='([^']*)'"
    for script in root.search('script'):
        match = re.match(body_pattern, script.innerHTML())
        if match:
            escaped_body = match.group(1)
            result['message'] += HTML.decode_unicode(
                escaped_body.decode("string_escape"))
            break

    try:
        result['sent'] = Time.parse(result['sent'])
    except:
        ## No sent value, or one that cannot be parsed: leave it as is.
        pass

    return self.insert_message(result)
def boring(self, metadata, data=''):
    """ Decide whether this object can be ignored.

    Returns False (not boring) only when the base class does not find the
    object boring and `data` matches '<title>\s+Windows Live'; in that
    case self.parser is created if it does not exist yet. Returns True
    otherwise.
    """
    ## We dont think its boring if our base class does not:
    if Scanner.StoreAndScanType.boring(self, metadata, data):
        return True

    ## And the data contains '<title>\s+Windows Live' in the top.
    if not re.search("<title>\s+Windows Live", data):
        return True

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def process_message_yahoo1(self, result, header):
    """ Handle Yahoo mail from old version (prior to 20080224).

    `result` is filled in place from the cells of `header`: a cell that
    starts with "from:", "to:", "date:" or "subject:" labels the following
    cell, whose first string child becomes the value. The body is taken
    from the div with class msgbody under self.parser.root and the message
    id from the MsgId input inside `header`.

    Returns the value of self.insert_message when `result` holds more than
    three keys, otherwise None.
    """
    ## Look through all its rows:
    context = None
    for td in header.search("td"):
        if context:
            for i in td:
                if isinstance(i, str):
                    result[context] = HTML.unquote(HTML.decode_entity(i))
                    break
            context = None

        label = td.innerHTML().lower().strip()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('date:'):
            context = 'Sent'
        elif label.startswith('subject:'):
            context = 'Subject'

    ## Now the message:
    msgbody = self.parser.root.find('div', {"class": "msgbody"})
    if msgbody:
        result['message'] = msgbody.innerHTML()

    if 'Sent' in result:
        ## The raw date is stored under 'Sent'. Reading result['sent']
        ## here raised KeyError whenever a date had been found. The parsed
        ## value replaces the raw one under the 'sent' key.
        ## NOTE(review): assumes the raw 'Sent' text should not be kept
        ## next to the parsed 'sent' value - confirm against insert_message.
        result['sent'] = Time.parse(result.pop('Sent'), case=self.case,
                                    evidence_tz=None)

    ## Find the message id:
    tag = header.find('input', dict(name='MsgId'))
    if tag:
        result['message_id'] = tag['value']

    if len(result) > 3:
        return self.insert_message(result, inode_template="y%s")
def fixup_page(self, result, message, tag_class):
    """ Inject `message` into the parsed page and return the page's HTML.

    The first of div.EditArea, div#MsgContainer or textarea#fMessageBody
    found under self.parser.root is emptied, given the wrapped string form
    of `message` (parsed with `tag_class`) as its only child, and renamed
    to 'div'.

    Returns None when `message` is empty, otherwise the innerHTML of
    self.parser.root. The incoming value of `result` is not read.
    """
    if not message:
        return

    ## We have to inject the message into the edit area:
    candidates = (
        ("div", {"class": "EditArea"}),
        ("div", {"id": "MsgContainer"}),
        ("textarea", {"id": "fMessageBody"}),
    )
    edit_area = None
    for tag_name, attributes in candidates:
        edit_area = self.parser.root.find(tag_name, attributes)
        if edit_area:
            break

    if edit_area:
        message_parser = HTML.HTMLParser(tag_class=tag_class)
        message_parser.feed(HTML.decode(message))
        message_parser.close()

        wrapped = textwrap.fill(message_parser.root.__str__())

        edit_area.prune()
        edit_area.add_child(wrapped)
        edit_area.name = 'div'

    return self.parser.root.innerHTML()
def render_html(self, inode_id, table_renderer):
    """ Return the file stored under `inode_id` as sanitized HTML.

    The file is opened in the case of `table_renderer`, read in full
    (fd.size bytes) and parsed with HTML.SanitizingTag.
    """
    import plugins.TableRenderers.HTMLBundle as HTMLBundle

    dbfs = FileSystem.DBFS(table_renderer.case)
    source = dbfs.open(inode_id=inode_id)

    sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
    sanitizer.feed(source.read(source.size))
    sanitizer.close()

    return sanitizer.root.innerHTML()
def process_readmessage(self, message):
    """ Parse the HTML in `message` and record the mail it displays.

    Collects the subject, the From, To and Sent values, the HTML of the
    ReadMsgContainer div and the message id (the `mid` attribute of the
    first div that has one), parses the Sent value into a time when
    possible, and returns the value of
    self.insert_message(result, inode_template='l%s').
    """
    parser = HTML.HTMLParser(verbose=0)
    parser.feed(message)
    parser.close()

    result = {'type': 'Read', 'Message': ''}

    ## Find the subject
    sbj = parser.root.find('td', {'class': 'ReadMsgSubject'})
    if sbj:
        result['Subject'] = HTML.decode_entity(sbj.innerHTML())

    ## A cell starting with "from:", "to:" or "sent:" announces that the
    ## next cell holds the value for that field.
    context = None
    for td in parser.root.search('td'):
        data = td.innerHTML()
        if context:
            result[context] = HTML.decode_entity(data)
            context = None

        label = data.lower()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('sent:'):
            context = 'Sent'

    msg = parser.root.find('div', {'class': 'ReadMsgContainer'})
    if msg:
        result['Message'] = msg.innerHTML()

    ## Try to detect the message ID
    tag = parser.root.find('div', {'mid': '.'})
    if tag:
        result['message_id'] = tag['mid']

    ## Parse the date that was stored under 'Sent'. This used to index
    ## result with the leftover `context` variable (normally None), so the
    ## lookup failed, the error was swallowed and the date was never parsed.
    if 'Sent' in result:
        try:
            result['Sent'] = Time.parse(result['Sent'])
        except Exception:
            ## Best effort: keep the raw text when it cannot be parsed.
            pass

    return self.insert_message(result, inode_template='l%s')
def process_readmessage(self, message):
    """ Parse the HTML in `message` and record the mail it displays.

    Collects the subject, the From, To and Sent values, the HTML of the
    ReadMsgContainer div and the message id (the `mid` attribute of the
    first div that has one), parses the Sent value into a time when
    possible, and returns the value of
    self.insert_message(result, inode_template='l%s').
    """
    parser = HTML.HTMLParser(verbose=0)
    parser.feed(message)
    parser.close()

    result = {'type': 'Read', 'Message': ''}

    ## Find the subject
    sbj = parser.root.find('td', {'class': 'ReadMsgSubject'})
    if sbj:
        result['Subject'] = HTML.decode_entity(sbj.innerHTML())

    ## A cell starting with "from:", "to:" or "sent:" announces that the
    ## next cell holds the value for that field.
    context = None
    for td in parser.root.search('td'):
        data = td.innerHTML()
        if context:
            result[context] = HTML.decode_entity(data)
            context = None

        label = data.lower()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('sent:'):
            context = 'Sent'

    msg = parser.root.find('div', {'class': 'ReadMsgContainer'})
    if msg:
        result['Message'] = msg.innerHTML()

    ## Try to detect the message ID
    tag = parser.root.find('div', {'mid': '.'})
    if tag:
        result['message_id'] = tag['mid']

    ## Parse the date that was stored under 'Sent'. This used to index
    ## result with the leftover `context` variable (normally None), so the
    ## lookup failed, the error was swallowed and the date was never parsed.
    if 'Sent' in result:
        try:
            result['Sent'] = Time.parse(result['Sent'])
        except Exception:
            ## Best effort: keep the raw text when it cannot be parsed.
            pass

    return self.insert_message(result, inode_template='l%s')
def boring(self, metadata, data=''):
    """ Decide whether this object can be ignored.

    Returns False (not boring) only when the base class does not find the
    object boring and `data` matches '<title>[^<]+Yahoo! Mail'; in that
    case self.username is reset to None and self.parser is created if it
    does not exist yet. Returns True otherwise.
    """
    ## We dont think its boring if our base class does not:
    ## NOTE(review): the base class is asked with data='' rather than with
    ## the `data` argument - confirm this is intended.
    if Scanner.StoreAndScanType.boring(self, metadata, data=''):
        return True

    title_match = re.search("<title>[^<]+Yahoo! Mail", data)
    if not title_match:
        return True

    self.username = None

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def process_editread(self, fd):
    """ Extract the draft shown on a compose page.

    Works on self.parser.root and returns None when the page has no
    ComposeHeader table. Otherwise the selected From address, the To, CC,
    BCC and subject inputs, and the message text (the EditArea div plus
    the first script that assigns fEditArea's innerHTML) are collected and
    the value of self.insert_message(fd, result) is returned.
    """
    ## Find the ComposeHeader table:
    result = {'type': 'Edit Read'}
    root = self.parser.root
    tag = root.find('table', {"class": 'ComposeHeader'})
    if not tag:
        return

    ## Find the From:
    row = tag.find('select', dict(name='ffrom'))
    if row:
        option = row.find('option', dict(selected='.*'))
        ## The select may have no selected option; indexing the empty
        ## result of find() would fail.
        if option:
            result['From'] = HTML.decode_entity(option['value'])

    for field, pattern in [('To', 'fto'), ('CC', 'fcc'),
                           ('BCC', 'fbcc'), ('subject', 'fsubject')]:
        tmp = tag.find('input', dict(name=pattern))
        if tmp:
            result[field] = HTML.decode_entity(tmp['value'])

    ## Now extract the content of the email:
    result['message'] = ''

    ## Sometimes the message is found in the EditArea div:
    div = root.find('div', dict(id='EditArea'))
    if div:
        result['message'] += div.innerHTML()

    ## On newer sites its injected using script:
    for s in root.search('script'):
        m = re.match("document\.getElementById\(\"fEditArea\"\)\.innerHTML='([^']*)'", s.innerHTML())
        if m:
            result['message'] += m.group(1).decode("string_escape")
            break

    return self.insert_message(fd, result)
def process_readmessage(self,fd):
    """ Extract the message shown on a read-message page.

    Works on self.parser.root and returns None when the page has no
    ReadMsgContainer div. Otherwise the subject, From, To and sent values
    are collected from the container's cells, the body is taken from the
    first script that assigns MsgContainer's innerHTML, and the result of
    self.insert_message(fd, result) is returned.
    """
    root = self.parser.root
    container = root.find('div', {'class': 'ReadMsgContainer'})
    if not container:
        return

    result = {'type': 'Read', 'message': ''}

    ## Find the subject:
    subject_cell = container.find('td', {'class': 'ReadMsgSubject'})
    if subject_cell:
        result['subject'] = HTML.decode_entity(subject_cell.innerHTML())

    ## Fill in all the other fields. A cell starting with one of these
    ## labels means the next cell holds the value for that key:
    labels = (('from:', 'From'), ('to:', 'To'), ('sent:', 'sent'))
    pending = None
    for cell in container.search('td'):
        data = cell.innerHTML()
        if pending:
            result[pending] = HTML.decode_entity(data)
            pending = None

        lowered = data.lower()
        for prefix, key in labels:
            if lowered.startswith(prefix):
                pending = key
                break

    ## Now the message:
    ## On newer sites its injected using script:
    body_pattern = "document\.getElementById\(\"MsgContainer\"\)\.innerHTML='([^']*)'"
    for script in root.search('script'):
        match = re.match(body_pattern, script.innerHTML())
        if match:
            escaped_body = match.group(1)
            result['message'] += HTML.decode_unicode(
                escaped_body.decode("string_escape"))
            break

    try:
        result['sent'] = Time.parse(result['sent'])
    except:
        ## No sent value, or one that cannot be parsed: leave it as is.
        pass

    return self.insert_message(fd, result)
def process_editread(self, fd):
    """ Extract the draft shown on a compose page.

    Works on self.parser.root and returns None when the page has no
    ComposeHeader table. Otherwise the selected From address, the To, CC,
    BCC and subject inputs, and the message text (the EditArea div plus
    the first script that assigns fEditArea's innerHTML) are collected and
    the value of self.insert_message(result) is returned. `fd` is not
    read.
    """
    ## Find the ComposeHeader table:
    result = {'type': 'Edit Read'}
    root = self.parser.root
    tag = root.find('table', {"class": 'ComposeHeader'})
    if not tag:
        return

    ## Find the From:
    row = tag.find('select', dict(name='ffrom'))
    if row:
        option = row.find('option', dict(selected='.*'))
        ## The select may have no selected option; indexing the empty
        ## result of find() would fail.
        if option:
            result['From'] = HTML.decode_entity(option['value'])

    for field, pattern in [('To', 'fto'), ('CC', 'fcc'),
                           ('BCC', 'fbcc'), ('subject', 'fsubject')]:
        tmp = tag.find('input', dict(name=pattern))
        if tmp:
            result[field] = HTML.decode_entity(tmp['value'])

    ## Now extract the content of the email:
    result['message'] = ''

    ## Sometimes the message is found in the EditArea div:
    div = root.find('div', dict(id='EditArea'))
    if div:
        result['message'] += div.innerHTML()

    ## On newer sites its injected using script:
    for s in root.search('script'):
        m = re.match(
            "document\.getElementById\(\"fEditArea\"\)\.innerHTML='([^']*)'",
            s.innerHTML())
        if m:
            result['message'] += m.group(1).decode("string_escape")
            break

    return self.insert_message(result)
def process_message_yahoo1(self, result, header):
    """ Handle Yahoo mail from old version (prior to 20080224).

    `result` is filled in place from the cells of `header`: a cell that
    starts with "from:", "to:", "date:" or "subject:" labels the following
    cell, whose first string child becomes the value. The body is taken
    from the div with class msgbody under self.parser.root and the message
    id from the MsgId input inside `header`.

    Returns the value of self.insert_message when `result` holds more than
    three keys, otherwise None.
    """
    ## Look through all its rows:
    context = None
    for td in header.search("td"):
        if context:
            for i in td:
                if isinstance(i, str):
                    result[context] = HTML.unquote(HTML.decode_entity(i))
                    break
            context = None

        label = td.innerHTML().lower().strip()
        if label.startswith('from:'):
            context = 'From'
        elif label.startswith('to:'):
            context = 'To'
        elif label.startswith('date:'):
            context = 'Sent'
        elif label.startswith('subject:'):
            context = 'Subject'

    ## Now the message:
    msgbody = self.parser.root.find('div', {"class": "msgbody"})
    if msgbody:
        result['message'] = msgbody.innerHTML()

    if 'Sent' in result:
        ## The raw date is stored under 'Sent'. Reading result['sent']
        ## here raised KeyError whenever a date had been found. The parsed
        ## value replaces the raw one under the 'sent' key.
        ## NOTE(review): assumes the raw 'Sent' text should not be kept
        ## next to the parsed 'sent' value - confirm against insert_message.
        result['sent'] = Time.parse(result.pop('Sent'), case=self.case,
                                    evidence_tz=None)

    ## Find the message id:
    tag = header.find('input', dict(name='MsgId'))
    if tag:
        result['message_id'] = tag['value']

    if len(result) > 3:
        return self.insert_message(result, inode_template="y%s")
def render_html(self, value, table_renderer):
    """ Return `value` rendered as wrapped typewriter text in HTML.

    `value` (or '' when it is empty) is parsed with HTML.TextTag; the
    parsed tree's innerHTML is placed in a fresh HTMLUI and the string
    form of that UI is returned. `table_renderer` is not read.
    """
    import plugins.TableRenderers.HTMLBundle as HTMLBundle

    text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
    text_parser.feed(value or '')
    text_parser.close()
    plain_text = text_parser.root.innerHTML()

    ## Make sure its wrapped:
    wrapped_ui = HTMLUI.HTMLUI(initial=True)
    wrapped_ui.text(plain_text, wrap='full', font='typewriter')

    return wrapped_ui.__str__()
def boring(self, metadata, data=''):
    """ Decide whether this object can be ignored.

    Returns False (not boring) only when the base class does not find the
    object boring and `data` contains the opening of one of the
    GetDisplayMessageResponse, ListMessagesResponse or SendMessageResponse
    elements. The matched element name is stored in self.context and
    self.parser is created if it does not exist yet. Returns True
    otherwise.
    """
    ## Yahoo web 2.0 is very nice to work with- All
    ## responses are in nice XML
    ## NOTE(review): the base class is asked with data='' rather than with
    ## the `data` argument - confirm this is intended.
    if Scanner.StoreAndScanType.boring(self, metadata, data=''):
        return True

    response_match = re.search(
        "<(GetDisplayMessageResponse|ListMessagesResponse|SendMessageResponse)",
        data)
    if not response_match:
        return True

    self.context = response_match.group(1)

    ## Make a new parser:
    if not self.parser:
        self.parser = HTML.HTMLParser(verbose=0)

    return False
def fixup_page(self, root, tag_class):
    """ Put the stored message text into the page's message container.

    The first of div.EditArea, div#MsgContainer or textarea#fMessageBody
    found under `root` is emptied, given the wrapped string form of
    self.message (parsed with `tag_class`) as its only child, and renamed
    to 'div'. Nothing happens when none of them exists.
    """
    ## We have to inject the message into the edit area:
    candidates = (
        ("div", {"class": "EditArea"}),
        ("div", {"id": "MsgContainer"}),
        ("textarea", {"id": "fMessageBody"}),
    )
    edit_area = None
    for tag_name, attributes in candidates:
        edit_area = root.find(tag_name, attributes)
        if edit_area:
            break

    if not edit_area:
        return

    message_parser = HTML.HTMLParser(tag_class=tag_class)
    message_parser.feed(HTML.decode(self.message))
    message_parser.close()

    wrapped = textwrap.fill(message_parser.root.__str__())

    edit_area.prune()
    edit_area.add_child(wrapped)
    edit_area.name = 'div'
def insert_message(self, result, inode_template="l%s"):
    """ Print the fields of `result` instead of storing them.

    When `result` has a 'Message' entry it is replaced by the string form
    of that HTML parsed with HTML.TextTag. Every key/value pair is then
    printed on its own line. `inode_template` is not read.

    Always returns True.
    """
    ## We dont really want to touch the db in here - just print it out
    ## nicely:
    if 'Message' in result:
        ## Try to render the html as text. Only the missing key is
        ## tolerated; a KeyError raised inside the parser is no longer
        ## silently swallowed.
        message = unicode(result['Message'])
        p = HTML.HTMLParser(tag_class=HTML.TextTag)
        p.feed(message)
        p.close()
        result['Message'] = p.root.__str__()

    for k, v in result.items():
        ## A single parenthesised argument prints the same text under the
        ## Python 2 print statement and the Python 3 print function.
        print(" %s: %r" % (k, v))

    return True
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Parse a "Yahoo Mail AJAX" object and process the messages in it.

    Nothing is done for other types. The file is fed to a fresh
    self.parser in 1MB chunks; the first non-empty chunk is kept in
    self.context and, when it mentions GetDisplayMessageResponse,
    self.process_readmessage(fd) is called.
    """
    if "Yahoo Mail AJAX" not in type:
        return

    self.parser = HTML.HTMLParser(verbose=0)
    pyflaglog.log(pyflaglog.DEBUG,
                  "Opening %s for YahooMail2.0 processing" % fd.inode_id)

    ## Read all the data into the parser
    self.context = None
    while 1:
        data = fd.read(1024 * 1024)
        if not data:
            break

        if not self.context:
            self.context = data

        self.parser.feed(data)

    self.parser.close()

    ## self.context is still None for an empty file, and `in None` would
    ## raise TypeError.
    if self.context and 'GetDisplayMessageResponse' in self.context:
        self.process_readmessage(fd)
def fixup_page(self, result, message, tag_class):
    """ Inject `message` into the parsed page and return the page's HTML.

    The first of div.EditArea, div#MsgContainer or textarea#fMessageBody
    found under self.parser.root is emptied, given the wrapped string form
    of `message` (parsed with `tag_class`) as its only child, and renamed
    to 'div'.

    Returns None when `message` is empty, otherwise the innerHTML of
    self.parser.root. The incoming value of `result` is not read.
    """
    if not message:
        return

    ## We have to inject the message into the edit area:
    candidates = (
        ("div", {"class": "EditArea"}),
        ("div", {"id": "MsgContainer"}),
        ("textarea", {"id": "fMessageBody"}),
    )
    edit_area = None
    for tag_name, attributes in candidates:
        edit_area = self.parser.root.find(tag_name, attributes)
        if edit_area:
            break

    if edit_area:
        message_parser = HTML.HTMLParser(tag_class=tag_class)
        message_parser.feed(HTML.decode(message))
        message_parser.close()

        wrapped = textwrap.fill(message_parser.root.__str__())

        edit_area.prune()
        edit_area.add_child(wrapped)
        edit_area.name = 'div'

    return self.parser.root.innerHTML()
def process_send_message(self, fd):
    """ Record a message that was sent in the 'body' HTTP parameter.

    The http_parameters table is searched for a 'body' parameter of
    self.fd.inode_id. When it points (through `indirect`) at a stored
    object, that object is parsed into self.parser and its from, to, text
    and subject elements are passed to self.insert_message as an
    'Edit Sent' record. Returns None in all cases.

    NOTE(review): the lookup uses self.fd rather than the `fd` argument,
    whose incoming value is never read - confirm this is intended.
    """
    dbh = DB.DBO(self.case)
    dbh.execute(
        "select `key`,`value`,`indirect` from http_parameters where `key`='body' and inode_id = %r limit 1",
        self.fd.inode_id)
    parameter = dbh.fetch()
    if not parameter or not parameter['indirect']:
        return

    ## Need to parse the sent message
    dbfs = FileSystem.DBFS(self.case)
    fd = dbfs.open(inode_id=parameter['indirect'])

    self.parser = HTML.HTMLParser(verbose=0)
    self.parser.feed(fd.read())
    self.parser.close()
    root = self.parser.root

    result = {'type': 'Edit Sent'}
    result['From'] = self.parse_email_address(root, 'from')
    result['To'] = self.parse_email_address(root, 'to')

    ## Optional elements: skipped whenever they cannot be read.
    ## (Sometimes an html version is sent as well; it is not used.)
    for key, element in (('message', "text"), ('subject', "subject")):
        try:
            result[key] = root.find(element).innerHTML()
        except:
            pass

    self.insert_message(result, "webmail")
def stats(self, query,result):
    """ Render the stored webmail message for this inode into `result`.

    Looks up the webmail_messages row for self.lookup_id(), shows the
    mtime of the matching inode row as "Timestamp", then one table row per
    message column. The 'message' column is parsed with ResolvingHTMLTag
    and shown as wrapped text. Finally the URL of the parent inode from
    the http table is shown when one exists.

    `query` is not read. Nothing beyond the table header is rendered when
    no webmail row exists for the inode.
    """
    result.start_table(**{'class': 'GeneralTable'})
    dbh = DB.DBO(self.case)
    columns = ["service", "type", "From", "To", "CC", "BCC", "sent",
               "subject", "message"]

    dbh.execute("select * from webmail_messages where `inode_id`=%r",
                self.lookup_id())
    row = dbh.fetch()
    if not row:
        ## No webmail record for this inode - subscripting None would fail.
        return

    dbh2 = DB.DBO(self.case)
    dbh2.execute("select * from inode where inode_id = %r", row['inode_id'])
    row2 = dbh2.fetch()
    if row2:
        result.row("Timestamp", row2['mtime'])

    for c in columns:
        value = row[c]
        if c == 'message':
            ## Filter the message out here:
            parser = HTML.HTMLParser(
                tag_class=FlagFramework.Curry(
                    HTML.ResolvingHTMLTag,
                    case=self.case,
                    inode_id=row['parent_inode_id']))
            parser.feed(HTML.decode(value or ""))
            parser.close()
            value = textwrap.fill(parser.root.__str__())

        result.row(c, value)

    dbh.execute("select url from http where inode_id = %r",
                row['parent_inode_id'])
    row = dbh.fetch()
    if row:
        tmp = result.__class__(result)
        tmp.text(row['url'], font='typewriter', wrap='full')
        result.row("URL", tmp)
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Parse a Windows Live page and run the message handlers on it.

    Only objects whose type contains "HTML" and whose first 1024 bytes
    match '<title>\s+Windows Live' are handled. The whole file is fed to a
    fresh self.parser in 1024 byte chunks (decoded as utf8, ignoring
    errors), the parser is drained, and the send, edit, read and listing
    handlers are called with `fd` in that order.
    """
    if "HTML" not in type:
        return

    chunk = fd.read(1024)
    if not re.search("<title>\s+Windows Live", chunk):
        return

    ## Ok - we know its a Live page
    pyflaglog.log(
        pyflaglog.DEBUG,
        "Opening (%s) %s for Hotmail processing" % (fd.inode_id, fd.urn))

    self.parser = HTML.HTMLParser(verbose=0)
    self.parser.feed(chunk.decode("utf8", "ignore"))

    ## The final, empty read is fed to the parser as well.
    while len(chunk) > 0:
        chunk = fd.read(1024)
        self.parser.feed(chunk.decode("utf8", "ignore"))

    ## Get all the tokens
    while self.parser.next_token(True):
        pass

    ## Now we should be able to parse the data out:
    self.process_send_message(fd)
    self.process_editread(fd)
    self.process_readmessage(fd)
    self.process_mail_listing(fd)
def fixup_page(self, root, tag_class):
    """ Make the page under `root` visible again.

    A script that switches visibility on is appended to <body>, every
    "visibility:hidden" is removed from the first child of each <style>
    element, and a link to the stylesheet is appended to <head>.
    `tag_class` is not read.
    """
    ## Put in some script to turn on visibility (this emulates
    ## what yahoo does).
    ## This will not be filtered out because the parser thinks its
    ## just a string - so it will be executed in the browser after
    ## page loads.
    body = root.find("body")
    body.add_child("""<script> document.write('<style>* { visibility: visible; }</style>'); </script>""")

    ## This stylesheet is stuck in a comment?? WTF??
    head = root.find("head")
    stylesheet_attributes = {
        'type': 'text/css',
        'rel': 'stylesheet',
        'href': "http://us.js2.yimg.com/us.js.yimg.com/lib/hdr/uhbt1_v27_1.8.css",
    }
    stylesheet_link = HTML.ResolvingHTMLTag(name="link",
                                            case=head.case,
                                            inode_id=head.inode_id,
                                            attributes=stylesheet_attributes)

    ## There are various visibility:hiddens all through the place:
    for style_tag in root.search("style"):
        try:
            css = style_tag.children[0]
            style_tag.children[0] = css.replace("visibility:hidden", "")
        except:
            ## Style elements that cannot be rewritten are left alone.
            pass

    head.add_child(stylesheet_link)
def make_link(self, url):
    """ Return `url` after HTML.unquote and urlnorm.normalize. """
    unquoted = HTML.unquote(url)
    return urlnorm.normalize(unquoted)
def process_stream(self, stream, factories):
    """ We look for HTTP requests to identify the stream. This
    allows us to processes HTTP connections on unusual ports. This
    situation might arise if HTTP proxies are used for example.

    For every request/response pair found in the stream a VFS node is
    created under <path>/HTTP/<date>/<url>, a row is inserted into the
    `http` and `connection_details` tables, the request parameters are
    handed to self.handle_parameters, and objects with a size above zero
    are scanned with self.scan_as_file(new_inode, factories).

    Returns None. Returns early when the combined stream cannot be opened
    or when the data does not identify as HTTP.
    """
    if stream.reverse:
        combined_inode = "I%s|S%s/%s" % (stream.fd.name, stream.inode_id, stream.reverse)
        try:
            fd = self.fsfd.open(inode=combined_inode)
        ## If we cant open the combined stream, we quit (This could
        ## happen if we are trying to operate on a combined stream
        ## already
        except IOError: return
    else:
        fd = stream

    p=HTTP(fd,self.fsfd)
    ## Check that this is really HTTP
    if not p.identify():
        return

    ## NOTE(review): combined_inode is only assigned in the stream.reverse
    ## branch above. When stream.reverse is false this line (and the uses
    ## further down) look like they raise an unbound local error - confirm.
    pyflaglog.log(pyflaglog.DEBUG,"Openning %s for HTTP" % combined_inode)
    ## Iterate over all the messages in this connection
    for f in p.parse():
        if not f: continue
        offset, size = f

        ## Create the VFS node:
        new_inode="%s|H%s:%s" % (combined_inode,offset,size)

        ## Each transfer/content encoding found in the response headers
        ## appends its own suffix to the inode string. A missing header
        ## raises KeyError and is skipped.
        try:
            if 'chunked' in p.response['transfer-encoding']:
                new_inode += "|c0"
        except KeyError:
            pass

        try:
            if 'gzip' in p.response['content-encoding']:
                new_inode += "|G1"
        except KeyError:
            pass

        try:
            if 'deflate' in p.response['content-encoding']:
                new_inode += "|d1"
        except KeyError:
            pass

        ## stream.ts_sec is already formatted in DB format
        ## need to convert back to utc/gmt as paths are UTC
        timestamp = fd.get_packet_ts(offset)
        ds_timestamp = Time.convert(timestamp, case=self.case, evidence_tz="UTC")
        ## The part before the first space is used as the date directory;
        ## any failure falls back to the stream's own timestamp.
        try:
            date_str = ds_timestamp.split(" ")[0]
        except:
            date_str = stream.ts_sec.split(" ")[0]

        path,inode,inode_id=self.fsfd.lookup(inode=combined_inode)

        ## Try to put the HTTP inodes at the mount point. FIXME:
        ## This should not be needed when a http stats viewer is
        ## written.
        path=posixpath.normpath(path+"/../../../../../")

        inode_id = self.fsfd.VFSCreate(None,new_inode,
                                       "%s/HTTP/%s/%s" % (path,date_str,
                                                          escape(p.request['url'])),
                                       mtime=timestamp, size=size
                                       )

        ## Update the inode again:
        #new_inode = new_inode % inode_id
        ## This updates the inode table with the new inode
        #self.fsfd.VFSCreate(None,new_inode,
        #                    None, update_only = True,
        #                    inode_id = inode_id
        #                    )

        ## Store information about this request in the
        ## http table:
        ## (the host falls back to the stream's destination address when
        ## the request has no host entry)
        host = p.request.get("host",IP2str(stream.dest_ip))
        url = HTML.url_unquote(p.request.get("url"))
        try:
            date = p.response["date"]
            date = Time.parse(date, case=self.case, evidence_tz=None)
        except (KeyError,ValueError):
            ## No usable date header - the column is left out below.
            date = 0

        ## Two forms for the referrer:
        referer = p.request.get('referer', p.request.get('referrer',''))
        ## Urls without a scheme are completed with the host:
        if not url.startswith("http://") and not url.startswith("ftp://"):
            url = "http://%s%s" % (host, url)

        ## Not sure if we really care about this?
        ## Find referred page:
        ## parent = 0
        dbh = DB.DBO(self.case)
        ## if referer:
        ##     dbh.execute("select inode_id from http where url=%r order by inode_id desc limit 1", referer)
        ##     row = dbh.fetch()
        ##     ## If there is no referrer we just make a psuedo entry
        ##     if not row:
        ##         ## Find out the host
        ##         m=re.match("(http://|ftp://)([^/]+)([^\?\&\=]*)",
        ##                    "%s" % referer)
        ##         if m:
        ##             host = m.group(2)
        ##             dbh.insert("http", url=referer, host=host)
        ##             parent = dbh.autoincrement()
        ##     else:
        ##         parent = row['inode_id']

        args = dict(inode_id = inode_id,
                    request_packet = p.request.get("packet_id",0),
                    method = p.request.get("method","-"),
                    url = url,
                    response_packet= p.response.get("packet_id"),
                    status = p.response.get("HTTP_code"),
                    content_type = p.response.get("content-type","text/html"),
                    ## only the first 500 characters of the referrer are stored
                    referrer = referer[:500],
                    host = host,
                    tld = make_tld(host),
                    useragent = p.request.get('user-agent', '-'),
                    )

        if date:
            args['date'] = date

        dbh.insert('http', **args)
        # parent = parent)

        ## Replicate the information about the subobjects in the
        ## connection_details table - this makes it easier to do
        ## some queries:
        dbh.insert("connection_details",
                   ts_sec = stream.ts_sec,
                   inode_id = inode_id,
                   src_ip = stream.src_ip,
                   src_port = stream.src_port,
                   dest_ip = stream.dest_ip,
                   dest_port = stream.dest_port,
                   )

        ## handle the request's parameters:
        try:
            self.handle_parameters(p.request, inode_id)
        except (KeyError, TypeError):
            pass

        ## Only scan the new file using the scanner train if its
        ## size of bigger than 0:
        if size>0:
            self.scan_as_file(new_inode, factories)
def scan(self, fd, scanners, type, mime, cookie, **args):
    """ Rebuild the result table of a Google Image Search page.

    Google delivers the individual results as ``dyn.Img(...)`` calls
    inside <script> tags. Each call is turned into a pair of table
    cells (thumbnail + caption), the cells are grouped five to a row,
    and the resulting table is appended to the ``ImgContent`` div of
    the parsed page. The rendered page is then stored as a new cache
    object named "<fd.urn>/Gimage" which inherits from fd.urn.

    fd is the file-like object being scanned (read(), inode_id, case
    and urn are used). type is the detected type string; nothing is
    done unless it contains "Google Image Search". scanners, mime,
    cookie and args are accepted for interface compatibility and are
    not used here.
    """
    if "Google Image Search" not in type:
        return

    pyflaglog.log(pyflaglog.DEBUG,"Opening %s for Google image search processing" % fd.inode_id)

    ## Parse the file
    self.parser = HTML.HTMLParser()
    self.parser.feed(fd.read())
    self.parser.close()

    ## Templates for one thumbnail cell and one caption cell:
    image_cell = ('\n<td id="tDataImage%s" nowrap="" width="16%%" valign="bottom" '
                  'align="center" style="padding-top: 0px;"> <a href="%s"> '
                  '<img height="%s" width="%s" src="%s?q=tbn:%s%s" '
                  'style="border: 1px solid ;"/> </a> </td>\n')
    text_cell = ('<td id="tDataText%s" width="16%%" valign="top" align="center"> '
                 '<font face="arial,sans-serif" size="-1"> %s <br/> %s - %s <br/> '
                 '<font color="#008000">%s</font> </font> </td>')
    row_pair = "<tr>%s</tr>\n<tr>%s</tr>\n"
    cells_per_row = 5

    ## Raw string so that \( reaches the regex engine untouched, and
    ## an escaped dot so that only a literal "dyn.Img" matches:
    regex = re.compile(r'dyn\.Img(\(.+?\));')

    rows = []
    image_cells = []
    text_cells = []
    total_count = 0

    ## Pull out all the scripts and match the regex:
    for script in self.parser.root.search("script"):
        data = script.innerHTML()
        for m in regex.finditer(data):
            ## SECURITY: this evaluates text taken from captured
            ## network traffic. Empty globals do not remove access to
            ## the builtins, so a crafted page can run arbitrary code
            ## here. The argument list is expected to be a tuple of
            ## literals - ast.literal_eval would be the safe way to
            ## parse it. Left as is pending review.
            row = eval(m.group(1),{},{})

            image_cells.append(image_cell % (
                total_count, row[0], row[5], row[4], row[14], row[2], row[3]))
            text_cells.append(text_cell % (
                total_count, row[6], row[9], row[10], row[11]))
            total_count += 1

            ## A full row - emit the thumbnails above their captions:
            if len(image_cells) >= cells_per_row:
                rows.append(row_pair % ("".join(image_cells),
                                        "".join(text_cells)))
                image_cells = []
                text_cells = []

    ## Left over cells make up a final, shorter row:
    if image_cells:
        rows.append(row_pair % ("".join(image_cells), "".join(text_cells)))

    result = "".join(rows)
    if not result:
        return

    ## Prepare the new page
    tag = self.parser.root.find("div", {"id":"ImgContent"})
    if tag:
        result = "<table>%s</table>" % result
        tag.add_child(result)

    page = self.parser.root.innerHTML()
    page = page.encode("utf8","ignore")

    new_fd = CacheManager.AFF4_MANAGER.create_cache_data(
        fd.case, "%s/Gimage" % fd.urn, page,
        inherited = fd.urn)
    new_fd.close()
def process_readmessage(self,fd):
    """ Store every message found in a "read message" response.

    This is what the message tree looks like (XML):

      <GetDisplayMessageResponse>
        <message>
          <header>
          <part>
          <part>
        <message>
        <message>

    Each <message> is a separate message - therefore the same HTTP
    object might relay several messages. For each one a cache object
    is created at /Webmail/<service>/<message id> (inheriting from
    fd.urn), its details are inserted into the webmail_messages
    table, and the text and image parts are written into it.

    fd is the object being scanned; only fd.case and fd.urn are used.
    """
    root = self.parser.root
    for message in root.search('message'):
        result = {'type': 'Read', 'service':self.service }
        result['message_id'] = message.find("mid").innerHTML()

        ## Messages are made unique using the message_id. This
        ## ensures that even if the same message was seen multiple
        ## times in the traffic, we only retain one copy of it.
        message_urn = "/Webmail/%s/%s" % (
            self.service, result['message_id'].replace("/","_"))

        ## Make sure we dont have duplicates of the same message -
        ## duplicates may occur in other connections, so we skip the
        ## message if its path can already be looked up.
        fsfd = FileSystem.DBFS(fd.case)
        try:
            if fsfd.lookup(path = message_urn):
                continue
        except RuntimeError:
            pass

        ## The date is best effort - a missing or unparsable date just
        ## leaves 'sent' unset. Exception (rather than a bare except)
        ## lets KeyboardInterrupt and SystemExit through.
        try:
            result['sent'] = Time.parse(message.find("receiveddate").innerHTML())
        except Exception:
            pass

        result['subject'] = HTML.unquote(HTML.decode_entity(
            message.find("subject").innerHTML()))

        for tag,field in [('from','From'), ('to','To')]:
            result[field] = self.parse_email_address(message, tag)

        message_fd = CacheManager.AFF4_MANAGER.create_cache_data(
            fd.case, message_urn,
            inherited = fd.urn)
        message_fd.insert_to_table("webmail_messages", result)

        ## now iterate over all the parts:
        for part in message.search("part"):
            ## Parts are basically message attachments.
            ct = part.attributes['type']

            ## Usually text/html are the main body
            if "text" in ct:
                text = part.find("text")
                message_fd.write(HTML.unquote(HTML.decode_entity(text.innerHTML())))
            elif "image" in ct:
                ## Images are shown as their filename above the thumbnail
                message_fd.write(DB.expand("<b>%s</b><br><img src='%s'/>",(
                    self.make_link(part.attributes.get('filename','')),
                    self.make_link(part.attributes['thumbnailurl']))))

        message_fd.close()
escape(p.request['url'])), mtime=timestamp, size=size ) ## Update the inode again: #new_inode = new_inode % inode_id ## This updates the inode table with the new inode #self.fsfd.VFSCreate(None,new_inode, # None, update_only = True, # inode_id = inode_id # ) ## Store information about this request in the ## http table: host = p.request.get("host",IP2str(stream.dest_ip)) url = HTML.url_unquote(p.request.get("url")) try: date = p.response["date"] date = Time.parse(date, case=self.case, evidence_tz=None) except (KeyError,ValueError): date = 0 ## Two forms for the referrer: referer = p.request.get('referer', p.request.get('referrer','')) if not url.startswith("http://") and not url.startswith("ftp://"): url = "http://%s%s" % (host, url) ## Not sure if we really care about this? ## Find referred page: ## parent = 0 dbh = DB.DBO(self.case)
def MSG(self, items, fd, scanners):
    """ Sends message to members of the current session

    There are two types of messages that may be sent:

    1) A message from the client to the message server.
       This does not contain the nick of the client, but does contain a
       transaction ID. This message is sent to all users in the
       current session.

    2) A message from the Switchboard server to the client contains
       the nick of the sender.

    These two commands are totally different.

    1.

    MSG 1532 U 92
    MIME-Version: 1.0
    Content-Type: text/x-msmsgscontrol
    TypingUser: user@example.com

    Format is: MSG <Transaction ID> <Type of ACK required> <length of message in bytes>

    Transaction ID is used to correlate server responses to client requests.

    2.

    MSG user@example.com I%20am%20so%20great 102
    MIME-Version: 1.0
    Content-Type: text/x-msmsgscontrol
    TypingUser: user@example.com

    Format is: MSG <Nick> <URL encoded displayname> <length of message in bytes>

    items is the split command line; its last element is the payload
    length in bytes. fd is the stream positioned at the start of the
    payload; on return it is positioned at the end of the payload.
    scanners is not used here.

    The MIME style headers are consumed up to the first empty line. A
    TypingUser header updates fd.client_id and fd.reverse.dest_id. If
    the content type contains text/plain and the body is not empty,
    the body's offset and length are recorded in the msn_session
    table.
    """
    length = int(items[-1])
    end = fd.tell() + length

    ct = ''
    while 1:
        line = fd.readline().strip()
        if not line:
            break

        ## partition never fails: a line without a colon ends up as a
        ## header with an empty value and is ignored below, instead of
        ## aborting before the payload has been skipped.
        header, _, value = line.partition(":")
        header = header.lower()

        if header == 'typinguser':
            fd.client_id = fd.reverse.dest_id = value.strip()
        elif header == 'content-type':
            ct = value

    ## The headers are done - the body starts here and runs up to end:
    start = fd.tell()
    fd.seek(end - start, 1)

    ## We only care about text messages here
    if end > start and 'text/plain' in ct:
        ## The time is taken from the packet we are currently on
        CacheManager.urn_insert_to_table(
            fd.urn, "msn_session",
            dict(session_id = self.session_id,
                 _time = "from_unixtime(%s)" % fd.current_packet.ts_sec,
                 offset = start,
                 length = end - start,
                 sender = fd.client_id,
                 recipient = fd.dest_id,
                 type = 'MESSAGE',
                 ))
def xxxdisplay(self, value, row, result):
    """ Return value re-rendered through the sanitizing HTML parser.

    value is fed to an HTML.HTMLParser built with HTML.SanitizingTag
    as its tag class, and the innerHTML of the resulting tree is
    returned. row and result are not used.
    """
    sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
    sanitizer.feed(value)
    sanitizer.close()

    return sanitizer.root.innerHTML()
def sanitize_data(self, data, value, result): parser = HTML.HTMLParser(tag_class = \ FlagFramework.Curry(MessageTags, case = self.case, inode_id = value))