Python HTML.HTMLParserの例、FileFormats.HTML.HTMLParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

    def stats(self, query, result):
        """Render a table describing the webmail message behind this object.

        Fetches the ``webmail_messages`` row for ``self.lookup_id()``, emits
        the ``mtime`` of the matching ``inode`` row as a "Timestamp" row, and
        replaces the ``message`` column with wrapped text produced by the
        HTML parser.

        NOTE(review): this excerpt ends inside the column loop, so how the
        columns are finally written to ``result`` is not visible here.
        ``query`` is not used in the visible part.
        """
        result.start_table(**{'class': 'GeneralTable'})
        dbh = DB.DBO(self.case)
        ## Columns of webmail_messages handled by the loop below.
        columns = [
            "service", "type", "From", "To", "CC", "BCC", "sent", "subject",
            "message"
        ]
        dbh.execute("select * from webmail_messages where `inode_id`=%r",
                    self.lookup_id())
        ## NOTE(review): row is indexed below without a check - presumably a
        ## matching row always exists; confirm against the caller.
        row = dbh.fetch()

        ## Second handle: look up the inode row to obtain its mtime.
        dbh2 = DB.DBO(self.case)
        dbh2.execute("select * from inode where inode_id = %r",
                     row['inode_id'])
        row2 = dbh2.fetch()
        result.row("Timestamp", row2['mtime'])

        for c in columns:
            if c == 'message':
                ## Filter the message out here:
                ## Tags are built by ResolvingHTMLTag curried with this case
                ## and the message's parent inode.
                parser = HTML.HTMLParser(tag_class = \
                                         FlagFramework.Curry(HTML.ResolvingHTMLTag,
                                                             case = self.case,
                                                             inode_id = row['parent_inode_id']))
                #parser = HTML.HTMLParser(tag_class = HTML.TextTag)
                ## A missing/empty message is parsed as the empty string.
                parser.feed(HTML.decode(row[c] or ""))
                parser.close()
                #tmp = result.__class__(result)
                #tmp.text(parser.root.innerHTML(), font='typewriter', wrap='full')
                #row[c] = tmp
                ## Store the parsed tree's string form, wrapped by textwrap,
                ## back into the row.
                r = parser.root.__str__()
                r = textwrap.fill(r)
                row[c] = r

コード例 #2

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

    def sanitize_page(self, tag_class):
        """Return the parent page's HTML re-rendered through ``tag_class``.

        The file stored under ``self.parent_inode_id`` is read and parsed,
        handed to ``self.fixup_page`` for adjustment and, when the page has a
        ``<title>``, that title is extended with the inode's ``mtime`` and
        ``inode`` values so a printout can be identified.
        """
        ## Fetch the original HTML file (fed to the parser undecoded):
        filesystem = FileSystem.DBFS(self.case)
        page_fd = filesystem.open(inode_id=self.parent_inode_id)
        raw_page = page_fd.read()

        ## FIXME - This is a hack which works because we always send a
        ## curried class down; anything without .kwargs is left untouched.
        try:
            tag_class.kwargs['inode_id'] = self.parent_inode_id
        except AttributeError:
            pass

        ## Parse the page:
        page_parser = HTML.HTMLParser(tag_class=tag_class)
        page_parser.feed(raw_page)
        page_parser.close()

        ## Give fixup_page a chance to repair the tree
        tree = page_parser.root
        self.fixup_page(tree, tag_class)

        ## Stamp the title with the inode's mtime and inode so that printed
        ## copies remain identifiable:
        stat = filesystem.istat(inode_id=self.parent_inode_id)
        title = tree.find("title")
        if not title:
            return tree.innerHTML()

        stamped = "%s %s %s" % (title.innerHTML(), stat['mtime'], stat['inode'])
        title.children = [stamped]
        return tree.innerHTML()

コード例 #3

0

ファイルを表示

ファイル: Gmail.py プロジェクト: johnmccabe/pyflag

    def scan(self, fd, scanners, type, mime, cookie, scores=None, **args):
        """Run Gmail processing over ``fd`` when its Gmail magic score is set.

        Nothing happens unless ``scores`` carries a non-zero
        ``GmailStreamMagic`` entry.  For a mime type containing "html" every
        ``<script>`` element is parsed as Javascript and passed to
        ``self.process_js``; for a mime type containing "javascript" the
        whole file is parsed and passed on.

        ``scanners``, ``type``, ``cookie`` and ``args`` are accepted but not
        used here.  Returns None.
        """
        ## scores defaults to None: treat a missing score table exactly like
        ## a zero score instead of failing on None.get().
        if not scores or scores.get('GmailStreamMagic', 0) == 0:
            return

        pyflaglog.log(pyflaglog.DEBUG,
                      "Opening %s for Gmail processing" % fd.inode_id)
        self.current_time = None
        self.current_box = 'Unknown'

        if "html" in mime:
            html_parser = HTML.HTMLParser()
            html_parser.parse_fd(fd)
            html_parser.close()

            ## Process all script segments
            for script_tag in html_parser.root.search("script"):
                script = script_tag.innerHTML()
                try:
                    j = Javascript.JSParser()
                    j.feed(script)
                    j.close()
                except Exception:
                    ## Best effort: a script that fails to parse is skipped,
                    ## but interpreter-level exits are no longer swallowed.
                    continue

                self.process_js(j.root, fd)

        elif "javascript" in mime:
            ## Make a new parser
            j = Javascript.JSParser()
            j.parse_fd(fd)
            j.close()

            self.process_js(j.root, fd)

コード例 #4

0

ファイルを表示

    def process_string(self, fd, string):
        """Parse ``string`` as HTML and run both extractors on the result.

        The closed parser is handed, together with ``fd``, first to
        ``self.process_readmessage`` and then to ``self.process_listing``.
        """
        html_parser = HTML.HTMLParser(verbose=0)
        html_parser.feed(string)
        html_parser.close()

        ## Same parsed tree, two consumers, in this order.
        self.process_readmessage(fd, html_parser)
        self.process_listing(fd, html_parser)

コード例 #5

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

    def display(self, value, row, result):
        """Write ``value``, parsed through HTML.TextTag, to ``result``.

        A false ``value`` is treated as the empty string.  ``row`` is not
        used.
        """
        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(value if value else '')
        text_parser.close()

        rendered = text_parser.root.innerHTML()
        result.text(rendered, wrap='full', font='typewriter')

コード例 #6

0

ファイルを表示

ファイル: ViewFile.py プロジェクト: olivierh59500/pyflag

        def generator():
            """Yield, once, the HTML of ``fd`` parsed with ResolvingHTMLTag.

            A single ``fd.read(1000000)`` call supplies the parser input.
            """
            ## Tags are curried with the file's own inode id and our case.
            resolving_tag = Curry(HTML.ResolvingHTMLTag,
                                  inode_id=fd.lookup_id(),
                                  case=self.case)
            html_parser = HTML.HTMLParser(tag_class=resolving_tag)
            html_parser.feed(fd.read(1000000))
            html_parser.close()

            yield html_parser.root.innerHTML()

コード例 #7

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

        def boring(self, metadata, data=''):
            """Return False for Windows Live pages, True for everything else.

            A page is interesting only when the base class does not find it
            boring and ``data`` contains ``<title>`` followed by whitespace
            and "Windows Live".  In that case ``self.parser`` is created if
            it is not already set.
            """
            if Scanner.StoreAndScanType.boring(self, metadata, data):
                return True

            ## Raw string: \s must reach the regex engine as written rather
            ## than rely on an unrecognised string escape.
            if not re.search(r"<title>\s+Windows Live", data):
                return True

            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

コード例 #8

0

ファイルを表示

    def render_html(self, inode_id, table_renderer):
        """Return the file stored under ``inode_id`` parsed with SanitizingTag.

        The file is opened through the case of ``table_renderer`` and read
        in a single call sized by ``fd.size``.
        """
        import plugins.TableRenderers.HTMLBundle as HTMLBundle

        source = FileSystem.DBFS(table_renderer.case).open(inode_id=inode_id)
        sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)

        sanitizer.feed(source.read(source.size))
        sanitizer.close()

        return sanitizer.root.innerHTML()

コード例 #9

0

ファイルを表示

        def boring(self, metadata, data=''):
            """Return False for Yahoo! Mail pages, True for everything else.

            A page is interesting only when the base class does not find it
            boring and ``data`` has a ``<title>`` containing "Yahoo! Mail".
            In that case ``self.username`` is reset and ``self.parser`` is
            created if it is not already set.
            """
            ## The base class is asked with an empty buffer (data='').
            if Scanner.StoreAndScanType.boring(self, metadata, data=''):
                return True

            if not re.search("<title>[^<]+Yahoo! Mail", data):
                return True

            self.username = None
            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

コード例 #10

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

    def render_html(self, value, table_renderer):
        """Return ``value`` as wrapped typewriter text rendered by an HTMLUI.

        ``value`` is parsed through HTML.TextTag first; a false ``value``
        is treated as the empty string.  ``table_renderer`` is not used.
        """
        import plugins.TableRenderers.HTMLBundle as HTMLBundle

        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(value if value else '')
        text_parser.close()
        plain = text_parser.root.innerHTML()

        ## Make sure its wrapped:
        wrapper = HTMLUI.HTMLUI(initial=True)
        wrapper.text(plain, wrap='full', font='typewriter')
        return wrapper.__str__()

コード例 #11

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

 def fixup_page(self, root, tag_class):
     """Replace the page's edit area with the wrapped text of self.message.

     The first match among div.EditArea, div#MsgContainer and
     textarea#fMessageBody is pruned, given the rendered message as its
     only new child and renamed to ``div``.  Nothing happens when none of
     them is found.
     """
     candidates = (
         ("div", {"class":"EditArea"}),
         ("div", {"id":"MsgContainer"}),
         ("textarea", {"id":"fMessageBody"}),
     )
     edit_area = None
     for tag_name, attributes in candidates:
         edit_area = root.find(tag_name, attributes)
         if edit_area:
             break

     if not edit_area:
         return

     ## Render the decoded message with the caller's tag class.
     message_parser = HTML.HTMLParser(tag_class=tag_class)
     message_parser.feed(HTML.decode(self.message))
     message_parser.close()
     wrapped = textwrap.fill(message_parser.root.__str__())

     ## We have to inject the message into the edit area:
     edit_area.prune()
     edit_area.add_child(wrapped)
     edit_area.name = 'div'

コード例 #12

0

ファイルを表示

        def boring(self, metadata, data=''):
            """Return False for Yahoo web 2.0 XML responses, True otherwise.

            A response is interesting only when the base class does not find
            it boring and ``data`` opens one of the three known response
            elements.  The matched element name is stored in
            ``self.context`` and ``self.parser`` is created if unset.
            """
            ## The base class is asked with an empty buffer (data='').
            if Scanner.StoreAndScanType.boring(self, metadata, data=''):
                return True

            ## Yahoo web 2.0 is very nice to work with - all responses are
            ## in nice XML
            found = re.search(
                "<(GetDisplayMessageResponse|ListMessagesResponse|SendMessageResponse)",
                data)
            if not found:
                return True

            self.context = found.group(1)
            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

コード例 #13

0

ファイルを表示

ファイル: yahoo_mail_versions.py プロジェクト: olivierh59500/pyflag

def insert_message(self, result, inode_template="l%s"):
    """Print every field of ``result`` and return True.

    When ``result`` has a 'Message' entry it is first replaced by the
    string form of that message parsed through HTML.TextTag.  The database
    is not touched and ``inode_template`` is not used.
    """
    try:
        ## Try to render the html as text:
        html_message = unicode(result['Message'])
        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(html_message)
        text_parser.close()

        result['Message'] = text_parser.root.__str__()
    except KeyError:
        ## No message present - print the remaining fields as they are.
        pass

    for field, content in result.items():
        print("   %s: %r" % (field, content))

    return True

コード例 #14

0

ファイルを表示

ファイル: YahooMail.py プロジェクト: johnmccabe/pyflag

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Parse a "Yahoo Mail AJAX" file and process read-message responses.

        Files whose ``type`` does not contain "Yahoo Mail AJAX" are ignored.
        Otherwise the file is fed to a fresh ``self.parser`` in 1 MiB reads,
        the first non-empty read is kept in ``self.context`` and, when that
        read mentions ``GetDisplayMessageResponse``,
        ``self.process_readmessage(fd)`` is called.

        ``scanners``, ``mime``, ``cookie`` and ``args`` are not used here.
        Returns None.
        """
        if "Yahoo Mail AJAX" not in type:
            return

        self.parser = HTML.HTMLParser(verbose=0)
        pyflaglog.log(
            pyflaglog.DEBUG,
            "Opening %s for YahooMail2.0 processing" % fd.inode_id)

        ## Read all the data into the parser
        self.context = None
        while True:
            data = fd.read(1024 * 1024)
            if not data:
                break

            if not self.context:
                self.context = data
            self.parser.feed(data)

        self.parser.close()

        ## An empty file leaves self.context as None; guard the membership
        ## test so it does not raise TypeError.
        if self.context and 'GetDisplayMessageResponse' in self.context:
            self.process_readmessage(fd)

コード例 #15

0

ファイルを表示

    def fixup_page(self, result, message, tag_class):
        """Inject ``message`` into the parsed page and return the page HTML.

        The first match among div.EditArea, div#MsgContainer and
        textarea#fMessageBody under ``self.parser.root`` is pruned, given
        the wrapped, rendered message as its new child and renamed to
        ``div``.  Returns None when ``message`` is empty; otherwise the
        inner HTML of ``self.parser.root``, whether or not an edit area was
        found.  The incoming ``result`` value is never read.
        """
        if not message:
            return

        candidates = (
            ("div", {"class":"EditArea"}),
            ("div", {"id":"MsgContainer"}),
            ("textarea", {"id":"fMessageBody"}),
        )
        edit_area = None
        for tag_name, attributes in candidates:
            edit_area = self.parser.root.find(tag_name, attributes)
            if edit_area:
                break

        if edit_area:
            ## Render the decoded message with the caller's tag class.
            message_parser = HTML.HTMLParser(tag_class=tag_class)
            message_parser.feed(HTML.decode(message))
            message_parser.close()
            wrapped = textwrap.fill(message_parser.root.__str__())

            ## We have to inject the message into the edit area:
            edit_area.prune()
            edit_area.add_child(wrapped)
            edit_area.name = 'div'

        return self.parser.root.innerHTML()

コード例 #16

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

        def process_readmessage(self, message):
            """Extract the fields of a read message and hand them on.

            ``message`` is parsed as HTML.  Subject, From, To, Sent, the
            message body and the message id are collected where present and
            the resulting dict is passed to ``self.insert_message``, whose
            return value is returned.
            """
            parser = HTML.HTMLParser(verbose=0)
            parser.feed(message)
            parser.close()

            fields = {'type': 'Read', 'Message': ''}

            ## Find the subject
            subject_cell = parser.root.find('td', {'class': 'ReadMsgSubject'})
            if subject_cell:
                fields['Subject'] = HTML.decode_entity(subject_cell.innerHTML())

            ## A cell starting with one of these labels announces that the
            ## next cell holds the value for the given key.
            header_labels = (('from:', 'From'), ('to:', 'To'), ('sent:', 'Sent'))
            context = None
            for cell in parser.root.search('td'):
                data = cell.innerHTML()
                if context:
                    fields[context] = HTML.decode_entity(data)
                    context = None

                lowered = data.lower()
                for prefix, key in header_labels:
                    if lowered.startswith(prefix):
                        context = key
                        break

            body = parser.root.find('div', {'class': 'ReadMsgContainer'})
            if body:
                fields['Message'] = body.innerHTML()

            ## Try to detect the message ID
            id_tag = parser.root.find('div', {'mid': '.'})
            if id_tag:
                fields['message_id'] = id_tag['mid']

            ## NOTE(review): context is only non-None here when the very last
            ## <td> was a header label, so this normally does nothing.
            ## Presumably 'Sent' was meant - confirm before changing.
            try:
                fields[context] = Time.parse(fields[context])
            except:
                pass

            return self.insert_message(fields, inode_template='l%s')

コード例 #17

0

ファイルを表示

ファイル: YahooMail.py プロジェクト: johnmccabe/pyflag

    def process_send_message(self, fd):
        """Record a sent message found via the 'body' HTTP parameter.

        Looks up the ``body`` parameter stored for ``self.fd.inode_id``;
        when it points at another inode (``indirect``) that inode is parsed
        into ``self.parser`` and its from/to/text/subject parts are passed
        to ``self.insert_message`` as an 'Edit Sent' message.  Returns None.
        """
        dbh = DB.DBO(self.case)
        dbh.execute(
            "select `key`,`value`,`indirect` from http_parameters where `key`='body' and inode_id = %r limit 1",
            self.fd.inode_id)
        row = dbh.fetch()
        ## Nothing to do without a row that points at a body inode.
        if not row or not row['indirect']:
            return

        ## Need to parse the sent message
        body_fd = FileSystem.DBFS(self.case).open(inode_id=row['indirect'])
        self.parser = HTML.HTMLParser(verbose=0)
        self.parser.feed(body_fd.read())
        self.parser.close()
        root = self.parser.root

        result = {'type': 'Edit Sent'}
        result['From'] = self.parse_email_address(root, 'from')
        result['To'] = self.parse_email_address(root, 'to')

        ## Optional parts: a missing element (or any other failure while
        ## reading it) simply leaves the key out.
        for key, tag_name in (('message', "text"), ('subject', "subject")):
            try:
                result[key] = root.find(tag_name).innerHTML()
            except:
                pass

        self.insert_message(result, "webmail")

コード例 #18

0

ファイルを表示

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Parse a Windows Live page and run the Hotmail processors on it.

        Only files whose ``type`` contains "HTML" and whose first 1024-unit
        read contains ``<title>`` followed by whitespace and "Windows Live"
        are handled.  The file is fed to a fresh ``self.parser`` chunk by
        chunk, tokens are drained after every chunk, and then the
        send/edit/read/listing processors are called with ``fd``.

        ``scanners``, ``mime``, ``cookie`` and ``args`` are not used here.
        Returns None.
        """
        import codecs

        if "HTML" not in type:
            return

        data = fd.read(1024)
        ## Raw string so that \s reaches the regex engine as written.
        if not re.search(r"<title>\s+Windows Live", data):
            return

        ## Ok - we know its a Live page
        pyflaglog.log(
            pyflaglog.DEBUG, "Opening (%s) %s for Hotmail processing" %
            (fd.inode_id, fd.urn))

        ## Decode incrementally: decoding every chunk on its own with
        ## "ignore" silently drops any multi-byte UTF-8 character that is
        ## split across two reads.
        decoder = codecs.getincrementaldecoder("utf8")("ignore")
        self.parser = HTML.HTMLParser(verbose=0)
        self.parser.feed(decoder.decode(data))

        while len(data) > 0:
            data = fd.read(1024)
            ## The last (empty) read marks the input as final, flushing
            ## whatever the decoder still holds.
            self.parser.feed(decoder.decode(data, not data))
            ## Get all the tokens
            while self.parser.next_token(True):
                pass

        ## Now we should be able to parse the data out:
        self.process_send_message(fd)
        self.process_editread(fd)
        self.process_readmessage(fd)
        self.process_mail_listing(fd)

コード例 #19

0

ファイルを表示

 def sanitize_data(self, data, value, result):
     """Build an HTML parser whose tags are MessageTags curried for this case.

     NOTE(review): only the parser construction is visible in this
     excerpt; how ``data`` and ``result`` are used cannot be told from here.
     """
     ## ``value`` is passed on as the inode_id of the curried tag class.
     parser = HTML.HTMLParser(tag_class = \
                              FlagFramework.Curry(MessageTags,
                                                  case = self.case,
                                                  inode_id = value))

コード例 #20

0

ファイルを表示

ファイル: LiveCom.py プロジェクト: ntvis/pyflag

    def xxxdisplay(self, value, row, result):
        """Return ``value`` parsed through HTML.SanitizingTag as HTML.

        ``row`` and ``result`` are not used.
        """
        sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
        sanitizer.feed(value)
        sanitizer.close()

        sanitized = sanitizer.root.innerHTML()
        return sanitized

コード例 #21

0

ファイルを表示

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Rebuild a Google image search page from its ``dyn.Img(...)`` calls.

        Only files whose ``type`` contains "Google Image Search" are
        handled.  Every ``dyn.Img(...)`` call found in a ``<script>`` is
        turned into an image cell and a text cell, laid out five per table
        row.  When at least one cell was produced, the table is appended to
        ``div#ImgContent`` (if present) and the page is stored as cache data
        under ``<fd.urn>/Gimage``.

        ``scanners``, ``mime``, ``cookie`` and ``args`` are not used here.
        Returns None.
        """
        if "Google Image Search" not in type:
            return

        pyflaglog.log(pyflaglog.DEBUG,"Opening %s for Google image search processing" % fd.inode_id)
        ## Parse the file
        self.parser = HTML.HTMLParser()
        self.parser.feed(fd.read())
        self.parser.close()

        ## Pull out all the scripts and match the regex:
        result = ''
        image_text = ''
        text_text = ''
        count = 0
        total_count = 0
        regex = re.compile(r'dyn.Img(\(.+?\));')
        for script in self.parser.root.search("script"):
            data = script.innerHTML()
            for m in regex.finditer(data):
                ## SECURITY(review): eval() runs text taken from the scanned
                ## page.  Empty globals/locals do not remove access to the
                ## builtins, so a crafted page can execute code.  Consider
                ## ast.literal_eval if the arguments are always literals.
                try:
                    row = eval(m.group(1),{},{})
                    image_cell = '''\n<td id="tDataImage%s" nowrap="" width="16%%" valign="bottom" align="center" style="padding-top: 0px;">
                    <a href="%s">
                    <img height="%s" width="%s" src="%s?q=tbn:%s%s" style="border: 1px solid ;"/>
                    </a>
                    </td>\n''' % (total_count, row[0], row[5], row[4], row[14], row[2], row[3])

                    text_cell = '''<td id="tDataText%s" width="16%%" valign="top" align="center">
                    <font face="arial,sans-serif" size="-1">
                    %s
                    <br/>
                    %s - %s
                    <br/>
                    <font color="#008000">%s</font>
                    </font>
                    </td>''' % (total_count, row[6], row[9], row[10], row[11])
                except Exception:
                    ## One malformed or short argument list must not abort
                    ## the whole page - skip just that entry.
                    pyflaglog.log(pyflaglog.DEBUG,
                                  "Skipping unparsable dyn.Img entry in %s" % fd.inode_id)
                    continue

                image_text += image_cell
                text_text += text_cell
                count += 1
                total_count += 1

                ## Five cells make one pair of table rows.
                if count >= 5:
                    result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)
                    image_text = ''
                    text_text = ''
                    count = 0

        ## Flush a trailing, partially filled row.
        if image_text:
            result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)

        if not result:
            return

        ## Prepare the new page
        tag = self.parser.root.find("div", {"id":"ImgContent"})
        if tag:
            result = "<table>%s</table>" % result
            tag.add_child(result)

        page = self.parser.root.innerHTML()
        page = page.encode("utf8","ignore")

        new_fd = CacheManager.AFF4_MANAGER.create_cache_data(
            fd.case,
            "%s/Gimage" % fd.urn,
            page, inherited = fd.urn)

        new_fd.close()