Esempio n. 1
0
    def calculate_sanitized_body(self):
        """Compute ``self.sanitized_body`` and ``self.snippet`` from the body.

        ``self.body()`` must return a ``(plain_part, html_part)`` pair.

        * If ``html_part`` is truthy it is parsed with BeautifulSoup (lxml),
          Gmail quote blocks, trailing ``<br>`` tags, the doctype and HTML
          comments are removed, and the resulting markup is stored in
          ``self.sanitized_body``.  The snippet is the document text (minus
          ``style``/``head``/``title``) truncated to ``snippet_length``.
        * Otherwise, if ``plain_part`` is ``None``, both attributes are set
          to the empty string.
        * Otherwise the plain text is run through ``strip_plaintext_quote``
          and ``plaintext2html``; the snippet is taken from the stripped
          plain text.

        Raises:
            AssertionError: if ``html_part`` contains a carriage return.
        """
        def strip_trailing_breaks(container):
            # Walk a copy from the end: extract() mutates container.contents.
            for tag in reversed(list(container.contents or ())):
                if getattr(tag, 'name', None) != 'br':
                    break
                tag.extract()

        plain_part, html_part = self.body()
        snippet_length = 191

        if html_part:
            assert '\r' not in html_part, "newlines not normalized"

            # Try our best to strip out gmail quoted text.
            soup = BeautifulSoup(html_part.strip(), "lxml")
            for div in soup.findAll('div', 'gmail_quote'):
                div.extract()
            for container in soup.findAll('div', 'gmail_extra'):
                strip_trailing_breaks(container)
                # ``contents`` is a list, so test for emptiness rather than
                # ``is None``; otherwise an emptied container is never
                # removed.
                if not container.contents:
                    container.extract()

            # Paragraphs don't need trailing line-breaks.
            for paragraph in soup.findAll('p'):
                strip_trailing_breaks(paragraph)

            # Drop the doctype declaration and any HTML comments.
            dtd = [item for item in soup.contents if isinstance(item, Doctype)]
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            for tag in chain(dtd, comments):
                tag.extract()

            self.sanitized_body = unicode(soup)

            # The snippet is built after sanitized_body has been captured, so
            # removing these tags only affects the snippet text.
            for tag in soup.findAll(['style', 'head', 'title']):
                tag.extract()
            self.snippet = soup.get_text(' ')[:snippet_length]
        elif plain_part is None:
            self.sanitized_body = u''
            self.snippet = u''
        else:
            stripped = strip_plaintext_quote(plain_part.strip())
            self.sanitized_body = plaintext2html(stripped)
            self.snippet = stripped[:snippet_length]
Esempio n. 2
0
File: base.py Progetto: jre21/inbox
    def calculate_sanitized_body(self):
        """Compute ``self.sanitized_body`` and ``self.snippet`` from the body.

        ``self.body`` must be a ``(plain_part, html_part)`` pair.

        * If ``html_part`` is truthy it is parsed with BeautifulSoup (lxml),
          Gmail quote blocks, trailing ``<br>`` tags, the doctype and HTML
          comments are removed, and the resulting markup is stored in
          ``self.sanitized_body``.  The snippet is the document text (minus
          ``style``/``head``/``title``) truncated to ``snippet_length``.
        * If that processing hits the interpreter recursion limit, the raw
          HTML and a traceback are written under
          ``config['LOGDIR']/bs_parsing_errors``, ``self.decode_error`` is
          set to ``True``, the unsanitized HTML is used as the body, and the
          snippet is taken from the parsed tree if available (else empty).
        * Otherwise, if ``plain_part`` is ``None``, both attributes are set
          to the empty string.
        * Otherwise the plain text is run through ``strip_plaintext_quote``
          and ``plaintext2html``; the snippet is taken from the stripped
          plain text.

        Raises:
            AssertionError: if ``html_part`` contains a carriage return.
            RuntimeError: re-raised when it is not a recursion-depth error.
        """
        def strip_trailing_breaks(container):
            # Walk a copy from the end: extract() mutates container.contents.
            for tag in reversed(list(container.contents or ())):
                if getattr(tag, 'name', None) != 'br':
                    break
                tag.extract()

        plain_part, html_part = self.body
        snippet_length = 191

        if html_part:
            assert '\r' not in html_part, "newlines not normalized"

            # Bound before the try so the error handler can tell whether
            # parsing got far enough to produce a tree at all.
            soup = None

            # Rudimentary stripping out quoted text in 'gmail_quote' div
            # Wrap this in a try/catch because sometimes BeautifulSoup goes
            # down a dark spiral of recursion death
            try:
                soup = BeautifulSoup(html_part.strip(), "lxml")
                for div in soup.findAll('div', 'gmail_quote'):
                    div.extract()
                for container in soup.findAll('div', 'gmail_extra'):
                    strip_trailing_breaks(container)
                    # ``contents`` is a list, so test for emptiness rather
                    # than ``is None``; otherwise an emptied container is
                    # never removed.
                    if not container.contents:
                        container.extract()

                # Paragraphs don't need trailing line-breaks.
                for paragraph in soup.findAll('p'):
                    strip_trailing_breaks(paragraph)

                # Drop the doctype declaration and any HTML comments.
                dtd = [item for item in soup.contents
                       if isinstance(item, Doctype)]
                comments = soup.findAll(
                    text=lambda text: isinstance(text, Comment))
                for tag in chain(dtd, comments):
                    tag.extract()
                self.sanitized_body = unicode(soup)

                # The snippet is built after sanitized_body has been
                # captured, so removing these tags only affects the snippet.
                for tag in soup.findAll(['style', 'head', 'title']):
                    tag.extract()
                self.snippet = soup.get_text(' ')[:snippet_length]

            except RuntimeError as exc:
                err_prefix = 'maximum recursion depth exceeded'
                # e.message is deprecated in Python 3; args may also be
                # empty or hold a non-string, so inspect it defensively.
                message = exc.args[0] if exc.args else ''
                is_recursion_error = (hasattr(message, 'startswith') and
                                      message.startswith(err_prefix))
                if not is_recursion_error:
                    log.error("Unknown BeautifulSoup exception: {0}".format(
                        exc))
                    # Bare raise keeps the original traceback.
                    raise

                # format_exc() takes a ``limit``, not the exception; it
                # formats the exception currently being handled.
                full_traceback = 'Ignoring error: {}\nOuter stack:\n{}{}'\
                    .format(exc, ''.join(traceback.format_stack()[:-2]),
                            traceback.format_exc())

                # Note that python doesn't support tail call recursion
                # optimizations
                # http://neopythonic.blogspot.com/2009/04/tail-recursion-elimination.html
                full_traceback = 'Error in BeautifulSoup.' + \
                    'System recursion limit: {0}'.format(
                        sys.getrecursionlimit()) + \
                    '\n\n\n' + \
                    full_traceback

                # TODO have a better logging service for storing these
                errdir = os.path.join(config['LOGDIR'],
                                      'bs_parsing_errors', )
                errfile = os.path.join(errdir, str(self.data_sha256))
                mkdirp(errdir)

                with open("{0}_traceback".format(errfile), 'w') as fh:
                    fh.write(full_traceback)
                # Write the file in binary mode, since it might also have
                # decoding errors.
                with open("{0}_data".format(errfile), 'wb') as fh:
                    fh.write(html_part.encode("utf-8"))

                # Implicit string concatenation: a backslash continuation
                # inside the literal would embed the indentation in the
                # logged message.
                log.error("BeautifulSoup parsing error. Data logged to "
                          "{0}_data and {0}_traceback".format(errfile))
                self.decode_error = True

                # Not sanitized, but will still work
                self.sanitized_body = html_part

                # The tree may not exist (the constructor itself failed), and
                # walking it can hit the recursion limit again; fall back to
                # an empty snippet rather than failing inside the handler.
                self.snippet = u''
                if soup is not None:
                    try:
                        self.snippet = soup.get_text(' ')[:snippet_length]
                    except RuntimeError:
                        pass

        elif plain_part is None:
            self.sanitized_body = u''
            self.snippet = u''
        else:
            stripped = strip_plaintext_quote(plain_part.strip())
            self.sanitized_body = plaintext2html(stripped)
            self.snippet = stripped[:snippet_length]