def calculate_sanitized_body(self):
    plain_part, html_part = self.body
    # TODO: also strip signatures.
    if html_part:
        assert '\r' not in html_part, "newlines not normalized"
        extracted = extract_from_html(
            html_part.encode('utf-8')).decode('utf-8').strip()
        self.snippet = self.calculate_html_snippet(extracted)
        # If quote-stripping left us with a blank message, store the
        # original instead.
        if self.snippet.strip():
            self.sanitized_body = unicode(extracted)
        else:
            self.snippet = self.calculate_html_snippet(html_part)
            self.sanitized_body = html_part
    elif plain_part:
        extracted = extract_from_plain(plain_part).strip()
        if extracted.strip():
            self.snippet = self.calculate_plaintext_snippet(extracted)
            self.sanitized_body = plaintext2html(extracted, False)
        else:
            self.snippet = self.calculate_plaintext_snippet(plain_part)
            self.sanitized_body = plaintext2html(plain_part, False)
    else:
        self.sanitized_body = u''
        self.snippet = u''
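# A minimal standalone sketch of the fallback logic above: if quote
# stripping empties the message, keep the original. strip_quotes() is a
# hypothetical stand-in for extract_from_plain, here only to make the
# control flow runnable on its own.
def strip_quotes(text):
    # Drop lines that look like quoted reply text.
    return '\n'.join(line for line in text.splitlines()
                     if not line.startswith('>'))

def sanitize(text):
    extracted = strip_quotes(text).strip()
    # Fall back to the unstripped original if extraction emptied it.
    return extracted if extracted else text

assert sanitize('hi\n> quoted reply') == 'hi'
assert sanitize('> the whole message is a quote') == \
    '> the whole message is a quote'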
def calculate_body(self, html_parts, plain_parts):
    html_body = ''.join(html_parts).decode('utf-8').strip()
    plain_body = '\n'.join(plain_parts).decode('utf-8').strip()
    if html_body:
        self.snippet = self.calculate_html_snippet(html_body)
        self.body = html_body
    elif plain_body:
        self.snippet = self.calculate_plaintext_snippet(plain_body)
        self.body = plaintext2html(plain_body, False)
    else:
        self.body = u''
        self.snippet = u''
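# Quick illustration of the part-joining above: HTML parts concatenate
# directly, plaintext parts join with newlines. The inputs are utf-8
# byte strings (hence the .decode), one per MIME part.
html_parts = [b'<p>one</p>', b'<p>two</p>']
plain_parts = [b'one', b'two']
print(b''.join(html_parts).decode('utf-8'))     # <p>one</p><p>two</p>
print(b'\n'.join(plain_parts).decode('utf-8'))  # one / two on two lines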
def calculate_sanitized_body(self): plain_part, html_part = self.body # TODO: also strip signatures. if html_part: assert '\r' not in html_part, "newlines not normalized" self.snippet = self.calculate_html_snippet(html_part) self.sanitized_body = html_part elif plain_part: self.snippet = self.calculate_plaintext_snippet(plain_part) self.sanitized_body = plaintext2html(plain_part, False) else: self.sanitized_body = u'' self.snippet = u''
def calculate_sanitized_body(self): plain_part, html_part = self.body # TODO: also strip signatures. if html_part: assert '\r' not in html_part, "newlines not normalized" stripped = extract_from_html( html_part.encode('utf-8')).decode('utf-8').strip() self.sanitized_body = unicode(stripped) self.calculate_html_snippet(self.sanitized_body) elif plain_part: stripped = extract_from_plain(plain_part).strip() self.sanitized_body = plaintext2html(stripped, False) self.calculate_plaintext_snippet(stripped) else: self.sanitized_body = u'' self.snippet = u''
def calculate_sanitized_body(self):
    plain_part, html_part = self.body
    snippet_length = 191
    if html_part:
        assert '\r' not in html_part, "newlines not normalized"
        # Try our best to strip out gmail quoted text.
        soup = BeautifulSoup(html_part.strip(), "lxml")
        for div in soup.findAll('div', 'gmail_quote'):
            div.extract()
        for container in soup.findAll('div', 'gmail_extra'):
            if container.contents:
                # Strip trailing <br> tags until we hit real content.
                for tag in reversed(container.contents):
                    if not hasattr(tag, 'name') or tag.name != 'br':
                        break
                    else:
                        tag.extract()
            if not container.contents:
                # we emptied it!
                container.extract()
        # Paragraphs don't need trailing line-breaks.
        for container in soup.findAll('p'):
            if container.contents:
                for tag in reversed(container.contents):
                    if not hasattr(tag, 'name') or tag.name != 'br':
                        break
                    else:
                        tag.extract()
        # Misc other crap.
        dtd = [item for item in soup.contents if isinstance(item, Doctype)]
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        for tag in chain(dtd, comments):
            tag.extract()
        self.sanitized_body = unicode(soup)

        # trim for snippet
        for tag in soup.findAll(['style', 'head', 'title']):
            tag.extract()
        self.snippet = soup.get_text(' ')[:snippet_length]
    elif plain_part is None:
        self.sanitized_body = u''
        self.snippet = u''
    else:
        stripped = strip_plaintext_quote(plain_part.strip())
        self.sanitized_body = plaintext2html(stripped)
        self.snippet = stripped[:snippet_length]
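# Standalone sketch of the gmail_quote stripping above, runnable
# outside the class. Only documented bs4 calls are used; the "lxml"
# parser must be installed separately.
from bs4 import BeautifulSoup

html = ('<div>Reply text<br></div>'
        '<div class="gmail_quote">On Mon, someone wrote: ...</div>')
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all('div', 'gmail_quote'):
    div.extract()  # drop the quoted-reply container entirely
print(soup.get_text(' ').strip())  # -> Reply text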
def calculate_body(self, html_parts, plain_parts, store_body=True):
    html_body = "".join(html_parts).decode("utf-8").strip()
    plain_body = "\n".join(plain_parts).decode("utf-8").strip()
    if html_body:
        self.snippet = self.calculate_html_snippet(html_body)
        if store_body:
            self.body = html_body
        else:
            self.body = None
    elif plain_body:
        self.snippet = self.calculate_plaintext_snippet(plain_body)
        if store_body:
            self.body = plaintext2html(plain_body, False)
        else:
            self.body = None
    else:
        self.body = None
        self.snippet = u""
def calculate_sanitized_body(self): plain_part, html_part = self.body snippet_length = 191 if html_part: assert '\r' not in html_part, "newlines not normalized" # Rudimentary stripping out quoted text in 'gmail_quote' div # Wrap this in a try/catch because sometimes BeautifulSoup goes # down a dark spiral of recursion death try: soup = BeautifulSoup(html_part.strip(), "lxml") for div in soup.findAll('div', 'gmail_quote'): div.extract() for container in soup.findAll('div', 'gmail_extra'): if container.contents is not None: for tag in reversed(container.contents): if not hasattr(tag, 'name') or tag.name != 'br': break else: tag.extract() if container.contents is None: # we emptied it! container.extract() # Paragraphs don't need trailing line-breaks. for container in soup.findAll('p'): if container.contents is not None: for tag in reversed(container.contents): if not hasattr(tag, 'name') or tag.name != 'br': break else: tag.extract() # Misc other crap. dtd = [item for item in soup.contents if isinstance( item, Doctype)] comments = soup.findAll(text=lambda text: isinstance( text, Comment)) for tag in chain(dtd, comments): tag.extract() self.sanitized_body = unicode(soup) # trim for snippet for tag in soup.findAll(['style', 'head', 'title']): tag.extract() self.snippet = soup.get_text(' ')[:191] except RuntimeError as exc: err_prefix = 'maximum recursion depth exceeded' # e.message is deprecated in Python 3 if exc.args[0].startswith(err_prefix): full_traceback = 'Ignoring error: {}\nOuter stack:\n{}{}'\ .format(exc, ''.join(traceback.format_stack()[:-2]), traceback.format_exc(exc)) # Note that python doesn't support tail call recursion # optimizations # http://neopythonic.blogspot.com/2009/04/tail-recursion-elimination.html full_traceback = 'Error in BeautifulSoup.' + \ 'System recursion limit: {0}'.format( sys.getrecursionlimit()) + \ '\n\n\n' + \ full_traceback # TODO have a better logging service for storing these errdir = os.path.join(config['LOGDIR'], 'bs_parsing_errors', ) errfile = os.path.join(errdir, str(self.data_sha256)) mkdirp(errdir) with open("{0}_traceback".format(errfile), 'w') as fh: fh.write(full_traceback) # Write the file in binary mode, since it might also have # decoding errors. with open("{0}_data".format(errfile), 'wb') as fh: fh.write(html_part.encode("utf-8")) log.error("BeautifulSoup parsing error. Data logged to\ {0}_data and {0}_traceback".format(errfile)) self.decode_error = True # Not sanitized, but will still work self.sanitized_body = html_part self.snippet = soup.get_text(' ')[:191] else: log.error("Unknown BeautifulSoup exception: {0}".format( exc)) raise exc elif plain_part is None: self.sanitized_body = u'' self.snippet = u'' else: stripped = strip_plaintext_quote(plain_part.strip()) self.sanitized_body = plaintext2html(stripped) self.snippet = stripped[:snippet_length]
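# Minimal sketch of the recursion guard above. On Python 3,
# RecursionError subclasses RuntimeError, so the same except clause
# catches "maximum recursion depth exceeded" on both Python 2 and 3.
from bs4 import BeautifulSoup

def sanitize_or_fallback(html):
    try:
        return str(BeautifulSoup(html, 'lxml'))
    except RuntimeError as exc:  # includes RecursionError on Python 3
        if str(exc).startswith('maximum recursion depth exceeded'):
            return html  # not sanitized, but still renderable
        raise

print(sanitize_or_fallback('<p>hello<br></p>'))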
def test_plaintext2html(plaintext, html):
    assert plaintext2html(plaintext) == html
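# The bare test above presumably gets its (plaintext, html) pairs from
# pytest parametrization defined elsewhere; a hedged sketch of that
# wiring. The expected outputs below are illustrative assumptions about
# plaintext2html, not known-good fixtures.
import pytest

@pytest.mark.parametrize('plaintext,html', [
    ('hello', '<p>hello</p>'),                              # hypothetical
    ('line one\nline two', '<p>line one<br>line two</p>'),  # hypothetical
])
def test_plaintext2html(plaintext, html):
    assert plaintext2html(plaintext) == html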