def calculate_sanitized_body(self):
    plain_part, html_part = self.body
    # TODO: also strip signatures.
    if html_part:
        assert '\r' not in html_part, "newlines not normalized"
        extracted = extract_from_html(
            html_part.encode('utf-8')).decode('utf-8').strip()
        self.snippet = self.calculate_html_snippet(extracted)
        # If quote-stripping left us with a blank message, store the
        # original instead.
        if self.snippet.strip():
            self.sanitized_body = unicode(extracted)
        else:
            self.snippet = self.calculate_html_snippet(html_part)
            self.sanitized_body = html_part
    elif plain_part:
        extracted = extract_from_plain(plain_part).strip()
        if extracted.strip():
            self.snippet = self.calculate_plaintext_snippet(extracted)
            self.sanitized_body = plaintext2html(extracted, False)
        else:
            self.snippet = self.calculate_plaintext_snippet(plain_part)
            self.sanitized_body = plaintext2html(plain_part, False)
    else:
        self.sanitized_body = u''
        self.snippet = u''
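# A minimal standalone sketch of the fallback logic above: if quote
# stripping empties the message, keep the original. strip_quotes() is a
# hypothetical stand-in for extract_from_plain, here only to make the
# control flow runnable on its own.
def strip_quotes(text):
    # Drop lines that look like quoted reply text.
    return '\n'.join(line for line in text.splitlines()
                     if not line.startswith('>'))

def sanitize(text):
    extracted = strip_quotes(text).strip()
    # Fall back to the unstripped original if extraction emptied it.
    return extracted if extracted else text

assert sanitize('hi\n> quoted reply') == 'hi'
assert sanitize('> the whole message is a quote') == \
    '> the whole message is a quote'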
def calculate_body(self, html_parts, plain_parts):
    html_body = ''.join(html_parts).decode('utf-8').strip()
    plain_body = '\n'.join(plain_parts).decode('utf-8').strip()
    if html_body:
        self.snippet = self.calculate_html_snippet(html_body)
        self.body = html_body
    elif plain_body:
        self.snippet = self.calculate_plaintext_snippet(plain_body)
        self.body = plaintext2html(plain_body, False)
    else:
        self.body = u''
        self.snippet = u''
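# Quick illustration of the part-joining above: HTML parts concatenate
# directly, plaintext parts join with newlines. The inputs are utf-8
# byte strings (hence the .decode), one per MIME part.
html_parts = [b'<p>one</p>', b'<p>two</p>']
plain_parts = [b'one', b'two']
print(b''.join(html_parts).decode('utf-8'))     # <p>one</p><p>two</p>
print(b'\n'.join(plain_parts).decode('utf-8'))  # one / two on two lines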
def calculate_sanitized_body(self): plain_part, html_part = self.body # TODO: also strip signatures. if html_part: assert '\r' not in html_part, "newlines not normalized" self.snippet = self.calculate_html_snippet(html_part) self.sanitized_body = html_part elif plain_part: self.snippet = self.calculate_plaintext_snippet(plain_part) self.sanitized_body = plaintext2html(plain_part, False) else: self.sanitized_body = u'' self.snippet = u''
def calculate_sanitized_body(self): plain_part, html_part = self.body # TODO: also strip signatures. if html_part: assert '\r' not in html_part, "newlines not normalized" stripped = extract_from_html( html_part.encode('utf-8')).decode('utf-8').strip() self.sanitized_body = unicode(stripped) self.calculate_html_snippet(self.sanitized_body) elif plain_part: stripped = extract_from_plain(plain_part).strip() self.sanitized_body = plaintext2html(stripped, False) self.calculate_plaintext_snippet(stripped) else: self.sanitized_body = u'' self.snippet = u''
def calculate_sanitized_body(self):
    plain_part, html_part = self.body
    snippet_length = 191
    if html_part:
        assert '\r' not in html_part, "newlines not normalized"
        # Try our best to strip out gmail quoted text.
        soup = BeautifulSoup(html_part.strip(), "lxml")
        for div in soup.findAll('div', 'gmail_quote'):
            div.extract()
        for container in soup.findAll('div', 'gmail_extra'):
            if container.contents:
                # Strip trailing <br> tags until we hit real content.
                for tag in reversed(container.contents):
                    if not hasattr(tag, 'name') or tag.name != 'br':
                        break
                    else:
                        tag.extract()
            if not container.contents:
                # we emptied it!
                container.extract()
        # Paragraphs don't need trailing line-breaks.
        for container in soup.findAll('p'):
            if container.contents:
                for tag in reversed(container.contents):
                    if not hasattr(tag, 'name') or tag.name != 'br':
                        break
                    else:
                        tag.extract()
        # Misc other crap.
        dtd = [item for item in soup.contents if isinstance(item, Doctype)]
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        for tag in chain(dtd, comments):
            tag.extract()
        self.sanitized_body = unicode(soup)

        # trim for snippet
        for tag in soup.findAll(['style', 'head', 'title']):
            tag.extract()
        self.snippet = soup.get_text(' ')[:snippet_length]
    elif plain_part is None:
        self.sanitized_body = u''
        self.snippet = u''
    else:
        stripped = strip_plaintext_quote(plain_part.strip())
        self.sanitized_body = plaintext2html(stripped)
        self.snippet = stripped[:snippet_length]
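# Standalone sketch of the gmail_quote stripping above, runnable
# outside the class. Only documented bs4 calls are used; the "lxml"
# parser must be installed separately.
from bs4 import BeautifulSoup

html = ('<div>Reply text<br></div>'
        '<div class="gmail_quote">On Mon, someone wrote: ...</div>')
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all('div', 'gmail_quote'):
    div.extract()  # drop the quoted-reply container entirely
print(soup.get_text(' ').strip())  # -> Reply text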
def calculate_body(self, html_parts, plain_parts, store_body=True):
    html_body = "".join(html_parts).decode("utf-8").strip()
    plain_body = "\n".join(plain_parts).decode("utf-8").strip()
    if html_body:
        self.snippet = self.calculate_html_snippet(html_body)
        if store_body:
            self.body = html_body
        else:
            self.body = None
    elif plain_body:
        self.snippet = self.calculate_plaintext_snippet(plain_body)
        if store_body:
            self.body = plaintext2html(plain_body, False)
        else:
            self.body = None
    else:
        self.body = None
        self.snippet = u""
def calculate_sanitized_body(self): plain_part, html_part = self.body snippet_length = 191 if html_part: assert '\r' not in html_part, "newlines not normalized" # Rudimentary stripping out quoted text in 'gmail_quote' div # Wrap this in a try/catch because sometimes BeautifulSoup goes # down a dark spiral of recursion death try: soup = BeautifulSoup(html_part.strip(), "lxml") for div in soup.findAll('div', 'gmail_quote'): div.extract() for container in soup.findAll('div', 'gmail_extra'): if container.contents is not None: for tag in reversed(container.contents): if not hasattr(tag, 'name') or tag.name != 'br': break else: tag.extract() if container.contents is None: # we emptied it! container.extract() # Paragraphs don't need trailing line-breaks. for container in soup.findAll('p'): if container.contents is not None: for tag in reversed(container.contents): if not hasattr(tag, 'name') or tag.name != 'br': break else: tag.extract() # Misc other crap. dtd = [item for item in soup.contents if isinstance( item, Doctype)] comments = soup.findAll(text=lambda text: isinstance( text, Comment)) for tag in chain(dtd, comments): tag.extract() self.sanitized_body = unicode(soup) # trim for snippet for tag in soup.findAll(['style', 'head', 'title']): tag.extract() self.snippet = soup.get_text(' ')[:191] except RuntimeError as exc: err_prefix = 'maximum recursion depth exceeded' # e.message is deprecated in Python 3 if exc.args[0].startswith(err_prefix): full_traceback = 'Ignoring error: {}\nOuter stack:\n{}{}'\ .format(exc, ''.join(traceback.format_stack()[:-2]), traceback.format_exc(exc)) # Note that python doesn't support tail call recursion # optimizations # http://neopythonic.blogspot.com/2009/04/tail-recursion-elimination.html full_traceback = 'Error in BeautifulSoup.' + \ 'System recursion limit: {0}'.format( sys.getrecursionlimit()) + \ '\n\n\n' + \ full_traceback # TODO have a better logging service for storing these errdir = os.path.join(config['LOGDIR'], 'bs_parsing_errors', ) errfile = os.path.join(errdir, str(self.data_sha256)) mkdirp(errdir) with open("{0}_traceback".format(errfile), 'w') as fh: fh.write(full_traceback) # Write the file in binary mode, since it might also have # decoding errors. with open("{0}_data".format(errfile), 'wb') as fh: fh.write(html_part.encode("utf-8")) log.error("BeautifulSoup parsing error. Data logged to\ {0}_data and {0}_traceback".format(errfile)) self.decode_error = True # Not sanitized, but will still work self.sanitized_body = html_part self.snippet = soup.get_text(' ')[:191] else: log.error("Unknown BeautifulSoup exception: {0}".format( exc)) raise exc elif plain_part is None: self.sanitized_body = u'' self.snippet = u'' else: stripped = strip_plaintext_quote(plain_part.strip()) self.sanitized_body = plaintext2html(stripped) self.snippet = stripped[:snippet_length]
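# Minimal sketch of the recursion guard above. On Python 3,
# RecursionError subclasses RuntimeError, so the same except clause
# catches "maximum recursion depth exceeded" on both Python 2 and 3.
from bs4 import BeautifulSoup

def sanitize_or_fallback(html):
    try:
        return str(BeautifulSoup(html, 'lxml'))
    except RuntimeError as exc:  # includes RecursionError on Python 3
        if str(exc).startswith('maximum recursion depth exceeded'):
            return html  # not sanitized, but still renderable
        raise

print(sanitize_or_fallback('<p>hello<br></p>'))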
def test_plaintext2html(plaintext, html):
    assert plaintext2html(plaintext) == html
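# The bare test above presumably gets its (plaintext, html) pairs from
# pytest parametrization defined elsewhere; a hedged sketch of that
# wiring. The expected outputs below are illustrative assumptions about
# plaintext2html, not known-good fixtures.
import pytest

@pytest.mark.parametrize('plaintext,html', [
    ('hello', '<p>hello</p>'),                              # hypothetical
    ('line one\nline two', '<p>line one<br>line two</p>'),  # hypothetical
])
def test_plaintext2html(plaintext, html):
    assert plaintext2html(plaintext) == html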