def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

    - every line ending (CRLF, CR or LF) is replaced by a single <br/>
    - enclose content into <p>
    - convert url into clickable link
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param text: plaintext to convert; it is passed through ustr() first
    :param string container_tag: optional tag name wrapping the whole
        result; when falsy (the default) no container is added
    :return: the html markup, as returned by ustr()
    """
    text = cgi.escape(ustr(text))

    # 1. replace line endings; CRLF is tried first so that a Windows line
    #    ending gives one <br/> instead of two (i.e. a paragraph break)
    text = re.sub(r'\r\n|\r|\n', '<br/>', text)

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs: each run of 2+ <br/> closes the current <p>
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 5. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

    - every line ending (CRLF, CR or LF) is replaced by a single <br/>
    - enclose content into <p>
    - convert url into clickable link
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param text: plaintext to convert; it is passed through ustr() first
    :param string container_tag: optional tag name wrapping the whole
        result; when falsy (the default) no container is added
    :return: the html markup, as returned by ustr()
    """
    text = cgi.escape(ustr(text))

    # 1. replace line endings; CRLF is tried first so that a Windows line
    #    ending gives one <br/> instead of two (i.e. a paragraph break)
    text = re.sub(r"\r\n|\r|\n", "<br/>", text)

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs: each run of 2+ <br/> closes the current <p>
    br_tags = re.compile(r"(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})")
    final = "<p>" + br_tags.sub("</p><p>", text) + "</p>"

    # 5. container
    if container_tag:
        final = "<%s>%s</%s>" % (container_tag, final, container_tag)
    return ustr(final)
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

    - every line ending (CRLF, CR or LF) is replaced by a single <br/>
    - enclose content into <p>
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param text: plaintext to convert; it is passed through ustr() first
    :param string container_tag: optional tag name wrapping the whole
        result; when falsy (the default) no container is added
    :return: the html markup, as returned by ustr()
    """
    text = cgi.escape(ustr(text))

    # 1. replace line endings; CRLF is tried first so that a Windows line
    #    ending gives one <br/> instead of two (i.e. a paragraph break)
    text = re.sub(r'\r\n|\r|\n', '<br/>', text)

    # 2-3: form paragraphs: each run of 2+ <br/> closes the current <p>
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def register_exception(db_name, model, uid, e, service, req):
    """Record the exception currently being handled in ``base.general_exception``.

    A dedicated cursor is opened on ``db_name``. When the registry contains the
    ``base.general_exception`` model and ``sys.exc_info()`` exposes a
    traceback, one record is created holding, for every frame of the
    traceback, the file name, the line number, up to 10 source lines on each
    side of the failing line and the frame's local variables.

    :param db_name: name of the database to log into
    :param model: unused by this function
    :param uid: value stored in the ``user`` field of the record
    :param e: the exception; stored as ``unicode(e)``
    :param service: value stored in the ``service`` field
    :param req: request description, stored through ``ustr``
    :return: the ``name`` of the created record, or ``"<unknown>"`` when
        nothing was recorded
    """
    db = openerp.sql_db.db_connect(db_name)
    cr = db.cursor()
    ename = "<unknown>"
    try:
        registry = openerp.registry(cr.dbname)
        if "base.general_exception" not in registry:
            return ename
        tb = sys.exc_info()[2]
        if not tb:
            return ename

        frames = []
        while tb:
            frame = tb.tb_frame
            local_vars = []
            output = ''
            try:
                # (0, 0, vals) triples, sorted by variable name and numbered
                local_vars = [(0, 0, {'name': ustr(k), 'value': ustr(v)})
                              for k, v in frame.f_locals.items()]
                local_vars.sort(key=lambda x: x[2]['name'])
                for seq, lv in enumerate(local_vars, 1):
                    lv[2]['sequence'] = seq
                lines, lineno = inspect.getsourcelines(frame)
                for l in lines:
                    if frame.f_lineno - 10 <= lineno <= frame.f_lineno + 10:
                        # the line being executed is flagged with a star
                        marker = '*' if frame.f_lineno == lineno else ' '
                        output += u"%s%d: %s" % (marker, lineno, l)
                    lineno += 1
            except Exception as process_exception:
                # best effort: a frame that cannot be inspected is still listed
                output += "\nEXCEPTION DURING PROCESSING: %s" % ustr(process_exception)
            frames.append((0, 0, {
                'file_name': frame.f_code.co_filename,
                'line_number': frame.f_lineno,
                'src_code': output,
                'locals': local_vars,
            }))
            tb = tb.tb_next
        frames.reverse()

        ge_obj = registry["base.general_exception"]
        vals = {
            'service': service,
            'exception': unicode(e),
            'request': ustr(req),
            'do_not_purge': False,
            'user': uid,
            'frames': frames,
        }
        ge_id = ge_obj.create(cr, SUPERUSER_ID, vals)
        ge = ge_obj.browse(cr, SUPERUSER_ID, ge_id)
        ename = ge.name
        cr.commit()
        return ename
    finally:
        # always release the cursor, including when recording itself fails
        cr.close()
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``, and the list of
    ``[n] url`` lines is appended at the end of the returned text.

    :param html: html source; it is passed through ustr() first
    :param body_id: value used in the ``@id`` XPath test selecting the root
        element to convert; when None the <body> element is used
    :param encoding: encoding given to ``etree.tostring``
    :return: the plain text version
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    if not html.strip():
        # nothing to parse: the HTML parser rejects empty documents
        return u''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # NOTE(review): body_id is interpolated verbatim in the XPath
        # expression, so it has to be a valid XPath literal (quoted) to be
        # compared as a string -- confirm what callers pass
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            url_index.append(url)
            link.tag = 'span'
            # link.text is None for links without leading text (e.g. images)
            link.text = '%s [%s]' % (link.text or '', len(url_index))

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # decode the entities only once every tag is gone; &amp; comes last so
    # that "&amp;lt;" is not decoded twice
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, 1):
            html += ustr('[%s] %s\n') % (i, url)

    return html
def html2plaintext(html, body_id=None, encoding="utf-8"):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``, and the list of
    ``[n] url`` lines is appended at the end of the returned text.

    :param html: html source; it is passed through ustr() first
    :param body_id: value used in the ``@id`` XPath test selecting the root
        element to convert; when None the <body> element is used
    :param encoding: encoding given to ``etree.tostring``
    :return: the plain text version
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    if not html.strip():
        # nothing to parse: the HTML parser rejects empty documents
        return u""

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # NOTE(review): body_id is interpolated verbatim in the XPath
        # expression, so it has to be a valid XPath literal (quoted) to be
        # compared as a string -- confirm what callers pass
        source = tree.xpath("//*[@id=%s]" % (body_id,))
    else:
        source = tree.xpath("//body")
    if len(source):
        tree = source[0]

    url_index = []
    for link in tree.findall(".//a"):
        url = link.get("href")
        if url:
            url_index.append(url)
            link.tag = "span"
            # link.text is None for links without leading text (e.g. images)
            link.text = "%s [%s]" % (link.text or "", len(url_index))

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace("&#13;", "")

    html = html.replace("<strong>", "*").replace("</strong>", "*")
    html = html.replace("<b>", "*").replace("</b>", "*")
    html = html.replace("<h3>", "*").replace("</h3>", "*")
    html = html.replace("<h2>", "**").replace("</h2>", "**")
    html = html.replace("<h1>", "**").replace("</h1>", "**")
    html = html.replace("<em>", "/").replace("</em>", "/")
    html = html.replace("<tr>", "\n")
    html = html.replace("</p>", "\n")
    html = re.sub(r"<br\s*/?>", "\n", html)
    html = re.sub(r"<.*?>", " ", html)
    html = html.replace(" " * 2, " ")
    # decode the entities only once every tag is gone; &amp; comes last so
    # that "&amp;lt;" is not decoded twice
    html = html.replace("&gt;", ">")
    html = html.replace("&lt;", "<")
    html = html.replace("&amp;", "&")

    # strip all lines
    html = "\n".join([x.strip() for x in html.splitlines()])
    html = html.replace("\n" * 2, "\n")

    if url_index:
        html += "\n\n"
        for i, url in enumerate(url_index, 1):
            html += ustr("[%s] %s\n") % (i, url)

    return html
def register_exception(db_name, model, uid, e, service, req):
    """Record the exception currently being handled in ``base.general_exception``.

    A dedicated cursor is opened on ``db_name``. When the registry contains the
    ``base.general_exception`` model and ``sys.exc_info()`` exposes a
    traceback, one record is created holding, for every frame of the
    traceback, the file name, the line number, up to 10 source lines on each
    side of the failing line and the frame's local variables.

    :param db_name: name of the database to log into
    :param model: unused by this function
    :param uid: value stored in the ``user`` field of the record
    :param e: the exception; stored as ``unicode(e)``
    :param service: value stored in the ``service`` field
    :param req: request description, stored through ``ustr``
    :return: the ``name`` of the created record, or ``"<unknown>"`` when
        nothing was recorded
    """
    db = openerp.sql_db.db_connect(db_name)
    cr = db.cursor()
    ename = "<unknown>"
    try:
        registry = openerp.registry(cr.dbname)
        tb = None
        if "base.general_exception" in registry:
            tb = sys.exc_info()[2]
        if tb:
            frames = []
            while tb:
                frame = tb.tb_frame
                local_vars = []
                output = ''
                try:
                    # (0, 0, vals) triples, sorted by name then numbered
                    local_vars = [(0, 0, {'name': ustr(k), 'value': ustr(v)})
                                  for k, v in frame.f_locals.items()]
                    local_vars.sort(key=lambda x: x[2]['name'])
                    for seq, lv in enumerate(local_vars, 1):
                        lv[2]['sequence'] = seq
                    lines, lineno = inspect.getsourcelines(frame)
                    first, last = frame.f_lineno - 10, frame.f_lineno + 10
                    for l in lines:
                        if first <= lineno <= last:
                            # the line being executed is flagged with a star
                            marker = '*' if frame.f_lineno == lineno else ' '
                            output += u"%s%d: %s" % (marker, lineno, l)
                        lineno += 1
                except Exception as process_exception:
                    # best effort: keep the frame even if it cannot be inspected
                    output += "\nEXCEPTION DURING PROCESSING: %s" % ustr(process_exception)
                frames.append((0, 0, {'file_name': frame.f_code.co_filename,
                                      'line_number': frame.f_lineno,
                                      'src_code': output,
                                      'locals': local_vars}))
                tb = tb.tb_next
            frames.reverse()

            ge_obj = registry["base.general_exception"]
            vals = {
                'service': service,
                'exception': unicode(e),
                'request': ustr(req),
                'do_not_purge': False,
                'user': uid,
                'frames': frames,
            }
            ge_id = ge_obj.create(cr, SUPERUSER_ID, vals)
            ge = ge_obj.browse(cr, SUPERUSER_ID, ge_id)
            ename = ge.name
            cr.commit()
        return ename
    finally:
        # always release the cursor, including when recording itself fails
        cr.close()
def html_sanitize(src):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements. Tags listed in the
    module-level ``tags_to_kill`` / ``tags_to_remove`` are passed to the
    cleaner.

    :param src: html source; falsy values are returned unchanged
    :return: the cleaned html, ``""`` when the parser reports an empty
        document, or ``'<p>Impossible to parse</p>'`` on any other failure
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    common = dict(page_structure=True, style=False, safe_attrs_only=False, forms=False)
    try:
        cleaner = clean.Cleaner(kill_tags=tags_to_kill, remove_tags=tags_to_remove, **common)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(remove_tags=tags_to_kill + tags_to_remove, **common)

    # some corner cases make the parser crash (such as
    # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail);
    # cleaning is done once the cleaner is built so that the legacy cleaner
    # benefits from the same error handling
    try:
        cleaned = cleaner.clean_html(src)
    except Exception as e:
        if isinstance(e, etree.ParserError) and 'empty' in str(e):
            return ""
        _logger.warning('html_sanitize failed to parse %s', src)
        cleaned = '<p>Impossible to parse</p>'
    return cleaned
def html_sanitize(src, silent=True, strict=False):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements.

    :param src: html source; falsy values are returned unchanged
    :param silent: when False, a ``ParserError`` other than the "empty
        document" one is re-raised instead of being logged
    :param strict: when True (and lxml >= 3.1.0) only the attributes listed
        in the module-level ``safe_attrs`` are kept; when False all
        attributes and frames are kept
    :return: the cleaned html, ``""`` for an empty document, or a
        placeholder paragraph when parsing failed in silent mode
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all
            # attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # a plain boolean: a one-element tuple would be truthy and would
        # make the cleaner remove frames (embbed video in CMS blogs)
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    return cleaned
def slugify(string, max_length=None):
    """ Build a url-friendly slug out of ``string``.

    python-slugify does the job when it is available. Otherwise (or when the
    installed slugify library rejects the ``max_length`` argument) the string
    is converted to ascii, every non-word character and underscore becomes a
    separator, the result is lowercased and separators are collapsed into a
    single hyphen "-".

    :param string: str
    :param max_length: int
    :rtype: str
    """
    string = ustr(string)
    if slugify_lib:
        # There are 2 different libraries only python-slugify is supported
        try:
            return slugify_lib.slugify(string, max_length=max_length)
        except TypeError:
            pass
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    words = re.sub(r'[\W_]', ' ', ascii_text)
    words = words.strip().lower()
    hyphenated = re.sub(r'[-\s]+', '-', words)
    return hyphenated[:max_length]
def html_sanitize(src, silent=True, strict=False):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements. Comments and
    processing instructions are kept.

    :param src: html source; falsy values are returned unchanged
    :param silent: when False, a ``ParserError`` other than the "empty
        document" one is re-raised instead of being logged
    :param strict: when True (and lxml >= 3.1.0) only the attributes listed
        in the module-level ``safe_attrs`` are kept; when False all
        attributes and frames are kept
    :return: the cleaned html, ``""`` for an empty document, or a
        placeholder paragraph when parsing failed in silent mode
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,
        'processing_instructions': False,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all
            # attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # a plain boolean: a one-element tuple would be truthy and would
        # make the cleaner remove frames (embbed video in CMS blogs)
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    return cleaned
def str2bool(s, default=None):
    """Interpret a textual flag as a boolean.

    The value is converted with ustr() and lowercased before comparison.

    :param s: value to interpret
    :param default: returned (coerced with bool()) when ``s`` is not one of
        the recognised words; when None an unrecognised value is an error
    :raise ValueError: if ``s`` is not recognised and ``default`` is None
    :rtype: bool
    """
    s = ustr(s).lower()
    truthy = ('y', 'yes', '1', 'true', 't', 'on')
    falsy = ('n', 'no', '0', 'false', 'f', 'off')
    if s in truthy:
        return True
    if s in falsy:
        return False
    if default is None:
        raise ValueError('Use 0/1/yes/no/true/false/on/off')
    return bool(default)
def html_sanitize(src, silent=True, strict=False):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements. Mako delimiters
    (``<%`` and ``%>``) are escaped before cleaning and restored afterwards so
    that they survive the cleaner. Comments and processing instructions are
    kept.

    :param src: html source; falsy values are returned unchanged
    :param silent: when False, a ``ParserError`` other than the "empty
        document" one is re-raised instead of being logged
    :param strict: when True (and lxml >= 3.1.0) only the attributes listed
        in the module-level ``safe_attrs`` are kept; when False all
        attributes and frames are kept
    :return: the cleaned html, ``""`` for an empty document, or a
        placeholder paragraph when parsing failed in silent mode
    """
    if not src:
        return src
    src = ustr(src, errors="replace")

    logger = logging.getLogger(__name__ + ".html_sanitize")

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # html encode mako tags <% ... %> to decode them later and keep them
    # alive, otherwise they are stripped by the cleaner
    src = src.replace("<%", cgi.escape("<%"))
    src = src.replace("%>", cgi.escape("%>"))

    kwargs = {
        "page_structure": True,
        "style": False,  # do not remove style attributes
        "forms": True,  # remove form tags
        "remove_unknown_tags": False,
        "allow_tags": allowed_tags,
        "comments": False,
        "processing_instructions": False,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({"kill_tags": tags_to_kill, "remove_tags": tags_to_remove})
    else:
        kwargs["remove_tags"] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all
            # attributes in order to keep "style"
            kwargs.update({"safe_attrs_only": True, "safe_attrs": safe_attrs})
    else:
        kwargs["safe_attrs_only"] = False  # keep oe-data attributes + style
        # a plain boolean: a one-element tuple would be truthy and would
        # make the cleaner remove frames (embbed video in CMS blogs)
        kwargs["frames"] = False

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace("%24", "$")
        cleaned = cleaned.replace("%7B", "{")
        cleaned = cleaned.replace("%7D", "}")
        cleaned = cleaned.replace("%20", " ")
        cleaned = cleaned.replace("%5B", "[")
        cleaned = cleaned.replace("%5D", "]")
        # restore the mako delimiters escaped before cleaning
        cleaned = cleaned.replace("&lt;%", "<%")
        cleaned = cleaned.replace("%&gt;", "%>")
    except etree.ParserError as e:
        if "empty" in str(e):
            return ""
        if not silent:
            raise
        logger.warning("ParserError obtained when sanitizing %r", src, exc_info=True)
        cleaned = "<p>ParserError when sanitizing</p>"
    return cleaned
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

    - every line ending (CRLF, CR or LF) is replaced by a single <br/>
    - enclose content into <p>
    - convert url into clickable link
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param text: plaintext to convert; it is passed through ustr() first
    :param string container_tag: optional tag name wrapping the whole
        result; when falsy (the default) no container is added
    :return: the html markup, as returned by ustr()
    """
    text = cgi.escape(ustr(text))

    # 1. replace line endings; CRLF is tried first so that a Windows line
    #    ending gives one <br/> instead of two (i.e. a paragraph break)
    text = re.sub(r'\r\n|\r|\n', '<br/>', text)

    # 2. clickable links. The url body stops at whitespace, at "<" (the only
    #    "<" left after escaping are the <br/> inserted above) and at the
    #    double quote, which cgi.escape() leaves as is and which would
    #    otherwise close the href attribute early.
    link_tags = re.compile(
        r'(ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"]+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?'
    )
    text = link_tags.sub(
        lambda item: '<a href="%s" target="_blank">%s</a>' % (item.group(0), item.group(0)),
        text)

    # 3-4: form paragraphs: each run of 2+ <br/> closes the current <p>
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 5. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Insert extra content at the end of an HTML snippet.

    The content is placed right before the first ``</body>`` found, else
    before the first ``</html>``, else at the very end. Unless ``plaintext``
    is False, the content is first converted to html, either

     - by wrapping it into a pre (preserve=True), or
     - with plaintext2html (preserve=False, using container_tag to wrap the
       whole content)

    Side effects on the result: tag names of ``html`` are coerced to
    lowercase, and html / body / head / doctype tags are stripped from
    ``content`` when ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and has to be
        converted to html
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param container_tag: forwarded to plaintext2html
    """
    html = ustr(html)

    if not plaintext:
        bare_content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(bare_content)
    elif preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    else:
        content = '\n%s\n' % plaintext2html(content, container_tag)

    # Force all tags to lowercase
    def _lowercase_tag(match):
        return '%s%s%s' % (match.group(1), match.group(2).lower(), match.group(3))

    html = re.sub(r'(</?)\W*(\w+)([ >])', _lowercase_tag, html)

    for closing_tag in ('</body>', '</html>'):
        position = html.find(closing_tag)
        if position != -1:
            return '%s%s%s' % (html[:position], content, html[position:])
    return '%s%s' % (html, content)
def decode_smtp_header(smtp_header):
    """Return the unicode version of an encoded smtp header.

    decode_header yields one (decoded string, charset) pair per part of the
    header; every part is converted with ustr() and the parts are joined
    with a single space. Carriage returns are dropped before decoding.

    :param smtp_header: raw header value; a falsy value gives u''
    """
    if not smtp_header:
        return u''
    decoded_parts = decode_header(smtp_header.replace('\r', ''))
    # The joining space will not be needed as of Python 3.3
    # See https://hg.python.org/cpython/rev/8c03fe231877
    return ' '.join([ustr(part, charset) for part, charset in decoded_parts])
def decode_smtp_header(smtp_header):
    """Return the unicode version of an encoded smtp header.

    decode_header yields one (decoded string, charset) pair per part of the
    header; every part is converted with ustr() and the parts are joined
    with a single space. Carriage returns are dropped before decoding.

    :param smtp_header: raw header value; a falsy value gives u""
    """
    if not smtp_header:
        return u""
    decoded_parts = decode_header(smtp_header.replace("\r", ""))
    # The joining space will not be needed as of Python 3.3
    # See https://hg.python.org/cpython/rev/8c03fe231877
    return " ".join([ustr(part, charset) for part, charset in decoded_parts])
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

    - every line ending (CRLF, CR or LF) is replaced by a single <br/>
    - enclose content into <p>
    - convert url into clickable link
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param text: plaintext to convert; it is passed through ustr() first
    :param string container_tag: optional tag name wrapping the whole
        result; when falsy (the default) no container is added
    :return: the html markup, as returned by ustr()
    """
    text = cgi.escape(ustr(text))

    # 1. replace line endings; CRLF is tried first so that a Windows line
    #    ending gives one <br/> instead of two (i.e. a paragraph break)
    text = re.sub(r'\r\n|\r|\n', '<br/>', text)

    # 2. clickable links. The url body stops at whitespace, at "<" (the only
    #    "<" left after escaping are the <br/> inserted above) and at the
    #    double quote, which cgi.escape() leaves as is and which would
    #    otherwise close the href attribute early.
    link_tags = re.compile(r'(ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"]+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?')

    def _make_link(item):
        url = item.group(0)
        return '<a href="%s" target="_blank">%s</a>' % (url, url)

    text = link_tags.sub(_make_link, text)

    # 3-4: form paragraphs: each run of 2+ <br/> closes the current <p>
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 5. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def scan_languages():
    """ List the languages supported by OpenERP for translation.

    The (code, name) pairs are read from base/res/res.lang.csv. When the
    file cannot be read, or contains no language, English alone is returned.

    :returns: a list of (lang_code, lang_name) pairs, sorted by name
    :rtype: [(str, unicode)]
    """
    csvpath = openerp.modules.module.get_resource_path('base', 'res', 'res.lang.csv')
    try:
        with open(csvpath) as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # the first row holds the column names
            header = next(reader)
            code_index = header.index("code")
            name_index = header.index("name")
            result = [(ustr(row[code_index]), ustr(row[name_index])) for row in reader]
    except Exception:
        _logger.error("Could not read %s", csvpath)
        result = []

    if not result:
        result = [('en_US', u'English')]
    return sorted(result, key=itemgetter(1))
def build_invitation(self, email_from='', email_to='', subject='', email_cc=[], email_bcc=[], reply_to=False,
                     attachments=None, message_id=None, references=None, object_id=False, headers={}, ):
    """Build the MIME message carrying a calendar invitation.

    NOTE(review): this function was reconstructed from a source whose
    indentation was lost; the nesting of the attachment section below is the
    most plausible reading and should be confirmed against the original file.

    :param email_from: sender address; falls back on the ``email_from``
        server configuration value
    :param email_to: iterable of recipient addresses, joined with COMMASPACE
    :param subject: subject of the message
    :param email_cc: iterable of addresses for the Cc header
    :param email_bcc: iterable of addresses for the Bcc header
    :param reply_to: Reply-To address; the From header is used when falsy
    :param attachments: iterable of (file name, file content) pairs
    :param message_id: Message-Id to use; generated when falsy
    :param references: value of the references header, if any
    :param object_id: when set (and no message_id is given), used to
        generate a tracking Message-Id
    :param headers: mapping of extra headers to add to the message
    :return: the MIMEMultipart message
    """
    email_from = email_from or tools.config.get('email_from')
    assert email_from, "You must either provide a sender address explicitly or configure "\
                       "a global sender address in the server configuration or with the "\
                       "--email-from startup parameter."
    msg = MIMEMultipart()
    # the default argument is never mutated: a falsy value is replaced by a
    # fresh dict
    if not headers:
        headers = {}
    if not message_id:
        if object_id:
            message_id = tools.generate_tracking_message_id(object_id)
        else:
            message_id = make_msgid()
    msg['Message-Id'] = encode_header(message_id)
    if references:
        msg['references'] = encode_header(references)
    msg['Subject'] = encode_header(subject)
    msg['From'] = encode_rfc2822_address_header(email_from)
    del msg['Reply-To']
    if reply_to:
        msg['Reply-To'] = encode_rfc2822_address_header(reply_to)
    else:
        msg['Reply-To'] = msg['From']
    msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
    if email_cc:
        msg['Cc'] = encode_rfc2822_address_header(COMMASPACE.join(email_cc))
    if email_bcc:
        msg['Bcc'] = encode_rfc2822_address_header(COMMASPACE.join(email_bcc))
    msg['Date'] = formatdate()
    for key, value in headers.items():
        msg[ustr(key).encode('utf-8')] = encode_header(value)
    # NOTE(review): this flag is never set to True in this function
    text_to_body_added = False
    if attachments:
        #it is assumed for now that only ics file is attached!!!
        for fname, fcontent in attachments:
            if not text_to_body_added and fname == 'invite.ics':
                # Provide message description in body of message only as text for now; need fixes
                if 'DESCRIPTION:' in fcontent and 'LOCATION' in fcontent.split('DESCRIPTION')[1]:
                    # text found between "DESCRIPTION:" and the next "LOCATION"
                    meeting_description_text = fcontent.split('DESCRIPTION:')[1].split('LOCATION')[0]
                    text_converted_to_html = self.plaintext2html(meeting_description_text, tabstop=4)
                    # the pattern matches a literal backslash followed by "n"
                    text_utf8 = re.sub(r'\\n', "</p><p>", text_converted_to_html)
                    alternative_part = MIMEMultipart(_subtype="alternative")
                    alternative_part.attach(MIMEText(text_utf8, _charset='utf-8', _subtype='html'))
                    msg.attach(alternative_part)
            #adding invitation stuff
            part = MIMEBase('text', 'calendar', charset='utf-8', method='REQUEST')
            part.set_payload(fcontent)
            msg.attach(part)
    return msg
def html_sanitize(src, silent=True):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements.

    :param src: html source; falsy values are returned unchanged
    :param silent: when True, errors are logged and a placeholder paragraph
        is returned; when False they are re-raised
    :return: the cleaned html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    email_tag = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = email_tag.sub(lambda match: cgi.escape(match.group(1)), src)

    options = dict(
        page_structure=True,
        style=False,    # do not remove style attributes
        forms=True,     # remove form tags
        remove_unknown_tags=False,
        allow_tags=allowed_tags,
    )

    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        options['kill_tags'] = tags_to_kill
        options['remove_tags'] = tags_to_remove
    else:
        options['remove_tags'] = tags_to_kill + tags_to_remove

    if etree.LXML_VERSION >= (3, 1, 0):
        options['safe_attrs_only'] = True
        options['safe_attrs'] = safe_attrs
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all
        # attributes in order to keep "style"
        options['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaned = clean.Cleaner(**options).clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
def html_sanitize(src):
    """Sanitize an html snippet through ``handle_element``.

    The source is wrapped in a <div>, parsed with lxml and handed to
    ``handle_element``; the children of the first item of its result are
    then serialized back: strings are kept as they are, elements are
    rendered with ``lxml.html.tostring`` after their tail has been emptied.

    :param src: html source; falsy values are returned unchanged
    :return: the concatenation of the serialized children
    """
    if not src:
        return src
    src = ustr(src, errors='replace')
    root = lxml.html.fromstring(u"<div>%s</div>" % src)
    result = handle_element(root)
    fragments = []
    for node in children(result[0]):
        if not isinstance(node, basestring):
            node.tail = ""
            node = lxml.html.tostring(node)
        fragments.append(node)
    return ''.join(fragments)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is False.
    Content conversion can be done in two ways:
     - wrapping it into a pre (preserve=True)
     - use plaintext2html (preserve=False, using container_tag to wrap the
       whole content)
    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and has to be
        converted to html
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param container_tag: forwarded to plaintext2html
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # [^>]* keeps every match inside a single tag; a greedy ".*" would
        # run up to the last ">" of the line and delete the actual content
        content = re.sub(r'(?i)(</?html[^>]*>|</?body[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
def html_sanitize(src, silent=True):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements.

    :param src: html source; falsy values are returned unchanged
    :param silent: when True, errors are logged and a placeholder paragraph
        is returned; when False they are re-raised
    :return: the cleaned html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = _logger.getChild('html_sanitize')

    # html encode email tags
    email_tag = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = email_tag.sub(lambda match: cgi.escape(match.group(1)), src)

    options = dict(
        page_structure=True,
        style=False,    # do not remove style attributes
        forms=True,     # remove form tags
    )

    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        options['kill_tags'] = tags_to_kill
        options['remove_tags'] = tags_to_remove
    else:
        options['remove_tags'] = tags_to_kill + tags_to_remove

    if etree.LXML_VERSION >= (3, 1, 0):
        options['safe_attrs_only'] = True
        options['safe_attrs'] = clean.defs.safe_attrs | set(['style'])
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all
        # attributes in order to keep "style"
        options['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaned = clean.Cleaner(**options).clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
def html_sanitize(src):
    """Clean an html snippet with ``lxml.html.clean``.

    Email-like pseudo tags (``<name@example.com>``) are html-escaped first so
    that the cleaner does not treat them as elements. Tags listed in the
    module-level ``tags_to_kill`` / ``tags_to_remove`` are passed to the
    cleaner.

    :param src: html source; falsy values are returned unchanged
    :return: the cleaned html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    # some corner cases make the parser crash (such as
    # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    common = dict(page_structure=True, style=False, safe_attrs_only=False, forms=False)
    try:
        cleaner = clean.Cleaner(kill_tags=tags_to_kill, remove_tags=tags_to_remove, **common)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(remove_tags=tags_to_kill + tags_to_remove, **common)
    return cleaner.clean_html(src)
def html_sanitize(src):
    """Clean an html snippet with ``lxml.html.clean``.

    Pseudo tags containing an "@" (such as ``<name@example.com>``) are
    html-escaped first so that the cleaner does not treat them as elements.
    Tags listed in the module-level ``tags_to_kill`` / ``tags_to_remove`` are
    passed to the cleaner.

    :param src: html source; falsy values are returned unchanged
    :return: the cleaned html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    # html encode email tags
    part = re.compile(r"(<[^<>]+@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    # some corner cases make the parser crash (such as
    # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    common = dict(page_structure=True, style=False, safe_attrs_only=False, forms=False)
    try:
        cleaner = clean.Cleaner(kill_tags=tags_to_kill, remove_tags=tags_to_remove, **common)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(remove_tags=tags_to_kill + tags_to_remove, **common)
    return cleaner.clean_html(src)
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
       a quote; detected by finding WordSection1 or MsoNormal classes
     - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
        be present in the html string. This method therefore takes as input
        html code coming from a sanitized source, like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
        is only flagged and tagged (class ``oe_mail_cleaned`` is appended)
    :param boolean shorten: shorten the html; every exceeding content will
        be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
        shortening
    :param dict expand_options: options for the read more link when shortening
        the content. The used keys are the following:

         - oe_expand_container_tag: tag of the container of the whole read
           more link (default: span)
         - oe_expand_container_class: class applied to the link container
           (default: oe_mail_expand)
         - oe_expand_container_content: content of the container
           (default: ' ... ')
         - oe_expand_separator_node: optional separator node, like adding
           ... <br /><br /> <a ...>read more</a> (default: void)
         - oe_expand_a_href: href of the read more link itself (default: #)
         - oe_expand_a_class: class applied to the <a> containing the link
           itself (default: oe_mail_expand)
         - oe_expand_a_content: content of the <a> (default: read more)

        The formatted read more link is the following:
            <cont_tag class="oe_expand_container_class">
                oe_expand_container_content
                if expand_options.get('oe_expand_separator_node'):
                    <oe_expand_separator_node/>
                <a href="oe_expand_a_href" class="oe_expand_a_class">
                    oe_expand_a_content
                </a>
            </cont_tag>
    :param boolean protect_sections: when shortening, walk up to the first
        ancestor that is not inside a ``section`` element and truncate there
    :return: ``html`` itself when it is falsy, not a string, or when nothing
        was flagged; otherwise the serialized modified tree, passed through
        ``ustr``
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    def _create_node(tag, text, tail=None, attrs={}):
        """ Build a detached element with the given tag, text, tail and attributes.
        The ``attrs`` default is only read here, never mutated. """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in attrs.iteritems():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
        """ Create a node (see _create_node) and insert it in ``node`` at ``index``. """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
        """ Split ``node.text`` around every match of ``regex``: the text before
        the first match stays in ``node.text``, every match is moved into a new
        child carrying ``new_node_attrs``, text between two matches and text
        after the last match are moved into plain children. Does nothing when
        the text does not match. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # remaining text after the last match
        # NOTE(review): index -1 is handed to node.insert() as is; confirm the
        # resulting position relative to existing children is the intended one
        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all
                successive whitespaces as one character. This option should not
                be True when trying to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            # count only word characters until position is reached
            cur_char_nbr = 0
            word = None
            node_words = node.text.strip(' \t\r\n').split()
            for word in node_words:
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
            if word:
                # cut after the first occurrence of the word that was reached
                truncate_idx = node.text.find(word) + len(word)
        else:
            truncate_idx = position
        # fall back on keeping the whole text
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', _('read more')),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}
    # classes configured for the read more link must survive class cleaning
    whitelist_classes_local = whitelist_classes.copy()
    if expand_options.get('oe_expand_container_class'):
        whitelist_classes_local.add(expand_options.get('oe_expand_container_class'))
    if expand_options.get('oe_expand_a_class'):
        whitelist_classes_local.add(expand_options.get('oe_expand_a_class'))

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    inner_html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(inner_html)
    # a root without children, text or tail is re-parsed wrapped in a div
    if not len(root) and root.text is None and root.tail is None:
        inner_html = '<div>%s</div>' % inner_html
        root = lxml.html.fromstring(inner_html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'(^[-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)', re.M)
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails together is error-prone
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quoted = False
    quote_begin = False
    overlength = False
    replace_class = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue
        # do not take into account multiple spaces that are displayed as max 1 space in html
        # (this value is overwritten before being used, see the shorten step below)
        node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())

        # remove unwanted classes from node
        if node.get('class'):
            sanitize_classes = []
            for _class in node.get('class').split(' '):
                if _class in whitelist_classes_local:
                    sanitize_classes.append(_class)
                else:
                    # unknown classes are kept, prefixed, and force re-serialization
                    sanitize_classes.append('cleaned_' + _class)
                    replace_class = True
            node.set('class', ' '.join(sanitize_classes))

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            # here no quote_begin because we want to be able to remove some quoted
            # text without removing all the remaining context
            quoted = True
            node.set('in_quote', '1')
        if node.getparent() is not None and node.getparent().get('in_quote'):
            # inside a block of removed text but not in quote_begin (see above)
            quoted = True
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            # walk up while inside a quote or (with protect_sections) inside a section
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                # remaining character budget inside the current node
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            # whitespace simplification is disabled for 'pre' nodes
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the temporary bookkeeping attributes
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # nothing was flagged: hand back the (pre-processed) html string untouched
    if not overlength and not quote_begin and not quoted and not replace_class:
        return html

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False, encoding='UTF-8')
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')
    return ustr(html)
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``; the matching urls
    are listed at the end of the returned text as ``[n] url`` lines.

    :param html: html source, converted with ``ustr`` before parsing
    :param body_id: optional value of the ``id`` attribute of the element to
        use as body
    :param encoding: encoding used when serializing the parsed tree
    :return: the plain text version of the html
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    from lxml.etree import tostring
    try:
        from lxml.html.soupparser import fromstring
        kwargs = {}
    except ImportError:
        _logger.debug('tools.misc.html2plaintext: cannot use BeautifulSoup, fallback to lxml.etree.HTMLParser')
        from lxml.etree import fromstring, HTMLParser
        kwargs = dict(parser=HTMLParser())

    tree = fromstring(html, **kwargs)

    if body_id is not None:
        # Pass the id as an XPath variable: interpolating it unquoted made the
        # expression compare @id with a child *element* named after the id.
        source = tree.xpath('//*[@id=$body_id]', body_id=body_id)
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # a link without leading text (e.g. wrapping an image) has text None
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(tostring(tree, encoding=encoding))

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the index of urls, separated from the text by a blank line
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def update_delivery(self, cr, uid, ids, context=None):
    """ Push the tracking reference of every delivery update line to Taobao.

    For each browsed record, lines without a tracking reference and lines
    whose update raises ``TOPException`` are reported in a text message.
    """
    err_res = ""
    for update in self.browse(cr, uid, ids, context=context):
        res = ""
        for line in update.delivery_update_lines:
            shop = self.pool.get('taobao.shop').browse(cr, uid, line.taobao_shop_id.id)
            top = TOP(shop.taobao_app_key, shop.taobao_app_secret, shop.taobao_session_key)
            if line.carrier_tracking_ref:
                try:
                    tao_res = self.pool.get('stock.picking.out')._top_item_deliver_update(
                        top, line.carrier_tracking_ref, line.tid, company_code=line.company_code)
                except TOPException as e:
                    res += u"发货单号:%s,淘宝单号:%s 发货错误[%s] \n" % (line.delivery_ref, line.tid, ustr(e))
            else:
                # no tracking reference: nothing to send for this line
                res += u"发货单号:%s,淘宝单号:%s 该单没有运单号。\n" % (line.delivery_ref, line.tid)
        if res:
            err_res = "" + res
def build_email(self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
                attachments=None, message_id=None, references=None, object_id=False, subtype='plain', headers=None,
                body_alternative=None, subtype_alternative='plain'):
    """Build and return an email message object from the given arguments.

       :param string email_from: sender email address; when falsy the
                                 ``email_from`` server configuration value is used
       :param list email_to: list of recipient addresses (joined with commas)
       :param string subject: email subject (no pre-encoding/quoting necessary)
       :param string body: email body, of the type ``subtype``. With the html
                           subtype and no ``body_alternative``, a plaintext
                           version is generated when html2text is available and
                           both are wrapped in a multipart/alternative part.
       :param list email_cc: optional list of string values for CC header
       :param list email_bcc: optional list of string values for BCC header
       :param string reply_to: optional value of Reply-To header; defaults to
                               the From header
       :param list attachments: list of (filename, filecontents) pairs
       :param string message_id: optional Message-Id; generated when missing
       :param string references: optional value of the references header
       :param string object_id: optional tracking identifier used to generate
                                the message-id when ``message_id`` is not given
       :param string subtype: mime subtype of ``body`` (default 'plain')
       :param dict headers: optional map of headers set after the standard ones
       :param string body_alternative: optional alternative body, of the type
                                       ``subtype_alternative``
       :param string subtype_alternative: mime subtype of ``body_alternative``
                                          (default 'plain')
       :return: the new email message (a MIMEMultipart instance)
    """
    email_from = email_from or tools.config.get('email_from')
    assert email_from, ("You must either provide a sender address explicitly or configure "
                        "a global sender address in the server configuration or with the "
                        "--email-from startup parameter.")

    # Note: we must force all strings to to 8-bit utf-8 when crafting message,
    #       or use encode_header() for headers, which does it automatically.

    headers = headers or {}  # need valid dict later
    email_cc = email_cc or []
    email_bcc = email_bcc or []
    body = body or u''

    email_body_utf8 = ustr(body).encode('utf-8')
    email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset='utf-8')
    msg = MIMEMultipart()

    if not message_id:
        message_id = tools.generate_tracking_message_id(object_id) if object_id else make_msgid()
    msg['Message-Id'] = encode_header(message_id)
    if references:
        msg['references'] = encode_header(references)
    msg['Subject'] = encode_header(subject)
    msg['From'] = encode_rfc2822_address_header(email_from)
    del msg['Reply-To']
    msg['Reply-To'] = encode_rfc2822_address_header(reply_to) if reply_to else msg['From']
    msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
    for header_name, addresses in (('Cc', email_cc), ('Bcc', email_bcc)):
        if addresses:
            msg[header_name] = encode_rfc2822_address_header(COMMASPACE.join(addresses))
    msg['Date'] = formatdate()
    # Custom headers may override normal headers or provide additional ones
    for key, value in headers.iteritems():
        msg[ustr(key).encode('utf-8')] = encode_header(value)

    if body_alternative:
        # Include both alternatives, as specified, within a multipart/alternative part
        body_alternative_utf8 = ustr(body_alternative).encode('utf-8')
        alternative_body_part = MIMEText(body_alternative_utf8, _subtype=subtype_alternative, _charset='utf-8')
    elif subtype == 'html' and html2text:
        # Always provide alternative text body ourselves if possible.
        text_utf8 = tools.html2text(email_body_utf8.decode('utf-8')).encode('utf-8')
        alternative_body_part = MIMEText(text_utf8, _charset='utf-8', _subtype='plain')
    else:
        alternative_body_part = None

    if alternative_body_part is None:
        msg.attach(email_text_part)
    else:
        alternative_part = MIMEMultipart(_subtype="alternative")
        alternative_part.attach(alternative_body_part)
        alternative_part.attach(email_text_part)
        msg.attach(alternative_part)

    for (fname, fcontent) in attachments or []:
        filename_rfc2047 = encode_header_param(fname)
        part = MIMEBase('application', "octet-stream")

        # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail
        # so we fix it by using RFC2047 encoding for the filename instead.
        part.set_param('name', filename_rfc2047)
        part.add_header('Content-Disposition', 'attachment', filename=filename_rfc2047)

        part.set_payload(fcontent)
        Encoders.encode_base64(part)
        msg.attach(part)
    return msg
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``; the matching urls
    are listed at the end of the returned text as ``[n] url`` lines.

    :param html: html source, converted with ``ustr`` before parsing
    :param body_id: optional value of the ``id`` attribute of the element to
        use as body
    :param encoding: encoding used when serializing the parsed tree
    :return: the plain text version of the html
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    from lxml.etree import tostring
    try:
        from lxml.html.soupparser import fromstring
        kwargs = {}
    except ImportError:
        _logger.debug(
            'tools.misc.html2plaintext: cannot use BeautifulSoup, fallback to lxml.etree.HTMLParser'
        )
        from lxml.etree import fromstring, HTMLParser
        kwargs = dict(parser=HTMLParser())

    tree = fromstring(html, **kwargs)

    if body_id is not None:
        # Pass the id as an XPath variable: interpolating it unquoted made the
        # expression compare @id with a child *element* named after the id.
        source = tree.xpath('//*[@id=$body_id]', body_id=body_id)
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # a link without leading text (e.g. wrapping an image) has text None
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(tostring(tree, encoding=encoding))

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the index of urls, separated from the text by a blank line
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def build_invitation(
        self,
        email_from='',
        email_to='',
        subject='',
        email_cc=[],
        email_bcc=[],
        reply_to=False,
        attachments=None,
        message_id=None,
        references=None,
        object_id=False,
        headers={},
):
    """ Build the email message carrying a calendar invitation.

    Every attachment is added as a ``text/calendar`` part. The description
    found in the first ``invite.ics`` attachment (text between
    ``DESCRIPTION:`` and ``LOCATION``) is additionally added once as an html
    body.

    The list/dict defaults are only read here (``headers`` is rebound, not
    mutated), so sharing them between calls is harmless.

    :param string email_from: sender address; when falsy the ``email_from``
        server configuration value is used
    :param email_to: recipient addresses (joined with commas)
    :param string subject: email subject
    :param list email_cc: optional addresses for the Cc header
    :param list email_bcc: optional addresses for the Bcc header
    :param string reply_to: optional Reply-To value; defaults to the From header
    :param list attachments: list of (filename, filecontents) pairs
    :param string message_id: optional Message-Id; generated when missing
    :param string references: optional value of the references header
    :param string object_id: optional tracking identifier used to generate
        the message-id when ``message_id`` is not given
    :param dict headers: optional headers set after the standard ones
    :return: the new email message (a MIMEMultipart instance)
    """
    email_from = email_from or tools.config.get('email_from')
    assert email_from, "You must either provide a sender address explicitly or configure "\
                       "a global sender address in the server configuration or with the "\
                       "--email-from startup parameter."
    msg = MIMEMultipart()
    if not headers:
        headers = {}
    if not message_id:
        if object_id:
            message_id = tools.generate_tracking_message_id(object_id)
        else:
            message_id = make_msgid()
    msg['Message-Id'] = encode_header(message_id)
    if references:
        msg['references'] = encode_header(references)
    msg['Subject'] = encode_header(subject)
    msg['From'] = encode_rfc2822_address_header(email_from)
    del msg['Reply-To']
    if reply_to:
        msg['Reply-To'] = encode_rfc2822_address_header(reply_to)
    else:
        msg['Reply-To'] = msg['From']
    msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
    if email_cc:
        msg['Cc'] = encode_rfc2822_address_header(COMMASPACE.join(email_cc))
    if email_bcc:
        msg['Bcc'] = encode_rfc2822_address_header(COMMASPACE.join(email_bcc))
    msg['Date'] = formatdate()
    for key, value in headers.items():
        msg[ustr(key).encode('utf-8')] = encode_header(value)

    text_to_body_added = False
    if attachments:
        # it is assumed for now that only ics file is attached!!!
        for fname, fcontent in attachments:
            if not text_to_body_added and fname == 'invite.ics':
                # Provide message description in body of message only as text for now; need fixes
                if 'DESCRIPTION:' in fcontent and 'LOCATION' in fcontent.split('DESCRIPTION')[1]:
                    meeting_description_text = fcontent.split('DESCRIPTION:')[1].split('LOCATION')[0]
                    text_converted_to_html = self.plaintext2html(meeting_description_text, tabstop=4)
                    # literal "\n" sequences of the ics content become paragraph breaks
                    text_utf8 = re.sub(r'\\n', "</p><p>", text_converted_to_html)
                    alternative_part = MIMEMultipart(_subtype="alternative")
                    alternative_part.attach(MIMEText(text_utf8, _charset='utf-8', _subtype='html'))
                    msg.attach(alternative_part)
                    # the flag was never raised before, so the guard above was
                    # dead and the body could be added several times
                    text_to_body_added = True
            # adding invitation stuff
            part = MIMEBase('text', 'calendar', charset='utf-8', method='REQUEST')
            part.set_payload(fcontent)
            msg.attach(part)
    return msg
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attach=None, openobject_id=False, ssl=False, debug=False, subtype='plain', x_headers=None, priority='3'):
    """Build an email message and hand it to ``_email_send``.

    @param email_from sender address used for the `From` header; when falsy,
                      config['email_from'] is used instead. It is also the
                      `Reply-To` value when `reply_to` is not provided
    @param email_to a sequence of addresses to send the mail to.
    @return whatever ``_email_send`` returns
    @raise ValueError when neither `email_from` nor config['email_from'] is set
    """
    if x_headers is None:
        x_headers = {}

    if not (email_from or config['email_from']):
        raise ValueError("Sending an email requires either providing a sender "
                         "address or having configured one")

    email_from = email_from or config.get('email_from', False)
    email_from = ustr(email_from).encode('utf-8')

    email_cc = email_cc or []
    email_bcc = email_bcc or []
    body = body or u''

    email_body = ustr(body).encode('utf-8')
    email_text = MIMEText(email_body or '', _subtype=subtype, _charset='utf-8')

    msg = MIMEMultipart()
    msg['Subject'] = Header(ustr(subject), 'utf-8')
    msg['From'] = email_from
    del msg['Reply-To']
    msg['Reply-To'] = reply_to if reply_to else msg['From']
    msg['To'] = COMMASPACE.join(email_to)
    for header_name, addresses in (('Cc', email_cc), ('Bcc', email_bcc)):
        if addresses:
            msg[header_name] = COMMASPACE.join(addresses)
    msg['Date'] = formatdate(localtime=True)
    msg['X-Priority'] = priorities.get(priority, '3 (Normal)')

    # Add dynamic X Header
    for key, value in x_headers.iteritems():
        msg['%s' % key] = str(value)

    if not (html2text and subtype == 'html'):
        msg.attach(email_text)
    else:
        # html body: also provide a generated plaintext alternative
        text = html2text(email_body.decode('utf-8')).encode('utf-8')
        alternative_part = MIMEMultipart(_subtype="alternative")
        alternative_part.attach(MIMEText(text, _charset='utf-8', _subtype='plain'))
        alternative_part.attach(email_text)
        msg.attach(alternative_part)

    for (fname, fcontent) in attach or []:
        part = MIMEBase('application', "octet-stream")
        part.set_payload(fcontent)
        Encoders.encode_base64(part)
        part.add_header('Content-Disposition', 'attachment; filename="%s"' % (fname,))
        msg.attach(part)

    recipients = flatten([email_to, email_cc, email_bcc])
    return _email_send(email_from, recipients, msg, openobject_id=openobject_id, ssl=ssl, debug=debug)
def html_sanitize(src, silent=True, strict=False, strip_style=False):
    """ Sanitize an html source by running it through ``_Cleaner``.

    Falsy sources are returned as they are. Tags that look like
    ``<name@domain>`` and mako tags ``<% ... %>`` are html-escaped before
    cleaning; mako tags are decoded again afterwards.

    :param src: html source to sanitize
    :param boolean silent: when True, a failure of the cleaner is logged as a
        warning and a placeholder paragraph is returned; when False the
        exception is re-raised
    :param boolean strict: when True (and lxml >= 3.1.0), only attributes
        listed in ``safe_attrs`` are kept; when False all attributes are kept
        and frames are not removed
    :param boolean strip_style: passed as the ``style`` option of the cleaner
        (True = remove style tags/attrs)
    :return: the sanitized html; ``""`` when the parser reports an empty
        document
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,  # True = remove style tags/attrs
        'forms': True,  # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # do not remove frames (embbed video in CMS blogs); a trailing comma
        # used to turn this into the truthy tuple (False,)
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
        # decode the mako tags that were escaped with cgi.escape() before cleaning
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
def html_sanitize(src, silent=True, strict=False, strip_style=False, strip_classes=False):
    """ Sanitize an html source by running it through ``_Cleaner``.

    Falsy sources are returned as they are. Tags that look like
    ``<name@domain>`` (unless they contain ``cite=``) and mako tags
    ``<% ... %>`` are html-escaped before cleaning; mako tags are decoded
    again afterwards.

    :param src: html source to sanitize
    :param boolean silent: when True, a ParserError of the cleaner is logged
        as a warning and a placeholder paragraph is returned; when False it is
        re-raised
    :param boolean strict: when True (and lxml >= 3.1.0), only attributes
        listed in ``safe_attrs`` are kept; when False all attributes are kept
        and frames are not removed
    :param boolean strip_style: passed as the ``style`` option of the cleaner
        (True = remove style tags/attrs)
    :param boolean strip_classes: in strict mode, also drop the ``class``
        attribute from the kept attributes
    :return: the sanitized html; ``""`` when the parser reports an empty
        document
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)

    def _escape_email_tag(match):
        # keep results containing cite="mid:email_like@address" (ex: blockquote cite)
        tag = match.group(1)
        return tag if 'cite=' in tag else cgi.escape(tag)

    src = part.sub(_escape_email_tag, src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,  # True = remove style tags/attrs
        'forms': True,  # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            if strip_classes:
                current_safe_attrs = safe_attrs - frozenset(['class'])
            else:
                current_safe_attrs = safe_attrs
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': current_safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # do not remove frames (embbed video in CMS blogs); a trailing comma
        # used to turn this into the truthy tuple (False,)
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
        # decode the mako tags that were escaped with cgi.escape() before cleaning
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'

    # the cleaned html was previously computed but never handed back
    return cleaned
def build_email(self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
                attachments=None, message_id=None, references=None, object_id=False, subtype='plain',
                headers=None, body_alternative=None, subtype_alternative='plain'):
    """ copy-pasted from openerp/addons/base/ir/ir_mail_server.py::build_email

    Build a MIME multipart message. Compared to a plain message builder, every
    inline ``"data:image/png;base64,..."`` value found in ``body`` is decoded,
    appended to ``attachments`` and replaced in the body by a ``"cid:<name>"``
    reference; each attachment part also receives a ``Content-ID`` header.

    :param email_from: sender address; falls back on the ``email_from``
        server configuration value
    :param email_to: iterable of recipient addresses
    :param subject: message subject
    :param body: message body; a falsy value is treated as an empty body
    :param email_cc: optional iterable of CC addresses
    :param email_bcc: optional iterable of BCC addresses
    :param reply_to: optional Reply-To address; defaults to the From header
    :param attachments: optional list of ``(filename, content)`` pairs; a
        non-empty list passed by the caller is extended with inline images
    :param message_id: optional Message-Id; generated when not given
    :param references: optional value of the ``references`` header
    :param object_id: optional id used to generate a tracking Message-Id
    :param subtype: MIME subtype of ``body``
    :param headers: optional dict of additional / overriding headers
    :param body_alternative: optional alternative body
    :param subtype_alternative: MIME subtype of ``body_alternative``
    :return: the ``MIMEMultipart`` message
    :raises AssertionError: when no sender address can be determined
    """
    ftemplate = '__image-%s__'
    fcounter = 0
    attachments = attachments or []

    # a None body would make pattern.search() raise a TypeError
    body = body or ''

    # turn every inline base64 png into an attachment referenced by its cid
    pattern = re.compile(r'"data:image/png;base64,[^"]*"')
    pos = 0
    new_body = ''
    while True:
        match = pattern.search(body, pos)
        if not match:
            break
        s = match.start()
        e = match.end()
        data = body[s + len('"data:image/png;base64,'):e - 1]
        new_body += body[pos:s]
        fname = ftemplate % fcounter
        fcounter += 1
        attachments.append((fname, base64.b64decode(data)))
        new_body += '"cid:%s"' % fname
        pos = e
    new_body += body[pos:]
    body = new_body

    email_from = email_from or tools.config.get('email_from')
    assert email_from, "You must either provide a sender address explicitly or configure "\
                       "a global sender address in the server configuration or with the "\
                       "--email-from startup parameter."

    # Note: we must force all strings to to 8-bit utf-8 when crafting message,
    #       or use encode_header() for headers, which does it automatically.

    headers = headers or {}  # need valid dict later

    if not email_cc:
        email_cc = []
    if not email_bcc:
        email_bcc = []
    if not body:
        body = u''

    email_body_utf8 = ustr(body).encode('utf-8')
    email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset='utf-8')
    msg = MIMEMultipart()

    if not message_id:
        if object_id:
            message_id = tools.generate_tracking_message_id(object_id)
        else:
            message_id = make_msgid()
    msg['Message-Id'] = encode_header(message_id)
    if references:
        msg['references'] = encode_header(references)
    msg['Subject'] = encode_header(subject)
    msg['From'] = encode_rfc2822_address_header(email_from)
    del msg['Reply-To']
    if reply_to:
        msg['Reply-To'] = encode_rfc2822_address_header(reply_to)
    else:
        msg['Reply-To'] = msg['From']
    msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
    if email_cc:
        msg['Cc'] = encode_rfc2822_address_header(COMMASPACE.join(email_cc))
    if email_bcc:
        msg['Bcc'] = encode_rfc2822_address_header(COMMASPACE.join(email_bcc))
    msg['Date'] = formatdate()
    # Custom headers may override normal headers or provide additional ones
    for key, value in headers.iteritems():
        msg[ustr(key).encode('utf-8')] = encode_header(value)

    if subtype == 'html' and not body_alternative and html2text:
        # Always provide alternative text body ourselves if possible.
        text_utf8 = tools.html2text(email_body_utf8.decode('utf-8')).encode('utf-8')
        alternative_part = MIMEMultipart(_subtype="alternative")
        alternative_part.attach(MIMEText(text_utf8, _charset='utf-8', _subtype='plain'))
        alternative_part.attach(email_text_part)
        msg.attach(alternative_part)
    elif body_alternative:
        # Include both alternatives, as specified, within a multipart/alternative part
        alternative_part = MIMEMultipart(_subtype="alternative")
        body_alternative_utf8 = ustr(body_alternative).encode('utf-8')
        alternative_body_part = MIMEText(body_alternative_utf8, _subtype=subtype_alternative, _charset='utf-8')
        alternative_part.attach(alternative_body_part)
        alternative_part.attach(email_text_part)
        msg.attach(alternative_part)
    else:
        msg.attach(email_text_part)

    if attachments:
        for (fname, fcontent) in attachments:
            filename_rfc2047 = encode_header_param(fname)
            part = MIMEBase('application', "octet-stream")

            # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail
            # so we fix it by using RFC2047 encoding for the filename instead.
            part.set_param('name', filename_rfc2047)
            part.add_header('Content-Disposition', 'attachment', filename=filename_rfc2047)
            # Content-ID lets the "cid:<name>" references of the body resolve to this part
            part.add_header('Content-ID', '<%s>' % filename_rfc2047)

            part.set_payload(fcontent)
            Encoders.encode_base64(part)
            msg.attach(part)
    return msg
def html_email_clean(html, remove=False, shorten=False, max_length=300):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
       a quote; detected by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; Hotmail
       is detected by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged
    :param boolean shorten: shorten the html; every exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :return: the cleaned html; a falsy or non-string ``html`` is returned as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    def _create_node(tag, text, tail=None, attrs=None):
        """ Create a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).iteritems():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create an element and insert it in node at the given index """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split node.text around the regex matches: each match is moved into
        a new child carrying new_node_attrs, the text between matches into
        plain children """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # NOTE(review): index -1 inserts before the last child, it does not append - confirm this is intended
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
    for node in root.getiterator():
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

    # form node and tag text-based quotes and signature
    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
    for node in root.getiterator():
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    quote_begin = False
    overlength = False
    cur_char_nbr = 0
    for node in root.getiterator():
        # comments do not need processing; moreover node.get(key, default)
        # never returns the default for HtmlComments, so the membership
        # tests below would be applied to None
        if node.tag == etree.Comment:
            continue

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # state of the parsing
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if overlength:
            node.set('in_overlength', '1')
            node.set('tail_remove', '1')

        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')

        # shorten:
        # 1/ truncate the text at the next available space
        # 2/ create a 'read more' node, next to current node
        # 3/ add the truncated text in a new node, next to 'read more' node
        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
            overlength = True
            # truncate text
            innertext = node.text[0:(max_length - cur_char_nbr)]
            outertext = node.text[(max_length - cur_char_nbr):]
            stop_idx = outertext.find(' ')
            if stop_idx == -1:
                stop_idx = len(outertext)
            node.text = innertext + outertext[0:stop_idx]
            # create <span> ... <a href="#">read more</a></span> node
            read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
            read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
            read_more_node.append(read_more_link_node)
            # create outertext node
            new_node = _create_node('span', outertext[stop_idx:])
            # add newly created nodes in dom; addnext inserts right after
            # node, so the read more node ends up before the outer text
            node.addnext(new_node)
            node.addnext(read_more_node)
            # tag node
            new_node.set('in_overlength', '1')

        cur_char_nbr += len(node.text or '')

        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.getiterator():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent tail
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            # trick: read more link should be displayed even if it's in overlength
            if 'oe_mail_expand' not in node.get('class', ''):
                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
def html_email_clean(html):
    """ html_email_clean: clean the html to display in the web client.
        - strip email quotes (remove blockquote nodes)
        - strip signatures (remove --\\n{\\n)Blahblah); <br> tags are replaced
          by a placeholder while the tree is processed and restored afterwards

        :param string html: sanitized html; tags like html or head should not
            be present in the html string. This method therefore takes as input
            html code coming from a sanitized source, like fields.html.
        :return: the cleaned html; a falsy or non-string ``html`` is returned
            as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    if not html or not isinstance(html, basestring):
        return html

    html = ustr(html)

    # 0. remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # 1. <br[ /]> -> placeholder, because otherwise the tree is obfuscated
    br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
    html = _replace_matching_regex(br_tags, html, '__BR_TAG__')

    # 2. form a tree, handle (currently ?) pure-text by enclosing them in a div
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    # 2.5 remove quoted text in nodes
    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    for node in root.getiterator():
        if not node.text:
            continue
        node.text = _replace_matching_regex(quote_tags, node.text)

    # 3. remove blockquotes
    quotes = [el for el in root.getiterator(tag='blockquote')]
    for node in quotes:
        # copy the node tail into parent text
        if node.tail:
            parent = node.getparent()
            # parenthesized on purpose: ``a or '' + b`` evaluates as
            # ``a or ('' + b)`` and loses the tail when the parent has text
            parent.text = (parent.text or '') + node.tail
        # remove the node
        node.getparent().remove(node)

    # 4. strip signatures: everything from the "--" line to the end of the text.
    # [\s\S] matches any character; the python re module has no \z anchor, and
    # inside a character class it only stood for the letter "z"
    signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[\s\S]+)')
    for elem in root.getiterator():
        if elem.text:
            match = re.search(signature, elem.text)
            if match:
                elem.text = elem.text[:match.start()] + elem.text[match.end():]
        if elem.tail:
            match = re.search(signature, elem.tail)
            if match:
                elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]

    # 5. placeholder back to <br />
    html = etree.tostring(root, pretty_print=True)
    html = html.replace('__BR_TAG__', '<br />')

    # 6. Misc cleaning :
    # - ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    return html
cr.close() return ename old_jsonRequest = http.JsonRequest old_dispatch = http.JsonRequest.dispatch def new_dispatch(self): try: if self.jsonp_handler: return self.jsonp_handler() result = self._call_function(**self.params) return self._json_response(result) except osv.except_osv, oe: return self._handle_exception(oe) except Exception, e: ename = register_exception(self.session.db, self.jsonrequest['params'].get('model', ''), self.session.uid, e, ustr(self.httprequest), ustr(self.jsonrequest)) return self._handle_exception( osv.except_osv( _('Error!'), _('Please contact your system administrator, exception ID [%s]' ) % ename)) http.JsonRequest.dispatch = new_dispatch
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
       a quote; detected by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; Hotmail
       is detected by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged
    :param boolean shorten: shorten the html; every exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content. The used keys are the following:

                                 - oe_expand_container_tag: tag of the container
                                   of the whole read more link (default: span)
                                 - oe_expand_container_class: class applied to
                                   the link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator
                                   tag inserted before the link (default: void)
                                 - oe_expand_a_href: href of the read more link
                                   itself (default: #)
                                 - oe_expand_a_class: class applied to the <a>
                                   containing the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a>
                                   (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not truncate inside a
                                     ``section`` element: the truncation is
                                     moved up to the enclosing section
    :return: the cleaned html; a falsy or non-string ``html`` is returned as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    def _create_node(tag, text, tail=None, attrs=None):
        """ Create a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).iteritems():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create an element and insert it in node at the given index """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split node.text around the regex matches: each match is moved into
        a new child carrying new_node_attrs, the text between matches into
        plain children """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # NOTE(review): index -1 inserts before the last child, it does not append - confirm this is intended
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all
                successive whitespaces as one character. This option should not
                be True when trying to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            word_end = 0
            for word in node.text.strip(' \t\r\n').split():
                # look for the word after the end of the previous one, so that
                # a word occurring several times is located at its real position
                word_end = node.text.find(word, word_end) + len(word)
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
            if word_end:
                truncate_idx = word_end
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            # here no quote_begin because we want to be able to remove some quoted
            # text without removing all the remaining context
            node.set('in_quote', '1')
        if node.getparent() is not None and node.getparent().get('in_quote'):
            # inside a block of removed text but not in quote_begin (see above)
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        # characters are counted on the text stripped of surrounding whitespace
        node_text = (node.text or '').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent tail
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the working attributes set during processing
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            # trick: read more link should be displayed even if it's in overlength
            if expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''):
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')
    return html
def html_sanitize(
    src,
    silent=True,
    sanitize_tags=True,
    sanitize_attributes=False,
    sanitize_style=False,
    strip_style=False,
    strip_classes=False,
):
    """Sanitize an html string with the module-level ``_Cleaner``.

    Email-like pseudo tags (``<name@host>``) and mako delimiters are
    html-escaped before cleaning so that the cleaner does not strip them; the
    mako delimiters and a few url-quoted characters are restored afterwards.

    :param src: html to sanitize; a falsy value is returned unchanged
    :param bool silent: when False, a parser error is re-raised instead of
        being logged and replaced by a placeholder paragraph
    :param bool sanitize_tags: restrict tags to ``allowed_tags`` and apply
        ``tags_to_kill`` / ``tags_to_remove``
    :param bool sanitize_attributes: with lxml >= 3.1.0, keep only the
        attributes listed in ``safe_attrs``
    :param bool sanitize_style: forwarded as the cleaner ``sanitize_style`` option
    :param bool strip_style: forwarded as the cleaner ``style`` option
        (True = remove style tags/attrs)
    :param bool strip_classes: remove ``class`` attributes
    :return: the sanitized html, or "" when the parser reports an empty document
    """
    if not src:
        return src
    src = ustr(src, errors="replace")

    # html: remove encoding attribute inside tags
    doctype = re.compile(
        r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL
    )
    src = doctype.sub(r"", src)

    logger = logging.getLogger(__name__ + ".html_sanitize")

    # html encode email tags, but leave alone the ones containing
    # cite="mid:email_like@address" (ex: blockquote cite) or an alt attribute
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)

    def _escape_email_tag(match):
        tag = match.group(1)
        if "cite=" in tag or "alt=" in tag:
            return tag
        return cgi.escape(tag)

    src = part.sub(_escape_email_tag, src)

    # html encode mako tags <% ... %> to decode them later and keep them alive,
    # otherwise they are stripped by the cleaner
    mako_open, mako_close = cgi.escape("<%"), cgi.escape("%>")
    src = src.replace("<%", mako_open)
    src = src.replace("%>", mako_close)

    kwargs = {
        "page_structure": True,
        "style": strip_style,  # True = remove style tags/attrs
        "sanitize_style": sanitize_style,  # True = sanitize styling
        "forms": True,  # True = remove form tags
        "remove_unknown_tags": False,
        "comments": False,
        "processing_instructions": False,
    }
    if sanitize_tags:
        kwargs["allow_tags"] = allowed_tags
        if etree.LXML_VERSION >= (2, 3, 1):
            # kill_tags attribute has been added in version 2.3.1
            kwargs.update({"kill_tags": tags_to_kill, "remove_tags": tags_to_remove})
        else:
            kwargs["remove_tags"] = tags_to_kill + tags_to_remove

    # lxml < 3.1.0 does not allow to specify safe_attrs.
    # We keep all attributes in order to keep "style"
    if sanitize_attributes and etree.LXML_VERSION >= (3, 1, 0):
        if strip_classes:
            current_safe_attrs = safe_attrs - frozenset(["class"])
        else:
            current_safe_attrs = safe_attrs
        kwargs.update({"safe_attrs_only": True, "safe_attrs": current_safe_attrs})
    else:
        kwargs.update(
            {
                "safe_attrs_only": False,  # keep oe-data attributes + style
                "strip_classes": strip_classes,  # remove classes, even when keeping other attributes
            }
        )

    try:
        # some corner cases make the parser crash (such as
        # <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace("%24", "$")
        cleaned = cleaned.replace("%7B", "{")
        cleaned = cleaned.replace("%7D", "}")
        cleaned = cleaned.replace("%20", " ")
        cleaned = cleaned.replace("%5B", "[")
        cleaned = cleaned.replace("%5D", "]")
        cleaned = cleaned.replace("%7C", "|")
        # decode the mako delimiters that were escaped before cleaning
        cleaned = cleaned.replace(mako_open, "<%")
        cleaned = cleaned.replace(mako_close, "%>")
    except etree.ParserError as e:
        if "empty" in str(e):
            return ""
        if not silent:
            raise
        logger.warning("ParserError obtained when sanitizing %r", src, exc_info=True)
        cleaned = "<p>ParserError when sanitizing</p>"

    return cleaned
def build_email(self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
                attachments=None, message_id=None, references=None, object_id=False, subtype='plain',
                headers=None, body_alternative=None, subtype_alternative='plain'):
    """ copy-pasted from openerp/addons/base/models/ir_mail_server.py::build_email

    Build a MIME multipart message. Every inline
    ``"data:image/png;base64,..."`` value found in ``body`` is decoded,
    appended to ``attachments`` and replaced in the body by a ``"cid:<name>"``
    reference; each attachment part also receives a ``Content-ID`` header.

    :return: the ``MIMEMultipart`` message
    :raises AssertionError: when no sender address can be determined
    """
    attachments = attachments or []
    body = body or ''

    # 1. move inline base64 png images to attachments referenced by their cid
    inline_prefix = '"data:image/png;base64,'
    inline_image = re.compile(r'"data:image/png;base64,[^"]*"')
    rewritten = ''
    cursor = 0
    for image_index, found in enumerate(inline_image.finditer(body)):
        image_name = '__image-%s__' % image_index
        encoded = body[found.start() + len(inline_prefix):found.end() - 1]
        rewritten += body[cursor:found.start()]
        attachments.append((image_name, base64.b64decode(encoded)))
        rewritten += '"cid:%s"' % image_name
        cursor = found.end()
    rewritten += body[cursor:]
    body = rewritten

    # 2. sender is mandatory, either given or configured
    email_from = email_from or tools.config.get('email_from')
    assert email_from, "You must either provide a sender address explicitly or configure "\
                       "a global sender address in the server configuration or with the "\
                       "--email-from startup parameter."

    # Note: we must force all strings to to 8-bit utf-8 when crafting message,
    #       or use encode_header() for headers, which does it automatically.

    # 3. normalize optional arguments
    headers = headers or {}  # need valid dict later
    email_cc = email_cc or []
    email_bcc = email_bcc or []
    body = body or u''

    email_body_utf8 = ustr(body).encode('utf-8')
    email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset='utf-8')
    msg = MIMEMultipart()

    # 4. headers
    if not message_id:
        message_id = tools.generate_tracking_message_id(object_id) if object_id else make_msgid()
    msg['Message-Id'] = encode_header(message_id)
    if references:
        msg['references'] = encode_header(references)
    msg['Subject'] = encode_header(subject)
    msg['From'] = encode_rfc2822_address_header(email_from)
    del msg['Reply-To']
    msg['Reply-To'] = encode_rfc2822_address_header(reply_to) if reply_to else msg['From']
    msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
    if email_cc:
        msg['Cc'] = encode_rfc2822_address_header(COMMASPACE.join(email_cc))
    if email_bcc:
        msg['Bcc'] = encode_rfc2822_address_header(COMMASPACE.join(email_bcc))
    msg['Date'] = formatdate()
    # Custom headers may override normal headers or provide additional ones
    for header_name, header_value in headers.iteritems():
        msg[ustr(header_name).encode('utf-8')] = encode_header(header_value)

    # 5. body, with a text alternative when possible or requested
    if subtype == 'html' and not body_alternative and html2text:
        # Always provide alternative text body ourselves if possible.
        plain_utf8 = tools.html2text(email_body_utf8.decode('utf-8')).encode('utf-8')
        both_bodies = MIMEMultipart(_subtype="alternative")
        both_bodies.attach(MIMEText(plain_utf8, _charset='utf-8', _subtype='plain'))
        both_bodies.attach(email_text_part)
        msg.attach(both_bodies)
    elif body_alternative:
        # Include both alternatives, as specified, within a multipart/alternative part
        both_bodies = MIMEMultipart(_subtype="alternative")
        alternative_utf8 = ustr(body_alternative).encode('utf-8')
        both_bodies.attach(MIMEText(alternative_utf8, _subtype=subtype_alternative, _charset='utf-8'))
        both_bodies.attach(email_text_part)
        msg.attach(both_bodies)
    else:
        msg.attach(email_text_part)

    # 6. attachments (including the inline images extracted at step 1)
    for attachment_name, attachment_content in attachments:
        encoded_name = encode_header_param(attachment_name)
        attachment_part = MIMEBase('application', "octet-stream")

        # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail
        # so we fix it by using RFC2047 encoding for the filename instead.
        attachment_part.set_param('name', encoded_name)
        attachment_part.add_header('Content-Disposition', 'attachment', filename=encoded_name)
        # Content-ID lets the "cid:<name>" references of the body resolve to this part
        attachment_part.add_header('Content-ID', '<%s>' % encoded_name)

        attachment_part.set_payload(attachment_content)
        Encoders.encode_base64(attachment_part)
        msg.attach(attachment_part)
    return msg
def html_sanitize(src, silent=True, strict=False, strip_style=False, strip_classes=False): if not src: return src src = ustr(src, errors='replace') # html: remove encoding attribute inside tags doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) src = doctype.sub(r"", src) logger = logging.getLogger(__name__ + '.html_sanitize') # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) # remove results containing cite="mid:email_like@address" (ex: blockquote cite) # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE) src = part.sub(lambda m: 'cite=' not in m.group(1) and cgi.escape(m.group(1)) or m.group(1), src) # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace('<%', cgi.escape('<%')) src = src.replace('%>', cgi.escape('%>')) kwargs = { 'page_structure': True, 'style': strip_style, # True = remove style tags/attrs 'forms': True, # remove form tags 'remove_unknown_tags': False, 'allow_tags': allowed_tags, 'comments': False, 'processing_instructions': False } if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 kwargs.update({ 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, }) else: kwargs['remove_tags'] = tags_to_kill + tags_to_remove if strict: if etree.LXML_VERSION >= (3, 1, 0): # lxml < 3.1.0 does not allow to specify safe_attrs. 
We keep all attributes in order to keep "style" if strip_classes: current_safe_attrs = safe_attrs - frozenset(['class']) else: current_safe_attrs = safe_attrs kwargs.update({ 'safe_attrs_only': True, 'safe_attrs': current_safe_attrs, }) else: kwargs['safe_attrs_only'] = False # keep oe-data attributes + style try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace('%24', '$') cleaned = cleaned.replace('%7B', '{') cleaned = cleaned.replace('%7D', '}') cleaned = cleaned.replace('%20', ' ') cleaned = cleaned.replace('%5B', '[') cleaned = cleaned.replace('%5D', ']') cleaned = cleaned.replace('<%', '<%') cleaned = cleaned.replace('%>', '%>') except etree.ParserError, e: if 'empty' in str(e): return "" if not silent: raise logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = '<p>ParserError when sanitizing</p>'