def plaintext2html(text, container_tag=False): """ Convert plaintext into html. Content of the text is escaped to manage html entities, using cgi.escape(). - all \n,\r are replaced by <br /> - enclose content into <p> - convert url into clickable link - 2 or more consecutive <br /> are considered as paragraph breaks :param string container_tag: container of the html; by default the content is embedded into a <div> """ text = cgi.escape(ustr(text)) # 1. replace \n and \r text = text.replace('\n', '<br/>') text = text.replace('\r', '<br/>') # 2. clickable links text = html_keep_url(text) # 3-4: form paragraphs idx = 0 final = '<p>' br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})') for item in re.finditer(br_tags, text): final += text[idx:item.start()] + '</p><p>' idx = item.end() final += text[idx:] + '</p>' # 5. container if container_tag: final = '<%s>%s</%s>' % (container_tag, final, container_tag) return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False): """ Append extra content at the end of an HTML snippet, trying to locate the end of the HTML document (</body>, </html>, or EOF), and converting the provided content in html unless ``plaintext`` is False. Content conversion can be done in two ways: - wrapping it into a pre (preserve=True) - use plaintext2html (preserve=False, using container_tag to wrap the whole content) A side-effect of this method is to coerce all HTML tags to lowercase in ``html``, and strip enclosing <html> or <body> tags in content if ``plaintext`` is False. :param str html: html tagsoup (doesn't have to be XHTML) :param str content: extra content to append :param bool plaintext: whether content is plaintext and should be wrapped in a <pre/> tag. :param bool preserve: if content is plaintext, wrap it into a <pre> instead of converting it into html """ html = ustr(html) if plaintext and preserve: content = u'\n<pre>%s</pre>\n' % ustr(content) elif plaintext: content = '\n%s\n' % plaintext2html(content, container_tag) else: content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content) content = u'\n%s\n' % ustr(content) # Force all tags to lowercase html = re.sub( r'(</?)\W*(\w+)([ >])', lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html) insert_location = html.find('</body>') if insert_location == -1: insert_location = html.find('</html>') if insert_location == -1: return '%s%s' % (html, content) return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
def html2plaintext(html, body_id=None, encoding='utf-8'): """ From an HTML text, convert the HTML to plain text. If @param body_id is provided then this is the tag where the body (not necessarily <body>) starts. """ ## (c) Fry-IT, www.fry-it.com, 2007 ## <*****@*****.**> ## download here: http://www.peterbe.com/plog/html2plaintext html = ustr(html) if not html: return '' tree = etree.fromstring(html, parser=etree.HTMLParser()) if body_id is not None: source = tree.xpath('//*[@id=%s]' % (body_id, )) else: source = tree.xpath('//body') if len(source): tree = source[0] url_index = [] i = 0 for link in tree.findall('.//a'): url = link.get('href') if url: i += 1 link.tag = 'span' link.text = '%s [%s]' % (link.text, i) url_index.append(url) html = ustr(etree.tostring(tree, encoding=encoding)) # \r char is converted into , must remove it html = html.replace(' ', '') html = html.replace('<strong>', '*').replace('</strong>', '*') html = html.replace('<b>', '*').replace('</b>', '*') html = html.replace('<h3>', '*').replace('</h3>', '*') html = html.replace('<h2>', '**').replace('</h2>', '**') html = html.replace('<h1>', '**').replace('</h1>', '**') html = html.replace('<em>', '/').replace('</em>', '/') html = html.replace('<tr>', '\n') html = html.replace('</p>', '\n') html = re.sub('<br\s*/?>', '\n', html) html = re.sub('<.*?>', ' ', html) html = html.replace(' ' * 2, ' ') html = html.replace('>', '>') html = html.replace('<', '<') html = html.replace('&', '&') # strip all lines html = '\n'.join([x.strip() for x in html.splitlines()]) html = html.replace('\n' * 2, '\n') for i, url in enumerate(url_index): if i == 0: html += '\n\n' html += ustr('[%s] %s\n') % (i + 1, url) return html
def html_sanitize(src, silent=True, strict=False, strip_style=False, strip_classes=False): if not src: return src src = ustr(src, errors='replace') logger = logging.getLogger(__name__ + '.html_sanitize') # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) # remove results containing cite="mid:email_like@address" (ex: blockquote cite) # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE) src = part.sub( lambda m: 'cite=' not in m.group(1) and cgi.escape(m.group(1)) or m. group(1), src) # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace('<%', cgi.escape('<%')) src = src.replace('%>', cgi.escape('%>')) kwargs = { 'page_structure': True, 'style': strip_style, # True = remove style tags/attrs 'forms': True, # remove form tags 'remove_unknown_tags': False, 'allow_tags': allowed_tags, 'comments': False, 'processing_instructions': False } if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 kwargs.update({ 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, }) else: kwargs['remove_tags'] = tags_to_kill + tags_to_remove if strict: if etree.LXML_VERSION >= (3, 1, 0): # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style" if strip_classes: current_safe_attrs = safe_attrs - frozenset(['class']) else: current_safe_attrs = safe_attrs kwargs.update({ 'safe_attrs_only': True, 'safe_attrs': current_safe_attrs, }) else: kwargs['safe_attrs_only'] = False # keep oe-data attributes + style kwargs[ 'frames'] = False, # do not remove frames (embbed video in CMS blogs) try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace('%24', '$') cleaned = cleaned.replace('%7B', '{') cleaned = cleaned.replace('%7D', '}') cleaned = cleaned.replace('%20', ' ') cleaned = cleaned.replace('%5B', '[') cleaned = cleaned.replace('%5D', ']') cleaned = cleaned.replace('<%', '<%') cleaned = cleaned.replace('%>', '%>') except etree.ParserError, e: if 'empty' in str(e): return "" if not silent: raise logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = '<p>ParserError when sanitizing</p>'
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, protect_sections=False): """ html_email_clean: clean the html by doing the following steps: - try to strip email quotes, by removing blockquotes or having some client- specific heuristics - try to strip signatures - shorten the html to a maximum number of characters if requested Some specific use case: - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of a quote; detecting by finding WordSection1 of MsoNormal - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect Hotmail by funding ``SkyDrivePlaceholder`` :param string html: sanitized html; tags like html or head should not be present in the html string. This method therefore takes as input html code coming from a sanitized source, like fields.html. :param boolean remove: remove the html code that is unwanted; otherwise it is only flagged and tagged :param boolean shorten: shorten the html; every excessing content will be flagged as to remove :param int max_length: if shortening, maximum number of characters before shortening :param dict expand_options: options for the read more link when shortening the content.The used keys are the following: - oe_expand_container_tag: class applied to the container of the whole read more link - oe_expand_container_class: class applied to the link container (default: oe_mail_expand) - oe_expand_container_content: content of the container (default: ...) - oe_expand_separator_node: optional separator, like adding ... <br /><br /> <a ...>read more</a> (default: void) - oe_expand_a_href: href of the read more link itself (default: #) - oe_expand_a_class: class applied to the <a> containing the link itself (default: oe_mail_expand) - oe_expand_a_content: content of the <a> (default: read more) The formatted read more link is the following: <cont_tag class="oe_expand_container_class"> oe_expand_container_content if expand_options.get('oe_expand_separator_node'): <oe_expand_separator_node/> <a href="oe_expand_a_href" class="oe_expand_a_class"> oe_expand_a_content </a> </span> """ def _replace_matching_regex(regex, source, replace=''): """ Replace all matching expressions in source by replace """ if not source: return source dest = '' idx = 0 for item in re.finditer(regex, source): dest += source[idx:item.start()] + replace idx = item.end() dest += source[idx:] return dest def _create_node(tag, text, tail=None, attrs={}): new_node = etree.Element(tag) new_node.text = text new_node.tail = tail for key, val in attrs.iteritems(): new_node.set(key, val) return new_node def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}): new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs) node.insert(index, new_node) return new_node def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}): text = node.text or '' if not re.search(regex, text): return cur_node = node node.text = '' idx, iteration = 0, 0 for item in re.finditer(regex, text): if iteration == 0: cur_node.text = text[idx:item.start()] else: _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()]) new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs) cur_node = new_node idx = item.end() iteration += 1 new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {}) def _truncate_node(node, position, simplify_whitespaces=True): """ Truncate a node text at a given position. This algorithm will shorten at the end of the word whose ending character exceeds position. :param bool simplify_whitespaces: whether to try to count all successive whitespaces as one character. This option should not be True when trying to keep 'pre' consistency. """ if node.text is None: node.text = '' truncate_idx = -1 if simplify_whitespaces: cur_char_nbr = 0 word = None node_words = node.text.strip(' \t\r\n').split() for word in node_words: cur_char_nbr += len(word) if cur_char_nbr >= position: break if word: truncate_idx = node.text.find(word) + len(word) else: truncate_idx = position if truncate_idx == -1 or truncate_idx > len(node.text): truncate_idx = len(node.text) # compose new text bits innertext = node.text[0:truncate_idx] outertext = node.text[truncate_idx:] node.text = innertext # create <span> ... <a href="#">read more</a></span> node read_more_node = _create_node( expand_options.get('oe_expand_container_tag', 'span'), expand_options.get('oe_expand_container_content', ' ... '), None, { 'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand') }) if expand_options.get('oe_expand_separator_node'): read_more_separator_node = _create_node( expand_options.get('oe_expand_separator_node'), '', None, {}) read_more_node.append(read_more_separator_node) read_more_link_node = _create_node( 'a', expand_options.get('oe_expand_a_content', _('read more')), None, { 'href': expand_options.get('oe_expand_a_href', '#'), 'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'), }) read_more_node.append(read_more_link_node) # create outertext node overtext_node = _create_node('span', outertext) # tag node overtext_node.set('in_overlength', '1') # add newly created nodes in dom node.append(read_more_node) node.append(overtext_node) if expand_options is None: expand_options = {} whitelist_classes_local = whitelist_classes.copy() if expand_options.get('oe_expand_container_class'): whitelist_classes_local.add( expand_options.get('oe_expand_container_class')) if expand_options.get('oe_expand_a_class'): whitelist_classes_local.add(expand_options.get('oe_expand_a_class')) if not html or not isinstance(html, basestring): return html html = ustr(html) # Pre processing # ------------------------------------------------------------ # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}' # html: remove encoding attribute inside tags doctype = re.compile( r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) html = doctype.sub(r"", html) # html: ClEditor seems to love using <div><br /><div> -> replace with <br /> br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE) inner_html = _replace_matching_regex(br_div_tags, html, '<br />') # form a tree root = lxml.html.fromstring(inner_html) if not len(root) and root.text is None and root.tail is None: inner_html = '<div>%s</div>' % inner_html root = lxml.html.fromstring(inner_html) quote_tags = re.compile(r'(\n(>)+[^\n\r]*)') signature = re.compile(r'(^[-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)', re.M) for node in root.iter(): # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass if node.tail: tail_node = _create_node('span', node.tail) node.tail = None node.addnext(tail_node) # form node and tag text-based quotes and signature _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'}) _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'}) # Processing # ------------------------------------------------------------ # tree: tag nodes # signature_begin = False # try dynamic signature recognition quoted = False quote_begin = False overlength = False replace_class = False overlength_section_id = None overlength_section_count = 0 cur_char_nbr = 0 for node in root.iter(): # comments do not need processing # note: bug in node.get(value, default) for HtmlComments, default never returned if node.tag == etree.Comment: continue # do not take into account multiple spaces that are displayed as max 1 space in html node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split()) # remove unwanted classes from node if node.get('class'): sanitize_classes = [] for _class in node.get('class').split(' '): if _class in whitelist_classes_local: sanitize_classes.append(_class) else: sanitize_classes.append('cleaned_' + _class) replace_class = True node.set('class', ' '.join(sanitize_classes)) # root: try to tag the client used to write the html if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get( 'class', ''): root.set('msoffice', '1') if 'SkyDrivePlaceholder' in node.get( 'class', '') or 'SkyDrivePlaceholder' in node.get('id', ''): root.set('hotmail', '1') # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later if node.tag == 'section': overlength_section_count += 1 node.set('section_closure', str(overlength_section_count)) if node.getparent() is not None and ( node.getparent().get('section_closure') or node.getparent().get('section_inner')): node.set('section_inner', str(overlength_section_count)) # state of the parsing: flag quotes and tails to remove if quote_begin: node.set('in_quote', '1') node.set('tail_remove', '1') # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections) if overlength: if not overlength_section_id or int( node.get('section_inner', overlength_section_count + 1)) > overlength_section_count: node.set('in_overlength', '1') node.set('tail_remove', '1') # find quote in msoffice / hotmail / blockquote / text quote and signatures if root.get('msoffice' ) and node.tag == 'div' and 'border-top:solid' in node.get( 'style', ''): quote_begin = True node.set('in_quote', '1') node.set('tail_remove', '1') if root.get('hotmail') and node.tag == 'hr' and ( 'stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')): quote_begin = True node.set('in_quote', '1') node.set('tail_remove', '1') if node.tag == 'blockquote' or node.get('text_quote') or node.get( 'text_signature'): # here no quote_begin because we want to be able to remove some quoted # text without removing all the remaining context quoted = True node.set('in_quote', '1') if node.getparent() is not None and node.getparent().get('in_quote'): # inside a block of removed text but not in quote_begin (see above) quoted = True node.set('in_quote', '1') # shorten: # if protect section: # 1/ find the first parent not being inside a section # 2/ add the read more link # else: # 1/ truncate the text at the next available space # 2/ create a 'read more' node, next to current node # 3/ add the truncated text in a new node, next to 'read more' node node_text = (node.text or '').strip().strip('\n').strip() if shorten and not overlength and cur_char_nbr + len( node_text) > max_length: node_to_truncate = node while node_to_truncate.getparent() is not None: if node_to_truncate.get('in_quote'): node_to_truncate = node_to_truncate.getparent() elif protect_sections and ( node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')): node_to_truncate = node_to_truncate.getparent() overlength_section_id = node_to_truncate.get( 'section_closure') else: break overlength = True node_to_truncate.set('truncate', '1') if node_to_truncate == node: node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr)) else: node_to_truncate.set('truncate_position', str(len(node.text or ''))) cur_char_nbr += len(node_text) # Tree modification # ------------------------------------------------------------ for node in root.iter(): if node.get('truncate'): _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre') # Post processing # ------------------------------------------------------------ to_remove = [] for node in root.iter(): if node.get('in_quote') or node.get('in_overlength'): # copy the node tail into parent text if node.tail and not node.get('tail_remove'): parent = node.getparent() parent.tail = node.tail + (parent.tail or '') to_remove.append(node) if node.get('tail_remove'): node.tail = '' # clean node for attribute_name in [ 'in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position' ]: node.attrib.pop(attribute_name, None) for node in to_remove: if remove: node.getparent().remove(node) else: if not expand_options.get( 'oe_expand_a_class', 'oe_mail_expand' ) in node.get( 'class', '' ): # trick: read more link should be displayed even if it's in overlength node_class = node.get('class', '') + ' oe_mail_cleaned' node.set('class', node_class) if not overlength and not quote_begin and not quoted and not replace_class: return html # html: \n that were tail of elements have been encapsulated into <span> -> back to \n html = etree.tostring(root, pretty_print=False, encoding='UTF-8') linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL) html = _replace_matching_regex(linebreaks, html, '\n') return ustr(html)
def build_email(self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False, attachments=None, message_id=None, references=None, object_id=False, subtype='plain', headers=None, body_alternative=None, subtype_alternative='plain'): """Constructs an RFC2822 email.message.Message object based on the keyword arguments passed, and returns it. :param string email_from: sender email address :param list email_to: list of recipient addresses (to be joined with commas) :param string subject: email subject (no pre-encoding/quoting necessary) :param string body: email body, of the type ``subtype`` (by default, plaintext). If html subtype is used, the message will be automatically converted to plaintext and wrapped in multipart/alternative, unless an explicit ``body_alternative`` version is passed. :param string body_alternative: optional alternative body, of the type specified in ``subtype_alternative`` :param string reply_to: optional value of Reply-To header :param string object_id: optional tracking identifier, to be included in the message-id for recognizing replies. Suggested format for object-id is "res_id-model", e.g. "12345-crm.lead". :param string subtype: optional mime subtype for the text body (usually 'plain' or 'html'), must match the format of the ``body`` parameter. Default is 'plain', making the content part of the mail "text/plain". :param string subtype_alternative: optional mime subtype of ``body_alternative`` (usually 'plain' or 'html'). Default is 'plain'. :param list attachments: list of (filename, filecontents) pairs, where filecontents is a string containing the bytes of the attachment :param list email_cc: optional list of string values for CC header (to be joined with commas) :param list email_bcc: optional list of string values for BCC header (to be joined with commas) :param dict headers: optional map of headers to set on the outgoing mail (may override the other headers, including Subject, Reply-To, Message-Id, etc.) :rtype: email.message.Message (usually MIMEMultipart) :return: the new RFC2822 email message """ email_from = email_from or tools.config.get('email_from') assert email_from, "You must either provide a sender address explicitly or configure "\ "a global sender address in the server configuration or with the "\ "--email-from startup parameter." # Note: we must force all strings to to 8-bit utf-8 when crafting message, # or use encode_header() for headers, which does it automatically. headers = headers or {} # need valid dict later if not email_cc: email_cc = [] if not email_bcc: email_bcc = [] if not body: body = u'' email_body_utf8 = ustr(body).encode('utf-8') email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset='utf-8') msg = MIMEMultipart() if not message_id: if object_id: message_id = tools.generate_tracking_message_id(object_id) else: message_id = make_msgid() msg['Message-Id'] = encode_header(message_id) if references: msg['references'] = encode_header(references) msg['Subject'] = encode_header(subject) msg['From'] = encode_rfc2822_address_header(email_from) del msg['Reply-To'] if reply_to: msg['Reply-To'] = encode_rfc2822_address_header(reply_to) else: msg['Reply-To'] = msg['From'] msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to)) if email_cc: msg['Cc'] = encode_rfc2822_address_header( COMMASPACE.join(email_cc)) if email_bcc: msg['Bcc'] = encode_rfc2822_address_header( COMMASPACE.join(email_bcc)) msg['Date'] = formatdate() # Custom headers may override normal headers or provide additional ones for key, value in headers.iteritems(): msg[ustr(key).encode('utf-8')] = encode_header(value) if subtype == 'html' and not body_alternative and html2text: # Always provide alternative text body ourselves if possible. text_utf8 = tools.html2text( email_body_utf8.decode('utf-8')).encode('utf-8') alternative_part = MIMEMultipart(_subtype="alternative") alternative_part.attach( MIMEText(text_utf8, _charset='utf-8', _subtype='plain')) alternative_part.attach(email_text_part) msg.attach(alternative_part) elif body_alternative: # Include both alternatives, as specified, within a multipart/alternative part alternative_part = MIMEMultipart(_subtype="alternative") body_alternative_utf8 = ustr(body_alternative).encode('utf-8') alternative_body_part = MIMEText(body_alternative_utf8, _subtype=subtype_alternative, _charset='utf-8') alternative_part.attach(alternative_body_part) alternative_part.attach(email_text_part) msg.attach(alternative_part) else: msg.attach(email_text_part) if attachments: for (fname, fcontent) in attachments: filename_rfc2047 = encode_header_param(fname) part = MIMEBase('application', "octet-stream") # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail # so we fix it by using RFC2047 encoding for the filename instead. part.set_param('name', filename_rfc2047) part.add_header('Content-Disposition', 'attachment', filename=filename_rfc2047) part.set_payload(fcontent) Encoders.encode_base64(part) msg.attach(part) return msg
def build_email(self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False, attachments=None, message_id=None, references=None, object_id=False, subtype='plain', headers=None, body_alternative=None, subtype_alternative='plain'): """Constructs an RFC2822 email.message.Message object based on the keyword arguments passed, and returns it. :param string email_from: sender email address :param list email_to: list of recipient addresses (to be joined with commas) :param string subject: email subject (no pre-encoding/quoting necessary) :param string body: email body, of the type ``subtype`` (by default, plaintext). If html subtype is used, the message will be automatically converted to plaintext and wrapped in multipart/alternative, unless an explicit ``body_alternative`` version is passed. :param string body_alternative: optional alternative body, of the type specified in ``subtype_alternative`` :param string reply_to: optional value of Reply-To header :param string object_id: optional tracking identifier, to be included in the message-id for recognizing replies. Suggested format for object-id is "res_id-model", e.g. "12345-crm.lead". :param string subtype: optional mime subtype for the text body (usually 'plain' or 'html'), must match the format of the ``body`` parameter. Default is 'plain', making the content part of the mail "text/plain". :param string subtype_alternative: optional mime subtype of ``body_alternative`` (usually 'plain' or 'html'). Default is 'plain'. :param list attachments: list of (filename, filecontents) pairs, where filecontents is a string containing the bytes of the attachment :param list email_cc: optional list of string values for CC header (to be joined with commas) :param list email_bcc: optional list of string values for BCC header (to be joined with commas) :param dict headers: optional map of headers to set on the outgoing mail (may override the other headers, including Subject, Reply-To, Message-Id, etc.) :rtype: email.message.Message (usually MIMEMultipart) :return: the new RFC2822 email message """ email_from = email_from or tools.config.get('email_from') assert email_from, "You must either provide a sender address explicitly or configure "\ "a global sender address in the server configuration or with the "\ "--email-from startup parameter." # Note: we must force all strings to to 8-bit utf-8 when crafting message, # or use encode_header() for headers, which does it automatically. headers = headers or {} # need valid dict later if not email_cc: email_cc = [] if not email_bcc: email_bcc = [] if not body: body = u'' email_body_utf8 = ustr(body).encode('utf-8') email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset='utf-8') msg = MIMEMultipart() if not message_id: if object_id: message_id = tools.generate_tracking_message_id(object_id) else: message_id = make_msgid() msg['Message-Id'] = encode_header(message_id) if references: msg['references'] = encode_header(references) msg['Subject'] = encode_header(subject) msg['From'] = encode_rfc2822_address_header(email_from) del msg['Reply-To'] if reply_to: msg['Reply-To'] = encode_rfc2822_address_header(reply_to) else: msg['Reply-To'] = msg['From'] msg['To'] = encode_rfc2822_address_header(COMMASPACE.join(email_to)) if email_cc: msg['Cc'] = encode_rfc2822_address_header(COMMASPACE.join(email_cc)) if email_bcc: msg['Bcc'] = encode_rfc2822_address_header(COMMASPACE.join(email_bcc)) msg['Date'] = formatdate() # Custom headers may override normal headers or provide additional ones for key, value in headers.iteritems(): msg[ustr(key).encode('utf-8')] = encode_header(value) if subtype == 'html' and not body_alternative and html2text: # Always provide alternative text body ourselves if possible. text_utf8 = tools.html2text(email_body_utf8.decode('utf-8')).encode('utf-8') alternative_part = MIMEMultipart(_subtype="alternative") alternative_part.attach(MIMEText(text_utf8, _charset='utf-8', _subtype='plain')) alternative_part.attach(email_text_part) msg.attach(alternative_part) elif body_alternative: # Include both alternatives, as specified, within a multipart/alternative part alternative_part = MIMEMultipart(_subtype="alternative") body_alternative_utf8 = ustr(body_alternative).encode('utf-8') alternative_body_part = MIMEText(body_alternative_utf8, _subtype=subtype_alternative, _charset='utf-8') alternative_part.attach(alternative_body_part) alternative_part.attach(email_text_part) msg.attach(alternative_part) else: msg.attach(email_text_part) if attachments: for (fname, fcontent) in attachments: filename_rfc2047 = encode_header_param(fname) part = MIMEBase('application', "octet-stream") # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail # so we fix it by using RFC2047 encoding for the filename instead. part.set_param('name', filename_rfc2047) part.add_header('Content-Disposition', 'attachment', filename=filename_rfc2047) part.set_payload(fcontent) Encoders.encode_base64(part) msg.attach(part) return msg