def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using misc.html_escape().
        - all \n,\r are replaced by <br />
        - enclose content into <p>
        - convert url into clickable link
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param string container_tag: optional tag to wrap the whole html in;
            by default the content is not wrapped in any container
    """
    text = misc.html_escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 5. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
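# Usage sketch for plaintext2html (illustrative, not part of the module;
# assumes html_keep_url leaves URL-free text unchanged):
#
#   >>> plaintext2html('Hello\n\nWorld', container_tag='div')
#   u'<div><p>Hello</p><p>World</p></div>'
#
# The double newline becomes two consecutive <br/> tags, which the
# paragraph regex treats as a paragraph break.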
def decode_smtp_header(smtp_header):
    """Returns unicode() string conversion of the given encoded smtp header
    text. email.header's decode_header method returns a decoded string and its
    charset for each decoded part of the header. This method converts each
    decoded part to unicode and joins them into a complete string.
    """
    if isinstance(smtp_header, Header):
        smtp_header = ustr(smtp_header)
    if smtp_header:
        text = decode_header(smtp_header.replace('\r', ''))
        # The joining space will not be needed as of Python 3.3
        # See https://github.com/python/cpython/commit/07ea53cb218812404cdbde820647ce6e4b2d0f8e
        sep = ' ' if pycompat.PY2 else ''
        return sep.join([ustr(x[0], x[1]) for x in text])
    return u''
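# Usage sketch for decode_smtp_header (illustrative): a MIME encoded-word
# header decodes to its unicode text, e.g.
#
#   >>> decode_smtp_header('=?utf-8?b?w6l0w6k=?=')
#   u'\xe9t\xe9'   # i.e. u'été'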
def remove_accents(input_str):
    """Suboptimal-but-better-than-nothing way to replace accented
    latin letters by an ASCII equivalent. Will obviously change the
    meaning of input_str and work only for some cases"""
    input_str = ustr(input_str)
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
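# Usage sketch for remove_accents (illustrative):
#
#   >>> remove_accents(u'Café Noël')
#   u'Cafe Noel'
#
# NFKD normalization splits each accented letter into a base letter plus
# combining marks, which the comprehension then drops.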
def str2bool(s, default=None):
    s = ustr(s).lower()
    y = 'y yes 1 true t on'.split()
    n = 'n no 0 false f off'.split()
    if s not in (y + n):
        if default is None:
            raise ValueError('Use 0/1/yes/no/true/false/on/off')
        return bool(default)
    return s in y
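# Usage sketch for str2bool (illustrative):
#
#   >>> str2bool('YES')
#   True
#   >>> str2bool('off')
#   False
#   >>> str2bool('maybe', default=False)
#   False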
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying to locate
    the end of the HTML document (</body>, </html>, or EOF), and converting
    the provided content to html unless ``plaintext`` is False.

    Content conversion can be done in two ways:

    - wrapping it into a pre (preserve=True)
    - use plaintext2html (preserve=False, using container_tag to wrap the
      whole content)

    A side-effect of this method is to coerce all HTML tags to lowercase in
    ``html``, and strip enclosing <html> or <body> tags in content if
    ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should be wrapped
        in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
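# Usage sketch for append_content_to_html (illustrative):
#
#   >>> append_content_to_html(u'<html><body>Hi</body></html>', u'Bye',
#   ...                        plaintext=True, preserve=True)
#   u'<html><body>Hi\n<pre>Bye</pre>\n</body></html>'
#
# The content lands just before </body>; when no closing tag is found it
# is simply appended at the end.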
def schema_activity(arch):
    """ Check the activity view against its schema

    :type arch: etree._Element
    """
    global _activity_validator
    if _activity_validator is None:
        with misc.file_open(os.path.join('mail', 'views', 'activity.rng')) as f:
            _activity_validator = etree.RelaxNG(etree.parse(f))
    if _activity_validator.validate(arch):
        return True

    for error in _activity_validator.error_log:
        _logger.error(ustr(error))
    return False
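# Usage sketch for schema_activity (illustrative; the arch below is a
# hypothetical minimal view and may not satisfy the real activity.rng):
#
#   arch = etree.fromstring('<activity><templates/></activity>')
#   if not schema_activity(arch):
#       pass  # validation errors were logged via _logger.error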
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)

    if not html:
        return ''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
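# Usage sketch for html2plaintext (illustrative; exact whitespace may
# differ slightly):
#
#   html2plaintext(u'<div><b>Hello</b> <a href="http://example.com">link</a></div>')
#
# returns roughly u'*Hello* link [1]\n\n[1] http://example.com\n':
# bold becomes *...*, and each link is indexed inline and footnoted.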
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, strip_style=False, strip_classes=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    src = doctype.sub(u"", src)

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    # remove results containing cite="mid:email_like@address" (ex: blockquote cite)
    # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE)
    src = part.sub(lambda m: (u'cite=' not in m.group(1) and u'alt=' not in m.group(1) and u'src=' not in m.group(1)) and misc.html_escape(m.group(1)) or m.group(1), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace(u'<%', misc.html_escape(u'<%'))
    src = src.replace(u'%>', misc.html_escape(u'%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,              # True = remove style tags/attrs
        'sanitize_style': sanitize_style,  # True = sanitize styling
        'forms': True,                     # True = remove form tags
        'remove_unknown_tags': False,
        'comments': False,
        'processing_instructions': False
    }
    if sanitize_tags:
        kwargs['allow_tags'] = allowed_tags
        if etree.LXML_VERSION >= (2, 3, 1):
            # kill_tags attribute has been added in version 2.3.1
            kwargs.update({
                'kill_tags': tags_to_kill,
                'remove_tags': tags_to_remove,
            })
        else:
            kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if sanitize_attributes and etree.LXML_VERSION >= (3, 1, 0):
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        if strip_classes:
            current_safe_attrs = safe_attrs - frozenset(['class'])
        else:
            current_safe_attrs = safe_attrs
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': current_safe_attrs,
        })
    else:
        kwargs.update({
            'safe_attrs_only': False,  # keep oe-data attributes + style
            'strip_classes': strip_classes,  # remove classes, even when keeping other attributes
        })

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        assert isinstance(cleaned, pycompat.text_type)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace(u'%24', u'$')
        cleaned = cleaned.replace(u'%7B', u'{')
        cleaned = cleaned.replace(u'%7D', u'}')
        cleaned = cleaned.replace(u'%20', u' ')
        cleaned = cleaned.replace(u'%5B', u'[')
        cleaned = cleaned.replace(u'%5D', u']')
        cleaned = cleaned.replace(u'%7C', u'|')
        cleaned = cleaned.replace(u'&lt;%', u'<%')
        cleaned = cleaned.replace(u'%&gt;', u'%>')
        # html considerations so real html content match database value
        cleaned = cleaned.replace(u'\xa0', u'&nbsp;')
    except etree.ParserError as e:
        if u'empty' in pycompat.text_type(e):
            return u""
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = u'<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = u'<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
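# Usage sketch for html_sanitize (illustrative; assumes 'script' is in
# the module-level tags_to_kill list):
#
#   html_sanitize(u'<div>ok<script>alert(1)</script></div>')
#
# returns roughly u'ok' in the default configuration: the <script>
# element is killed and the enclosing <div> wrapper is stripped at the
# end by the startswith/endswith check above.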