def sanitize_payload(payload): "Sanitize HTML" if not payload: return '', '' styles = [] payload = clean_payload(payload) body_style, body_class = get_body_style(payload) if body_style: styles.append(body_style) safe_attrs = set(defs.safe_attrs) safe_attrs.add('style') cleaner = Cleaner(remove_tags=UNCLEANTAGS, safe_attrs_only=True, safe_attrs=safe_attrs) payload = HTMLTITLE_RE.sub('', payload) try: html = cleaner.clean_html(payload) except ValueError: payload = bytes(bytearray(payload, encoding='utf-8')) html = cleaner.clean_html(payload) except XMLSyntaxError: html = '' mainstyle = sanitize_css(get_style(html)) if mainstyle: styles.append(decode(mainstyle)) style = u'\n'.join(styles) html = clean_styles(CSS_COMMENT_RE.sub('', html)) html = set_body_class(html, body_class) return html.strip(), style.strip()
def clean_payload(payload): "Custom clean methods" if not payload: return '' payload = html_entity_decode(html_entity_decode(payload)) try: payload = UNICODE_ENTITY_RE.sub(uni2char, payload) except UnicodeDecodeError: payload = u'' payload = CSS_COMMENT_RE.sub('', payload) return payload