def clean_html(input, sanitize=False):
    """Run an HTML fragment through html5lib so the result is well-formed.

    :param input: the HTML fragment to process.
    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016):
            # sanitization moved from the tokenizer to the serializer
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    fragment = parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    serializer = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(serializer.serialize(token_stream))
async def onfinish (self):
    """Take a DOM snapshot of every document in the tab and yield the results.

    Yields ``self.script`` first, then one ``DomSnapshotEvent`` per unique
    http(s) document found in the tab's DOM tree.
    """
    tab = self.loader.tab
    # publish the behavior script, then run it in the page
    yield self.script
    await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)

    viewport = await getFormattedViewportMetrics (tab)
    # depth=-1 fetches the full tree; pierce=True descends into subdocuments
    dom = await tab.DOM.getDocument (depth=-1, pierce=True)
    self.logger.debug ('dom snapshot document', uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
    haveUrls = set ()
    for doc in ChromeTreeWalker (dom['root']).split ():
        url = URL (doc['documentURL'])
        if url in haveUrls:
            # ignore duplicate URLs. they are usually caused by
            # javascript-injected iframes (advertising) with no(?) src
            self.logger.warning ('dom snapshot duplicate', uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
        elif url.scheme in ('http', 'https'):
            self.logger.debug ('dom snapshot', uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4', base=doc["baseURL"])
            haveUrls.add (url)
            walker = ChromeTreeWalker (doc)
            # remove script, to make the page static and noscript, because at the
            # time we took the snapshot scripts were enabled
            disallowedTags = ['script', 'noscript']
            disallowedAttributes = html.eventAttributes
            stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
            serializer = HTMLSerializer ()
            yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the
    HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_options = {}
    serializer_options = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_options['sanitize'] = True
        else:
            parser_options['tokenizer'] = HTMLSanitizer

    tree = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                      **parser_options).parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(tree)
    serializer = HTMLSerializer(omit_optional_tags=False, **serializer_options)
    return "".join(serializer.serialize(stream))
def __str__(self):
    """Return the unicode serialization of myself."""
    walker = getTreeWalker(self.TREEBUILDER)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    rendered = serializer.render(walker(self._root))
    # slice off the wrapping container element; 2 extra chars for the <>
    container_len = len(self.CONTAINER_TAG) + 2
    return rendered[container_len : -container_len - 1]
def _serialize(domtree):
    """Render an etree document back to an HTML string."""
    stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    return HTMLSerializer(quote_attr_values='always',
                          alphabetical_attributes=True,
                          omit_optional_tags=False).render(stream)
def serialize_html(input, options):
    """Serialize a JSON token tree to HTML with the given serializer options."""
    options = {str(key): value for key, value in options.items()}
    # "encoding" belongs to render(), not to the serializer constructor
    encoding = options.pop("encoding", None)
    stream = Lint(JsonWalker(input), False)
    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
    return serializer.render(stream, encoding)
def _serialize(domtree):
    """Serialize an etree document back to HTML text."""
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False,
                                alphabetical_attributes=True)
    return serializer.render(token_stream)
def sanitize(document):
    """Parse *document* with the Markdown sanitizer and re-serialize it."""
    domtree = html5lib.HTMLParser(tokenizer=MarkdownSanitizer).parseFragment(document)
    stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    return HTMLSerializer(quote_attr_values=True,
                          omit_optional_tags=False).render(stream)
def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
             styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
             strip_comments=True, filters=None):
    """Initializes a Cleaner

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults to
        ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :arg list filters: list of html5lib Filter classes to pass streamed
        content through

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

           Using filters changes the output of ``bleach.Cleaner.clean``.
           Make sure the way the filters change the output are secure.

    """
    self.tags = tags
    self.attributes = attributes
    self.styles = styles
    self.protocols = protocols
    self.strip = strip
    self.strip_comments = strip_comments
    self.filters = filters or []

    # parse fragments without namespacing element names
    self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    self.walker = html5lib.getTreeWalker('etree')
    self.serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,

        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,

        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )
def sanitize(tokenizer, document):
    """Sanitize *document* using the supplied sanitizing tokenizer."""
    domtree = html5lib.HTMLParser(tokenizer=tokenizer).parseFragment(document)
    # html5lib 0.95 only ships "simpletree"; later releases use "etree"
    builder = "simpletree" if html5lib_version == "0.95" else "etree"
    stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
    return HTMLSerializer(quote_attr_values=True,
                          omit_optional_tags=False).render(stream)
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    # etree tags of HTML block-level elements whose inner newlines are noise
    html_blocks = [
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    ]

    if not string:
        return string

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".

        # Strip new lines directly inside block level elements: first new lines
        # from the text, and:
        # - last new lines from the tail of the last child if there's children
        #   (done in the children loop below).
        # - or last new lines from the text itself.
        if tree.tag in html_blocks:
            if tree.text:
                tree.text = tree.text.lstrip('\n')
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip('\n')

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith('\n'):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip('\n')
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    # etree tags of HTML block-level elements whose inner newlines are noise
    html_blocks = ['{http://www.w3.org/1999/xhtml}blockquote',
                   '{http://www.w3.org/1999/xhtml}ol',
                   '{http://www.w3.org/1999/xhtml}li',
                   '{http://www.w3.org/1999/xhtml}ul']

    if not string:
        return string

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".

        # Strip new lines directly inside block level elements: first new lines
        # from the text, and:
        # - last new lines from the tail of the last child if there's children
        #   (done in the children loop below).
        # - or last new lines from the text itself.
        if tree.tag in html_blocks:
            if tree.text:
                tree.text = tree.text.lstrip('\n')
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip('\n')

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith('\n'):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip('\n')
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    dom = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    walker = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(alphabetical_attributes=True,
                                quote_attr_values='always')

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert serializer.render(walker(dom)) == expected
async def test_treewalker_cdata(tab, server): ret = await tab.Page.navigate(url='http://localhost:8080/test.xml') # wait until loaded XXX: replace with idle check await asyncio.sleep(0.5) dom = await tab.DOM.getDocument(depth=-1, pierce=True) docs = list(ChromeTreeWalker(dom['root']).split()) assert len(docs) == 1 for i, doc in enumerate(docs): walker = ChromeTreeWalker(doc) serializer = HTMLSerializer() result = serializer.render(iter(walker)) # chrome will display a pretty-printed viewer *plus* the original # source (stripped of its xml header) assert cdataDoc in result
def sanitize(tokenizer, document):
    """Parse *document* with a sanitizing tokenizer and serialize it back."""
    parser = html5lib.HTMLParser(tokenizer=tokenizer)
    domtree = parser.parseFragment(document)
    # post-simpletree releases of html5lib build etree documents
    builder = "etree" if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE else "simpletree"
    stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(stream)
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    fragment = '<svg><pattern xlink:href="#patt2" id="patt1"></svg>'
    dom = html5lib.HTMLParser().parseFragment(fragment)
    stream = html5lib.getTreeWalker('etree')(dom)
    serializer = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    assert serializer.render(stream) == (
        '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    )
def serialize(self, **kwargs):
    """Return the unicode serialization of myself, with optional
    sanitization arguments."""
    stream = sortAttributes(getTreeWalker(self.TREEBUILDER)(self._root))
    serializer = HTMLSerializer(quote_attr_values="always",
                                omit_optional_tags=False)
    # strip the wrapping container element; 2 extra chars for the <>
    container_len = len(self.CONTAINER_TAG) + 2
    html = serializer.render(stream)[container_len:-container_len - 1]
    return bleach.clean(
        html,
        tags=kwargs.get("tags") or (ALLOWED_TAGS + ["for"]),
        attributes=kwargs.get("attributes") or ALLOWED_ATTRIBUTES,
        styles=kwargs.get("styles") or ALLOWED_STYLES,
        strip_comments=True,
    )
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    parser = HTMLParser(tokenizer=HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    fragment = parser.parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(fragment)
    serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(serializer.serialize(stream))
def clean_html(input):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the
    HTML is well-formed.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    fragment = HTMLParser(tree=treebuilders.getTreeBuilder("dom")).parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(fragment)
    serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(serializer.serialize(stream))
def apply_linkification(
    html: str,
    skip_tags: Optional[List[str]] = None,
) -> str:
    """Apply custom linkification filter to convert text patterns to links."""
    tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    stream = html5lib.getTreeWalker('etree')(tree)
    linkified = LinkifyFilter(stream, skip_tags)
    return HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    ).render(linkified)
async def test_treewalker(tab):
    """Serializing the tab's DOM must reproduce both the document and the iframe markup."""
    frames = await tab.Page.getFrameTree()

    framehtml = '<HTML><HEAD></HEAD><BODY></BODY></HTML>'
    html = '<HTML><HEAD><META charset=utf-8></HEAD><BODY><H1>Hello</H1><!-- comment --><IFRAME></IFRAME></BODY></HTML>'
    rootframe = frames['frameTree']['frame']['id']
    await tab.Page.setDocumentContent(frameId=rootframe, html=html)

    # pierce=True so the iframe's subdocument is included
    dom = await tab.DOM.getDocument(depth=-1, pierce=True)
    docs = list(ChromeTreeWalker(dom['root']).split())
    # the main document plus the (empty) iframe document
    assert len(docs) == 2
    for i, doc in enumerate(docs):
        walker = ChromeTreeWalker(doc)
        serializer = HTMLSerializer()
        result = serializer.render(iter(walker))
        if i == 0:
            assert result == html
        elif i == 1:
            assert result == framehtml
def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None,
             parse_email=False, url_re=URL_RE, email_re=EMAIL_RE):
    """Creates a Linker instance

    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to skip
        linkifying contents of ``pre`` tags

    :arg bool parse_email: whether or not to linkify email addresses

    :arg re url_re: url matching regex

    :arg re email_re: email matching regex

    :returns: linkified text as unicode

    """
    self.callbacks = callbacks
    self.skip_tags = skip_tags
    self.parse_email = parse_email
    self.url_re = url_re
    self.email_re = email_re

    # parse fragments without namespacing element names
    self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    self.walker = html5lib.getTreeWalker('etree')
    self.serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,

        # linkify does not sanitize
        sanitize=False,

        # linkify alphabetizes
        alphabetical_attributes=False,
    )
def sanitize(tokenizer, document):
    """Sanitize *document*, marking every anchor that has an href with
    rel="nofollow noopener"."""
    domtree = html5lib.HTMLParser(tokenizer=tokenizer).parseFragment(document)
    if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE:
        builder = "etree"
        # etree trees can be post-processed: tag outbound links
        for anchor in domtree.findall(".//{http://www.w3.org/1999/xhtml}a"):
            if anchor.get('href', None):
                anchor.set("rel", "nofollow noopener")
    else:
        builder = "simpletree"
    stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
    serializer = HTMLSerializer(
        quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(stream)
def sanitize(tokenizer, document):
    """Sanitize *document* and add rel="nofollow noopener" to all real links."""
    parser = html5lib.HTMLParser(tokenizer=tokenizer)
    tree = parser.parseFragment(document)
    if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE:
        builder = "etree"
        xhtml_anchor = ".//{http://www.w3.org/1999/xhtml}a"
        for link in tree.findall(xhtml_anchor):
            # only anchors that actually point somewhere
            if link.get('href', None):
                link.set("rel", "nofollow noopener")
    else:
        builder = "simpletree"
    walk = html5lib.treewalkers.getTreeWalker(builder)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(walk(tree))
def cleanup_html(string, sanitize=True, fragment=True, stream=False,
                 filter_optional_tags=False, id_prefix=None,
                 update_anchor_links=True):
    """Clean up some html and convert it to HTML."""
    if not string.strip():
        return ''
    string = force_text(string)
    if sanitize:
        string = lxml.html.clean.clean_html(string)

    tree = parse_html(string, fragment)
    walker = CleanupFilter(treewalkers.getTreeWalker('lxml')(tree),
                           id_prefix, update_anchor_links)
    if filter_optional_tags:
        walker = OptionalTagsFilter(walker)

    serializer = HTMLSerializer(
        quote_attr_values=True,
        minimize_boolean_attributes=False,
        omit_optional_tags=False,
    )
    rv = serializer.serialize(walker, 'utf-8')
    # either hand back the lazy byte stream or join it into text
    if stream:
        return rv
    return force_text(b''.join(rv))
def truncate(html, truncated_message, suffix, max_entities=None, max_length=None):
    """Truncate an HTML fragment via TelegramTruncator and re-serialize it."""
    walker = html5lib.getTreeWalker('etree')

    def tokens(markup):
        # parse a fragment and return its walker token stream
        return walker(html5lib.parseFragment(markup, treebuilder='etree'))

    truncated = TelegramTruncator(tokens(html),
                                  truncated_message=tokens(truncated_message),
                                  suffix=tokens(suffix),
                                  max_entities=max_entities,
                                  max_length=max_length)
    return HTMLSerializer().render(truncated).strip('\n')
class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    This cleaner is not designed to use to transform content to be used in
    non-web-page contexts.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults to
            ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed
            content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        # parse fragments without namespacing element names
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        """
        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None,
                 parse_email=False, url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag
            attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # parse fragments without namespacing element names
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        """
        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    :arg text: the text to clean

    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``

    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``

    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``

    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``

    :arg strip: whether or not to strip disallowed elements

    :arg strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    if not text:
        return u''

    text = force_unicode(text)

    # parse fragments without namespacing element names
    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    dom = parser.parseFragment(text)
    walker = html5lib.getTreeWalker('etree')
    filtered = BleachSanitizerFilter(
        source=walker(dom),

        # Bleach-sanitizer-specific things
        allowed_attributes_map=attributes,
        strip_disallowed_elements=strip,
        strip_html_comments=strip_comments,

        # html5lib-sanitizer things
        allowed_elements=tags,
        allowed_css_properties=styles,
        allowed_protocols=protocols,
        allowed_svg_properties=[],
    )
    s = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,

        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,

        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )
    return s.render(filtered)
class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    This cleaner is not designed to use to transform content to be used in
    non-web-page contexts.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults to
            ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed
            content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        # parse fragments without namespacing element names
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must of text type')

        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
def setUp(self):
    """Create the parser, tree walker and serializer shared by the tests."""
    # entities are left unexpanded (resolve_entities=False)
    self.parser = etree.XMLParser(resolve_entities=False)
    self.treewalker = html5lib.getTreeWalker('lxml')
    self.serializer = HTMLSerializer()
class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None,
                 parse_email=False, url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag
            attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # parse fragments without namespacing element names
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
def serialize_html(input, options):
    """Serialize a JSON token tree to HTML.

    :param input: JSON token tree accepted by ``JsonWalker``.
    :param options: serializer options; the special key ``"encoding"``
        selects the output encoding passed to ``render()`` and is not a
        serializer constructor option.
    :returns: the serialized HTML (text, or bytes when an encoding is given).
    """
    options = {str(k): v for k, v in options.items()}
    # BUG FIX: "encoding" is a render() argument, not an HTMLSerializer
    # option. Leaving it inside **options made HTMLSerializer raise
    # TypeError ("unexpected serializer option"), so pop it out first —
    # matching the sibling serialize_html implementation in this file.
    encoding = options.pop('encoding', None)
    stream = JsonWalker(input)
    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
    return serializer.render(stream, encoding)
def testThrowsUnknownOption():
    """HTMLSerializer must reject serializer options it does not know."""
    bogus_options = {'foobar': None}
    with pytest.raises(TypeError):
        HTMLSerializer(**bogus_options)
def test_strip_tag():
    """StripTagFilter drops the listed tags together with their subtrees."""
    markup = '<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>'
    tree = html5lib.parse(markup)
    stream = StripTagFilter(getTreeWalker('etree')(tree), ['b', 'c'])
    assert HTMLSerializer().render(stream) == '<a>barbaz.</a>.'
def test_strip_attribute():
    """StripAttributeFilter removes the listed attributes and keeps the rest."""
    markup = '<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>'
    tree = html5lib.parse(markup)
    stream = StripAttributeFilter(getTreeWalker('etree')(tree), ['b', 'c', 'd'])
    assert HTMLSerializer().render(stream) == '<a></a><br keep=1>'