def test_bleach_html_parser(parser_args, data, expected):
    """Round-trip ``data`` through a Bleach parser and check the result.

    The baseline parser options (``tags=None``, ``strip=True``,
    ``consume_entities=True``) are overlaid with ``parser_args``, so each
    case can override or extend them. The parsed fragment is then walked
    and serialized, and the rendered text must equal ``expected``.
    """
    parser_kwargs = dict(tags=None, strip=True, consume_entities=True)
    parser_kwargs.update(parser_args)

    # Serializer settings for the parser/walker/serializer pipeline that
    # is meant to mirror what clean() builds.
    serializer_kwargs = {
        "quote_attr_values": "always",
        "omit_optional_tags": False,
        "escape_lt_in_attrs": True,
        "resolve_entities": False,
        "sanitize": False,
        "alphabetical_attributes": False,
    }

    parser = html5lib_shim.BleachHTMLParser(**parser_kwargs)
    tree_walker = html5lib_shim.getTreeWalker("etree")
    serializer = html5lib_shim.BleachHTMLSerializer(**serializer_kwargs)

    # Parse the fragment, walk the tree, then render the token stream.
    fragment = parser.parseFragment(data)
    token_stream = tree_walker(fragment)
    assert serializer.render(token_stream) == expected
def __init__(
    self,
    callbacks=DEFAULT_CALLBACKS,
    skip_tags=None,
    parse_email=False,
    url_re=URL_RE,
    email_re=EMAIL_RE,
    recognized_tags=html5lib_shim.HTML_TAGS,
):
    """Set up a Linker together with its parser, walker, and serializer.

    :arg list callbacks: callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: tags whose contents should not be linkified;
        for example ``['pre']`` leaves the contents of ``pre`` tags alone

    :arg bool parse_email: whether or not email addresses are linkified

    :arg re url_re: regex used to match urls

    :arg re email_re: regex used to match email addresses

    :arg list-of-strings recognized_tags: the tags linkify knows about;
        everything else gets escaped

    """
    # Configuration is stored on the instance exactly as it was passed.
    self.callbacks = callbacks
    self.skip_tags = skip_tags
    self.parse_email = parse_email
    self.url_re = url_re
    self.email_re = email_re

    # The parser/tokenizer allows every tag in ``recognized_tags`` and
    # escapes anything that is not in that list.
    parser_options = {
        "tags": recognized_tags,
        "strip": False,
        "consume_entities": True,
        "namespaceHTMLElements": False,
    }
    self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

    self.walker = html5lib_shim.getTreeWalker("etree")

    serializer_options = {
        "quote_attr_values": "always",
        "omit_optional_tags": False,
        # linkify does not sanitize
        "sanitize": False,
        # linkify alphabetizes
        "alphabetical_attributes": False,
    }
    self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)
def test_serializer(data, expected):
    """Check that parsing and re-serializing ``data`` yields ``expected``.

    The parser is built with ``tags=None``, ``strip=True`` and
    ``consume_entities=False``; the serializer is configured not to
    resolve entities, sanitize, or alphabetize attributes.
    """
    # Parser, walker, and serializer assembled the way clean() is meant
    # to assemble them.
    html_parser = html5lib_shim.BleachHTMLParser(
        tags=None,
        strip=True,
        consume_entities=False,
        namespaceHTMLElements=False,
    )
    walk = html5lib_shim.getTreeWalker("etree")
    html_serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse -> walk -> serialize, then compare against the expectation.
    rendered = html_serializer.render(walk(html_parser.parseFragment(data)))
    assert rendered == expected
def __init__(
    self,
    tags=ALLOWED_TAGS,
    attributes=ALLOWED_ATTRIBUTES,
    styles=ALLOWED_STYLES,
    protocols=ALLOWED_PROTOCOLS,
    strip=False,
    strip_comments=True,
    filters=None,
):
    """Set up a Cleaner together with its parser, walker, and serializer.

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; may be a callable, a list
        or a dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults
        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not disallowed elements are stripped

    :arg bool strip_comments: whether or not HTML comments are stripped

    :arg list filters: html5lib Filter classes that streamed content is
        passed through; ``None`` (or any falsy value) is stored as an
        empty list

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

           Using filters changes the output of
           ``bleach.Cleaner.clean``. Make sure the way the filters
           change the output are secure.

    """
    self.tags = tags
    self.attributes = attributes
    self.styles = styles
    self.protocols = protocols
    self.strip = strip
    self.strip_comments = strip_comments
    self.filters = filters if filters else []

    # The parser is configured from the values just stored on the instance.
    parser_options = {
        "tags": self.tags,
        "strip": self.strip,
        "consume_entities": False,
        "namespaceHTMLElements": False,
    }
    self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

    self.walker = html5lib_shim.getTreeWalker("etree")

    serializer_options = {
        "quote_attr_values": "always",
        "omit_optional_tags": False,
        "escape_lt_in_attrs": True,
        # Entities are left as they are: no escaping, resolving, or
        # expanding.
        "resolve_entities": False,
        # Bleach has its own sanitizer, so the html5lib one stays off.
        "sanitize": False,
        # The Bleach sanitizer already alphabetizes, so the html5lib
        # option stays off too.
        "alphabetical_attributes": False,
    }
    self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)