def test_bleach_html_parser(parser_args, data, expected):
    """Parse ``data`` as a fragment, serialize it, and compare to ``expected``.

    ``parser_args`` is merged over the default ``BleachHTMLParser`` keyword
    arguments (``tags=None``, ``strip=True``, ``consume_entities=True``), so
    each test case can override any of them.
    """
    # Start from the defaults and let the test case override them.
    parser_kwargs = dict(tags=None, strip=True, consume_entities=True)
    parser_kwargs.update(parser_args)

    # Same parser / walker / serializer trio that clean() assembles.
    parser = html5lib_shim.BleachHTMLParser(**parser_kwargs)
    walker = html5lib_shim.getTreeWalker('etree')
    serializer_kwargs = {
        'quote_attr_values': 'always',
        'omit_optional_tags': False,
        'escape_lt_in_attrs': True,
        'resolve_entities': False,
        'sanitize': False,
        'alphabetical_attributes': False,
    }
    serializer = html5lib_shim.BleachHTMLSerializer(**serializer_kwargs)

    # Parse the fragment, walk the resulting tree, render it back to text.
    fragment = parser.parseFragment(data)
    token_stream = walker(fragment)
    rendered = serializer.render(token_stream)

    assert rendered == expected
Exemple #2
0
    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        Stores the linkification settings on the instance and builds the
        parser, tree walker, and serializer used with them.

        :arg list callbacks: list of callbacks to run when adjusting tag
            attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags whose contents should not be
            linkified; for example, ``['pre']`` skips linkifying the contents
            of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify
            knows about; everything else gets escaped

        """
        # Linkification settings, kept verbatim on the instance.
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # The parser/tokenizer accepts every recognized tag and escapes
        # (rather than strips) anything outside that list.
        parser_options = {
            "tags": recognized_tags,
            "strip": False,
            "consume_entities": True,
            "namespaceHTMLElements": False,
        }
        self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

        self.walker = html5lib_shim.getTreeWalker("etree")

        serializer_options = {
            "quote_attr_values": "always",
            "omit_optional_tags": False,
            # linkify does not sanitize
            "sanitize": False,
            # the serializer is not asked to alphabetize attributes
            "alphabetical_attributes": False,
        }
        self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)
Exemple #3
0
def test_serializer(data, expected):
    """Round-trip ``data`` through parser and serializer; expect ``expected``.

    The parser is built with ``tags=None``, ``strip=True`` and
    ``consume_entities=False``; the serializer options match the ones
    clean() uses.
    """
    # Same parser / walker / serializer trio that clean() assembles.
    parser_kwargs = {
        "tags": None,
        "strip": True,
        "consume_entities": False,
        "namespaceHTMLElements": False,
    }
    parser = html5lib_shim.BleachHTMLParser(**parser_kwargs)
    walker = html5lib_shim.getTreeWalker("etree")
    serializer_kwargs = {
        "quote_attr_values": "always",
        "omit_optional_tags": False,
        "escape_lt_in_attrs": True,
        "resolve_entities": False,
        "sanitize": False,
        "alphabetical_attributes": False,
    }
    serializer = html5lib_shim.BleachHTMLSerializer(**serializer_kwargs)

    # Parse the fragment, walk the resulting tree, render it back to text.
    fragment = parser.parseFragment(data)
    token_stream = walker(fragment)
    rendered = serializer.render(token_stream)

    assert rendered == expected
    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        styles=ALLOWED_STYLES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
    ):
        """Initializes a Cleaner

        Stores the cleaning settings on the instance and builds the parser,
        tree walker, and serializer used with them.

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or
            dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed
            content through; a falsy value is stored as an empty list

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        # Cleaning settings, kept on the instance.
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        # None (or any other falsy value) means "no extra filters".
        self.filters = filters if filters else []

        parser_options = {
            'tags': self.tags,
            'strip': self.strip,
            'consume_entities': False,
            'namespaceHTMLElements': False,
        }
        self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

        self.walker = html5lib_shim.getTreeWalker('etree')

        serializer_options = {
            'quote_attr_values': 'always',
            'omit_optional_tags': False,
            'escape_lt_in_attrs': True,
            # Entities are left exactly as they are: no escaping, no
            # resolving, no expanding.
            'resolve_entities': False,
            # Bleach has its own sanitizer, so the html5lib one stays off.
            'sanitize': False,
            # Bleach sanitizer alphabetizes already, so the html5lib
            # alphabetizing stays off too.
            'alphabetical_attributes': False,
        }
        self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)