Example #1
 def test_ordering(self):
     assert alphabetize_attributes({
         (None, "a"): 1,
         (None, "b"): 2
     }) == OrderedDict([((None, "a"), 1), ((None, "b"), 2)])
     assert alphabetize_attributes({
         (None, "b"): 1,
         (None, "a"): 2
     }) == OrderedDict([((None, "a"), 2), ((None, "b"), 1)])
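Every example in this listing calls alphabetize_attributes, but the function itself never appears. Judging from the assertions here and in the test_different_namespaces and test_empty_cases examples below, a minimal sketch could look like the following; the helper _attr_key is hypothetical, and the only assumption is that attribute keys are (namespace, name) tuples whose namespace may be None:

    from collections import OrderedDict


    def _attr_key(item):
        # (None, name) keys can't be compared to ('xlink', name) keys in
        # Python 3, so map a None namespace to the empty string for sorting
        (namespace, name), _value = item
        return (namespace or '', name)


    def alphabetize_attributes(attrs):
        # Pass through None and empty dicts unchanged (see test_empty_cases)
        if not attrs:
            return attrs
        return OrderedDict(sorted(attrs.items(), key=_attr_key))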
Example #2
 def test_ordering(self):
     assert (alphabetize_attributes({
         (None, 'a'): 1,
         (None, 'b'): 2
     }) == OrderedDict([((None, 'a'), 1), ((None, 'b'), 2)]))
     assert (alphabetize_attributes({
         (None, 'b'): 1,
         (None, 'a'): 2
     }) == OrderedDict([((None, 'a'), 2), ((None, 'b'), 1)]))
Example #3
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Sanitize attributes with URI values and drop ones that use
                # a disallowed protocol
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
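This allow_token path appears to be bleach's sanitizer: attributes that fail attr_filter are dropped, and whatever survives is re-attached alphabetized. Assuming that, a small usage sketch with bleach.clean (whose default rules allow href and title on "a", but not onclick):

    import bleach

    # onclick is not an allowed attribute, so it's dropped; the kept
    # attributes come back alphabetized (href before title)
    bleach.clean('<a title="x" onclick="evil()" href="http://example.com">hi</a>')
    # -> '<a href="http://example.com" title="x">hi</a>'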
Example #4
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict mapping a
        tag either to a list of allowed attribute names or to a callable.

        Here the callable takes three arguments--the tag, the attribute name,
        and the attribute value--and returns True or False.

        Also gives the option to strip tags instead of encoding.

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                pass

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token

        else:
            return token
Example #5
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Sanitize attributes with URI values and drop ones that use
                # a disallowed protocol
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
Example #6
 def test_different_namespaces(self):
     assert (
         alphabetize_attributes({
             ('xlink', 'href'): 'abc',
             (None, 'alt'): '123'
         }) ==
         OrderedDict([
             ((None, 'alt'), '123'),
             (('xlink', 'href'), 'abc')
         ])
     )
Example #7
 def test_ordering(self):
     assert (
         alphabetize_attributes({
             (None, 'a'): 1,
             (None, 'b'): 2
         }) ==
         OrderedDict([
             ((None, 'a'), 1),
             ((None, 'b'), 2)
         ])
     )
     assert (
         alphabetize_attributes({
             (None, 'b'): 1,
             (None, 'a'): 2
         }) ==
         OrderedDict([
             ((None, 'a'), 2),
             ((None, 'b'), 1)
         ])
     )
Example #8
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
Example #9
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, 'href'): 'mailto:%s' % match.group(0),
                        '_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {'type': 'Characters', 'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
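Both versions of handle_email_addresses look like bleach's linkifier, which backs the parse_email option. Assuming that, a usage sketch; note that the default nofollow callback skips mailto: links, so no rel attribute is added:

    import bleach

    bleach.linkify('mail me at jim@example.com', parse_email=True)
    # -> 'mail me at <a href="mailto:jim@example.com">jim@example.com</a>'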
Example #10
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict mapping a
        tag either to a list of allowed attribute names or to a callable.

        Here the callable takes three arguments--the tag, the attribute name,
        and the attribute value--and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(token["data"],
                                                     entities={
                                                         '"': "&quot;",
                                                         "'": "&#x27;"
                                                     })
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token
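This newer variant escapes the text of comments it keeps instead of passing them through verbatim. Assuming the bleach.clean front end with its strip_comments flag, the visible effect would be roughly:

    import bleach

    # The comment survives strip_comments=False, but &, <, and > (plus
    # quotes) inside it are escaped
    bleach.clean('<!-- 1 < 2 & "x" -->', strip_comments=False)
    # -> '<!-- 1 &lt; 2 &amp; &quot;x&quot; -->'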
Example #11
    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]
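handle_a_tag hands the attributes to the callbacks, and a callback returning None drops the whole link, leaving only the character data. A usage sketch, assuming this is bleach.linkify:

    import bleach

    def drop_links(attrs, new=False):
        # Returning None tells the linkifier to drop the "a" tag entirely
        return None

    bleach.linkify('see <a href="http://example.com">this</a>',
                   callbacks=[drop_links])
    # -> 'see this'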
Example #12
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag, e.g. one created when parse_email=True found an email
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = 'http://%s' % url

                    attrs = {
                        (None, 'href'): href,
                        '_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': prefix}
                            )

                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
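handle_links is the plain-URL path: a bare hostname gets http:// prepended before the callbacks run. Assuming the default bleach.linkify callbacks (which add rel="nofollow"), usage would look like:

    import bleach

    # No protocol in the text, so href becomes http://example.com; the
    # attributes then come back alphabetized (href before rel)
    bleach.linkify('visit example.com today')
    # -> 'visit <a href="http://example.com" rel="nofollow">example.com</a> today'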
Example #13
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Look at attributes that have uri values
                if namespaced_name in self.attr_val_is_uri:
                    val_unescaped = re.sub(
                        "[`\000-\040\177-\240\s]+",
                        '',
                        unescape(val)).lower()

                    # Remove unicode replacement characters from the unescaped value.
                    val_unescaped = val_unescaped.replace("\ufffd", "")

                    # Drop attributes with uri values that have protocols that
                    # aren't allowed
                    if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                        continue

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
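In this older variant the protocol check is inlined rather than delegated to sanitize_uri_value, but the effect is the same: a disallowed protocol drops the attribute while the tag itself survives. A sketch, assuming bleach.clean's default protocols (http, https, mailto):

    import bleach

    # javascript: is not an allowed protocol, so href is dropped but the
    # "a" tag is kept
    bleach.clean('<a href="javascript:alert(1)">click</a>')
    # -> '<a>click</a>'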
Example #14
 def test_different_namespaces(self):
     assert alphabetize_attributes({
         ("xlink", "href"): "abc",
         (None, "alt"): "123"
     }) == OrderedDict([((None, "alt"), "123"), (("xlink", "href"), "abc")])
Example #15
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag, e.g. one created when parse_email=True found an email
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:match.start()]
                        })

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append({
                            "type": "Characters",
                            "data": prefix + url + suffix
                        })

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({
                                "type": "Characters",
                                "data": prefix
                            })

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {
                                "type": "StartTag",
                                "name": "a",
                                "data": attrs
                            },
                            {
                                "type": "Characters",
                                "data": force_unicode(_text)
                            },
                            {
                                "type": "EndTag",
                                "name": "a"
                            },
                        ])

                        if suffix:
                            new_tokens.append({
                                "type": "Characters",
                                "data": suffix
                            })

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:]
                        })

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
Example #16
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:match.start()]
                        })

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append({
                            "type": "Characters",
                            "data": match.group(0)
                        })

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {
                                "type": "StartTag",
                                "name": "a",
                                "data": attrs
                            },
                            {
                                "type": "Characters",
                                "data": force_unicode(_text)
                            },
                            {
                                "type": "EndTag",
                                "name": "a"
                            },
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:]
                        })

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
Example #17
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag, e.g. one created when parse_email=True found an email
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
Example #18
    def test_empty_cases(self):
        assert alphabetize_attributes(None) is None

        assert alphabetize_attributes({}) == {}
Example #19
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Look at attributes that have uri values
                if namespaced_name in self.attr_val_is_uri:
                    val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                           unescape(val)).lower()

                    # Remove unicode replacement characters from the unescaped value.
                    val_unescaped = val_unescaped.replace("\ufffd", "")

                    # Drop attributes with uri values that have protocols that
                    # aren't allowed
                    if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                            and (val_unescaped.split(':')[0]
                                 not in self.allowed_protocols)):
                        continue

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [(None, 'href'),
                                           (namespaces['xlink'], 'href')]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
Example #20
 def test_different_namespaces(self):
     assert (alphabetize_attributes({
         ('xlink', 'href'): 'abc',
         (None, 'alt'): '123'
     }) == OrderedDict([((None, 'alt'), '123'),
                        (('xlink', 'href'), 'abc')]))