Ejemplo n.º 1
0
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        The value is normalized (entities converted, whitespace/control
        characters and backticks removed, lowercased) purely for matching;
        the caller always gets back either the original ``value`` or ``None``.

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters.
        # The pattern is a raw string so that "\s" reaches the regex engine
        # without relying on Python's invalid-escape fallback (which emits a
        # warning); the octal escapes are interpreted by ``re`` itself.
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        # Either the scheme is not allowed, or there is no scheme and "http"
        # is not allowed
        return None
Ejemplo n.º 2
0
    def sanitize_uri_value(self, value, allowed_protocols):
        """Decide whether a uri value may be kept

        A normalized copy of the value is built only for inspection; the
        result is always the untouched ``value`` or ``None``.

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: ``value`` if it is allowed, otherwise ``None``

        """
        # Build a normalized form that is easy to inspect. It is never
        # returned because it no longer resembles the original value.

        # Step 1: resolve character entities
        normalized = html5lib_shim.convert_entities(value)
        # Step 2: strip backticks, whitespace and control characters
        normalized = re.sub(r"[`\000-\040\177-\240\s]+", '', normalized)
        # Step 3: strip REPLACEMENT characters, then fold case for matching
        normalized = normalized.replace('\ufffd', '').lower()

        try:
            scheme = urlparse(normalized).scheme
        except ValueError:
            # An unparseable uri is never allowed
            return None

        if scheme:
            # urlparse recognized a scheme, so that alone decides the outcome
            return value if scheme in allowed_protocols else None

        # A bare anchor is always fine
        if normalized.startswith('#'):
            return value

        # Protocols urlparse does not recognize (e.g. "myprotocol"): take the
        # text before the first colon as the protocol
        prefix, colon, _ = normalized.partition(':')
        if colon and prefix in allowed_protocols:
            return value

        # No protocol at all: treat it as "http" and check that
        return value if 'http' in allowed_protocols else None
Ejemplo n.º 3
0
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[/:,#%!.\s\w]    # a non-newline character
|\w-\w           # 3 characters in the form \w-\w
|'[\s\w]+'\s*    # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+"       # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)''
)*$""",
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
Ejemplo n.º 4
0
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[-/:,#%!.\s\w]   # a non-newline character
|'[-/:,#%!.()"\s\w]*'       # a single quoted string
|"[-/:,#%!.()'\s\w]*"       # a double quoted string
|\([\d,%.\s]+\)  # a parenthesized string of one or more digits, commas, periods, ...
)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)
Ejemplo n.º 5
0
    def sanitize_css(self, style):
        """Sanitizes css in style tags

        :arg style: the text of a ``style`` attribute

        :returns: the allowed ``prop: value;`` declarations joined by single
            spaces, or ``''`` if the style does not pass validation

        """
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        #
        # The pattern used to carry three more alternatives: \w-\w, a
        # single-quoted [\s\w]+ followed by \s*, and a double-quoted [\s\w]+.
        # Every character those could match (word characters, "-", both
        # quotes, whitespace) is already in the character class, so they
        # accepted nothing extra. They did give the regex engine several ways
        # to match the same text, which made rejecting a bad value take
        # exponential time. Dropping them accepts exactly the same strings
        # without the backtracking.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\s!\w]|\([\d,%\.\s]+\))*$""",
            flags=re.U
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        # The style as a whole must be a sequence of "prop: value" pairs
        # separated by semicolons
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            # Declarations with an empty value are dropped
            if not value:
                continue

            # Keep the declaration only if the property is an allowed CSS or
            # SVG property (compared case-insensitively)
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
Ejemplo n.º 6
0
    def sanitize_css(self, style):
        """Sanitizes css in style tags

        :arg style: the text of a ``style`` attribute

        :returns: the allowed ``prop: value;`` declarations joined by single
            spaces, or ``''`` if the style does not pass validation

        """
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else.
        # (Regex literals in this method are raw strings so that escapes such
        # as "\s" and "\(" are not treated as invalid string escapes.)
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        #
        # NOTE(review): the alternatives below overlap -- "-", both quote
        # characters, whitespace and ASCII alphanumerics match the character
        # class as well as the longer alternatives -- so a part that
        # ultimately fails to match can cause heavy backtracking. Consider
        # tightening the pattern.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        # The style as a whole must be a sequence of "prop: value" pairs
        # separated by semicolons
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            # Declarations with an empty value are dropped
            if not value:
                continue

            # Keep the declaration only if the property is an allowed CSS or
            # SVG property (compared case-insensitively)
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
Ejemplo n.º 7
0
def test_convert_entities(data, expected):
    """Check that ``convert_entities`` turns ``data`` into ``expected``."""
    converted = html5lib_shim.convert_entities(data)
    assert converted == expected
Ejemplo n.º 8
0
def test_convert_entities(data, expected):
    """Verify the output of ``convert_entities`` for one input/output pair."""
    result = html5lib_shim.convert_entities(data)
    assert result == expected