def sanitize_uri_value(self, value, allowed_protocols):
    """Checks a uri value to see if it's allowed

    The value is normalized (entities converted, whitespace/control
    characters and backticks removed, lowercased) only for the purpose of
    detecting its protocol. The normalized form is never returned.

    :arg value: the uri value to sanitize
    :arg allowed_protocols: list of allowed protocols

    :returns: the original ``value`` if it is allowed, otherwise ``None``

    """
    # NOTE(willkg): This transforms the value into one that's easier to
    # match and verify, but shouldn't get returned since it's vastly
    # different than the original value.

    # Convert all character entities in the value
    new_value = html5lib_shim.convert_entities(value)

    # Nix backtick, space characters, and control characters.
    # The pattern is a raw string so that the octal escapes and \s are
    # handed to the regex engine untouched; a plain string literal here
    # contains the invalid string escape "\s", which Python warns about.
    new_value = re.sub(
        r"[`\000-\040\177-\240\s]+",
        '',
        new_value
    )

    # Remove REPLACEMENT characters
    new_value = new_value.replace('\ufffd', '')

    # Lowercase it--this breaks the value, but makes it easier to match
    # against
    new_value = new_value.lower()

    try:
        # Drop attributes with uri values that have protocols that aren't
        # allowed
        parsed = urlparse(new_value)
    except ValueError:
        # URI is impossible to parse, therefore it's not allowed
        return None

    if parsed.scheme:
        # If urlparse found a scheme, check that
        if parsed.scheme in allowed_protocols:
            return value

    else:
        # Allow uris that are just an anchor
        if new_value.startswith('#'):
            return value

        # Handle protocols that urlparse doesn't recognize like "myprotocol"
        if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
            return value

        # If there's no protocol/scheme specified, then assume it's "http"
        # and see if that's allowed
        if 'http' in allowed_protocols:
            return value

    return None
def sanitize_uri_value(self, value, allowed_protocols):
    """Return ``value`` when its protocol is permitted, else ``None``.

    :arg value: the uri value to sanitize
    :arg allowed_protocols: list of allowed protocols

    :returns: allowed value or None

    """
    # The normalized copy exists purely for protocol detection; it differs
    # too much from the input to ever be handed back to the caller.
    normalized = html5lib_shim.convert_entities(value)

    # Strip backticks, whitespace and control characters, then drop any
    # REPLACEMENT characters and fold case to simplify matching.
    normalized = re.sub(r"[`\000-\040\177-\240\s]+", '', normalized)
    normalized = normalized.replace('\ufffd', '').lower()

    try:
        parsed = urlparse(normalized)
    except ValueError:
        # Unparseable URIs are never allowed.
        return None

    scheme = parsed.scheme
    if scheme:
        # urlparse recognized a scheme: it alone decides the outcome.
        return value if scheme in allowed_protocols else None

    # A bare anchor is always fine.
    if normalized.startswith('#'):
        return value

    # Protocols urlparse does not recognize, such as "myprotocol".
    prefix, colon, _rest = normalized.partition(':')
    if colon and prefix in allowed_protocols:
        return value

    # With no protocol at all, treat the uri as "http".
    if 'http' in allowed_protocols:
        return value

    return None
def sanitize_css(self, style):
    """Sanitizes css in style tags

    :arg style: the raw value of a style attribute

    :returns: a string holding only the ``prop: value;`` declarations whose
        property is listed in ``self.allowed_css_properties`` or
        ``self.allowed_svg_properties``; an empty string if any part of the
        style fails validation

    """
    # Convert entities in the style so that it can be parsed as CSS
    style = html5lib_shim.convert_entities(style)

    # Drop any url values before we do anything else
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

    # The gauntlet of sanitization

    # Validate the css in the style tag and if it's not valid, then drop
    # the whole thing.
    #
    # Every alternative below starts with a character that no other
    # alternative can consume, and quoted strings carry no optional trailing
    # whitespace, so each input has exactly one tokenization. That keeps
    # matching free of catastrophic backtracking on hostile input. A bare
    # "-" is allowed so that negative values such as "margin: -10px" pass.
    parts = style.split(';')
    gauntlet = re.compile(
        r"""^(  # consider a style attribute value as composed of:
[-/:,#%!.\s\w]         # a single safe character
|'[-/:,#%!.()"\s\w]*'  # a single quoted string
|"[-/:,#%!.()'\s\w]*"  # a double quoted string
|\([\d,%.\s]+\)        # a parenthesized run of digits, commas, periods,
                       # percent signs or whitespace, as in hsl(30,100%,50%)
)*$""",
        flags=re.U | re.VERBOSE
    )

    for part in parts:
        if not gauntlet.match(part):
            return ''

    # The whole style must be a sequence of "name: value" declarations
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
        if not value:
            continue

        name = prop.lower()
        if (
            name in self.allowed_css_properties
            or name in self.allowed_svg_properties
        ):
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
def sanitize_css(self, style):
    """Sanitizes css in style tags

    Returns the allowed ``prop: value;`` declarations joined by spaces, or
    an empty string when the style does not pass validation.
    """
    # Entities must be resolved before the text can be treated as CSS.
    css = html5lib_shim.convert_entities(style)

    # url(...) values are removed up front, before any validation.
    url_value = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*")
    css = url_value.sub(" ", css)

    # The gauntlet: every ";"-separated chunk has to match, otherwise the
    # entire style is discarded.
    gauntlet = re.compile(
        r"""^(  # consider a style attribute value as composed of:
[-/:,#%!.\s\w]    # a non-newline character
|'[-/:,#%!.()"\s\w]*'  # a single quoted string
|"[-/:,#%!.()'\s\w]*"  # a double quoted string
|\([\d,%.\s]+\)   # a parenthesized string of one or more digits, commas, periods, ...
)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
        flags=re.U | re.VERBOSE,
    )
    if not all(gauntlet.match(chunk) for chunk in css.split(";")):
        return ""

    # The style as a whole must also look like a list of declarations.
    if re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", css) is None:
        return ""

    kept = []
    for name, setting in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", css):
        if not setting:
            continue

        key = name.lower()
        if key in self.allowed_css_properties or key in self.allowed_svg_properties:
            kept.append("".join((name, ": ", setting, ";")))

    return " ".join(kept)
def sanitize_css(self, style):
    """Sanitizes css in style tags

    :arg style: the raw value of a style attribute

    :returns: a string holding only the ``prop: value;`` declarations whose
        property is listed in ``self.allowed_css_properties`` or
        ``self.allowed_svg_properties``; an empty string if any part of the
        style fails validation

    """
    # Convert entities in the style so that it can be parsed as CSS
    style = html5lib_shim.convert_entities(style)

    # Drop any url values before we do anything else
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

    # The gauntlet of sanitization

    # Validate the css in the style tag and if it's not valid, then drop
    # the whole thing.
    #
    # Quotes are deliberately NOT part of the single-character class: when a
    # quote can be consumed both on its own and as part of a quoted-string
    # alternative, every quoted item has several tokenizations and a
    # non-matching tail makes the regex engine try all combinations
    # (exponential time on attacker-controlled input). With the pattern
    # below each input has exactly one tokenization. Quotes therefore have
    # to be balanced for a style to pass.
    parts = style.split(';')
    gauntlet = re.compile(
        r"""^([-/:,#%!.\s\w]|'[-/:,#%!.()"\s\w]*'|"[-/:,#%!.()'\s\w]*"|\([\d,%.\s]+\))*$""",
        flags=re.U
    )

    for part in parts:
        if not gauntlet.match(part):
            return ''

    # The whole style must be a sequence of "name: value" declarations
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
        if not value:
            continue

        name = prop.lower()
        if (
            name in self.allowed_css_properties
            or name in self.allowed_svg_properties
        ):
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
def sanitize_css(self, style):
    """Sanitizes css in style tags

    :arg style: the raw value of a style attribute

    :returns: a string holding only the ``prop: value;`` declarations whose
        property is listed in ``self.allowed_css_properties`` or
        ``self.allowed_svg_properties``; an empty string if any part of the
        style fails validation

    """
    # Convert entities in the style so that it can be parsed as CSS
    style = html5lib_shim.convert_entities(style)

    # Drop any url values before we do anything else.
    # All patterns in this function are raw strings: regex escapes such as
    # \s, \( and \w are invalid escapes in a plain string literal.
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

    # The gauntlet of sanitization

    # Validate the css in the style tag and if it's not valid, then drop
    # the whole thing.
    #
    # Quotes are deliberately NOT part of the single-character class: when a
    # quote can be consumed both on its own and as part of a quoted-string
    # alternative, every quoted item has several tokenizations and a
    # non-matching tail makes the regex engine try all combinations
    # (exponential time on attacker-controlled input). With the pattern
    # below each input has exactly one tokenization. Quotes therefore have
    # to be balanced for a style to pass.
    parts = style.split(';')
    gauntlet = re.compile(
        r"""^([-/:,#%!.\s\w]|'[-/:,#%!.()"\s\w]*'|"[-/:,#%!.()'\s\w]*"|\([\d,%.\s]+\))*$""",
        flags=re.U
    )

    for part in parts:
        if not gauntlet.match(part):
            return ''

    # The whole style must be a sequence of "name: value" declarations
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
        if not value:
            continue

        name = prop.lower()
        if (
            name in self.allowed_css_properties
            or name in self.allowed_svg_properties
        ):
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
def test_convert_entities(data, expected):
    """Check that ``convert_entities`` turns ``data`` into ``expected``."""
    converted = html5lib_shim.convert_entities(data)
    assert converted == expected