def allowed_token(self, token): if "data" in token: attrs = token["data"] attr_names = set(attrs.keys()) # Remove forbidden attributes for to_remove in attr_names - self.allowed_attributes: del token["data"][to_remove] attr_names.remove(to_remove) # Remove attributes with disallowed URL values for attr in attr_names & self.attr_val_is_uri: assert attr in attrs # I don't have a clue where this regexp comes from or why it matches those # characters, nor why we call unescape. I just know it's always been here. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # this will do is remove *more* than it otherwise would. val_unescaped = re.sub( "[`\x00-\x20\x7f-\xa0\\s]+", "", unescape(attrs[attr]) ).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") try: uri = urlparse.urlparse(val_unescaped) except ValueError: uri = None del attrs[attr] if uri and uri.scheme: if uri.scheme not in self.allowed_protocols: del attrs[attr] if uri.scheme == "data": m = data_content_type.match(uri.path) if not m: del attrs[attr] elif m.group("content_type") not in self.allowed_content_types: del attrs[attr] for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub( r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(attrs[attr]) ) if ( token["name"] in self.svg_allow_local_href and (namespaces["xlink"], "href") in attrs and re.search(r"^\s*[^#\s].*", attrs[(namespaces["xlink"], "href")]) ): del attrs[(namespaces["xlink"], "href")] if (None, "style") in attrs: attrs[(None, "style")] = self.sanitize_css(attrs[(None, "style")]) token["data"] = attrs return token
def allowed_token(self, token): if "data" in token: attrs = token["data"] attr_names = set(attrs.keys()) # Remove forbidden attributes for to_remove in (attr_names - self.allowed_attributes): del token["data"][to_remove] attr_names.remove(to_remove) # Remove attributes with disallowed URL values for attr in (attr_names & self.attr_val_is_uri): assert attr in attrs # I don't have a clue where this regexp comes from or why it matches those # characters, nor why we call unescape. I just know it's always been here. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # this will do is remove *more* than it otherwise would. val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '', unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") try: uri = urlparse.urlparse(val_unescaped) except ValueError: uri = None del attrs[attr] if uri and uri.scheme: if uri.scheme not in self.allowed_protocols: del attrs[attr] if uri.scheme == 'data': m = data_content_type.match(uri.path) if not m: del attrs[attr] elif m.group('content_type') not in self.allowed_content_types: del attrs[attr] for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) if (token["name"] in self.svg_allow_local_href and (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespaces['xlink'], 'href')] if (None, 'style') in attrs: attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')]) token["data"] = attrs return token