def clean(self, value, model_instance):
    """
    Validate ``value`` and return its sanitized HTML serialization.

    The value is first passed to the parent class's ``clean`` (which, per
    the field contract, raises ValidationError for any errors). The result
    is parsed as an HTML fragment with the ``HTMLSanitizer`` tokenizer into
    a DOM tree and serialized back to a string with optional tags kept.

    When ``self.use_imageproxy`` is true, the token stream is additionally
    wrapped in ``ImageProxyFilter`` with a ``Proxy`` built for the user
    whose primary key is read from ``model_instance`` under the attribute
    named by ``self.user_field``.
    """
    value = super(HTMLField, self).clean(value, model_instance)

    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                 tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(value)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    if self.use_imageproxy:
        # Local import: imageproxy is only loaded when the proxy is enabled.
        from imageproxy import Proxy
        user = User.objects.get(pk=getattr(model_instance, self.user_field))
        stream = ImageProxyFilter(stream, Proxy(user))

    serializer = HTMLSerializer(omit_optional_tags=False)
    # Join the serialized chunks in one pass instead of growing a string
    # with ``+=`` for every chunk.
    return ''.join(serializer.serialize(stream))
def _serialize(domtree):
    """
    Render ``domtree`` to an HTML string using the 'etree' tree walker.

    The serializer is configured with ``quote_attr_values=True``,
    ``alphabetical_attributes=True`` and ``omit_optional_tags=False``.
    """
    serializer_options = {
        'quote_attr_values': True,
        'alphabetical_attributes': True,
        'omit_optional_tags': False,
    }
    etree_walker = html5lib.treewalkers.getTreeWalker('etree')
    token_stream = etree_walker(domtree)
    return HTMLSerializer(**serializer_options).render(token_stream)
def to_unicode(self):
    """
    Return the unicode serialization of this object's tree.

    ``self._root`` is walked with the tree walker for ``self.TREEBUILDER``
    and rendered; the first ``len(CONTAINER_TAG) + 2`` and the last
    ``len(CONTAINER_TAG) + 3`` characters (the size of an opening and a
    closing container tag) are then sliced off the result.
    """
    opening_len = len(self.CONTAINER_TAG) + 2  # "<" + tag + ">"
    closing_len = opening_len + 1  # "</" + tag + ">"

    tree_walker = getTreeWalker(self.TREEBUILDER)
    token_stream = tree_walker(self._root)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    rendered = html_serializer.render(token_stream)
    return rendered[opening_len:-closing_len]
def _serialize(domtree):
    """
    Walk ``domtree`` with the 'etree' tree walker and return the rendered
    HTML string.

    Serializer options: ``quote_attr_values=True``,
    ``omit_optional_tags=False``, ``alphabetical_attributes=True``.
    """
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
        alphabetical_attributes=True,
    )
    return html_serializer.render(token_stream)
def to_unicode(self):
    """
    Serialize ``self._root`` and return the text between the container tags.

    The rendered string is trimmed by ``len(CONTAINER_TAG) + 2`` characters
    at the front and one character more than that at the back, i.e. the
    size of ``<TAG>`` and ``</TAG>`` respectively.
    """
    # Tag name plus the surrounding "<" and ">".
    wrapper_len = len(self.CONTAINER_TAG) + 2

    token_stream = getTreeWalker(self.TREEBUILDER)(self._root)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    full_markup = html_serializer.render(token_stream)

    # The closing tag carries an extra "/", hence one more character.
    return full_markup[wrapper_len:-(wrapper_len + 1)]
def strip_tags(html):
    """
    Parse ``html`` as a fragment with the ``StripTags`` tokenizer and return
    the re-serialized result.

    Returns None when ``html`` is empty or otherwise falsy.
    """
    if not html:
        return None

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(
        tree=dom_builder, tokenizer=StripTags
    ).parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    return HTMLSerializer().render(token_stream)
def sanitize(html, strip_whitespace=False):
    """
    Sanitize HTML to leave only the readable top-level elements.

    ``html`` is parsed as a full document into an lxml tree using the
    ``ReadableTokenizer``, walked with ``ReadableTreewalker`` and rendered
    back to a string. ``strip_whitespace`` is forwarded to the serializer.
    """
    lxml_builder = html5lib.treebuilders.getTreeBuilder("lxml")
    document = html5lib.HTMLParser(
        tree=lxml_builder, tokenizer=ReadableTokenizer
    ).parse(html)
    readable_stream = ReadableTreewalker(document)
    html_serializer = HTMLSerializer(strip_whitespace=strip_whitespace)
    return html_serializer.render(readable_stream)
def strip_styles(text):
    """
    Parse ``text`` as an HTML fragment with the ``StyleSanitizer`` tokenizer
    and return the re-serialized markup.

    The fragment is walked with the 'simpletree' tree walker and rendered
    with attribute quoting on and optional tags kept. If serialization
    raises ``AssertionError``, the tree's own ``toxml()`` output is returned
    instead.
    """
    parser = html5lib.HTMLParser(tokenizer=StyleSanitizer)
    domtree = parser.parseFragment(text)
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(domtree)
    try:
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        return serializer.render(stream)
    except AssertionError:
        # The exception object is not needed; this spelling is valid on both
        # Python 2 and Python 3 (``except X, e`` is Python-2-only syntax).
        return domtree.toxml()
def clean_nl(string):
    """
    Tidy up newlines around block-level elements so that nl2br can be
    called on the result.

    Falsy input is returned unchanged. Otherwise the string is parsed as an
    HTML fragment, newlines are trimmed in and after blockquote/ol/li/ul
    elements, and the tree is serialized back to HTML.
    """
    block_tags = (
        "{http://www.w3.org/1999/xhtml}blockquote",
        "{http://www.w3.org/1999/xhtml}ol",
        "{http://www.w3.org/1999/xhtml}li",
        "{http://www.w3.org/1999/xhtml}ul",
    )

    if not string:
        return string

    def strip_block_newlines(element):
        # An etree element carries:
        # - text: the piece of text before its first child
        # - tail: the piece of text right after it, before the next sibling
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".
        is_block = element.tag in block_tags

        if is_block:
            text = element.text
            if text:
                # Leading new lines always go; trailing ones only when there
                # are no children (otherwise the children's tails are
                # trimmed in the loop below).
                text = text.lstrip("\n")
                if not len(element):
                    text = text.rstrip("\n")
                element.text = text

            # Drop the first new line following a block level element.
            tail = element.tail
            if tail and tail.startswith("\n"):
                element.tail = tail[1:]

        for child in element:
            # Directly inside a block level element: remove the trailing new
            # lines from each child's tail before descending into it.
            if is_block and child.tail:
                child.tail = child.tail.rstrip("\n")
            strip_block_newlines(child)
        return element

    fragment = strip_block_newlines(html5lib.parseFragment(string))

    # Turn the cleaned tree back into html.
    etree_walker = html5lib.treewalkers.getTreeWalker("etree")
    token_stream = etree_walker(fragment)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    return html_serializer.render(token_stream)
def clean_nl(string):
    """
    Normalize newlines in ``string`` so that nl2br can properly be applied
    to the cleaned text.

    Falsy input is handed back as is; anything else is parsed as an HTML
    fragment, adjusted, and re-serialized.
    """
    block_level = (
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    )

    if not string:
        return string

    def trim(node):
        # etree nodes have text (before the first child), a tail (after the
        # node, before its next sibling) and children, eg:
        # "<div>text <b>children's text</b> children's tail</div> tail".
        if node.tag not in block_level:
            # Nothing to trim on this node itself; just walk down.
            for child in node:
                trim(child)
            return node

        if node.text:
            # First new lines of the text are stripped; the last ones too
            # when the node has no children.
            node.text = node.text.lstrip('\n')
            if not len(node):
                node.text = node.text.rstrip('\n')

        # One new line right after a block level element is removed.
        if node.tail and node.tail.startswith('\n'):
            node.tail = node.tail[1:]

        for child in node:
            # Children of a block level element lose the new lines that end
            # their tail.
            if child.tail:
                child.tail = child.tail.rstrip('\n')
            trim(child)
        return node

    cleaned_tree = trim(html5lib.parseFragment(string))

    # Serialize the cleaned tree back to html.
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(cleaned_tree)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    return html_serializer.render(token_stream)
def sanitize_html(html):
    """
    Return ``html`` parsed with the ``HTMLSanitizer`` tokenizer and
    serialized back to a string.

    Empty or ``None`` input yields an empty unicode string without touching
    the parser. Byte strings are decoded as UTF-8, with undecodable bytes
    replaced, before parsing. The fragment is built and walked with
    html5lib's lxml tree builder and walker.
    """
    # Nothing to sanitize; also keeps None away from the decoder and parser.
    if not html:
        return u''
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')

    # Imported inside the function, so html5lib is only loaded when there is
    # something to sanitize.
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
    tree = parser.parseFragment(html)
    serializer = HTMLSerializer(quote_attr_values=True,
                                alphabetical_attributes=False,
                                omit_optional_tags=False)
    stream = TreeWalker(tree)
    return serializer.render(stream)
def sanitize_html(html):
    """
    Sanitize ``html`` and return the resulting markup.

    Falsy input gives an empty unicode string. Byte strings are decoded as
    UTF-8 (undecodable bytes replaced). The text is parsed as a fragment
    with the ``HTMLSanitizer`` tokenizer into an lxml tree and rendered with
    ``quote_attr_values=True``, ``alphabetical_attributes=False`` and
    ``omit_optional_tags=False``.
    """
    if not html:
        return u''

    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    fragment = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer, tree=TreeBuilder
    ).parseFragment(html)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False,
    )
    return html_serializer.render(TreeWalker(fragment))
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    Falsy input is returned unchanged. Otherwise ``string`` is parsed as an
    HTML fragment, newlines in text nodes in and after blockquote/ol/li/ul
    elements are trimmed, and the tree is serialized back to HTML. An empty
    string is returned when the parsed fragment has no child nodes.
    """
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        # Walk the direct children of `tree`, rewriting text nodes in place
        # and recursing into every other node. `prev_tag` holds the name of
        # the previously visited sibling.
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # NOTE(review): the processed node is inserted before itself
                # and then removed once; presumably this leaves the child
                # list the same length while it is being enumerated --
                # confirm against simpletree's insertBefore/removeChild.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Serialize the processed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
def sanitize_html_fragment(fragment):
    """
    Sanitize an HTML ``fragment``, returning a copy of the fragment that has
    been cleaned up.

    Falsy input (``None``, an empty string, ...) is returned untouched.
    """
    if not fragment:
        return fragment

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer

    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
    parsed = sanitizing_parser.parseFragment(fragment)
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(parsed)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    return html_serializer.render(token_stream)
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    Falsy input is returned unchanged. Otherwise ``string`` is parsed as an
    HTML fragment, newlines in text nodes in and after blockquote/ol/li/ul
    elements are trimmed, and the tree is serialized back to HTML.
    """
    html_blocks = ["blockquote", "ol", "li", "ul"]

    if not string:
        return string

    def parse_html(tree):
        # Walk the direct children of `tree`, rewriting text nodes in place
        # and recursing into every other node. `prev_tag` holds the name of
        # the previously visited sibling.
        prev_tag = ""
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip("\n")

                # Remove the first new line after a block level element.
                if prev_tag in html_blocks and value.startswith("\n"):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # NOTE(review): the processed node is inserted before itself
                # and then removed once; presumably this leaves the child
                # list the same length while it is being enumerated --
                # confirm against simpletree's insertBefore/removeChild.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the processed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker("simpletree")
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
def _serialize(domtree):
    """
    Render ``domtree`` to an HTML string using the 'simpletree' tree walker.

    The serializer is configured with ``quote_attr_values=True`` and
    ``omit_optional_tags=False``.
    """
    simpletree_walker = html5lib.treewalkers.getTreeWalker("simpletree")
    token_stream = simpletree_walker(domtree)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    return html_serializer.render(token_stream)
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src' HTML path to `dst' making it independent.

    - favicon links are removed
    - CSS references are updated (if `css_dir' is given), else removed.
    - Relative links are de-linkified.

    Parameters:
        src: path of the HTML file to read.
        dst: path the processed HTML is written to.
        css_dir: if given, every stylesheet link is re-pointed to
            `css_dir` joined with the basename of its current href;
            otherwise stylesheet links are dropped.
        log: optional logger-like object; only its `info` and `debug`
            methods are called.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    head = tree.find("head")
    # Iterate over a snapshot: links are removed from `head` inside the loop.
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
            # The link is gone; falling through to the stylesheet branch
            # could try to remove it a second time and raise ValueError.
            continue
        if "stylesheet" in rel:  # this is a css ref
            if css_dir:  # update the css dir
                href = link.get("href")
                href = posixpath.join(css_dir, posixpath.basename(href))
                link.set("href", href)
                if log:
                    log.debug("%s: update to <link href='%s'/>", dst, href)
            else:
                if log:
                    log.debug("%s: remove <link href='%s'/>", dst,
                              link.get("href"))
                head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    # to online version of the docs.
    body = tree.find("body")
    # Snapshot the elements first, since tags are rewritten while looping.
    for elem in list(body.iter()):
        if elem.tag != "a":
            continue
        href = elem.get("href")
        if not href:
            continue
        parts = urlparse(href)
        if parts.scheme or parts.netloc:  # externals href
            continue
        if parts.path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            elem.tag = u"span"  # de-linkify

    # Write out massaged doc.
    walker = treewalkers.getTreeWalker("etree", ET)
    stream = walker(tree)
    content = ''.join(HTMLSerializer().serialize(stream))
    with open(dst, 'w') as f:
        f.write(content)