Beispiel #1
0
    def clean(self, value, model_instance):
        """
        Validate ``value`` and return it as sanitized HTML markup.

        The value is first passed through the parent field's ``clean()``;
        any error raised there (e.g. ValidationError) propagates to the
        caller. The result is parsed as an HTML fragment with
        ``HTMLSanitizer`` as tokenizer, walked as a DOM tree and serialized
        back to a string with optional tags kept.

        When ``self.use_imageproxy`` is true, the token stream is wrapped in
        ``ImageProxyFilter`` with a ``Proxy`` built for the user whose
        primary key is read from ``model_instance`` via ``self.user_field``.
        """
        value = super(HTMLField, self).clean(value, model_instance)

        tree_builder = treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=tree_builder)
        dom_tree = parser.parseFragment(value)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)

        if self.use_imageproxy:
            # Imported here rather than at module level, so the module is
            # only loaded when the image proxy is actually enabled.
            from imageproxy import Proxy
            user = User.objects.get(pk=getattr(model_instance, self.user_field))
            stream = ImageProxyFilter(stream, Proxy(user))

        serializer = HTMLSerializer(omit_optional_tags=False)
        # Join the serialized chunks in one pass instead of growing a string
        # with repeated ``+=``.
        return ''.join(serializer.serialize(stream))
Beispiel #2
0
def _serialize(domtree):
    """Render ``domtree`` to an HTML string through the 'etree' tree walker.

    The serializer is configured to quote attribute values, order
    attributes alphabetically and keep optional tags.
    """
    make_walker = html5lib.treewalkers.getTreeWalker('etree')
    token_stream = make_walker(domtree)
    options = {
        'quote_attr_values': True,
        'alphabetical_attributes': True,
        'omit_optional_tags': False,
    }
    return HTMLSerializer(**options).render(token_stream)
Beispiel #3
0
 def to_unicode(self):
     """Render the tree rooted at ``self._root`` as unicode markup.

     The serialized output is sliced to drop ``len(CONTAINER_TAG) + 2``
     characters from the front and one more than that from the back --
     presumably the wrapping container element's start and end tags.
     """
     # "<TAG>" is the tag name plus two brackets; "</TAG>" has one extra "/".
     wrapper_width = 2 + len(self.CONTAINER_TAG)
     token_stream = getTreeWalker(self.TREEBUILDER)(self._root)
     html_serializer = HTMLSerializer(omit_optional_tags=False,
                                      quote_attr_values=True)
     markup = html_serializer.render(token_stream)
     return markup[wrapper_width:-(wrapper_width + 1)]
Beispiel #4
0
def _serialize(domtree):
    """Serialize ``domtree`` back to HTML text.

    The tree is walked with html5lib's 'etree' tree walker and rendered
    with quoted attribute values, alphabetically ordered attributes and no
    omission of optional tags.
    """
    tokens = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    html_serializer = HTMLSerializer(
        alphabetical_attributes=True,
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return html_serializer.render(tokens)
Beispiel #5
0
 def to_unicode(self):
     """Serialize ``self._root`` and return the markup without its wrapper.

     ``len(CONTAINER_TAG) + 2`` characters are cut from the start of the
     rendered string and one character more from its end.
     """
     opening_len = len(self.CONTAINER_TAG) + 2  # tag name plus "<" and ">"
     closing_len = opening_len + 1
     make_walker = getTreeWalker(self.TREEBUILDER)
     tokens = make_walker(self._root)
     rendered = HTMLSerializer(
         quote_attr_values=True,
         omit_optional_tags=False,
     ).render(tokens)
     return rendered[opening_len:-closing_len]
Beispiel #6
0
def strip_tags(html):
  """Re-serialize ``html`` after parsing it with the ``StripTags`` tokenizer.

  Falsy input (``None``, empty string, ...) yields ``None``.
  """
  if not html:
    return None

  dom_builder = treebuilders.getTreeBuilder("dom")
  fragment = html5lib.HTMLParser(tree=dom_builder,
                                 tokenizer=StripTags).parseFragment(html)
  token_stream = treewalkers.getTreeWalker("dom")(fragment)
  html_serializer = HTMLSerializer()
  return html_serializer.render(token_stream)
Beispiel #7
0
def sanitize(html, strip_whitespace=False):
    """
    Reduce ``html`` to its readable top-level elements and return the markup.

    The document is parsed into an lxml tree with ``ReadableTokenizer``,
    walked with ``ReadableTreewalker`` and rendered by ``HTMLSerializer``;
    ``strip_whitespace`` is forwarded to the serializer.
    """
    lxml_builder = html5lib.treebuilders.getTreeBuilder("lxml")
    document = html5lib.HTMLParser(
        tree=lxml_builder,
        tokenizer=ReadableTokenizer,
    ).parse(html)
    readable_walker = ReadableTreewalker(document)
    html_serializer = HTMLSerializer(strip_whitespace=strip_whitespace)
    return html_serializer.render(readable_walker)
def strip_styles(text):
    """Re-serialize ``text`` after tokenizing it with ``StyleSanitizer``.

    The fragment is parsed with an ``HTMLParser`` that is given no explicit
    tree builder, walked with the 'simpletree' tree walker and rendered with
    quoted attribute values and optional tags kept.

    If building the serializer or rendering raises ``AssertionError``, the
    parsed tree's own ``toxml()`` output is returned instead.
    """
    parser = html5lib.HTMLParser(tokenizer=StyleSanitizer)
    domtree = parser.parseFragment(text)
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(domtree)
    try:
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        return serializer.render(stream)
    except AssertionError:
        # ``except X, e`` is Python 2-only syntax and the bound exception
        # was never used; this form is valid on both Python 2 and 3.
        return domtree.toxml()
Beispiel #9
0
def clean_nl(string):
    """
    Normalize newlines around block-level elements of an HTML fragment.

    Newlines directly inside ``blockquote``, ``ol``, ``li`` and ``ul``
    elements, and the first newline right after such an element, are
    removed so that nl2br can be applied to the result. Falsy input is
    returned unchanged.
    """

    block_tags = [
        "{http://www.w3.org/1999/xhtml}blockquote",
        "{http://www.w3.org/1999/xhtml}ol",
        "{http://www.w3.org/1999/xhtml}li",
        "{http://www.w3.org/1999/xhtml}ul",
    ]

    if not string:
        return string

    def strip_block_newlines(element):
        # etree keeps character data in two places: ``text`` (before the
        # first child) and ``tail`` (after the element, before its next
        # sibling). Eg: "<div>text <b>child text</b> child tail</div> tail".
        is_block = element.tag in block_tags

        if is_block:
            text = element.text
            if text:
                # Leading newlines always go; trailing ones only when the
                # text is the element's whole content (no children). With
                # children, the trailing side is handled via their tails.
                text = text.lstrip("\n")
                if len(element) == 0:
                    text = text.rstrip("\n")
                element.text = text

            # Drop a single newline that directly follows the block.
            tail = element.tail
            if tail and tail.startswith("\n"):
                element.tail = tail[1:]

        for child in element:
            # Inside a block, trailing newlines of each child's tail go.
            if is_block and child.tail:
                child.tail = child.tail.rstrip("\n")
            strip_block_newlines(child)
        return element

    fragment = strip_block_newlines(html5lib.parseFragment(string))

    # Turn the cleaned tree back into markup.
    token_stream = html5lib.treewalkers.getTreeWalker("etree")(fragment)
    html_serializer = HTMLSerializer(omit_optional_tags=False,
                                     quote_attr_values=True)
    return html_serializer.render(token_stream)
Beispiel #10
0
def clean_nl(string):
    """
    Tidy newlines in an HTML fragment so nl2br can be run on the result.

    For ``blockquote``, ``ol``, ``li`` and ``ul`` elements the newlines
    directly inside the element and the first newline following it are
    stripped. A falsy ``string`` is handed back untouched.
    """

    block_level = [
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol', '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul'
    ]

    if not string:
        return string

    def _tidy(node):
        # An etree node carries ``text`` (content before its first child),
        # ``tail`` (content after the node, before the next sibling) and
        # child nodes.
        inside_block = node.tag in block_level

        if inside_block and node.text:
            without_leading = node.text.lstrip('\n')
            # Only a childless block ends with its own text; otherwise the
            # trailing newlines live in the children's tails (see loop).
            if len(node):
                node.text = without_leading
            else:
                node.text = without_leading.rstrip('\n')

        # One newline right after a block element is dropped.
        if inside_block and node.tail and node.tail.startswith('\n'):
            node.tail = node.tail[1:]

        for kid in node:
            if inside_block and kid.tail:
                kid.tail = kid.tail.rstrip('\n')
            _tidy(kid)
        return node

    cleaned_tree = _tidy(html5lib.parseFragment(string))

    # Render the tree back to an HTML string.
    make_walker = html5lib.treewalkers.getTreeWalker('etree')
    tokens = make_walker(cleaned_tree)
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    ).render(tokens)
Beispiel #11
0
def sanitize_html(html):
    """Sanitize ``html`` with html5lib and return the serialized fragment.

    Byte strings are first decoded as UTF-8 with invalid sequences
    replaced. The fragment is parsed into an lxml tree using
    ``HTMLSanitizer`` as tokenizer and rendered with quoted attribute
    values, attributes left unsorted and optional tags kept.
    """
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    sanitizing_parser = html5lib.HTMLParser(tree=TreeBuilder,
                                            tokenizer=HTMLSanitizer)
    fragment = sanitizing_parser.parseFragment(html)
    serializer_options = dict(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False,
    )
    html_serializer = HTMLSerializer(**serializer_options)
    return html_serializer.render(TreeWalker(fragment))
Beispiel #12
0
def sanitize_html(html):
    """Return a sanitized serialization of the HTML fragment ``html``.

    Falsy input yields an empty unicode string. Byte strings are decoded as
    UTF-8 (undecodable bytes replaced) before being parsed into an lxml
    tree with ``HTMLSanitizer`` as the tokenizer.
    """
    if not html:
        return u''
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    parsed = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=TreeBuilder,
    ).parseFragment(html)
    renderer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False,
    )
    tokens = TreeWalker(parsed)
    return renderer.render(tokens)
Beispiel #13
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    ``string`` is parsed as an HTML fragment. Text nodes whose parent is a
    ``blockquote``, ``ol``, ``li`` or ``ul`` have their leading and trailing
    newlines stripped, and a text node that directly follows such an
    element loses one leading newline. The tree is then serialized back to
    markup.

    Falsy input is returned unchanged; an empty string is returned when the
    parsed fragment has no child nodes.
    """

    # Tag names that are treated as block-level elements below.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        """Rewrite the text nodes below ``tree`` in place; return ``tree``."""
        # Name of the previous sibling; '' while on the first child.
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Non-text node: process its subtree, then insert the
                # returned node before ``node`` and remove ``node``.
                # NOTE(review): parse_html returns the object it was given,
                # so this pair looks like it leaves the child list (which
                # is being enumerated) unchanged -- confirm against the
                # tree's insertBefore/removeChild semantics before
                # simplifying.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Serialize the processed tree back to HTML, quoting attribute values
    # and keeping optional tags.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Beispiel #14
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    The input is parsed with ``html5lib.parseFragment``; newlines are
    stripped from text nodes directly inside ``blockquote``, ``ol``, ``li``
    and ``ul`` elements, one leading newline is dropped from a text node
    that immediately follows one of those elements, and the result is
    rendered back to a string.

    Returns ``string`` itself when it is falsy, and ``''`` when the parsed
    fragment ends up with no child nodes.
    """

    # Names of the elements handled as block-level.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        """Clean the text nodes under ``tree`` recursively; return ``tree``."""
        # Tracks the name of the sibling visited just before the current
        # node; starts empty so the first child never counts as "after a
        # block".
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Recurse into the element, then re-insert the result
                # before ``node`` and remove ``node``.
                # NOTE(review): since parse_html hands back its argument,
                # the same object is inserted and removed; this mutates
                # ``tree.childNodes`` during enumeration and its net effect
                # depends on the tree implementation -- verify before
                # refactoring.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Walk the cleaned tree and render it with quoted attribute values and
    # without omitting optional tags.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Beispiel #15
0
def sanitize_html_fragment(fragment):
    """
    Return a sanitized copy of the HTML ``fragment``.

    The fragment is parsed with ``HTMLSanitizer`` as tokenizer and rendered
    back with quoted attribute values and optional tags kept. A falsy
    ``fragment`` is returned as is, without importing html5lib.
    """
    if not fragment:
        return fragment

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer

    tree = html5lib.HTMLParser(tokenizer=HTMLSanitizer).parseFragment(fragment)
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(tree)
    html_serializer = HTMLSerializer(omit_optional_tags=False,
                                     quote_attr_values=True)
    return html_serializer.render(token_stream)
Beispiel #16
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    ``string`` is parsed as an HTML fragment. Text nodes whose parent is a
    ``blockquote``, ``ol``, ``li`` or ``ul`` are stripped of leading and
    trailing newlines, a text node directly after such an element loses one
    leading newline, and the tree is serialized back to markup.

    Falsy input is returned unchanged.
    """

    # Tag names that count as block-level elements for this cleanup.
    html_blocks = ["blockquote", "ol", "li", "ul"]

    if not string:
        return string

    def parse_html(tree):
        """Process the children of ``tree`` in place and return ``tree``."""
        # Name of the previously visited sibling ("" for the first child).
        prev_tag = ""
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip("\n")

                # Remove the first new line after a block level element.
                if prev_tag in html_blocks and value.startswith("\n"):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Element (non-text) node: recurse, then insert the returned
                # node before ``node`` and remove ``node``.
                # NOTE(review): parse_html returns its own argument, so the
                # inserted and removed node are the same object and the
                # child list is modified while being enumerated -- confirm
                # the intended effect against the tree implementation.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Render the processed tree, quoting attribute values and keeping
    # optional tags.
    walker = html5lib.treewalkers.getTreeWalker("simpletree")
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(stream)
Beispiel #17
0
def _serialize(domtree):
    """Render ``domtree`` to HTML text through the "simpletree" tree walker.

    Attribute values are quoted and optional tags are not omitted.
    """
    token_stream = html5lib.treewalkers.getTreeWalker("simpletree")(domtree)
    html_serializer = HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return html_serializer.render(token_stream)
Beispiel #18
0
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src` HTML path to `dst`, making it independent.

    - favicon links are removed
    - CSS references are updated (if `css_dir` is given), else removed.
    - Relative links are de-linkified.

    `src` is the path of the HTML file to read and `dst` the path that is
    written. When `css_dir` is given, every stylesheet href is replaced by
    `css_dir` joined with the basename of the original href. `log`, when
    given, receives progress messages through its `info` and `debug`
    methods. Returns None.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    head = tree.find("head")
    # Iterate over a snapshot: links are removed from `head` inside the loop.
    # (`list(head)` replaces `getchildren()`, which is gone in Python 3.9+.)
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
        # `elif`: a link already removed as a favicon must not be removed a
        # second time (that would raise ValueError).
        elif "stylesheet" in rel:  # this is a css ref
            if css_dir:  # update the css dir
                href = posixpath.join(css_dir,
                                      posixpath.basename(link.get("href")))
                link.set("href", href)
                if log:
                    log.debug("%s: update to <link href='%s'/>", dst, href)
            else:
                if log:
                    log.debug("%s: remove <link href='%s'/>", dst,
                              link.get("href"))
                head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    # to online version of the docs.
    body = tree.find("body")
    # `iter()` replaces `getiterator()`, which is gone in Python 3.9+.
    for elem in body.iter():
        if elem.tag != "a":
            continue
        href = elem.get("href")
        if not href:
            continue
        scheme, netloc, path = urlparse(href)[:3]
        if scheme or netloc:  # externals href
            continue
        if path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            elem.tag = u"span"  # de-linkify

    # Write out massaged doc. The content is fully serialized before `dst`
    # is opened, so a serialization error does not truncate an existing file.
    walker = treewalkers.getTreeWalker("etree", ET)
    stream = walker(tree)
    content = ''.join(HTMLSerializer().serialize(stream))
    with open(dst, 'w') as out:
        out.write(content)