Exemple #1
0
def _serialize(domtree):
    """Render ``domtree`` to an HTML string.

    The tree is walked with html5lib's 'etree' tree walker and serialized
    with attribute values quoted, attributes in alphabetical order and
    optional tags kept.
    """
    etree_walker = html5lib.treewalkers.getTreeWalker('etree')
    token_stream = etree_walker(domtree)
    render_options = {
        'quote_attr_values': True,
        'alphabetical_attributes': True,
        'omit_optional_tags': False,
    }
    return HTMLSerializer(**render_options).render(token_stream)
Exemple #2
0
def _serialize(domtree):
    """Serialize an etree-style DOM tree back into HTML text.

    Attribute values are always quoted, attributes are emitted in
    alphabetical order, and optional tags are not omitted.
    """
    make_walker = html5lib.treewalkers.getTreeWalker('etree')
    tokens = make_walker(domtree)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=True,
        omit_optional_tags=False,
    )
    rendered = html_serializer.render(tokens)
    return rendered
Exemple #3
0
 def to_unicode(self):
     """Return the unicode serialization of this object's tree.

     The rendered markup is trimmed at both ends by a length derived from
     ``CONTAINER_TAG`` so the wrapping container is not part of the result.
     """
     # Leading trim: the tag name plus the two angle brackets.
     head_len = len(self.CONTAINER_TAG) + 2
     # Trailing trim is one character longer (presumably the "/" of the
     # closing tag -- confirm against the container markup).
     tail_len = head_len + 1
     tree_walker = getTreeWalker(self.TREEBUILDER)
     token_stream = tree_walker(self._root)
     html_serializer = HTMLSerializer(
         quote_attr_values=True,
         omit_optional_tags=False,
     )
     rendered = html_serializer.render(token_stream)
     return rendered[head_len:-tail_len]
Exemple #4
0
 def to_unicode(self):
     """Serialize the tree rooted at ``self._root`` and return it as unicode.

     A prefix and suffix sized from ``CONTAINER_TAG`` are cut off the
     rendered text before it is returned.
     """
     wrapper_len = len(self.CONTAINER_TAG) + 2  # tag name plus "<" and ">"
     make_walker = getTreeWalker(self.TREEBUILDER)
     tokens = make_walker(self._root)
     markup = HTMLSerializer(
         quote_attr_values=True, omit_optional_tags=False
     ).render(tokens)
     # The suffix cut is one character longer than the prefix cut.
     inner = slice(wrapper_len, -(wrapper_len + 1))
     return markup[inner]
Exemple #5
0
def strip_tags(html):
    """Parse ``html`` with the ``StripTags`` tokenizer and re-serialize it.

    Returns ``None`` when ``html`` is falsy (empty string, ``None``, ...);
    otherwise returns the text rendered by a default ``HTMLSerializer``.
    """
    if not html:
        return None

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(
        tree=dom_builder, tokenizer=StripTags
    ).parseFragment(html)
    dom_walker = treewalkers.getTreeWalker("dom")
    tokens = dom_walker(fragment)
    return HTMLSerializer().render(tokens)
Exemple #6
0
def sanitize(html, strip_whitespace=False):
    """
    Sanitize HTML so that only the readable top-level elements remain.

    ``html`` is parsed into an lxml tree using ``ReadableTokenizer``, walked
    with ``ReadableTreewalker`` and rendered by an ``HTMLSerializer`` that
    receives ``strip_whitespace`` unchanged.
    """
    lxml_builder = html5lib.treebuilders.getTreeBuilder("lxml")
    html_parser = html5lib.HTMLParser(
        tree=lxml_builder,
        tokenizer=ReadableTokenizer,
    )
    document = html_parser.parse(html)
    readable_stream = ReadableTreewalker(document)
    return HTMLSerializer(strip_whitespace=strip_whitespace).render(
        readable_stream
    )
def strip_styles(text):
    """Parse ``text`` with the ``StyleSanitizer`` tokenizer and re-serialize it.

    The fragment is walked with the 'simpletree' tree walker and rendered
    with quoted attribute values and optional tags kept.

    Returns:
        The serialized markup. If building or running the serializer raises
        ``AssertionError``, the result of ``domtree.toxml()`` is returned
        instead.
    """
    parser = html5lib.HTMLParser(tokenizer=StyleSanitizer)
    domtree = parser.parseFragment(text)
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(domtree)
    try:
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        return serializer.render(stream)
    except AssertionError:
        # The old ``except AssertionError, e`` form is a syntax error on
        # Python 3 and the bound exception was never used, so it is dropped.
        # Best-effort fallback: let the tree serialize itself.
        return domtree.toxml()
Exemple #8
0
def clean_nl(string):
    """
    Tidy the newlines around block-level elements of an HTML fragment so
    that nl2br can then be applied to the returned text.

    Falsy input is handed back unchanged; anything else is parsed, cleaned
    in place and serialized back to HTML.
    """
    if not string:
        return string

    # Namespaced tag names of the elements treated as block level.
    block_tags = (
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    )

    def tidy(element):
        # etree recap: an element carries ``text`` (content before its first
        # child), ``tail`` (content right after it, before the next sibling)
        # and children, e.g.
        # "<div>text <b>children's text</b> children's tail</div> tail".
        in_block = element.tag in block_tags

        if in_block:
            leading = element.text
            if leading:
                # Drop new lines at the start of the block's own text and,
                # when there are no children, at its end as well.
                leading = leading.lstrip('\n')
                if len(element) == 0:
                    leading = leading.rstrip('\n')
                element.text = leading

            # Only the first new line following a block element is removed.
            trailing = element.tail
            if trailing and trailing.startswith('\n'):
                element.tail = trailing[1:]

        for kid in element:
            # Inside a block, trailing new lines of each child's tail go
            # away before the child itself is processed.
            if in_block and kid.tail:
                kid.tail = kid.tail.rstrip('\n')
            tidy(kid)
        return element

    cleaned = tidy(html5lib.parseFragment(string))

    # Turn the cleaned tree back into markup.
    etree_walker = html5lib.treewalkers.getTreeWalker('etree')
    tokens = etree_walker(cleaned)
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    ).render(tokens)
Exemple #9
0
def clean_nl(string):
    """
    Clean up newlines next to block-level elements so that nl2br can
    properly be called on the result.

    A falsy ``string`` is returned as is. Otherwise the fragment is parsed,
    its block elements are trimmed in place, and the tree is serialized
    back to HTML.
    """
    if not string:
        return string

    xhtml_blocks = (
        "{http://www.w3.org/1999/xhtml}blockquote",
        "{http://www.w3.org/1999/xhtml}ol",
        "{http://www.w3.org/1999/xhtml}li",
        "{http://www.w3.org/1999/xhtml}ul",
    )

    def visit(elem):
        # An etree element has text (before its first child), a tail (after
        # the element, before its next sibling) and children. Eg:
        # "<div>text <b>children's text</b> children's tail</div> tail".
        if elem.tag not in xhtml_blocks:
            # Not a block element: leave it alone, just descend.
            for sub in elem:
                visit(sub)
            return elem

        if elem.text:
            if len(elem):
                # Children follow the text: trim only its leading new lines.
                elem.text = elem.text.lstrip("\n")
            else:
                # Childless block: trim new lines on both sides of the text.
                elem.text = elem.text.strip("\n")

        # Swallow a single new line directly after the block element.
        if elem.tail and elem.tail.startswith("\n"):
            elem.tail = elem.tail[1:]

        for sub in elem:
            # Trailing new lines of the children's tails are removed first,
            # then the child subtree is cleaned.
            if sub.tail:
                sub.tail = sub.tail.rstrip("\n")
            visit(sub)
        return elem

    root = visit(html5lib.parseFragment(string))

    # Serialize the cleaned tree back to html.
    build_stream = html5lib.treewalkers.getTreeWalker("etree")
    token_stream = build_stream(root)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    )
    return html_serializer.render(token_stream)
Exemple #10
0
def sanitize_html(html):
    """Sanitize an HTML fragment and return it re-serialized.

    Args:
        html: The markup to clean, as text or as bytes. Bytes are decoded
            as UTF-8 with undecodable sequences replaced.

    Returns:
        The fragment parsed with html5lib's ``HTMLSanitizer`` tokenizer and
        rendered with quoted attribute values, original attribute order and
        optional tags kept. Falsy input (``None``, ``''``, ``b''``) yields
        an empty unicode string without touching the parser.
    """
    # Guard first: there is nothing to parse, and ``None`` would otherwise
    # fail inside the parser.
    if not html:
        return u''
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
    tree = parser.parseFragment(html)
    serializer = HTMLSerializer(quote_attr_values=True, alphabetical_attributes=False, omit_optional_tags=False)
    stream = TreeWalker(tree)
    return serializer.render(stream)
Exemple #11
0
def sanitize_html(html):
    """Run ``html`` through html5lib's sanitizer and return the markup.

    Falsy input gives an empty unicode string. Bytes are decoded as UTF-8
    (invalid sequences replaced) before parsing. The fragment is built as
    an lxml tree and rendered with quoted attribute values, attributes in
    their original order and optional tags kept.
    """
    if not html:
        return u''

    markup = html
    if isinstance(markup, bytes):
        markup = markup.decode('utf-8', 'replace')

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    fragment = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=TreeBuilder,
    ).parseFragment(markup)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False,
    )
    return html_serializer.render(TreeWalker(fragment))
Exemple #12
0
def clean_nl(string):
    """
    Clean up newlines around block-level elements so that nl2br can properly
    be called on the cleaned text.

    A falsy ``string`` is returned unchanged. Otherwise the fragment is
    parsed, its text nodes are trimmed in place, and the tree is serialized
    back to HTML. An empty string is returned when the parsed fragment has
    no child nodes.
    """

    # Names of the elements treated as block level.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        # Name of the previously visited sibling; lets a text node know
        # whether it directly follows a block-level element.
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Recurse into the child first, then insert the processed
                # node before itself and remove it again.
                # NOTE(review): this mutates ``childNodes`` while it is being
                # enumerated; presumably the pair leaves the list unchanged
                # overall -- confirm against the tree implementation.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Serialize the cleaned tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Exemple #13
0
def clean_nl(string):
    """
    Normalize newlines in an HTML fragment so that nl2br can properly be
    called on the cleaned text.

    Returns ``string`` itself when it is falsy, an empty string when the
    parsed fragment ends up with no child nodes, and otherwise the
    re-serialized HTML.
    """

    # Elements whose surrounding new lines get trimmed.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        # ``prev_tag`` holds the name of the sibling handled in the previous
        # iteration ('' before the first child).
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                # Write the trimmed text back to the node at this position.
                tree.childNodes[i].value = value
            else:
                # Non-text node: process its subtree, then re-insert it
                # before itself and remove it.
                # NOTE(review): order matters here because ``childNodes`` is
                # being iterated; verify the net effect on the child list
                # against the tree API in use.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Walk the cleaned tree and render it with quoted attribute values and
    # optional tags kept.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Exemple #14
0
def sanitize_html_fragment(fragment):
    """
    Sanitize an HTML ``fragment`` and return the cleaned-up copy.

    A falsy ``fragment`` is returned unchanged. Anything else is parsed with
    html5lib's ``HTMLSanitizer`` tokenizer and rendered with quoted
    attribute values and optional tags kept.
    """
    if not fragment:
        return fragment

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer

    tree = html5lib.HTMLParser(tokenizer=HTMLSanitizer).parseFragment(fragment)
    etree_walker = html5lib.treewalkers.getTreeWalker('etree')
    tokens = etree_walker(tree)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    )
    return html_serializer.render(tokens)
Exemple #15
0
def clean_nl(string):
    """
    Clean up newlines around block-level elements so that nl2br can properly
    be called on the cleaned text.

    A falsy ``string`` is returned unchanged; otherwise the fragment is
    parsed, its text nodes are trimmed in place, and the resulting tree is
    serialized back to HTML.
    """

    # Names of the elements treated as block level.
    html_blocks = ["blockquote", "ol", "li", "ul"]

    if not string:
        return string

    def parse_html(tree):
        # Name of the sibling visited just before the current node ("" for
        # the first child).
        prev_tag = ""
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip("\n")

                # Remove the first new line after a block level element.
                if prev_tag in html_blocks and value.startswith("\n"):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Clean the child's subtree, then insert the processed node
                # before itself and remove it.
                # NOTE(review): ``childNodes`` is modified during iteration;
                # presumably the two calls cancel out on the list -- confirm
                # against the tree implementation.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the cleaned tree back to html.
    walker = html5lib.treewalkers.getTreeWalker("simpletree")
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(stream)
Exemple #16
0
def _serialize(domtree):
    """Render ``domtree`` to HTML text using the "simpletree" tree walker.

    Attribute values are quoted and optional tags are not omitted.
    """
    simpletree_walker = html5lib.treewalkers.getTreeWalker("simpletree")
    tokens = simpletree_walker(domtree)
    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    )
    return html_serializer.render(tokens)