Example #1
def parse_text(text):
	t1 = time.clock()
	parser = html5lib.HTMLParser(
			tree=treebuilders.getTreeBuilder('etree'),
			tokenizer=MySanitiser)
	t2 = time.clock()

	text = text.replace('\r', '')
	text = text.replace('\n', '<br>')
	t3 = time.clock()

	for search,replace in SMILEY_REPLACEMENTS:
		text = text.replace(search, replace)

	for regex,replace in BBCODE_REGEXES:
		text = regex.sub(replace, text)

	for search,replace in BBCODE_REPLACEMENTS:
		text = text.replace(search, replace)

	t4 = time.clock()
	doc = parser.parse(text)
	t5 = time.clock()

	walker = treewalkers.getTreeWalker('etree')
	stream = walker(doc)
	s = serializer.htmlserializer.HTMLSerializer()
	output_generator = s.serialize(stream)
	t6 = time.clock()

	done = Markup(''.join(list(output_generator)))
	t7 = time.clock()
	print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
	return done
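A portability note: time.clock() was removed in Python 3.8. On a modern interpreter the same stage timings would use time.perf_counter(); a minimal sketch of the substitution:

import time

t1 = time.perf_counter()  # replaces time.clock()
# ... stage being measured ...
t2 = time.perf_counter()
print('Stage: %f' % (t2 - t1))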
Example #2
 def filter_response(self, response, encoding=None):
     """
     Filter and fix-up the response object.
     """
     # Parse the response
     tree_type = settings.TREE_TYPE
     # Here we check for a TemplateResponse in the case we're being
     # used as a view decorator.
     if hasattr(response, 'render') and callable(response.render):
         response.render()
     tree = html5parser.parse(
         response.content, treebuilder=tree_type, encoding=encoding
     )
     # Build the serializer
     walker = treewalkers.getTreeWalker(tree_type)
     stream = walker(tree)
     options = self.get_serializer_options()
     serializer = htmlserializer.HTMLSerializer(**options)
     output = serializer.render(stream)
     output = output.encode(encoding)
     # Fix up the response
     response.content = output
     response['Content-Length'] = str(len(output))
     # Add a flag to prevent further filtering if the decorator is already
     # used on this response.
     setattr(response, settings.FILTERED_FLAG, True)
     return response
Example #3
    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of:
            (<url>, <mimetype>)

        If no favicon can be found, returns:
            (None, None)
        """
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = p.parse(html)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        fetch_url = None
        mimetype = None
        icon = False
        found_token = None
        for token in stream:
            if 'name' in token:
                if token['name'] == 'link':
                    for attr in token['data']:
                        if attr[0] == 'rel':
                            if 'shortcut icon' in attr[1].lower():
                                found_token = token
                                icon = True
                        elif attr[0] == 'href':
                            fetch_url = attr[1]
                        elif attr[0] == 'type':
                            mimetype = attr[1]
                    if fetch_url and icon:
                        if not mimetype:
                            mimetype = "image/x-icon"
                        if mimetype in self.favicon_mimetypes:
                            return (fetch_url, mimetype)
        return (None, None)
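For reference, a sketch of the StartTag token shape this attribute loop assumes. Pre-1.0 html5lib tree walkers emitted 'data' as a list of (name, value) pairs, which is what the attr[0]/attr[1] indexing relies on; html5lib 1.0+ instead uses a dict keyed by (namespace, name):

# Hypothetical token from an older html5lib "dom" tree walker:
# {'type': 'StartTag', 'name': 'link',
#  'data': [('rel', 'shortcut icon'), ('href', '/favicon.ico'),
#           ('type', 'image/x-icon')]}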
Example #4
def html2list(payload):

    """This function reads a block of HTML and returns a cleaned list.
    :param payload: The HTML string to read.
    :type payload: str
    :returns: list -- The parsed output as a list of strings.
    """

    cleaned_output = []
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('lxml'),
                            tokenizer=sanitizer.HTMLSanitizer)
    s = serializer.htmlserializer.HTMLSerializer(strip_whitespace=True,
                                                 omit_optional_tags=True)
    r = treewalkers.getTreeWalker('lxml')(p.parse(payload))
    for item in r.tree.elementtree.getiterator():
        if item.getparent() is not None:
            if (item.getparent().tag.split('}')[-1] == 'html'):
                item.text = ''
        else: item.text = ''
        for k in item.attrib:
            del item.attrib[k]
        if type(item.text) is str:
            for c in P['R']:
                item.text = re.sub(c,'',item.text)
    for tag in s.serialize(r):
        if not re.match("""(?:<|&lt;)/?\w+((\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?(?:>|&gt;)?""",tag):
            tag = tag.encode('ascii', 'ignore')
            split_tag = map(lambda x: x.strip(), re.split('[|,;]|(?:=2C|=3B)',tag.replace('&amp;','&')))
            for t in split_tag:
                for e in P['E']:
                    if t == e:
                        split_tag.remove(t)
            if split_tag:
                cleaned_output += split_tag
    return cleaned_output
Example #5
  def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2', 'h3', 'h4']:
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1]
                }
        elif element['type'] == 'Characters' and current is not None:
          current['text'] = element['data']
        elif element['type'] == 'EndTag' and current is not None:
          toc.append(current)
          current = None
      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
Example #6
def test_to_sax():
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
            'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #7
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        if not hasattr(document, '__getitem__'): document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
        sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        if not kwargs['quote_char']: del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
Example #8
def _normalize(html):
    """
    Normalize the given string of HTML, collapsing whitespace.
    """

    # This is taken from the "Serialization of Streams" section of
    # http://code.google.com/p/html5lib/wiki/UserDocumentation.
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    output_generator = s.serialize(stream)

    # TODO: We're not actually collapsing *all* whitespace; only
    # entire chunks of whitespace that the serializer gives us. Currently,
    # this seems "good enough" to pass our unit tests, which are
    # based on use cases of comparing pre-sanitized HTML to sanitized HTML,
    # but we may need to change this in the future.
    parts = []
    last_item_was_whitespace = False
    for item in output_generator:
        # Is it empty whitespace?
        if item.strip() != '':
            parts.append(item)
            last_item_was_whitespace = False
        elif not last_item_was_whitespace:
            # Collapse whitespace.
            parts.append(' ')
            last_item_was_whitespace = True
    return ''.join(parts)
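A usage sketch: because the input is parsed as a full document and serialized with omit_optional_tags=False, the result carries the html/head/body wrapper (output approximate and version-dependent):

# _normalize(u'<b>a</b>   \n   <b>b</b>')
# -> u'<html><head></head><body><b>a</b> <b>b</b></body></html>'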
Example #9
 def get_toc(self, path):
   toc = memcache.get('toc|%s' % path)
   if toc is None or not self.request.cache:
     template_text = webapp.template.render(path, {})
     parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
     dom_tree = parser.parse(template_text)
     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     toc = []
     current = None
     for element in stream:
       if element['type'] == 'StartTag':
         if element['name'] in ['h2', 'h3', 'h4']:
           for attr in element['data']:
             if attr[0] == 'id':
               current = {
                 'level' : int(element['name'][-1:]) - 1,
                 'id' : attr[1]
               }
       elif element['type'] == 'Characters' and current is not None:
         current['text'] = element['data']
       elif element['type'] == 'EndTag' and current is not None:
         toc.append(current)
         current = None
     memcache.set('toc|%s' % path, toc, 3600)
   return toc
Example #10
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    html_parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                      tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    output = s.render(stream, 'utf-8')

    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(r'&lt;toberemoved.*?&gt;.*?&lt;/toberemoved&gt;',
                             output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        matches = re.findall(r'&lt;/toberemoved&gt;', output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        matches = re.findall(r'&lt;toberemoved.*?&gt;', output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        if output == oldoutput:
            break

    return output
Example #11
def sanitize(string, html_type):
    """
    >>> sanitize("\\t<p>a paragraph</p>","html")
    u'\\t<p>a paragraph</p>'

    >>> sanitize("\\t<script>alert('evil script');</script>", "xhtml")
    u"\\t&lt;script&gt;alert('evil script');&lt;/script&gt;"

    """
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers, treebuilders
    except ImportError:
        raise Exception("html5lib not available")

    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    tree = p.parseFragment(string)

    walker = treewalkers.getTreeWalker("simpletree")
    stream = walker(tree)

    if html_type == 'xhtml':
        s = serializer.xhtmlserializer.XHTMLSerializer()
    else:
        s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                     quote_attr_values=True)
    return s.render(stream)
Example #12
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example #14
 def to_unicode(self):
     """Return the unicode serialization of myself."""
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
     return serializer.render(stream)[container_len : -container_len - 1]
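The slice arithmetic is easiest to see with a concrete container tag; a worked example assuming CONTAINER_TAG = 'div', so container_len is 5:

# serializer.render(stream)  ->  u'<div><b>hi</b></div>'
# [5 : -6] strips the 5-char '<div>' prefix and the 6-char '</div>' suffix:
# u'<b>hi</b>'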
Example #15
    def get_toc(self, path):
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search('/tutorials', path) or re.search('/mobile', path)):
            return ''

        toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None
            for element in stream:
                if element['type'] == 'StartTag':
                    if element['name'] in ['h2', 'h3', 'h4']:
                        for attr in element['data']:
                            if attr[0] == 'id':
                                current = {
                                    'level': int(element['name'][-1:]) - 1,
                                    'id': attr[1]
                                }
                elif element['type'] == 'Characters' and current is not None:
                    current['text'] = element['data']
                elif element['type'] == 'EndTag' and current is not None:
                    toc.append(current)
                    current = None
            memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                         toc, 3600)

        return toc
Example #17
 def __str__(self):
     """Return the unicode serialization of myself."""
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     serializer = HTMLSerializer(quote_attr_values='always', omit_optional_tags=False)
     return serializer.render(stream)[container_len : -container_len - 1]
Example #18
	def search(self, term):
		# define link for search
		searchUrl = self.baseUrl + r"/sc/search?&must=" + term + r"&Type=Music&Type=&inandout=true&SRI=true&ND=-1"
		

		print(" --> searching on chemical for " + term)
		print(" --> with " + searchUrl)
		source = getWebAsStr(searchUrl)

		# create a parser; the "dom" tree builder is backed by minidom
		p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
		dom_tree = p.parse(source)
		walker = treewalkers.getTreeWalker("dom")
		stream = walker(dom_tree)

		# now we can send the stream to our fetcher functions
		# find links on search result page
		l_hitLinks = self.fetch_HitLinks(stream)

		# find short info
		l_shortInfo = self.fetch_ShortInfo(stream)

		# create an two dimensional list
		results = []

		for link, info in zip(l_hitLinks, l_shortInfo):
			results.append([link, info]) 

		return results
Example #19
    def get_toc(self, path):
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search("/tutorials", path) or re.search("/mobile", path)):
            return ""

        toc = memcache.get("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None
            for element in stream:
                if element["type"] == "StartTag":
                    if element["name"] in ["h2", "h3", "h4"]:
                        for attr in element["data"]:
                            if attr[0] == "id":
                                current = {"level": int(element["name"][-1:]) - 1, "id": attr[1]}
                elif element["type"] == "Characters" and current is not None:
                    current["text"] = element["data"]
                elif element["type"] == "EndTag" and current is not None:
                    toc.append(current)
                    current = None
            memcache.set("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

        return toc
Example #20
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    html_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"), tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    output = s.render(stream, 'utf-8')

    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(
            r'&lt;toberemoved.*?&gt;.*?&lt;/toberemoved&gt;', output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        matches = re.findall(r'&lt;/toberemoved&gt;', output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        matches = re.findall(r'&lt;toberemoved.*?&gt;', output, re.DOTALL)
        for match in matches:
            output = output.replace(match, '')
        if output == oldoutput:
            break

    return output
Example #21
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        if not hasattr(document, '__getitem__'): document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
        sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        if not kwargs['quote_char']: del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
Example #22
def hmtl2text(html):
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    in_script = False
    outbuf = []
    current_line = []
    for token in stream:
        token_name = token.get('name', "").lower()

        if token_name in ['script', 'style', 'noscript']:
            in_script = token.get('type', None) == 'StartTag'
        if in_script:
            continue

        if token_name in block_level_elements or token_name == "br":
            if current_line:
                outbuf.append(u"".join(current_line))
                current_line = []

        if token.get(u'type', None) == u'Characters':
            current_line.append(token['data'])
        if token.get(u'type', None) == u'SpaceCharacters':
            if current_line and current_line[-1] != u" ":
                current_line.append(u" ")

    if current_line:
        outbuf.append(u"".join(current_line))
    return clean_whitespace("\n".join(outbuf))
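block_level_elements and clean_whitespace come from elsewhere in the source project; a plausible minimal stand-in for each (hypothetical, for illustration only):

block_level_elements = {u'p', u'div', u'li', u'ul', u'ol', u'table', u'tr',
                        u'td', u'th', u'blockquote', u'pre',
                        u'h1', u'h2', u'h3', u'h4', u'h5', u'h6'}

def clean_whitespace(text):
    # Trim each line and drop blank lines.
    return u'\n'.join(line.strip() for line in text.splitlines() if line.strip())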
Example #23
    def clean(self, value, model_instance):
        """
        Validates the given value using the provided HTMLCleaner
        and returns its "cleaned" value as a Python object.

        Raises ValidationError for any errors.
        """
        value = super(HTMLField, self).clean(value, model_instance)

        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parseFragment(value)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)

        if self.use_imageproxy:
            from imageproxy import Proxy
            user = User.objects.get(pk=getattr(model_instance, self.user_field))
            proxy = Proxy(user)
            stream = ImageProxyFilter(stream, proxy)

        s = HTMLSerializer(omit_optional_tags=False)
        output_generator = s.serialize(stream)

        clean_value = ''.join(output_generator)

        return clean_value
Example #25
def test_lxml_xml():
    expected = [{
        'data': {},
        'name': 'div',
        'namespace': None,
        'type': 'StartTag'
    }, {
        'data': {},
        'name': 'div',
        'namespace': None,
        'type': 'StartTag'
    }, {
        'name': 'div',
        'namespace': None,
        'type': 'EndTag'
    }, {
        'name': 'div',
        'namespace': None,
        'type': 'EndTag'
    }]

    lxmltree = lxml.etree.fromstring('<div><div></div></div>')
    walker = treewalkers.getTreeWalker('lxml')
    output = Lint(walker(lxmltree))

    assert list(output) == expected
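The Lint filter only validates tokens as they pass through, so the same walker output can be handed straight to a serializer; a sketch, assuming from html5lib import serializer:

s = serializer.HTMLSerializer(omit_optional_tags=False)
assert s.render(walker(lxmltree)) == '<div><div></div></div>'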
Example #26
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class
    attributes.
    
    For example, the HTML might contain something like:
        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>
    If there are no filters, then the HTML is not changed. If the filters
    include "ide" but not "edit", then the ide div remains and the
    edit div is removed.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    for elem in tree.getiterator():
        indeces_to_drop = []
        for i, child in enumerate(elem.getchildren()):
            if _should_drop_elem(child, filters, "class", "app-"):
                indeces_to_drop.insert(0, i)
                filtered = True
                if log:
                    tag_str = "<%s" % child.tag
                    if child.attrib:
                        for n, v in child.attrib.items():
                            tag_str += ' %s="%s"' % (n, v)
                    tag_str += ">"
                    if len(tag_str) > 50:
                        tag_str = tag_str[:47] + '...'
                    log("... filter out %s", tag_str)
        for idx in indeces_to_drop:
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        s = HTMLSerializer()
        outputter = s.serialize(stream)
        content = ''.join(list(outputter))
        f = open(path, 'w')
        f.write("""<!DOCTYPE html>
""")
        try:
            f.write(content)
        finally:
            f.close()
Example #27
def SearchMovie(title, year):
    r = requests.post(DOMAIN_NAME + "/subtitles/searchbytitle", data={"query": title, "l": ""})
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(r.text)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    return SearchTitleMatch(stream)
Example #28
def run_sanitizer(html, sanitizer):
    parser = html5lib.HTMLParser(tokenizer=sanitizer, tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    result = s.serialize(stream)
    return u"".join(result)
Example #29
  def sanitize_string(self, user_input):
    p = html5lib.HTMLParser(tokenizer=CommonsHTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(user_input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    return u"".join(s.serialize(stream))
Example #30
def cleanup_html(html):
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    result = s.render(stream)
    return u"".join(result)
Example #31
def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []
    bintokens = []

    waitfor = None

    for tok in walker(doc):

        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)

        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})

        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass

        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    cumbintokens = [bintokens[0]]

    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    length = len(cumbintokens)

    # Find the window [i, j] that maximizes: tag tokens before i, plus tag
    # tokens after j, plus text tokens between i and j. In other words, the
    # densest run of body text, with markup pushed to the outside.
    midx = None
    m = None

    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between

            if not midx or nm > m:
                midx = i, j
                m = nm

    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
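serialize_tokens is defined elsewhere in the project; a plausible stand-in that re-serializes the selected token window (hypothetical):

def serialize_tokens(tokens):
    from html5lib import serializer
    s = serializer.HTMLSerializer(omit_optional_tags=False)
    # The serializer accepts any iterable of walker-style tokens.
    return u''.join(s.serialize(iter(tokens)))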
Example #32
    def get_toc(self, path):

        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search('/tutorials', path) or re.search('/mobile', path)
                or re.search('style-guide', path)):
            return ''

        toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None
            innerTagCount = 0
            for element in stream:
                if element['type'] == 'StartTag':
                    if element['name'] in ['h2']:
                        for attr in element['data']:
                            if attr[0] == 'id':
                                current = {
                                    'level': int(element['name'][-1:]) - 1,
                                    'id': attr[1],
                                    'text': ''
                                }
                    elif current is not None:
                        innerTagCount += 1
                elif element['type'] == 'Characters' and current is not None:

                    # if we already have text check:
                    # - whether the last character is a < or a (
                    # - the string being added starts with > or )
                    # in which case do not add a space
                    if current['text'] != '':

                        if current['text'][-1] != '<' and not re.match(
                                r"^[\>\)]", element['data']):
                            current['text'] += ' '

                    current['text'] = current['text'] + element['data']

                elif element['type'] == 'EndTag' and current is not None:
                    if innerTagCount > 0:
                        innerTagCount -= 1
                    else:
                        current['text'] = cgi.escape(current['text'])
                        toc.append(current)
                        current = None

            memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                         toc, 3600)

        return toc
Example #33
def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    else:
        raise ValueError("type must be html")
    return s.render(walker(input), encoding)
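Usage is a one-liner over any supported tree kind; a sketch, reusing a DOM tree parsed as in the earlier examples:

html = serialize(dom_tree, tree="dom", omit_optional_tags=False)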
Example #34
def strip_tags(html):
  if html:
    builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=builder, tokenizer=StripTags)
    tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(tree)
    serializer = HTMLSerializer()
    return serializer.render(stream)
Example #35
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u"".join(output_generator)
Example #36
def build_tree(f):
    html = []
    for line in f:
        line = line.replace("\t", "    ")
        html.append(line)
    html = "".join(html)
    encoding = chardet.detect(html)
    # print "Detected encoding: ", encoding
    html = html.decode(encoding["encoding"])

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    chars = ""
    root = MyDOM(u"root", None)
    node = root
    for token in stream:
        token_type = token.get("type", None)

        if token_type.endswith("Error"):
            return None

        if token_type == "Comment":  # ignore comments for now
            continue

        if token_type.endswith("Characters"):
            chars += token.get("data", "")
            continue

        if chars.strip():
            node.addkid(chars, "chars")
        chars = ""

        tag_name = token.get("name", None)

        if token_type == "EmptyTag":
            continue
            node.addkid(tag_name, "tag")
            for k, v in token.get("data", {}).iteritems():
                node.addkid("%s:%s" % (k[1], v), "meta")
            continue

        assert tag_name is not None, token
        tag_name = tag_name.upper()

        if token_type == "EndTag":
            assert MyDOM.get_label(node) == tag_name, token
            node = node.get_parent()
            assert node is not None, "Unbalanced Tree"

        if token_type == "StartTag":
            node = node.addkid(tag_name, "tag")

    return root
Example #38
def tostring(lxmltree, options=None):
    options = options or {'omit_optional_tags': False}
    walker = treewalkers.getTreeWalker('lxml')
    stream = walker(lxmltree)
    s = serializer.HTMLSerializer(**options)
    output = s.render(stream)
    if not isinstance(output, str):
        # Python 2
        output = output.encode('utf-8')
    return output
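A round-trip sketch with an lxml document (assuming import lxml.html; attribute quoting in the output depends on the serializer defaults):

doc = lxml.html.document_fromstring('<p class="a">hi')
print(tostring(doc))
# e.g. '<html><body><p class=a>hi</p></body></html>'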
Example #39
def writeHtml(writer, nodeList):
    from html5lib.treewalkers import getTreeWalker
    #from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.serializer.xhtmlserializer import XHTMLSerializer

    walker = getTreeWalker('dom')
    serializer = XHTMLSerializer()
    for node in nodeList:
        for item in serializer.serialize(walker(node)):
            writer.write(item)
Example #40
def sanitize(content):
    parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    dom = parser.parseFragment(content)
    tree_walker = treewalkers.getTreeWalker("dom")
    tree_stream = tree_walker(dom)
    serial = serializer.HTMLSerializer(omit_optional_tags=False,
                                       quote_attr_values=True)
    output = serial.serialize(tree_stream)
    return u''.join(output)
Example #41
def sanitize_html(data, encoding=None):
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=sanitizer_factory)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(parser.parseFragment(data, encoding=encoding))
    slzr = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                    quote_attr_values=True,
                                                    use_trailing_solidus=True)
    html = slzr.render(stream, encoding)
    return html
Example #43
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(
                    lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(
                    _utils.default_etree.tostring(document,
                                                  encoding="unicode"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
Example #44
 def _get_event_description_old(self, div_tag):
     # TODO: strip tags?
     # <div class="info_text specHigh1"> \n\t foo <p> \n\t blah blah.</p><p>blub blub.</p>
     tag = self._get_tag(div_tag, 'div', 'class', 'info_text specHigh1')
     if tag:
         description = []
         for node in tag.childNodes:
             tokens = treewalkers.getTreeWalker("dom")(node)
             for text in serializer.HTMLSerializer(omit_optional_tags=False).serialize(tokens):
                 description.append(text.strip())
         return u''.join(description)
Example #45
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
Example #46
 def render(self, dom_tree):
     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     if self.method == "xhtml":
         Serializer = serializer.xhtmlserializer.XHTMLSerializer
     else:
         Serializer = serializer.htmlserializer.HTMLSerializer
     ser = Serializer(strip_whitespace=self.strip_whitespace,
                      quote_attr_values=True,
                      omit_optional_tags=False)
     return ser.render(stream)
Example #47
 def render(self, dom_tree):
     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     if self.method == "xhtml":
         Serializer = serializer.xhtmlserializer.XHTMLSerializer
     else:
         Serializer = serializer.htmlserializer.HTMLSerializer
     ser = Serializer(
         strip_whitespace=self.strip_whitespace,
         quote_attr_values=True, omit_optional_tags=False)
     return ser.render(stream)
Example #48
  def get_toc(self, path):

    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or re.search('style-guide', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None
      innerTagCount = 0
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2']:
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1],
                  'text': ''
                }
          elif current is not None:
            innerTagCount += 1
        elif element['type'] == 'Characters' and current is not None:

          # if we already have text check:
          # - whether the last character is a < or a (
          # - the string being added starts with > or )
          # in which case do not add a space
          if current['text'] != '':

            if current['text'][-1] != '<' and not re.match(r"^[\>\)]", element['data']):
              current['text'] += ' '

          current['text'] = current['text'] + element['data']

        elif element['type'] == 'EndTag' and current is not None:
          if innerTagCount > 0:
            innerTagCount -= 1
          else:
            current['text'] = cgi.escape(current['text'])
            toc.append(current)
            current = None

      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
Example #49
def sanitize_html(value):
    '''A custom filter that sanitizes HTML output to make sure there is no bad stuff in it'''
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(value)

    walker = treewalkers.getTreeWalker("dom")

    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    return "".join(s.serialize(stream))
Example #50
def printOutput(parser, document, opts):
    if opts.encoding:
        print('Encoding:', parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == 'dom':
                document.writexml(sys.stdout, encoding='utf-8')
            elif tb == 'lxml':
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document))
            elif tb == 'etree':
                sys.stdout.write(utils.default_etree.tostring(document))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite('utf-8'))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = 'utf-8'
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append('Line %i Col %i' % pos + ' ' +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write('\nParse errors:\n' + '\n'.join(errList) + '\n')
Example #51
def serialize(input,
              tree=u"simpletree",
              format=u"html",
              encoding=None,
              **serializer_opts):
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == u"html":
        s = HTMLSerializer(**serializer_opts)
    elif format == u"xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
        raise ValueError(u"type must be either html or xhtml")
    return s.render(walker(input), encoding)
Example #52
 def clean(self, value):
     chars = super(HTMLField, self).clean(value)
     #chars = chars.encode('utf-8') # should really find out where we have decoded input to unicode and do it there instead
     # could use Beautiful Soup here instead
     p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                             tree=treebuilders.getTreeBuilder("dom"))
     s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                  quote_attr_values=True)
     dom_tree = p.parseFragment(chars) #encoding="utf-8")  - unicode input seems to work fine
     
     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     gen = s.serialize(stream)
     out = ""
     for i in gen:
         out += i
     return out
Example #53
def toString(tree, output_encoding="utf-8", serializer="html5lib", **kwargs):
    # Serialize to XML
    #if serializer == "lxml.etree":
    if False:
        rendered = etree.tostring(tree, encoding=output_encoding)
    # Serialize to HTML using lxml.html
    elif serializer == "lxml.html":
        rendered = lxml.html.tostring(tree, encoding=output_encoding)
    # Serialize to HTML using html5lib
    else:
        walker = treewalkers.getTreeWalker("lxml")
        s = htmlserializer.HTMLSerializer(**kwargs)
        rendered = s.render(walker(tree), encoding=output_encoding)
    return rendered
Example #54
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib
    If full is False, only the contents inside <body> will be returned (without
    the <body> tags).
    """
    if full:
        dom_tree = parser.parse(data)
    else:
        dom_tree = parser.parseFragment(data)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    return u''.join(s.serialize(stream))
Example #55
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                            tokenizer=sanitizer_factory)
    dom_tree = p.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    return s.render(stream)
Example #56
 def serialize(self, **kwargs):
     """Return the unicode serialization of myself, with optional sanitization arguments."""
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     stream = sortAttributes(stream)
     serializer = HTMLSerializer(quote_attr_values="always",
                                 omit_optional_tags=False)
     html = serializer.render(stream)[container_len:-container_len - 1]
     return bleach.clean(
         html,
         tags=kwargs.get("tags") or (ALLOWED_TAGS + ["for"]),
         attributes=kwargs.get("attributes") or ALLOWED_ATTRIBUTES,
         styles=kwargs.get("styles") or ALLOWED_STYLES,
         strip_comments=True,
     )
Example #57
    def GenshiAdapter(tree):
        text = None
        for token in treewalkers.getTreeWalker('dom')(tree):
            type = token['type']
            if type in ('Characters', 'SpaceCharacters'):
                if text is None:
                    text = token['data']
                else:
                    text += token['data']
            elif text is not None:
                yield TEXT, text, (None, -1, -1)
                text = None

            if type in ('StartTag', 'EmptyTag'):
                if token['namespace']:
                    name = '{%s}%s' % (token['namespace'], token['name'])
                else:
                    name = token['name']
                attrs = Attrs([
                    (QName('{%s}%s' %
                           attr if attr[0] is not None else attr[1]), value)
                    for attr, value in token['data'].items()
                ])
                yield (START, (QName(name), attrs), (None, -1, -1))
                if type == 'EmptyTag':
                    type = 'EndTag'

            if type == 'EndTag':
                if token['namespace']:
                    name = '{%s}%s' % (token['namespace'], token['name'])
                else:
                    name = token['name']

                yield END, QName(name), (None, -1, -1)

            elif type == 'Comment':
                yield COMMENT, token['data'], (None, -1, -1)

            elif type == 'Doctype':
                yield DOCTYPE, (token['name'], token['publicId'],
                                token['systemId']), (None, -1, -1)

            else:
                pass  # FIXME: What to do?

        if text is not None:
            yield TEXT, text, (None, -1, -1)
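The adapter yields standard Genshi (kind, data, pos) events, so its output can be wrapped in a genshi.core.Stream; a sketch, assuming Genshi is installed and dom_tree was parsed as in the other examples:

from genshi.core import Stream

markup = Stream(GenshiAdapter(dom_tree)).render('html')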
Example #58
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False)
    return "".join(s.serialize(stream))