コード例 #1
0
ファイル: parser.py プロジェクト: RoadrunnerWMC/bitBoard
def parse_text(text):
	"""Convert raw post text into sanitised HTML Markup.

	Newlines become <br>, smiley and BBCode replacements are applied,
	then the text is parsed and re-serialised through html5lib using the
	MySanitiser tokenizer.  Per-phase timings are printed for profiling.
	"""
	# time.clock() was deprecated in 3.3 and removed in Python 3.8;
	# perf_counter() is the portable high-resolution replacement.
	t1 = time.perf_counter()
	parser = html5lib.HTMLParser(
			tree=treebuilders.getTreeBuilder('etree'),
			tokenizer=MySanitiser)
	t2 = time.perf_counter()

	text = text.replace('\r', '')
	text = text.replace('\n', '<br>')
	t3 = time.perf_counter()

	for search, replace in SMILEY_REPLACEMENTS:
		text = text.replace(search, replace)

	for regex, replace in BBCODE_REGEXES:
		text = regex.sub(replace, text)

	for search, replace in BBCODE_REPLACEMENTS:
		text = text.replace(search, replace)

	t4 = time.perf_counter()
	doc = parser.parse(text)
	t5 = time.perf_counter()

	walker = treewalkers.getTreeWalker('etree')
	stream = walker(doc)
	s = serializer.htmlserializer.HTMLSerializer()
	output_generator = s.serialize(stream)
	t6 = time.perf_counter()

	# join() consumes the generator directly; no need for list() first.
	done = Markup(''.join(output_generator))
	t7 = time.perf_counter()
	print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f'
			% (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
	return done
コード例 #2
0
 def filter_response(self, response, encoding=None):
     """
     Filter and fix-up the response object.

     Renders a lazy TemplateResponse if present, reparses the body with
     html5lib, reserializes it with the configured serializer options,
     updates Content-Length, and marks the response as already filtered.
     """
     # Parse the response
     tree_type = settings.TREE_TYPE
     # Here we check for a TemplateResponse in the case we're being
     # used as a view decorator.
     if hasattr(response, 'render') and callable(response.render):
         response.render()
     tree = html5parser.parse(
         response.content, treebuilder=tree_type, encoding=encoding
     )
     # Build the serializer
     walker = treewalkers.getTreeWalker(tree_type)
     stream = walker(tree)
     options = self.get_serializer_options()
     serializer = htmlserializer.HTMLSerializer(**options)
     output = serializer.render(stream)
     # NOTE(review): str.encode(None) raises TypeError -- this assumes an
     # explicit encoding is always supplied; confirm callers or add a default.
     output = output.encode(encoding)
     # Fix up the response
     response.content = output
     response['Content-Length'] = str(len(output))
     # Add a flag to prevent further filtering if the decorator is already
     # used on this response.
     setattr(response, settings.FILTERED_FLAG, True)
     return response
コード例 #3
0
ファイル: bookmarks.py プロジェクト: acasajus/GateOne
    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of:
            (<url>, <mimetype>)

        If no favicon can be found, returns:
            (None, None)
        """
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        document = parser.parse(html)
        token_stream = treewalkers.getTreeWalker("dom")(document)
        fetch_url, mimetype = None, None
        icon = False
        found_token = None
        for token in token_stream:
            # Only <link> elements can declare a favicon.
            if token.get('name') != 'link':
                continue
            for key, value in token['data']:
                if key == 'rel':
                    if 'shortcut icon' in value.lower():
                        found_token = token
                        icon = True
                elif key == 'href':
                    fetch_url = value
                elif key == 'type':
                    mimetype = value
            if icon and fetch_url:
                # Fall back to the conventional .ico type when unspecified.
                mimetype = mimetype or "image/x-icon"
                if mimetype in self.favicon_mimetypes:
                    return (fetch_url, mimetype)
        return (None, None)
コード例 #4
0
ファイル: html2list.py プロジェクト: aanari/html2list
def html2list(payload):

    """This function reads a block of HTML and returns a cleaned list.
    :param payload: The HTML string to read.
    :type payload: str
    :returns: list -- The parsed output as a list of strings.
    """

    cleaned_output = []
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('lxml'),
                            tokenizer=sanitizer.HTMLSanitizer)
    s = serializer.htmlserializer.HTMLSerializer(strip_whitespace=True,
                                                 omit_optional_tags=True)
    r = treewalkers.getTreeWalker('lxml')(p.parse(payload))
    # Blank out text hanging directly off <html>/the root and strip every
    # attribute so only visible content survives serialization.
    for item in r.tree.elementtree.iter():  # .getiterator() is removed in modern lxml
        if item.getparent() is not None:
            if item.getparent().tag.split('}')[-1] == 'html':
                item.text = ''
        else:
            item.text = ''
        # Copy the keys first: deleting while iterating the live attrib
        # mapping is undefined/raises.
        for k in list(item.attrib):
            del item.attrib[k]
        if type(item.text) is str:
            for c in P['R']:
                item.text = re.sub(c, '', item.text)
    tag_re = r"""(?:<|&lt;)/?\w+((\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?(?:>|&gt;)?"""
    for tag in s.serialize(r):
        if not re.match(tag_re, tag):
            # Force ASCII, then decode back so re.split receives text
            # (encode() alone yields bytes on Python 3).
            tag = tag.encode('ascii', 'ignore').decode('ascii')
            split_tag = [x.strip()
                         for x in re.split(r'[|,;]|(?:=2C|=3B)',
                                           tag.replace('&amp;', '&'))]
            # Filter exclusions with a comprehension: the old
            # list.remove()-inside-the-loop skipped elements and only
            # dropped the first occurrence (and fails outright on the
            # Python 3 map object).
            split_tag = [t for t in split_tag if t not in P['E']]
            if split_tag:
                cleaned_output += split_tag
    return cleaned_output
コード例 #5
0
ファイル: main.py プロジェクト: mikewest/www.html5rocks.com
  def get_toc(self, path):
    """Build a table-of-contents list for a tutorial page, memcached 1h.

    Each entry is {'level': 1-3, 'id': ..., 'text': ...} taken from
    h2/h3/h4 elements that carry an id attribute.  Returns '' for
    non-tutorial paths.
    """
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2', 'h3', 'h4']:
            for attr in element['data']:
              if attr[0] == 'id':
                # Heading level derived from the tag's digit: h2->1 ... h4->3.
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1]
                }
        elif element['type'] == 'Characters' and current is not None:
          # The heading's text tokens follow its StartTag.
          current['text'] = element['data']
        elif element['type'] == 'EndTag' and current is not None:
          toc.append(current)
          current = None
      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
コード例 #6
0
ファイル: test_treeadapters.py プロジェクト: 0X1A/servo
def test_to_sax():
    """End-to-end check of sax.to_sax: parse a small document and assert
    the exact SAX event sequence recorded by the tracing handler,
    including the head/body elements the parser inserts and its recovery
    from the stray </p> in the input."""
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
            'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
コード例 #7
0
def printOutput(parser, document, opts):
    """Write *document* to stdout in the format selected by *opts*.

    opts.xml / opts.tree / opts.hilite / opts.html choose the output
    form (first match wins); opts.encoding and opts.error additionally
    print the sniffed character encoding and accumulated parse errors.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        # Normalize a single fragment to a list so both cases iterate.
        if not hasattr(document, '__getitem__'): document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
        sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        # Forward every serializer option from the CLI namespace.
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        # Absent quote_char: let the serializer use its default.
        if not kwargs['quote_char']: del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
コード例 #8
0
ファイル: cosmetics.py プロジェクト: andreyjkee/webpagemaker
def _normalize(html):
    """
    Normalize the given string of HTML, collapsing whitespace.
    """
    # Parse/serialize round trip per the "Serialization of Streams"
    # section of http://code.google.com/p/html5lib/wiki/UserDocumentation.
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    document = parser.parse(html)
    token_stream = treewalkers.getTreeWalker("dom")(document)
    chunks = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False).serialize(token_stream)

    # TODO: We're not actually collapsing *all* whitespace; only
    # entire chunks of whitespace that the serializer gives us. Currently,
    # this seems "good enough" to pass our unit tests, which are
    # based on use cases of comparing pre-sanitized HTML to sanitized HTML,
    # but we may need to change this in the future.
    pieces = []
    pending_ws = False  # was the previous chunk pure whitespace?
    for chunk in chunks:
        if chunk.strip():
            pieces.append(chunk)
            pending_ws = False
        elif not pending_ws:
            # Collapse a run of whitespace chunks to a single space.
            pieces.append(' ')
            pending_ws = True
    return ''.join(pieces)
コード例 #9
0
 def get_toc(self, path):
   """Build a table-of-contents list from h2/h3/h4 ids in the rendered
   template at *path*; results are memcached for one hour unless the
   request disables caching."""
   toc = memcache.get('toc|%s' % path)
   if toc is None or self.request.cache == False:
     template_text = webapp.template.render(path, {});
     parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
     dom_tree = parser.parse(template_text)
     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     toc = []
     current = None
     for element in stream:
       if element['type'] == 'StartTag':
         if element['name'] in ['h2', 'h3', 'h4']:
           for attr in element['data']:
             if attr[0] == 'id':
               # Heading level from the tag's digit: h2->1 ... h4->3.
               current = {
                 'level' : int(element['name'][-1:]) - 1,
                 'id' : attr[1]
               }
       elif element['type'] == 'Characters' and current is not None:
         # Heading text tokens follow the StartTag.
         current['text'] = element['data']
       elif element['type'] == 'EndTag' and current is not None:
         toc.append(current)
         current = None
     memcache.set('toc|%s' % path, toc, 3600)
   return toc
コード例 #10
0
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=HTMLSanitizer)
    fragment = parser.parseFragment(buf)

    stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    output = html_serializer.render(stream, 'utf-8')

    # Strip escaped <toberemoved> markers (and their contents), widest
    # pattern first; bail out if a pass makes no progress.
    patterns = (r'&lt;toberemoved.*?&gt;.*?&lt;/toberemoved&gt;',
                r'&lt;/toberemoved&gt;',
                r'&lt;toberemoved.*?&gt;')
    while 'toberemoved' in output:
        before = output
        for pattern in patterns:
            for match in re.findall(pattern, output, re.DOTALL):
                output = output.replace(match, '')
        if output == before:
            break

    return output
コード例 #11
0
def sanitize(string, html_type):
    """
    >>> sanitize("\\t<p>a paragraph</p>","html")
    u'\\t<p>a paragraph</p>'

    >>> sanitize("\\t<script>alert('evil script');</script>", "xhtml")
    u"\\t&lt;script&gt;alert('evil script');&lt;/script&gt;"

    """
    # Import lazily so modules importing this one work without html5lib
    # and only fail when sanitization is actually requested.
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers, treebuilders
    except ImportError:
        raise Exception("html5lib not available")

    # The sanitizing tokenizer escapes disallowed tags (see the <script>
    # doctest above) instead of passing them through.
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    tree = p.parseFragment(string)

    # NOTE(review): the "simpletree" walker only exists in pre-1.0
    # html5lib releases -- confirm the pinned version.
    walker = treewalkers.getTreeWalker("simpletree")
    stream = walker(tree)

    if html_type == 'xhtml':
        s = serializer.xhtmlserializer.XHTMLSerializer()
    else:
        s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                     quote_attr_values=True)
    return s.render(stream)
コード例 #12
0
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is not None:
            # old API: sanitization happens in the tokenizer
            parser_kwargs["tokenizer"] = HTMLSanitizer
        else:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    fragment = parser.parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(html_serializer.serialize(stream))
コード例 #13
0
ファイル: bookmarks.py プロジェクト: taylor/GateOne
    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of:
            (<url>, <mimetype>)

        If no favicon can be found, returns:
            (None, None)
        """
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = p.parse(html)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        fetch_url = None
        mimetype = None
        icon = False
        found_token = None  # last <link> token whose rel mentioned an icon
        for token in stream:
            if 'name' in token:
                if token['name'] == 'link':
                    # attr[0] is the attribute name, attr[1] its value
                    # (pre-1.0 html5lib token format).
                    for attr in token['data']:
                        if attr[0] == 'rel':
                            if 'shortcut icon' in attr[1].lower():
                                found_token = token
                                icon = True
                        elif attr[0] == 'href':
                            fetch_url = attr[1]
                        elif attr[0] == 'type':
                            mimetype = attr[1]
                    if fetch_url and icon:
                        # No type attribute: assume the conventional .ico type.
                        if not mimetype:
                            mimetype = "image/x-icon"
                        # Only accept types the instance considers valid.
                        if mimetype in self.favicon_mimetypes:
                            return (fetch_url, mimetype)
        return (None, None)
コード例 #14
0
ファイル: parser.py プロジェクト: MechanisM/kitsune
 def to_unicode(self):
     """Return the unicode serialization of myself.

     The tree is serialized wrapped in CONTAINER_TAG; the final slice
     strips that wrapper's open tag from the front and its close tag
     from the end so only the inner markup is returned.
     """
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     # NOTE(review): quote_attr_values=True is the pre-1.0 html5lib API;
     # html5lib 1.x expects quote_attr_values='always'.
     serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
     return serializer.render(stream)[container_len : -container_len - 1]
コード例 #15
0
ファイル: main.py プロジェクト: adityas/www.html5rocks.com
    def get_toc(self, path):
        """Build (and memcache for an hour) a table of contents of the
        id-carrying h2/h3/h4 headings on a tutorial page."""
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search('/tutorials', path) or re.search('/mobile', path)):
            return ''

        cache_key = '%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path)
        toc = memcache.get(cache_key)
        if toc is None or not self.request.cache:
            markup = render_to_string(path, {})

            tree = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom")).parse(markup)
            stream = treewalkers.getTreeWalker("dom")(tree)
            toc = []
            entry = None
            for token in stream:
                kind = token['type']
                if kind == 'StartTag':
                    if token['name'] in ('h2', 'h3', 'h4'):
                        for attr_name, attr_value in token['data']:
                            if attr_name == 'id':
                                entry = {
                                    'level': int(token['name'][-1:]) - 1,
                                    'id': attr_value
                                }
                elif kind == 'Characters' and entry is not None:
                    entry['text'] = token['data']
                elif kind == 'EndTag' and entry is not None:
                    toc.append(entry)
                    entry = None
            memcache.set(cache_key, toc, 3600)

        return toc
コード例 #16
0
ファイル: html.py プロジェクト: Hrishi-3331/StudyPoint
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        # HTMLSanitizer is presumably None when the old tokenizer-level
        # API is unavailable; newer html5lib sanitizes in the serializer.
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    # parseFragment keeps the input a fragment (no html/body wrapper).
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
コード例 #17
0
ファイル: parser.py プロジェクト: rootmeb/kitsune
 def __str__(self):
     """Return the unicode serialization of myself.

     The tree is serialized wrapped in CONTAINER_TAG; the final slice
     strips that wrapper's open tag from the front and its close tag
     from the end so only the inner markup is returned.
     """
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     serializer = HTMLSerializer(quote_attr_values='always', omit_optional_tags=False)
     return serializer.render(stream)[container_len : -container_len - 1]
コード例 #18
0
ファイル: chemical.py プロジェクト: Sighter/remeta
	def search(self, term):
		"""Search the site for *term* and return [[hit_link, short_info], ...].

		Fetches the search-result page, parses it with html5lib (dom
		tree), and pairs each hit link with its short-info blurb.
		"""
		# define link for search
		# NOTE(review): *term* is interpolated unescaped -- confirm callers
		# pass URL-safe strings or add urllib quoting here.
		searchUrl = self.baseUrl + r"/sc/search?&must=" + term + r"&Type=Music&Type=&inandout=true&SRI=true&ND=-1"

		print(" --> searching on chemical for " + term)
		print(" --> with " + searchUrl)
		source = getWebAsStr(searchUrl)

		# create a parser, we use minidom
		p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
		dom_tree = p.parse(source)
		walker = treewalkers.getTreeWalker("dom")
		stream = walker(dom_tree)

		# now we can send the stream to our fetcher functions
		# find links on search result page
		l_hitLinks = self.fetch_HitLinks(stream)

		# find short info
		l_shortInfo = self.fetch_ShortInfo(stream)

		# create an two-dimensional list; zip truncates to the shorter list
		results = []

		for link, info in zip(l_hitLinks, l_shortInfo):
			results.append([link, info]) 

		return results
コード例 #19
0
ファイル: main.py プロジェクト: shixnxgai/www.html5rocks.com
    def get_toc(self, path):
        """Build a table-of-contents list ({'level', 'id', 'text'} per
        id-carrying h2/h3/h4) for a tutorial page, memcached for 1h.
        Returns '' for non-tutorial paths."""
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search("/tutorials", path) or re.search("/mobile", path)):
            return ""

        toc = memcache.get("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None
            for element in stream:
                if element["type"] == "StartTag":
                    if element["name"] in ["h2", "h3", "h4"]:
                        for attr in element["data"]:
                            if attr[0] == "id":
                                # Level from the tag's digit: h2->1 ... h4->3.
                                current = {"level": int(element["name"][-1:]) - 1, "id": attr[1]}
                elif element["type"] == "Characters" and current is not None:
                    # Heading text tokens follow the StartTag.
                    current["text"] = element["data"]
                elif element["type"] == "EndTag" and current is not None:
                    toc.append(current)
                    current = None
            memcache.set("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

        return toc
コード例 #20
0
ファイル: sanitizer.py プロジェクト: tovmeod/anaf
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    # The sanitizing tokenizer escapes disallowed markup (hence the
    # &lt;...&gt; patterns matched below) rather than passing it through.
    html_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"), tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    output = s.render(stream, 'utf-8')

    # Strip escaped <toberemoved> markers and their contents, looping
    # until nothing matches; break if a pass makes no progress.
    # NOTE: `s` is rebound from the serializer to each match string below.
    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(
            r'&lt;toberemoved.*?&gt;.*?&lt;/toberemoved&gt;', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'&lt;/toberemoved&gt;', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'&lt;toberemoved.*?&gt;', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        if output == oldoutput:
            break

    return output
コード例 #21
0
ファイル: parse.py プロジェクト: datapublica/html5lib
def printOutput(parser, document, opts):
    """Write *document* to stdout in the format selected by *opts*, then
    any accumulated parse errors."""
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        # Treat a single fragment as a one-element sequence.
        fragments = document if hasattr(document, '__getitem__') else [document]
        for fragment in fragments:
            sys.stdout.write(parser.tree.testSerializer(fragment))
        sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        # Forward every serializer option from the CLI namespace.
        kwargs = dict((opt, getattr(opts, opt))
                      for opt in serializer.HTMLSerializer.options)
        if not kwargs['quote_char']:
            del kwargs['quote_char']
        walker = treewalkers.getTreeWalker(opts.treebuilder)
        for text in serializer.HTMLSerializer(**kwargs).serialize(walker(document)):
            sys.stdout.write(text)
        if not text.endswith('\n'):
            sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            description = constants.E.get(errorcode,
                                          'Unknown error "%s"' % errorcode)
            errList.append("Line %i Col %i" % pos + " " + description % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
コード例 #22
0
def hmtl2text(html):
    """Extract the visible text of a UTF-8 HTML byte string, one output
    line per block-level element, with whitespace runs collapsed."""
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    stream = treewalkers.getTreeWalker("dom")(parser.parse(html.decode("utf-8")))

    skipping = False  # inside <script>/<style>/<noscript>
    lines = []
    line = []
    for token in stream:
        name = token.get('name', "").lower()

        if name in ('script', 'style', 'noscript'):
            # StartTag turns skipping on; the matching EndTag turns it off.
            skipping = token.get('type', None) == 'StartTag'
        if skipping:
            continue

        # Block boundaries (and <br>) flush the line being built.
        if name in block_level_elements or name == "br":
            if line:
                lines.append(u"".join(line))
                line = []

        kind = token.get(u'type', None)
        if kind == u'Characters':
            line.append(token['data'])
        if kind == u'SpaceCharacters' and line and line[-1] != u" ":
            # Collapse whitespace runs to a single space.
            line.append(u" ")

    if line:
        lines.append(u"".join(line))
    return clean_whitespace("\n".join(lines))
コード例 #23
0
ファイル: fields.py プロジェクト: brstgt/cream-testsite
    def clean(self, value, model_instance):
        """
        Validates the given value using the provided HTMLCleaner
        and returns its "cleaned" value as a Python object.

        Raises ValidationError for any errors.
        """
        value = super(HTMLField, self).clean(value, model_instance)

        # Round-trip through html5lib's sanitizing tokenizer to strip
        # disallowed markup while keeping the fragment well-formed.
        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                     tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parseFragment(value)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)

        if self.use_imageproxy:
            # Rewrite image references through the per-user image proxy.
            from imageproxy import Proxy
            user = User.objects.get(pk=getattr(model_instance, self.user_field))
            proxy = Proxy(user)
            stream = ImageProxyFilter(stream, proxy)

        s = HTMLSerializer(omit_optional_tags=False)
        # join() the generator directly instead of the old quadratic
        # `clean_value += item` loop.
        return ''.join(s.serialize(stream))
コード例 #24
0
def hmtl2text(html):
    """Extract the visible text of an HTML byte string.

    Text inside <script>/<style>/<noscript> is dropped; block-level
    elements and <br> start a new output line; whitespace runs collapse
    to single spaces.  (The "hmtl" name typo is kept for callers.)
    """
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    # NOTE(review): .decode() assumes *html* is UTF-8 bytes; a str input
    # would raise AttributeError on Python 3 -- confirm callers.
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    in_script = False   # True between a skip-element's start and end tags
    outbuf = []         # finished lines
    current_line = []   # text chunks of the line being built
    for token in stream:
        token_name = token.get('name', "").lower()

        if token_name in ['script', 'style', 'noscript']:
            # StartTag turns skipping on; the matching EndTag turns it off.
            in_script = token.get('type', None) == 'StartTag'
        if in_script:
            continue

        # Block boundaries (and <br>) flush the current line.
        if token_name in block_level_elements or token_name == "br":
            if current_line:
                outbuf.append(u"".join(current_line))
                current_line = []

        if token.get(u'type', None) == u'Characters':
            current_line.append(token['data'])
        if token.get(u'type', None) == u'SpaceCharacters':
            # Collapse whitespace runs to a single space.
            if current_line and current_line[-1] != u" ":
                current_line.append(u" ")

    if current_line:
        outbuf.append(u"".join(current_line))
    return clean_whitespace("\n".join(outbuf))
コード例 #25
0
ファイル: test_treewalkers.py プロジェクト: zachlewis2020/wpt
def test_lxml_xml():
    """Walking a plain lxml XML tree yields namespace-less StartTag/EndTag
    token pairs for each element, in document order."""
    expected = [{
        'data': {},
        'name': 'div',
        'namespace': None,
        'type': 'StartTag'
    }, {
        'data': {},
        'name': 'div',
        'namespace': None,
        'type': 'StartTag'
    }, {
        'name': 'div',
        'namespace': None,
        'type': 'EndTag'
    }, {
        'name': 'div',
        'namespace': None,
        'type': 'EndTag'
    }]

    lxmltree = lxml.etree.fromstring('<div><div></div></div>')
    walker = treewalkers.getTreeWalker('lxml')
    # Lint validates the token stream while it is consumed.
    output = Lint(walker(lxmltree))

    assert list(output) == expected
コード例 #26
0
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class
    attributes.
    
    For example, the HTML might contain something like:
        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>
    If there are no filters, then the HTML is not changed. If the filters
    include "ide" but not "edit", then the ide div remains and the
    edit div is removed.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    # Element.getiterator() and .getchildren() were removed in Python 3.9;
    # use iter() and list(elem) instead.
    for elem in tree.iter():
        indeces_to_drop = []
        for i, child in enumerate(list(elem)):
            if _should_drop_elem(child, filters, "class", "app-"):
                # Record indices in reverse order so deleting them below
                # doesn't shift later positions.
                indeces_to_drop.insert(0, i)
                filtered = True
                if log:
                    tag_str = "<%s" % child.tag
                    if child.attrib:
                        for n, v in child.attrib.items():
                            tag_str += ' %s="%s"' % (n, v)
                    tag_str += ">"
                    if len(tag_str) > 50:
                        tag_str = tag_str[:47] + '...'
                    log("... filter out %s", tag_str)
        for idx in indeces_to_drop:
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        s = HTMLSerializer()
        content = ''.join(s.serialize(stream))
        # `with` guarantees the handle is closed even if write() raises.
        with open(path, 'w') as f:
            f.write("""<!DOCTYPE html>
""")
            f.write(content)
コード例 #27
0
def SearchMovie(title, year):
    """POST a title search to the subtitle site and hand the parsed token
    stream to SearchTitleMatch.  (*year* is accepted but unused here.)"""
    response = requests.post(DOMAIN_NAME + "/subtitles/searchbytitle",
                             data={"query": title, "l": ""})
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    document = parser.parse(response.text)
    stream = treewalkers.getTreeWalker("dom")(document)

    return SearchTitleMatch(stream)
コード例 #28
0
ファイル: html5lib_backend.py プロジェクト: russelljk/cleanly
def run_sanitizer(html, sanitizer):
    """Parse *html* with the given sanitizing tokenizer and return the
    re-serialized fragment as unicode."""
    dom_tree = html5lib.HTMLParser(
        tokenizer=sanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    ).parseFragment(html)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)
    chunks = serializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True).serialize(stream)
    return u"".join(chunks)
コード例 #29
0
ファイル: sanitize.py プロジェクト: CommonsCloud/Core-API
  def sanitize_string(self, user_input):
    """Sanitize untrusted *user_input* HTML and return it as unicode.

    The fragment is parsed with the CommonsHTMLSanitizer tokenizer and
    re-serialized with attribute values quoted.
    """
    p = html5lib.HTMLParser(tokenizer=CommonsHTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(user_input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    return u"".join(s.serialize(stream))
コード例 #30
0
ファイル: html5lib_backend.py プロジェクト: russelljk/cleanly
def cleanup_html(html):
    """Parse *html* as a fragment and re-serialize it, normalizing markup.

    No sanitization is applied here; html5lib's parser simply corrects
    malformed structure on the round trip.
    """
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    # HTMLSerializer.render() already returns a complete unicode string;
    # the previous u"".join(result) re-joined it character by character
    # for no effect, so return the rendered string directly.
    return s.render(stream)
コード例 #31
0
def parse(f):
    """Parse HTML from file-like *f* and return the serialized token slice
    [i, j] chosen by a main-content heuristic: maximize markup outside the
    slice plus text words inside it.
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []     # retained walker tokens (markup tokens + one per text word)
    bintokens = []  # parallel flags: 1 = markup token, 0 = text word

    # When set, (end-tag type, name) we are skipping towards; contents of
    # link/script/style elements are dropped entirely.
    waitfor = None

    for tok in walker(doc):

        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)

        elif tok["type"] in ("Characters",):
            # Split character data on whitespace; each word is its own token.
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})

        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass

        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    # Prefix sums of the markup flags:
    # cumbintokens[k] == number of markup tokens among tokens[0..k].
    cumbintokens = [bintokens[0]]

    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    length = len(cumbintokens)

    midx = None
    m = None

    # Exhaustive O(n^2) search for the slice maximizing:
    # markup after j + markup up to i + text words strictly inside (i, j].
    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between

            if not midx or nm > m:
                midx = i, j
                m = nm

    i, j = midx
    # serialize_tokens is defined elsewhere in the project.
    return serialize_tokens(tokens[i:j + 1])
コード例 #32
0
ファイル: main.py プロジェクト: CoderK/www.html5rocks.com
    def get_toc(self, path):
        """Build (and memcache for an hour) a table-of-contents list for the
        page at *path* from its rendered <h2 id=...> headings.

        Returns a list of {'level', 'id', 'text'} dicts, or '' for pages
        that carry no TOC.
        """
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search('/tutorials', path) or re.search('/mobile', path)
                or re.search('style-guide', path)):
            return ''

        toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None      # heading entry being accumulated, if any
            innerTagCount = 0   # depth of tags nested inside the current <h2>
            for element in stream:
                if element['type'] == 'StartTag':
                    if element['name'] in ['h2']:
                        # Only headings that carry an id become TOC entries.
                        for attr in element['data']:
                            if attr[0] == 'id':
                                current = {
                                    'level': int(element['name'][-1:]) - 1,
                                    'id': attr[1],
                                    'text': ''
                                }
                    elif current is not None:
                        innerTagCount += 1
                elif element['type'] == 'Characters' and current is not None:

                    # if we already have text check:
                    # - whether the last character is a < or a (
                    # - the string being added starts with > or )
                    # in which case do not add a space
                    if current['text'] != '':

                        if current['text'][-1] != '<' and not re.match(
                                r"^[\>\)]", element['data']):
                            current['text'] += ' '

                    current['text'] = current['text'] + element['data']

                elif element['type'] == 'EndTag' and current is not None:
                    if innerTagCount > 0:
                        innerTagCount -= 1
                    else:
                        # Closing the <h2> itself: finalize the entry.
                        current['text'] = cgi.escape(current['text'])
                        toc.append(current)
                        current = None

            memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                         toc, 3600)

        return toc
コード例 #33
0
ファイル: __init__.py プロジェクト: mfa/html5lib-python
def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    """Serialize parse tree *input* (of treewalker kind *tree*) to HTML."""
    tree_walker = treewalkers.getTreeWalker(tree)  # XXX: Should we cache this?
    if format != "html":
        raise ValueError("type must be html")
    html_serializer = HTMLSerializer(**serializer_opts)
    return html_serializer.render(tree_walker(input), encoding)
コード例 #34
0
ファイル: html.py プロジェクト: tantalor/emend
def strip_tags(html):
  """Return *html* serialized through the StripTags tokenizer (tags removed).

  Falsy input yields None, matching the original implicit return.
  """
  if not html:
    return None
  dom_builder = treebuilders.getTreeBuilder("dom")
  stripping_parser = html5lib.HTMLParser(tree=dom_builder, tokenizer=StripTags)
  dom_tree = stripping_parser.parseFragment(html)
  token_stream = treewalkers.getTreeWalker("dom")(dom_tree)
  return HTMLSerializer().render(token_stream)
コード例 #35
0
ファイル: sanitize.py プロジェクト: genghisu/eruditio
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    dom_builder = treebuilders.getTreeBuilder("dom")
    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder)
    fragment = sanitizing_parser.parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u"".join(html_serializer.serialize(token_stream))
コード例 #36
0
def build_tree(f):
    """Build a MyDOM tree from the HTML lines in file-like *f*.

    Returns None when the walker emits a parse-error token.
    NOTE(review): character data trailing the last structural token is
    never flushed into the tree — pre-existing behavior, left unchanged.
    """
    html = []
    for line in f:
        line = line.replace("\t", "    ")
        html.append(line)
    html = "".join(html)
    # Detect the byte encoding and decode to unicode before parsing.
    encoding = chardet.detect(html)
    # print "Detected encoding: ", encoding
    html = html.decode(encoding["encoding"])

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    chars = ""  # pending character data awaiting the next structural token
    root = MyDOM(u"root", None)
    node = root
    for token in stream:
        token_type = token.get("type", None)

        if token_type.endswith("Error"):
            return None

        if token_type == "Comment":  # ignore comments for now
            continue

        # Matches both "Characters" and "SpaceCharacters": accumulate text.
        if token_type.endswith("Characters"):
            chars += token.get("data", "")
            continue

        # Structural token: flush buffered non-whitespace text first.
        if chars.strip():
            node.addkid(chars, "chars")
        chars = ""

        tag_name = token.get("name", None)

        if token_type == "EmptyTag":
            # Empty tags are skipped entirely.  (Unreachable code that
            # followed this `continue` — adding the tag and its attributes
            # as children — has been removed as dead code.)
            continue

        assert tag_name is not None, token
        tag_name = tag_name.upper()

        if token_type == "EndTag":
            assert MyDOM.get_label(node) == tag_name, token
            node = node.get_parent()
            assert node is not None, "Unbalanced Tree"

        if token_type == "StartTag":
            node = node.addkid(tag_name, "tag")

    return root
コード例 #37
0
ファイル: treedist.py プロジェクト: christianbuck/mtma_bitext
def build_tree(f):
    """Build a MyDOM tree from the HTML lines in file-like *f*.

    Returns None when the walker emits a parse-error token.
    NOTE(review): character data trailing the last structural token is
    never flushed into the tree — pre-existing behavior, left unchanged.
    """
    html = []
    for line in f:
        line = line.replace("\t", "    ")
        html.append(line)
    html = "".join(html)
    # Detect the byte encoding and decode to unicode before parsing.
    encoding = chardet.detect(html)
    # print "Detected encoding: ", encoding
    html = html.decode(encoding["encoding"])

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    chars = ""  # pending character data awaiting the next structural token
    root = MyDOM(u"root", None)
    node = root
    for token in stream:
        token_type = token.get("type", None)

        if token_type.endswith("Error"):
            return None

        if token_type == "Comment":  # ignore comments for now
            continue

        # Matches both "Characters" and "SpaceCharacters": accumulate text.
        if token_type.endswith("Characters"):
            chars += token.get("data", "")
            continue

        # Structural token: flush buffered non-whitespace text first.
        if chars.strip():
            node.addkid(chars, "chars")
        chars = ""

        tag_name = token.get("name", None)

        if token_type == "EmptyTag":
            # Empty tags are skipped entirely.  (Unreachable code that
            # followed this `continue` — adding the tag and its attributes
            # as children — has been removed as dead code.)
            continue

        assert tag_name is not None, token
        tag_name = tag_name.upper()

        if token_type == "EndTag":
            assert MyDOM.get_label(node) == tag_name, token
            node = node.get_parent()
            assert node is not None, "Unbalanced Tree"

        if token_type == "StartTag":
            node = node.addkid(tag_name, "tag")

    return root
コード例 #38
0
def tostring(lxmltree, options=None):
    """Serialize an lxml tree to HTML (bytes on Python 2, str on Python 3)."""
    if not options:
        options = {'omit_optional_tags': False}
    stream = treewalkers.getTreeWalker('lxml')(lxmltree)
    rendered = serializer.HTMLSerializer(**options).render(stream)
    # On Python 2 render() yields unicode; encode it to a byte string.
    return rendered if isinstance(rendered, str) else rendered.encode('utf-8')
コード例 #39
0
ファイル: document.py プロジェクト: dsnopek/lingwo-old
def writeHtml(writer, nodeList):
    """Serialize each DOM node in *nodeList* as XHTML and write it to *writer*."""
    from html5lib.treewalkers import getTreeWalker
    from html5lib.serializer.xhtmlserializer import XHTMLSerializer

    walk = getTreeWalker('dom')
    xhtml = XHTMLSerializer()
    for dom_node in nodeList:
        for chunk in xhtml.serialize(walk(dom_node)):
            writer.write(chunk)
コード例 #40
0
ファイル: content_sanitizer.py プロジェクト: jeffrz/airship
def sanitize(content):
    """Sanitize untrusted HTML *content* and return it as a unicode string."""
    dom = HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    ).parseFragment(content)
    tokens = treewalkers.getTreeWalker("dom")(dom)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(tokens))
コード例 #41
0
def sanitize_html(data, encoding=None):
    """Sanitize HTML fragment *data*, optionally decoding from *encoding*."""
    sanitizing_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        tokenizer=sanitizer_factory)
    fragment = sanitizing_parser.parseFragment(data, encoding=encoding)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
        use_trailing_solidus=True)
    return html_serializer.render(token_stream, encoding)
コード例 #42
0
def writeHtml(writer, nodeList):
    """Write the XHTML serialization of every node in *nodeList* to *writer*."""
    from html5lib.treewalkers import getTreeWalker
    from html5lib.serializer.xhtmlserializer import XHTMLSerializer

    tree_walker = getTreeWalker('dom')
    xhtml_serializer = XHTMLSerializer()
    for current_node in nodeList:
        for piece in xhtml_serializer.serialize(tree_walker(current_node)):
            writer.write(piece)
コード例 #43
0
def printOutput(parser, document, opts):
    """Write the parse result to stdout in the format selected by *opts*
    (xml / tree / html), preceded by the parser's log, followed by any
    parse errors when opts.error is set.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    # Dump the parser's debug log first.
    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            # Raw tree serialization, dispatched on the treebuilder in use.
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(
                    lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(
                    _utils.default_etree.tostring(document,
                                                  encoding="unicode"))
        elif opts.tree:
            # Test-style indented tree dump; fragment parses may be a list.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            # Re-serialize through HTMLSerializer, forwarding any serializer
            # options that are present on opts.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            # Python 3 writes text to stdout; Python 2 wants encoded bytes.
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
コード例 #44
0
ファイル: migration.py プロジェクト: gimler/techism2
 def _get_event_description_old(self, div_tag):
     """Extract the event description from the legacy page layout.

     Serializes every child of the <div class="info_text specHigh1"> node
     and returns the concatenated, whitespace-stripped markup, or None
     when the div is absent.  TODO: strip tags?
     """
     tag = self._get_tag(div_tag, 'div', 'class', 'info_text specHigh1')
     if not tag:
         return None
     walk = treewalkers.getTreeWalker("dom")
     parts = []
     for child in tag.childNodes:
         token_stream = walk(child)
         for chunk in serializer.HTMLSerializer(omit_optional_tags=False).serialize(token_stream):
             parts.append(chunk.strip())
     return u''.join(parts)
コード例 #45
0
ファイル: html.py プロジェクト: lissyx/askbot-devel
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    sanitizing_parser = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"))
    fragment = sanitizing_parser.parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))
コード例 #46
0
 def render(self, dom_tree):
     """Serialize *dom_tree* using the configured output method (html/xhtml)."""
     token_stream = treewalkers.getTreeWalker("dom")(dom_tree)
     serializer_cls = (serializer.xhtmlserializer.XHTMLSerializer
                       if self.method == "xhtml"
                       else serializer.htmlserializer.HTMLSerializer)
     return serializer_cls(strip_whitespace=self.strip_whitespace,
                           quote_attr_values=True,
                           omit_optional_tags=False).render(token_stream)
コード例 #47
0
ファイル: html.py プロジェクト: spicycms/spicy.core
 def render(self, dom_tree):
     """Serialize *dom_tree* to markup with the configured method."""
     token_stream = treewalkers.getTreeWalker("dom")(dom_tree)
     if self.method == "xhtml":
         serializer_cls = serializer.xhtmlserializer.XHTMLSerializer
     else:
         serializer_cls = serializer.htmlserializer.HTMLSerializer
     out = serializer_cls(strip_whitespace=self.strip_whitespace,
                          quote_attr_values=True,
                          omit_optional_tags=False)
     return out.render(token_stream)
コード例 #48
0
ファイル: main.py プロジェクト: circleride/www.html5rocks.com
  def get_toc(self, path):
    """Build (and memcache for an hour) a table-of-contents list for the
    page at *path* from its rendered <h2 id=...> headings; returns '' for
    pages that carry no TOC.
    """
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or re.search('style-guide', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None      # heading entry being accumulated, if any
      innerTagCount = 0   # depth of tags nested inside the current <h2>
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2']:
            # Only headings that carry an id become TOC entries.
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1],
                  'text': ''
                }
          elif current is not None:
            innerTagCount += 1
        elif element['type'] == 'Characters' and current is not None:

          # if we already have text check:
          # - whether the last character is a < or a (
          # - the string being added starts with > or )
          # in which case do not add a space
          if current['text'] != '':

            if current['text'][-1] != '<' and not re.match(r"^[\>\)]", element['data']):
              current['text'] += ' '

          current['text'] = current['text'] + element['data']

        elif element['type'] == 'EndTag' and current is not None:
          if innerTagCount > 0:
            innerTagCount -= 1
          else:
            # Closing the <h2> itself: finalize the entry.
            current['text'] = cgi.escape(current['text'])
            toc.append(current)
            current = None

      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
コード例 #49
0
def sanitize_html(value):
    '''A custom filter that sanitizes HTML output to make sure there is no bad stuff in it.'''
    fragment = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    ).parseFragment(value)

    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(token_stream))
コード例 #50
0
ファイル: parse.py プロジェクト: xrile/fjord
def printOutput(parser, document, opts):
    """Write the parse result to stdout in the format selected by *opts*
    (xml / tree / hilite / html), preceded by the parser's log, followed
    by any parse errors when opts.error is set.
    """
    if opts.encoding:
        print('Encoding:', parser.tokenizer.stream.charEncoding)

    # Dump the parser's debug log first.
    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            # Raw tree serialization, dispatched on the treebuilder in use.
            tb = opts.treebuilder.lower()
            if tb == 'dom':
                document.writexml(sys.stdout, encoding='utf-8')
            elif tb == 'lxml':
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document))
            elif tb == 'etree':
                sys.stdout.write(utils.default_etree.tostring(document))
        elif opts.tree:
            # Test-style indented tree dump; fragment parses may be a list.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite('utf-8'))
        elif opts.html:
            # Re-serialize through HTMLSerializer, forwarding any serializer
            # options that are present on opts.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                # Was a bare `except:` — narrowed to Exception so that
                # KeyboardInterrupt/SystemExit are not swallowed (matches
                # the other printOutput variant in this file).
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            # Python 3 writes text to stdout; Python 2 wants encoded bytes.
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = 'utf-8'
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            # NOTE(review): `text` is unbound if serialize() yields nothing;
            # pre-existing behavior, left unchanged.
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append('Line %i Col %i' % pos + ' ' +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write('\nParse errors:\n' + '\n'.join(errList) + '\n')
コード例 #51
0
def serialize(input,
              tree=u"simpletree",
              format=u"html",
              encoding=None,
              **serializer_opts):
    """Serialize parse tree *input* to html or xhtml text."""
    walker = treewalkers.getTreeWalker(tree)  # XXX: Should we cache this?
    factories = {u"html": HTMLSerializer, u"xhtml": XHTMLSerializer}
    try:
        factory = factories[format]
    except KeyError:
        raise ValueError(u"type must be either html or xhtml")
    return factory(**serializer_opts).render(walker(input), encoding)
コード例 #52
0
ファイル: utils.py プロジェクト: GunioRobot/hubplus
 def clean(self, value):
     """Clean the submitted *value* and return it sanitized as HTML.

     Runs the parent field's cleaning first, then parses the result as an
     HTML fragment with the sanitizing tokenizer and re-serializes it.
     """
     chars = super(HTMLField, self).clean(value)
     #chars = chars.encode('utf-8') # should really find out where we have decoded input to unicode and do it there instead
     p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer, tree=treebuilders.getTreeBuilder("dom")) # could use Beautiful Soup here instead
     s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
     dom_tree = p.parseFragment(chars) #encoding="utf-8")  - unicode input seems to work fine

     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     # Join the serializer's output chunks in a single pass instead of the
     # previous quadratic `out += i` string accumulation.
     return "".join(s.serialize(stream))
コード例 #53
0
def toString(tree, output_encoding="utf-8", serializer="html5lib", **kwargs):
    """Serialize an lxml *tree* with the chosen *serializer* backend.

    "lxml.html" uses lxml's own HTML serializer; any other value falls
    back to html5lib's HTMLSerializer (extra **kwargs are passed to it).
    """
    # The disabled `if False:` branch for a "lxml.etree" backend
    # (etree.tostring) was unreachable dead code and has been removed.
    if serializer == "lxml.html":
        return lxml.html.tostring(tree, encoding=output_encoding)
    walker = treewalkers.getTreeWalker("lxml")
    s = htmlserializer.HTMLSerializer(**kwargs)
    return s.render(walker(tree), encoding=output_encoding)
コード例 #54
0
ファイル: html.py プロジェクト: malavvora/tutorial-project
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib
    If full is False, only the contents inside <body> will be returned (without
    the <body> tags).
    """
    # Full documents go through parse(); fragments through parseFragment().
    parse = parser.parse if full else parser.parseFragment
    dom_tree = parse(data)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    return u''.join(html_serializer.serialize(stream))
コード例 #55
0
ファイル: utils.py プロジェクト: EFJackson/SMS
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    # Empty (or whitespace-only) input passes straight through.
    if not buf:
        return buf

    sanitizing_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        tokenizer=sanitizer_factory)
    fragment = sanitizing_parser.parseFragment(buf)

    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    return html_serializer.render(token_stream)
コード例 #56
0
 def serialize(self, **kwargs):
     """Return the unicode serialization of myself, with optional sanitization arguments."""
     # Length of the synthetic container tag wrapper to slice off.
     container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
     stream = sortAttributes(getTreeWalker(self.TREEBUILDER)(self._root))
     rendered = HTMLSerializer(quote_attr_values="always",
                               omit_optional_tags=False).render(stream)
     # Drop the "<TAG>" prefix and "</TAG>" suffix around the content.
     html = rendered[container_len:-container_len - 1]
     return bleach.clean(
         html,
         tags=kwargs.get("tags") or (ALLOWED_TAGS + ["for"]),
         attributes=kwargs.get("attributes") or ALLOWED_ATTRIBUTES,
         styles=kwargs.get("styles") or ALLOWED_STYLES,
         strip_comments=True,
     )
コード例 #57
0
    def GenshiAdapter(tree):
        """Convert an html5lib 'dom' treewalker token stream over *tree*
        into Genshi stream events (TEXT/START/END/COMMENT/DOCTYPE).
        """
        text = None  # buffered character data awaiting a flush
        for token in treewalkers.getTreeWalker('dom')(tree):
            type = token['type']
            if type in ('Characters', 'SpaceCharacters'):
                # Coalesce adjacent character tokens into one TEXT event.
                if text is None:
                    text = token['data']
                else:
                    text += token['data']
            elif text is not None:
                yield TEXT, text, (None, -1, -1)
                text = None

            if type in ('StartTag', 'EmptyTag'):
                if token['namespace']:
                    name = '{%s}%s' % (token['namespace'], token['name'])
                else:
                    name = token['name']
                attrs = Attrs([
                    (QName('{%s}%s' %
                           attr if attr[0] is not None else attr[1]), value)
                    for attr, value in token['data'].items()
                ])
                yield (START, (QName(name), attrs), (None, -1, -1))
                # An empty tag emits START immediately followed by END.
                if type == 'EmptyTag':
                    type = 'EndTag'

            if type == 'EndTag':
                if token['namespace']:
                    name = '{%s}%s' % (token['namespace'], token['name'])
                else:
                    name = token['name']

                yield END, QName(name), (None, -1, -1)

            elif type == 'Comment':
                yield COMMENT, token['data'], (None, -1, -1)

            elif type == 'Doctype':
                yield DOCTYPE, (token['name'], token['publicId'],
                                token['systemId']), (None, -1, -1)

            else:
                pass  # FIXME: What to do?

        # Flush any trailing buffered text.
        if text is not None:
            yield TEXT, text, (None, -1, -1)
コード例 #58
0
    def GenshiAdapter(tree):
        """Convert an html5lib 'dom' treewalker token stream over *tree*
        into Genshi stream events (TEXT/START/END/COMMENT/DOCTYPE).
        """
        text = None  # buffered character data awaiting a flush
        for token in treewalkers.getTreeWalker("dom")(tree):
            type = token["type"]
            if type in ("Characters", "SpaceCharacters"):
                # Coalesce adjacent character tokens into one TEXT event.
                if text is None:
                    text = token["data"]
                else:
                    text += token["data"]
            elif text is not None:
                yield TEXT, text, (None, -1, -1)
                text = None

            if type in ("StartTag", "EmptyTag"):
                if token["namespace"]:
                    name = "{%s}%s" % (token["namespace"], token["name"])
                else:
                    name = token["name"]
                attrs = Attrs([
                    (QName("{%s}%s" %
                           attr if attr[0] is not None else attr[1]), value)
                    for attr, value in token["data"].items()
                ])
                yield (START, (QName(name), attrs), (None, -1, -1))
                # An empty tag emits START immediately followed by END.
                if type == "EmptyTag":
                    type = "EndTag"

            if type == "EndTag":
                if token["namespace"]:
                    name = "{%s}%s" % (token["namespace"], token["name"])
                else:
                    name = token["name"]

                yield END, QName(name), (None, -1, -1)

            elif type == "Comment":
                yield COMMENT, token["data"], (None, -1, -1)

            elif type == "Doctype":
                yield DOCTYPE, (token["name"], token["publicId"],
                                token["systemId"]), (None, -1, -1)

            else:
                pass  # FIXME: What to do?

        # Flush any trailing buffered text.
        if text is not None:
            yield TEXT, text, (None, -1, -1)
コード例 #59
0
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    sanitizing_parser = HTMLParser(tokenizer=HTMLSanitizer,
                                   tree=treebuilders.getTreeBuilder("dom"))
    fragment = sanitizing_parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(token_stream))