def printOutput(parser, document, opts):
    """Write *document* to stdout in the representation chosen by *opts*.

    Supports XML, test-tree, hilite and serialized-HTML output, and
    optionally appends the parser's error list.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        # A fragment parse may yield a single node; normalize to a sequence.
        fragments = document if hasattr(document, '__getitem__') else [document]
        for fragment in fragments:
            sys.stdout.write(parser.tree.testSerializer(fragment))
            sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        ser_opts = {name: getattr(opts, name)
                    for name in serializer.HTMLSerializer.options}
        # Let the serializer pick its own quote character when none was given.
        if not ser_opts['quote_char']:
            del ser_opts['quote_char']
        token_stream = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for chunk in serializer.HTMLSerializer(**ser_opts).serialize(token_stream):
            sys.stdout.write(chunk)
            if not chunk.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        messages = []
        for pos, errorcode, datavars in parser.errors:
            messages.append("Line %i Col %i" % pos + " " +
                            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(messages) + "\n")
def printOutput(parser, document, opts):
    """Print the parser log, the serialized *document*, and parse errors.

    The output form (XML per tree builder, test-tree dump, or serialized
    HTML) is selected by *opts*; nothing is emitted for a None document.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    for item in parser.log:
        print(item)
    if document is not None:
        if opts.xml:
            builder = opts.treebuilder.lower()
            if builder == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif builder == "lxml":
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
            elif builder == "etree":
                sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
        elif opts.tree:
            # Fragment parses can hand back a bare node; wrap it for iteration.
            fragments = document if hasattr(document, '__getitem__') else [document]
            for fragment in fragments:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            ser_opts = {}
            for name in serializer.HTMLSerializer.options:
                try:
                    ser_opts[name] = getattr(opts, name)
                except Exception:
                    pass
            if not ser_opts['quote_char']:
                del ser_opts['quote_char']
            if opts.sanitize:
                ser_opts["sanitize"] = True
            token_stream = treewalkers.getTreeWalker(opts.treebuilder)(document)
            # Python 3 stdout takes text; Python 2 wants utf-8 bytes.
            encoding = None if sys.version_info[0] >= 3 else "utf-8"
            for chunk in serializer.HTMLSerializer(**ser_opts).serialize(
                    token_stream, encoding=encoding):
                sys.stdout.write(chunk)
                if not chunk.endswith('\n'):
                    sys.stdout.write('\n')
    if opts.error:
        messages = []
        for pos, errorcode, datavars in parser.errors:
            messages.append("Line %i Col %i" % pos + " " +
                            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(messages) + "\n")
def sanitize_html(html):
    """Sanitizes an HTML fragment.

    Parses *html* with the sanitizing tokenizer and re-serializes it,
    keeping optional tags and quoting attribute values.
    """
    sanitizing_parser = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"))
    fragment = sanitizing_parser.parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))
def printOutput(parser, document, opts):
    """Print the parser log, the serialized *document*, and parse errors.

    Output form (XML per tree builder, test-tree dump, hilite, or
    serialized HTML) is selected by *opts*; None documents print nothing.
    """
    if opts.encoding:
        print('Encoding:', parser.tokenizer.stream.charEncoding)
    for item in parser.log:
        print(item)
    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == 'dom':
                document.writexml(sys.stdout, encoding='utf-8')
            elif tb == 'lxml':
                import lxml.etree
                # NOTE(review): tostring() returns bytes by default — on
                # Python 3 this write will fail; confirm intended target.
                sys.stdout.write(lxml.etree.tostring(document))
            elif tb == 'etree':
                sys.stdout.write(utils.default_etree.tostring(document))
        elif opts.tree:
            # Fragment parses can hand back a bare node; wrap for iteration.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite('utf-8'))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                # Fix: was a bare `except:`, which also swallows
                # SystemExit/KeyboardInterrupt; getattr can only raise
                # AttributeError when the option is absent from opts.
                try:
                    kwargs[opt] = getattr(opts, opt)
                except AttributeError:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']
            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            # Python 3 stdout takes text; Python 2 wants utf-8 bytes.
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = 'utf-8'
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
                if not text.endswith('\n'):
                    sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append('Line %i Col %i' % pos + ' ' +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write('\nParse errors:\n' + '\n'.join(errList) + '\n')
def clean_html(data, full=False, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib

    If full is False, only the contents inside <body> will be
    returned (without the <body> tags).
    """
    dom_tree = parser.parse(data) if full else parser.parseFragment(data)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)
    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values="always",
        sanitize=True,
    )
    return u"".join(html_serializer.serialize(stream))
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib

    If full is False, only the contents inside <body> will be
    returned (without the <body> tags).
    """
    dom_tree = parser.parse(data) if full else parser.parseFragment(data)
    # Pipe the token stream through the text sanitizer before serializing.
    walk = treewalkers.getTreeWalker('dom')
    stream = TextSanitizer(walk(dom_tree), **_filter_kwargs())
    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values='always',
    )
    return u''.join(html_serializer.serialize(stream))
def _normalized_style(style):
    """Reduce an inline style string to the few declarations we allow
    (font weight, italic, underline); everything else is dropped."""
    new_style = ''
    if style.find('normal') != -1:
        new_style += " font-weight:normal; "
    elif style.find('bold') != -1:
        new_style += " font-weight:bold; "
    if style.find('italic') != -1:
        new_style += " font-style: italic; "
    if style.find('underline') != -1:
        new_style += " text-decoration: underline; "
    return new_style


def html_sanitize(text):
    """Sanitize untrusted HTML.

    Runs *text* through html5lib's sanitizing tokenizer, then strips a
    blacklist of attributes with BeautifulSoup (rewriting 'style' to a
    small whitelist of declarations) and returns the <body> contents.
    Returns '' for empty/falsy input.
    """
    if not text:
        return ''
    p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    element = p.parseFragment(text)
    stream = getTreeWalker("etree")(element)
    rendered = serializer.HTMLSerializer().render(stream)
    rendered = UnicodeDammit(rendered, ["utf-8"])
    # Fix: the original list contained 'style' twice, causing a redundant
    # (idempotent) second rewrite pass; listed once here.
    REMOVE_ATTRIBUTES = [
        'lang', 'language', 'onmouseover', 'onmouseout', 'script', 'font',
        'style', 'dir', 'face', 'size', 'color', 'class', 'width', 'height',
        'hspace', 'border', 'valign', 'align', 'background', 'bgcolor',
        'text', 'link', 'vlink', 'alink', 'cellpadding', 'cellspacing', 'id']
    soup = BeautifulSoup(rendered.unicode_markup)
    for attribute in REMOVE_ATTRIBUTES:
        for tag in soup.findAll():
            if attribute == 'style':
                # 'style' is rewritten rather than removed outright.
                style = tag.attrs.get('style', None)
                if style:
                    tag.attrs['style'] = _normalized_style(style)
            else:
                # NOTE(review): assumes bs4, where deleting an absent
                # attribute is a no-op — confirm the installed version.
                del tag[attribute]
    html = soup.prettify('utf-8')
    # Return only the <body> contents when a body was produced.
    try:
        body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
    except IndexError:
        body = html
    return body
def scrub(feed_uri, data):
    """Clean up a parsed feed in place according to per-feed config.

    Removes configured untrusted fields, normalizes title/summary/content
    types, strips HTML from author names, handles future-dated entries,
    then resolves relative URIs and sanitizes HTML content values.
    (Python 2 code: relies on dict.has_key throughout.)
    """
    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        # any '*lang*' entry means the feed-level language field
        if tag.find('lang') >= 0:
            tag = 'language'
        if data.feed.has_key(tag):
            del data.feed[tag]
        for entry in data.entries:
            # drop the field plus its feedparser companions
            if entry.has_key(tag):
                del entry[tag]
            if entry.has_key(tag + "_detail"):
                del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"):
                del entry[tag + "_parsed"]
            # also scrub the tag out of every *_detail sub-dict
            for key in entry.keys():
                if not key.endswith('_detail'):
                    continue
                for detail in entry[key].copy():
                    if detail == tag:
                        del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        # map shorthand names (e.g. 'html') to full MIME types if known
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
                data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                    entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                        source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        # drop the offending timestamps, keep the entries themselves
        # NOTE(review): *_parsed appear to be time tuples compared
        # element-wise against gmtime() — confirm against feedparser.
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        # drop future-dated entries entirely
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries
                        if (not entry.has_key('published_parsed') or
                            not entry['published_parsed'] or
                            entry['published_parsed'] <= now) and
                           (not entry.has_key('updated_parsed') or
                            not entry['updated_parsed'] or
                            entry['updated_parsed'] <= now)]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            # pick the node holding the HTML payload for this key
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue
            # only (x)html-typed values with content are processed
            if not node.has_key('type'):
                continue
            if not 'html' in node['type']:
                continue
            if not node.has_key('value'):
                continue
            if node.has_key('base'):
                if scrub_xmlbase:
                    # override xml:base per configuration
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                                entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        # any other value is joined onto the existing base
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)
                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)
            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    # try to keep well-formed xhtml; fall back to html
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except:
                        node['type'] = 'text/html'
                if not doc:
                    # parse (and sanitize, via the tokenizer) as html
                    from html5lib import html5parser, treebuilders, sanitizer
                    p = html5parser.HTMLParser(
                        tree=treebuilders.getTreeBuilder('dom'),
                        tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')
                # re-serialize whichever DOM we ended up with
                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset=False)
                tree = xhtml.serialize(walker, encoding='utf-8')
                node['value'] = ''.join([str(token) for token in tree])
def serialize_html(input, options):
    """Serialize a JSON-described token stream to HTML with attributes
    emitted in alphabetical order."""
    opts = {str(key): value for key, value in options.items()}
    token_stream = AlphabeticalAttributesFilter(JsonWalker(input))
    html_serializer = serializer.HTMLSerializer(**opts)
    return html_serializer.render(token_stream, opts.get("encoding", None))
def setUp(self):
    """Build the fixtures: an html5lib serializer, an lxml tree walker,
    and an lxml XML parser with entity resolution turned off."""
    self.serializer = serializer.HTMLSerializer()
    self.treewalker = html5lib.getTreeWalker("lxml")
    self.parser = etree.XMLParser(resolve_entities=False)
def serialize_html(input, options):
    """Render a JSON-walker token stream as an HTML string using the
    given serializer options (keys coerced to str)."""
    opts = {str(key): value for key, value in options.items()}
    html_serializer = serializer.HTMLSerializer(**opts)
    return html_serializer.render(JsonWalker(input), opts.get("encoding", None))
# -*- coding:utf-8 -*- import re from django.utils.html import escape from html5lib import HTMLParser, serializer, treewalkers WWW_PATTERN = re.compile(r'(^|\s|\(|\[|\<|\:)www\.', re.UNICODE) FTP_PATTERN = re.compile(r'(^|\s|\(|\[|\<|\:)ftp\.', re.UNICODE) PROTOCOL_PATTERN = re.compile( r'(http://|ftp://|mailto:|https://)(.*?)([\.\,\?\!\)]*?)(\s|>|<|"|$)' ) _parser = HTMLParser() _parse = _parser.parseFragment _serializer = serializer.HTMLSerializer() _tree_walker = treewalkers.getTreeWalker('simpletree') _serialize = lambda doc: u''.join(_serializer.serialize(_tree_walker(doc)) ) if doc.childNodes else u'' def usertext(value): doc = _parse(value) def urlify(s): s = re.sub(WWW_PATTERN, r'\1http://www.', s) s = re.sub(FTP_PATTERN, r'\1ftp://ftp.', s) s = re.sub(PROTOCOL_PATTERN, r'<a href="\1\2">\1\2</a>\3\4', s) return s def has_parents(node, tags): if node is None:
def serialize_html(self, input, options):
    """Serialize a JSON-walker token stream and join the output chunks
    into a single unicode string. (Python 2: uses iteritems.)"""
    opts = {str(key): value for key, value in options.iteritems()}
    chunks = serializer.HTMLSerializer(**opts).serialize(
        JsonWalker(input), opts.get("encoding", None))
    return u''.join(chunks)