def html_is_valid(html: Optional[str], fragment: bool = False):
    """Return True when *html* parses cleanly under html5lib's strict mode.

    :param html: Markup to validate (a full document by default).
    :param fragment: Parse as a fragment instead of a full document.
    """
    from html5lib import HTMLParser  # type: ignore

    parser = HTMLParser(strict=True)
    try:
        # Strict mode raises on the first parse error.
        parse = parser.parseFragment if fragment else parser.parse
        parse(html)
    except Exception as exc:
        print(exc)
        return False
    return True
def get_dom(self, buf):
    """Parse *buf* into a sanitized DOM fragment; return None for blank input."""
    stripped = buf.strip()
    if not stripped:
        return None
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                        tokenizer=self.token_sanitizer())
    return parser.parseFragment(stripped)
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML
    is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    # Parse -> walk -> serialize round trip normalizes the markup.
    dom_tree = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                          **parser_kwargs).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(dom_tree)
    html_serializer = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(html_serializer.serialize(token_stream))
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML
    is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    fragment = parser.parseFragment(input)
    # Re-serialize the walked tree so the output is guaranteed well-formed.
    walk = treewalkers.getTreeWalker("dom")
    out = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(out.serialize(walk(fragment)))
def sanitize(content):
    """Strip disallowed tags/attributes from *content* and return unicode HTML."""
    dom = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                     tree=treebuilders.getTreeBuilder("dom")).parseFragment(content)
    stream = treewalkers.getTreeWalker("dom")(dom)
    html_out = serializer.HTMLSerializer(omit_optional_tags=False,
                                         quote_attr_values=True)
    return u''.join(html_out.serialize(stream))
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    parser = HTMLParser(tokenizer=HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    fragment = parser.parseFragment(html)
    # Walk the sanitized tree and serialize it back to markup.
    tokens = treewalkers.getTreeWalker("dom")(fragment)
    out = serializer.HTMLSerializer(omit_optional_tags=False,
                                    quote_attr_values=True)
    return u''.join(out.serialize(tokens))
def sanitize_input(chars):
    """
    html1 = "<b>shon</b>"
    html1 = "<b>shon</b><script>zzz</script>"
    print sanitize_input(html1)
    """
    # could use Beautiful Soup here instead
    parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    fragment = parser.parseFragment(chars)
    tokens = treewalkers.getTreeWalker("dom")(fragment)
    cleaned = ''.join(html_serializer.serialize(tokens))
    # BeautifulSoup is to convert <br> to <br />
    return str(BeautifulSoup(cleaned))
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf<script>alert("Uhoh!")</script><i></i></p><i>abc</i>'
    """
    # The sanitizing tokenizer drops disallowed tags/attributes during parsing.
    fragment = HTMLParser(tokenizer=HTMLSanitizer,
                          tree=treebuilders.getTreeBuilder("dom")).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    return "".join(HTMLSerializer(omit_optional_tags=False).serialize(token_stream))
def clean_html(input):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML
    is well-formed.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    # Parse, walk, and re-serialize: html5lib closes unbalanced tags for us.
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    tree = parser.parseFragment(input)
    walk = treewalkers.getTreeWalker("dom")
    out = HTMLSerializer(omit_optional_tags=False)
    return "".join(out.serialize(walk(tree)))
def apply_linkification(
    html: str,
    skip_tags: Optional[List[str]] = None,
) -> str:
    """Apply custom linkification filter to convert text patterns to links.

    :param html: HTML fragment to process.
    :param skip_tags: Tag names whose contents the filter should leave alone.
    """
    tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    token_stream = html5lib.getTreeWalker('etree')(tree)
    linkified = LinkifyFilter(token_stream, skip_tags)
    return HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    ).render(linkified)
def html_sanitize(text):
    """Sanitize *text* with html5lib, then strip presentation attributes with
    BeautifulSoup, keeping only a whitelisted subset of inline font styling.

    Returns the cleaned markup of the <body> (or the whole document if no
    <body> is found); '' for falsy input.
    """
    if not text:
        return ''
    # First pass: html5lib's sanitizing tokenizer drops disallowed tags.
    p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    element = p.parseFragment(text)
    walker = getTreeWalker("etree")
    stream = walker(element)
    s = serializer.HTMLSerializer()
    text = s.render(stream)
    # Normalize the encoding before handing the markup to BeautifulSoup.
    text = UnicodeDammit(text, ["utf-8"])
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','font','style',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing', 'id']
    soup = BeautifulSoup(text.unicode_markup)
    # Second pass: remove every listed attribute from every tag, except that
    # 'style' is rewritten rather than dropped, preserving only font-weight,
    # font-style and text-decoration hints.
    for attribute in REMOVE_ATTRIBUTES:
        for tag in soup.findAll():
            if(attribute == 'style'):
                new_style = ''
                style = tag.attrs.get('style', None)
                if style:
                    # NOTE: 'normal' is checked before 'bold' on purpose —
                    # a style containing both keeps font-weight:normal.
                    if style.find('normal') != -1:
                        new_style += " font-weight:normal; "
                    elif style.find('bold') != -1:
                        new_style += " font-weight:bold; "
                    if style.find('italic') != -1:
                        new_style += " font-style: italic; "
                    if style.find('underline') != -1:
                        new_style += " text-decoration: underline; "
                # Every tag ends up with a style attribute (possibly empty).
                tag.attrs['style'] = new_style
            else:
                del(tag[attribute])
    html = soup.prettify('utf-8')
    # Return only the body contents when the soup produced a full document.
    try:
        body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
    except IndexError:
        body = html
    return body
def _cleanTask(self, task, org):
    """Cleans the data given so that it can be safely stored as a task.

    Mutates *task* in place: 'description' is sanitized, 'time_to_complete'
    is converted to an int, and 'mentors', 'mentor_entities', 'types' and
    'tags' keys are (re)written with validated values.

    Args:
      task: Dictionary as constructed by the csv.DictReader().
      org: the GCIOrganization for which the task is created.

    Returns:
      A list of error messages if any have occurred.
    """
    errors = []

    # check title
    if not task['title']:
        errors.append('No valid title present.')

    # clean description
    try:
        # html5lib's sanitizing tokenizer strips tags such as <script>.
        parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
        parsed = parser.parseFragment(task['description'], encoding='utf-8')
        cleaned_string = ''.join(
            [tag.toxml() for tag in parsed.childNodes])
        task['description'] = cleaned_string.strip().replace('\r\n', '\n')
    except (HTMLParseError, ParseError, TypeError) as e:
        logging.warning('Cleaning of description failed with: %s', e)
        errors.append(
            'Failed to clean the description, do not use naughty HTML such as '
            '<script>.')

    # clean time to complete
    try:
        hours_to_complete = int(task['time_to_complete'])
        # Must be at least 2 days (48hrs)
        if hours_to_complete < 2 * 24:
            errors.append(
                'Time to complete must be at least 48 hrs, given was: %s' %
                hours_to_complete)
        else:
            task['time_to_complete'] = hours_to_complete
    except (ValueError, TypeError) as e:
        errors.append('No valid time to completion found, given was: %s.' %
                      task['time_to_complete'])

    # clean mentors: each id must resolve to an active mentor of this org.
    mentor_ids = set(task['mentors'].split(','))
    mentors = []
    mentor_entities = []
    for mentor_id in mentor_ids:
        q = GCIProfile.all()
        q.filter('link_id', mentor_id.strip())
        q.filter('mentor_for', org)
        q.filter('status', 'active')
        mentor = q.get()
        if mentor:
            mentors.append(mentor.key())
            mentor_entities.append(mentor)
        else:
            errors.append('%s is not a mentor.' % mentor_id)
    task['mentors'] = mentors
    task['mentor_entities'] = mentor_entities

    program_entity = org.program

    # clean task types: only types declared by the program are accepted.
    types = []
    for task_type in set(task['task_type'].split(',')):
        task_type = task_type.strip()
        if task_type in program_entity.task_types:
            types.append(task_type)
        else:
            errors.append('%s is not a valid task type.' % task_type)
    task['types'] = types

    # clean task tags: free-form, only whitespace-trimmed.
    tags = []
    for tag in set(task['arbit_tag'].split(',')):
        tags.append(tag.strip())
    task['tags'] = tags

    return errors
def test_self_closing_col():
    """A self-closing <col /> inside <colgroup> must not produce parse errors."""
    p = HTMLParser()
    p.parseFragment('<table><colgroup><col /></colgroup></table>')
    assert not p.errors
def _cleanTask(self, task, org):
    """Cleans the data given so that it can be safely stored as a task.

    Mutates *task* in place: 'description' is sanitized, 'time_to_complete'
    is converted to an int, and 'mentors', 'mentor_entities', 'types' and
    'tags' keys are (re)written with validated values.

    Args:
      task: Dictionary as constructed by the csv.DictReader().
      org: the GCIOrganization for which the task is created.

    Returns:
      A list of error messages if any have occurred.
    """
    errors = []

    # check title
    if not task['title']:
        errors.append('No valid title present.')

    # clean description
    try:
        # html5lib's sanitizing tokenizer strips tags such as <script>.
        parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
        parsed = parser.parseFragment(task['description'], encoding='utf-8')
        cleaned_string = ''.join([tag.toxml() for tag in parsed.childNodes])
        task['description'] = cleaned_string.strip().replace('\r\n', '\n')
    except (HTMLParseError, ParseError, TypeError) as e:
        logging.warning('Cleaning of description failed with: %s', e)
        errors.append(
            'Failed to clean the description, do not use naughty HTML such as '
            '<script>.')

    # clean time to complete
    try:
        hours_to_complete = int(task['time_to_complete'])
        # Must be at least 2 days (48hrs)
        if hours_to_complete < 2*24:
            errors.append('Time to complete must be at least 48 hrs, given was: %s'
                          % hours_to_complete)
        else:
            task['time_to_complete'] = hours_to_complete
    except (ValueError, TypeError) as e:
        errors.append('No valid time to completion found, given was: %s.' %
                      task['time_to_complete'])

    # clean mentors: each id must resolve to an active mentor of this org.
    mentor_ids = set(task['mentors'].split(','))
    mentors = []
    mentor_entities = []
    for mentor_id in mentor_ids:
        q = GCIProfile.all()
        q.filter('link_id', mentor_id.strip())
        q.filter('mentor_for', org)
        q.filter('status', 'active')
        mentor = q.get()
        if mentor:
            mentors.append(mentor.key())
            mentor_entities.append(mentor)
        else:
            errors.append('%s is not a mentor.' % mentor_id)
    task['mentors'] = mentors
    task['mentor_entities'] = mentor_entities

    program_entity = org.program

    # clean task types: only types declared by the program are accepted.
    types = []
    for task_type in set(task['task_type'].split(',')):
        task_type = task_type.strip()
        if task_type in program_entity.task_types:
            types.append(task_type)
        else:
            errors.append('%s is not a valid task type.' % task_type)
    task['types'] = types

    # clean task tags: free-form, only whitespace-trimmed.
    tags = []
    for tag in set(task['arbit_tag'].split(',')):
        tags.append(tag.strip())
    task['tags'] = tags

    return errors
def as_unicode(cls, events):
    """Yield each row of *events* with every cell sanitized and rendered as XML.

    Cells are assumed to be utf-8 byte strings; rows are mutated in place.
    """
    parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    for row in events:
        for col, value in enumerate(row):
            row[col] = parser.parseFragment(unicode(value, 'utf-8')).toxml()
        yield row
def parse(self, value):
    """Parse *value* as an HTML fragment and return the resulting DOM tree."""
    builder = treebuilders.getTreeBuilder("dom")
    return HTMLParser(tree=builder).parseFragment(value)