Ejemplo n.º 1
0
def html_is_valid(html: Optional[str], fragment: bool = False):
    """Return True when *html* parses without error under html5lib strict mode.

    :param html: Markup to validate.
    :param fragment: When True, parse as a document fragment instead of a
        whole document.
    :returns: True on a clean parse, False when the parser raised.
    """
    from html5lib import HTMLParser  # type: ignore

    parser = HTMLParser(strict=True)
    parse = parser.parseFragment if fragment else parser.parse
    try:
        parse(html)
    except Exception as e:
        print(e)
        return False
    return True
Ejemplo n.º 2
0
Archivo: html.py Proyecto: riffm/iktomi
 def get_dom(self, buf):
     """Parse *buf* as an HTML fragment and return its DOM tree.

     Returns None when the stripped input is empty; otherwise runs the
     fragment through this instance's sanitizing tokenizer.
     """
     stripped = buf.strip()
     if not stripped:
         return None
     parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                         tokenizer=self.token_sanitizer())
     return parser.parseFragment(stripped)
Ejemplo n.º 3
0
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is not None:
            # older html5lib API: sanitization lives in the parser's tokenizer
            parser_kwargs['tokenizer'] = HTMLSanitizer
        else:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True

    dom_tree = HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs
    ).parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)

    html_serializer = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(html_serializer.serialize(stream))
Ejemplo n.º 4
0
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_options = {}
    serializer_options = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_options["sanitize"] = True
        else:
            parser_options["tokenizer"] = HTMLSanitizer

    builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=builder, **parser_options).parseFragment(input)
    walk = treewalkers.getTreeWalker("dom")
    token_stream = walk(fragment)

    out = HTMLSerializer(omit_optional_tags=False, **serializer_options)
    return "".join(out.serialize(token_stream))
Ejemplo n.º 5
0
def sanitize(content):
    """Strip unsafe tags/attributes from *content* via html5lib's sanitizer
    and return the re-serialized markup as unicode."""
    dom_parser = HTMLParser(tokenizer = sanitizer.HTMLSanitizer,
                            tree = treebuilders.getTreeBuilder("dom"))
    fragment = dom_parser.parseFragment(content)
    walk = treewalkers.getTreeWalker("dom")
    html_serializer = serializer.HTMLSerializer(omit_optional_tags = False,
                                                quote_attr_values = True)
    return u''.join(html_serializer.serialize(walk(fragment)))
Ejemplo n.º 6
0
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tokenizer=HTMLSanitizer, tree=builder).parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    pieces = html_serializer.serialize(token_stream)
    return u''.join(pieces)
Ejemplo n.º 7
0
def sanitize_html(html):
    """Run *html* through html5lib's sanitizing tokenizer and re-serialize it."""
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                        tokenizer=HTMLSanitizer)
    tree = parser.parseFragment(html)
    walk = treewalkers.getTreeWalker("dom")
    out = serializer.HTMLSerializer(omit_optional_tags=False,
                                    quote_attr_values=True)
    return u''.join(out.serialize(walk(tree)))
Ejemplo n.º 8
0
def sanitize_input(chars):
    """Sanitize an HTML fragment with html5lib, then normalize the markup
    through BeautifulSoup.

    Example (Python 2)::

        html1 = "<b>shon</b><script>zzz</script>"
        print sanitize_input(html1)

    :param chars: Raw, possibly hostile, HTML input.
    :returns: Sanitized markup as a str.
    """
    # could use Beautiful Soup here instead
    p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    dom_tree = p.parseFragment(chars)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    # join() consumes the serializer's generator directly -- the original's
    # pass-through generator expression (''.join(i for i in gen)) was redundant
    out = ''.join(s.serialize(stream))
    return str(BeautifulSoup(out))  # BeautifulSoup is to convert <br> to <br />
Ejemplo n.º 9
0
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    tree = HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder).parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(tree)

    return "".join(HTMLSerializer(omit_optional_tags=False).serialize(stream))
Ejemplo n.º 10
0
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                        tokenizer=HTMLSanitizer)
    fragment = parser.parseFragment(input)
    walk = treewalkers.getTreeWalker("dom")
    html_serializer = HTMLSerializer(omit_optional_tags=False)

    return "".join(html_serializer.serialize(walk(fragment)))
Ejemplo n.º 11
0
def clean_html(input):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    dom_tree = HTMLParser(tree=treebuilders.getTreeBuilder("dom")).parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)

    html_serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(stream))
Ejemplo n.º 12
0
def clean_html(input):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    builder = treebuilders.getTreeBuilder("dom")
    parser = HTMLParser(tree=builder)
    fragment = parser.parseFragment(input)
    walk = treewalkers.getTreeWalker("dom")

    out = HTMLSerializer(omit_optional_tags=False)
    return "".join(out.serialize(walk(fragment)))
Ejemplo n.º 13
0
def apply_linkification(
    html: str,
    skip_tags: Optional[List[str]] = None,
) -> str:
    """Apply custom linkification filter to convert text patterns to links.

    Parses *html* as a namespace-free fragment, streams it through
    LinkifyFilter (which may skip the tags listed in *skip_tags*), and
    re-serializes the result.
    """
    tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    token_stream = html5lib.getTreeWalker('etree')(tree)

    linkified_stream = LinkifyFilter(token_stream, skip_tags)

    output_serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )
    return output_serializer.render(linkified_stream)
Ejemplo n.º 14
0
def html_sanitize(text):
	"""Sanitize *text* in two passes: html5lib's sanitizer first, then a
	BeautifulSoup pass that strips a blacklist of attributes, finally keeping
	only the <body> content of the prettified output.

	Returns '' for falsy input.
	NOTE(review): relies on Python 2-era html5lib/BeautifulSoup APIs
	(``tokenizer=``, ``UnicodeDammit.unicode_markup``) -- confirm library
	versions before modifying.
	"""
	if not text:
		return ''
	# Pass 1: html5lib sanitizer drops disallowed tags/attributes, then the
	# etree walker + serializer turn the fragment back into markup.
	p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
	element = p.parseFragment(text)
	walker = getTreeWalker("etree")
	stream = walker(element)
	s = serializer.HTMLSerializer()
	text = s.render(stream)
	# Normalize encoding; UnicodeDammit guesses starting from utf-8.
	text = UnicodeDammit(text, ["utf-8"])
	# Attributes removed outright ('style' is special-cased below).
	# NOTE(review): 'style' appears twice in this list, so the style branch
	# runs twice per tag -- redundant but harmless since it is idempotent.
	REMOVE_ATTRIBUTES = [
		'lang','language','onmouseover','onmouseout','script','font','style',
		'dir','face','size','color','style','class','width','height','hspace',
		'border','valign','align','background','bgcolor','text','link','vlink',
		'alink','cellpadding','cellspacing', 'id']

	soup = BeautifulSoup(text.unicode_markup)
	for attribute in REMOVE_ATTRIBUTES:
		for tag in soup.findAll():

			if(attribute == 'style'):
				# Rebuild 'style' keeping only whitelisted hints
				# (weight/italic/underline) found by substring search in the
				# old value; everything else in the style is discarded.
				new_style = ''
				style = tag.attrs.get('style', None)
				if style:
					if style.find('normal') != -1: new_style += " font-weight:normal; "
					elif style.find('bold') != -1: new_style += " font-weight:bold; "
					if style.find('italic') != -1: new_style += " font-style: italic; "
					if style.find('underline') != -1: new_style += " text-decoration: underline; "
					tag.attrs['style'] = new_style

			else:
				# Drop the attribute from every tag. NOTE(review): presumably
				# a no-op when the tag lacks it -- depends on the BeautifulSoup
				# version's __delitem__ behavior; verify.
				del(tag[attribute])

	html = soup.prettify('utf-8')
	try:
		# prettify() wraps output in <html><body>...; keep only the body.
		body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
	except IndexError:
		body = html
	return body
Ejemplo n.º 15
0
    def _cleanTask(self, task, org):
        """Cleans the data given so that it can be safely stored as a task.

        Mutates *task* in place (description, time_to_complete, mentors,
        mentor_entities, types, tags) and collects human-readable problems.

        Args:
          task: Dictionary as constructed by the csv.DictReader().
          org: the GCIOrganization for which the task is created.

        Returns:
          A list of error messages if any have occurred (empty when clean).
        """

        errors = []

        # check title
        if not task['title']:
            errors.append('No valid title present.')

        # clean description: sanitize through html5lib and re-serialize the
        # resulting DOM child nodes back to markup.
        try:
            parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
            parsed = parser.parseFragment(task['description'],
                                          encoding='utf-8')
            cleaned_string = ''.join(
                [tag.toxml() for tag in parsed.childNodes])
            # normalize Windows line endings
            task['description'] = cleaned_string.strip().replace('\r\n', '\n')
        except (HTMLParseError, ParseError, TypeError) as e:
            logging.warning('Cleaning of description failed with: %s', e)
            errors.append(
                'Failed to clean the description, do not use naughty HTML such as '
                '<script>.')

        # clean time to complete (CSV gives a string; coerce to int)
        try:
            hours_to_complete = int(task['time_to_complete'])

            # Must be at least 2 days (48hrs)
            if hours_to_complete < 2 * 24:
                errors.append(
                    'Time to complete must be at least 48 hrs, given was: %s' %
                    hours_to_complete)
            else:
                task['time_to_complete'] = hours_to_complete
        except (ValueError, TypeError) as e:
            errors.append('No valid time to completion found, given was: %s.' %
                          task['time_to_complete'])

        # clean mentors: comma-separated link_ids, deduplicated via set
        mentor_ids = set(task['mentors'].split(','))

        mentors = []
        mentor_entities = []
        for mentor_id in mentor_ids:
            # look up an active mentor profile for this org by link_id
            q = GCIProfile.all()
            q.filter('link_id', mentor_id.strip())
            q.filter('mentor_for', org)
            q.filter('status', 'active')
            mentor = q.get()
            if mentor:
                mentors.append(mentor.key())
                mentor_entities.append(mentor)
            else:
                errors.append('%s is not a mentor.' % mentor_id)

        task['mentors'] = mentors
        task['mentor_entities'] = mentor_entities

        program_entity = org.program

        # clean task types: only types declared by the program are accepted
        types = []
        for task_type in set(task['task_type'].split(',')):
            task_type = task_type.strip()
            if task_type in program_entity.task_types:
                types.append(task_type)
            else:
                errors.append('%s is not a valid task type.' % task_type)
        task['types'] = types

        # clean task tags: free-form, just split and strip
        tags = []
        for tag in set(task['arbit_tag'].split(',')):
            tags.append(tag.strip())
        task['tags'] = tags

        return errors
Ejemplo n.º 16
0
def test_self_closing_col():
    """A self-closing <col /> inside <colgroup> must parse without errors."""
    html_parser = HTMLParser()
    html_parser.parseFragment('<table><colgroup><col /></colgroup></table>')
    assert not html_parser.errors
Ejemplo n.º 17
0
  def _cleanTask(self, task, org):
    """Cleans the data given so that it can be safely stored as a task.

    Mutates *task* in place (description, time_to_complete, mentors,
    mentor_entities, types, tags) and collects human-readable problems.

      Args:
        task: Dictionary as constructed by the csv.DictReader().
        org: the GCIOrganization for which the task is created.

      Returns:
          A list of error messages if any have occurred (empty when clean).
    """

    errors = []

    # check title
    if not task['title']:
      errors.append('No valid title present.')

    # clean description: sanitize through html5lib and re-serialize the
    # resulting DOM child nodes back to markup.
    try:
      parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
      parsed = parser.parseFragment(task['description'], encoding='utf-8')
      cleaned_string = ''.join([tag.toxml() for tag in parsed.childNodes])
      # normalize Windows line endings
      task['description'] = cleaned_string.strip().replace('\r\n', '\n')
    except (HTMLParseError, ParseError, TypeError) as e:
      logging.warning('Cleaning of description failed with: %s', e)
      errors.append(
          'Failed to clean the description, do not use naughty HTML such as '
          '<script>.')

    # clean time to complete (CSV gives a string; coerce to int)
    try:
      hours_to_complete = int(task['time_to_complete'])

      # Must be at least 2 days (48hrs)
      if hours_to_complete < 2*24:
        errors.append('Time to complete must be at least 48 hrs, given was: %s'
                      % hours_to_complete)
      else:
        task['time_to_complete'] = hours_to_complete
    except (ValueError, TypeError) as e:
      errors.append('No valid time to completion found, given was: %s.'
                    % task['time_to_complete'])

    # clean mentors: comma-separated link_ids, deduplicated via set
    mentor_ids = set(task['mentors'].split(','))

    mentors = []
    mentor_entities = []
    for mentor_id in mentor_ids:
      # look up an active mentor profile for this org by link_id
      q = GCIProfile.all()
      q.filter('link_id', mentor_id.strip())
      q.filter('mentor_for', org)
      q.filter('status', 'active')
      mentor = q.get()
      if mentor:
        mentors.append(mentor.key())
        mentor_entities.append(mentor)
      else:
        errors.append('%s is not a mentor.' % mentor_id)

    task['mentors'] = mentors
    task['mentor_entities'] = mentor_entities

    program_entity = org.program

    # clean task types: only types declared by the program are accepted
    types = []
    for task_type in set(task['task_type'].split(',')):
      task_type = task_type.strip()
      if task_type in program_entity.task_types:
        types.append(task_type)
      else:
        errors.append('%s is not a valid task type.' % task_type)
    task['types'] = types

    # clean task tags: free-form, just split and strip
    tags = []
    for tag in set(task['arbit_tag'].split(',')):
      tags.append(tag.strip())
    task['tags'] = tags

    return errors
Ejemplo n.º 18
0
def test_self_closing_col():
    # Regression: a self-closing <col /> should not be flagged as a parse error.
    p = HTMLParser()
    p.parseFragment('<table><colgroup><col /></colgroup></table>')
    assert not p.errors
Ejemplo n.º 19
0
 def as_unicode(cls, events):
     """Yield each row of *events* with every cell sanitized and serialized
     to XML markup.

     NOTE(review): uses the Python 2 ``unicode`` builtin -- py2-only code.
     """
     html_parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
     for row in events:
         for idx, cell in enumerate(row):
             row[idx] = html_parser.parseFragment(unicode(cell, 'utf-8')).toxml()
         yield row
Ejemplo n.º 20
0
 def parse(self, value):
     """Parse *value* as an HTML fragment and return the resulting DOM tree."""
     dom_parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
     return dom_parser.parseFragment(value)