def scrubstring(string):
    """Scrub an HTML string and return the result encoded as UTF-8.

    ``string`` may be a byte string (UTF-8, which includes plain ASCII) or
    text that has already been decoded.  Byte input is decoded before it is
    handed to a ``Scrubber`` created with ``autolink=True``; decoded text is
    passed through untouched.

    Raises ``UnicodeDecodeError`` if a byte string is not valid UTF-8.
    """
    from scrubber import Scrubber

    scrubber = Scrubber(autolink=True)
    # Only byte input needs decoding.  Calling .decode() on already-decoded
    # text either forces an implicit ASCII encode first (Python 2, failing
    # with UnicodeEncodeError on non-ASCII characters) or does not exist at
    # all (Python 3 str).
    if isinstance(string, (bytes, bytearray)):
        # ASCII is a strict subset of UTF-8, so a single UTF-8 decode covers
        # both cases the old ascii-then-utf-8 fallback handled.
        string = string.decode('utf-8')
    string = scrubber.scrub(string)
    return string.encode('utf-8')
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks for ``Scrubber.scrub``.

    ``tests`` is a tuple of ``(input_html, expected_output)`` pairs; each
    input is run through a default-constructed ``Scrubber`` and the result
    must equal the expected string exactly.
    """

    # NOTE(review): another ScrubberTestCase with the same cases is defined
    # further down in this view.  If both live in the same module, the later
    # definition rebinds the name and this one is never collected -- confirm
    # which copy should stay.

    tests = (
        (   # Invalid HTML (expected output depends on the BeautifulSoup version)
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        (   # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        (   # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        (   # No encoding of pre-encoded URLs during autolink
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        (   # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        (   # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        (   # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""
        ),
        (   # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""
        ),
        (   # Encoded script (decimal)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        (   # Encoded script (hex)
            # NOTE(review): this input is byte-identical to the decimal case
            # above; the entity encoding may have been lost somewhere --
            # confirm against the intended fixture.
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        (   # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""
        ),
        (   # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        (   # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""
        ),
        (   # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers <p>foo</p><em>bar</em>"""
        ),
        (   # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""
        ),
        (   # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        (   # Remove comments
            "Foo <!-- bar -->",
            "Foo "
        ),
        (   # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        (   # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"
        ),
        (   # Character entities shouldn't get autolinked
            """http://www.google.com """,
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a> """
        ),
        (   # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        (   # Test mailto: links
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        (   # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""
        ),
        (   # Make sure keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"
        ),
    )

    def setUp(self):
        """Create a fresh, default-configured Scrubber for each test."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        """Scrub every input in ``tests`` and compare with its expected output."""
        for html, expected in self.tests:
            # assertEqual replaces the deprecated failUnlessEqual alias,
            # which no longer exists in Python 3.12+.
            self.assertEqual(self.scrubber.scrub(html), expected)
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks for ``Scrubber.scrub``.

    ``tests`` holds ``(input_html, expected_output)`` pairs.  Every input is
    passed through a default-constructed ``Scrubber`` and the scrubbed
    result must match the expected string exactly.
    """

    # NOTE(review): a class with this same name and the same cases also
    # appears earlier in this view.  If both are in one module, this later
    # definition rebinds the name and shadows the earlier one -- confirm
    # whether the duplicate is intentional.

    tests = (
        (   # Invalid HTML (expected output depends on the BeautifulSoup version)
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        (   # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        (   # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        (   # No encoding of pre-encoded URLs during autolink
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        (   # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        (   # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        (   # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""),
        (   # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""),
        (   # Encoded script (decimal)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""),
        (   # Encoded script (hex)
            # NOTE(review): this input is byte-identical to the decimal case
            # above; the entity encoding may have been lost somewhere --
            # confirm against the intended fixture.
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""),
        (   # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""),
        (   # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        (   # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""),
        (   # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers <p>foo</p><em>bar</em>"""),
        (   # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""),
        (   # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        (   # Remove comments
            "Foo <!-- bar -->",
            "Foo "),
        (   # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        (   # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"),
        (   # Character entities shouldn't get autolinked
            """http://www.google.com """,
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a> """
        ),
        (   # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        (   # Test mailto: links
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        (   # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""),
        (   # Make sure keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"),
    )

    def setUp(self):
        """Create a fresh, default-configured Scrubber for each test."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        """Scrub every input in ``tests`` and compare with its expected output."""
        for html, expected in self.tests:
            # failUnlessEqual is a deprecated alias that was removed in
            # Python 3.12; assertEqual is the supported spelling everywhere.
            self.assertEqual(self.scrubber.scrub(html), expected)