Example #1
0
def scrubstring(string):
    """Sanitize an HTML string with ``Scrubber`` and return UTF-8 bytes.

    Args:
        string: The markup to clean. Byte strings are decoded as UTF-8
            (which also covers pure ASCII); text that is already decoded
            is handed to the scrubber unchanged.

    Returns:
        The output of ``Scrubber(autolink=True).scrub`` encoded as UTF-8.

    Raises:
        UnicodeDecodeError: If ``string`` is a byte string that is not
            valid UTF-8.
    """
    # Imported at call time rather than at module import, as before.
    from scrubber import Scrubber

    # ASCII is a strict subset of UTF-8, so a single UTF-8 decode replaces
    # the former ascii-then-utf-8 fallback. Already-decoded text is left
    # alone: calling .decode() on it either does not exist (Python 3) or
    # triggers an implicit ASCII encode that raises on non-ASCII characters
    # (Python 2).
    if isinstance(string, bytes):
        string = string.decode('utf-8')

    return Scrubber(autolink=True).scrub(string).encode('utf-8')
Example #2
0
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks of ``Scrubber.scrub`` output."""

    # Each entry is an (input HTML, expected scrubbed output) pair that
    # testScrubber feeds through a Scrubber built with no arguments.
    tests = (
        ( # Invalid HTML
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            # With BeautifulSoup 3.1.x the expected result is an empty string.
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        ( # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        ( # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        ( # No encoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        ( # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        ( # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        ( # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""
        ),
        ( # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""
        ),
        ( # Encoded script (decimal)
            """<span style="&#97;&#110;&#121;&#58;&#32;&#101;&#120;&#112;&#114;&#101;&#115;&#115;&#105;&#111;&#110;&#40;&#119;&#105;&#110;&#100;&#111;&#119;&#46;&#108;&#111;&#99;&#97;&#116;&#105;&#111;&#110;&#61;&#39;&#104;&#116;&#116;&#112;&#58;&#47;&#47;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;&#47;&#39;&#41;">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Encoded script (hex)
            """<span style="&#x61;&#x6e;&#x79;&#x3a;&#x20;&#x65;&#x78;&#x70;&#x72;&#x65;&#x73;&#x73;&#x69;&#x6f;&#x6e;&#x28;&#x77;&#x69;&#x6e;&#x64;&#x6f;&#x77;&#x2e;&#x6c;&#x6f;&#x63;&#x61;&#x74;&#x69;&#x6f;&#x6e;&#x3d;&#x27;&#x68;&#x74;&#x74;&#x70;&#x3a;&#x2f;&#x2f;&#x65;&#x78;&#x61;&#x6d;&#x70;&#x6c;&#x65;&#x2e;&#x6f;&#x72;&#x67;&#x2f;&#x27;&#x29;">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""
        ),
        ( # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&amp;downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&amp;allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        ( # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""
        ),
        ( # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers  <p>foo</p><em>bar</em>"""
        ),
        ( # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""
        ),
        ( # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        ( # Remove comments
            "Foo <!-- bar -->",
            "Foo "
        ),
        ( # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        ( # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"
        ),
        ( # Character entities shouldn't get autolinked
            """http://www.google.com&nbsp;&nbsp;""",
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a>&nbsp;&nbsp;"""
        ),
        ( # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        ( # Test mailto: links
            # NOTE(review): the address text below looks like residue of an
            # e-mail obfuscation filter rather than a real address -- confirm
            # against the upstream test data.
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        ( # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""
        ),
        ( # Make sure keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"
        ),
    )

    def setUp(self):
        """Build the Scrubber (constructed with no arguments) under test."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        """Scrub every input in ``tests`` and compare with its expected output."""
        for html, expected in self.tests:
            # assertEqual replaces the deprecated failUnlessEqual alias,
            # which newer unittest releases no longer provide.
            self.assertEqual(self.scrubber.scrub(html), expected)
Example #3
0
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks of ``Scrubber.scrub`` output."""

    # Each entry is an (input HTML, expected scrubbed output) pair that
    # testScrubber feeds through a Scrubber built with no arguments.
    tests = (
        (  # Invalid HTML
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            # With BeautifulSoup 3.1.x the expected result is an empty string.
            "" if BeautifulSoup.__version__.startswith('3.1') else
            """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        (  # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        (  # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        (  # No encoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        (  # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        (  # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        (  # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""", """<br />"""),
        (  # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""),
        (  # Encoded script (decimal)
            """<span style="&#97;&#110;&#121;&#58;&#32;&#101;&#120;&#112;&#114;&#101;&#115;&#115;&#105;&#111;&#110;&#40;&#119;&#105;&#110;&#100;&#111;&#119;&#46;&#108;&#111;&#99;&#97;&#116;&#105;&#111;&#110;&#61;&#39;&#104;&#116;&#116;&#112;&#58;&#47;&#47;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;&#47;&#39;&#41;">safe</span>""",
            """<span>safe</span>"""),
        (  # Encoded script (hex)
            """<span style="&#x61;&#x6e;&#x79;&#x3a;&#x20;&#x65;&#x78;&#x70;&#x72;&#x65;&#x73;&#x73;&#x69;&#x6f;&#x6e;&#x28;&#x77;&#x69;&#x6e;&#x64;&#x6f;&#x77;&#x2e;&#x6c;&#x6f;&#x63;&#x61;&#x74;&#x69;&#x6f;&#x6e;&#x3d;&#x27;&#x68;&#x74;&#x74;&#x70;&#x3a;&#x2f;&#x2f;&#x65;&#x78;&#x61;&#x6d;&#x70;&#x6c;&#x65;&#x2e;&#x6f;&#x72;&#x67;&#x2f;&#x27;&#x29;">safe</span>""",
            """<span>safe</span>"""),
        (  # Test unicode
            u"""Mitä kuuluu""", u"""Mitä kuuluu"""),
        (  # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&amp;downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&amp;allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        (  # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""),
        (  # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers  <p>foo</p><em>bar</em>"""),
        (  # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""),
        (  # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        (  # Remove comments
            "Foo <!-- bar -->", "Foo "),
        (  # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        (  # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>", "Foo"),
        (  # Character entities shouldn't get autolinked
            """http://www.google.com&nbsp;&nbsp;""",
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a>&nbsp;&nbsp;"""
        ),
        (  # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        (  # Test mailto: links
            # NOTE(review): the address text below looks like residue of an
            # e-mail obfuscation filter rather than a real address -- confirm
            # against the upstream test data.
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        (  # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""),
        (  # Make sure keeping content for incomplete tags works
            "<blink><br><br>", "<br /><br />"),
    )

    def setUp(self):
        """Build the Scrubber (constructed with no arguments) under test."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        """Scrub every input in ``tests`` and compare with its expected output."""
        for html, expected in self.tests:
            # assertEqual replaces the deprecated failUnlessEqual alias,
            # which newer unittest releases no longer provide.
            self.assertEqual(self.scrubber.scrub(html), expected)