Esempio n. 1
0
def scrubstring(string):
    """Run ``string`` through ``Scrubber(autolink=True).scrub``.

    ``string`` may be a byte string, which is decoded as UTF-8 (ASCII is a
    subset of UTF-8, so no separate ASCII attempt is needed), or an
    already-decoded unicode string, which is passed on unchanged.

    Returns the scrubbed text encoded as UTF-8.

    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    from scrubber import Scrubber

    # Only non-unicode input needs decoding.  Calling ``.decode`` on a unicode
    # object makes Python 2 implicitly encode it to ASCII first, which raises
    # UnicodeEncodeError as soon as it contains a non-ASCII character.
    if not isinstance(string, type(u'')):
        string = string.decode('utf-8')

    scrubbed = Scrubber(autolink=True).scrub(string)
    return scrubbed.encode('utf-8')
Esempio n. 2
0
def main():
    """Compare the PERSON annotations of one document with Scrubber's markings.

    Loads the JSON file named by ``args.inpf``, takes its ``"signal"`` text
    and the ``"annots"`` of the annotation set whose ``"type"`` is
    ``"PERSON"`` (the last such set wins if there are several), and walks the
    text line by line.  While annotations remain, the document-level offsets
    are rebased to the current line and handed, together with the markings
    returned by ``Scrubber.dry_clean(line)[1]``, to ``check_converage``.
    Lines that come after the last annotation has been consumed are not
    checked.  Finally the module-level document counters are printed.
    """
    # Use a context manager so the input file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open(args.inpf) as inp_file:
        inp_json = json.load(inp_file)

    text = inp_json["signal"]
    annots = []
    for aset in inp_json["asets"]:
        if aset["type"] == "PERSON":
            annots = aset["annots"]
    # Order by start offset (element 0) so annotations can be consumed from
    # the front while the text is scanned top to bottom.
    annots.sort(key=lambda k: k[0])
    text = text.encode("utf-8").decode("utf-8")
    text = text.strip()
    # Each CRLF becomes "^^\n": after splitting on "\n" the line keeps two
    # placeholder characters where the two-character CRLF was, so len(line)
    # still counts the line ending.
    # NOTE(review): for bare "\n" endings and for skipped empty lines nothing
    # is added to processed_length for the newline itself, and strip() above
    # may remove leading characters -- confirm offsets stay aligned for such
    # inputs.
    text = text.replace("\r\n", "^^\n")

    current_annot = annots[0] if annots else None
    processed_length = 0
    for line in text.split('\n'):
        if not line:
            continue
        script_annots = Scrubber.dry_clean(line)[1]
        if not current_annot:
            # No annotations left to compare against.
            continue
        line_len = len(line)
        line_annots = []
        # Collect every remaining annotation that ends within this line,
        # rebased so that offset 0 is the start of the line.
        while current_annot:
            start = current_annot[0] - processed_length
            end = current_annot[1] - processed_length
            if start > line_len or end > line_len:
                break
            annots.pop(0)
            line_annots.append([start, end])
            current_annot = annots[0] if annots else None
        processed_length += line_len
        line_annots = merge_consecutive_markings(line_annots)
        script_annots = merge_consecutive_markings(script_annots)
        check_converage(line, line_annots, script_annots)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Total number of docs: %s" % total_number_of_docs)
    print("Docs with no PIIs / all PIIs detected: %s" % docs_with_all_pii_detected)
    print("Docs with only partially missed PIIs: %s" % docs_with_partially_detected_pii)
    print("Docs with fully missed PIIs: %s" % docs_with_full_missed_pii)
Esempio n. 3
0
import os
from scrubber import Scrubber
import BeautifulSoup
#import BeautifulSoup

# Initialise the scrubber. Everything below overrides Scrubber's defaults, so adjust it freely.
scrubber = Scrubber(autolink=False)
scrubber.allowed_tags = set((
    'a',
    'abbr',
    'acronym',
    'b',
    'bdo',
    'big',
    'blockquote',
    'br',
    'center',
    'cite',
    'code',
    'dd',
    'del',
    'dfn',
    'div',
    'dl',
    'dt',
    'em',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
import os
from scrubber import Scrubber
import BeautifulSoup
#import BeautifulSoup

# Build the module-wide scrubber. Every assignment below replaces one of
# Scrubber's defaults, so edit the lists freely.
scrubber = Scrubber(autolink=False)

# Tag names assigned to the scrubber's `allowed_tags` set.
scrubber.allowed_tags = set(
    'a abbr acronym b bdo big blockquote br center cite code '
    'dd del dfn div dl dt em h1 h2 h3 h4 h5 h6 hr i img ins '
    'kbd li ol param pre p q s samp small span strike strong sub sup '
    'table tbody td th thead tr tt ul u var wbr'.split()
)

# Tag names assigned to `disallowed_tags_save_content`.
scrubber.disallowed_tags_save_content = set(['blink', 'body', 'html', 'font'])

# Bad attributes, deliberately not listed: 'allowscriptaccess', 'xmlns', 'target'
scrubber.allowed_attributes = set(
    # General-purpose attributes.
    ['align', 'alt', 'border', 'cite', 'dir',
     'height', 'href', 'src', 'title', 'type', 'width']
    # Font tags.
    + ['face', 'size']
    # Not sure about flashvars - whether any harm can come from it.
    + ['flashvars']
    # FF needs the classid on object tags for flash.
    + ['classid']
    # For flash embed param tags; could be limited to just param if this is
    # harmful.
    + ['name', 'value', 'quality', 'data', 'scale']
    + ['salign', 'wmode']
)

scrubber.normalized_tag_replacements = dict(b='strong', i='em')
# Any giveaway classes that definitely identify a footer.
Esempio n. 5
0
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks of ``Scrubber.scrub`` output.

    Each entry of ``tests`` is an ``(input_html, expected_output)`` pair;
    ``testScrubber`` feeds every input through a default-constructed
    ``Scrubber`` and compares the result with the expected string.
    """

    tests = (
        ( # Invalid HTML (expected output is empty when the BeautifulSoup
          # version string starts with '3.1')
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        ( # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        ( # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        ( # No encoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        ( # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        ( # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        ( # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""
        ),
        ( # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""
        ),
        ( # Encoded script (decimal)
            """<span style="&#97;&#110;&#121;&#58;&#32;&#101;&#120;&#112;&#114;&#101;&#115;&#115;&#105;&#111;&#110;&#40;&#119;&#105;&#110;&#100;&#111;&#119;&#46;&#108;&#111;&#99;&#97;&#116;&#105;&#111;&#110;&#61;&#39;&#104;&#116;&#116;&#112;&#58;&#47;&#47;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;&#47;&#39;&#41;">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Encoded script (hex)
            """<span style="&#x61;&#x6e;&#x79;&#x3a;&#x20;&#x65;&#x78;&#x70;&#x72;&#x65;&#x73;&#x73;&#x69;&#x6f;&#x6e;&#x28;&#x77;&#x69;&#x6e;&#x64;&#x6f;&#x77;&#x2e;&#x6c;&#x6f;&#x63;&#x61;&#x74;&#x69;&#x6f;&#x6e;&#x3d;&#x27;&#x68;&#x74;&#x74;&#x70;&#x3a;&#x2f;&#x2f;&#x65;&#x78;&#x61;&#x6d;&#x70;&#x6c;&#x65;&#x2e;&#x6f;&#x72;&#x67;&#x2f;&#x27;&#x29;">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""
        ),
        ( # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&amp;downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&amp;allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        ( # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""
        ),
        ( # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers  <p>foo</p><em>bar</em>"""
        ),
        ( # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""
        ),
        ( # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        ( # Remove comments
            "Foo <!-- bar -->",
            "Foo "
        ),
        ( # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        ( # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"
        ),
        ( # Character entities shouldn't get autolinked
            """http://www.google.com&nbsp;&nbsp;""",
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a>&nbsp;&nbsp;"""
        ),
        ( # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        ( # Test mailto: links
          # NOTE(review): "[email protected]" looks like an e-mail
          # obfuscation placeholder rather than a real address -- confirm
          # against the original fixture.
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        ( # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""
        ),
        ( # Make sure keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"
        ),
    )

    def setUp(self):
        """Create a default-constructed ``Scrubber`` as ``self.scrubber``."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        # Scrub every input in ``tests`` and compare with its expected output.
        # assertEqual replaces the deprecated failUnlessEqual alias, which
        # newer unittest versions no longer provide.
        for html, expected in self.tests:
            self.assertEqual(self.scrubber.scrub(html), expected)
Esempio n. 6
0
 def setUp(self):
     """Create a default-constructed ``Scrubber`` and store it as ``self.scrubber``."""
     self.scrubber = Scrubber()
Esempio n. 7
0
#
# Web frontend for CVE -> RHSA report generator.
#
# Requires CherryPy, Mako, Scrubber, Python <= 2.7

import rhsac
import cherrypy
import os
import re
import sqlite3
from mako.template import Template
from mako.lookup import TemplateLookup
from scrubber import Scrubber

# Mako template lookup over the relative 'templates' directory.
tlu = TemplateLookup(directories=['templates'])
# Module-wide Scrubber instance created with autolink enabled.
scrubber = Scrubber(autolink=True)
# Directory of this file joined onto the current working directory
# (os.path.join discards the cwd part when the dirname is already absolute).
curdir = os.path.join(os.getcwd(), os.path.dirname(__file__))

# Settings handed to cherrypy.config.update: static-dir root and environment.
cherrypy.config.update({
    'tools.staticdir.root': curdir,
    'server.environment': 'production'
})

# Compiled pattern matching the literal marker '!!FIX!!'.
fixemph = re.compile('!!FIX!!')


class RHSAGenWeb:
    """CherryPy handler class for the report generator's web frontend."""

    @cherrypy.expose
    def index(self, snmp=False):
        """Render ``index.html``, passing ``snmp`` through to the template."""
        page = tlu.get_template("index.html")
        return page.render(snmp=snmp)
Esempio n. 8
0
 def setUp(self):
     """Create a default-constructed ``Scrubber`` and store it as ``self.scrubber``."""
     self.scrubber = Scrubber()
Esempio n. 9
0
class ScrubberTestCase(unittest.TestCase):
    """Table-driven checks of ``Scrubber.scrub`` output.

    Each entry of ``tests`` is an ``(input_html, expected_output)`` pair;
    ``testScrubber`` feeds every input through a default-constructed
    ``Scrubber`` and compares the result with the expected string.
    """

    tests = (
        (  # Invalid HTML (expected output is empty when the BeautifulSoup
           # version string starts with '3.1')
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else
            """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        (  # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        (  # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        (  # No encoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        (  # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        (  # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        (  # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""", """<br />"""),
        (  # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""),
        (  # Encoded script (decimal)
            """<span style="&#97;&#110;&#121;&#58;&#32;&#101;&#120;&#112;&#114;&#101;&#115;&#115;&#105;&#111;&#110;&#40;&#119;&#105;&#110;&#100;&#111;&#119;&#46;&#108;&#111;&#99;&#97;&#116;&#105;&#111;&#110;&#61;&#39;&#104;&#116;&#116;&#112;&#58;&#47;&#47;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;&#47;&#39;&#41;">safe</span>""",
            """<span>safe</span>"""),
        (  # Encoded script (hex)
            """<span style="&#x61;&#x6e;&#x79;&#x3a;&#x20;&#x65;&#x78;&#x70;&#x72;&#x65;&#x73;&#x73;&#x69;&#x6f;&#x6e;&#x28;&#x77;&#x69;&#x6e;&#x64;&#x6f;&#x77;&#x2e;&#x6c;&#x6f;&#x63;&#x61;&#x74;&#x69;&#x6f;&#x6e;&#x3d;&#x27;&#x68;&#x74;&#x74;&#x70;&#x3a;&#x2f;&#x2f;&#x65;&#x78;&#x61;&#x6d;&#x70;&#x6c;&#x65;&#x2e;&#x6f;&#x72;&#x67;&#x2f;&#x27;&#x29;">safe</span>""",
            """<span>safe</span>"""),
        (  # Test unicode
            u"""Mitä kuuluu""", u"""Mitä kuuluu"""),
        (  # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&amp;downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&amp;allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        (  # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""),
        (  # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers  <p>foo</p><em>bar</em>"""),
        (  # Stripping empty attributes
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""),
        (  # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        (  # Remove comments
            "Foo <!-- bar -->", "Foo "),
        (  # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        (  # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>", "Foo"),
        (  # Character entities shouldn't get autolinked
            """http://www.google.com&nbsp;&nbsp;""",
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a>&nbsp;&nbsp;"""
        ),
        (  # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        (  # Test mailto: links
           # NOTE(review): "[email protected]" looks like an e-mail
           # obfuscation placeholder rather than a real address -- confirm
           # against the original fixture.
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        (  # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""),
        (  # Make sure keeping content for incomplete tags works
            "<blink><br><br>", "<br /><br />"),
    )

    def setUp(self):
        """Create a default-constructed ``Scrubber`` as ``self.scrubber``."""
        self.scrubber = Scrubber()

    def testScrubber(self):
        # Scrub every input in ``tests`` and compare with its expected output.
        # assertEqual replaces the deprecated failUnlessEqual alias, which
        # newer unittest versions no longer provide.
        for html, expected in self.tests:
            self.assertEqual(self.scrubber.scrub(html), expected)
Esempio n. 10
0
import gzip
import os.path
from scrubber import Scrubber 

# NOTE(review): the meaning of the positional .025 argument is defined by the
# scrubber module -- confirm there.
s = Scrubber(.025)
file = './all.tsv.gz'

# Make sure we've got the data file locally
if not os.path.exists(file):
    print('Missing GPS data: %s' % file)
else:
    # Read all lines up front and close the gzip handle explicitly instead of
    # leaving it open until garbage collection.
    gps_file = gzip.open(file, 'r')
    try:
        gps = gps_file.readlines()
    finally:
        gps_file.close()

    # Secondary j iterator used to prevent double testing / dropping of points.
    # NOTE(review): j starts at 0, so the first pass compares gps[-1] (the
    # last line) with gps[0] -- confirm this wraparound is intended.
    j = 0

    for i in xrange(1, len(gps)):
        # A drop advances j by two, so j can overshoot len(gps) by more than
        # one; ">=" (rather than "==") keeps gps[j] from raising IndexError.
        j = i if j >= len(gps) else j

        # Each line is split into tab-separated columns.
        p1 = gps[j - 1].strip().split('\t')
        p2 = gps[j].strip().split('\t')

        # Only pairs sharing the same first column are tested; columns 2 and 3
        # are passed to keep() as a pair of floats.
        if p1[0] == p2[0]:
            keep = s.keep([float(p1[2]), float(p1[3])],
                          [float(p2[2]), float(p2[3])])
            if not keep:
                # Skip past the dropped point so it is not used again.
                j += 1
                print('Dropping point: %s' % (p2,))

        j += 1
Esempio n. 11
0
import gzip
import os.path
from scrubber import Scrubber

# NOTE(review): the meaning of the positional .025 argument is defined by the
# scrubber module -- confirm there.
s = Scrubber(.025)
file = './all.tsv.gz'

# Make sure we've got the data file locally
if not os.path.exists(file):
    print('Missing GPS data: %s' % file)
else:
    # Read all lines up front and close the gzip handle explicitly instead of
    # leaving it open until garbage collection.
    gps_file = gzip.open(file, 'r')
    try:
        gps = gps_file.readlines()
    finally:
        gps_file.close()

    # Secondary j iterator used to prevent double testing / dropping of points.
    # NOTE(review): j starts at 0, so the first pass compares gps[-1] (the
    # last line) with gps[0] -- confirm this wraparound is intended.
    j = 0

    for i in xrange(1, len(gps)):
        # A drop advances j by two, so j can overshoot len(gps) by more than
        # one; ">=" (rather than "==") keeps gps[j] from raising IndexError.
        j = i if j >= len(gps) else j

        # Each line is split into tab-separated columns.
        p1 = gps[j - 1].strip().split('\t')
        p2 = gps[j].strip().split('\t')

        # Only pairs sharing the same first column are tested; columns 2 and 3
        # are passed to keep() as a pair of floats.
        if p1[0] == p2[0]:
            keep = s.keep([float(p1[2]), float(p1[3])],
                          [float(p2[2]), float(p2[3])])
            if not keep:
                # Skip past the dropped point so it is not used again.
                j += 1
                print('Dropping point: %s' % (p2,))

        j += 1
Esempio n. 12
0
from django import template
from django.utils.safestring import mark_safe
import markdown
from scrubber import Scrubber

# Template library on which this module's filters are registered.
register = template.Library()

# Shared Scrubber instance with both autolink and nofollow switched off.
scrubber = Scrubber(
    autolink=False,
    nofollow=False)  # Scrubber's autolink doesn't handle ftp://
# Shared Markdown converter with the 'nl2br' and 'autolink' extensions enabled.
md = markdown.Markdown(extensions=['nl2br', 'autolink'])


@register.filter(is_safe=True)
def safe_markdown(value, arg=''):
    """Convert ``value`` from Markdown, scrub the result and mark it safe.

    The shared ``md`` converter is reset before each conversion.  ``arg`` is
    accepted but not used.
    """
    converter = md.reset()
    rendered = converter.convert(value)
    cleaned = scrubber.scrub(rendered)
    return mark_safe(cleaned)