Example #1
0
def test_speed():
    """Time ``BloomFilter.add`` over a set of random 20-character strings.

    Builds ``BloomFilter(n, p)`` with ``n = 10000`` and ``p = 0.0001``, adds
    every generated string, and prints the average wall-clock time per add
    and per hash function (``b.k``).
    """
    n = 10000
    p = 0.0001
    b = BloomFilter(n, p)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(b)

    # Collect the inputs in a set: duplicate random strings collapse, which
    # is why the averages below divide by len(strings) rather than by n.
    string_size = 20
    strings = set()
    for _ in range(n):
        strings.add(
            "".join(chr(random.randint(0, 255)) for _j in range(string_size))
        )

    # Only the add() calls are timed; string generation is excluded.
    start_time = time.time()
    for item in strings:
        b.add(item)
    total_time = time.time() - start_time

    # Floats keep the divisions below true divisions on Python 2 as well.
    ns = float(len(strings))
    k = float(b.k)

    print("Number of hash functions: %d" % b.k)
    print("Speed per hash: %f seconds" % (total_time / ns / k))
    print("Speed per add: %f seconds" % (total_time / ns))
Example #2
0
 def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
              domain_reg_list: List[str],
              path_dicts: List[str], header: dict = None):
     """Initialise the crawl state, limits and URL filters.

     Args:
         depth: Stored as ``self.depth``; a falsy value (``None``/``0``)
             falls back to 5.
         max_url_nums: Stored as ``self.max_url_nums``; a falsy value falls
             back to 5000. The URL cache and ``max_queue_length`` are both
             derived from the resolved value.
         cookies: Stored unchanged as ``self.cookie``.
         exclude_urls: Patterns in which every ``*`` is rewritten to the
             regex fragment ``\\S*``. ``None`` is treated as an empty list.
         domain_reg_list: Stored unchanged as ``self.domain_reg_list``.
         path_dicts: Stored unchanged as ``self.path_dicts``.
         header: Stored unchanged as ``self.header``.
     """
     self.domain_reg = ''
     self.domain_reg_list = domain_reg_list
     self.complement = 0
     self.depth = 5 if not depth else depth
     self.max_url_nums = 5000 if not max_url_nums else max_url_nums
     self.cookie = cookies
     # Tolerate a missing exclusion list the same way falsy limits are
     # tolerated above.
     self.exclude_urls = [url.replace('*', '\\S*') for url in (exclude_urls or [])]
     self.url_dict = dict()
     # Size the filter from the resolved limit: the raw argument may be
     # None (TypeError on ``None * 5``) or 0 (zero-capacity filter).
     self.url_cache = BloomFilter(element_num=self.max_url_nums * 5, error_rate=0.01)
     self.current_depth = 0
     self.current_crawl_queue = list()
     self.next_crawl_queue = list()
     self.max_queue_length = self.max_url_nums + 1000
     self.header = header
     self.path_dicts = path_dicts
     # File extensions kept in ``self.filter_exts``.
     # NOTE(review): presumably URLs with these extensions are skipped;
     # the code that reads this list is not visible here.
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
     ]
     self.exclude_urls_reg_str = ''
Example #3
0
def test_error_rate():
    """Measure the observed false-positive rate of ``BloomFilter(n, p)``.

    First checks basic membership (an element is absent before ``add`` and
    present afterwards), then adds a set of random strings, probes the
    filter with a disjoint set of random strings, and prints how many of
    those probes were wrongly reported as present.
    """
    n = 10000
    p = 0.001
    string_size = 20

    def random_strings(count):
        """Return a set built from ``count`` random ``string_size``-char strings."""
        return set(
            "".join(chr(random.randint(0, 255)) for _j in range(string_size))
            for _i in range(count)
        )

    b = BloomFilter(n, p)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p))
    print("Optimal values are m = %d, k = %d" % (b.m, b.k))
    elt = 'apple'

    print("Testing...")
    assert elt not in b

    print("After adding '%s'..." % elt)
    b.add(elt)

    print("Testing...")
    assert elt in b

    # Strings that will be inserted, then strings used only as probes.
    strings = random_strings(n)
    other_strings = random_strings(n)

    for s in strings:
        b.add(s)

    # Drop probes that were actually inserted so every hit counted below is
    # a genuine false positive.
    other_strings = other_strings - strings
    collisions = sum(1 for s in other_strings if s in b)

    print("False positive rate was %d / %d = %f" % (
        collisions, len(other_strings),
        float(collisions) / float(len(other_strings))))
Example #4
0
 def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
              depth: int = 5):
     """Initialise the crawl state, shared queues and URL filters.

     Args:
         cookie: Stored unchanged as ``self.cookie``.
         headers: Stored as ``self.headers``; a falsy value falls back to
             ``DEFAULT_HEADERS``.
         max_num: Stored as ``self.max_url_num``. Both queues are bounded
             at ``max_num * 2`` and the Bloom filter is sized for
             ``max_num * 5`` elements.
         domain_regs: Stored unchanged as ``self.domain_reg_list``.
         depth: Stored unchanged as ``self.depth``.
     """
     self.cookie = cookie
     self.headers = headers if headers else DEFAULT_HEADERS
     # Each Manager() call starts its own server process, so create one
     # manager and let it serve all three shared objects. The manager is
     # deliberately not stored on self; the proxies keep it referenced.
     manager = Manager()
     queue_capacity = max_num * 2
     self.waiting_queue = manager.Queue(maxsize=queue_capacity)
     self.current_queue = manager.Queue(maxsize=queue_capacity)
     self.max_url_num = max_num
     self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
     self.url_dict = manager.dict()
     self.domain_reg_list = domain_regs
     self.depth = depth
     self.current_depth = 0
     # File extensions kept in ``self.filter_exts``.
     # NOTE(review): presumably URLs with these extensions are skipped;
     # the code that reads this list is not visible here.
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
     ]
Example #5
0
 def setUp(self):
     """Build the BloomFilter fixture stored on ``self.bf``."""
     # NOTE(review): presumably (error rate, capacity) -- confirm against
     # the BloomFilter constructor used by this test module.
     filter_args = (0.001, 1000)
     self.bf = BloomFilter(*filter_args)