Python BloomFilter.BloomFilter Examples

Programming Language: Python

Namespace/Package Name: bloompy

Class/Type: BloomFilter

Method/Function: BloomFilter

Examples at hotexamples.com: 5

Python BloomFilter.BloomFilter - 5 examples found. These are the top-rated real-world Python examples of bloompy.BloomFilter.BloomFilter, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

BloomFilter(5)

add(5)

fromfile(1)

tofile(1)

Example #1

Show file

def test_speed():
    """Time how long it takes to add random strings to a BloomFilter.

    Creates a filter with ``BloomFilter(n, p)``, generates up to ``n``
    distinct random 20-character strings, adds them all, and prints the
    average time per ``add`` call and per hash function (``b.k``).
    """
    n = 10000
    p = 0.0001
    b = BloomFilter(n, p)
    print(b)

    # A set is used, so any duplicate random string is only added once.
    string_size = 20
    strings = set()
    for _ in range(n):
        strings.add(
            "".join(chr(random.randint(0, 255)) for _ in range(string_size)))

    # Only the add() calls are timed; string generation is excluded.
    starttime = time.time()
    for string in strings:
        b.add(string)
    total_time = float(time.time() - starttime)

    ns = float(len(strings))
    k = float(b.k)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Number of hash functions: %d" % b.k)
    print("Speed per hash: %f seconds" % (total_time / ns / k))
    print("Speed per add: %f seconds" % (total_time / ns))

Example #2

Show file

 def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
              domain_reg_list: List[str],
              path_dicts: List[str], header: dict = None):
     """Initialise crawl limits, URL filters and queue state.

     Args:
         depth: Crawl depth; a falsy value (``None``/``0``) falls back to 5.
         max_url_nums: URL budget; a falsy value falls back to 5000.
         cookies: Stored unchanged as ``self.cookie``.
         exclude_urls: Patterns to exclude; every ``*`` is replaced by
             ``\\S*`` (presumably so they can be used as regular
             expressions later -- confirm against the caller).
         domain_reg_list: Stored unchanged as ``self.domain_reg_list``.
         path_dicts: Stored unchanged as ``self.path_dicts``.
         header: Stored unchanged as ``self.header``.
     """
     self.domain_reg = ''
     self.domain_reg_list = domain_reg_list
     self.complement = 0
     self.depth = 5 if not depth else depth
     self.max_url_nums = 5000 if not max_url_nums else max_url_nums
     self.cookie = cookies
     self.exclude_urls = [url.replace('*', '\\S*') for url in exclude_urls]
     self.url_dict = dict()
     # Size the filter from the defaulted value: the raw argument may be
     # None (TypeError on ``None * 5``) or 0 (zero-capacity filter).
     self.url_cache = BloomFilter(element_num=self.max_url_nums * 5, error_rate=0.01)
     self.current_depth = 0
     self.current_crawl_queue = list()
     self.next_crawl_queue = list()
     self.max_queue_length = self.max_url_nums + 1000
     self.header = header
     self.path_dicts = path_dicts
     # File extensions listed for filtering (static assets, media, archives,
     # office documents, feeds).
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
     ]
     self.exclude_urls_reg_str = ''

Example #3

Show file

def test_error_rate():
    """Check membership behaviour and report the observed false-positive rate.

    Creates ``BloomFilter(n, p)``, asserts that a sample element is absent
    before ``add`` and present afterwards, then adds one set of random
    strings and counts how many strings from a second, disjoint set are
    reported as members. The resulting ratio is printed, not asserted.
    """
    n = 10000
    p = 0.001
    string_size = 20

    def random_strings():
        """Return a set of up to ``n`` random ``string_size``-char strings."""
        result = set()
        for _ in range(n):
            result.add("".join(
                chr(random.randint(0, 255)) for _ in range(string_size)))
        return result

    b = BloomFilter(n, p)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p))
    print("Optimal values are m = %d, k = %d" % (b.m, b.k))
    elt = 'apple'

    print("Testing...")
    assert elt not in b

    print("After adding '%s'..." % elt)
    b.add(elt)

    print("Testing...")
    assert elt in b

    # create random strings
    strings = random_strings()

    # other strings
    other_strings = random_strings()

    # add all to set
    for s in strings:
        b.add(s)

    # test for collisions: only strings that were never added can be
    # false positives, so drop any overlap first.
    other_strings = other_strings - strings
    collisions = sum(1 for s in other_strings if s in b)

    print("False positive rate was %d / %d = %f" % (
        collisions, len(other_strings),
        float(collisions) / float(len(other_strings))))

Example #4

Show file

 def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
              depth: int = 5):
     """Set up request settings, manager-backed queues and crawl bookkeeping.

     Args:
         cookie: Stored unchanged as ``self.cookie``.
         headers: Request headers; a falsy value falls back to
             ``DEFAULT_HEADERS``.
         max_num: Stored as ``self.max_url_num``; both queues are capped at
             twice this value and the Bloom filter is created with
             ``element_num`` of five times this value.
         domain_regs: Stored unchanged as ``self.domain_reg_list``.
         depth: Stored unchanged as ``self.depth``.
     """
     self.cookie = cookie
     self.headers = headers or DEFAULT_HEADERS

     # NOTE(review): every Manager() call below creates its own manager
     # object; consider whether a single shared manager would suffice.
     queue_capacity = max_num * 2
     self.waiting_queue = Manager().Queue(maxsize=queue_capacity)
     self.current_queue = Manager().Queue(maxsize=queue_capacity)
     self.max_url_num = max_num

     bloom_capacity = max_num * 5
     self.crawled_urls = BloomFilter(element_num=bloom_capacity, error_rate=0.01)
     self.url_dict = Manager().dict()

     self.domain_reg_list = domain_regs
     self.depth = depth
     self.current_depth = 0

     # File extensions listed for filtering.
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico',
         'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat',
         'mov', 'rar', 'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc',
         'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'csv', 'map', "ttf", 'tif',
         'woff', 'woff2', 'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss',
         'webp', 'js',
     ]

Example #5

Show file

 def setUp(self):
     """Create the BloomFilter instance stored on ``self.bf``."""
     # NOTE(review): the arguments are positional (0.001, then 10**3 == 1000);
     # presumably an error rate followed by a capacity -- confirm the order
     # against the constructor of the BloomFilter class in use.
     self.bf = BloomFilter(0.001,10**3)