class BloomFilter:
    """Bloom filter that derives its k hashes from seeded polynomial hashing.

    Each of the ``k`` hash functions is the same multiply-and-add string hash
    run with a different seed (1313, 131313, 13131313, ...), reduced modulo
    the bit-array length ``m``.
    """

    def __init__(self, n):
        """Set up the filter; the bit array holds ``n * 20 + 1`` bits.

        Args:
            n: sizing parameter; presumably the expected number of keywords
                (TODO confirm against callers).
        """
        # Basic parameters.
        self.k = 10
        self.m = n * 20 + 1
        # Original author's note: taking the hash modulo a prime is the key
        # step and is claimed to cut collisions dramatically.
        # NOTE(review): n * 20 + 1 is not guaranteed to be prime.
        self.bitarray = Bitarray(self.m)

    def hash_str(self, keyword, i):
        """Return the bit index of ``keyword`` under the ``i``-th hash.

        Args:
            keyword: string to hash.
            i: index of the hash function; selects the seed.

        Returns:
            An integer in ``range(self.m)``.
        """
        # Seeds are 1313, 131313, 13131313, ... Parsing the digits with
        # int() gives the same value the former eval() did without
        # executing a string as code.
        seed = int("1313" + i * "13")
        hash_value = 0
        for char in keyword:
            hash_value = hash_value * seed + ord(char)
        return hash_value % self.m

    def add_keyword(self, keyword):
        """Set the ``k`` bits that correspond to ``keyword``."""
        for i in range(self.k):
            index = self.hash_str(keyword=keyword, i=i)
            self.bitarray.set(index)

    def check_keyword(self, keyword):
        """Return False if any of the ``k`` bits for ``keyword`` is clear.

        A True result means every probed bit is set, which for a Bloom
        filter means "possibly present".
        """
        for i in range(self.k):
            index = self.hash_str(keyword=keyword, i=i)
            if not self.bitarray.get(index):
                return False
        return True
class BloomFilter(set):
    """Bloom filter exposing a set-like interface on top of a ``Bitarray``.

    Items are hashed ``hash_num`` times with ``mmh3.hash`` (the loop index is
    passed as the seed) and each result is reduced modulo ``size``.
    """

    def __init__(self, size, hash_num):
        """Create a filter with ``size`` bits and ``hash_num`` hash rounds."""
        super(BloomFilter, self).__init__()
        self.bitarray = Bitarray(size)
        self.size = size
        self.hash_num = hash_num

    def __len__(self):
        """Return the number of bits in the filter (not the item count)."""
        return self.size

    def __iter__(self):
        """Iterate over the underlying bit array."""
        return iter(self.bitarray)

    def add(self, item):
        """Set every bit selected by ``item`` and return ``self``."""
        for seed in range(self.hash_num):
            self.bitarray.set(mmh3.hash(item, seed) % self.size)
        return self

    def __contains__(self, item):
        """Return True when every bit selected by ``item`` is set."""
        for seed in range(self.hash_num):
            # One clear bit settles the answer, so stop at the first miss
            # instead of probing the remaining hashes.
            if self.bitarray.get(mmh3.hash(item, seed) % self.size) == 0:
                return False
        return True
def main():
    """Print the measured false-positive rate of the bit-array filter.

    Adds 110000 distinct random strings to an 800000-bit array, then probes
    100000 random strings that were never added and prints the fraction
    that ``check`` nevertheless reports as present.
    """
    funcs = ['BKDRHash', 'RSHash', 'JSHash', 'SDBMHash', 'DEKHash']
    bit_obj = Bitarray(800000)

    train_num = 110000
    test_num = 100000

    # A set keeps the "already generated?" test O(1); the former list made
    # every membership check linear in the number of words.
    words = set()
    while len(words) < train_num:
        words.add(create_random_string(1, 10))

    for word in words:
        add_keyword(word, bit_obj, funcs)

    false_positives = 0
    remaining = test_num
    while remaining > 0:
        candidate = create_random_string(1, 10)
        if candidate in words:
            # Only strings that were never added count as probes.
            continue
        remaining -= 1
        if check(candidate, bit_obj, funcs):
            false_positives += 1

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(float(false_positives) / test_num)
def initialize_filter(size=mod, num_hash_func=k, word_cnt=0,
                      ideal=IDEAL_CHOICE_OF_HASH_FUNCS):
    """Reset the module-level filter state and allocate a fresh bit array.

    Args:
        size: number of bits in the new bit array; stored in global ``mod``.
        num_hash_func: number of hash functions; stored in global ``k``
            unless the ideal count below applies.
        word_cnt: number of words that will be inserted. Only used when
            ``ideal`` is truthy.
        ideal: when truthy and ``word_cnt`` is non-zero, ``k`` is set to
            ``int(ln(2) * size / word_cnt)`` instead of ``num_hash_func``.
    """
    global k, mod, bit_array_obj
    mod = size
    k = num_hash_func
    # The ideal count is applied last so it is no longer overwritten by
    # num_hash_func, it uses the requested size rather than the previous
    # global, and word_cnt == 0 no longer divides by zero.
    if ideal and word_cnt:
        k = int(math.log(2) * size / float(word_cnt))
    bit_array_obj = Bitarray(mod)
def __init__(self, size=1, num_hash_func=13, word_cnt=0, ideal=False, debug_mode=False):
    """Store the filter configuration and allocate the bit array.

    Args:
        size: number of bits handed to ``Bitarray``.
        num_hash_func: number of hash functions to use.
        word_cnt: expected number of inserted words.
        ideal: when truthy and ``word_cnt`` is non-zero, the hash-function
            count becomes ``int(ln(2) * size / word_cnt)``.
        debug_mode: stored as ``DEBUG_MODE``.
    """
    self.size = size
    self.word_cnt = word_cnt
    self.IDEAL = ideal
    self.DEBUG_MODE = debug_mode

    if ideal and word_cnt:
        ln2 = math.log(2, math.e)
        self.num_hash_func = int(ln2 * size / float(word_cnt))
    else:
        self.num_hash_func = num_hash_func

    self.bit_array_obj = Bitarray(size)

    # BKDR hash seeds for the non-ideal configuration (26 entries).
    self.BKRD_hash_seeds = [
        13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174,
        213, 654, 65, 367, 1252, 12356, 75634, 345, 6544,
        3456, 1827, 8125, 26154, 2345, 213, 755, 12467,
    ]
def __init__(self, n):
    """Configure the filter: 10 hash functions over ``n * 20 + 1`` bits."""
    # Basic parameters.
    hash_count = 10
    bit_count = n * 20 + 1
    self.k = hash_count
    # Original author's note: taking the hash modulo a prime is the key
    # step and is claimed to cut collisions dramatically.
    # NOTE(review): n * 20 + 1 is not guaranteed to be prime.
    self.m = bit_count
    self.bitarray = Bitarray(bit_count)
from GeneralHashFunctions import *
import math
import Ex3_1_Bloom_Filter_ast_BKDRHash_seeds  # original note: 99990 numbers from 0 to 10000000

# Truthy: use the larger bit array below.
IDEAL_CHOICE_OF_HASH_FUNCS = 1
# Number of hash functions (original note: k >= 10).
k = 13
# Bit array size; the placeholder is overwritten by the branch below.
mod = -1
if IDEAL_CHOICE_OF_HASH_FUNCS:
    mod = 5555555
else:
    mod = 55555
# Module-level bit array, allocated at import time.
bit_array_obj = Bitarray(mod)
# BKDR hash seeds for the non-ideal configuration (26 entries).
BKRD_hash_seeds = [
    13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174, 213, 654, 65, 367,
    1252, 12356, 75634, 345, 6544, 3456, 1827, 8125, 26154, 2345, 213, 755,
    12467
]


def initialize_filter(size=mod, num_hash_func=k, word_cnt=0, ideal=IDEAL_CHOICE_OF_HASH_FUNCS):
    """Recompute the global hash-function count ``k`` when ``ideal`` is truthy.

    The defaults are bound once, from the module globals, when this ``def``
    statement executes.

    NOTE(review): only this part of the body is visible here; ``size`` and
    ``num_hash_func`` are not used in the visible code - confirm against the
    full definition.
    """
    global k, mod, bit_array_obj, IDEAL_CHOICE_OF_HASH_FUNCS
    if ideal:
        # k = int(ln(2) * mod / word_cnt), computed from the current global
        # ``mod`` rather than the ``size`` argument.
        # NOTE(review): with the default word_cnt=0 this divides by zero.
        k = int(math.log(2, math.e) * mod / float(word_cnt))
def __init__(self, size, hash_num):
    """Create a filter with ``size`` bits and ``hash_num`` hash rounds."""
    super(BloomFilter, self).__init__()
    self.size, self.hash_num = size, hash_num
    # The bit array is sized from the same ``size`` stored above.
    self.bitarray = Bitarray(size)
class BloomFilter:
    """Bloom filter over a ``Bitarray`` using a family of string hashes.

    Ten fixed general-purpose hashes are always applied; any hash functions
    beyond ten are BKDR hashes with distinct seeds.
    """

    def __init__(self, size=1, num_hash_func=13, word_cnt=0, ideal=False,
                 debug_mode=False):
        """Store the configuration and allocate the bit array.

        Args:
            size: number of bits handed to ``Bitarray``.
            num_hash_func: number of hash functions to use.
            word_cnt: expected number of inserted words.
            ideal: when truthy and ``word_cnt`` is non-zero, the hash count
                becomes ``int(ln(2) * size / word_cnt)``; it also selects
                the external seed table in ``hit``.
            debug_mode: when truthy, ``test`` prints progress messages.
        """
        self.size = size
        self.num_hash_func = num_hash_func
        self.word_cnt = word_cnt
        self.IDEAL = ideal
        self.DEBUG_MODE = debug_mode
        if self.IDEAL and word_cnt:
            self.num_hash_func = int(
                math.log(2, math.e) * size / float(word_cnt))
        self.bit_array_obj = Bitarray(size)
        # BKDR hash seeds for the non-ideal configuration (26 entries).
        self.BKRD_hash_seeds = [
            13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174, 213, 654,
            65, 367, 1252, 12356, 75634, 345, 6544, 3456, 1827, 8125, 26154,
            2345, 213, 755, 12467
        ]

    def hit(self, key):
        """Return the list of bit indices that ``key`` maps to.

        NOTE(review): in the non-ideal case the seed list has 26 entries,
        so ``num_hash_func`` above 36 raises IndexError.
        """
        fixed_hashes = (RSHash, JSHash, PJWHash, ELFHash, SDBMHash, DJBHash,
                        DEKHash, BPHash, FNVHash, APHash)
        hit = [hash_func(key) % self.size for hash_func in fixed_hashes]
        # Hash functions beyond the ten fixed ones are seeded BKDR hashes.
        for i in range(self.num_hash_func - 10):
            if self.IDEAL:
                seed = Ex3_1_Bloom_Filter_ast_BKDRHash_seeds.get_BKRD_hash_seed(
                    i)
            else:
                seed = self.BKRD_hash_seeds[i]
            hit.append(BKDRHash(key, seed) % self.size)
        return hit

    def add(self, key):
        """Add ``key`` by setting every bit index it maps to."""
        for idx in self.hit(key):
            self.bit_array_obj.set(idx)

    def search(self, key):
        """Return True if every bit for ``key`` is set, otherwise False."""
        for idx in self.hit(key):
            # A clear bit means the key was never added.
            if not self.bit_array_obj.get(idx):
                return False
        return True

    @staticmethod
    def _read_words(path):
        """Return every single-space-separated token of the file at ``path``."""
        words = []
        # The with-block closes the file even if reading raises.
        with open(path, 'r') as f:
            for line in f:
                words.extend(line.strip().split(' '))
        return words

    def test(self, file_train, file_test):
        """Train on one text file, probe with another, and report the errors.

        Args:
            file_train: path of the text file whose words are added.
            file_test: path of the text file whose words are looked up.

        Returns:
            The fraction of test words for which ``search`` disagrees with
            true membership in the training words; 0.0 if the test file
            yields no words.
        """
        words = self._read_words(file_train)
        len_words = len(words)
        if self.DEBUG_MODE:
            print("# All keywords added")

        test_words = self._read_words(file_test)
        if self.DEBUG_MODE:
            print("# All test words read")

        # Add the keywords to the bit map.
        for word in words:
            self.add(word)

        # A set makes the ground-truth membership check O(1) per test word.
        known = set(words)
        total = len(test_words)
        wrong_cnt = 0
        for word in test_words:
            if (word in known) != self.search(word):
                wrong_cnt += 1
        # Guard against an empty test file instead of dividing by zero.
        false_positive_rate = float(wrong_cnt) / total if total else 0.0

        # "%s" formatting prints the same text on Python 2 and Python 3.
        print("Words added: %s" % len_words)
        print("Bit Array Size: %s" % self.size)
        print("Number of hash functions: %s" % self.num_hash_func)
        print("Wrong mapping count: %s" % wrong_cnt)
        print("Total word count: %s" % total)
        print("False positive rate: %s" % false_positive_rate)
        return false_positive_rate
crawled.add_str(page) add_page_to_folder(page, content) outlinks = get_all_links(content, page) if varLock.acquire(): union(tocrawl, outlinks) graph[page] = outlinks COUNT += 1 varLock.release() tocrawl.task_done() tocrawl.clear() if __name__ == '__main__': seed = 'http://www.baidu.com' max_page = 10 tocrawl = Queue.Queue() # tocrawl is a global working queue tocrawl.put(seed) crawled = Bitarray(20 * max_page) graph = {} COUNT = 0 varLock = threading.Lock() NUM = 5 for i in range(NUM): t = threading.Thread(target=page_working) t.setDaemon(True) t.start() tocrawl.join()
if f == 0 and add_page_to_folder(page, content): max_page -= 1 varLock.release() else: varLock.release() #这里释放查找不匹配情况下,查找上的锁 q.task_done() start = time.time() NUM = 128 #线程数 # seed = sys.argv[1] # max_page = int(sys.argv[2]) seed = 'https://www.guancha.cn/' max_page = 7000 crawled = Bitarray(max_page * 20) constant = max_page * 20 #Hash值需要%的常数 varLock = threading.Lock() q = queue.Queue() q.put(seed) for i in range(NUM): t = threading.Thread(target=working) t.setDaemon(True) t.start() q.join() end = time.time() print(end - start) print(max_page)