def __init__(self, redisName, filterName):
    """Wire up the crawler's shared resources.

    Args:
        redisName: key/name of the Redis-backed queue holding start URLs.
        filterName: name of the persistent Bloom filter used for dedup.
    """
    # Desktop Chrome user agent so targets serve ordinary HTML.
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    self.headers = {'User-Agent': ua}

    # Work sources: Redis queue for seeds, Bloom filter for seen-URL dedup.
    self.start_queue = RdsQueue(redisName)
    self.filter = BloomFilter(filterName)

    # Shared module-level logger / DB connection; lock serialises commits.
    self.lock_commit = Lock()
    self.logger = logger
    self.conn = conn
    self.cursor = self.conn.cursor()
def __init__(self, transactions, items, numReduce):
    """Index *transactions* into per-item Bloom filters.

    Args:
        transactions: iterable of transactions (each an iterable of items).
        items: the item universe, kept for later use.
        numReduce: number of reducers; each filter is sized for
            len(transactions) / numReduce entries.
    """
    self.__numReduce = numReduce
    self.__num_transaction = len(transactions)
    self.__items = items
    self.__transaction_index_map = {}

    # Borrow the hash-function family from a filter sized for this
    # reducer's share of the transactions.
    self.__listHashFunc = BloomFilter(
        int(self.__num_transaction / self.__numReduce), 0.05).listHashFunc

    # NOTE: each transaction is hashed by its *index* (str(i)), not by
    # its contents — the hashes identify the transaction's position.
    hashed_rows = [
        [hash_fn(str(idx)) for hash_fn in self.__listHashFunc]
        for idx in range(self.__num_transaction)
    ]
    for row, txn in zip(hashed_rows, transactions):
        self.add_transaction(txn, row)
def run(seed): bf = BloomFilter(0.00001,1000000) #初始化布隆过滤器 queue = Queue.Queue(maxsize = 0) #初始化URL队列 urlCount = 0 #初始化已得到URL变量 urlList = [] #初始化下载列表 queue.put(seed) while(queue.empty() == False): currentURL = queue.get() urlList.append(currentURL) print 'currentURL',to_bytestring(currentURL) try: #timeout处理 html = urllib.urlopen(currentURL) except: continue bs_obj = BeautifulSoup(html,'html.parser') a_list = bs_obj.findAll('a') + bs_obj.findAll('img') for aa in a_list: if aa.attrs.get('href'): hrefStr = aa.attrs.get('href') else: hrefStr = aa.attrs.get('src') if hrefStr: hrefStr = is_relativeURL(hrefStr,currentURL) if hrefStr == -1: #判断相对/绝对路径 continue if is_needURL(hrefStr) == True: #判断是否需要抓取 if bf.is_element_exist(hrefStr) == False: #布隆过滤 bf.insert_element(hrefStr) print to_bytestring(hrefStr) if is_resourceFile(hrefStr) == False: #判断是否是资源文件 queue.put(hrefStr) urlList.append(hrefStr) try: downloadHtml(hrefStr) except: pass urlCount = urlCount + 1 print '所有--当前',urlCount,len(urlList)
def sampleData(file1, file2, column):
    """Estimate the user-id overlap between two files with a Bloom filter.

    Loads the user ids read from *file1* (column *column*) into a Bloom
    filter, then probes it with the ids read from *file2* (column 'fc20').

    Args:
        file1: path/handle of the first data file.
        file2: path/handle of the second data file.
        column: column name to read user ids from in *file1*.

    Returns:
        (same, diff): counts of file2 ids that hit / miss the filter.
        Note that 'same' may overcount because of Bloom-filter false
        positives; 'diff' is exact (no false negatives).
    """
    # Renamed from `filter` so the builtin is not shadowed.
    bloom = BloomFilter(13419082, 23)  # ~13.4M bits, 23 hash functions
    for user in userIds(file1, column):
        bloom.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, 'fc20'):
        if bloom.contains(str(user)):
            same += 1
        else:
            diff += 1
    return same, diff
def add_transaction(self, transaction, hashed):
    """Record *hashed* in the Bloom filter of every item in *transaction*.

    A filter sized for this reducer's share of the transactions is created
    lazily the first time an item is seen.
    """
    index_map = self.__transaction_index_map
    capacity = int(self.__num_transaction / self.__numReduce)
    for item in transaction:
        bucket = index_map.get(item)
        if bucket is None:
            # First sighting of this item: allocate its filter.
            bucket = BloomFilter(capacity, 0.05)
            index_map[item] = bucket
        bucket.addHashed(hashed)
def main():
    """Benchmark several Bloom-filter variants against dict/list baselines.

    For each structure: add `input_size` stringified integers, then probe
    with `input_size` integers that were never added (so every membership
    hit is a false positive), and report average add/lookup time, the
    observed false-positive rate and the memory footprint.
    """
    input_size = 10000
    fp_rate = 0.01       # target false-positive probability for every filter
    count_size = 4       # bits per counter for the counting variants

    # --- standard Bloom filter -------------------------------------------
    bloom_filter = BloomFilter(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    # Keys input_size..2*input_size-1 were never added: any hit is a FP.
    for i in range(input_size, input_size * 2):
        if str(i) in bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print("Expected false positive rate for all calculations is :" + str(fp_rate))
    print()
    print("For Standard Bloom Filter : \nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(bloom_filter) + bloom_filter.get_bitarray_size()))

    # --- shifting Bloom filter -------------------------------------------
    shifting_bloom_filter = ShiftingBloomFilterM(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        shifting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in shifting_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For Shifting Bloom Filter : \nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(shifting_bloom_filter) +
        shifting_bloom_filter.get_bitarray_size()))

    # --- counting Bloom filter -------------------------------------------
    counting_bloom_filter = CountingBloomFilter(input_size, fp_rate,
                                                count_size=count_size)
    start_time = time.time()
    for i in range(0, input_size):
        counting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in counting_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    # Sanity check: every added key must still be reported present
    # (a Bloom filter has no false negatives) — this should print nothing.
    for i in range(0, input_size):
        if not str(i) in counting_bloom_filter:
            print(str(i))
    print()
    print("For counting filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(counting_bloom_filter) +
        counting_bloom_filter.get_bitarray_size()))

    # --- scalable Bloom filter -------------------------------------------
    scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH)
    start_time = time.time()
    for i in range(0, input_size):
        scalable_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in scalable_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For scalable filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(scalable_bloom_filter) +
        scalable_bloom_filter.get_bitarray_size()))

    # --- counting scalable Bloom filter ----------------------------------
    c_scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH,
        countable=True, count_size=count_size)
    start_time = time.time()
    for i in range(0, input_size):
        c_scalable_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in c_scalable_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For counting scalable filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(c_scalable_bloom_filter) +
        c_scalable_bloom_filter.get_bitarray_size()))

    # --- counter statistics ----------------------------------------------
    # NOTE(review): this iterates c_scalable_bloom_filter's sub-filters but
    # queries counting_bloom_filter's counters — looks like a copy/paste
    # slip (the printed labels say "counting filter"); verify intent.
    size_sum = 0
    filled_bit_count = 0
    max_count = 0
    for a in c_scalable_bloom_filter.bloom_filters:
        for i in range(0, len(a)):
            if counting_bloom_filter.get_bit_value(i) > 0:
                size_sum += counting_bloom_filter.get_bit_value(i)
                filled_bit_count += 1
                if max_count < counting_bloom_filter.get_bit_value(i):
                    max_count = counting_bloom_filter.get_bit_value(i)
    avg_size = size_sum / filled_bit_count
    print("For counting filter -------- avg count:" + str(avg_size))
    print("For counting filter-------- max count:" + str(max_count))

    # --- dict baseline ----------------------------------------------------
    hasmap = {}
    start_time = time.time()
    for i in range(0, input_size):
        hasmap[str(i)] = i
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in hasmap:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For Hashmap ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(hasmap)))

    # --- list baseline ----------------------------------------------------
    py_list = []
    start_time = time.time()
    for i in range(0, input_size):
        py_list.append(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in py_list:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For List ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(py_list)))

    # Compare whole-container size against the sum of its sub-filters.
    temp = 0
    for i in c_scalable_bloom_filter.bloom_filters:
        temp += memory_usage.get_obj_size(i)
    print(
        "aaa" + str(memory_usage.get_obj_size(c_scalable_bloom_filter.bloom_filters)))
    print("xxx" + str(temp))
def __init__(self, filterSize, hashCount, clickedUsers):
    """Seed the collector with the users already known to have clicked.

    Args:
        filterSize: bit size of the Bloom filter.
        hashCount: number of hash functions the filter uses.
        clickedUsers: iterable of users that clicked; loaded into the filter.
    """
    # The filter must exist before __addUsers can populate it.
    self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
    self.allData = []                        # rows collected so far
    self.clickedCounter = len(clickedUsers)  # users known to have clicked
    self.noClickedCounter = 0                # incremented as non-clickers arrive
    self.__addUsers(clickedUsers)
import socket import random import os from sample_data import USERS from server_config import NODES from pickle_hash import serialize_GET, serialize_PUT, serialize_DELETE from node_ring import NodeRing from lru_cache import Lru_Node, Lru_Cache from bloomFilter import BloomFilter BUFFER_SIZE = 1024 hash_codes = set() has_cache = False lru_cache_obj = Lru_Cache(0) lru_cache_initialized = False bf = BloomFilter(10, 0.05) class UDPClient(): def __init__(self, host, port): self.host = host self.port = int(port) def send(self, request): print('Connecting to server at {}:{}:{}'.format( self.host, self.port, os.getpid())) try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.sendto(request, (self.host, self.port)) response, ip = s.recvfrom(BUFFER_SIZE) return response