class BalanedData:
    """Collect a roughly 1:1 balanced sample of clicked vs. non-clicked user rows.

    Users known to have clicked are pre-registered in a Bloom filter. A row for a
    registered user is always kept; a row for a new user is kept (and the user
    registered) only while the non-clicked count still trails the clicked count.

    NOTE: class name ('BalanedData', sic) kept as-is for caller compatibility.
    """

    def __init__(self, filterSize, hashCount, clickedUsers):
        """Seed the sampler with the set of users who clicked.

        :param filterSize: Bloom filter bit-array size
        :param hashCount: number of hash functions for the filter
        :param clickedUsers: iterable of user ids known to have clicked
        """
        self.allData = []
        self.noClickedCounter = 0
        self.clickedCounter = len(clickedUsers)
        self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
        self.__addUsers(clickedUsers)

    def __addUsers(self, clickedUsersIds):
        # Register every clicked user id in the filter.
        for uid in clickedUsersIds:
            self.__addUser(uid)

    def __addUser(self, userId):
        self.collectedDataUsersFilter.add(userId)

    def addUserRow(self, userId, row):
        """Keep `row` if its user is registered, or if non-clicked rows still trail."""
        # Rows for already-registered users are always collected.
        if self.collectedDataUsersFilter.contains(userId):
            self.allData.append(row)
            return
        # Otherwise admit the row only while non-clicked rows are under-represented,
        # registering the user so their later rows are always kept.
        if self.noClickedCounter < self.clickedCounter:
            self.__addUser(userId)
            self.noClickedCounter += 1
            self.allData.append(row)
def __init__(self, redisName, filterName):
    """Set up crawler state: Redis work queue, de-dup filter, DB handles, commit lock.

    :param redisName: name/key of the Redis-backed start queue
    :param filterName: name/key of the Bloom filter used for de-duplication
    """
    # Browser-like User-Agent for outgoing HTTP requests.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    self.logger = logger
    self.lock_commit = Lock()
    self.start_queue = RdsQueue(redisName)
    self.filter = BloomFilter(filterName)
    self.conn = conn
    self.cursor = self.conn.cursor()
def run(seed): bf = BloomFilter(0.00001,1000000) #初始化布隆过滤器 queue = Queue.Queue(maxsize = 0) #初始化URL队列 urlCount = 0 #初始化已得到URL变量 urlList = [] #初始化下载列表 queue.put(seed) while(queue.empty() == False): currentURL = queue.get() urlList.append(currentURL) print 'currentURL',to_bytestring(currentURL) try: #timeout处理 html = urllib.urlopen(currentURL) except: continue bs_obj = BeautifulSoup(html,'html.parser') a_list = bs_obj.findAll('a') + bs_obj.findAll('img') for aa in a_list: if aa.attrs.get('href'): hrefStr = aa.attrs.get('href') else: hrefStr = aa.attrs.get('src') if hrefStr: hrefStr = is_relativeURL(hrefStr,currentURL) if hrefStr == -1: #判断相对/绝对路径 continue if is_needURL(hrefStr) == True: #判断是否需要抓取 if bf.is_element_exist(hrefStr) == False: #布隆过滤 bf.insert_element(hrefStr) print to_bytestring(hrefStr) if is_resourceFile(hrefStr) == False: #判断是否是资源文件 queue.put(hrefStr) urlList.append(hrefStr) try: downloadHtml(hrefStr) except: pass urlCount = urlCount + 1 print '所有--当前',urlCount,len(urlList)
def sampleData(file1, file2, column, column2='fc20'):
    """Count how many user ids from `file2` also appear in `file1`.

    Builds a Bloom filter over the ids found in `file1` (read from `column`),
    then classifies each id from `file2` (read from `column2`) as seen/unseen.

    :param file1: source file whose user ids seed the filter
    :param file2: file whose user ids are tested for membership
    :param column: id column name for `file1`
    :param column2: id column name for `file2` (default ``'fc20'`` preserves the
        previously hard-coded behavior)
    :return: tuple ``(same, diff)`` — counts of ids that did / did not match.
        ``same`` may overcount slightly due to Bloom-filter false positives.
    """
    # Renamed from `filter` to avoid shadowing the builtin.
    # Sizing (13419082 bits, 23 hashes) inherited from the original code —
    # presumably tuned for the expected id volume; TODO confirm.
    bloom = BloomFilter(13419082, 23)
    for user in userIds(file1, column):
        bloom.add(str(user))
    same = 0
    diff = 0
    for user in userIds(file2, column2):
        if bloom.contains(str(user)):
            same += 1
        else:
            diff += 1
    return same, diff
def __init__(self, transactions, items, numReduce):
    """Index transactions into per-item Bloom filters.

    :param transactions: list of transactions (each an iterable of items)
    :param items: the item universe (stored as-is)
    :param numReduce: number of reduce partitions; filters are sized per partition
    """
    self.__num_transaction = len(transactions)
    self.__numReduce = numReduce
    self.__items = items
    self.__transaction_index_map = {}
    # Borrow the hash-function family from a throwaway BloomFilter sized
    # for one reducer's share of the transactions.
    capacity = int(self.__num_transaction / self.__numReduce)
    self.__listHashFunc = BloomFilter(capacity, 0.05).listHashFunc
    # Pre-hash each transaction.
    # NOTE(review): the hash input is the transaction *index* (str(i)), not the
    # transaction contents — confirm this is intended.
    hashed_rows = [
        [fn(str(idx)) for fn in self.__listHashFunc]
        for idx in range(self.__num_transaction)
    ]
    for row, transaction in zip(hashed_rows, transactions):
        self.add_transaction(transaction, row)
class SpiderMan:
    """Multi-threaded detail crawler.

    Worker threads pop ids from a Redis-backed queue, de-duplicate them through
    a Bloom filter, parse detail pages, and batch-insert rows into the database.
    ``postParam`` and ``parse_detail`` are stubs, presumably meant to be
    overridden per target site — TODO confirm.
    """

    # Batch-insert statement used by the generic crawl loops.
    insert_sql = "insert into artist(artistId, artistName) values(%s,%s)"

    def __init__(self, redisName, filterName):
        # Browser-like User-Agent for outgoing requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        }
        self.start_queue = RdsQueue(redisName)  # Redis-backed work queue
        self.filter = BloomFilter(filterName)   # crawled-id de-dup filter
        self.lock_commit = Lock()               # serializes DB writes across threads
        self.logger = logger
        self.conn = conn
        self.cursor = self.conn.cursor()

    # Send a request
    def detailRequest(self, url, data, encode="utf-8"):
        """POST `data` to `url`; return the response only on HTTP 200.

        :param url: target URL
        :param data: POST payload
        :param encode: encoding applied to the response body
        :return: requests.Response on success, otherwise None (error or non-200)
        """
        try:
            resp = requests.post(url, data=data)
            resp.encoding = encode
            if resp.status_code == 200:
                return resp
        except Exception as e:
            print(f"爬取失败!{e}")

    # Generate 16 random characters
    def generate_random_strs(self, length):
        """Return `length` random alphanumeric characters as UTF-8 bytes."""
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # loop-control counter i
        i = 0
        # initialize the random string
        random_strs = ""
        while i < length:
            e = random.random() * len(string)
            # round down to a valid index
            e = math.floor(e)
            random_strs = random_strs + list(string)[e]
            i = i + 1
        return bytes(random_strs, encoding="utf8")

    # AES encryption
    def AESencrypt(self, msg, key):
        """CBC-mode AES-encrypt `msg` with `key`; return Base64 text."""
        # pad to a multiple of 16 if needed (padding)
        padding = 16 - len(msg) % 16
        # fill with the single character whose ordinal equals the padding length
        msg = msg + padding * chr(padding)
        # initialization vector used for encryption/decryption (must be 16 bytes)
        iv = '0102030405060708'
        aes = AES.new(key.encode("utf-8"), IV=iv.encode("utf-8"), mode=AES.MODE_CBC)
        # encryption yields bytes-type data
        encryptedbytes = aes.encrypt(msg.encode("utf-8"))
        # Base64-encode, returning a byte string
        encodestrs = base64.b64encode(encryptedbytes)
        # decode the byte string as utf-8
        enctext = encodestrs.decode('utf-8')
        return enctext

    # RSA encryption
    def RSAencrypt(self, randomstrs, key, f):
        """Textbook RSA: (reversed `randomstrs` as hex int) ** key mod f, as
        zero-padded 256-char hex."""
        # reverse the random string
        string = randomstrs[::-1]
        # convert the random string to bytes-type data
        # text = bytes(string, encoding='utf8')
        seckey = int(codecs.encode(string, encoding='hex'), 16)**int(
            key, 16) % int(f, 16)
        return format(seckey, 'x').zfill(256)

    # Define the POST parameters (stub)
    def postParam(self, detailId):
        """
        :param detailId: id of the detail page
        :return: dict of POST parameters (in real implementations)

        NOTE(review): currently returns the `dict` *type*, not an instance —
        clearly a placeholder to be overridden.
        """
        return dict

    # Build the encrypted request parameters
    def get_params(self, detailId):
        """Return ``(encText, encSecKey)`` — the AES-doubly-encrypted params and
        the (pre-computed) RSA-encrypted secret key.

        NOTE(review): the constants match a NetEase-style weapi scheme — confirm.
        """
        d = self.postParam(detailId)
        # fixed values
        g = '0CoJUm6Qyw8W8jud'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        # randomly generated 16-char string; fixed here so encSecKey can be fixed too
        # i = self.generate_random_strs(16)
        i = 'kEhRsbVFNaFQEOaG'
        # encSecKey is the RSA encryption of the random value; fixed because i is fixed
        # encSecKey = RSAencrypt(i, e, f)
        encSecKey = 'a200e63459c28899f38dd1866058664c3fc10567b9d72ef91378dedf3971075e45732976768705520ee58a55d0e2a3b72ff8fe351e16651af42d001e77cf1c823006a8974cb88c986d1525cfe71935db2ec7a1b3677dfc670dbcdc4e58820fc31ade511a79a8e910a28d542fd44b7f67958468bd41d73d2ade5268565ac9f5de'
        # two rounds of AES encryption produce the params value
        enctext = self.AESencrypt(d, g)
        encText = self.AESencrypt(enctext, i)
        return encText, encSecKey

    # Insert data into the database
    def insert_data(self, sql, detailList):
        """Batch-insert `detailList` rows using `sql` under the commit lock."""
        try:
            # database-operation lock
            # NOTE(review): if executemany/commit raises, release() is never
            # reached and the lock stays held — consider try/finally or `with`.
            self.lock_commit.acquire()
            self.cursor.executemany(sql, detailList)
            self.conn.commit()
            self.lock_commit.release()
            self.logger.info(f'insert {detailList[-1]} success')
        except Exception as e:
            self.logger.debug(e)

    # Parse a detail page (stub)
    def parse_detail(self, detailId):
        """
        :param detailId: id of the detail page
        :return: list of parsed rows (in real implementations; stub returns None)
        """
        return

    # Generic crawl loop
    def getdetailInfo(self):
        """Pop ids, skip already-crawled ones, parse, and flush every 10 rows.

        NOTE(review): when the queue is empty AND detailList is empty, neither
        branch runs, so the loop busy-spins instead of exiting — confirm.
        """
        p = 0            # rows accumulated since the last flush
        detailList = []  # pending batch of parsed rows
        while True:
            if self.start_queue.queueLen():
                detailId = self.start_queue.pop()
                if self.filter.isContains(detailId):
                    self.logger.debug(f"{detailId} has been crawled")
                    continue
                self.filter.insert(detailId)
                try:
                    detailList.append(self.parse_detail(detailId))
                    p += 1
                    # Flush every 10 rows, or when the queue drains.
                    if p == 10 or self.start_queue.queueLen() == 0:
                        if detailList != []:
                            self.insert_data(self.insert_sql, detailList)
                            detailList = []
                        else:
                            self.logger.debug('no data to sql')
                        p = 0
                except Exception as e:
                    self.logger.debug(f"{detailId} {e}")
            elif detailList != []:
                # Queue drained with rows still pending: final flush, then stop.
                self.insert_data(self.insert_sql, detailList)
                detailList = []
                break

    # Crawl loop variant 1
    def getdetailsInfo(self):
        """Like getdetailInfo, but parse_detail returns a whole batch.

        NOTE(review): `detailList = self.parse_detail(...)` *replaces* the batch
        each iteration (vs. append in getdetailInfo) — confirm intended.
        """
        p = 0
        detailList = []
        while True:
            try:
                if self.start_queue.queueLen():
                    detailId = self.start_queue.pop()
                    if self.filter.isContains(detailId):
                        self.logger.debug(f"{detailId} has been crawled")
                        continue
                    self.filter.insert(detailId)
                    detailList = self.parse_detail(detailId)
                    p += 1
                    if p == 10 or self.start_queue.queueLen() == 0:
                        if detailList != []:
                            self.insert_data(self.insert_sql, detailList)
                            detailList = []
                        else:
                            self.logger.debug('no data to sql')
                        p = 0
                elif detailList != []:
                    self.insert_data(self.insert_sql, detailList)
                    detailList = []
                    break
            except Exception as e:
                self.logger.debug(e)

    # Multi-threading helper
    def multi_task(self, function, num):
        """Run `function` on `num` threads and wait for all of them."""
        tasks = []
        for _ in range(num):
            thread = Thread(target=function)
            thread.start()
            tasks.append(thread)
        for th in tasks:
            th.join()

    def main(self, num):
        """Entry point: crawl with `num` worker threads."""
        self.multi_task(self.getdetailInfo, num)
def add_transaction(self, transaction, hashed):
    """Record `hashed` in the per-item Bloom filter of every item in `transaction`.

    A filter is created lazily the first time an item is seen, sized for one
    reducer's share of the transactions.
    """
    index_map = self.__transaction_index_map
    for item in transaction:
        bloom = index_map.get(item)
        if bloom is None:
            bloom = BloomFilter(
                int(self.__num_transaction / self.__numReduce), 0.05)
            index_map[item] = bloom
        bloom.addHashed(hashed)
def main():
    """Benchmark several Bloom-filter variants against dict and list baselines.

    For each structure: measure average add time over `input_size` inserts,
    average lookup time over `input_size` misses, the observed false-positive
    rate, and memory usage.
    """
    input_size = 10000
    fp_rate = 0.01
    count_size = 4

    # --- Standard Bloom filter ---
    bloom_filter = BloomFilter(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    # Keys input_size..2*input_size were never added, so every hit is a false positive.
    for i in range(input_size, input_size * 2):
        if str(i) in bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print("Expected false positive rate for all calculations is :" + str(fp_rate))
    print()
    print("For Standard Bloom Filter : \nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(bloom_filter) + bloom_filter.get_bitarray_size()))

    # --- Shifting Bloom filter ---
    shifting_bloom_filter = ShiftingBloomFilterM(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        shifting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in shifting_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For Shifting Bloom Filter : \nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(shifting_bloom_filter) +
        shifting_bloom_filter.get_bitarray_size()))

    # --- Counting Bloom filter ---
    counting_bloom_filter = CountingBloomFilter(input_size, fp_rate,
                                                count_size=count_size)
    start_time = time.time()
    for i in range(0, input_size):
        counting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in counting_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    # Sanity check: every inserted key must still be found (prints false negatives).
    for i in range(0, input_size):
        if not str(i) in counting_bloom_filter:
            print(str(i))
    print()
    print("For counting filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(counting_bloom_filter) +
        counting_bloom_filter.get_bitarray_size()))

    # --- Scalable Bloom filter ---
    scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH)
    start_time = time.time()
    for i in range(0, input_size):
        scalable_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in scalable_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For scalable filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(scalable_bloom_filter) +
        scalable_bloom_filter.get_bitarray_size()))

    # --- Counting scalable Bloom filter ---
    c_scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH,
        countable=True, count_size=count_size)
    start_time = time.time()
    for i in range(0, input_size):
        c_scalable_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in c_scalable_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For counting scalable filter :\nFalse positive count:" + str(fp_count) +
          " in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(c_scalable_bloom_filter) +
        c_scalable_bloom_filter.get_bitarray_size()))

    # Per-slot count statistics.
    # NOTE(review): this loop iterates the *scalable* filter's sub-filters but
    # reads bit values from counting_bloom_filter — the index spaces likely
    # differ, so the same counting-filter slots may be scanned repeatedly.
    # Confirm whether counting_bloom_filter was meant here. Also divides by
    # filled_bit_count, which raises ZeroDivisionError if no slot is set.
    size_sum = 0
    filled_bit_count = 0
    max_count = 0
    for a in c_scalable_bloom_filter.bloom_filters:
        for i in range(0, len(a)):
            if counting_bloom_filter.get_bit_value(i) > 0:
                size_sum += counting_bloom_filter.get_bit_value(i)
                filled_bit_count += 1
                if max_count < counting_bloom_filter.get_bit_value(i):
                    max_count = counting_bloom_filter.get_bit_value(i)
    avg_size = size_sum / filled_bit_count
    print("For counting filter -------- avg count:" + str(avg_size))
    print("For counting filter-------- max count:" + str(max_count))

    # --- dict baseline ---
    hasmap = {}
    start_time = time.time()
    for i in range(0, input_size):
        hasmap[str(i)] = i
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in hasmap:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For Hashmap ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(hasmap)))

    # --- list baseline (lookups are O(n)) ---
    py_list = []
    start_time = time.time()
    for i in range(0, input_size):
        py_list.append(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size
    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in py_list:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For List ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          " Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(py_list)))

    # Compare the container's shallow size against the summed sub-filter sizes.
    temp = 0
    for i in c_scalable_bloom_filter.bloom_filters:
        temp += memory_usage.get_obj_size(i)
    print(
        "aaa" +
        str(memory_usage.get_obj_size(c_scalable_bloom_filter.bloom_filters)))
    print("xxx" + str(temp))
def __init__(self, filterSize, hashCount, clickedUsers):
    """Seed the balanced sampler: register every clicked user in a Bloom filter.

    :param filterSize: Bloom filter bit-array size
    :param hashCount: number of hash functions for the filter
    :param clickedUsers: iterable of user ids known to have clicked
    """
    self.allData = []
    self.noClickedCounter = 0
    self.clickedCounter = len(clickedUsers)
    self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
    self.__addUsers(clickedUsers)
import socket import random import os from sample_data import USERS from server_config import NODES from pickle_hash import serialize_GET, serialize_PUT, serialize_DELETE from node_ring import NodeRing from lru_cache import Lru_Node, Lru_Cache from bloomFilter import BloomFilter BUFFER_SIZE = 1024 hash_codes = set() has_cache = False lru_cache_obj = Lru_Cache(0) lru_cache_initialized = False bf = BloomFilter(10, 0.05) class UDPClient(): def __init__(self, host, port): self.host = host self.port = int(port) def send(self, request): print('Connecting to server at {}:{}:{}'.format( self.host, self.port, os.getpid())) try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.sendto(request, (self.host, self.port)) response, ip = s.recvfrom(BUFFER_SIZE) return response