class BloomFilterSearch():
    """Measures Bloom-filter false positives against a PEKS cipher store.

    NOTE(review): depends on module-level names `percent` (list), `o`
    (writable stream), `aes`, `PEKSServer` and `BloomFilter` defined
    elsewhere in the file.
    """

    def __init__(self, ciphers_dict, bit_vec, hash):
        # `hash` shadows the builtin; name kept for caller compatibility.
        self.ciphers_dict = ciphers_dict
        self.bit_vec = bit_vec
        self.hash = hash
        self.false_positives = 0
        self.filter = BloomFilter(self.bit_vec, self.hash)

    def SearchWord(self, dictionary, word):
        # Delegate the PEKS test for one keyword to the server.
        server = PEKSServer(dictionary[word][0])
        server.Test(self.ciphers_dict, dictionary[word][1])

    def insert(self, s1, cipher):
        # Key = first 32 hex chars of SHA-256(s1[0]); index the decrypted value.
        digest = hashlib.sha256(str(s1[0]).encode()).hexdigest()
        self.filter.insert(str(aes.decrypt(cipher, digest[:32])))

    def percentage(self, part, whole):
        # Ratio of part to whole, as a float percentage.
        return 100 * float(part) / float(whole)

    def bl_search(self, dict, store):
        # Probe the filter with every entry; a hit for a key absent from
        # `store` is a false positive.  Results go to `percent` and `o`.
        for w in dict:
            if self.filter.check(str(dict[w][1])) is True and w not in store:
                self.false_positives += 1
        p = self.percentage(self.false_positives, len(store))
        percent.append(p)
        o.write('Size of bit vector: ' + str(self.bit_vec) + '\n')
        o.write('Number of hashes used: ' + str(self.hash) + '\n')
        o.write('Number of false positives: ' + str(self.false_positives) + '\n')
        o.write('Percentage of false positives: ' + str(p) + '% \n\n\n')
def computeBloomFilter():
    """Build a BloomFilter sized to the line count of `input_path`.

    First pass counts lines to size the filter; second pass inserts each
    line together with its running occurrence count (`freq`).
    Returns the populated BloomFilter.
    """
    filename = input_path
    # Total number of lines determines the filter size.
    user_file_NOL = 0
    with open(filename) as user_file:
        for line in user_file:
            user_file_NOL += 1
    bloom_filter = BloomFilter(user_file_NOL)
    # Per-line occurrence counts feed the `freq` argument on insert.
    user_file_content = {}
    with open(filename) as user_file:
        for line in user_file:
            # Fix: bare `except:` replaced by dict.get — same counting
            # behavior without swallowing unrelated errors.
            user_file_content[line] = user_file_content.get(line, 0) + 1
            bloom_filter.insert(line, freq=user_file_content[line])
    return bloom_filter
def setUp(self):
    """Populate a fresh filter and a reference set with 400 random ints."""
    self.b_filter = BloomFilter()
    self.control = set()
    for _ in range(400):
        value = randint(0, 1000)
        self.b_filter.add_element(value)
        self.control.add(value)
def make_experiment(self):
    """Run b trials: insert n odd keys, probe n even keys, tally false positives.

    Leaves the per-trial average on self.falsePositiveCounter and returns
    it as a percentage of n (via self.percentage).
    """
    self.timeStart = time.time()
    for _ in range(self.b):
        # Fresh filter per trial.
        bf = BloomFilter(self.m, self.k)
        # Insert only odd values...
        for _ in range(self.n):
            bf.insert(str(random.randint(0, self.n) * 2 + 1))
        # ...then probe only even values: every hit is a false positive.
        for _ in range(self.n):
            if bf.check(str(random.randint(0, self.n) * 2)) is True:
                self.falsePositiveCounter = self.falsePositiveCounter + 1
    # Average the counter over the b trials.
    self.falsePositiveCounter = self.falsePositiveCounter / self.b
    self.timeFinish = time.time()
    return self.percentage(self.falsePositiveCounter, self.n)
def make_experiment(self):
    """One experiment: b independent trials of odd-insert / even-probe.

    Records wall time in self.timeStart / self.timeFinish; averages the
    false-positive counter over b and returns it as a percentage of n.
    """
    self.timeStart = time.time()
    trial = 0
    while trial < self.b:
        bf = BloomFilter(self.m, self.k)
        # Phase 1: insert n random odd numbers.
        for _ in range(self.n):
            odd = random.randint(0, self.n) * 2 + 1
            bf.insert(str(odd))
        # Phase 2: check n random even numbers; any "present" is false.
        for _ in range(self.n):
            even = random.randint(0, self.n) * 2
            if bf.check(str(even)) is True:
                self.falsePositiveCounter = self.falsePositiveCounter + 1
        trial += 1
    self.falsePositiveCounter = self.falsePositiveCounter / self.b
    self.timeFinish = time.time()
    return self.percentage(self.falsePositiveCounter, self.n)
class BloomSpellChecker(object):
    """Spell checker backed by a Bloom filter of known-good words."""

    def __init__(self):
        self.myBloomFilter = BloomFilter()

    def addWord(self, aWord):
        """Register a correctly spelled word."""
        self.myBloomFilter.addWord(aWord)

    def checkWord(self, aWordToCheck):
        """Return the first candidate spelling present in the filter.

        Falls back to the literal string "no correction found".
        """
        for candidate in self.generateWordOptions(aWordToCheck):
            if self.myBloomFilter.checkWord(candidate):
                return candidate
        return "no correction found"

    def generateWordOptions(self, aWordToCheck):
        """Candidate spellings: case variants plus collapsed-repeat forms of each."""
        options = [aWordToCheck, aWordToCheck.lower(), aWordToCheck.capitalize()]
        for variant in (aWordToCheck, aWordToCheck.lower(), aWordToCheck.capitalize()):
            options.extend(self.generateWordOptionsByRemovingRepeatingCharacters(variant))
        return options

    def generateWordOptionsByRemovingRepeatingCharacters(self, aWord):
        """All strings reachable by collapsing runs of repeated characters."""
        if len(aWord) <= 1:
            return {aWord}
        options = set()
        first = aWord[0:1]
        for tail in self.generateWordOptionsByRemovingRepeatingCharacters(aWord[1:]):
            if first == tail[0]:
                # Drop one of the duplicated letters...
                options.add(first + tail[1:])
            # ...and always keep the un-collapsed combination too.
            options.add(first + tail[0:])
        return options
def testCheckWordWithRealHasher(self):
    """End-to-end: every added word is found; chosen near-misses are not."""
    theBloomFilter = BloomFilter()
    words = ("foo", "bar", "barf", "barge", "barn", "bart", "fnarfle-pants",
             "BLARG", "blarg", "a", "aardvark", "platypus", "melee",
             "somethingreallylong", "carrot", "derpa derpa der", "b", "#winning")
    for word in words:
        self.validateAddingWord(theBloomFilter, word)
    # Words never added should (for these inputs) not collide.
    for missing in ("bat", "mele", "blah"):
        self.assertFalse(theBloomFilter.checkWord(missing))
def run(self):
    """Main crawl loop: log in, purge stale rows, crawl every stored hotel.

    NOTE(review): credentials appear as masked placeholders ('******') in
    this copy; the trailing Num increment / driver.quit placement was
    ambiguous in the original formatting — placed at loop-body level.
    """
    username = '******'
    password = '******'
    while True:
        # Log in and keep a screenshot for debugging.
        self.login_index(username, password)
        self.driver.save_screenshot('login.png')
        XieChenGSpider.delete_old_day()  # drop expired data
        print(XieChenGSpider.Num)
        select_hotel = """SELECT hotel FROM t_cl_xiecheng_hotel"""
        db = pymysql.connect(host='47.92.162.87', port=3306, user='******',
                             password='******', db='db_bby_xiecheng')
        cursor = db.cursor()
        cursor.execute(select_hotel)
        result = cursor.fetchall()
        db.close()
        for hotel in result:
            hotel_name = hotel[0]
            print(hotel_name)
            bf = BloomFilter()
            # Skip hotels already seen for the current crawl round.
            if bf.isContains(hotel_name + str(XieChenGSpider.Num)):
                print('exists!')
            else:
                print('not exists!')
                bf.insert(hotel_name + str(XieChenGSpider.Num))
                hotel_url = self.get_url(hotel_name)
                self.get_hotel_page(hotel_url)
        XieChenGSpider.Num += 1
        self.driver.quit()
def response(flow):
    """mitmproxy hook: harvest unseen shops from ele.me channel-page responses into a CSV."""
    url = 'https://restapi.ele.me/mix/app/channelPage?extras[]=coupon&scene=app:channel'
    print(flow.request.headers['User-Agent'])
    print('#' * 90)
    if flow.request.url.startswith(url) and 'costFrom' in flow.request.url:
        bf = BloomFilter()
        data = json.loads(flow.response.text)
        for items in data.get('recommendList').get('items'):
            shop_id = items.get('restaurant').get('id')
            shop_name = items.get('restaurant').get('name')
            shop_url = shop_url_path.format(shop_id)
            print(shop_id, shop_name, shop_url)
            # Only write shops we have not seen before.
            if bf.isContains(shop_id):
                print('exists!')
            else:
                print('not exists!')
                bf.insert(shop_id)
                data_list = [
                    shop_id, shop_name, '珠江摩尔国际大厦8号楼', '', shop_url, '',
                    '昌平区', '北京市', ' ', ''
                ]
                with open(r'新店饿了么.csv', 'a+', encoding='utf-8-sig', newline='') as file:
                    writer = csv.writer(file, dialect='excel')
                    writer.writerow(data_list)
                print('*' * 100)
def main(num_keys, file_dir): input_keys = int(num_keys) bf = BloomFilter(input_keys) for line in fileinput.input(file_dir): line = line.strip() if bf.query(line) == 0: bf.insert(line) print line
def test_hash_indexes(self):
    """hash_indexes must yield exactly k indexes, each within [0, m)."""
    k, m = 3, 128
    bloom = BloomFilter(m, k)
    hash_indexes = bloom.hash_indexes('#')
    self.assertEqual(len(hash_indexes), 3)
    for index in hash_indexes:
        self.assertIn(index, range(0, m))
def __init__(self):
    """Initialise the filter (300 bits, 2 hash functions) and the counters.

    NOTE(review): self.taille (33000) does not match the 300 actually
    passed to BloomFilter — reproduced as-is for behavior parity.
    """
    self.tableau = []             # (address, count) pairs
    self.nb_function = 1
    self.taille = 33000
    self.filtre = BloomFilter(300, 2)
    self.faux_positive = 0        # false-positive counter
    self.res = "resultBloom.csv"  # result output file name
def calc2():
    """Sum Bloom-filter results over words in the first 10000 lines of pg1661.txt.

    Returns the accumulated count from reads.BloomFilter(word) calls.
    """
    count = 0
    reads = BloomFilter(320000)
    # Fix: context manager closes the file (original leaked the handle);
    # inner loop variable renamed (original shadowed the outer `i`).
    with open('pg1661.txt', 'r') as fo:
        for _ in range(10000):
            # readline() on an exhausted file yields '' -> split gives [''],
            # matching the original's behavior past end-of-file.
            for word in fo.readline().split(' '):
                count += reads.BloomFilter(word)
    return count
def testFoo(self):
    """Add foo, foo1..foo6 then dump the tracking hasher's collected data."""
    bloom = BloomFilter(self.myTrackingHasher)
    bloom.addWord('foo')
    for suffix in range(1, 7):
        bloom.addWord('foo' + str(suffix))
    self.myTrackingHasher.printTrackingData()
def makeFilterloadPayload(publicKeyHash, scriptHash, transactionID):
    """Serialize a Bloom filter seeded with the three items (a filterload payload)."""
    # Renamed local: the original shadowed the builtin `filter`.
    bloom = BloomFilter(3, 0.01, 0, 1)
    for item in (publicKeyHash, scriptHash, transactionID):
        bloom.add(item)
    print(len(bloom.bit_array))
    return bloom.serialize()
def main(): # bf = BF(mbits=15000, nitems=680) bf = BF() # --- Insertion to bloomfilter pattern --- bf.insertToBloomFilterPattern('4.6649981966.7720977692.5995632615.525560175001.063711911256403200000000') print bf.lookupFromBloomFilterPattern('4.6649981966.7720977692.5995632615.525560175001.063711911256403200000000') print bf.lookupFromBloomFilterPattern('4.7684212566.6980465244.51134545.522727217001.006710836100502560.2343192991111110') bf.setInitialElementsOfBloomFilterPattern('../rawfiles/bf_initialcandidate.csv')
def create_bloom_filter(self, data):
    """Back the learned model with a plain Bloom filter of its false negatives."""
    print("Creating bloom filter")
    preds = self.model.predicts(data.positives)
    # Positives the model scores at/below the threshold would be missed;
    # the backup filter must contain exactly those.
    false_negatives = [
        data.positives[i] for i in range(len(data.positives))
        if preds[i] <= self.threshold
    ]
    print("Number of false negatives at bloom time", len(false_negatives))
    self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2, string_digest)
    for fn in false_negatives:
        self.bloom_filter.add(fn)
    print("Created bloom filter")
class OccurenceBloom:
    """Address frequency counter using a Bloom filter as a fast membership pre-check."""

    def __init__(self):
        # Filter plus a plain (address, count) table used to confirm hits.
        self.tableau = []
        self.nb_function = 1
        self.taille = 33000
        self.filtre = BloomFilter(300, 2)
        self.faux_positive = 0
        self.res = "resultBloom.csv"

    def ajouter(self, adresse):
        """Count one occurrence of `adresse`, tracking Bloom false positives."""
        if self.filtre.existe(adresse):
            # Filter says "maybe present": confirm with a linear scan.
            for i, (addr, count) in enumerate(self.tableau):
                if addr == adresse:
                    self.tableau[i] = (adresse, count + 1)
                    break
            else:
                # Filter hit with no table entry: a false positive.
                self.tableau.append((adresse, 1))
                self.faux_positive += 1
        else:
            # Definitely new: insert directly and update the filter.
            self.tableau.append((adresse, 1))
            self.filtre.ajouter(adresse)

    def sauvegarder(self):
        """Pickle this object to save.bloom."""
        with open("save.bloom", "wb") as sauvegarde:
            pickle.dump(self, sauvegarde, pickle.HIGHEST_PROTOCOL)
        print("sauvegarde en cours ....")

    def reset(self):
        """Clear the table and rebuild the filter with current parameters."""
        self.tableau = []
        self.filtre = BloomFilter(self.taille, self.nb_function)
        self.faux_positive = 0

    def dix_premier(self):
        """Top ten addresses by descending occurrence count."""
        return sorted(self.tableau, key=lambda x: -x[1])[0:10]

    def changer_taille(self, taille):
        """Change the bit-vector size and rebuild the filter."""
        self.taille = taille
        self.filtre = BloomFilter(taille, self.nb_function)

    def changer_nb_fct(self, nb_function):
        """Change the number of hash functions and rebuild the filter."""
        self.nb_function = nb_function
        self.filtre = BloomFilter(self.taille, nb_function)
class TestBloomFilter(TestCase):
    """Probabilistic property: the filter may yield false positives, never false negatives."""

    def setUp(self):
        self.b_filter = BloomFilter()
        self.control = set()
        for _ in range(400):
            value = randint(0, 1000)
            self.b_filter.add_element(value)
            self.control.add(value)

    def test_check_element(self):
        for _ in range(100000):
            value = randint(0, 1000)
            # A negative answer from the filter must be authoritative.
            if not self.b_filter.check_element(value):
                self.assertTrue(value not in self.control)
class LocalBloomFilter():
    """BloomFilter wrapper that owns its own bitmap buffer.

    add() and is_contain() accept a single string or a list/tuple of strings.
    """

    def __init__(self, capacity, error, prime_length=True):
        self.bf = BloomFilter(capacity, error, prime_length)
        # Fix: `bytes` is immutable, so BloomFilter.add could never set
        # bits in place; a bytearray of the same size is mutable.
        self.bitmap = bytearray(int(self.bf.bits / 8) + 1)

    def add(self, data):
        """Add one string, or every string in a list/tuple."""
        if isinstance(data, (list, tuple)):
            for v in data:
                assert isinstance(
                    v, str), 'add() arg must be a str or list/tuple of strings'
                self.bf.add(self.bitmap, v)
        else:
            assert isinstance(
                data, str), 'add() arg must be a str or list/tuple of strings'
            self.bf.add(self.bitmap, data)

    def is_contain(self, data):
        """Membership test: a bool for a string, a list of bools for list/tuple input."""
        if isinstance(data, (list, tuple)):
            # Validate every element before answering (original validated
            # only up to its early return).
            for v in data:
                assert isinstance(
                    v, str
                ), 'is_contain() arg must be a str or list/tuple of strings'
            return [self.bf.is_contain(self.bitmap, v) for v in data]
        assert isinstance(
            data, str), 'is_contain() arg must be a str or list/tuple of strings'
        return self.bf.is_contain(self.bitmap, data)

    def clean(self):
        """Reset every bit in the bitmap."""
        self.bf.clean_bitmap(self.bitmap)
def testFoo(self):
    """Populate the filter with foo-variants and print hasher tracking data."""
    theBloomFilter = BloomFilter(self.myTrackingHasher)
    for word in ('foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6'):
        theBloomFilter.addWord(word)
    self.myTrackingHasher.printTrackingData()
def test_working(self):
    """Added words are reported present; an absent word is reported missing."""
    bloom_filter = BloomFilter(25)
    for word in ("these", "fdfdf"):
        bloom_filter.add_word(word)
        self.assertTrue(bloom_filter.check_word(word))
    self.assertFalse(bloom_filter.check_word("plaban"))
def getMissingContent(n, bloomfilter_bytes):
    """Map line numbers to lines of `input_path` absent from a received filter.

    n -- expected element count used to size the receiving filter
    bloomfilter_bytes -- serialized filter bytes from the peer
    Returns {line_number: line} for every (line, running-frequency) pair the
    filter does not validate.
    """
    missing_content = {}
    receivedBF = BloomFilter(n)
    receivedBF.readBloomFilterFromBytes(bloomfilter_bytes)
    user_file_content = {}
    line_number = 0
    with open(input_path) as user_file:
        for line in user_file:
            line_number += 1
            # Fix: bare `except:` replaced by dict.get for the counter.
            user_file_content[line] = user_file_content.get(line, 0) + 1
            if not receivedBF.validate(line, freq=user_file_content[line]):
                missing_content[line_number] = line
    return (missing_content)
def test_real_fp_prob(filename, fp_prob=0.01, hash_cnt=3, iteration=30, test_size=10000):
    """
    test false positive probability by given desgired false positive probability and given k hash function constraints
    """
    # Fix: `with open(...)` replaces the try/finally whose
    # `except Exception as excep: raise` was a no-op re-raise.
    word_list = []
    # use ISO for weird character
    with open(filename, "r", encoding="ISO-8859-1") as int_file:
        while True:
            line = int_file.readline().replace('\n', '')
            if not line:
                break
            word_list.append(line)
    print(
        f'==============================================================================================================='
    )
    print(
        f'test false positive probability by given desgired false positive probability and given k hash function constraints'
    )
    print(
        f'==============================================================================================================='
    )
    if test_size <= 0:
        raise ValueError("test_size must bigger than zero")
    if len(word_list) <= 1:
        raise ValueError("word_list_size must be at least 2")
    # First half of the words go into the filter; the second half stays out.
    word_present_cnt = int(len(word_list) / 2)
    word_present = word_list[:word_present_cnt]
    word_absent = word_list[word_present_cnt:]
    # number of items being insert
    items_count = word_present_cnt
    bloom_filter = BloomFilter(items_count, fp_prob, hash_cnt)
    print(f'word_list size: {len(word_list)}')
    print(f'iteration: {iteration}')
    print(f'input test size: {test_size}')
    # use smaller cnt as possible test_size
    test_size = min(test_size, len(word_list))
    print(f'adjusted test size: {test_size}')
    for _ in range(iteration):
        print(f'test false positive rate: {format(evaluate_error_rate(word_present, word_absent, bloom_filter, min(test_size, len(word_list))), "f")}')
def read_write_test(word, items_count, fp_rpob, hash_cnt):
    """Single write/read round trip: add `word`, then look it up.

    Note: parameter `fp_rpob` (sic) kept for caller compatibility.
    e.g. items_count=1000, fp_prob=0.01, hash_cnt=3.
    """
    separator = '==============================================================================================================='
    print(separator)
    print('single write and read test')
    print(separator)
    bloom_filter = BloomFilter(items_count, fp_rpob, hash_cnt)
    bloom_filter.add(word)
    print(f'add data: {word}')
    print(f'look up data: {word} = {bloom_filter.may_match(word)}')
def evaluate_fp_rate(items_count, fp_list, hash_cnt_list):
    """Print array size, ratio and space for every (fp_prob, hash_cnt) pair."""
    separator = '==============================================================================================================='
    print(separator)
    print('evaluate false positive probailitiy by given desgired false positive probability and given k hash function constraints')
    print(separator)
    for fp_prob in fp_list:
        for hash_cnt in hash_cnt_list:
            # Required bit-array size for this configuration.
            size = BloomFilter.get_size_by_hash_count_and_fp_prob(
                items_count, hash_cnt, fp_prob)
            ratio = size / items_count
            print(f'false_positive:{fp_prob} \
array_size: {size} \
hash_count: {hash_cnt} \
ratio: {format(ratio, ".2e")} \
space(MB): {format(size / math.pow(2, 20) / 8, "f")}')
def __init__(self):
    """Set up DB access, a dedup filter, ele.me endpoints and request headers.

    NOTE(review): the cookie below is a captured session value and will
    expire; reproduced verbatim.
    """
    self.my = OperateSql()
    self.bf = BloomFilter()
    # Endpoint for a shop's business qualification, and the CDN image path.
    self.license_url = 'https://h5.ele.me/restapi/shopping/v1/restaurants/{}/business/qualification'
    self.license_path = 'https://cube.elemecdn.com/{}.jpeg'
    self.headers = {
        "accept": "application/json, text/plain, */*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "cookie": "__wpkreporterwid_=5c1b9625-e0e5-4191-94ec-617c63d85828; ubt_ssid=orikwvv98rf349gzuztl6qswlsehx95b_2019-10-31; perf_ssid=39tu4b3xkbrpyw3gwzyrroi0a37e5ltj_2019-10-31; ut_ubt_ssid=6puu2k5t5iqqze593aa84gqsb402sks6_2019-10-31; _bl_uid=IgkgO2v0e0k6LwqmCrz7e2O8wRUz; cna=gY2qExKVBQECAd9YCuTqIvMq; _utrace=70d38df1be30fbbdbbc6a5282dd6ce95_2019-10-31; track_id=1572494757|16968ea868ee345e15bdb0c6f4d78b23e2e3c4d4569c374fae|bb82e1bf879b4737f8c5f86195210994; USERID=10475002; tzyy=0a37b285480279db053bbd4cbb8d8310; UTUSER=10475002; SID=AjfMV8XFn8xrZJTbt0C8i8m1oqB2DbruQxoQ; ZDS=1.0|1572503692|RGAZN+cBWA5CW8xwYLcUH12DMCx9UoNnWZPQMDqhx1gO8fOuCaRY/HxKf+BFRrzP30TX/MIJZjpOiqnkcPPGYw==; l=dBLedHhnqFN5qTL5BOfgmuI8Si_tyIRfGsPzw4GXtICPOdCePBpVWZQZ7WYwCnGVnswvR37NvzWDBV8nkyzHQgfYduwXXEigzd8h.; pizza-rc-ca-result=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhbGciOiJIUzI1NiIsImV4cCI6MTU3MjU4MDI1NCwic2FsdDEiOiJiMGI3MzQzZWU5MTc4YzI1ODgxMWQwZDczMTg1NDU4YSIsInNhbHQyIjoiNjk5MzhlNjkwMmZkOTA5YTFjNjFhOTdlZGZiZmZiNzkifQ.dzzgRy_scGB0QBifCrNavjsDkSyX4a2a-ubs9nkX0Fo; pizza73686f7070696e67=_HHDoSEnvf2II7jNDsoNPY2O8tPmnHsxgglXdGQnv3Q27pXDbCCoc4uL5jGRs6EA; isg=BFZW_vooF5q4QyOfTzbsbr0gpwyYX_89LP606MC_QjnUg_YdKIfqQbx1H1nKK5JJ",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36"
    }
class DeepBloom(object):
    """Learned Bloom filter: a trained model front-ends a classic backup filter.

    The model absorbs most positives; positives it misses (false negatives)
    go into a conventional Bloom filter so membership has no false negatives.
    """

    def __init__(self, model, data, fp_rate):
        self.model = model
        self.threshold = None
        self.fp_rate = float(fp_rate)
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        """Return True if the model or the backup filter reports membership."""
        if self.model.predict(item) > self.threshold:
            return True
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        """Collect the model's false negatives into the backup Bloom filter."""
        print("Creating bloom filter")
        preds = self.model.predicts(data.positives)
        false_negatives = [
            data.positives[i] for i in range(len(data.positives))
            if preds[i] <= self.threshold
        ]
        print("Number of false negatives at bloom time", len(false_negatives))
        # Half the FP budget goes to the backup filter (the model gets the rest).
        self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2, string_digest)
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    def fit(self, data):
        """Train the model and pick the score threshold for the target FP rate."""
        # Split negative data into subgroups: train on s1, calibrate on s2.
        (s1, s2) = split_negatives(data)
        print("Training model with train, dev, positives", len(s1), len(s2), len(data.positives))
        # Shuffle a subset of negatives together with the positives, then train.
        shuffled = shuffle_for_training(s1, data.positives)
        self.model.fit(shuffled[0], shuffled[1])
        print("Done fitting")
        # Threshold such that at most len(s2) * fp_rate/2 dev negatives
        # score above it.
        fp_index = math.ceil((len(s2) * (1 - self.fp_rate / 2)))
        predictions = self.model.predicts(s2)
        predictions.sort()
        self.threshold = predictions[fp_index]
def test_bloom_creation(self):
    """A new filter exposes k, size, an all-zero bit array, and k hash functions."""
    k, m = 3, 128
    expected_bits = bitarray(m)
    expected_bits.setall(0)
    bloom = BloomFilter(m, k)
    self.assertEqual(bloom.k, k)
    self.assertEqual(bloom.size, m)
    self.assertEqual(bloom.bits, expected_bits)
    self.assertEqual(len(bloom.hashFunctions), k)
def test_contain(self):
    """An element is absent before add() and present after."""
    k, m = 3, 128
    bloom = BloomFilter(m, k)
    self.assertFalse(bloom.test('#'))
    bloom.add("#")
    self.assertTrue(bloom.test('#'))
def test_bloom_filter():
    """Insert known words, then probe a shuffled mix of present and absent words."""
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = ['abound', 'abounds', 'abundance', 'abundant', 'accessable',
                    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
                    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
                    'gems', 'generosity', 'generous', 'generously', 'genial']
    word_absent = ['facebook', 'twitter']
    for item in word_present:
        bloomfilter.add(item)
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if not bloomfilter.is_member(word):
            print(f"'{word}' is definitely not present!")
        elif word in word_absent:
            # Reported present but never added: a false positive.
            print(f"'{word}' is a false positive!")
        else:
            print(f"'{word}' is probably present!")
def main():
    """CLI: `build` pickles a populated BloomFilter; `query` loads one and runs queries."""
    parser = ag.ArgumentParser()
    # Shared option groups for the two subcommands.
    parser_build = ag.ArgumentParser(add_help=False)
    parser_build.add_argument('-k', type=str, help="Key File", required=True, dest='k')
    parser_build.add_argument('-f', type=float, help="FPR", required=True, dest='f')
    parser_build.add_argument('-n', type=int, help="Number of distinct keys", required=True)
    parser_build.add_argument('-o', type=str, help="Output file to store input", required=True)
    parser_query = ag.ArgumentParser(add_help=False)
    parser_query.add_argument('-i', type=str, help="Input file containing bloomFilter array", required=True)
    parser_query.add_argument('-q', type=str, help="Input file containing queries", required=True)
    subparsers = parser.add_subparsers()
    subparser_build = subparsers.add_parser("build", parents=[parser_build])
    subparser_build.set_defaults(which="build")
    subparser_query = subparsers.add_parser("query", parents=[parser_query])
    subparser_query.set_defaults(which="query")
    args = parser.parse_args()
    if args.which == 'build':
        # Build, populate and pickle the filter.
        BF = BloomFilter(args.f, args.n)
        BF.insert(args.k)
        with open(args.o, "wb") as f:
            Pi.dump(BF, f)
    elif args.which == 'query':
        if not os.path.exists(args.i):
            sys.exit("Input file does not exists")
        with open(args.i, "rb") as f:
            BF = Pi.load(f)
        BF.query_file(args.q)
def create_bloom_filter(self, data):
    """Build the backup Bloom filter from positives that every one of the k models misses."""
    print("Creating bloom filter")
    # One prediction vector per model, all over the same positives.
    preds = [self.models[i].predicts(data.positives) for i in range(self.k)]
    false_negatives = []
    for j in range(len(data.positives)):
        # A positive is a false negative only if no model clears its threshold.
        if all(preds[i][j] <= self.thresholds[i] for i in range(self.k)):
            false_negatives.append(data.positives[j])
    print("Number of false negatives at bloom time", len(false_negatives))
    print("Effective bloom filter false positive rate", self.fp_rate_bloom)
    self.bloom_filter = BloomFilter(
        len(false_negatives), self.fp_rate_bloom, string_digest
    )
    for fn in false_negatives:
        self.bloom_filter.add(fn)
    print("Created bloom filter")
def url_test(positives, negatives, fp_rate):
    """Insert all positives (verifying no false negatives) and measure the FP rate on negatives."""
    bf = BloomFilter(len(positives), fp_rate, string_digest)
    for pos in positives:
        bf.add(pos)
        # A just-added element must always be reported present.
        assert (bf.check(pos))
    print("Bits needed", bf.size)
    print("Hash functions needed", bf.hash_count)
    fp = 0.0
    for neg in negatives:
        fp += 1 if bf.check(neg) else 0
    print("False positives", fp / len(negatives))
def __init__(self, name, path, switch=False, solr=None):
    """Create per-site log dirs, queues, router, dedup filter and handlers."""
    self.name = name
    self.switch = switch
    self.path = path + '\logs\\' + self.name
    # Ensure the log directories exist before the logger opens files.
    if not os.path.exists(path + '\logs'):
        os.mkdir(path + '\logs')
    if not os.path.exists(self.path):
        os.mkdir(self.path)
    self.routor = Routor(name, path)
    self.queue = Queue.Queue(maxsize=0)
    self.failQueue = Queue.Queue(maxsize=0)  # queue of failed URLs
    self.logger = self.newLogging(name)
    self.bloomfilter = BloomFilter()
    self.count = 0
    # Seed the crawl with the first route pattern.
    self.queue.put(self.routor.route[0]['pattern'])
    self.sleeptime = self.routor.route[-1]['sleeptime']
    self.block = SleepTime(self.sleeptime)  # throttling / block detection
    self.fail = 0
    self.job = deal(name, 'job', path, solr)
    self.company = deal(name, 'company', path, solr)
class BloomFilterTestCase(unittest.TestCase):
    """Load the system dictionary into a filter and spot-check lookups."""

    def setUp(self):
        self.bf = BloomFilter(262144, 14)
        # Fix: context manager closes the dictionary file (original leaked
        # the handle returned by open(...).read()).
        with open("/usr/share/dict/american-english") as words:
            for line in words.read().splitlines():
                self.bf.update(line)

    def tearDown(self):
        pass

    def test_probably(self):
        # Dictionary words must at least report "Probably" present.
        self.assertEqual(self.bf.lookup("Max"), "Probably")
        self.assertEqual(self.bf.lookup("mice"), "Probably")

    def test_nope(self):
        # Non-words must be definitively absent.
        self.assertEqual(self.bf.lookup("3"), "Nope")
        self.assertEqual(self.bf.lookup("google"), "Nope")
from BloomFilter import BloomFilter, TrackingHasher if __name__ == '__main__': theHasher = TrackingHasher() theBloomFilter = BloomFilter( theHasher ) # theBloomFilter = BloomFilter() [theBloomFilter.addWord( line.strip() ) for line in open('/usr/share/dict/words')] theHasher.printTrackingData() while True: theInput = raw_input( 'Enter a word: ' ) if theInput == 'q!': break if ( theBloomFilter.checkWord( theInput ) ): print theInput + ' was found!' else: print theInput + ' was not found...' theBloomFilter.printDebuggingData( theInput )
class TestBloomFilter(unittest.TestCase):
    """Unit tests with a scripted test hasher, plus one end-to-end real-hasher test."""

    def setUp(self):
        self.myHasherForTest = HasherForTest()
        self.myBloomFilter = BloomFilter(self.myHasherForTest)

    def testHasBitField(self):
        self.assertIsNotNone(self.myBloomFilter.myBitArray,
                             'bitfield does not exist')

    def testBitFieldDefaultsToFalse(self):
        self.assertFalse(self.myBloomFilter.myBitArray.any(),
                         'bitfield should be initialized to all Falses')

    def testAddWord(self):
        # Scripted hash indexes for the two words; index 5 is shared.
        self.myHasherForTest.myTestResults['foo'] = [4, 5]
        self.myHasherForTest.myTestResults['bar'] = [2, 7, 5]
        self.myBloomFilter.addWord('foo')
        self.myBloomFilter.addWord('bar')
        self.assertThatOnlyIndexesAreTrue([4, 5, 2, 7])

    def assertThatOnlyIndexesAreTrue(self, anIndexes):
        # Every listed index must be set; clearing them must leave nothing set.
        theBitArrayCopy = bitarray(self.myBloomFilter.myBitArray)
        for theIndex in anIndexes:
            self.assertTrue(self.myBloomFilter.myBitArray[theIndex],
                            'index ' + str(theIndex) + ' is false when it should be true')
            theBitArrayCopy[theIndex] = False
        self.assertFalse(theBitArrayCopy.any(),
                         'an unexpected index(es) were True, they should be false: ' + str(theBitArrayCopy))

    def testCheckWord(self):
        self.myHasherForTest.myTestResults['foo'] = [4, 5]
        self.myHasherForTest.myTestResults['bar'] = [2, 7, 5]
        self.myBloomFilter.addWord('foo')
        self.assertTrue(self.myBloomFilter.checkWord('foo'),
                        'word foo should be in the filter')
        self.assertFalse(self.myBloomFilter.checkWord('bar'),
                         'word bar should NOT be in the filter')

    def testCheckWordWithRealHasher(self):
        theBloomFilter = BloomFilter()
        for word in ("foo", "bar", "barf", "barge", "barn", "bart",
                     "fnarfle-pants", "BLARG", "blarg", "a", "aardvark",
                     "platypus", "melee", "somethingreallylong", "carrot",
                     "derpa derpa der", "b", "#winning"):
            self.validateAddingWord(theBloomFilter, word)
        # None of these near-misses should collide for this input set.
        for missing in ("bat", "mele", "blah"):
            self.assertFalse(theBloomFilter.checkWord(missing))

    def validateAddingWord(self, aBloomFilter, aWordToTest):
        # A word must be absent before adding and present after.
        self.assertFalse(aBloomFilter.checkWord(aWordToTest))
        aBloomFilter.addWord(aWordToTest)
        self.assertTrue(aBloomFilter.checkWord(aWordToTest))
def setUp(self):
    """Wire the scripted test hasher into a fresh BloomFilter."""
    self.myHasherForTest = HasherForTest()
    self.myBloomFilter = BloomFilter(self.myHasherForTest)
def setUp(self):
    """Load the system dictionary into a 262144-bit, 14-hash filter."""
    self.bf = BloomFilter(262144, 14)
    # Fix: context manager closes the dictionary file (the original
    # open(...).read() leaked the handle).
    with open("/usr/share/dict/american-english") as words:
        for line in words.read().splitlines():
            self.bf.update(line)
from BloomFilter import BloomFilter import sys import time #import resource if(len(sys.argv)!=4): print "Invalid input arguments! The number of them is not right!" exit(0) bloom=BloomFilter() try: input_file=open(sys.argv[1],"r") output_file=open(sys.argv[2],"a+") p=float(sys.argv[3]) except IOError: print "Invalid input arguments! Check that the input file exists!" exit(0) except ValueError as e: print "Third argument not a float number!" exit(0) lines=input_file.readlines() bloom.initialize_using_np(len(lines),float(sys.argv[3])) start_time=time.time() for line in lines: line=line.strip().rstrip() bloom.add(line) input_file.close()
from BloomFilter import BloomFilter

# Smoke test: inserted keys are found, a never-inserted key is not.
bf = BloomFilter()
for key in ("python", "vk"):
    bf.add(key)
assert (bf.check("python") == True)
assert (bf.check("vk") == True)
assert (bf.check("kaboom") == False)
def __init__(self):
    """Create the backing Bloom filter used by this object."""
    self.myBloomFilter = BloomFilter()
class Strategy(object): """爬取策略""" def __init__(self, name,path,switch=False,solr=None): self.name=name self.switch=switch self.path=path+'\logs\\'+self.name if not os.path.exists(path+'\logs'): os.mkdir(path+'\logs') if not os.path.exists(self.path): os.mkdir(self.path) self.routor=Routor(name,path) self.queue=Queue.Queue(maxsize=0) self.failQueue=Queue.Queue(maxsize=0)#失败队列 self.logger=self.newLogging(name) self.bloomfilter=BloomFilter() self.count=0 self.queue.put(self.routor.route[0]['pattern']) self.sleeptime=self.routor.route[-1]['sleeptime'] self.block=SleepTime(self.sleeptime)#屏蔽模块 self.fail=0 self.job=deal(name,'job',path,solr) self.company=deal(name,'company',path,solr) def newLogging(self,name): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) # 创建一个handler,用于写入日志文件 fh = logging.FileHandler(self.path+'\\'+name+'.log') fh.setLevel(logging.DEBUG) # 再创建一个handler,用于输出到控制台 ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) # 定义handler的输出格式 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) ch.setFormatter(formatter) # 给logger添加handler logger.addHandler(fh) logger.addHandler(ch) return logger """功能函数包装""" def link_and_check(func): def _wrapper(*args,**kw): url=unicode(args[1]) headers ={'Accept':'text/html;q=0.9,*/*;q=0.8','Accept-Charset':'ISO-8859-1,utf-8,gb2312;q=0.7,*;q=0.3','Accept-Encoding':'gzip','Connection':'close','Referer':None} headers['User-Agent']=getHeader() #注意如果依然不能抓取的话,Referer可以设置抓取网站的host try: try: req=requests.get(url,timeout=5,headers=headers) except Exception as e: raise FailException(args[0],'bad requests:'+str(type(e))[8:]) if req: if req.content: kw['content']=req.content return func(*args,**kw) else: args[0].logger.warning('No Content in URL: %s'%url) raise FailException(args[0],'No Content in URL') except FailException as e: args[0].logger.warning('URL: %s | info: %s'%(url,e.info)) args[0].logger.warning('fail: %s | Stime: 
%s'%(args[0].fail,args[0].sleeptime)) if args[0].switch: print 'put in failQueue' args[0].failQueue.put(url) finally: if args[0].switch: tim=args[0].block.isBlocked(args[0].fail) if isinstance(tim,tuple): if tim[1]: #学习停止 args[0].switch=False args[0].routor.setST(tim[0]) tim=tim[0] args[0].sleeptime=tim print args[0].sleeptime if args[0].fail==0 or not args[0].switch: if not args[0].failQueue.empty(): for x in range(args[0].failQueue.qsize()): u=args[0].failQueue.get() args[0].queue.put(u) return _wrapper """功能函数""" @link_and_check def enter(self,url,**kw): #处理需要进入并且获取网页指定区域子连接的URL text=kw['content'] area=self.getArea(text,kw['loc']) linklist=self.getAllAch(area) for link in linklist: if not self.bloomfilter.isContain(link): self.queue.put(link) self.bloomfilter.insert(link) self.fail=0 @link_and_check def need(self,url,**kw): #处理目标页面的文本信息,直接下载到本地 text=kw['content'] if kw['ctg']=='job': filename='\job_save.log' self.job.txt=text self.job.url=url forsave= self.job.send('update')[0] else: filename='\company_save.log' self.company.txt=text self.company.url=url forsave= self.company.send('update')[0] with open(self.path+filename,'a') as f: f.write(forsave) self.fail=0 def auto(self,url,**kw): #处理需要调用URLgenerator的URL self.logger.warning('BEGIN USING ATUO generator!') self.routor.match(url,submodel=True) if len(kw['replace'])==2: replace=[str(n) for n in xrange(kw['replace'][0],kw['replace'][1])] else: replace=kw['replace'] for x in replace: u=urlGenerator(url,kw['between'],x) if not self.bloomfilter.isContain(u): self.distributor(u) self.bloomfilter.insert(u) self.routor.match(url,submodel=False) """策略核心""" def core(self): ti=time.time() isFinish=False #退出判定 t=time.time()-ti #计时退出 # try: # while not isFinish: # # size=self.queue.qsize() # # self.logger.info('before get url, Queue size = %s'%size) # url=self.queue.get() # self.distributor(url) # t=time.time()-ti # if t>3600: # isFinish=True # print 'COUNTE = ',self.count # self.logger.info('COUNT = %s'%self.count) # 
except: # print 'FINISH ! In Time:',t # print self.queue.qsize() # self.logger.info('FINISH ! In Time: %s'%t) while not isFinish: url=self.queue.get() self.distributor(url) #退出机制,测试用 # t=time.time()-ti # if t>3600: # isFinish=True # print 'COUNTE = ',self.count # self.logger.info('COUNT = %s'%self.count) print 'FINISH ! In Time:',t print self.queue.qsize() self.logger.info('FINISH ! In Time: %s'%t) def distributor(self,url): #分发链接 afterRoute=self.routor.match(url) if afterRoute: self.count+=1 self.logger.info('%s: %s'%(afterRoute['model'],url)) if afterRoute['model']=='enter': self.enter(url,**afterRoute['args']) elif afterRoute['model']=='need': self.need(url,**afterRoute['args']) elif afterRoute['model']=='auto': self.auto(url,**afterRoute['args']) # time.sleep(self.sleeptime) else: self.logger.warning('URL: %s is not found in Pattern !'%url) """工具方法""" def getArea(self,text,loc): #获取指定文本之间的文本 for k,v in loc.iteritems(): l=[k,v] t=getContent(text,l) if t: return t print text raise FailException(self,'No Area is Done') def getAllAch(self,text): #获取指定文本中的链接,并查重,返回list soup=BeautifulSoup(text) linklist=[link.get('href') for link in soup.find_all('a')] if len(linklist)==0: raise FailException(self,'No link in content') legallink=[] for link in linklist: link=str(link) if re.match(r'http://.*',link): legallink.append(link) linklist=legallink for script in soup.find_all('script'): scr=str(script) r=re.findall(r'"http://.*?"',scr) for sc in r: if sc: rs=re.search(r'"http://.*?"',sc) if rs: l=rs.group().replace('"','') linklist.append(l) return linklist
from BloomFilter import BloomFilter import random import string def removeMatchingWord( aWordToFind, aWords ): if aWordToFind in aWords: aWords.remove( aWordToFind ) if __name__ == '__main__': theBloomFilter = BloomFilter() [theBloomFilter.addWord( line.strip() ) for line in open('/usr/share/dict/words')] theWrongWords = [] theCountOfRandomWords = 0 while len(theWrongWords) < 300: theCountOfRandomWords = theCountOfRandomWords + 1 theRandomString = ''.join(random.choice(string.ascii_lowercase) for x in range(5)) if ( theBloomFilter.checkWord( theRandomString ) ): theWrongWords.append( theRandomString ) print '# random words checked: ' + str( theCountOfRandomWords ) [ removeMatchingWord( line.strip(), theWrongWords ) for line in open('/usr/share/dict/words')] print '# false positives: ' + str( len( theWrongWords ) ) print 'false positive rate: ' + str( 100.0 * float( len( theWrongWords ) ) / float( theCountOfRandomWords ) )