def test_ReadFile_is_public(self):
    self.assertEquals(
        isinstance(pybloomfilter.BloomFilter.ReadFile, object), True)
    bf = pybloomfilter.BloomFilter(100, 0.01)
    bf2 = pybloomfilter.BloomFilter(100, 0.01)
    self.assertEquals(bf.ReadFile, bf2.ReadFile)
    self.assertEquals(pybloomfilter.BloomFilter.ReadFile, bf.ReadFile)
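# A minimal sketch of the ReadFile sentinel exercised by the test above and the
# constructor below: passing BloomFilter.ReadFile in place of the capacity tells the
# constructor to load an existing filter file instead of creating a new one
# ('existing.bloom' is a hypothetical path used only for illustration).
import pybloomfilter

existing = pybloomfilter.BloomFilter(
    pybloomfilter.BloomFilter.ReadFile, 0.01, 'existing.bloom')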
def __init__(self, data_dir, rule):
    self.data_dir = data_dir
    self.rule = rule
    os.system('mkdir -p ' + data_dir)
    url_file = os.path.join(data_dir, 'urls.urls')
    savefile = open(url_file, 'a' if os.path.exists(url_file) else 'w')
    parsed_ff = os.path.join(data_dir, 'parsed.bloomfilter')
    parsed_filter = pybf.BloomFilter(
        pybf.BloomFilter.ReadFile if os.path.exists(parsed_ff) else 10000000,
        0.0001, parsed_ff)
    saved_ff = os.path.join(data_dir, 'saved.bloomfilter')
    saved_filter = pybf.BloomFilter(
        pybf.BloomFilter.ReadFile if os.path.exists(saved_ff) else 10000000,
        0.0001, saved_ff)
    self.dyn_filter = set()
    self.queue = Queue.LifoQueue()
    if os.path.exists(os.path.join(self.data_dir, 'queue.json')):
        for item in reversed([
                json.loads(line)
                for line in open(os.path.join(self.data_dir, 'queue.json'))
        ]):
            self.queue.put(item)
        os.remove(os.path.join(self.data_dir, 'queue.json'))
    self.saved_filter = saved_filter
    self.parsed_filter = parsed_filter
    self.savefile = savefile
def __init__(self, active_rules=None):
    self.nineteen_mers = pybloomfilter.BloomFilter(50000000, 0.0001)
    self.tmp_nineteen_mers = pybloomfilter.BloomFilter(50000, 0.0001)
    if active_rules is None:
        self.active_rules = [
            # FastDNARules.a_permutation,
            # FastDNARules.t_permutation,
            # FastDNARules.c_permutation,
            # FastDNARules.g_permutation,
            # FastDNARules.dinucleotid_runs,
            # FastDNARules.homopolymers,
            partial(FastDNARules.homopolymers, probs=three_strict_homopolymers()),
            # FastDNARules.overall_gc_content,
            # To change the GC error function:
            partial(FastDNARules.overall_gc_content, calc_func=fs_gc_error_calculation),
            # FastDNARules.windowed_gc_content,
            partial(FastDNARules.windowed_gc_content, calc_func=ts_gc_error_calculation),
            # FastDNARules.long_strands,
            # FastDNARules.illegal_symbols,
            # FastDNARules.trinucleotid_runs,
            # FastDNARules.random_permutations,
            FastDNARules.motif_search
            # FastDNARules.motif_regex_search,
            # FastDNARules.repeatRegion,
            # FastDNARules.smallRepeatRegion,
            # self.check_and_add_mers
        ]
    else:
        self.active_rules = active_rules
def test_errors(error_rate, filter_size, correct_overlap, num_test_words):
    bloom_file = tempfile.NamedTemporaryFile()
    try:
        bf = pybloomfilter.BloomFilter(filter_size, error_rate, bloom_file.name)
    except TypeError:
        bf = pybloomfilter.BloomFilter(filter_size, error_rate)
    with open(WORDS_FILE) as source_file:
        with open(TEST_WORDS) as test_file:
            run_test(bf, source_file, test_file, correct_overlap,
                     num_test_words, error_rate)
def test_bit_count(self):
    bf0 = pybloomfilter.BloomFilter(100, 0.1)
    bf1 = pybloomfilter.BloomFilter(100, 0.1)
    bf1.add('a')
    bf100 = pybloomfilter.BloomFilter(100, 0.1)
    for i in range(100):
        bf100.add(str(i))
    assert bf0.bit_count == 0
    assert bf1.bit_count == bf1.num_hashes
    assert bf100.bit_count == bin(bf100.bit_array).count('1')
def setUp(self):
    # Convenience file-backed bloomfilter
    self.tempfile = tempfile.NamedTemporaryFile(suffix='.bloom', delete=False)
    self.bf = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                        self.FILTER_ERROR_RATE,
                                        self.tempfile.name)
    # Convenience memory-backed bloomfilter
    self.bf_mem = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                            self.FILTER_ERROR_RATE)
def test_approximate_size_after_union_called(self):
    bf1 = pybloomfilter.BloomFilter(100, 0.1, self.tempfile.name,
                                    hash_seeds=[1, 2, 3])
    for i in range(0, 20):
        bf1.add(str(i))
    bf2 = pybloomfilter.BloomFilter(100, 0.1, self.tempfile.name + '.50',
                                    hash_seeds=[1, 2, 3])
    for i in range(10, 30):  # intersection size: 10
        bf2.add(str(i))
    union_bf = bf1.copy(self.tempfile.name + '.union')
    union_bf.union(bf2)
    assert len(union_bf) == 29  # approximate size
    # len() is only an estimate, so inclusion-exclusion over the approximate
    # sizes gives 11 rather than the exact 10.
    intersection = len(bf1) + len(bf2) - len(union_bf)
    assert intersection == 11  # approximate size
def test_write_operation_on_readonly_file_raises_exception(self):
    with tempfile.NamedTemporaryFile() as tmp1:
        with tempfile.NamedTemporaryFile() as tmp2:
            pybloomfilter.BloomFilter(1000, 0.01, tmp1.name)
            bf1 = pybloomfilter.BloomFilter.open(tmp1.name)
            bf2 = bf1.copy_template(tmp2.name)
            bf2.add('bf2')
            with self.assertRaises(ValueError):
                bf1.clear_all()
            with self.assertRaises(ValueError):
                bf1.add('test')
            with self.assertRaises(ValueError):
                bf1 |= bf2
            with self.assertRaises(ValueError):
                bf1 &= bf2
            with self.assertRaises(ValueError):
                bf1.union(bf2)
            with self.assertRaises(ValueError):
                bf1.intersection(bf2)
def test_write_operation_on_writable_files_does_not_raise_exception(self):
    with tempfile.NamedTemporaryFile() as tmp1:
        with tempfile.NamedTemporaryFile() as tmp2:
            bf1 = pybloomfilter.BloomFilter(1000, 0.01, tmp1.name)
            bf2 = bf1.copy_template(tmp2.name)
            bf2.add('bf2')
            bf1.clear_all()
            bf1.add('test')
            bf1 |= bf2
            bf1 &= bf2
            bf1.union(bf2)
            bf1.intersection(bf2)
            bf1.close()
            bf3 = pybloomfilter.BloomFilter.open(tmp1.name, mode='rw')
            bf3.clear_all()
            bf3.add('test')
            bf3 |= bf2
            bf3 &= bf2
            bf3.union(bf2)
            bf3.intersection(bf2)
def test_strings(self):
    random_strings = list(
        set(''.join(
            random.choice(string.lowercase + string.uppercase)
            for _ in range(6)) for _ in range(10000)))
    random.shuffle(random_strings)
    for accuracy in (0.1, 0.01, 0.001):
        bf = pybloomfilter.BloomFilter(8000, accuracy)
        bf.update(random_strings[:8000])
        false_pos, false_neg = 0, 0
        for test in random_strings[8000:10000]:
            if test in bf:
                false_pos += 1
        for test in random_strings[6000:8000]:
            if test not in bf:
                false_neg += 1
        false_pos_rate = false_pos / 2000.
        false_neg_rate = false_neg / 2000.
        self.assertTrue(
            false_pos_rate <= accuracy * 2,
            "accuracy fail: %0.4f > %0.4f" % (false_pos_rate, accuracy))
        self.assertEqual(
            false_neg_rate, 0.0,
            "false negative rate is nonzero: %0.4f" % (false_neg_rate,))
        del bf
        print false_pos_rate, accuracy
def test_bloom_filter_removes_training_set_items(trained_model, sim_eval_dataset):
    user_id = 0
    interactions = sim_eval_dataset[user_id]["interactions"].coalesce()
    item_ids = list(interactions.indices()[1].numpy())
    # TODO: See if these assertions can be moved to test the simulated dataset
    assert len(item_ids) > 0
    assert item_ids[0] < trained_model.hparams.num_items
    assert type(item_ids) == list
    bloom_filter = pbf.BloomFilter(100, 0.1)
    bloom_filter.update(item_ids)
    all_items = AllItems(trained_model.hparams.num_items)
    user_recs = all_items.run(UserRecs(user_id=user_id))
    length_before = len(user_recs.candidates)
    stage = BloomFilter({0: bloom_filter})
    user_recs = stage.run(user_recs)
    length_after = len(user_recs.candidates)
    # Filtering out the training-set items should shrink the candidate list
    assert length_after < length_before
def test_bit_array(self):
    bf = pybloomfilter.BloomFilter(1000, 0.01, self.tempfile.name)
    bf.add("apple")
    # Count the number of 1s
    total_ones = 0
    bit_array_str = bin(bf.bit_array)
    for c in bit_array_str:
        if c == "1":
            total_ones += 1
    # For the first item addition, BF should contain
    # the same amount of 1s as the number of hashes
    # performed
    assert total_ones == bf.num_hashes
    for i in range(1000):
        bf.add(randint(0, 1000))
    bf.add("apple")
    ba_1 = bf.bit_array
    bf.add("apple")
    ba_2 = bf.bit_array
    # Should be the same
    assert ba_1 ^ ba_2 == 0
    bf.add("pear")
    bf.add("mango")
    ba_3 = bf.bit_array
    # Should not be the same
    assert ba_1 ^ ba_3 != 0
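# A minimal companion sketch (the fill_ratio helper is hypothetical, not part of the
# test suite above): bit_array exposes the filter's bits as a Python int and num_bits
# is its total width (both properties appear elsewhere in these snippets), so the
# fill ratio of any filter `bf` can be estimated by counting set bits.
def fill_ratio(bf):
    return bin(bf.bit_array).count('1') / float(bf.num_bits)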
def create_cbloomfilter(*args):
    args = list(args)
    f = tempfile.NamedTemporaryFile()
    tempfiles.append(f)
    os.unlink(f.name)
    args.append(f.name)
    return pybloomfilter.BloomFilter(*tuple(args))
def main(input_path, output_path):
    all = pybloomfilter.BloomFilter(NUM_LABELS, 0.0001)
    dups = pybloomfilter.BloomFilter(NUM_LABELS, 0.0001)
    logging.info('creating bloom filter of %dMBs',
                 all.num_bits / 8 // (1024 * 1024))
    num_distinct = 0
    num_dups = 0
    input = smart_open.smart_open(input_path, 'rb')
    tmp_output = TemporaryFile(mode='w+', encoding='utf-8')
    for i, line in enumerate(input):
        if i % 10000 == 0:
            logging.info('processing line %d. %d unique, %d dups',
                         i, num_distinct - num_dups, num_dups)
        try:
            line = line.decode('utf-8').strip()
            if len(line) <= 2:
                continue  # opening or closing
            if line[-1] == ',':
                line = line[:-1]
            data = json.loads(line)
            result = []
            for labels in data['labels'].values():
                lang = labels['language']
                word = lang + '.wikipedia:' + normalize(labels['value'])
                if word in all:
                    dups.add(word)
                    num_dups += 1
                else:
                    result.append(word)
                    all.add(word)
                    num_distinct += 1
            tmp_output.write('\t'.join(result))
            tmp_output.write('\n')
        except:
            sys.stderr.write('error while processing line: %s' % (line))
            traceback.print_exc()
    logging.info('found %d unique, %d dups', num_distinct - num_dups, num_dups)
    tmp_output.seek(0)
    with open(output_path, 'w', encoding='utf-8') as output:
        for line in tmp_output:
            words = [w for w in line.strip().split('\t') if w not in dups]
            if len(words) > 1:
                output.write('\t'.join(words))
                output.write('\n')
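# A stripped-down sketch of the two-filter dedup pattern used in main() above
# (the dedup() helper and its default parameters are hypothetical, for illustration
# only): the first pass marks every label in `all_seen` and records repeats in
# `dups`; the second pass keeps only labels that never collided.
import pybloomfilter

def dedup(labels, capacity=1000000, error_rate=0.0001):
    all_seen = pybloomfilter.BloomFilter(capacity, error_rate)
    dups = pybloomfilter.BloomFilter(capacity, error_rate)
    for label in labels:
        if label in all_seen:
            dups.add(label)
        else:
            all_seen.add(label)
    return [label for label in labels if label not in dups]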
def simulate():
    global addr_spec, addr_func
    Triton.setArchitecture(ARCH.X86_64)
    Triton.setMode(MODE.ALIGNED_MEMORY, True)
    Triton.setMode(MODE.ONLY_ON_SYMBOLIZED, True)
    Triton.enableTaintEngine(False)
    ENTRY = loadBinary(sys.argv[1])
    spec = imp.load_source('name', './spec.py')
    func_spec = spec.func_spec
    addr_spec = {}
    for func, sp in func_spec.items():
        addr_spec[gbinary.get_function_address(func)] = sp
    addr_func = {}
    for func in func_spec:
        addr_func[gbinary.get_function_address(func)] = func
    lastInput = pybloomfilter.BloomFilter(100000, 0.001, b'filter.bloom')
    worklist = [{}]
    while worklist:
        Triton.reset()
        Triton.setArchitecture(ARCH.X86_64)
        Triton.setMode(MODE.ALIGNED_MEMORY, True)
        Triton.setMode(MODE.ONLY_ON_SYMBOLIZED, True)
        Triton.enableTaintEngine(False)
        ENTRY = loadBinary(sys.argv[1])
        flag = 0
        for addr in addr_spec:
            if addr not in func_error_seed:
                flag = 1
        if flag == 0:
            break
        seed = worklist[0]
        print("seed: " + str(seed))
        symbolizeInputs(seed)
        initContext()
        lastInput.add(hdict(seed))
        del worklist[0]
        run(ENTRY, seed)
        if hdict(seed) not in lastInput and seed not in worklist:
            worklist.append(seed)
        newInputs = getNewInput()
        for inputs in newInputs:
            if hdict(inputs) not in lastInput and inputs not in worklist:
                worklist.append(dict(inputs))
        print("seed " + str(len(worklist)))
def test_create_with_hash_seeds(self):
    cust_seeds = [getrandbits(32) for i in range(30)]
    bf = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                   self.FILTER_ERROR_RATE,
                                   self.tempfile.name,
                                   hash_seeds=cust_seeds)
    bf_seeds = bf.hash_seeds.tolist()
    self.assertEqual(cust_seeds, bf_seeds)
def second_implementation_results(seq, capacity, k):
    bf = pybloomfilter.BloomFilter(capacity, 0.01, None)
    start = time.time()
    counts = second_count_table(seq, k, bf)
    end = time.time()
    this_time = (end - start) * 1000
    size = sys.getsizeof(counts)
    return (this_time, size, counts)
def __init__(self, startURL):
    self.startURL = startURL
    self.targetNetloc = urlparse.urlparse(startURL).netloc
    self.downloadList = []
    self.lurl = []
    self.bloomfilter = pybloomfilter.BloomFilter(100000, 0.1,
                                                 '/tmp/words.bloom')
    self.dep = 0
def __init__(self):
    # Use an in-memory bloomfilter for now, maybe move to pyreBloom if we
    # need something threadsafe?
    bloomfilter_filepath = tempfile.NamedTemporaryFile(delete=False).name
    logger.debug('Saving bloomfilter to %s', bloomfilter_filepath)
    # pybloomfilter.BloomFilter(capacity, error_rate, filename)
    self.bloomfilter = pybloomfilter.BloomFilter(10000000, 0.001,
                                                 bloomfilter_filepath)
    self.seen = dict()
def reloadRobotUser(mode):
    sql = '''select user_id from user_robot_info where user_id%100={0} '''.format(mode)
    query_results = db_tools.db_midware('waterworld').query(sql)
    sbf = pybloomfilter.BloomFilter(capacity=len(query_results),
                                    error_rate=0.001)
    [sbf.add(int(item['user_id'])) for item in query_results]
    myBF.robot_last_ts[mode] = time.time()
    myBF.robot_bloom_filter[mode] = sbf
def reloadTestUser():
    sql = '''select user_id from test_user where status=0
             UNION DISTINCT
             select user_id from user_evil
             where kind='test' and update_ts>now()- interval 2 day
          '''
    query_results = db_tools.db_midware('waterworld').query(sql)
    sbf = pybloomfilter.BloomFilter(len(query_results), 0.001)
    [sbf.add(int(item['user_id'])) for item in query_results]
    myBF.test_user_last_ts = time.time()
    myBF.test_user_bloom_filter = sbf
def creatBf(filename):
    f = file(filename)
    data = f.readlines()
    f.close()
    l = len(data)
    dic = pybloomfilter.BloomFilter(l, 0.0001,
                                    filename.replace('.txt', '.bloom'))
    data = [ele.strip() for ele in data]
    dic.update(data)
    return dic
def test_read_only_set_operations_is_value_error(self):
    bf_mem = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                       self.FILTER_ERROR_RATE)
    bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r")
    self.assertEqual(bfro.read_only, True)
    self.assertRaises(ValueError, bfro.union, bf_mem)
    self.assertRaises(ValueError, bfro.intersection, bf_mem)
    self.assertRaises(ValueError, bfro.__ior__, bf_mem)
    self.assertRaises(ValueError, bfro.__iand__, bf_mem)
def check_differences(capacity, seq, k):
    bf = pybloomfilter.BloomFilter(capacity, 0.01, None)
    bf2 = pybloomfilter.BloomFilter(capacity, 0.01, None)
    counts_one = create_count_table(seq, k, bf)
    counts_two = second_count_table(seq, k, bf2)
    kmerCount = 0
    diffCount = 0
    kmer_set = set()
    for read in seq:
        kmers = make_kmers(read, k)
        for kmer in kmers:
            if kmer not in kmer_set:
                kmer_set.add(kmer)
                kmerCount = kmerCount + 1
                first = get_count(bf, counts_one, kmer)
                second = get_count(bf2, counts_two, kmer)
                if first != second:
                    diffCount = diffCount + 1
    return (kmerCount, diffCount)
def test_remove(self):
    bf = pybloomfilter.BloomFilter(100, 0.01)
    for i in range(0, 20):
        bf.add(str(i))
    assert bf.remove("1")
    assert "1" not in bf
    assert "2" in bf
    assert len(bf) == 19  # approximate size
    assert not bf.remove("1")
    assert not bf.remove("30")
def __init__(self):
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
    }
    if os.path.exists('douban_spider.bloom'):
        self.blfilter = pybloomfilter.BloomFilter.open('douban_spider.bloom')
    else:
        self.blfilter = pybloomfilter.BloomFilter(
            user_settings.FILTER_SIZE, user_settings.FILTER_ERROR_RATE,
            'douban_spider.bloom')
def test_union_without_copy_template(self):
    with tempfile.NamedTemporaryFile() as tmp1:
        with tempfile.NamedTemporaryFile() as tmp2:
            bf1 = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                            self.FILTER_ERROR_RATE,
                                            tmp1.name, seed=100)
            bf2 = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                            self.FILTER_ERROR_RATE,
                                            tmp2.name, seed=100)
            for i in range(100):
                bf1.add(i)
            for i in range(100, 200):
                bf2.add(i)
            bf2.union(bf1)  # Should not fail
            self.assertTrue(all(i in bf2 for i in range(200)))
def test_bit_array_same_hashes(self):
    capacity = 100 * 100
    items = []
    for i in range(capacity):
        items.append(randint(0, 1000))
    # File-backed
    bf1 = pybloomfilter.BloomFilter(capacity, 0.01, self.tempfile.name)
    bf1.update(items)
    bf1_hs = bf1.hash_seeds
    bf1_ba = bf1.bit_array
    # In-memory
    bf2 = pybloomfilter.BloomFilter(capacity, 0.01, hash_seeds=bf1_hs)
    bf2.update(items)
    bf2_ba = bf2.bit_array
    # Should be identical as data was hashed into the same locations
    assert bf1_ba ^ bf2_ba == 0
def test_create_with_hash_seeds_and_compare(self):
    test_data = "test"
    bf1 = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                    self.FILTER_ERROR_RATE,
                                    self.tempfile.name)
    bf1.add(test_data)
    bf1_seeds = bf1.hash_seeds.tolist()
    bf1_ba = bf1.bit_array
    bf2 = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                    self.FILTER_ERROR_RATE,
                                    self.tempfile.name,
                                    hash_seeds=bf1_seeds)
    bf2.add(test_data)
    bf2_seeds = bf2.hash_seeds.tolist()
    bf2_ba = bf2.bit_array
    self.assertEqual(bf1_seeds, bf2_seeds)
    # Expecting same hashing sequence
    self.assertEqual(bf1_ba, bf2_ba)
def __init__(self, num_samples, expected_num_points, batch_size,
             error_rate=0.000001):
    self._num_samples = num_samples
    self._batch_size = batch_size
    self._expected_num_points = expected_num_points
    self._denominator = [0.] * num_samples
    self._numerator = [0.] * num_samples
    self._bf = pybloomfilter.BloomFilter(expected_num_points, error_rate)
    self._actual_num_points = 0.