def test_ReadFile_is_public(self):
    """ReadFile must be exposed on the class and be the same object when
    accessed through the class or through any instance (it is a class-level
    sentinel used to reopen persisted filters)."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(
        isinstance(pybloomfilter.BloomFilter.ReadFile, object), True)
    bf = pybloomfilter.BloomFilter(100, 0.01)
    bf2 = pybloomfilter.BloomFilter(100, 0.01)
    self.assertEqual(bf.ReadFile, bf2.ReadFile)
    self.assertEqual(pybloomfilter.BloomFilter.ReadFile, bf.ReadFile)
Esempio n. 2
0
    def __init__(self, data_dir, rule):
        """Crawler state rooted at `data_dir`.

        Restores the parsed/saved bloom filters and any queue snapshot
        persisted by a previous run.

        :param data_dir: directory holding urls.urls, *.bloomfilter and an
            optional queue.json snapshot.
        :param rule: crawl rule object, stored for later use.
        """
        self.data_dir = data_dir
        self.rule = rule
        # os.makedirs is portable and not shell-injection prone, unlike the
        # original os.system('mkdir -p ' + data_dir).
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)

        # Mode 'a' also creates a missing file, so no exists() check needed.
        url_file = os.path.join(data_dir, 'urls.urls')
        savefile = open(url_file, 'a')

        # Reopen an existing filter file (ReadFile sentinel) or create a
        # fresh one sized for 10M entries.
        parsed_ff = os.path.join(data_dir, 'parsed.bloomfilter')
        parsed_filter = pybf.BloomFilter(
            pybf.BloomFilter.ReadFile
            if os.path.exists(parsed_ff) else 10000000, 0.0001, parsed_ff)

        saved_ff = os.path.join(data_dir, 'saved.bloomfilter')
        saved_filter = pybf.BloomFilter(
            pybf.BloomFilter.ReadFile
            if os.path.exists(saved_ff) else 10000000, 0.0001, saved_ff)

        self.dyn_filter = set()
        self.queue = Queue.LifoQueue()
        # Restore (LIFO order, hence reversed) and discard a persisted queue;
        # the original left the file handle open.
        queue_path = os.path.join(self.data_dir, 'queue.json')
        if os.path.exists(queue_path):
            with open(queue_path) as fh:
                pending = [json.loads(ln) for ln in fh]
            for item in reversed(pending):
                self.queue.put(item)
            os.remove(queue_path)
        self.saved_filter = saved_filter
        self.parsed_filter = parsed_filter
        self.savefile = savefile
Esempio n. 3
0
 def __init__(self, active_rules=None):
     """Initialize the rule set used to score DNA sequences.

     :param active_rules: optional list of rule callables; when None, a
         default selection of FastDNARules checks (some partially applied
         with custom error functions) is installed.
     """
     # Bloom filters for 19-mer tracking; presumably used by the
     # check_and_add_mers rule that is currently commented out below —
     # TODO confirm they are still needed.
     self.nineteen_mers = pybloomfilter.BloomFilter(50000000, 0.0001)
     self.tmp_nineteen_mers = pybloomfilter.BloomFilter(50000, 0.0001)
     if active_rules is None:
         # Most rules are deliberately disabled (commented out); only the
         # partially-applied homopolymer / GC-content checks and
         # motif_search are active.
         self.active_rules = [
             # FastDNARules.a_permutation,
             # FastDNARules.t_permutation,
             # FastDNARules.c_permutation,
             # FastDNARules.g_permutation,
             # FastDNARules.dinucleotid_runs,
             # FastDNARules.homopolymers,
             partial(FastDNARules.homopolymers, probs=three_strict_homopolymers()),
             # FastDNARules.overall_gc_content,
             # To change the GC error function:
             partial(FastDNARules.overall_gc_content, calc_func=fs_gc_error_calculation),
             # FastDNARules.windowed_gc_content,
             partial(FastDNARules.windowed_gc_content, calc_func=ts_gc_error_calculation),
             #  FastDNARules.long_strands,
             #  FastDNARules.illegal_symbols,
             # FastDNARules.trinucleotid_runs,
             # FastDNARules.random_permutations,
             FastDNARules.motif_search
             # FastDNARules.motif_regex_search,
             # FastDNARules.repeatRegion,
             # FastDNARules.smallRepeatRegion,
             # self.check_and_add_mers
         ]
     else:
         self.active_rules = active_rules
def test_errors(error_rate, filter_size, correct_overlap, num_test_words):
    """Build a bloom filter with the given parameters and drive run_test()
    over the word fixtures."""
    bloom_file = tempfile.NamedTemporaryFile()
    # Some pybloomfilter versions reject a filename argument; fall back to
    # an in-memory filter in that case.
    try:
        bf = pybloomfilter.BloomFilter(filter_size, error_rate, bloom_file.name)
    except TypeError:
        bf = pybloomfilter.BloomFilter(filter_size, error_rate)

    with open(WORDS_FILE) as source_file, open(TEST_WORDS) as test_file:
        run_test(bf, source_file, test_file, correct_overlap,
                 num_test_words, error_rate)
Esempio n. 5
0
    def test_bit_count(self):
        """bit_count is 0 for an empty filter, num_hashes after one add,
        and always equals the popcount of bit_array."""
        empty_bf = pybloomfilter.BloomFilter(100, 0.1)
        single_bf = pybloomfilter.BloomFilter(100, 0.1)
        single_bf.add('a')
        full_bf = pybloomfilter.BloomFilter(100, 0.1)
        for n in range(100):
            full_bf.add(str(n))

        assert empty_bf.bit_count == 0
        assert single_bf.bit_count == single_bf.num_hashes
        assert full_bf.bit_count == bin(full_bf.bit_array).count('1')
    def setUp(self):
        """Create one file-backed and one in-memory filter for the tests."""
        self.tempfile = tempfile.NamedTemporaryFile(
            suffix='.bloom', delete=False)
        # Convenience file-backed bloomfilter
        self.bf = pybloomfilter.BloomFilter(
            self.FILTER_SIZE, self.FILTER_ERROR_RATE, self.tempfile.name)
        # Convenience memory-backed bloomfilter
        self.bf_mem = pybloomfilter.BloomFilter(
            self.FILTER_SIZE, self.FILTER_ERROR_RATE)
Esempio n. 7
0
    def test_approximate_size_after_union_called(self):
        """After union(), len() still yields a usable cardinality estimate."""
        first = pybloomfilter.BloomFilter(100, 0.1, self.tempfile.name,
                                          hash_seeds=[1, 2, 3])
        for i in range(20):
            first.add(str(i))
        second = pybloomfilter.BloomFilter(100, 0.1, self.tempfile.name + '.50',
                                           hash_seeds=[1, 2, 3])
        for i in range(10, 30):  # intersection size: 10
            second.add(str(i))
        merged = first.copy(self.tempfile.name + '.union')
        merged.union(second)

        assert len(merged) == 29  # approximate size
        overlap = len(first) + len(second) - len(merged)
        assert overlap == 11  # approximate size
    def test_write_operation_on_readonly_file_raises_exception(self):
        """Every mutating operation on a filter opened read-only raises
        ValueError."""
        with tempfile.NamedTemporaryFile() as tmp1, \
                tempfile.NamedTemporaryFile() as tmp2:
            pybloomfilter.BloomFilter(1000, 0.01, tmp1.name)
            bf1 = pybloomfilter.BloomFilter.open(tmp1.name)

            bf2 = bf1.copy_template(tmp2.name)
            bf2.add('bf2')

            # Each of these mutators must reject the read-only filter.
            mutators = (
                lambda: bf1.clear_all(),
                lambda: bf1.add('test'),
                lambda: bf1.__ior__(bf2),
                lambda: bf1.__iand__(bf2),
                lambda: bf1.union(bf2),
                lambda: bf1.intersection(bf2),
            )
            for mutate in mutators:
                with self.assertRaises(ValueError):
                    mutate()
    def test_write_operation_on_writable_files_does_not_raise_exception(self):
        """Mutating a freshly created filter, and one reopened in 'rw' mode,
        must both succeed without raising."""
        with tempfile.NamedTemporaryFile() as tmp1, \
                tempfile.NamedTemporaryFile() as tmp2:
            writable = pybloomfilter.BloomFilter(1000, 0.01, tmp1.name)

            other = writable.copy_template(tmp2.name)
            other.add('bf2')

            # None of these may raise on a writable filter.
            writable.clear_all()
            writable.add('test')
            writable |= other
            writable &= other
            writable.union(other)
            writable.intersection(other)

            writable.close()

            # Reopen read-write and repeat the same mutations.
            reopened = pybloomfilter.BloomFilter.open(tmp1.name, mode='rw')
            reopened.clear_all()
            reopened.add('test')
            reopened |= other
            reopened &= other
            reopened.union(other)
            reopened.intersection(other)
Esempio n. 10
0
    def test_strings(self):
        """The observed false-positive rate stays within 2x the requested
        accuracy and no false negatives occur, for several accuracies."""
        # string.lowercase / string.uppercase are Python-2 only; the
        # ascii_* names exist on both Python 2 and 3.
        alphabet = string.ascii_lowercase + string.ascii_uppercase
        random_strings = list(
            set(''.join(random.choice(alphabet) for _ in range(6))
                for _ in range(10000)))
        random.shuffle(random_strings)

        for accuracy in (0.1, 0.01, 0.001):
            bf = pybloomfilter.BloomFilter(8000, accuracy)
            bf.update(random_strings[:8000])
            false_pos, false_neg = 0, 0
            # Strings never inserted: any hit is a false positive.
            for test in random_strings[8000:10000]:
                if test in bf:
                    false_pos += 1
            # Strings that were inserted: any miss is a false negative.
            for test in random_strings[6000:8000]:
                if test not in bf:
                    false_neg += 1
            false_pos_rate = false_pos / 2000.
            false_neg_rate = false_neg / 2000.
            self.assertTrue(
                false_pos_rate <= accuracy * 2,
                "accuracy fail: %0.4f > %0.4f" % (false_pos_rate, accuracy))
            self.assertEqual(
                false_neg_rate, 0.0,
                "false negative rate is nonzero: %0.4f" % (false_neg_rate, ))
            del bf
            # The bare `print a, b` statement is Python-2 only; this form
            # produces the same output on both 2 and 3.
            print("%s %s" % (false_pos_rate, accuracy))
Esempio n. 11
0
def test_bloom_filter_removes_training_set_items(trained_model,
                                                 sim_eval_dataset):
    """Recommendations for a user must change once their training-set
    interactions are handed to a per-user BloomFilter stage."""
    user_id = 0

    interactions = sim_eval_dataset[user_id]["interactions"].coalesce()
    item_ids = list(interactions.indices()[1].numpy())

    # TODO: See if these assertions can be moved to test the simulated dataset
    assert len(item_ids) > 0
    assert item_ids[0] < trained_model.hparams.num_items
    # isinstance is the idiomatic type check; type(x) == list rejects
    # subclasses and is flagged by linters.
    assert isinstance(item_ids, list)

    bloom_filter = pbf.BloomFilter(100, 0.1)
    bloom_filter.update(item_ids)

    all_items = AllItems(trained_model.hparams.num_items)
    user_recs = all_items.run(UserRecs(user_id=user_id))

    length_before = len(user_recs.candidates)

    stage = BloomFilter({0: bloom_filter})
    user_recs = stage.run(user_recs)

    length_after = len(user_recs.candidates)

    # NOTE(review): a filtering stage would normally *shrink* the candidate
    # list (length_after < length_before); confirm the intended semantics of
    # the BloomFilter stage before relying on this direction.
    assert length_after > length_before
Esempio n. 12
0
    def test_bit_array(self):
        """bit_array reflects exactly the bits set by insertions."""
        bf = pybloomfilter.BloomFilter(1000, 0.01, self.tempfile.name)
        bf.add("apple")

        # For the first item addition, the filter should contain exactly as
        # many set bits as hashes performed (bin() popcount replaces the
        # original manual character-counting loop).
        total_ones = bin(bf.bit_array).count("1")
        assert total_ones == bf.num_hashes

        for i in range(1000):
            bf.add(randint(0, 1000))

        bf.add("apple")
        snapshot_a = bf.bit_array
        bf.add("apple")
        snapshot_b = bf.bit_array
        # Re-adding an existing element cannot flip any bit.
        assert snapshot_a ^ snapshot_b == 0

        bf.add("pear")
        bf.add("mango")
        snapshot_c = bf.bit_array
        # Fresh elements must set at least one new bit here.
        assert snapshot_a ^ snapshot_c != 0
Esempio n. 13
0
def create_cbloomfilter(*args):
    """Build a file-backed BloomFilter on a fresh temporary path.

    The NamedTemporaryFile object is kept alive in the module-level
    `tempfiles` list; its path is unlinked first so the filter creates the
    backing file itself.
    """
    tmp = tempfile.NamedTemporaryFile()
    tempfiles.append(tmp)
    os.unlink(tmp.name)
    return pybloomfilter.BloomFilter(*(list(args) + [tmp.name]))
def main(input_path, output_path):
    """Keep only labels that occur exactly once across the whole input.

    Pass 1 streams `input_path` (JSON-ish lines of wikidata labels), writing
    candidate words to a temp file while recording first-seen words in one
    bloom filter and repeated words in another. Pass 2 re-reads the temp
    file and drops every word that later turned out to be a duplicate.

    :param input_path: path/URI readable by smart_open, one JSON object per line.
    :param output_path: UTF-8 text output, tab-separated words per line.
    """
    # Renamed from `all` / `dups` / `input` to avoid shadowing builtins.
    seen = pybloomfilter.BloomFilter(NUM_LABELS, 0.0001)
    dup_filter = pybloomfilter.BloomFilter(NUM_LABELS, 0.0001)
    logging.info('creating bloom filter of %dMBs', seen.num_bits / 8 // (1024 * 1024))
    num_distinct = 0
    num_dups = 0

    tmp_output = TemporaryFile(mode='w+', encoding='utf-8')
    # `with` closes the (possibly remote) stream deterministically; the
    # original never closed it.
    with smart_open.smart_open(input_path, 'rb') as source:
        for i, line in enumerate(source):
            if i % 10000 == 0:
                logging.info('processing line %d. %d unique, %d dups',
                             i, num_distinct - num_dups, num_dups)
            try:
                line = line.decode('utf-8').strip()
                if len(line) <= 2:
                    continue  # opening or closing bracket line
                if line[-1] == ',':
                    line = line[:-1]
                data = json.loads(line)
                result = []
                for labels in data['labels'].values():
                    lang = labels['language']
                    word = lang + '.wikipedia:' + normalize(labels['value'])
                    if word in seen:
                        dup_filter.add(word)
                        num_dups += 1
                    else:
                        result.append(word)
                        seen.add(word)
                        num_distinct += 1
                tmp_output.write('\t'.join(result))
                tmp_output.write('\n')
            except Exception:
                # A bare `except:` would also swallow KeyboardInterrupt.
                sys.stderr.write('error while processing line: %s' % (line,))
                traceback.print_exc()

    logging.info('found %d unique, %d dups', num_distinct - num_dups, num_dups)
    tmp_output.seek(0)

    with open(output_path, 'w', encoding='utf-8') as output:
        for line in tmp_output:
            words = [
                w
                for w in line.strip().split('\t')
                if w not in dup_filter
            ]
            if len(words) > 1:
                output.write('\t'.join(words))
                output.write('\n')
Esempio n. 15
0
def simulate():
    """Symbolically execute the target binary (argv[1]) with Triton.

    Maintains a worklist of input seeds; a file-backed bloom filter
    (`lastInput`) remembers hashes of seeds already executed so they are
    not re-queued. Stops once every address in the spec has an entry in
    func_error_seed.
    """
    global addr_spec, addr_func
    Triton.setArchitecture(ARCH.X86_64)
    Triton.setMode(MODE.ALIGNED_MEMORY, True)
    Triton.setMode(MODE.ONLY_ON_SYMBOLIZED, True)
    Triton.enableTaintEngine(False)

    ENTRY = loadBinary(sys.argv[1])

    # Load the user-provided function specs and index them by address.
    spec = imp.load_source('name', './spec.py')
    func_spec = spec.func_spec

    addr_spec = {}
    for func, sp in func_spec.items():
        addr_spec[gbinary.get_function_address(func)] = sp

    addr_func = {}
    for func in func_spec:
        addr_func[gbinary.get_function_address(func)] = func

    # Persistent record of already-executed seed hashes.
    lastInput = pybloomfilter.BloomFilter(100000, 0.001, b'filter.bloom')

    worklist = [{}]

    while worklist:

        # Re-initialize the engine for each seed run.
        Triton.reset()
        Triton.setArchitecture(ARCH.X86_64)
        Triton.setMode(MODE.ALIGNED_MEMORY, True)
        Triton.setMode(MODE.ONLY_ON_SYMBOLIZED, True)
        Triton.enableTaintEngine(False)
        ENTRY = loadBinary(sys.argv[1])

        # Terminate once every spec'd address has produced an error seed.
        flag = 0
        for addr in addr_spec:
            if addr not in func_error_seed:
                flag = 1
        if flag == 0:
            break

        seed = worklist[0]
        print("seed: " + str(seed))

        symbolizeInputs(seed)

        initContext()

        # Mark the seed as executed before running it.
        lastInput.add(hdict(seed))
        del worklist[0]
        run(ENTRY, seed)

        if hdict(seed) not in lastInput and seed not in worklist:
            worklist.append(seed)

        # Queue any newly discovered inputs that were not seen before.
        newInputs = getNewInput()
        for inputs in newInputs:
            if hdict(inputs) not in lastInput and inputs not in worklist:
                worklist.append(dict(inputs))
        print("seed " + str(len(worklist)))
Esempio n. 16
0
 def test_create_with_hash_seeds(self):
     """Custom hash seeds passed at construction must be stored verbatim."""
     seeds = [getrandbits(32) for _ in range(30)]
     bf = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                    self.FILTER_ERROR_RATE,
                                    self.tempfile.name,
                                    hash_seeds=seeds)
     self.assertEqual(seeds, bf.hash_seeds.tolist())
Esempio n. 17
0
def second_implementation_results(seq, capacity, k):
	"""Time second_count_table() over `seq` and report its results.

	Returns (elapsed_ms, shallow_size_bytes, counts). Note that
	sys.getsizeof is shallow and does not include contained objects.
	"""
	counts_filter = pybloomfilter.BloomFilter(capacity, 0.01, None)
	started = time.time()
	table = second_count_table(seq, k, counts_filter)
	elapsed_ms = (time.time() - started) * 1000
	return (elapsed_ms, sys.getsizeof(table), table)
Esempio n. 18
0
 def __init__(self, startURL):
     """Crawler state: remember the start URL, its network location, and a
     file-backed bloom filter used for de-duplication.

     :param startURL: fully qualified URL where crawling begins.
     """
     self.startURL = startURL
     # Network location of the start URL; presumably used to keep the
     # crawl on the same site — confirm against the caller.
     self.targetNetloc = urlparse.urlparse(startURL).netloc
     self.downloadList = []
     self.lurl = []
     # NOTE(review): fixed /tmp path means concurrent runs would share
     # (and corrupt) the same filter file.
     self.bloomfilter = pybloomfilter.BloomFilter(100000, 0.1,
                                                  '/tmp/words.bloom')
     self.dep = 0
Esempio n. 19
0
 def __init__(self):
     """Set up a de-duplication bloom filter backed by a throwaway temp file."""
     # Use an in-memory bloomfilter for now, maybe move to pyreBloom if we need something threadsafe?
     # NOTE(review): the NamedTemporaryFile handle is never closed and the
     # delete=False file is never removed — confirm that is intentional.
     bloomfilter_filepath = tempfile.NamedTemporaryFile(delete=False).name
     logger.debug('Saving bloomfilter to %s', bloomfilter_filepath)
     # pybloomfilter.BloomFilter(capacity, error_rate, filename)
     self.bloomfilter = pybloomfilter.BloomFilter(10000000, 0.001,
                                                  bloomfilter_filepath)
     # Auxiliary exact-membership map alongside the probabilistic filter.
     self.seen = dict()
Esempio n. 20
0
 def reloadRobotUser(mode):
     """Rebuild the robot-user bloom filter for one user_id shard.

     :param mode: shard selector; rows with user_id % 100 == mode are loaded.
     """
     sql = '''select user_id from user_robot_info where user_id%100={0}
     '''.format(mode)
     query_results = db_tools.db_midware('waterworld').query(sql)
     # NOTE(review): capacity is 0 when the query returns no rows — confirm
     # pybloomfilter accepts that.
     sbf = pybloomfilter.BloomFilter(capacity=len(query_results),
                                     error_rate=0.001)
     # A plain loop, not a throwaway list comprehension, for the side effect.
     for item in query_results:
         sbf.add(int(item['user_id']))
     myBF.robot_last_ts[mode] = time.time()
     myBF.robot_bloom_filter[mode] = sbf
Esempio n. 21
0
 def reloadTestUser():
     """Rebuild the test-user bloom filter from test_user plus recently
     flagged 'test'-kind users in user_evil."""
     sql = '''select user_id from test_user where status=0
     UNION DISTINCT select user_id from user_evil where kind='test'
     and update_ts>now()- interval 2 day '''
     query_results = db_tools.db_midware('waterworld').query(sql)
     sbf = pybloomfilter.BloomFilter(len(query_results), 0.001)
     # A plain loop, not a side-effect-only list comprehension.
     for item in query_results:
         sbf.add(int(item['user_id']))
     myBF.test_user_last_ts = time.time()
     myBF.test_user_bloom_filter = sbf
def creatBf(filename):
    """Load newline-separated entries from `filename` into a file-backed
    BloomFilter stored next to it as <name>.bloom.

    :param filename: text file, one entry per line (a .txt path).
    :returns: the populated BloomFilter.
    """
    # The `file()` builtin is Python-2 only and the handle was closed
    # manually; open() under a context manager does both portably.
    with open(filename) as f:
        data = [line.strip() for line in f]
    dic = pybloomfilter.BloomFilter(len(data), 0.0001,
                                    filename.replace('.txt', '.bloom'))
    dic.update(data)
    return dic
Esempio n. 23
0
    def test_read_only_set_operations_is_value_error(self):
        """All set operations on a read-only filter raise ValueError."""
        other = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                          self.FILTER_ERROR_RATE)

        readonly = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r")
        self.assertEqual(readonly.read_only, True)
        for operation in (readonly.union, readonly.intersection,
                          readonly.__ior__, readonly.__iand__):
            self.assertRaises(ValueError, operation, other)
Esempio n. 24
0
def check_differences(capacity, seq, k):
	"""Compare per-kmer counts from the two count-table implementations.

	Returns (distinct_kmer_count, mismatch_count) over all kmers in `seq`.
	"""
	filter_one = pybloomfilter.BloomFilter(capacity, 0.01, None)
	filter_two = pybloomfilter.BloomFilter(capacity, 0.01, None)
	counts_one = create_count_table(seq, k, filter_one)
	counts_two = second_count_table(seq, k, filter_two)
	seen = set()
	mismatches = 0
	for read in seq:
		for kmer in make_kmers(read, k):
			if kmer in seen:
				continue
			seen.add(kmer)
			# Each distinct kmer is compared exactly once.
			if get_count(filter_one, counts_one, kmer) != get_count(filter_two, counts_two, kmer):
				mismatches += 1
	return (len(seen), mismatches)
Esempio n. 25
0
    def test_remove(self):
        """remove() deletes present items, updates len(), and reports
        False for items that are absent."""
        bf = pybloomfilter.BloomFilter(100, 0.01)
        for value in map(str, range(20)):
            bf.add(value)

        assert bf.remove("1")
        assert "1" not in bf
        assert "2" in bf
        assert len(bf) == 19  # approximate size
        # Removing again, or removing something never added, returns False.
        assert not bf.remove("1")
        assert not bf.remove("30")
Esempio n. 26
0
 def __init__(self):
     """Reopen the persisted spider bloom filter if present, otherwise
     create a fresh one at the same path."""
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
     }
     if not os.path.exists('douban_spider.bloom'):
         self.blfilter = pybloomfilter.BloomFilter(
             user_settings.FILTER_SIZE, user_settings.FILTER_ERROR_RATE,
             'douban_spider.bloom')
     else:
         self.blfilter = pybloomfilter.BloomFilter.open(
             'douban_spider.bloom')
Esempio n. 27
0
    def test_union_without_copy_template(self):
        """union() between two filters created with the same seed succeeds
        and the result contains both element sets."""
        with tempfile.NamedTemporaryFile() as tmp1, \
                tempfile.NamedTemporaryFile() as tmp2:
            left = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                             self.FILTER_ERROR_RATE,
                                             tmp1.name,
                                             seed=100)
            right = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                              self.FILTER_ERROR_RATE,
                                              tmp2.name,
                                              seed=100)

            for value in range(100):
                left.add(value)
            for value in range(100, 200):
                right.add(value)

            right.union(left)  # Should not fail

            self.assertTrue(all(value in right for value in range(200)))
Esempio n. 28
0
    def test_bit_array_same_hashes(self):
        """Filters sharing hash seeds and data produce identical bit arrays."""
        capacity = 100 * 100
        items = [randint(0, 1000) for _ in range(capacity)]

        # File-backed filter.
        file_bf = pybloomfilter.BloomFilter(capacity, 0.01, self.tempfile.name)
        file_bf.update(items)

        # In-memory filter constructed with the same hash seeds.
        mem_bf = pybloomfilter.BloomFilter(capacity, 0.01,
                                           hash_seeds=file_bf.hash_seeds)
        mem_bf.update(items)

        # Identical seeds + identical data => identical bit locations.
        assert file_bf.bit_array ^ mem_bf.bit_array == 0
Esempio n. 29
0
    def test_create_with_hash_seeds_and_compare(self):
        """Two filters built with the same seeds hash identically."""
        payload = "test"

        first = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                          self.FILTER_ERROR_RATE,
                                          self.tempfile.name)
        first.add(payload)
        first_seeds = first.hash_seeds.tolist()
        first_bits = first.bit_array

        second = pybloomfilter.BloomFilter(self.FILTER_SIZE,
                                           self.FILTER_ERROR_RATE,
                                           self.tempfile.name,
                                           hash_seeds=first_seeds)
        second.add(payload)

        self.assertEqual(first_seeds, second.hash_seeds.tolist())
        # Same seeds => same hashing sequence => same bit array.
        self.assertEqual(first_bits, second.bit_array)
    def __init__(self,
                 num_samples,
                 expected_num_points,
                 batch_size,
                 error_rate=0.000001):
        """Accumulator state for a sampled estimate over a stream of points.

        :param num_samples: number of parallel samples/estimates maintained.
        :param expected_num_points: expected count of distinct points; used
            to size the bloom filter.
        :param batch_size: batch size the caller feeds points in.
        :param error_rate: bloom filter false-positive rate.
        """
        self._num_samples = num_samples
        self._batch_size = batch_size
        self._expected_num_points = expected_num_points
        # Per-sample running numerator/denominator accumulators.
        self._denominator = [0.] * num_samples
        self._numerator = [0.] * num_samples

        # Probabilistic membership of points already seen.
        self._bf = pybloomfilter.BloomFilter(expected_num_points, error_rate)
        self._actual_num_points = 0.