class TestHugeBloom(object):
    """Smoke test for a ``WritingBloomFilter`` built with a very large size.

    The filter is constructed with ``ELEMENTS`` and ``MAX_FAILURE_RATE`` as
    its two positional arguments.
    """

    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        """Create the filter under test, skipping on 32-bit platforms."""
        import struct

        # "P" is the native pointer format; its byte size times 8 gives the
        # pointer width of the running interpreter.
        pointer_bits = 8 * struct.calcsize("P")
        if pointer_bits == 32:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")

        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def test_one(self):
        """A key that was added is found; a key that was not added is not."""
        bloom = self.bf
        bloom.add("a")
        assert bloom.contains("a")
        assert not bloom.contains("b")
class TestHugeBloom:
    """Single add/lookup check against a filter built from the class constants."""

    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        """Build ``self.bf``; raise ``SkipTest`` when pointers are 32 bits wide."""
        import struct

        pointer_bytes = struct.calcsize("P")
        if 8 * pointer_bytes == 32:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")

        self.bf = WritingBloomFilter(
            self.ELEMENTS,
            self.MAX_FAILURE_RATE,
        )

    def test_one(self):
        """Add "a", then check that "a" is present and "b" is absent."""
        self.bf.add("a")

        assert self.bf.contains("a")
        assert not self.bf.contains("b")
def test_unicrap():
    """Check that unicode and plain-string keys work with the filter.

    Exercises ``add``, the ``in`` operator and item lookup on a
    ``WritingBloomFilter`` using two non-ASCII keys and one ASCII key.
    """
    # Named ``bf`` rather than ``filter`` so the builtin is not shadowed.
    bf = WritingBloomFilter(100000, 0.1)

    # Nothing has been added yet, so neither unicode key may be reported.
    assert u'\u2019' not in bf
    assert u'\u2018' not in bf

    bf.add(u'\u2018')
    bf.add(u'\u2019')
    bf.add('just a plain string')

    # Every added key must now be found through the ``in`` operator ...
    assert u'\u2019' in bf
    assert u'\u2018' in bf
    assert 'just a plain string' in bf

    # ... and item lookup must yield 1 for each of them.
    assert bf[u'\u2019'] == 1
    assert bf[u'\u2018'] == 1
    assert bf['just a plain string'] == 1
def test_unicrap():
    """Round-trip two non-ASCII keys and one ASCII key through the filter.

    Covers ``add``, membership via ``in`` and item lookup on a
    ``WritingBloomFilter``.
    """
    right_quote = u'\u2019'
    left_quote = u'\u2018'
    plain = 'just a plain string'

    bf = WritingBloomFilter(100000, 0.1)

    # Before anything is added, the unicode keys must be reported absent.
    for key in (right_quote, left_quote):
        assert key not in bf

    for key in (left_quote, right_quote, plain):
        bf.add(key)

    # After insertion every key must be found ...
    for key in (right_quote, left_quote, plain):
        assert key in bf

    # ... and item lookup must return 1 for each.
    for key in (right_quote, left_quote, plain):
        assert bf[key] == 1
class TestBloomFilter(object):
    """Tests for ``WritingBloomFilter`` built from the class constants.

    ``ELEMENTS`` and ``MAX_FAILURE_RATE`` are the two positional arguments
    used to construct the filter in ``setup``.
    """

    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        """Create the filter used by the tests."""
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def _testFalsePositives(self, filter, keys, otherkeys):
        """Add ``keys`` to ``filter`` and check false positives on ``otherkeys``.

        ``keys`` and ``otherkeys`` must have the same length. The number of
        ``otherkeys`` reported as present is compared, as a percentage, with
        ``len(keys)`` times the ``BloomCalculations.PROBS`` entry selected by
        the computed spec; the test fails at 103.25% or above. Finally every
        element of ``keys`` must be reported as present.
        """
        # NOTE: the parameter name shadows the builtin ``filter``; it is kept
        # so the helper's signature is unchanged.
        fp = 0
        assert len(keys) == len(otherkeys)

        for key in keys:
            filter.add(key)

        for key in otherkeys:
            if filter.contains(key):
                fp += 1

        bucketsPerElement = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(
            bucketsPerElement, self.MAX_FAILURE_RATE)

        fp_ratio = fp / (
            len(keys) *
            BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]) * 100
        assert fp_ratio < 103.25, "Over 103.25% of the maximum expected " \
            "false positives found. {:0.3f}%".format(fp_ratio)
        print("OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio))

        # False negatives never occur - this should always work
        for k in keys:
            assert filter.contains(k)

    def testBloomLimits1(self):
        """Check the boundary of what ``computeBloomSpec2`` accepts.

        The last entry of the last ``PROBS`` row must be accepted; half of
        that value must raise ``UnsupportedOperationException``.
        """
        maxBuckets = len(BloomCalculations.PROBS) - 1
        maxK = len(BloomCalculations.PROBS[maxBuckets]) - 1

        # possible
        BloomCalculations.computeBloomSpec2(
            maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK])

        # impossible, throws
        try:
            BloomCalculations.computeBloomSpec2(
                maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK] / 2)
            # Reaching this line means the call did not raise: fail the test.
            raise RuntimeError
        except UnsupportedOperationException:
            pass

    def test_one(self):
        """An added key is found; a key that was never added is not."""
        self.bf.add("a")
        # The positive lookup is asserted; a bare call would check nothing.
        assert self.bf.contains("a")
        assert not self.bf.contains("b")

    def testFalsePositivesInt(self):
        """False-positive check: stringified integers vs. generated keys."""
        keygen = KeyGenerator()
        self._testFalsePositives(self.bf,
                                 [str(x) for x in range(10000)],
                                 keygen.randomKeys(10000))

    def testFalsePositivesRandom(self):
        """False-positive check with two batches from one seeded generator."""
        keygen1 = KeyGenerator(314159)
        self._testFalsePositives(
            self.bf,
            [keygen1.random_string() for i in range(10000)],
            [keygen1.random_string() for i in range(10000)],)

    def testWords(self):
        """False-positive check: even-indexed keys added, odd-indexed probed."""
        keygen1 = KeyGenerator()
        # NOTE(review): ``/`` yields a float on Python 3; confirm that
        # WritingBloomFilter accepts a non-integer size.
        bf = WritingBloomFilter(
            len(keygen1)/2, self.MAX_FAILURE_RATE, ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._testFalsePositives(bf, even_keys, odd_keys)

    def testNullKeys(self):
        """Keys that differ only after an embedded NUL are kept distinct."""
        assert 'foo' not in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo')

        assert 'foo' in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0bar')

        assert 'foo\0bar' in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0baz')

        assert 'foo\0baz' in self.bf
class TestBloomFilter(object):
    """Tests for a ``WritingBloomFilter`` built from the class constants."""

    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        """Create the filter that the tests operate on."""
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def _test_false_positives(self, bf, keys, otherkeys):
        """Insert ``keys`` into ``bf`` and count hits among ``otherkeys``.

        Both sequences must have equal length. The hit count is expressed as
        a percentage of ``len(keys)`` times the ``BloomCalculations.PROBS``
        entry picked by the computed spec, and must stay below 103.25.
        Afterwards every inserted key must still be reported as present.
        """
        assert len(keys) == len(otherkeys)

        for inserted in keys:
            bf.add(inserted)

        false_hits = sum(1 for probe in otherkeys if bf.contains(probe))

        max_buckets = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(
            max_buckets, self.MAX_FAILURE_RATE)

        expected = len(keys) * \
            BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]
        fp_ratio = false_hits / expected * 100

        assert fp_ratio < 103.25, (
            "Over 103.25% of the maximum expected "
            "false positives found. {:0.3f}%".format(fp_ratio)
        )
        print("OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio))

        # False negatives never occur - this should always work
        for inserted in keys:
            assert bf.contains(inserted)

    def test_bloom_limits1(self):
        """Probe the edge of what ``computeBloomSpec2`` will accept.

        The final entry of the final ``PROBS`` row is accepted, while half
        that value must raise ``UnsupportedOperationException``.
        """
        probs = BloomCalculations.PROBS
        maxBuckets = len(probs) - 1
        maxK = len(probs[maxBuckets]) - 1

        # possible
        BloomCalculations.computeBloomSpec2(
            maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK])

        # impossible, throws
        try:
            BloomCalculations.computeBloomSpec2(
                maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK] / 2)
            # Only reached when the call above did not raise.
            raise RuntimeError
        except UnsupportedOperationException:
            pass

    def test_one(self):
        """Keys stored via ``add`` and item assignment are found; "b" is not."""
        self.bf.add("a")
        self.bf["aa"] = 0

        assert self.bf.contains("a")
        assert "aa" in self.bf

        assert not self.bf.contains("b")
        assert "b" not in self.bf

    def test_false_positives_int(self):
        """False-positive check: stringified integers vs. generated keys."""
        keygen = KeyGenerator()
        int_keys = [str(x) for x in range(10000)]
        random_keys = keygen.randomKeys(10000)
        self._test_false_positives(self.bf, int_keys, random_keys)

    def test_false_positives_random(self):
        """False-positive check with two batches from one seeded generator."""
        keygen1 = KeyGenerator(314159)
        # Order matters: both batches are drawn from the same generator.
        added = [keygen1.random_string() for i in range(10000)]
        probed = [keygen1.random_string() for i in range(10000)]
        self._test_false_positives(self.bf, added, probed)

    def test_words(self):
        """False-positive check: even-indexed keys added, odd-indexed probed."""
        keygen1 = KeyGenerator()
        half = len(keygen1) / 2
        bf = WritingBloomFilter(half, self.MAX_FAILURE_RATE,
                                ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._test_false_positives(bf, even_keys, odd_keys)

    def test_null_keys(self):
        """Keys that share a prefix up to an embedded NUL stay distinct."""
        bf = self.bf

        assert 'foo' not in bf
        assert 'foo\0bar' not in bf
        assert 'foo\0baz' not in bf

        bf.add('foo')
        assert 'foo' in bf
        assert 'foo\0bar' not in bf
        assert 'foo\0baz' not in bf

        bf.add('foo\0bar')
        assert 'foo\0bar' in bf
        assert 'foo\0baz' not in bf

        bf.add('foo\0baz')
        assert 'foo\0baz' in bf
# Option 1 (disabled): fill the filter with the real k-mers.
# Uncomment this section to use it instead of the random k-mers below.
#CEs=MH.import_multiple_from_single_hdf5('/home/dkoslicki/Desktop/CMash/data/SmallGenomes.h5')
#all_kmers = set()
#for CE in CEs:
#    for kmer in CE._kmers:
#        all_kmers.add("%s" % kmer)
#for CE in CEs:
#    for kmer in CE._kmers:
#        bloom.add("%s" % kmer)

# Option 2: fill the filter with random 60-character k-mers, keeping a copy
# of each one in ``all_kmers``.
nucleotides = ["A", "C", "T", "G"]
all_kmers = set()
for _ in range(num_in_bloom):
    kmer = "".join(np.random.choice(nucleotides, 60))
    bloom.add(kmer)
    all_kmers.add(kmer)

# Time N membership queries against the filter.
# Note that the timed region also includes generating each random query.
N = 10000
i = 0  # number of queries the filter reported as present
t0 = timeit.default_timer()
for _ in range(N):
    #kmer = random.sample(all_kmers, 1)[0]
    kmer = "".join(np.random.choice(nucleotides, 60))
    if kmer in bloom:
        i += 1
t1 = timeit.default_timer()
print(t1 - t0)
tree = mt.Trie(to_insert) tree.save(streaming_database_file) else: tree = mt.Trie() tree.load(streaming_database_file) # all the k-mers of interest in a set (as a pre-filter) if not hydra_file: # create one try: all_kmers_bf = WritingBloomFilter( len(sketches) * len(k_range) * num_hashes * 2, 0.01) for sketch in sketches: for kmer in sketch._kmers: for ksize in k_range: all_kmers_bf.add( kmer[0:ksize] ) # put all the k-mers and the appropriate suffixes in all_kmers_bf.add( khmer.reverse_complement(kmer[0:ksize]) ) # also add the reverse complement except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1) else: # otherwise read it in try: all_kmers_bf = ReadingBloomFilter(hydra_file) except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1)
tree = mt.Trie(to_insert) tree.save(streaming_database_file) else: tree = mt.Trie() tree.load(streaming_database_file) # all the k-mers of interest in a set (as a pre-filter) if not hydra_file: # create one try: all_kmers_bf = WritingBloomFilter( len(sketches) * len(k_range) * num_hashes, 0.01) for sketch in sketches: for kmer in sketch._kmers: for ksize in k_range: all_kmers_bf.add( kmer[0:ksize] ) # put all the k-mers and the appropriate suffixes in except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1) else: # otherwise read it in try: all_kmers_bf = ReadingBloomFilter(hydra_file) except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1) # Seen k-mers (set of k-mers that already hit the trie, so don't need to check again) seen_kmers = set()