Exemple #1
0
class TestHugeBloom(object):
    """Smoke test for a WritingBloomFilter sized for one billion elements."""

    # Sizing arguments handed to WritingBloomFilter in setup().
    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        """Create ``self.bf``, skipping when pointers are 32 bits wide.

        Raises:
            SkipTest: if ``struct.calcsize("P")`` reports a 4-byte pointer.
        """
        import struct

        pointer_bits = struct.calcsize("P") * 8
        if pointer_bits == 32:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def test_one(self):
        """An added key is reported present; a key never added is not."""
        bloom = self.bf
        bloom.add("a")
        assert bloom.contains("a")
        assert not bloom.contains("b")
Exemple #2
0
class TestHugeBloom():
    """Exercise WritingBloomFilter at a capacity of 10**9 elements."""

    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        """Build the filter under test and store it on ``self.bf``.

        Raises:
            SkipTest: when the native pointer size works out to 32 bits.
        """
        import struct

        is_32_bit = (struct.calcsize("P") * 8 == 32)
        if is_32_bit:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")

        capacity, max_rate = self.ELEMENTS, self.MAX_FAILURE_RATE
        self.bf = WritingBloomFilter(capacity, max_rate)

    def test_one(self):
        """Add "a", then check that "a" is found and "b" is not."""
        huge = self.bf
        huge.add("a")
        assert huge.contains("a")
        assert not huge.contains("b")
Exemple #3
0
def test_unicrap():
    """Check that non-ASCII and plain string keys work as filter keys.

    Two curly-quote characters (U+2018, U+2019) and one plain string are
    added to a fresh WritingBloomFilter.  Each key must be absent before
    it is added (checked for the two quotes), present afterwards, and
    indexing the filter with the key must compare equal to 1.
    """
    # Named ``bf`` rather than ``filter`` so the builtin is not shadowed.
    bf = WritingBloomFilter(100000, 0.1)
    assert u'\u2019' not in bf
    assert u'\u2018' not in bf

    bf.add(u'\u2018')
    bf.add(u'\u2019')

    bf.add('just a plain string')

    assert u'\u2019' in bf
    assert u'\u2018' in bf
    assert 'just a plain string' in bf

    assert bf[u'\u2019'] == 1
    assert bf[u'\u2018'] == 1
    assert bf['just a plain string'] == 1
Exemple #4
0
def test_unicrap():
    """Round-trip two curly-quote characters and a plain string.

    The quotes must be absent from a fresh filter; after all three keys
    are added, each must be a member and index to a value equal to 1.
    """
    right_quote = u'\u2019'
    left_quote = u'\u2018'
    plain = 'just a plain string'

    bf = WritingBloomFilter(100000, 0.1)

    # Nothing has been added yet, so neither quote may be reported.
    for quote in (right_quote, left_quote):
        assert quote not in bf

    for key in (left_quote, right_quote, plain):
        bf.add(key)

    expected_members = (right_quote, left_quote, plain)
    for key in expected_members:
        assert key in bf

    for key in expected_members:
        assert bf[key] == 1
Exemple #5
0
class TestBloomFilter(object):
    """Behavioural tests for WritingBloomFilter at a small capacity."""

    # Sizing arguments handed to WritingBloomFilter in setup().
    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        """Create a fresh filter in ``self.bf``."""
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def _testFalsePositives(self, filter, keys, otherkeys):
        """Insert ``keys`` and bound the false positives seen on ``otherkeys``.

        Args:
            filter: object exposing ``add(key)`` and ``contains(key)``.
            keys: keys to insert; every one must be found afterwards.
            otherkeys: probe keys, same length as ``keys``; each one the
                filter reports as present is counted as a false positive.

        Raises:
            AssertionError: if the two key collections differ in length,
                if the false-positive count reaches 103.25% of
                ``len(keys) * PROBS[bucketsPerElement][K]``, or if any
                inserted key is not found.
        """
        assert len(keys) == len(otherkeys)

        for key in keys:
            filter.add(key)

        fp = sum(1 for key in otherkeys if filter.contains(key))

        # NOTE(review): the spec is derived from the class constants, not
        # from the ``filter`` argument, even when a caller passes a filter
        # built with a different size.
        bucketsPerElement = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(
            bucketsPerElement, self.MAX_FAILURE_RATE)

        # Observed count as a percentage of the tabulated expectation.
        fp_ratio = fp / (
            len(keys) *
            BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]) * 100
        assert fp_ratio < 103.25, "Over 103.25% of the maximum expected " \
            "false positives found. {:0.3f}%".format(fp_ratio)
        print("OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio))

        # False negatives never occur - this should always work
        for k in keys:
            assert filter.contains(k)

    def testBloomLimits1(self):
        """computeBloomSpec2 accepts the last PROBS entry, rejects half of it.

        Raises:
            RuntimeError: if the halved probability is accepted instead of
                raising UnsupportedOperationException.
        """
        maxBuckets = len(BloomCalculations.PROBS) - 1
        maxK = len(BloomCalculations.PROBS[maxBuckets]) - 1

        # possible
        BloomCalculations.computeBloomSpec2(
            maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK])

        # impossible, throws
        try:
            BloomCalculations.computeBloomSpec2(
                maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK] / 2)
        except UnsupportedOperationException:
            pass
        else:
            raise RuntimeError(
                "computeBloomSpec2 accepted a probability below the "
                "smallest tabulated value")

    def test_one(self):
        """An added key is reported present; a key never added is not."""
        self.bf.add("a")
        # Previously this lookup's result was discarded, so nothing was
        # actually verified for the key that had been added.
        assert self.bf.contains("a")
        assert not self.bf.contains("b")

    def testFalsePositivesInt(self):
        """Insert the decimal strings 0..9999; probe with 10000 random keys."""
        keygen = KeyGenerator()
        self._testFalsePositives(self.bf,
                                 [str(x) for x in range(10000)],
                                 keygen.randomKeys(10000))

    def testFalsePositivesRandom(self):
        """Insert and probe with two batches of strings from one generator."""
        keygen1 = KeyGenerator(314159)
        self._testFalsePositives(
            self.bf,
            [keygen1.random_string() for i in range(10000)],
            [keygen1.random_string() for i in range(10000)],)

    def testWords(self):
        """Insert the even-indexed generator entries; probe with the odd."""
        keygen1 = KeyGenerator()
        # NOTE(review): under Python 3 ``len(keygen1)/2`` is a float;
        # confirm WritingBloomFilter accepts a non-integer size.
        bf = WritingBloomFilter(
            len(keygen1)/2, self.MAX_FAILURE_RATE, ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._testFalsePositives(bf, even_keys, odd_keys)

    def testNullKeys(self):
        """Keys sharing a prefix before an embedded NUL stay distinct."""
        assert 'foo' not in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo')

        assert 'foo' in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0bar')

        assert 'foo\0bar' in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0baz')

        assert 'foo\0baz' in self.bf
Exemple #6
0
class TestBloomFilter(object):
    """Membership and false-positive checks against WritingBloomFilter."""

    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        """Store a newly built filter on ``self.bf``."""
        capacity, max_rate = self.ELEMENTS, self.MAX_FAILURE_RATE
        self.bf = WritingBloomFilter(capacity, max_rate)

    def _test_false_positives(self, bf, keys, otherkeys):
        """Add ``keys`` to ``bf`` and bound the hits among ``otherkeys``.

        ``keys`` and ``otherkeys`` must be the same length.  The number of
        ``otherkeys`` reported present, as a percentage of
        ``len(keys) * PROBS[bucketsPerElement][K]``, must stay below
        103.25, and every inserted key must still be found.
        """
        assert len(keys) == len(otherkeys)

        for inserted in keys:
            bf.add(inserted)

        hits = sum(1 for probe in otherkeys if bf.contains(probe))

        max_buckets = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(
            max_buckets, self.MAX_FAILURE_RATE)

        probability = BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]
        expected_hits = len(keys) * probability
        fp_ratio = hits / expected_hits * 100

        assert fp_ratio < 103.25, (
            "Over 103.25% of the maximum expected "
            "false positives found. {:0.3f}%".format(fp_ratio))
        summary = "OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio)
        print(summary)

        # Inserted keys must always be reported as present.
        for inserted in keys:
            assert bf.contains(inserted)

    def test_bloom_limits1(self):
        """The last PROBS entry is a valid target; half of it must raise."""
        probs = BloomCalculations.PROBS
        last_bucket = len(probs) - 1
        last_k = len(probs[last_bucket]) - 1
        smallest = probs[last_bucket][last_k]

        # Achievable: exactly the final tabulated probability.
        BloomCalculations.computeBloomSpec2(last_bucket, smallest)

        # Unachievable: anything smaller has to be rejected.
        try:
            BloomCalculations.computeBloomSpec2(last_bucket, smallest / 2)
            raise RuntimeError
        except UnsupportedOperationException:
            pass

    def test_one(self):
        """Keys added via add() or item assignment are found; "b" is not."""
        bloom = self.bf
        bloom.add("a")
        bloom["aa"] = 0
        assert bloom.contains("a")
        assert "aa" in bloom
        assert not bloom.contains("b")
        assert "b" not in bloom

    def test_false_positives_int(self):
        """Insert the decimal strings 0..9999; probe with 10000 random keys."""
        keygen = KeyGenerator()
        inserted = [str(number) for number in range(10000)]
        probes = keygen.randomKeys(10000)
        self._test_false_positives(self.bf, inserted, probes)

    def test_false_positives_random(self):
        """Insert and probe with consecutive batches from one generator."""
        keygen1 = KeyGenerator(314159)
        inserted = [keygen1.random_string() for _ in range(10000)]
        probes = [keygen1.random_string() for _ in range(10000)]
        self._test_false_positives(self.bf, inserted, probes)

    def test_words(self):
        """Insert even-indexed generator entries; probe with odd-indexed."""
        keygen1 = KeyGenerator()
        capacity = len(keygen1) / 2
        bf = WritingBloomFilter(capacity,
                                self.MAX_FAILURE_RATE,
                                ignore_case=False)

        self._test_false_positives(bf, keygen1[::2], keygen1[1::2])

    def test_null_keys(self):
        """Keys that differ only after an embedded NUL are kept apart."""
        bloom = self.bf
        plain, with_bar, with_baz = 'foo', 'foo\0bar', 'foo\0baz'

        for key in (plain, with_bar, with_baz):
            assert key not in bloom

        bloom.add(plain)

        assert plain in bloom
        assert with_bar not in bloom
        assert with_baz not in bloom

        bloom.add(with_bar)

        assert with_bar in bloom
        assert with_baz not in bloom

        bloom.add(with_baz)

        assert with_baz in bloom
Exemple #7
0
# Benchmark script: fill `bloom` with k-mers, then time membership queries.
#
# Option 1 (disabled): add the real k-mers from an imported HDF5 file.
# If you want to add the real k-mers
#CEs=MH.import_multiple_from_single_hdf5('/home/dkoslicki/Desktop/CMash/data/SmallGenomes.h5')
#all_kmers = set()
#for CE in CEs:
#	for kmer in CE._kmers:
#		all_kmers.add("%s" % kmer)
#for CE in CEs:
#	for kmer in CE._kmers:
# 		bloom.add("%s" % kmer)

# Option 2 (active): add `num_in_bloom` random 60-character strings over
# A/C/T/G, mirroring each one in the `all_kmers` set.
# If you want to add random kmers
all_kmers = set()
for _ in range(num_in_bloom):
    kmer = "".join(np.random.choice(["A", "C", "T", "G"], 60))
    bloom.add(kmer)
    all_kmers.add(kmer)

# Test the timing for a bloom query
# N fresh random k-mers are queried; `i` counts how many `bloom` reports as
# present.  The measured interval also includes generating each query k-mer,
# since that happens inside the timed loop.
N = 10000
i = 0
t0 = timeit.default_timer()
for _ in range(N):
    # Disabled alternative: query a k-mer that was actually inserted.
    #kmer = random.sample(all_kmers, 1)[0]
    kmer = "".join(np.random.choice(["A", "C", "T", "G"], 60))
    if kmer in bloom:
        i += 1
    else:
        pass
t1 = timeit.default_timer()
# Elapsed time for the whole query loop, as reported by default_timer().
print(t1 - t0)
        tree = mt.Trie(to_insert)
        tree.save(streaming_database_file)
    else:
        tree = mt.Trie()
        tree.load(streaming_database_file)

    # Pre-filter: a Bloom filter over all the k-mers of interest.
    # With no hydra_file it is built here from `sketches`; otherwise it is
    # read from that file.
    if not hydra_file:  # create one
        try:
            # Size argument: sketches x k-sizes x num_hashes x 2.  The factor
            # of 2 presumably accounts for the reverse complements added
            # below, and 0.01 is presumably the target false-positive rate --
            # confirm against WritingBloomFilter's signature.
            all_kmers_bf = WritingBloomFilter(
                len(sketches) * len(k_range) * num_hashes * 2, 0.01)
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # add the length-ksize prefix of each k-mer
                        all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize])
                        )  # also add the reverse complement
        # NOTE(review): hydra_file is falsy in this branch, so the message
        # below cannot name a real path; it is also unclear which call above
        # is expected to raise IOError -- confirm.
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            all_kmers_bf = ReadingBloomFilter(hydra_file)
        except IOError:
            # Report the path that was attempted and exit with status 1.
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
Exemple #9
0
        tree = mt.Trie(to_insert)
        tree.save(streaming_database_file)
    else:
        tree = mt.Trie()
        tree.load(streaming_database_file)

    # Pre-filter: a Bloom filter over all the k-mers of interest.
    # With no hydra_file it is built here from `sketches`; otherwise it is
    # read from that file.
    if not hydra_file:  # create one
        try:
            # Size argument: sketches x k-sizes x num_hashes.  0.01 is
            # presumably the target false-positive rate -- confirm against
            # WritingBloomFilter's signature.
            all_kmers_bf = WritingBloomFilter(
                len(sketches) * len(k_range) * num_hashes, 0.01)
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # add the length-ksize prefix of each k-mer
        # NOTE(review): hydra_file is falsy in this branch, so the message
        # below cannot name a real path; it is also unclear which call above
        # is expected to raise IOError -- confirm.
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            all_kmers_bf = ReadingBloomFilter(hydra_file)
        except IOError:
            # Report the path that was attempted and exit with status 1.
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)

    # Seen k-mers (set of k-mers that already hit the trie, so don't need to check again)
    seen_kmers = set()