Example 1
 def testManyRandom(self):
     keygen = KeyGenerator()
     MAX_HASH_COUNT = 128
     bloom = WritingBloomFilter(15, 0.0009)
     hashes = set()
     collisions = 0
     for key in keygen.randomKeys():
         for i, hashIndex in enumerate(bloom.getHashBuckets(key, MAX_HASH_COUNT, 1024 * 1024)):
             hashes.add(hashIndex)
         collisions += MAX_HASH_COUNT - len(hashes)
         hashes.clear()
     assert collisions <= 100, "Got %d collisions." % collisions
Example 2
 def test_many_random(self):
     keygen = KeyGenerator()
     MAX_HASH_COUNT = 128
     bloom = WritingBloomFilter(15, 0.0009)
     hashes = set()
     collisions = 0
     for key in keygen.randomKeys():
         for i, hashIndex in enumerate(
                 bloom.getHashBuckets(key, MAX_HASH_COUNT, 1024 * 1024)):
             hashes.add(hashIndex)
         collisions += MAX_HASH_COUNT - len(hashes)
         hashes.clear()
     assert collisions <= 100, "Got {} collisions.".format(collisions)
Example 3
class TestHugeBloom(object):
    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        import struct
        if 8 * struct.calcsize("P") == 32:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def test_one(self):
        self.bf.add("a")
        assert self.bf.contains("a")
        assert not self.bf.contains("b")
Example 4
class TestHugeBloom():
    ELEMENTS = 1000000000
    MAX_FAILURE_RATE = 0.001

    def setup(self):
        import struct
        if 8 * struct.calcsize("P") == 32:
            raise SkipTest("Skip HugeBloom tests on 32-bit platforms")
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def test_one(self):
        self.bf.add("a")
        assert self.bf.contains("a")
        assert not self.bf.contains("b")
Example 5
def test_unicrap():
    filter = WritingBloomFilter(100000, 0.1)
    assert u'\u2019' not in filter
    assert u'\u2018' not in filter

    filter.add(u'\u2018')
    filter.add(u'\u2019')

    filter.add('just a plain string')

    assert u'\u2019' in filter
    assert u'\u2018' in filter
    assert 'just a plain string' in filter

    assert filter[u'\u2019'] == 1
    assert filter[u'\u2018'] == 1
    assert filter['just a plain string'] == 1
Example 6
    def create_BF_prefilter(self, result_file=None) -> None:
        """
		Imports or creates the pre-filter Bloom filter
		:param result_file: (optional) if you'd like to export the bloom filter, populate that here
		:type result_file: str
		"""
        tree = self.tree
        k_range = self.k_range
        if not self.bloom_filter_file:  # create one
            try:
                # Get all the k-mers in the TST, put them in a bloom filter
                # all_kmers_bf = WritingBloomFilter(len(sketches) * len(k_range) * num_hashes * 20, 0.01)
                if result_file:
                    # save it to the file
                    self.all_kmers_bf = WritingBloomFilter(
                        len(tree.keys()) * len(k_range) * 5,
                        0.01,
                        ignore_case=True,
                        filename=result_file
                    )  # fudge factor of 5 will make the BF larger, but also slightly faster
                else:
                    # keep it in memory
                    self.all_kmers_bf = WritingBloomFilter(
                        len(tree.keys()) * len(k_range) * 5,
                        0.01,
                        ignore_case=True
                    )  # fudge factor of 5 will make the BF larger, but also slightly faster
                for kmer_info in tree.keys():
                    kmer = kmer_info.split(
                        'x'
                    )[0]  # remove the location information and just get the kmer
                    for ksize in k_range:
                        self.all_kmers_bf.add(kmer[0:ksize])
                        self.all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize]))
            except IOError:
                print("No such file or directory/error opening file: %s" %
                      self.bloom_filter_file)
                sys.exit(1)
        else:  # otherwise read it in
            try:
                self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
            except IOError:
                print("No such file or directory/error opening file: %s" %
                      self.bloom_filter_file)
                sys.exit(1)
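The method above either builds the pre-filter in memory or, when result_file is given, backs it with a file that can later be reopened with ReadingBloomFilter. Below is a minimal sketch of that persist-then-reload pattern outside the class, assuming both classes are importable from hydra (as in the profiler example further down) and using made-up k-mer data and file name; the flush/close behaviour of the file-backed filter is an assumption here, not documented behaviour.

from hydra import WritingBloomFilter, ReadingBloomFilter

kmers = ["ACGTACGTACGT", "TTGACCGATTGA"]   # made-up k-mers, for illustration only
k_range = [4, 6, 8]

# build the pre-filter and back it with a file (fudge factor of 5, as above)
bf = WritingBloomFilter(len(kmers) * len(k_range) * 5, 0.01,
                        ignore_case=True, filename="prefilter.bloom")
for kmer in kmers:
    for ksize in k_range:
        bf.add(kmer[0:ksize])
del bf  # assumption: dropping the writer leaves the data persisted on disk

# later, or in another process: reopen the persisted filter read-only
prefilter = ReadingBloomFilter("prefilter.bloom")
assert "ACGT" in prefilter  # assumes ReadingBloomFilter supports the same membership test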
Example 7
    def test_words(self):
        keygen1 = KeyGenerator()
        bf = WritingBloomFilter(len(keygen1) / 2,
                                self.MAX_FAILURE_RATE,
                                ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._test_false_positives(bf, even_keys, odd_keys)
Example 8
    def create_filter(self, filepath, size, falsepos_rate):
        """
        Create new bloom filter

        :param bytes filepath: Path to persistent bloom filter on disk
        :param int size: Maximum number of elements in bloom filter
        :param float falsepos_rate: Maximum false positive probability

        """
        self.current_filter = WritingBloomFilter(int(size), float(falsepos_rate),
                                                 filename=filepath)
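A hedged usage sketch for the wrapper above: FilterStore is a made-up host class standing in for whatever object actually owns create_filter, and the path and sizing values are illustrative only (the docstring above says the path is passed as bytes).

from hydra import WritingBloomFilter


class FilterStore:  # hypothetical host class for the method shown above
    def create_filter(self, filepath, size, falsepos_rate):
        self.current_filter = WritingBloomFilter(int(size), float(falsepos_rate),
                                                 filename=filepath)


store = FilterStore()
store.create_filter(b"/tmp/example.bloom", 1000000, 0.001)
store.current_filter.add("some-key")
assert "some-key" in store.current_filter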
Example 9
 def test_hash_buckets(self):
     bloom = WritingBloomFilter(15, 0.0009)
     buckets = bloom.getHashBuckets('hydra', 128, 1024 * 1024)
     assert buckets == [
         536658, 898974, 212714, 575030, 937346, 251086, 613402, 975718,
         289458, 651774, 1014090, 327830, 690146, 3886, 366202, 728518,
         42258, 404574, 766890, 80630, 442946, 805262, 119002, 481318,
         843634, 157374, 519690, 882006, 195746, 558062, 920378, 234118,
         596434, 958750, 272490, 634806, 997122, 310862, 673178, 1035494,
         349234, 711550, 25290, 387606, 749922, 63662, 425978, 788294,
         102034, 464350, 826666, 140406, 502722, 865038, 178778, 541094,
         903410, 217150, 579466, 941782, 255522, 617838, 980154, 293894,
         656210, 1018526, 332266, 694582, 8322, 370638, 732954, 46694,
         409010, 771326, 85066, 447382, 809698, 123438, 485754, 848070,
         161810, 524126, 886442, 200182, 562498, 924814, 238554, 600870,
         963186, 276926, 639242, 1001558, 315298, 677614, 1039930, 353670,
         715986, 29726, 392042, 754358, 68098, 430414, 792730, 106470,
         468786, 831102, 144842, 507158, 869474, 183214, 545530, 907846,
         221586, 583902, 946218, 259958, 622274, 984590, 298330, 660646,
         1022962, 336702, 699018, 12758, 375074, 737390, 51130, 413446
     ]
Example 10
 def testHashBuckets(self):
     bloom = WritingBloomFilter(15, 0.0009)
     buckets = bloom.getHashBuckets('hydra', 128, 1024 * 1024)
     assert buckets == [
         536658, 898974, 212714, 575030, 937346, 251086, 613402,
         975718, 289458, 651774, 1014090, 327830, 690146, 3886,
         366202, 728518, 42258, 404574, 766890, 80630, 442946,
         805262, 119002, 481318, 843634, 157374, 519690, 882006,
         195746, 558062, 920378, 234118, 596434, 958750, 272490,
         634806, 997122, 310862, 673178, 1035494, 349234, 711550,
         25290, 387606, 749922, 63662, 425978, 788294, 102034,
         464350, 826666, 140406, 502722, 865038, 178778, 541094,
         903410, 217150, 579466, 941782, 255522, 617838, 980154,
         293894, 656210, 1018526, 332266, 694582, 8322, 370638,
         732954, 46694, 409010, 771326, 85066, 447382, 809698,
         123438, 485754, 848070, 161810, 524126, 886442, 200182,
         562498, 924814, 238554, 600870, 963186, 276926, 639242,
         1001558, 315298, 677614, 1039930, 353670, 715986, 29726,
         392042, 754358, 68098, 430414, 792730, 106470, 468786,
         831102, 144842, 507158, 869474, 183214, 545530, 907846,
         221586, 583902, 946218, 259958, 622274, 984590, 298330,
         660646, 1022962, 336702, 699018, 12758, 375074, 737390,
         51130, 413446]
Example 11
def test_unicrap():
    bf = WritingBloomFilter(100000, 0.1)
    assert u'\u2019' not in bf
    assert u'\u2018' not in bf

    bf.add(u'\u2018')
    bf.add(u'\u2019')

    bf.add('just a plain string')

    assert u'\u2019' in bf
    assert u'\u2018' in bf
    assert 'just a plain string' in bf

    assert bf[u'\u2019'] == 1
    assert bf[u'\u2018'] == 1
    assert bf['just a plain string'] == 1
Example 12
 def setup(self):
     import struct
     if 8 * struct.calcsize("P") == 32:
         raise SkipTest("Skip HugeBloom tests on 32-bit platforms")
     self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)
Example 13
        to_insert = set()
        for i in range(len(sketches)):
            for kmer_index in range(len(sketches[i]._kmers)):
                kmer = sketches[i]._kmers[kmer_index]
                to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)
                              )  # format here is kmer+x+hash_index+kmer_index
        tree = mt.Trie(to_insert)
        tree.save(streaming_database_file)
    else:
        tree = mt.Trie()
        tree.load(streaming_database_file)

    # all the k-mers of interest in a set (as a pre-filter)
    if not hydra_file:  # create one
        try:
            all_kmers_bf = WritingBloomFilter(
                len(sketches) * len(k_range) * num_hashes * 2, 0.01)
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # put all the k-mers and the appropriate suffixes in
                        all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize])
                        )  # also add the reverse complement
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
Example 14
"""
This is a speed profiler for insertion and lookup
"""
import cProfile
from hydra import WritingBloomFilter
from helpers import KeyGenerator

keygen = KeyGenerator()
input_keys = [keygen.random_string() for i in range(100000)]
other_keys = [keygen.random_string() for i in range(200000)]

ELEMENTS = 10000000
MAX_FAILURE_RATE = 0.1
bf = WritingBloomFilter(ELEMENTS, MAX_FAILURE_RATE)


def test_one():
    # insertion: setting an item adds the key to the filter
    for key in input_keys:
        bf[key] = 0

    # lookup: the membership result is deliberately discarded, only the timing matters
    for key in other_keys:
        key in bf


cProfile.run('test_one()')
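If a sorted report is preferred over the raw cProfile.run dump, the same benchmark can go through the profiler API instead; a small, standard-library-only variant sketch that reuses cProfile, test_one, and the filters defined above:

import io
import pstats

profiler = cProfile.Profile()
profiler.enable()
test_one()
profiler.disable()

report = io.StringIO()
pstats.Stats(profiler, stream=report).sort_stats("cumulative").print_stats(10)
print(report.getvalue())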
Example 15
 def setup(self):
     self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)
Example 16
class TestBloomFilter(object):
    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def _test_false_positives(self, bf, keys, otherkeys):
        fp = 0

        assert len(keys) == len(otherkeys)

        for key in keys:
            bf.add(key)

        for key in otherkeys:
            if bf.contains(key):
                fp += 1

        bucketsPerElement = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(bucketsPerElement,
                                                   self.MAX_FAILURE_RATE)

        fp_ratio = fp / (
            len(keys) *
            BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]) * 100
        assert fp_ratio < 103.25, "Over 103.25% of the maximum expected " \
            "false positives found. {:0.3f}%".format(fp_ratio)
        print("OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio))

        # False negatives never occur - this should always work
        for k in keys:
            assert bf.contains(k)

    def test_bloom_limits1(self):
        maxBuckets = len(BloomCalculations.PROBS) - 1
        maxK = len(BloomCalculations.PROBS[maxBuckets]) - 1

        # possible
        BloomCalculations.computeBloomSpec2(
            maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK])

        # impossible, throws
        try:
            BloomCalculations.computeBloomSpec2(
                maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK] / 2)
            raise RuntimeError
        except UnsupportedOperationException:
            pass

    def test_one(self):
        self.bf.add("a")
        self.bf["aa"] = 0
        assert self.bf.contains("a")
        assert "aa" in self.bf
        assert not self.bf.contains("b")
        assert "b" not in self.bf

    def test_false_positives_int(self):
        keygen = KeyGenerator()
        self._test_false_positives(self.bf, [str(x) for x in range(10000)],
                                   keygen.randomKeys(10000))

    def test_false_positives_random(self):
        keygen1 = KeyGenerator(314159)
        self._test_false_positives(
            self.bf,
            [keygen1.random_string() for i in range(10000)],
            [keygen1.random_string() for i in range(10000)],
        )

    def test_words(self):
        keygen1 = KeyGenerator()
        bf = WritingBloomFilter(len(keygen1) / 2,
                                self.MAX_FAILURE_RATE,
                                ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._test_false_positives(bf, even_keys, odd_keys)

    def test_null_keys(self):
        assert 'foo' not in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo')

        assert 'foo' in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0bar')

        assert 'foo\0bar' in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0baz')

        assert 'foo\0baz' in self.bf
Example 17
import timeit
import random
import numpy as np
import os

from hydra import WritingBloomFilter

#num_in_bloom = 1000*100
num_in_bloom = 10000000

# delete the old bloom filter so we don't try to stuff more things into it
try:
    os.remove("test.bloom")
    os.remove("test.bloom.desc")
except OSError:
    pass

bloom = WritingBloomFilter(num_in_bloom, 0.01, filename="test.bloom")

# Read it back in
#bloom = ReadingBloomFilter("test.bloom")

# If you want to add the real k-mers
#CEs=MH.import_multiple_from_single_hdf5('/home/dkoslicki/Desktop/CMash/data/SmallGenomes.h5')
#all_kmers = set()
#for CE in CEs:
#	for kmer in CE._kmers:
#		all_kmers.add("%s" % kmer)
#for CE in CEs:
#	for kmer in CE._kmers:
# 		bloom.add("%s" % kmer)

# If you want to add random kmers
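The excerpt stops where the random-k-mer branch would begin; below is a minimal sketch of what that loop could look like, where the k-mer length of 21 and the use of the random module imported above are assumptions, not values taken from the original script.

ksize = 21  # assumed k-mer length, not specified in the original
for _ in range(num_in_bloom):
    kmer = ''.join(random.choice("ACGT") for _ in range(ksize))
    bloom.add(kmer)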
Example 18
class TestBloomFilter(object):
    ELEMENTS = 10000
    MAX_FAILURE_RATE = 0.1

    def setup(self):
        self.bf = WritingBloomFilter(self.ELEMENTS, self.MAX_FAILURE_RATE)

    def _testFalsePositives(self, filter, keys, otherkeys):
        fp = 0

        assert len(keys) == len(otherkeys)

        for key in keys:
            filter.add(key)

        for key in otherkeys:
            if filter.contains(key):
                fp += 1

        bucketsPerElement = BloomFilter._maxBucketsPerElement(self.ELEMENTS)
        spec = BloomCalculations.computeBloomSpec2(
            bucketsPerElement, self.MAX_FAILURE_RATE)

        fp_ratio = fp / (
            len(keys) *
            BloomCalculations.PROBS[spec.bucketsPerElement][spec.K]) * 100
        assert fp_ratio < 103.25, "Over 103.25% of the maximum expected " \
            "false positives found. {:0.3f}%".format(fp_ratio)
        print("OK: Got {:0.3f}% of the expected false positives ".format(
            fp_ratio))

        # False negatives never occur - this should always work
        for k in keys:
            assert filter.contains(k)

    def testBloomLimits1(self):
        maxBuckets = len(BloomCalculations.PROBS) - 1
        maxK = len(BloomCalculations.PROBS[maxBuckets]) - 1

        # possible
        BloomCalculations.computeBloomSpec2(
            maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK])

        # impossible, throws
        try:
            BloomCalculations.computeBloomSpec2(
                maxBuckets, BloomCalculations.PROBS[maxBuckets][maxK] / 2)
            raise RuntimeError
        except UnsupportedOperationException:
            pass

    def test_one(self):
        self.bf.add("a")
        assert self.bf.contains("a")
        assert not self.bf.contains("b")

    def testFalsePositivesInt(self):
        keygen = KeyGenerator()
        self._testFalsePositives(self.bf,
                                 [str(x) for x in range(10000)],
                                 keygen.randomKeys(10000))

    def testFalsePositivesRandom(self):
        keygen1 = KeyGenerator(314159)
        self._testFalsePositives(
            self.bf,
            [keygen1.random_string() for i in range(10000)],
            [keygen1.random_string() for i in range(10000)],)

    def testWords(self):
        keygen1 = KeyGenerator()
        bf = WritingBloomFilter(
            len(keygen1)/2, self.MAX_FAILURE_RATE, ignore_case=False)

        even_keys = keygen1[::2]
        odd_keys = keygen1[1::2]
        self._testFalsePositives(bf, even_keys, odd_keys)

    def testNullKeys(self):
        assert 'foo' not in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo')

        assert 'foo' in self.bf
        assert 'foo\0bar' not in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0bar')

        assert 'foo\0bar' in self.bf
        assert 'foo\0baz' not in self.bf

        self.bf.add('foo\0baz')

        assert 'foo\0baz' in self.bf
Example 19
        to_insert = set()
        for i in range(len(sketches)):
            for kmer_index in range(len(sketches[i]._kmers)):
                kmer = sketches[i]._kmers[kmer_index]
                to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)
                              )  # format here is kmer+x+hash_index+kmer_index
        tree = mt.Trie(to_insert)
        tree.save(streaming_database_file)
    else:
        tree = mt.Trie()
        tree.load(streaming_database_file)

    # all the k-mers of interest in a set (as a pre-filter)
    if not hydra_file:  # create one
        try:
            all_kmers_bf = WritingBloomFilter(
                len(sketches) * len(k_range) * num_hashes, 0.01)
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # put all the k-mers and the appropriate suffixes in
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            all_kmers_bf = ReadingBloomFilter(hydra_file)
        except IOError:
            print("No such file or directory/error opening file: %s" %