def test_merge():
    """Merge two indexes built from the same bloom filters and check lookups.

    For every storage backend returned by ``get_storages()``, two indexes
    are created from the same pair of bloom filters and the second is
    merged into the first. Each expected row is then the two-sample
    pattern repeated twice.
    """
    size = 250
    num_hashes = 1
    sample_kmers = [
        ["ATC", "ATG", "ATA", "ATT"],
        ["ATC", "ATG", "ATA", "TTT"],
    ]
    # One bloom filter per sample, built in sample order.
    filters = [
        BloomFilter(size, num_hashes).update(convert_query_kmers(kmers))
        for kmers in sample_kmers
    ]

    # (query, expected lookup result) pairs, checked in this order.
    cases = [
        (["ATC"], {"ATC": bitarray("11" * 2)}),
        (
            ["ATC", "ATC", "ATT"],
            {
                "ATC": bitarray("11" * 2),
                "ATT": bitarray("10" * 2),
            },
        ),
        (
            ["ATC", "ATC", "ATT", "TTT"],
            {
                "ATC": bitarray("11" * 2),
                "ATT": bitarray("10" * 2),
                "TTT": bitarray("01" * 2),
            },
        ),
    ]

    for storage in get_storages():
        # Start every backend from an empty state.
        storage.delete_all()
        merged = KmerSignatureIndex.create(storage, filters, size, num_hashes)
        other = KmerSignatureIndex.create(storage, filters, size, num_hashes)
        merged.merge_indexes(other)

        for query, expected in cases:
            assert merged.lookup(query) == expected
def __init__(self, config=None):
    """Initialise the instance from ``config``.

    Args:
        config: Configuration passed to ``get_storage``. When ``None``,
            ``DEFAULT_CONFIG`` is used instead.
    """
    chosen_config = DEFAULT_CONFIG if config is None else config
    self.config = chosen_config
    self.storage = get_storage(chosen_config)

    # Both parent initialisers are called explicitly on the same storage,
    # sample metadata first and the signature index second.
    SampleMetadata.__init__(self, self.storage)
    KmerSignatureIndex.__init__(self, self.storage)

    # TODO: this can be inferred and set at build time.
    self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY

    # NOTE(review): num_samples is presumably provided by one of the parent
    # classes initialised above - confirm.
    self.scorer = Scorer(self.num_samples)
def test_lookup1():
    """Create a two-sample index on each storage backend and check lookups.

    The index is created from the raw ``bitarray`` of each bloom filter and
    then re-opened through ``KmerSignatureIndex(storage)`` before querying.
    Each expected row has one bit per sample.
    """
    size = 250
    num_hashes = 3

    # NOTE(review): the original marked this conversion "canonical";
    # convert_query_kmers presumably canonicalises the k-mers - confirm.
    first_filter = BloomFilter(size, num_hashes).update(
        convert_query_kmers(["ATC", "ATG", "ATA", "ATT"])
    )
    second_filter = BloomFilter(size, num_hashes).update(
        convert_query_kmers(["ATC", "ATG", "ATA", "TTT"])
    )
    columns = [first_filter.bitarray, second_filter.bitarray]

    for storage in get_storages():
        # Start every backend from an empty state.
        storage.delete_all()
        KmerSignatureIndex.create(storage, columns, size, num_hashes)
        index = KmerSignatureIndex(storage)

        # Single k-mer present in both samples.
        single_hit = index.lookup(["ATC"])
        assert single_hit == {"ATC": bitarray("11")}

        # A repeated k-mer in the query yields a single key in the result.
        print(index.lookup(["ATC", "ATC", "ATT"]))
        repeated_hit = index.lookup(["ATC", "ATC", "ATT"])
        assert repeated_hit == {
            "ATC": bitarray("11"),
            "ATT": bitarray("10"),
        }

        # "ATT" is only in the first sample, "TTT" only in the second.
        mixed_hit = index.lookup(["ATC", "ATC", "ATT", "TTT"])
        assert mixed_hit == {
            "ATC": bitarray("11"),
            "ATT": bitarray("10"),
            "TTT": bitarray("01"),
        }
def build(cls, config, bloomfilters, samples):
    """Build an index from ``bloomfilters`` and ``samples`` and return it.

    The storage described by ``config`` is opened, the inputs are
    validated, the sample metadata and the k-mer signature index are
    written, and the storage is closed again before a fresh instance is
    constructed from the same ``config``.

    Args:
        cls: Class to instantiate once the build has finished.
        config: Mapping passed to ``get_storage``. ``config["m"]`` and
            ``config["h"]`` are forwarded to ``KmerSignatureIndex.create``;
            the optional ``"low_mem_build"`` key defaults to ``False``.
        bloomfilters: Bloom filters forwarded to
            ``KmerSignatureIndex.create``.
        samples: Samples forwarded to ``SampleMetadata.add_samples``.

    Returns:
        ``cls(config)``, constructed after the build storage is closed.

    Raises:
        KeyError: If ``config`` lacks ``"m"`` or ``"h"``.
        Exception: Anything raised by validation or by the storage writes
            propagates unchanged, after the storage has been closed.
    """
    storage = get_storage(config)
    try:
        validate_build_params(bloomfilters, samples)

        logger.debug("Insert sample metadata")
        SampleMetadata(storage).add_samples(samples)

        logger.debug("Create signature index")
        KmerSignatureIndex.create(
            storage,
            bloomfilters,
            config["m"],
            config["h"],
            config.get("low_mem_build", False),
        )
    finally:
        # Close on failure too: the LOCK files need to be deleted before
        # the storage can be re-initialised, so a failed build must not
        # leave it open.
        storage.close()
    return cls(config)