def convert_index(infile, config, num_samples):
    """Build a kmer signature index from the graph database under *infile*.

    Opens ``infile + "/graph"`` read-only, records the bloom filter size
    (``config["m"]``) and the number of hash functions (``config["h"]``) in
    the storage returned by ``get_storage(config)``, then fills a
    ``BitMatrix`` in that storage from the rows produced by ``get_rows``.

    Args:
        infile: Path prefix; the database is read from ``infile + "/graph"``.
        config: Mapping with at least the keys ``"m"`` and ``"h"``; it is
            also handed to ``get_storage`` unchanged.
        num_samples: Number of columns of the created bit matrix.

    Returns:
        None. The result lives in the configured storage.
    """
    # NOTE(review): `db` looks like a Berkeley DB binding (bsddb3-style API);
    # confirm against the file's imports.
    in_graph = db.DB()
    try:
        in_graph.set_cachesize(4, 0)
        in_graph.open(infile + "/graph", flags=db.DB_RDONLY)
        # Create the kmer signature index
        storage = get_storage(config)
        bloomfilter_size = config["m"]
        storage.set_integer(BLOOMFILTER_SIZE_KEY, bloomfilter_size)
        storage.set_integer(NUM_HASH_FUNCTS_KEY, config["h"])
        BitMatrix.create(
            storage=storage,
            rows=get_rows(in_graph, bloomfilter_size),
            num_rows=bloomfilter_size,
            num_cols=num_samples,
        )
    finally:
        # Release the handle even when opening or the conversion fails, so an
        # error does not leak the database handle.
        in_graph.close()
def create(cls, storage, bloomfilters, bloomfilter_size, num_hashes, lowmem=False):
    """Populate *storage* from per-sample bloom filters and return an index.

    Args:
        storage: Backend that receives the two integer settings and the
            bit-matrix rows.
        bloomfilters: Iterable of ``BloomFilter`` objects and/or raw bit
            arrays; ``BloomFilter`` instances are replaced by their
            ``.bitarray`` attribute.
        bloomfilter_size: Stored under ``BLOOMFILTER_SIZE_KEY`` and used as
            the number of rows of the bit matrix.
        num_hashes: Stored under ``NUM_HASH_FUNCTS_KEY``.
        lowmem: Forwarded to ``transpose``.

    Returns:
        ``cls(storage)``, a new instance bound to the populated storage.
    """
    # Materialise once: the sequence is both transposed and counted below.
    bloomfilters = [
        bf.bitarray if isinstance(bf, BloomFilter) else bf for bf in bloomfilters
    ]
    storage.set_integer(BLOOMFILTER_SIZE_KEY, bloomfilter_size)
    storage.set_integer(NUM_HASH_FUNCTS_KEY, num_hashes)
    logger.debug("Transpose bitarrays")
    rows = transpose(bloomfilters, lowmem=lowmem)
    logger.debug("Insert rows")
    # Called for its effect on `storage`; the returned matrix is not needed
    # because `cls(storage)` builds its own view of the same storage.
    BitMatrix.create(
        storage, rows, num_rows=bloomfilter_size, num_cols=len(bloomfilters)
    )
    return cls(storage)
def test_get_insert_column():
    """Check column reads and column inserts on every storage backend."""
    pattern = [bitarray(bits) for bits in ("001", "001", "111", "001", "111")]
    # Repeat the five-row pattern five times -> 25 rows of 3 bits each.
    rows = pattern * 5

    for backend in get_storages():
        backend.delete_all()
        matrix = BitMatrix.create(backend, rows, len(rows), len(rows[0]))

        # Column 0 as laid out by the fixture rows.
        assert matrix.get_column(0) == bitarray("00101" * 5)

        # Writing at an existing index replaces that column (row stays 3 wide).
        matrix.insert_column(bitarray("1" * 25), 0)
        assert matrix.get_column(0) == bitarray("1" * 25)
        assert matrix.get_row(1) == bitarray("101")

        # Writing one past the last column widens every row to 4 bits.
        matrix.insert_column(bitarray("1" * 25), 3)
        assert matrix.get_column(3) == bitarray("1" * 25)
        assert matrix.get_row(1) == bitarray("1011")
def test_get_set():
    """Check bulk row writes and row/column reads on every storage backend."""
    pattern = [bitarray(bits) for bits in ("001", "001", "111", "001", "111")]
    # Repeat the five-row pattern five times -> 25 rows of 3 bits each.
    rows = pattern * 5

    for backend in get_storages():
        backend.delete_all()
        matrix = BitMatrix.create(backend, rows, len(rows), len(rows[0]))
        matrix.set_rows(range(25), rows)

        # Rows come back in the order they were requested.
        first_three = list(matrix.get_rows(range(3)))
        assert first_three == rows[:3]

        # Single-column reads.
        assert matrix.get_column(0) == bitarray("00101" * 5)
        assert matrix.get_column(2) == bitarray("1" * 25)

        # Multi-column read yields the same columns, in request order.
        both = list(matrix.get_columns([0, 2]))
        assert both == [
            bitarray("00101" * 5),
            bitarray("1" * 25),
        ]
def __init__(self, storage):
    """Bind the index to *storage* and load its persisted settings.

    Wraps the storage in a ``BitMatrix`` and reads the bloom filter size and
    the number of hash functions back from the storage's integer keys.
    """
    self.storage = storage
    self.bitmatrix = BitMatrix(storage)
    read_setting = storage.get_integer
    self.bloomfilter_size = read_setting(BLOOMFILTER_SIZE_KEY)
    self.num_hashes = read_setting(NUM_HASH_FUNCTS_KEY)
class KmerSignatureIndex:
    """
    Methods for managing kmer signature indexes

    The index is a ``BitMatrix`` kept in ``storage``. ``bloomfilter_size`` is
    used as the number of rows; columns are added one bloom filter at a time
    (see ``create`` and ``insert_bloom``).
    """

    def __init__(self, storage):
        """Bind to *storage* and read the bloom filter size and hash count."""
        self.storage = storage
        self.bitmatrix = BitMatrix(storage)
        self.bloomfilter_size = storage.get_integer(BLOOMFILTER_SIZE_KEY)
        self.num_hashes = storage.get_integer(NUM_HASH_FUNCTS_KEY)

    @classmethod
    def create(cls, storage, bloomfilters, bloomfilter_size, num_hashes, lowmem=False):
        """Populate *storage* from per-sample bloom filters and return an index.

        Args:
            storage: Backend that receives the settings and the matrix rows.
            bloomfilters: Iterable of ``BloomFilter`` objects and/or raw bit
                arrays; ``BloomFilter`` instances are replaced by their
                ``.bitarray`` attribute.
            bloomfilter_size: Persisted setting, also the matrix row count.
            num_hashes: Persisted number of hash functions.
            lowmem: Forwarded to ``transpose``.

        Returns:
            ``cls(storage)``.
        """
        # Materialise once: the sequence is both transposed and counted below.
        bloomfilters = [
            bf.bitarray if isinstance(bf, BloomFilter) else bf for bf in bloomfilters
        ]
        storage.set_integer(BLOOMFILTER_SIZE_KEY, bloomfilter_size)
        storage.set_integer(NUM_HASH_FUNCTS_KEY, num_hashes)
        logger.debug("Transpose bitarrays")
        rows = transpose(bloomfilters, lowmem=lowmem)
        logger.debug("Insert rows")
        # Called for its effect on `storage`; `cls(storage)` re-wraps it.
        BitMatrix.create(
            storage, rows, num_rows=bloomfilter_size, num_cols=len(bloomfilters)
        )
        return cls(storage)

    def lookup(self, kmers, remove_trailing_zeros=True):
        """Look up one kmer or an iterable of kmers.

        Args:
            kmers: A single kmer string or an iterable of kmer strings;
                duplicates are collapsed.
            remove_trailing_zeros: Forwarded to ``BitMatrix.get_rows``.

        Returns:
            Dict mapping each distinct query kmer to the result of
            ``bitwise_and`` over the matrix rows selected by its hashes.
        """
        if isinstance(kmers, str):
            kmers = [kmers]
        # De-duplication happens inside __kmers_to_hashes.
        kmer_to_hashes = self.__kmers_to_hashes(kmers)
        # Fetch every distinct row once, even when kmers share hashes.
        hashes = {h for sublist in kmer_to_hashes.values() for h in sublist}
        rows = self.__batch_get_rows(hashes, remove_trailing_zeros)
        return self.__bitwise_and_kmers(kmer_to_hashes, rows)

    def insert_bloom(self, bloomfilter, column_index):
        """Write *bloomfilter* as column *column_index* of the bit matrix."""
        self.bitmatrix.insert_column(bloomfilter, column_index)

    def merge_indexes(self, ksi):
        """Append the columns of *ksi* to the right of this index, in place.

        Args:
            ksi: Another index exposing ``bloomfilter_size`` and ``bitmatrix``.

        Raises:
            ValueError: If the two indexes have different bloom filter sizes;
                their rows would not correspond to the same hash values.
        """
        if ksi.bloomfilter_size != self.bloomfilter_size:
            raise ValueError(
                "Cannot merge indexes with different bloom filter sizes: "
                "%s != %s" % (self.bloomfilter_size, ksi.bloomfilter_size)
            )
        for i in range(self.bloomfilter_size):
            merged_row = self.bitmatrix.get_row(i)
            merged_row.extend(ksi.bitmatrix.get_row(i))
            self.bitmatrix.set_row(i, merged_row)
        # Update the column count only after every row has been widened.
        self.bitmatrix.set_num_cols(self.bitmatrix.num_cols + ksi.bitmatrix.num_cols)

    def __kmers_to_hashes(self, kmers):
        """Map each distinct query kmer to the set of its hash values."""
        ## use canonical kmer to generate lookup, but report query kmer
        return {
            k: set(
                generate_hashes(
                    convert_query_kmer(k), self.num_hashes, self.bloomfilter_size
                )
            )
            for k in set(kmers)
        }

    def __batch_get_rows(self, row_indexes, remove_trailing_zeros=False):
        """Fetch the given rows in one call and return ``{row_index: row}``."""
        # Materialise so the same ordering feeds both the fetch and the keys,
        # and so a one-shot iterator is not consumed twice.
        row_indexes = list(row_indexes)
        fetched = self.bitmatrix.get_rows(
            row_indexes, remove_trailing_zeros=remove_trailing_zeros
        )
        return dict(zip(row_indexes, fetched))

    def __bitwise_and_kmers(self, kmer_to_hashes, rows):
        """AND together, per kmer, the rows selected by that kmer's hashes."""
        return {
            k: bitwise_and([rows[h] for h in hashes])
            for k, hashes in kmer_to_hashes.items()
        }