def __init__(self, db=DEFUALT_DB_DIRECTORY, cachesize=1, nproc=0, mode="c"):
    """Open an existing BIGSI at *db*.

    Loads the "metadata" Berkeley DB store, reads the core index
    parameters (stored as 4-byte big-endian integers), and opens the
    bitsliced graph store.

    Parameters:
        db: directory containing the BIGSI stores.
        cachesize: accepted for interface compatibility; not used here.
        nproc: number of worker processes for parallel unpacking.
        mode: Berkeley DB open mode (e.g. "c" create, "r" read).

    Raises:
        OSError: when the directory is unreadable (DBError) or no BIGSI
            exists at *db* (DBNoSuchFileError).
    """
    self.mode = mode
    self.nproc = nproc
    self.db = db
    try:
        self.metadata = self.load_metadata(mode)
    except (bsddb3.db.DBNoSuchFileError, bsddb3.db.DBError) as e:
        print(e)
        if isinstance(e, bsddb3.db.DBError):
            raise OSError(
                "You don't have permission to access this directory %s ." % self.db)
        else:
            raise OSError(
                "Cannot find a BIGSI at %s. Run `bigsi init` or BIGSI.create()" % db)
    # FIX: the original re-opened the metadata store in an `else:` branch,
    # discarding (and leaking) the handle already opened in the `try`.
    # Since the `except` branch always raises, the remaining setup can run
    # unconditionally against the single handle.
    self.bloom_filter_size = int.from_bytes(
        self.metadata['bloom_filter_size'], 'big')
    self.num_hashes = int.from_bytes(self.metadata['num_hashes'], 'big')
    self.kmer_size = int.from_bytes(self.metadata['kmer_size'], 'big')
    self.scorer = Scorer(self.get_num_colours())
    self.graph = ProbabilisticBerkeleyDBStorage(
        filename=self.graph_filename,
        bloom_filter_size=self.bloom_filter_size,
        num_hashes=self.num_hashes,
        mode=mode)
    self.graph.sync()
    self.metadata.sync()
def __init__(self, config=None):
    """Initialise the index from *config* (falls back to DEFAULT_CONFIG).

    Both parent classes are initialised against the same storage handle
    returned by get_storage().
    """
    self.config = DEFAULT_CONFIG if config is None else config
    self.storage = get_storage(self.config)
    SampleMetadata.__init__(self, self.storage)
    KmerSignatureIndex.__init__(self, self.storage)
    ## TODO this can be inferred and set at build time
    self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
    self.scorer = Scorer(self.num_samples)
def test_score():
    """Scorer.score on a fixed presence/absence column yields known stats."""
    bit_string = "1111111111111111111111111111111111111111110000000000000000000000000000001111111111111111111111100000000000000000000100000010001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000000000000000100000000010001111111111000000000000100000000000000000000000000000000100000000000010000000010000001000000000010000000000000000010001111111100000000000001100010000000000000000000001000000000000110000000000000000000000100000000000000000000100000000000000001010001111111111100000000000000000000100100010011111111111111111100000000001001000001000000000000000000000000000001000000010100000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111100000010110001000100000000000000000000000000000000000001000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100010000000100000000001010000001111111111111111111111111111111111111111111111111111111111111111100100000000010000000010000000001111111111111111111111111111111111111111111111111111111111111111111111100000100000000000010000000000000010000000011111111000000100010"
    scorer = Scorer(5 * 10**5)
    expected = {
        'length': 1174,
        'max_mismatches': 269,
        'max_nident': 1156,
        'max_pident': 98.46678023850085,
        'max_score': 1119.98,
        'min_mismatches': 18,
        'min_nident': 905,
        'min_pident': 77.08688245315162,
        'min_score': 96.04,
        'mismatches': 33,
        'nident': 1141,
        'pident': 97.18909710391823,
        'score': 1064.89,
        'evalue': 0.0,
        'pvalue': 0.0,
        'log_evalue': -1407.74,
        'log_pvalue': -1407.74,
    }
    assert scorer.score(bit_string) == expected
def test_score():
    """Scoring a reference kmer-presence column reproduces the expected metrics."""
    query_column = "1111111111111111111111111111111111111111110000000000000000000000000000001111111111111111111111100000000000000000000100000010001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000000000000000100000000010001111111111000000000000100000000000000000000000000000000100000000000010000000010000001000000000010000000000000000010001111111100000000000001100010000000000000000000001000000000000110000000000000000000000100000000000000000000100000000000000001010001111111111100000000000000000000100100010011111111111111111100000000001001000001000000000000000000000000000001000000010100000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111100000010110001000100000000000000000000000000000000000001000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100010000000100000000001010000001111111111111111111111111111111111111111111111111111111111111111100100000000010000000010000000001111111111111111111111111111111111111111111111111111111111111111111111100000100000000000010000000000000010000000011111111000000100010"
    result = Scorer(500000).score(query_column)
    expected = dict(
        length=1174,
        max_mismatches=269,
        max_nident=1156,
        max_pident=98.46678023850085,
        max_score=1119.98,
        min_mismatches=18,
        min_nident=905,
        min_pident=77.08688245315162,
        min_score=96.04,
        mismatches=33,
        nident=1141,
        pident=97.18909710391823,
        score=1064.89,
        evalue=0.0,
        pvalue=0.0,
        log_evalue=-1407.74,
        log_pvalue=-1407.74,
    )
    assert result == expected
class BIGSI(SampleMetadata, KmerSignatureIndex):
    """BItsliced Genomic Signature Index.

    Combines per-sample metadata (SampleMetadata) with the bitsliced
    kmer signature matrix (KmerSignatureIndex); both parent classes
    operate on the same storage handle produced by get_storage(config).
    """

    def __init__(self, config=None):
        # Fall back to the module-level default configuration.
        if config is None:
            config = DEFAULT_CONFIG
        self.config = config
        self.storage = get_storage(config)
        # Both parents share a single storage backend.
        SampleMetadata.__init__(self, self.storage)
        KmerSignatureIndex.__init__(self, self.storage)
        self.min_unique_kmers_in_query = (
            MIN_UNIQUE_KMERS_IN_QUERY
        )  ## TODO this can be inferred and set at build time
        self.scorer = Scorer(self.num_samples)

    @property
    def kmer_size(self):
        # Kmer length "k" comes straight from the config mapping.
        return self.config["k"]

    @property
    def nproc(self):
        # Worker-process count; DEFAULT_NPROC when the config does not set it.
        return self.config.get("nproc", DEFAULT_NPROC)

    @classmethod
    def bloom(cls, config, kmers):
        """Build and return a bloom-filter bitarray for *kmers*
        using config["m"] bits and config["h"] hash functions."""
        kmers = convert_query_kmers(kmers)  ## Convert to canonical kmers
        bloomfilter = BloomFilter(m=config["m"], h=config["h"])
        bloomfilter.update(kmers)
        return bloomfilter.bitarray

    @classmethod
    def build(cls, config, bloomfilters, samples):
        """Create a new index from one bloom filter per sample; returns
        a freshly opened BIGSI instance."""
        storage = get_storage(config)
        validate_build_params(bloomfilters, samples)
        logger.debug("Insert sample metadata")
        sm = SampleMetadata(storage).add_samples(samples)
        logger.debug("Create signature index")
        ksi = KmerSignatureIndex.create(
            storage,
            bloomfilters,
            config["m"],
            config["h"],
            config.get("low_mem_build", False),
        )
        storage.close()  ## Need to delete LOCK files before re init
        return cls(config)

    def search(self, seq, threshold=1.0, score=False):
        """Search the index for *seq*.

        threshold: fraction of the query's unique kmers that must be
        present in a sample (1.0 = exact AND query). When *score* is
        truthy, alignment-style scores are attached to each result.
        Returns a list of result dicts, excluding the special deletion
        placeholder sample.
        """
        self.__validate_search_query(seq)
        assert threshold <= 1
        kmers = list(self.seq_to_kmers(seq))
        kmers_to_colours = self.lookup(kmers, remove_trailing_zeros=False)
        min_kmers = math.ceil(len(set(kmers)) * threshold)
        if threshold == 1.0:
            results = self.exact_filter(kmers_to_colours)
        else:
            results = self.inexact_filter(kmers_to_colours, min_kmers)
        if score:
            self.score(kmers, kmers_to_colours, results)
        return [
            r.todict()
            for r in results
            if not r.sample_name == DELETION_SPECIAL_SAMPLE_NAME
        ]

    def exact_filter(self, kmers_to_colours):
        """Results for colours containing ALL query kmers (bitwise AND
        across the kmer rows)."""
        colours_with_all_kmers = non_zero_bitarrary_positions(
            bitwise_and(kmers_to_colours.values())
        )
        samples = self.get_sample_list(colours_with_all_kmers)
        return [
            BigsiQueryResult(
                colour=c,
                sample_name=s,
                num_kmers=len(kmers_to_colours),
                num_kmers_found=len(kmers_to_colours),
            )
            for c, s in zip(colours_with_all_kmers, samples)
        ]

    def get_sample_list(self, colours):
        # Map colour ids to sample names, preserving input order.
        colours_to_samples = self.colours_to_samples(colours)
        return [colours_to_samples[i] for i in colours]

    def inexact_filter(self, kmers_to_colours, min_kmers):
        """Results for colours containing at least *min_kmers* query
        kmers, sorted by number of kmers found (descending)."""
        num_kmers = unpack_and_sum_bitarrays(
            list(kmers_to_colours.values()), self.nproc
        )
        colours = range(self.num_samples)
        colours_to_kmers_found = dict(zip(colours, num_kmers))
        colours_to_kmers_found_above_threshold = self.__colours_above_threshold(
            colours_to_kmers_found, min_kmers
        )
        results = [
            BigsiQueryResult(
                colour=colour,
                sample_name=self.colour_to_sample(colour),
                num_kmers_found=int(num_kmers_found),
                num_kmers=len(kmers_to_colours),
            )
            for colour, num_kmers_found in colours_to_kmers_found_above_threshold.items()
        ]
        results.sort(key=lambda x: x.num_kmers_found, reverse=True)
        return results

    def score(self, kmers, kmers_to_colours, results):
        """Attach scorer output to each result in place."""
        rows = [kmers_to_colours[kmer] for kmer in kmers]
        X = unpack_and_cat_bitarrays(rows, self.nproc)
        for res in results:
            # Column = presence/absence string of each query kmer for this colour.
            col = "".join([str(i) for i in X[:, res.colour].tolist()])
            score_results = self.scorer.score(col)
            score_results["kmer-presence"] = col
            res.add_score(score_results)

    def __colours_above_threshold(self, colours_to_percent_kmers, min_kmers):
        # Keep only colours with at least min_kmers query kmers present.
        return {k: v for k, v in colours_to_percent_kmers.items() if v >= min_kmers}

    def insert(self, bloomfilter, sample):
        """Insert a single sample's bloom filter (slow path)."""
        logger.warning("Build and merge is preferable to insert in most cases")
        colour = self.add_sample(sample)
        self.insert_bloom(bloomfilter, colour - 1)

    def delete(self):
        # Remove the entire index from storage.
        self.storage.delete_all()

    def __validate_merge(self, bigsi):
        # Indexes are only mergeable when their core parameters agree.
        assert self.bloomfilter_size == bigsi.bloomfilter_size
        assert self.num_hashes == bigsi.num_hashes
        assert self.kmer_size == bigsi.kmer_size

    def merge(self, bigsi):
        """Merge another BIGSI with identical m/h/k parameters into this one."""
        self.__validate_merge(bigsi)
        self.merge_indexes(bigsi)
        self.merge_metadata(bigsi)

    def __validate_search_query(self, seq):
        """Warn (do not fail) when the query has too few unique kmers."""
        kmers = set()
        for k in self.seq_to_kmers(seq):
            kmers.add(k)
            # Early exit as soon as enough unique kmers have been seen;
            # the for/else only fires when the loop runs to completion.
            if len(kmers) > self.min_unique_kmers_in_query:
                return True
        else:
            logger.warning(
                "Query string should contain at least %i unique kmers. Your query contained %i unique kmers, and as a result the false discovery rate may be high. In future this will become an error."
                % (self.min_unique_kmers_in_query, len(kmers))
            )

    def seq_to_kmers(self, seq):
        # Delegate to the module-level helper with this index's kmer size.
        return seq_to_kmers(seq, self.kmer_size)
class BIGSI(object):
    """Legacy Berkeley-DB backed BIGSI index.

    Persists two stores inside the directory ``self.db``:
      * "metadata" — a BerkeleyDBStorage holding index parameters
        (4-byte big-endian ints) and the sample<->colour mappings
      * "graph" — a ProbabilisticBerkeleyDBStorage holding the
        bitsliced bloom-filter matrix
    """

    def __init__(self, db=DEFUALT_DB_DIRECTORY, cachesize=1, nproc=0, mode="c"):
        # NOTE(review): `cachesize` is accepted but never used here.
        self.mode = mode
        self.nproc = nproc
        self.db = db
        try:
            self.metadata = self.load_metadata(mode)
        except (bsddb3.db.DBNoSuchFileError, bsddb3.db.DBError) as e:
            # NOTE(review): prefer logger over print for error reporting.
            print(e)
            if isinstance(e, bsddb3.db.DBError):
                raise OSError(
                    "You don't have permission to access this directory %s ." % self.db)
            else:
                raise OSError(
                    "Cannot find a BIGSI at %s. Run `bigsi init` or BIGSI.create()" % db)
        else:
            # NOTE(review): the metadata store is opened a second time here,
            # discarding the handle obtained in the `try` block above.
            self.metadata = self.load_metadata(mode=mode)
            # Core index parameters are stored as 4-byte big-endian ints.
            self.bloom_filter_size = int.from_bytes(
                self.metadata['bloom_filter_size'], 'big')
            self.num_hashes = int.from_bytes(self.metadata['num_hashes'], 'big')
            self.kmer_size = int.from_bytes(self.metadata['kmer_size'], 'big')
            self.scorer = Scorer(self.get_num_colours())
            self.graph = ProbabilisticBerkeleyDBStorage(
                filename=self.graph_filename,
                bloom_filter_size=self.bloom_filter_size,
                num_hashes=self.num_hashes,
                mode=mode)
            self.graph.sync()
            self.metadata.sync()

    def load_metadata(self, mode="c"):
        """Open the "metadata" BerkeleyDBStorage under self.db."""
        return BerkeleyDBStorage(filename=os.path.join(self.db, "metadata"),
                                 mode=mode)

    @property
    def graph_filename(self):
        # Path of the bitsliced graph store.
        return os.path.join(self.db, "graph")

    @property
    def metadata_filename(self):
        # Path of the metadata store.
        return os.path.join(self.db, "metadata")

    def load_graph(self, mode="r"):
        # NOTE(review): `mode` is ignored; the handle opened in __init__
        # is always returned.
        return self.graph

    @classmethod
    def create(cls, db=DEFUALT_DB_DIRECTORY, k=31, m=25000000, h=3,
               cachesize=1, force=False):
        # Initialises a BIGSI
        # m: bloom_filter_size
        # h: number of hash functions
        # directory - where to store the bigsi
        try:
            os.mkdir(db)
        except FileExistsError:
            if force:
                logger.info("Clearing and recreating %s" % db)
                # Wipe the existing index, then retry with force disabled.
                cls(db, mode="c").delete_all()
                return cls.create(db=db, k=k, m=m, h=h,
                                  cachesize=cachesize, force=False)
            raise FileExistsError(
                "A BIGSI already exists at %s. Run with --force or BIGSI.create(force=True) to recreate."
                % db)
        else:
            logger.info("Initialising BIGSI at %s" % db)
            metadata_filepath = os.path.join(db, "metadata")
            metadata = BerkeleyDBStorage(filename=metadata_filepath, mode="c")
            # Parameters are persisted as 4-byte big-endian integers.
            metadata["bloom_filter_size"] = (int(m)).to_bytes(4, byteorder='big')
            metadata["num_hashes"] = (int(h)).to_bytes(4, byteorder='big')
            metadata["kmer_size"] = (int(k)).to_bytes(4, byteorder='big')
            metadata.sync()
            return cls(db=db, cachesize=cachesize, mode="c")

    def build(self, bloomfilters, samples, lowmem=False):
        """Populate the graph from one bloom filter per sample."""
        # Need to open with read and write access
        if not len(bloomfilters) == len(samples):
            raise ValueError(
                "There must be the same number of bloomfilters and sample names"
            )
        graph = self.load_graph(mode="w")
        # NOTE(review): `bloom_filter_size` local is assigned but unused.
        bloom_filter_size = len(bloomfilters[0])
        logger.debug("Adding samples")
        [self._add_sample(s, sync=False) for s in samples]
        logger.debug("transpose")
        # Rows of the transposed matrix are bit-slices, one per bloom bit.
        bigsi = transpose(bloomfilters, lowmem=lowmem)
        logger.debug("insert")
        for i, ba in enumerate(bigsi):
            graph[i] = ba.tobytes()
        self.sync()

    def merge(self, merged_bigsi):
        """Merge another on-disk BIGSI with identical parameters into this one."""
        logger.info("Starting merge")
        # Check that they're the same length
        assert self.metadata["bloom_filter_size"] == merged_bigsi.metadata[
            "bloom_filter_size"]
        assert self.metadata["num_hashes"] == merged_bigsi.metadata[
            "num_hashes"]
        assert self.metadata["kmer_size"] == merged_bigsi.metadata["kmer_size"]
        self._merge_graph(merged_bigsi)
        self._merge_metadata(merged_bigsi)

    def _merge_graph(self, merged_bigsi):
        """Append the other index's colour columns to every graph row."""
        graph = self.load_graph(mode="w")
        # Update graph
        for i in range(self.bloom_filter_size):
            r = graph.get_row(i)[:self.get_num_colours()]
            r2 = merged_bigsi.graph.get_row(i)[:merged_bigsi.get_num_colours()]
            r.extend(r2)
            graph.set_row(i, r)
        graph.sync()

    def _merge_metadata(self, merged_bigsi):
        """Copy the other index's samples; duplicates get a suffix."""
        # Update metadata
        for c in range(merged_bigsi.get_num_colours()):
            sample = merged_bigsi.colour_to_sample(c)
            try:
                self._add_sample(sample, sync=False)
            except ValueError:
                # Name clash: keep both by tagging the incoming sample.
                self._add_sample(sample + "_duplicate_in_merge", sync=False)
        self.metadata.sync()

    @convert_kmers_to_canonical
    def bloom(self, kmers):
        """Build a bloom filter for *kmers* using the graph's parameters."""
        logger.info("Building bloom filter")
        return self.load_graph().bloomfilter.create(kmers)

    def insert(self, bloom_filter, sample):
        """
        Insert kmers into the multicoloured graph.
        sample can not already exist in the graph
        """
        # NOTE(review): bare except — narrows poorly; probes row 0 to
        # detect whether an index has been built yet.
        try:
            self.load_graph()[0]
        except:
            logger.error(
                "No existing index. Run `init` and `build` before `insert` or `search`"
            )
            raise ValueError(
                "No existing index. Run `init` and `build` before `insert` or `search`"
            )
        colour = self._add_sample(sample)
        logger.info("Inserting sample %s into colour %i" % (sample, colour))
        self._insert(bloom_filter, colour)
        self.sync()

    def search(self, seq, threshold=1, score=False):
        """Search for *seq*; threshold is the fraction of kmers required."""
        assert threshold <= 1
        return self._search(self.seq_to_kmers(seq),
                            threshold=threshold,
                            score=score)

    def lookup(self, kmers):
        """Return sample names where these kmers is present"""
        # A string longer than one kmer is treated as a sequence.
        if isinstance(kmers, str) and len(kmers) > self.kmer_size:
            kmers = self.seq_to_kmers(kmers)
        out = {}
        if isinstance(kmers, str):
            out[kmers] = self._lookup(kmers)
        else:
            for kmer in kmers:
                out[kmer] = self._lookup(kmer)
        return out

    def lookup_raw(self, kmer):
        # Raw bytes of the kmer's colour bitarray.
        return self._lookup_raw(kmer)

    def seq_to_kmers(self, seq):
        # Delegate to the module-level helper with this index's kmer size.
        return seq_to_kmers(seq, self.kmer_size)

    def metadata_set(self, metadata_key, value, sync=True):
        """Pickle *value* into the metadata store under *metadata_key*."""
        metadata = self.metadata
        metadata[metadata_key] = pickle.dumps(value)
        if sync:
            self.sync()

    def metadata_hgetall(self, metadata_key):
        # Unpickle the stored dict; empty dict when the key is absent.
        return pickle.loads(self.metadata.get(metadata_key, pickle.dumps({})))

    def metadata_hget(self, metadata_key, key):
        # Single field from a stored metadata dict.
        return self.metadata_hgetall(metadata_key).get(key)

    def add_sample_metadata(self, sample, key, value, overwrite=False,
                            sync=True):
        """Set one key/value in the per-sample metadata dict ("ss_<sample>")."""
        metadata_key = "ss_%s" % sample
        self.metadata_hset(metadata_key, key, value, overwrite=overwrite,
                           sync=sync)

    def lookup_sample_metadata(self, sample):
        """The full metadata dict for *sample*."""
        metadata_key = "ss_%s" % sample
        return self.metadata_hgetall(metadata_key)

    def metadata_hset(self, metadata_key, key, value, overwrite=False,
                      sync=True):
        """Set one field of a stored dict; refuses to clobber unless overwrite."""
        metadata_values = self.metadata_hgetall(metadata_key)
        if key in metadata_values and not overwrite:
            raise ValueError(
                "%s is already in the metadata of %s with value %s " %
                (key, metadata_key, metadata_values[key]))
        else:
            metadata_values[key] = value
            self.metadata_set(metadata_key, metadata_values, sync=sync)

    def set_colour(self, colour, sample, overwrite=False, sync=True):
        """Record the colour -> sample-name mapping.

        NOTE(review): `overwrite` is accepted but never used here.
        """
        colour = int(colour)
        metadata = self.metadata
        metadata["colour%i" % colour] = sample
        if sync:
            self.sync()

    def sample_to_colour(self, sample):
        # None when the sample has no recorded colour.
        return self.lookup_sample_metadata(sample).get('colour')

    def colour_to_sample(self, colour):
        """Sample name for *colour*; falls back to the colour id as a string."""
        metadata = self.metadata
        r = metadata["colour%i" % colour].decode('utf-8')
        if r:
            return r
        else:
            return str(colour)

    def delete_sample(self, sample_name):
        """Mark a sample's colour as DELETED.

        NOTE(review): as written this recurses into itself unconditionally
        and then applies `del` to the bound method `self.metadata_hgetall`,
        so it cannot terminate normally — needs a fix.
        """
        try:
            colour = self.sample_to_colour(sample_name)
        except:
            raise ValueError("Can't find sample %s" % sample_name)
        else:
            self.set_colour(colour, "DELETED")
            self.delete_sample(sample_name)
            del self.metadata_hgetall[sample_name]

    @convert_kmers_to_canonical
    def _lookup_raw(self, kmer, canonical=False):
        # Raw bytes of the canonical kmer's colour bitarray.
        return self.graph.lookup(kmer).tobytes()

    def get_bloom_filter(self, sample):
        """The stored bloom filter for *sample* (via its colour)."""
        colour = self.sample_to_colour(sample)
        return self.graph.get_bloom_filter(colour)

    def create_bloom_filter(self, kmers):
        # Delegate bloom-filter construction to the graph storage.
        return self.graph.create_bloom_filter(kmers)

    def _insert(self, bloomfilter, colour):
        """Insert a (non-empty) bloom filter into column *colour*."""
        graph = self.load_graph(mode="c")
        if bloomfilter:
            logger.debug("Inserting bloomfilter into colour %i" % colour)
            graph.insert(bloomfilter, int(colour))
            graph.sync()

    def colours(self, kmer):
        # Map the kmer to the list of colours containing it.
        return {kmer: self._colours(kmer)}

    @convert_kmers_to_canonical
    def _colours(self, kmer, canonical=False):
        colour_presence_boolean_array = self.load_graph().lookup(kmer)
        return colour_presence_boolean_array.colours()

    def _get_kmers_colours(self, kmers):
        # Lazily yield (kmer, colour-bitarray) pairs.
        for kmer in kmers:
            ba = self.load_graph().lookup(kmer)
            yield kmer, ba

    def _search(self, kmers, threshold=1, score=False):
        """Return sample names where this kmer is present"""
        if isinstance(kmers, str):
            return self._search_kmer(kmers)
        else:
            return self._search_kmers(kmers, threshold=threshold,
                                      score=score)

    @convert_kmers_to_canonical
    def _search_kmer(self, kmer, canonical=False):
        # NOTE(review): `self.colours(kmer, canonical=True)` — colours()
        # takes no `canonical` argument and returns a dict keyed by kmer,
        # so this looks broken; likely `self._colours` was intended. Confirm.
        out = {}
        for colour in self.colours(kmer, canonical=True):
            sample = self.colour_to_sample(colour)
            if sample != "DELETED":
                out[sample] = 1.0
        return out

    @convert_kmers_to_canonical
    def _search_kmers(self, kmers, threshold=1, score=False):
        # Dispatch to the AND-accelerated path when every kmer must match.
        if threshold == 1:
            return self._search_kmers_threshold_1(kmers, score=score)
        else:
            return self._search_kmers_threshold_not_1(kmers,
                                                      threshold=threshold,
                                                      score=score)

    def _search_kmers_threshold_not_1(self, kmers, threshold, score):
        if score:
            return self._search_kmers_threshold_not_1_with_scoring(
                kmers, threshold)
        else:
            return self._search_kmers_threshold_not_1_without_scoring(
                kmers, threshold)

    def _search_kmers_threshold_not_1_with_scoring(self, kmers, threshold):
        """Threshold search plus per-sample scoring of the presence string."""
        out = {}
        kmers = list(kmers)
        result = self._search_kmers_threshold_not_1_without_scoring(
            kmers, threshold, convert_colours=False)
        kmer_lookups = [self.load_graph().lookup(kmer) for kmer in kmers]
        for colour, r in result.items():
            percent = r["percent_kmers_found"]
            # Presence/absence string of each query kmer for this colour.
            s = "".join(
                [str(int(kmer_lookups[i][colour])) for i in range(len(kmers))])
            sample = self.colour_to_sample(colour)
            out[sample] = self.scorer.score(s)
            out[sample]["percent_kmers_found"] = percent
        return out

    def _search_kmers_threshold_not_1_without_scoring(self, kmers, threshold,
                                                      convert_colours=True):
        """Samples whose fraction of query kmers present >= threshold."""
        out = {}
        bas = [ba for _, ba in self._get_kmers_colours(kmers)]
        # Per-colour count of query kmers present.
        cumsum = unpack_bas(bas, j=self.nproc)
        lkmers = len(bas)
        for i, f in enumerate(cumsum):
            res = float(f) / lkmers
            if res >= threshold:
                if convert_colours:
                    sample = self.colour_to_sample(i)
                else:
                    sample = i
                if sample != "DELETED":
                    out[sample] = {}
                    out[sample]["percent_kmers_found"] = 100 * res
        return out

    def _search_kmers_threshold_1(self, kmers, score=False):
        """Special case where the threshold is 1 (can accelerate queries with AND)"""
        kmers = list(kmers)
        ba = self.load_graph().lookup_all_present(kmers)
        out = {}
        for c in ba.colours():
            sample = self.colour_to_sample(c)
            if sample != "DELETED":
                if score:
                    # All kmers matched, so score an all-ones string of
                    # the reconstructed query length.
                    out[sample] = self.scorer.score(
                        "1" * (len(kmers) + self.kmer_size - 1))  # Fix!
                else:
                    out[sample] = {}
                out[sample]["percent_kmers_found"] = 100
        return out

    @convert_kmers_to_canonical
    def _lookup(self, kmer, canonical=False):
        """Sample names whose colour bit is set for this kmer."""
        assert not isinstance(kmer, list)
        num_colours = self.get_num_colours()
        colour_presence_boolean_array = self.load_graph().lookup(kmer)
        samples_present = []
        for i, present in enumerate(colour_presence_boolean_array):
            if present:
                samples_present.append(self.colour_to_sample(i))
            # Stop once past the populated colour range.
            if i > num_colours:
                break
        return samples_present

    def _add_sample(self, sample_name, sync=True):
        """Register a new sample and return its assigned colour."""
        sample_name = str(sample_name)
        metadata = self.metadata
        # logger.debug("Adding sample %s" % sample_name)
        existing_index = self.sample_to_colour(sample_name)
        if existing_index is not None:
            raise ValueError("%s already exists in the db" % sample_name)
        else:
            # Next colour is the current colour count.
            colour = self.get_num_colours()
            if colour is None:
                colour = 0
            else:
                colour = int(colour)
            self.add_sample_metadata(sample_name, 'colour', colour, sync=sync)
            self.set_colour(colour, sample_name, sync=sync)
            metadata.incr('num_colours')
            if sync:
                metadata.sync()
            return colour

    def get_num_colours(self):
        # Colour count stored as a 4-byte big-endian int; 0 when absent.
        return int.from_bytes(
            self.metadata.get('num_colours', b'\x00\x00\x00\x00'), 'big')

    def sync(self):
        # Flush both stores to disk.
        self.load_graph().storage.sync()
        self.metadata.sync()

    def delete_all(self):
        """Delete both stores and remove the (now empty) index directory."""
        self.load_graph().delete_all()
        self.metadata.delete_all()
        os.rmdir(self.db)