Exemple #1
0
 def __init__(self,
              db=DEFUALT_DB_DIRECTORY,
              cachesize=1,
              nproc=0,
              mode="c"):
     """Open an existing BIGSI stored under directory *db*.

     db: directory holding the Berkeley DB files.
     cachesize: accepted for API compatibility; not used here.
     nproc: number of worker processes used by queries.
     mode: Berkeley DB open mode (e.g. "c" create, "r" read-only).

     Raises OSError when the directory is unreadable or holds no index.
     """
     self.mode = mode
     self.nproc = nproc
     self.db = db
     try:
         self.metadata = self.load_metadata(mode)
     except (bsddb3.db.DBNoSuchFileError, bsddb3.db.DBError) as e:
         print(e)
         # DBNoSuchFileError subclasses DBError, so the more specific
         # "missing index" case must be tested first; the original tested
         # DBError first, making the "Cannot find" branch unreachable.
         if isinstance(e, bsddb3.db.DBNoSuchFileError):
             raise OSError(
                 "Cannot find a BIGSI at %s. Run `bigsi init` or BIGSI.create()"
                 % db)
         else:
             raise OSError(
                 "You don't have permission to access this directory %s ." %
                 self.db)
     else:
         # Reuse the handle opened in the try-block (the original reopened
         # the metadata store here a second time).
         self.bloom_filter_size = int.from_bytes(
             self.metadata['bloom_filter_size'], 'big')
         self.num_hashes = int.from_bytes(self.metadata['num_hashes'],
                                          'big')
         self.kmer_size = int.from_bytes(self.metadata['kmer_size'], 'big')
         self.scorer = Scorer(self.get_num_colours())
         self.graph = ProbabilisticBerkeleyDBStorage(
             filename=self.graph_filename,
             bloom_filter_size=self.bloom_filter_size,
             num_hashes=self.num_hashes,
             mode=mode)
         self.graph.sync()
         self.metadata.sync()
Exemple #2
0
 def __init__(self, config=None):
     """Initialise a BIGSI from *config*, falling back to DEFAULT_CONFIG."""
     self.config = DEFAULT_CONFIG if config is None else config
     self.storage = get_storage(self.config)
     # Both mixins share the same storage backend.
     SampleMetadata.__init__(self, self.storage)
     KmerSignatureIndex.__init__(self, self.storage)
     ## TODO this can be inferred and set at build time
     self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
     self.scorer = Scorer(self.num_samples)
Exemple #3
0
def test_score():
    """Regression check: Scorer.score on a fixed k-mer presence bitstring."""
    presence = "1111111111111111111111111111111111111111110000000000000000000000000000001111111111111111111111100000000000000000000100000010001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000000000000000100000000010001111111111000000000000100000000000000000000000000000000100000000000010000000010000001000000000010000000000000000010001111111100000000000001100010000000000000000000001000000000000110000000000000000000000100000000000000000000100000000000000001010001111111111100000000000000000000100100010011111111111111111100000000001001000001000000000000000000000000000001000000010100000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111100000010110001000100000000000000000000000000000000000001000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100010000000100000000001010000001111111111111111111111111111111111111111111111111111111111111111100100000000010000000010000000001111111111111111111111111111111111111111111111111111111111111111111111100000100000000000010000000000000010000000011111111000000100010"
    expected = {
        'length': 1174,
        'max_mismatches': 269,
        'max_nident': 1156,
        'max_pident': 98.46678023850085,
        'max_score': 1119.98,
        'min_mismatches': 18,
        'min_nident': 905,
        'min_pident': 77.08688245315162,
        'min_score': 96.04,
        'mismatches': 33,
        'nident': 1141,
        'pident': 97.18909710391823,
        'score': 1064.89,
        'evalue': 0.0,
        'pvalue': 0.0,
        'log_evalue': -1407.74,
        'log_pvalue': -1407.74,
    }
    result = Scorer(500000).score(presence)
    assert result == expected
Exemple #4
0
def test_score():
    """Scorer regression test against precomputed expected statistics."""
    bitstring = "1111111111111111111111111111111111111111110000000000000000000000000000001111111111111111111111100000000000000000000100000010001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000000000000000100000000010001111111111000000000000100000000000000000000000000000000100000000000010000000010000001000000000010000000000000000010001111111100000000000001100010000000000000000000001000000000000110000000000000000000000100000000000000000000100000000000000001010001111111111100000000000000000000100100010011111111111111111100000000001001000001000000000000000000000000000001000000010100000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111100000010110001000100000000000000000000000000000000000001000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100010000000100000000001010000001111111111111111111111111111111111111111111111111111111111111111100100000000010000000010000000001111111111111111111111111111111111111111111111111111111111111111111111100000100000000000010000000000000010000000011111111000000100010"
    scorer = Scorer(5 * 10 ** 5)
    observed = scorer.score(bitstring)
    assert observed == {
        "length": 1174,
        "mismatches": 33,
        "nident": 1141,
        "pident": 97.18909710391823,
        "score": 1064.89,
        "max_mismatches": 269,
        "max_nident": 1156,
        "max_pident": 98.46678023850085,
        "max_score": 1119.98,
        "min_mismatches": 18,
        "min_nident": 905,
        "min_pident": 77.08688245315162,
        "min_score": 96.04,
        "evalue": 0.0,
        "pvalue": 0.0,
        "log_evalue": -1407.74,
        "log_pvalue": -1407.74,
    }
Exemple #5
0
class BIGSI(SampleMetadata, KmerSignatureIndex):
    """Bitsliced genomic signature index over a pluggable storage backend.

    Mixes in sample bookkeeping (SampleMetadata) and the k-mer presence
    matrix (KmerSignatureIndex); both share the storage chosen by *config*.
    """

    def __init__(self, config=None):
        """Open an existing index described by *config* (DEFAULT_CONFIG if None)."""
        if config is None:
            config = DEFAULT_CONFIG
        self.config = config
        self.storage = get_storage(config)
        SampleMetadata.__init__(self, self.storage)
        KmerSignatureIndex.__init__(self, self.storage)
        self.min_unique_kmers_in_query = (
            MIN_UNIQUE_KMERS_IN_QUERY
        )  ## TODO this can be inferred and set at build time
        self.scorer = Scorer(self.num_samples)

    @property
    def kmer_size(self):
        """k-mer length the index was built with."""
        return self.config["k"]

    @property
    def nproc(self):
        """Worker-process count for bitarray unpacking (config override)."""
        return self.config.get("nproc", DEFAULT_NPROC)

    @classmethod
    def bloom(cls, config, kmers):
        """Build and return the bloom-filter bitarray for *kmers*."""
        kmers = convert_query_kmers(kmers)  ## Convert to canonical kmers
        bloomfilter = BloomFilter(m=config["m"], h=config["h"])
        bloomfilter.update(kmers)
        return bloomfilter.bitarray

    @classmethod
    def build(cls, config, bloomfilters, samples):
        """Create a new index from parallel lists of bloomfilters and samples."""
        storage = get_storage(config)
        validate_build_params(bloomfilters, samples)
        logger.debug("Insert sample metadata")
        # The next two calls are made for their side effects on `storage`;
        # the original bound their results to unused locals.
        SampleMetadata(storage).add_samples(samples)
        logger.debug("Create signature index")
        KmerSignatureIndex.create(
            storage,
            bloomfilters,
            config["m"],
            config["h"],
            config.get("low_mem_build", False),
        )
        storage.close()  ## Need to delete LOCK files before re init
        return cls(config)

    def search(self, seq, threshold=1.0, score=False):
        """Search sequence *seq*.

        threshold: minimum fraction of query k-mers a sample must contain.
        score: when True, attach alignment-style scores to each result.
        Returns a list of result dicts (deletion placeholder rows removed).
        """
        self.__validate_search_query(seq)
        assert threshold <= 1
        kmers = list(self.seq_to_kmers(seq))
        kmers_to_colours = self.lookup(kmers, remove_trailing_zeros=False)
        min_kmers = math.ceil(len(set(kmers)) * threshold)
        if threshold == 1.0:
            # Exact search can be accelerated with a bitwise AND.
            results = self.exact_filter(kmers_to_colours)
        else:
            results = self.inexact_filter(kmers_to_colours, min_kmers)
        if score:
            self.score(kmers, kmers_to_colours, results)
        return [
            r.todict()
            for r in results
            if not r.sample_name == DELETION_SPECIAL_SAMPLE_NAME
        ]

    def exact_filter(self, kmers_to_colours):
        """Return results for colours containing ALL query k-mers (AND)."""
        colours_with_all_kmers = non_zero_bitarrary_positions(
            bitwise_and(kmers_to_colours.values())
        )
        samples = self.get_sample_list(colours_with_all_kmers)
        return [
            BigsiQueryResult(
                colour=c,
                sample_name=s,
                num_kmers=len(kmers_to_colours),
                num_kmers_found=len(kmers_to_colours),
            )
            for c, s in zip(colours_with_all_kmers, samples)
        ]

    def get_sample_list(self, colours):
        """Map each colour in *colours* to its sample name, preserving order."""
        colours_to_samples = self.colours_to_samples(colours)
        return [colours_to_samples[i] for i in colours]

    def inexact_filter(self, kmers_to_colours, min_kmers):
        """Return results for colours holding >= *min_kmers* query k-mers,
        sorted by number of k-mers found (descending)."""
        num_kmers = unpack_and_sum_bitarrays(
            list(kmers_to_colours.values()), self.nproc
        )
        colours = range(self.num_samples)
        colours_to_kmers_found = dict(zip(colours, num_kmers))
        colours_to_kmers_found_above_threshold = self.__colours_above_threshold(
            colours_to_kmers_found, min_kmers
        )
        results = [
            BigsiQueryResult(
                colour=colour,
                sample_name=self.colour_to_sample(colour),
                num_kmers_found=int(num_kmers_found),
                num_kmers=len(kmers_to_colours),
            )
            for colour, num_kmers_found in colours_to_kmers_found_above_threshold.items()
        ]
        results.sort(key=lambda x: x.num_kmers_found, reverse=True)
        return results

    def score(self, kmers, kmers_to_colours, results):
        """Attach per-result scores computed from each colour's presence column."""
        rows = [kmers_to_colours[kmer] for kmer in kmers]
        X = unpack_and_cat_bitarrays(rows, self.nproc)
        for res in results:
            # Presence/absence column for this colour as a "0"/"1" string.
            col = "".join([str(i) for i in X[:, res.colour].tolist()])
            score_results = self.scorer.score(col)
            score_results["kmer-presence"] = col
            res.add_score(score_results)

    def __colours_above_threshold(self, colours_to_percent_kmers, min_kmers):
        """Filter the colour->count mapping down to counts >= *min_kmers*."""
        return {k: v for k, v in colours_to_percent_kmers.items() if v >= min_kmers}

    def insert(self, bloomfilter, sample):
        """Insert a single bloomfilter; build+merge is usually preferable."""
        logger.warning("Build and merge is preferable to insert in most cases")
        colour = self.add_sample(sample)
        self.insert_bloom(bloomfilter, colour - 1)

    def delete(self):
        """Delete the entire index from storage."""
        self.storage.delete_all()

    def __validate_merge(self, bigsi):
        """Both indexes must share identical build parameters to be merged."""
        assert self.bloomfilter_size == bigsi.bloomfilter_size
        assert self.num_hashes == bigsi.num_hashes
        assert self.kmer_size == bigsi.kmer_size

    def merge(self, bigsi):
        """Merge another compatible BIGSI into this one."""
        self.__validate_merge(bigsi)
        self.merge_indexes(bigsi)
        self.merge_metadata(bigsi)

    def __validate_search_query(self, seq):
        """Warn (without failing) when *seq* yields too few unique k-mers."""
        kmers = set()
        for k in self.seq_to_kmers(seq):
            kmers.add(k)
            if len(kmers) > self.min_unique_kmers_in_query:
                return True
        # Loop finished without reaching the minimum: warn but allow the
        # query. (The original used a dead for/else; the loop has no break,
        # so the else clause always ran when the early return was not taken.)
        logger.warning(
            "Query string should contain at least %i unique kmers. Your query contained %i unique kmers, and as a result the false discovery rate may be high. In future this will become an error."
            % (self.min_unique_kmers_in_query, len(kmers))
        )

    def seq_to_kmers(self, seq):
        """Return the k-mers of *seq* using the index's k."""
        return seq_to_kmers(seq, self.kmer_size)
Exemple #6
0
class BIGSI(object):
    """Berkeley-DB backed BIGSI index.

    Stores a transposed matrix of bloom filters (the "graph") plus a
    metadata table mapping colours (matrix columns) to sample names.
    """

    def __init__(self,
                 db=DEFUALT_DB_DIRECTORY,
                 cachesize=1,
                 nproc=0,
                 mode="c"):
        """Open the index stored under directory *db*.

        cachesize is accepted for backwards compatibility but unused here.
        Raises OSError if the directory is unreadable or holds no index.
        """
        self.mode = mode
        self.nproc = nproc
        self.db = db
        try:
            self.metadata = self.load_metadata(mode)
        except (bsddb3.db.DBNoSuchFileError, bsddb3.db.DBError) as e:
            print(e)
            # DBNoSuchFileError subclasses DBError, so test the specific
            # "missing index" case first; testing DBError first (as the
            # original did) made the "Cannot find" branch unreachable.
            if isinstance(e, bsddb3.db.DBNoSuchFileError):
                raise OSError(
                    "Cannot find a BIGSI at %s. Run `bigsi init` or BIGSI.create()"
                    % db)
            else:
                raise OSError(
                    "You don't have permission to access this directory %s ." %
                    self.db)
        else:
            # Reuse the metadata handle opened in the try-block (the
            # original reopened the store here a second time).
            self.bloom_filter_size = int.from_bytes(
                self.metadata['bloom_filter_size'], 'big')
            self.num_hashes = int.from_bytes(self.metadata['num_hashes'],
                                             'big')
            self.kmer_size = int.from_bytes(self.metadata['kmer_size'], 'big')
            self.scorer = Scorer(self.get_num_colours())
            self.graph = ProbabilisticBerkeleyDBStorage(
                filename=self.graph_filename,
                bloom_filter_size=self.bloom_filter_size,
                num_hashes=self.num_hashes,
                mode=mode)
            self.graph.sync()
            self.metadata.sync()

    def load_metadata(self, mode="c"):
        """Open and return the metadata Berkeley DB store."""
        return BerkeleyDBStorage(filename=os.path.join(self.db, "metadata"),
                                 mode=mode)

    @property
    def graph_filename(self):
        """Path of the graph (bit-matrix) database file."""
        return os.path.join(self.db, "graph")

    @property
    def metadata_filename(self):
        """Path of the metadata database file."""
        return os.path.join(self.db, "metadata")

    def load_graph(self, mode="r"):
        """Return the already-open graph storage (mode is ignored here)."""
        return self.graph

    @classmethod
    def create(cls,
               db=DEFUALT_DB_DIRECTORY,
               k=31,
               m=25000000,
               h=3,
               cachesize=1,
               force=False):
        """Initialise a new BIGSI on disk and return an open instance.

        k: k-mer size; m: bloom filter size; h: number of hash functions;
        db: directory in which to store the index.
        force: clear and recreate an existing index instead of raising.
        """
        try:
            os.mkdir(db)
        except FileExistsError:
            if force:
                logger.info("Clearing and recreating %s" % db)
                cls(db, mode="c").delete_all()
                return cls.create(db=db,
                                  k=k,
                                  m=m,
                                  h=h,
                                  cachesize=cachesize,
                                  force=False)
            raise FileExistsError(
                "A BIGSI already exists at %s. Run with --force or BIGSI.create(force=True) to recreate."
                % db)

        else:
            logger.info("Initialising BIGSI at %s" % db)
            metadata_filepath = os.path.join(db, "metadata")
            metadata = BerkeleyDBStorage(filename=metadata_filepath, mode="c")
            # Parameters are persisted as 4-byte big-endian integers.
            metadata["bloom_filter_size"] = (int(m)).to_bytes(4,
                                                              byteorder='big')
            metadata["num_hashes"] = (int(h)).to_bytes(4, byteorder='big')
            metadata["kmer_size"] = (int(k)).to_bytes(4, byteorder='big')
            metadata.sync()
            return cls(db=db, cachesize=cachesize, mode="c")

    def build(self, bloomfilters, samples, lowmem=False):
        """Populate the graph from parallel lists of bloomfilters and samples."""
        # Need to open with read and write access
        if not len(bloomfilters) == len(samples):
            raise ValueError(
                "There must be the same number of bloomfilters and sample names"
            )
        graph = self.load_graph(mode="w")
        logger.debug("Adding samples")
        for s in samples:
            self._add_sample(s, sync=False)
        logger.debug("transpose")
        bigsi = transpose(bloomfilters, lowmem=lowmem)
        logger.debug("insert")
        for i, ba in enumerate(bigsi):
            graph[i] = ba.tobytes()
        self.sync()

    def merge(self, merged_bigsi):
        """Merge another BIGSI (with identical build parameters) into this one."""
        logger.info("Starting merge")
        # Check that they're the same length
        assert self.metadata["bloom_filter_size"] == merged_bigsi.metadata[
            "bloom_filter_size"]
        assert self.metadata["num_hashes"] == merged_bigsi.metadata[
            "num_hashes"]
        assert self.metadata["kmer_size"] == merged_bigsi.metadata["kmer_size"]
        self._merge_graph(merged_bigsi)
        self._merge_metadata(merged_bigsi)

    def _merge_graph(self, merged_bigsi):
        """Append the other index's colour columns to every matrix row."""
        graph = self.load_graph(mode="w")
        # Update graph
        for i in range(self.bloom_filter_size):
            r = graph.get_row(i)[:self.get_num_colours()]
            r2 = merged_bigsi.graph.get_row(i)[:merged_bigsi.get_num_colours()]
            r.extend(r2)
            graph.set_row(i, r)
        graph.sync()

    def _merge_metadata(self, merged_bigsi):
        """Register the other index's samples; suffix duplicates."""
        # Update metadata
        for c in range(merged_bigsi.get_num_colours()):
            sample = merged_bigsi.colour_to_sample(c)
            try:
                self._add_sample(sample, sync=False)
            except ValueError:
                # Sample name already present in this index.
                self._add_sample(sample + "_duplicate_in_merge", sync=False)
        self.metadata.sync()

    @convert_kmers_to_canonical
    def bloom(self, kmers):
        """Build a bloom filter from canonical *kmers*."""
        logger.info("Building bloom filter")
        return self.load_graph().bloomfilter.create(kmers)

    def insert(self, bloom_filter, sample):
        """
           Insert kmers into the multicoloured graph.
           sample can not already exist in the graph
        """
        try:
            self.load_graph()[0]
        except Exception:
            # Probing row 0 fails when `build` was never run; a bare
            # except here previously swallowed even KeyboardInterrupt.
            logger.error(
                "No existing index. Run `init` and `build` before `insert` or `search`"
            )
            raise ValueError(
                "No existing index. Run `init` and `build` before `insert` or `search`"
            )
        colour = self._add_sample(sample)
        logger.info("Inserting sample %s into colour %i" % (sample, colour))
        self._insert(bloom_filter, colour)
        self.sync()

    def search(self, seq, threshold=1, score=False):
        """Search sequence *seq*; threshold is the fraction of k-mers required."""
        assert threshold <= 1
        return self._search(self.seq_to_kmers(seq),
                            threshold=threshold,
                            score=score)

    def lookup(self, kmers):
        """Return sample names where these kmers is present"""
        if isinstance(kmers, str) and len(kmers) > self.kmer_size:
            # A long string is treated as a sequence, not a single k-mer.
            kmers = self.seq_to_kmers(kmers)
        out = {}
        if isinstance(kmers, str):
            out[kmers] = self._lookup(kmers)

        else:
            for kmer in kmers:
                out[kmer] = self._lookup(kmer)

        return out

    def lookup_raw(self, kmer):
        """Return the raw presence-bit bytes for a single k-mer."""
        return self._lookup_raw(kmer)

    def seq_to_kmers(self, seq):
        """Return the k-mers of *seq* using the index's k."""
        return seq_to_kmers(seq, self.kmer_size)

    def metadata_set(self, metadata_key, value, sync=True):
        """Pickle *value* and store it under *metadata_key*."""
        metadata = self.metadata
        metadata[metadata_key] = pickle.dumps(value)
        if sync:
            self.sync()

    def metadata_hgetall(self, metadata_key):
        """Return the dict stored under *metadata_key* ({} if absent).

        NOTE: values are unpickled; only trusted data should be stored here.
        """
        return pickle.loads(self.metadata.get(metadata_key, pickle.dumps({})))

    def metadata_hget(self, metadata_key, key):
        """Return one field of the dict stored under *metadata_key*."""
        return self.metadata_hgetall(metadata_key).get(key)

    def add_sample_metadata(self,
                            sample,
                            key,
                            value,
                            overwrite=False,
                            sync=True):
        """Set *key* -> *value* in the per-sample metadata dict."""
        metadata_key = "ss_%s" % sample
        self.metadata_hset(metadata_key,
                           key,
                           value,
                           overwrite=overwrite,
                           sync=sync)

    def lookup_sample_metadata(self, sample):
        """Return the metadata dict stored for *sample*."""
        metadata_key = "ss_%s" % sample
        return self.metadata_hgetall(metadata_key)

    def metadata_hset(self,
                      metadata_key,
                      key,
                      value,
                      overwrite=False,
                      sync=True):
        """Set one field of the dict under *metadata_key*.

        Raises ValueError when the field exists and overwrite is False.
        """
        metadata_values = self.metadata_hgetall(metadata_key)
        if key in metadata_values and not overwrite:
            raise ValueError(
                "%s is already in the metadata of %s with value %s " %
                (key, metadata_key, metadata_values[key]))
        else:
            metadata_values[key] = value
            self.metadata_set(metadata_key, metadata_values, sync=sync)

    def set_colour(self, colour, sample, overwrite=False, sync=True):
        """Map *colour* -> *sample* name (overwrite parameter is unused)."""
        colour = int(colour)
        metadata = self.metadata
        metadata["colour%i" % colour] = sample
        if sync:
            self.sync()

    def sample_to_colour(self, sample):
        """Return the colour assigned to *sample*, or None if unknown."""
        return self.lookup_sample_metadata(sample).get('colour')

    def colour_to_sample(self, colour):
        """Return the sample name at *colour* (the colour number if empty)."""
        metadata = self.metadata
        r = metadata["colour%i" % colour].decode('utf-8')
        if r:
            return r
        else:
            return str(colour)

    def delete_sample(self, sample_name):
        """Mark *sample_name*'s colour as DELETED and clear its metadata.

        The colour slot is not reclaimed; searches skip "DELETED" names.
        Raises ValueError when the sample is unknown.
        """
        colour = self.sample_to_colour(sample_name)
        # sample_to_colour returns None (it does not raise) for unknown
        # samples; the original wrapped it in try/except that never fired,
        # then recursed into delete_sample forever and subscript-deleted a
        # bound method (`del self.metadata_hgetall[...]`).
        if colour is None:
            raise ValueError("Can't find sample %s" % sample_name)
        self.set_colour(colour, "DELETED")
        # Drop the per-sample dict so sample_to_colour now returns None.
        self.metadata_set("ss_%s" % sample_name, {})

    @convert_kmers_to_canonical
    def _lookup_raw(self, kmer, canonical=False):
        """Return the raw bytes of one k-mer's presence bitarray."""
        return self.graph.lookup(kmer).tobytes()

    def get_bloom_filter(self, sample):
        """Return the bloom filter stored for *sample*'s colour."""
        colour = self.sample_to_colour(sample)
        return self.graph.get_bloom_filter(colour)

    def create_bloom_filter(self, kmers):
        """Create (but do not insert) a bloom filter from *kmers*."""
        return self.graph.create_bloom_filter(kmers)

    def _insert(self, bloomfilter, colour):
        """Insert *bloomfilter* into the matrix at column *colour*."""
        graph = self.load_graph(mode="c")
        if bloomfilter:
            logger.debug("Inserting bloomfilter into colour %i" % colour)
            graph.insert(bloomfilter, int(colour))
            graph.sync()

    def colours(self, kmer):
        """Return {kmer: [colours where it is present]}."""
        return {kmer: self._colours(kmer)}

    @convert_kmers_to_canonical
    def _colours(self, kmer, canonical=False):
        """Return the colours whose bit is set for canonical *kmer*."""
        colour_presence_boolean_array = self.load_graph().lookup(kmer)
        return colour_presence_boolean_array.colours()

    def _get_kmers_colours(self, kmers):
        """Yield (kmer, presence-bitarray) pairs for each query k-mer."""
        for kmer in kmers:
            ba = self.load_graph().lookup(kmer)
            yield kmer, ba

    def _search(self, kmers, threshold=1, score=False):
        """Return sample names where this kmer is present"""
        if isinstance(kmers, str):
            return self._search_kmer(kmers)
        else:
            return self._search_kmers(kmers, threshold=threshold, score=score)

    @convert_kmers_to_canonical
    def _search_kmer(self, kmer, canonical=False):
        """Single-k-mer search: samples containing *kmer* map to 1.0."""
        out = {}
        for colour in self.colours(kmer, canonical=True):
            sample = self.colour_to_sample(colour)
            if sample != "DELETED":
                out[sample] = 1.0
        return out

    @convert_kmers_to_canonical
    def _search_kmers(self, kmers, threshold=1, score=False):
        """Dispatch to the fast AND path when threshold == 1."""
        if threshold == 1:
            return self._search_kmers_threshold_1(kmers, score=score)
        else:
            return self._search_kmers_threshold_not_1(kmers,
                                                      threshold=threshold,
                                                      score=score)

    def _search_kmers_threshold_not_1(self, kmers, threshold, score):
        """Partial-match search, with or without per-sample scoring."""
        if score:
            return self._search_kmers_threshold_not_1_with_scoring(
                kmers, threshold)
        else:
            return self._search_kmers_threshold_not_1_without_scoring(
                kmers, threshold)

    def _search_kmers_threshold_not_1_with_scoring(self, kmers, threshold):
        """Partial-match search; attach Scorer statistics to every hit."""
        out = {}
        kmers = list(kmers)
        result = self._search_kmers_threshold_not_1_without_scoring(
            kmers, threshold, convert_colours=False)
        kmer_lookups = [self.load_graph().lookup(kmer) for kmer in kmers]
        for colour, r in result.items():
            percent = r["percent_kmers_found"]
            # Presence/absence string for this colour across the query k-mers.
            s = "".join(
                [str(int(kmer_lookups[i][colour])) for i in range(len(kmers))])
            sample = self.colour_to_sample(colour)
            out[sample] = self.scorer.score(s)
            out[sample]["percent_kmers_found"] = percent
        return out

    def _search_kmers_threshold_not_1_without_scoring(self,
                                                      kmers,
                                                      threshold,
                                                      convert_colours=True):
        """Partial-match search returning percent_kmers_found per sample.

        convert_colours=False returns colour indices instead of names.
        """
        out = {}
        bas = [ba for _, ba in self._get_kmers_colours(kmers)]
        cumsum = unpack_bas(bas, j=self.nproc)
        lkmers = len(bas)

        for i, f in enumerate(cumsum):
            res = float(f) / lkmers
            if res >= threshold:
                if convert_colours:
                    sample = self.colour_to_sample(i)
                else:
                    sample = i
                if sample != "DELETED":
                    out[sample] = {}
                    out[sample]["percent_kmers_found"] = 100 * res
        return out

    def _search_kmers_threshold_1(self, kmers, score=False):
        """Special case where the threshold is 1 (can accelerate queries with AND)"""
        kmers = list(kmers)
        ba = self.load_graph().lookup_all_present(kmers)
        out = {}
        for c in ba.colours():
            sample = self.colour_to_sample(c)
            if sample != "DELETED":
                if score:
                    # TODO: scoring a synthetic all-ones string is a
                    # placeholder (original marked "Fix!").
                    out[sample] = self.scorer.score(
                        "1" * (len(kmers) + self.kmer_size - 1))
                else:
                    out[sample] = {}
                out[sample]["percent_kmers_found"] = 100
        return out

    @convert_kmers_to_canonical
    def _lookup(self, kmer, canonical=False):
        """Return the sample names whose colour bit is set for *kmer*."""
        assert not isinstance(kmer, list)
        num_colours = self.get_num_colours()
        colour_presence_boolean_array = self.load_graph().lookup(kmer)
        samples_present = []
        for i, present in enumerate(colour_presence_boolean_array):
            if i >= num_colours:
                # Bits beyond the assigned colours are padding; the
                # original broke only after processing index num_colours.
                break
            if present:
                samples_present.append(self.colour_to_sample(i))
        return samples_present

    def _add_sample(self, sample_name, sync=True):
        """Assign the next free colour to *sample_name* and return it.

        Raises ValueError when the sample already exists.
        """
        sample_name = str(sample_name)
        metadata = self.metadata
        existing_index = self.sample_to_colour(sample_name)
        if existing_index is not None:
            raise ValueError("%s already exists in the db" % sample_name)
        else:
            colour = self.get_num_colours()
            if colour is None:
                colour = 0
            else:
                colour = int(colour)
            self.add_sample_metadata(sample_name, 'colour', colour, sync=sync)
            self.set_colour(colour, sample_name, sync=sync)
            metadata.incr('num_colours')
            if sync:
                metadata.sync()
            return colour

    def get_num_colours(self):
        """Return the number of colours (samples), 0 when unset."""
        return int.from_bytes(
            self.metadata.get('num_colours', b'\x00\x00\x00\x00'), 'big')

    def sync(self):
        """Flush both the graph and the metadata stores to disk."""
        self.load_graph().storage.sync()
        self.metadata.sync()

    def delete_all(self):
        """Delete the graph, the metadata, and the index directory itself."""
        self.load_graph().delete_all()
        self.metadata.delete_all()
        os.rmdir(self.db)