Example #1
0
    def _generate_models_from_stem(self):
        '''Populate self.model, the lookup used by the greedy heuristic.

        When root_k is 1, self.model is simply self.nt_freqs. Otherwise
        self.model becomes a dict keyed by every sequence stored under
        self.all_kmers[root_k - 1], each mapped to the alt_model of that k-mer's
        first child (which may be None).

        Raises:
            KmerError: a first child has no alt_model while its inferred_model
                is truthy.
        '''
        if self.root_k == 1:
            self.logger.info(
                "Using simple nucleotide frequencies for all models.")
            self.model = self.nt_freqs
            return

        self.logger.info("Generating k-mer frequency models.")
        self.model = {}
        for stem_seq in self.all_kmers[self.root_k - 1]:
            stem_node = self.access_kmer(stem_seq)
            # Only the first child's model is consulted for each stem.
            first_child = stem_node.children[0]
            if first_child.alt_model is None and first_child.inferred_model:
                raise ku.KmerError(
                    "{} of {} doesn't have a model!!".format(
                        first_child.seq, stem_node.seq))
            self.model[stem_seq] = first_child.alt_model

        self.logger.info("Generated {} k-mer models".format(len(self.model)))
Example #2
0
    def decide_if_should_prune_kmer(self):
        '''Decide whether to prune this k-mer from the k-mer tree.

        The heuristic combines dAIC with the observed/expected ratio and stores
        the verdict in self.should_prune. The k-mer is kept (should_prune is
        False) only when it has a dAIC, a non-zero count, and both its dAIC and
        its obs/exp ratio are strictly greater than their thresholds. Every
        other case is pruned.

        Values exactly equal to a threshold are pruned rather than left
        undecided, so a decision is always reached.
        '''
        if self.dAIC is None or self.count == 0:
            self.should_prune = True
            return

        # NaN (e.g. the result of np.inf - np.inf) compares False against any
        # threshold, so a NaN dAIC or ratio lands on the "prune" side here.
        exceeds_both = (self.dAIC > self.daic_threshold) and (
            self.obs_exp_ratio > self.ratio_threshold)
        self.should_prune = not exceeds_both
Example #3
0
    def _changepoint_calc(self, kmer):
        '''Visit one k-mer of the (non-pruned) tree and report whether it is maximal.

        Uses a D-segment-like* heuristic. A pruned k-mer is reported as maximal
        straight away and none of its children are queued. For an unpruned k-mer
        the children are appended to self._to_dfs, and the k-mer is maximal when
        its segment score is below self.dseg_threshold and its parent is not
        already in self._maximal_kmers.

        *from Phil Green, http://bozeman.mbt.washington.edu/compbio/mbt599/Lecture5.pdf

        :param kmer: a KmerNode object, whose children shall be traversed
        :return: bool: True when the k-mer counts as maximal, False otherwise

        Raises:
            KmerError: the k-mer is unpruned and longer than root_k, yet has no
                children.
        '''
        # Pruned nodes end the walk along this branch.
        if kmer.should_prune:
            return True

        if len(kmer.children) == 0 and kmer.length > self.root_k:
            raise ku.KmerError(
                "Kmer {} is unpruned but has no children! Something weird is going on!"
                .format(kmer.seq))
        self._to_dfs.extend(kmer.children)

        scores_low = kmer.segment_score() < self.dseg_threshold
        if scores_low and kmer.parent not in self._maximal_kmers:
            return True
        return False
Example #4
0
    def _yield_model_for_kmer(self, kmer):
        '''Yield a model dict that conditions appropriately on some k-mer (presumably a parent)

        self.nt_freqs is used directly when root_k is 1, when no k-mer model has
        been built (self.model is None), or when leaf_length <= root_k. Otherwise
        the model is looked up in self.model by the k-mer's stem sequence. If that
        entry is None and the stem sequence is absent from self.genome,
        self.nt_freqs is substituted instead of failing.

        :param kmer: KmerNode that represents the parent of the k-mers to be modeled.
        :return: {str: float}: the model in the form of a dict mapping nts to expected frequencies.

        Raises:
            KmerError: no model could be found, i.e. the selected model is None and
                the nucleotide-frequency substitution did not apply or was also None.
        '''
        if self.root_k == 1 or self.model is None or self.leaf_length <= self.root_k:
            nt_model = self.nt_freqs
        else:
            nt_model = self.model[kmer.stem_seq]

        if nt_model is None and kmer.stem_seq not in self.genome:
            # A stem that never occurs in the genome has no conditional model;
            # fall back to plain nucleotide frequencies.
            self.logger.info(
                "Substituting nucleotide frequencies for k-mer stem "
                "sequence {}, absent from model.".format(kmer.stem_seq))
            nt_model = self.nt_freqs

        # Only fail when nothing usable is left after the fallback.
        if nt_model is None:
            raise ku.KmerError(
                "could not get a model for k-mer {} with stem seq {}".format(
                    kmer.seq, kmer.stem_seq))

        return nt_model
Example #5
0
    def access_kmer(self, kmer_seq):
        '''Look up a single k-mer object by its sequence. A shortcut.

        The sequence's length selects the bucket in self.all_kmers, and the
        sequence itself is the key within that bucket.

        :param kmer_seq (str): a string of nucleotides of length k.
        :return: (Kmer): the k-mer object itself

        Raises:
            KmerError: the sequence is not present in the bucket for its length.
        '''
        same_length = self.all_kmers[len(kmer_seq)]
        if kmer_seq in same_length:
            return same_length[kmer_seq]
        raise ku.KmerError(
            "attempted to access kmer {}, not in kmer tree!".format(kmer_seq))
Example #6
0
    def child_proportion(self, child):
        '''Estimate the proportion of the present k-mer represented by one child.

        The proportions for all children (daughter.count / self.count) are
        computed on first use and cached in self._children_proportions; later
        calls read from that cache.

            Args:
                child (KmerNode): one of the children of the present k-mer

            Returns:
                the cached proportion for the given child

            Raises:
                KmerError: child is not one of self.children
        '''
        if child not in self.children:
            # Name the offending child first, then the parent it was checked against.
            raise ku.KmerError("{} is not a child of {}! can't estimate proportional counts!".format(
                child.seq, self.seq
            ))

        if self._children_proportions is None:
            self._children_proportions = {
                daughter: daughter.count / self.count
                for daughter in self.children
            }
        return self._children_proportions[child]
Example #7
0
    def get_local_models(self):
        '''Check the k-mers in self.freqs before local models are defined.

        Only the length validation is implemented: every key of self.freqs must
        be root_k + 1 long. No model values are computed here yet, and nothing is
        returned.

        Raises:
            KmerError: a k-mer in self.freqs does not have length root_k + 1.
        '''
        expected_length = self.root_k + 1
        for kmer in self.freqs:
            if len(kmer) != expected_length:
                # Report the length that is actually enforced by the check.
                raise ku.KmerError(
                    "Kmer {} is of wrong length! Should be {}.".format(
                        kmer, expected_length))
Example #8
0
 def analyze_leaves(self, model_calc=True):
     '''Run the per-leaf statistical analysis over self._leaf_kmers.

     Each leaf that is not marked for pruning has its sisters populated and,
     when model_calc is true, its dAIC estimated. Pruned leaves are skipped
     (and announced on stdout in debug mode). In debug mode a summary line per
     leaf (seq, obs/exp ratio, count, dAIC, should_prune) is sent to the logger
     at error level once the loop is finished.

     Raises:
         KmerError: a leaf's seq is "root".
     '''
     for node in self._leaf_kmers:
         if node.seq == "root":
             raise ku.KmerError("Root k-mer should not be in leaves!")
         if not node.should_prune:
             node.populate_sisters()
             if model_calc:
                 node.estimate_daic()
         elif self.debug:
             print("pruning {}".format(node.seq))

     if not self.debug:
         return

     summary_rows = []
     for node in self._leaf_kmers:
         columns = [
             node.seq,
             str(node.obs_exp_ratio),
             str(node.count),
             str(node.dAIC),
             str(node.should_prune),
         ]
         summary_rows.append(" ".join(columns))
     self.logger.error(
         "(Not actually an error!)\nLeaf k-mers:\n{}".format(
             "\n".join(summary_rows)))