def _generate_models_from_stem(self):
    '''Build the nucleotide models used by the greedy heuristic.

    Meant to run after initialization; the result is stored in
    ``self.model``.

    * When ``self.root_k`` is 1, ``self.nt_freqs`` is used as the model
      for everything.
    * Otherwise each k-mer sequence found in
      ``self.all_kmers[self.root_k - 1]`` is mapped to the ``alt_model``
      of that k-mer's first child, i.e. the (k-1)-mer keys a
      distribution over the terminal nucleotide.

    (Models for stems shorter than k are taken from ``self.nt_freqs``.)

    Raises:
        KmerError: the first child of a conditioning k-mer has no
            ``alt_model`` although its ``inferred_model`` is truthy.
    '''
    if self.root_k == 1:
        self.logger.info(
            "Using simple nucleotide frequencies for all models.")
        self.model = self.nt_freqs
        return

    self.logger.info("Generating k-mer frequency models.")
    self.model = {}
    for stem_seq in self.all_kmers[self.root_k - 1]:
        stem_node = self.access_kmer(stem_seq)
        first_child = stem_node.children[0]
        # A missing model is only an error when one was supposed to
        # have been inferred for this child.
        if first_child.alt_model is None and first_child.inferred_model:
            raise ku.KmerError(
                "{} of {} doesn't have a model!!".format(
                    stem_node.children[0].seq, stem_node.seq))
        self.model[stem_seq] = first_child.alt_model
    n_models = len(self.model.keys())
    self.logger.info("Generated {} k-mer models".format(n_models))
def decide_if_should_prune_kmer(self):
    '''Decide whether this k-mer should be pruned from the k-mer tree.

    The heuristic combines dAIC (the model for ALL the sisters) with the
    observed/expected ratio of this specific k-mer. The outcome is
    written to ``self.should_prune``; nothing is returned.

    The k-mer is kept (``should_prune = False``) only when *both*
    ``self.dAIC`` is strictly greater than ``self.daic_threshold`` and
    ``self.obs_exp_ratio`` is strictly greater than
    ``self.ratio_threshold``. In every other case it is pruned:

    * ``self.dAIC`` is ``None``;
    * ``self.count`` is 0;
    * either value is at or below its threshold;
    * either value is NaN (e.g. from ``np.inf - np.inf``), because any
      comparison against NaN is false.

    Values sitting exactly on a threshold used to match no branch at
    all, leaving a stale ``should_prune`` or raising a spurious
    "unable to decide" error; they are now treated as "prune".
    '''
    if self.dAIC is None or self.count == 0:
        self.should_prune = True
        return

    # Short-circuit on purpose: the ratio is only consulted once the
    # dAIC criterion has been met.
    keep = (self.dAIC > self.daic_threshold) and (
        self.obs_exp_ratio > self.ratio_threshold)
    self.should_prune = not keep
def _changepoint_calc(self, kmer):
    '''Visit one k-mer of the (non-pruned) tree and report if it is maximal.

    Part of a D-segment-like* traversal that collects notable k-mers,
    where "notable" means "accounting for a larger fraction of its
    subtree than its children". For an unpruned k-mer, its children are
    appended to ``self._to_dfs`` so the caller can keep exploring.

    *from Phil Green,
    http://bozeman.mbt.washington.edu/compbio/mbt599/Lecture5.pdf

    :param kmer: a KmerNode object, whose children shall be traversed
    :return: bool: True when ``kmer`` is flagged ``should_prune``, or when
        its ``segment_score()`` is below ``self.dseg_threshold`` and its
        parent is not in ``self._maximal_kmers``; False otherwise.

    Raises:
        KmerError: the k-mer is unpruned, longer than ``self.root_k`` and
            has no children.
    '''
    # A pruned k-mer is a tip: report it and do not queue anything below it.
    if kmer.should_prune:
        return True

    offspring = kmer.children
    if len(offspring) == 0 and kmer.length > self.root_k:
        raise ku.KmerError(
            "Kmer {} is unpruned but has no children! Something weird is going on!"
            .format(kmer.seq))

    # Queue every child for the depth-first search.
    self._to_dfs.extend(offspring)

    if (kmer.segment_score() < self.dseg_threshold
            and kmer.parent not in self._maximal_kmers):
        return True
    return False
def _yield_model_for_kmer(self, kmer):
    '''Return the nucleotide model that conditions on a k-mer's stem.

    ``self.nt_freqs`` is used directly when ``self.root_k`` is 1, when no
    model has been built (``self.model is None``), or when
    ``self.leaf_length <= self.root_k``. Otherwise the model stored under
    ``kmer.stem_seq`` in ``self.model`` is looked up. If that stored
    model is ``None`` and the stem sequence is not in ``self.genome``,
    ``self.nt_freqs`` is substituted and the substitution is logged.

    Previously the substitution was followed by an unconditional raise,
    so the fallback could never take effect; the error is now raised
    only when no model is available after the fallback.

    :param kmer: KmerNode that represents the parent of the k-mers to be
        modeled.
    :return: {str: float}: the model in the form of a dict mapping nts to
        expected frequencies.

    Raises:
        KmerError: no model could be obtained for the stem of the k-mer.
    '''
    use_plain_freqs = (self.root_k == 1 or self.model is None
                       or self.leaf_length <= self.root_k)
    if use_plain_freqs:
        nt_model = self.nt_freqs
    else:
        nt_model = self.model[kmer.stem_seq]

    # Fall back to plain nucleotide frequencies for stems the genome
    # never contained; those legitimately have no model of their own.
    if nt_model is None and kmer.stem_seq not in self.genome:
        self.logger.info(
            "Substituting nucleotide frequencies for k-mer stem "
            "sequence {}, absent from model.".format(kmer.stem_seq))
        nt_model = self.nt_freqs

    if nt_model is None:
        raise ku.KmerError(
            "could not get a model for k-mer {} with stem seq {}".format(
                kmer.seq, kmer.stem_seq))
    return nt_model
def access_kmer(self, kmer_seq):
    '''Fetch a single k-mer object directly by its sequence. A shortcut.

    The lookup goes through ``self.all_kmers``, indexed first by the
    length of the sequence and then by the sequence itself.

    :param kmer_seq (str): a string of nucleotides of length k.
    :return: (Kmer): the k-mer object itself

    Raises:
        KmerError: the sequence is not among the stored k-mers of its
            length.
    '''
    same_length = self.all_kmers[len(kmer_seq)]
    if kmer_seq in same_length:
        return same_length[kmer_seq]
    raise ku.KmerError(
        "attempted to access kmer {}, not in kmer tree!".format(kmer_seq))
def child_proportion(self, child):
    '''Estimate the proportion of the present k-mer represented by a child.

    The proportions of all children (``daughter.count / self.count``) are
    computed on first use and cached in ``self._children_proportions``;
    later calls only read the cache.

    When this k-mer's own count is 0 every child is assigned a proportion
    of 0.0 rather than raising ``ZeroDivisionError``.

    Args:
        child (KmerNode): one of the children of the present k-mer

    Returns:
        float: the child's count as a fraction of this k-mer's count.

    Raises:
        KmerError: ``child`` is not one of this k-mer's children.
    '''
    if child not in self.children:
        # Name the offending child first, then the k-mer it was looked up on.
        raise ku.KmerError(
            "{} is not a child of {}! can't estimate proportional counts!"
            .format(child.seq, self.seq))

    if self._children_proportions is None:
        parent_count = self.count
        self._children_proportions = {
            daughter: (daughter.count / parent_count if parent_count else 0.0)
            for daughter in self.children
        }
    return self._children_proportions[child]
def get_local_models(self):
    '''Validate the k-mers that local models are based on (if needed).

    Checks that every k-mer in ``self.freqs`` is exactly
    ``self.root_k + 1`` long. Nothing is returned and ``self.model`` is
    not modified.

    NOTE(review): the per-stem normalization this method was apparently
    meant to perform was never finished (it existed only as
    commented-out code), so the method currently validates and nothing
    more. The unused total over ``self.model.values()`` has been
    dropped: it was never read, and summing fails when the model values
    are per-stem dicts or None.

    Raises:
        KmerError: a k-mer in ``self.freqs`` has the wrong length.
    '''
    expected_length = self.root_k + 1
    for kmer in self.freqs:
        if len(kmer) != expected_length:
            # Report the length actually required by the check above.
            raise ku.KmerError(
                "Kmer {} is of wrong length! Should be {}.".format(
                    kmer, expected_length))
def analyze_leaves(self, model_calc=True):
    '''Run the statistical analysis of every leaf k-mer on a genome.

    Each leaf in ``self._leaf_kmers`` that is not flagged ``should_prune``
    has its sisters populated and, if requested, its dAIC estimated,
    filling in attributes of KmerNode() relative to its sisters in the
    tree. Leaves already flagged for pruning are skipped (and announced
    on stdout in debug mode).

    When ``self.debug`` is set, a one-line-per-leaf summary (sequence,
    obs/exp ratio, count, dAIC, prune flag) is emitted through
    ``self.logger.error`` once all leaves have been processed.

    :param model_calc (bool): whether to call ``estimate_daic()`` on each
        analysed leaf.

    Raises:
        KmerError: the root k-mer is present among the leaves.
    '''
    for leaf in self._leaf_kmers:
        if leaf.seq == "root":
            raise ku.KmerError("Root k-mer should not be in leaves!")
        if not leaf.should_prune:
            leaf.populate_sisters()
            if model_calc:
                leaf.estimate_daic()
        elif self.debug:
            print("pruning {}".format(leaf.seq))

    if not self.debug:
        return

    # Logged at error level so the summary is shown regardless of the
    # logger's configured verbosity.
    rows = []
    for k in self._leaf_kmers:
        fields = [
            k.seq,
            str(k.obs_exp_ratio),
            str(k.count),
            str(k.dAIC),
            str(k.should_prune),
        ]
        rows.append(" ".join(fields))
    self.logger.error(
        "(Not actually an error!)\nLeaf k-mers:\n{}".format("\n".join(rows)))