def seq_err(self, member):
    """
    Compute the sequencing-error probability vector for one family member.

    Every chromosome is assumed equally likely to be sequenced. Each entry
    is taken from a Dirichlet multinomial distribution, which is where this
    model diverges from the Cartwright et al. paper referenced by the other
    functions.

    Side effect: the max element reported by ut.normalspace is appended to
    self.max_elems so that the scaling of the probability matrix can be
    manipulated later.

    Args:
        member: Integer index into self.reads selecting the read counts of
            a family member in the trio model.

    Returns:
        Rescaled probability vector with ut.GENOTYPE_COUNT entries
        (documented as 1 x 16), to be multiplied by a transition matrix.
    """
    # TODO: add bias when alpha freq are added
    genotype_alphas = ut.get_alphas(self.seq_err_rate) * self.dm_disp

    # One slot per genotype; slots without a matching alpha row stay zero.
    log_probs = np.zeros(ut.GENOTYPE_COUNT)
    for genotype_idx, alpha_row in enumerate(genotype_alphas):
        log_probs[genotype_idx] = ut.dirichlet_multinomial(
            alpha_row, self.reads[member]
        )

    # normalspace hands back the rescaled vector together with its max
    # element; presumably it converts out of log space -- confirm in ut.
    rescaled = ut.normalspace(log_probs)
    probs, peak = rescaled
    self.max_elems.append(peak)
    return probs
def dm_sample(self, soma_idx):
    """
    Simulate sequencing reads for a somatic genotype (Dirichlet multinomial).

    The alpha frequencies belonging to the somatic genotype parameterize a
    Dirichlet draw of nucleotide frequencies; those frequencies are then
    used to draw read counts at the configured coverage (self.cov).

    Args:
        soma_idx: Index of the somatic genotype, used to select the
            matching row of alpha frequencies.

    Returns:
        Array containing read counts [#A, #C, #G, #T].
    """
    model = self.trio_model
    genotype_alphas = ut.get_alphas(model.seq_err_rate) * model.dm_disp

    # Stage 1: sample nucleotide frequencies for the chosen genotype.
    nucleotide_freqs = np.random.dirichlet(genotype_alphas[soma_idx])

    # Stage 2: distribute self.cov reads according to those frequencies.
    read_counts = np.random.multinomial(self.cov, nucleotide_freqs)
    return read_counts