def seq_err(self, member):
    """
    Compute the sequencing-error probability vector for one family member.

    Each chromosome is assumed equally likely to be sequenced. For every
    row of the alpha matrix (ut.get_alphas(self.seq_err_rate) scaled by
    self.dm_disp) a Dirichlet-multinomial log probability of the member's
    read counts is evaluated. This is a point of divergence from the
    Cartwright et al. paper mentioned in the other functions.

    The vector of log probabilities is then passed through ut.normalspace,
    which returns the rescaled vector together with the max element. The
    max element is appended to self.max_elems so that the scaling of the
    probability matrix can be manipulated later.

    Args:
        member: Integer index into self.reads selecting the read counts
            of one family member in the trio model.

    Returns:
        Probability vector of length ut.GENOTYPE_COUNT (1 x 16), as
        rescaled by ut.normalspace, that needs to be multiplied by a
        transition matrix.
    """
    # TODO: add bias when alpha freq are added
    scaled_alphas = ut.get_alphas(self.seq_err_rate) * self.dm_disp

    # One slot per genotype; rows not produced by the loop remain 0.
    log_probs = np.zeros(ut.GENOTYPE_COUNT)
    for idx, alpha_row in enumerate(scaled_alphas):
        log_probs[idx] = ut.dirichlet_multinomial(
            alpha_row, self.reads[member]
        )

    rescaled, peak = ut.normalspace(log_probs)
    # Remember the scaling element so callers can adjust the scale later.
    self.max_elems.append(peak)
    return rescaled
def pop_sample(self):
    """
    Build the joint prior over the two parents' genotypes.

    For every ordered pair of parent genotypes (i, j), the nucleotide
    counts of the pair, taken from ut.two_parent_counts(), are scored
    with a Dirichlet multinomial whose parameter vector is the mutation
    rate (theta, self.pop_muta_rate) multiplied by the nucleotide
    frequencies [alpha_A, alpha_C, alpha_G, alpha_T], which are fixed
    here at 0.25 each.

    For example: the genome mutation rate may be the small scalar
    quantity theta = 0.00025, the frequency parameter vector
    (alpha_A, alpha_C, alpha_G, alpha_T) = (0.25, 0.25, 0.25, 0.25), and
    the genome nucleotide counts (n_A, n_C, n_G, n_T) = (4, 0, 0, 0) for
    the event that both the mother and the father have genotype AA,
    resulting in N = 4.

    Note: This model does not follow that of the Cartwright paper
    mentioned in other functions.

    Side effect: sets self.geno to the sum of the joint matrix over its
    first axis, used as the probability vector for a single parent.

    Returns:
        Flattened (row-major) GENOTYPE_COUNT x GENOTYPE_COUNT matrix
        (1 x 256 for 16 genotypes) of probabilities. Values are in
        ordinary probability space, not log space: each log probability
        from ut.dirichlet_multinomial is exponentiated before storage.
        The (i, j) element is the probability that the mother has
        genotype i and the father has genotype j, where i, j in
        {AA, AC, AG, AT, CA, CC, CG, CT, GA, GC, GG, GT, TA, TC, TG, TT}.
        The result is an order-relevant representation of the possible
        events in the sample space that covers all possible parent
        genotype combinations. For example:
        [P(AAAA), P(AAAC), P(AAAG), P(AAAT), P(AACA), P(AACC), ...]
    """
    # Dirichlet-multinomial parameters: theta * 0.25 for each nucleotide.
    muta_nt_freq = np.full(4, 0.25 * self.pop_muta_rate)
    gt_count = ut.two_parent_counts()

    n_geno = ut.GENOTYPE_COUNT
    prob_mat = np.zeros((n_geno, n_geno))
    for i in range(n_geno):
        for j in range(n_geno):
            # Nucleotide counts for the pair of 2-allele genotypes (i, j).
            nt_count = gt_count[i, j, :]
            log_proba = ut.dirichlet_multinomial(muta_nt_freq, nt_count)
            # Stored in normal (not log) probability space.
            prob_mat[i, j] = np.exp(log_proba)

    # Collapse the first axis to get the single-parent probability vector.
    self.geno = np.sum(prob_mat, axis=0)
    return prob_mat.flatten()