コード例 #1
0
ファイル: trio_model.py プロジェクト: maip/novo-muta
    def seq_err(self, member):
        """
        Compute the rescaled sequencing-error probabilities for one member.

        Every chromosome is assumed to be equally likely to be sequenced.

        Each entry is obtained from a Dirichlet multinomial evaluated on the
        member's read counts. This departs from the Cartwright et al. paper
        referenced by the other functions.

        The max element reported by the rescaling step is appended to
        max_elems, so that the scaling of the probability matrix can be
        adjusted later on.

        Args:
            member: Integer index into self.reads selecting the read counts
                of one family member in the trio model.

        Returns:
            1 x 16 probability vector (one entry per genotype) that still
            needs to be multiplied by a transition matrix.
        """
        # TODO: add bias when alpha freq are added
        # One alpha row per genotype index, scaled by self.dm_disp.
        scaled_alphas = ut.get_alphas(self.seq_err_rate) * self.dm_disp

        log_probs = np.zeros(ut.GENOTYPE_COUNT)
        for genotype_idx, alpha_row in enumerate(scaled_alphas):
            log_probs[genotype_idx] = ut.dirichlet_multinomial(
                alpha_row, self.reads[member]
            )

        # normalspace hands back the rescaled vector together with the max
        # element; the latter is recorded on the instance for later use.
        rescaled, peak = ut.normalspace(log_probs)
        self.max_elems.append(peak)
        return rescaled
コード例 #2
0
    def seq_err(self, member):
        """
        Compute the rescaled sequencing-error probabilities for one member.

        Every chromosome is assumed to be equally likely to be sequenced.

        Each entry is obtained from a Dirichlet multinomial evaluated on the
        member's read counts. This departs from the Cartwright et al. paper
        referenced by the other functions.

        The max element reported by the rescaling step is appended to
        max_elems, so that the scaling of the probability matrix can be
        adjusted later on.

        Args:
            member: Integer index into self.reads selecting the read counts
                of one family member in the trio model.

        Returns:
            1 x 16 probability vector (one entry per genotype) that still
            needs to be multiplied by a transition matrix.
        """
        # TODO: add bias when alpha freq are added
        # One alpha row per genotype index, scaled by self.dm_disp.
        scaled_alphas = ut.get_alphas(self.seq_err_rate) * self.dm_disp

        log_probs = np.zeros(ut.GENOTYPE_COUNT)
        for genotype_idx, alpha_row in enumerate(scaled_alphas):
            log_probs[genotype_idx] = ut.dirichlet_multinomial(
                alpha_row, self.reads[member]
            )

        # normalspace hands back the rescaled vector together with the max
        # element; the latter is recorded on the instance for later use.
        rescaled, peak = ut.normalspace(log_probs)
        self.max_elems.append(peak)
        return rescaled
コード例 #3
0
ファイル: trio_model.py プロジェクト: maip/novo-muta
    def pop_sample(self):
        """
        Compute the joint genotype probabilities for the two parents.

        The multinomial component of the model generates the nucleotide
        frequency parameter vector (alpha_A, alpha_C, alpha_G, alpha_T) based
        on the nucleotide count input data.

        Probabilities are drawn from a Dirichlet multinomial distribution.
        The Dirichlet component of our models uses this frequency parameter
        vector in addition to the mutation rate (theta), nucleotide
        frequencies [alpha_A, alpha_C, alpha_G, alpha_T], and genome
        nucleotide counts [n_A, n_C, n_G, n_T].

        For example: The genome mutation rate (theta) may be the small scalar
        quantity theta = 0.00025, the frequency parameter vector
        (alpha_A, alpha_C, alpha_G, alpha_T) = (0.25, 0.25, 0.25, 0.25),
        the genome nucleotide counts (n_A, n_C, n_G, n_T) = (4, 0, 0, 0), for
        the event that both the mother and the father have genotype AA,
        resulting in N = 4.

        Note: This model does not follow that of the Cartwright paper
        mentioned in other functions.

        Side effect: sets self.geno to the 1 x 16 probability matrix for a
        single parent, obtained by summing the joint matrix over its first
        axis.

        Returns:
            1 x 256 probability matrix in normal (non-log) space -- each log
            probability returned by the Dirichlet multinomial is
            exponentiated before it is stored -- where the (i, j) element in
            the matrix is the probability that the mother has genotype i and
            the father has genotype j where i, j in
            {AA, AC, AG, AT,
             CA, CC, CG, CT,
             GA, GC, GG, GT,
             TA, TC, TG, TT}.

            The matrix is an order-relevant representation of the possible
            events in the sample space that covers all possible parent
            genotype combinations. For example:

            [P(AAAA), P(AAAC), P(AAAG), P(AAAT), P(AACA), P(AACC), P(AACG)...]
        """
        # combine parameters for call to dirichlet multinomial: each of the
        # four nucleotides gets the same weight, 0.25 * pop_muta_rate
        muta_nt_freq = np.full(4, 0.25 * self.pop_muta_rate)
        gt_count = ut.two_parent_counts()
        prob_mat = np.zeros((ut.GENOTYPE_COUNT, ut.GENOTYPE_COUNT))
        for i in range(ut.GENOTYPE_COUNT):
            for j in range(ut.GENOTYPE_COUNT):
                nt_count = gt_count[i, j, :]  # count per 2-allele genotype
                log_proba = ut.dirichlet_multinomial(muta_nt_freq, nt_count)
                # stored in normal space, not log space
                prob_mat[i, j] = np.exp(log_proba)

        self.geno = np.sum(prob_mat, axis=0)  # set one parent prob mat
        return prob_mat.flatten()
コード例 #4
0
    def pop_sample(self):
        """
        Compute the joint genotype probabilities for the two parents.

        The multinomial component of the model generates the nucleotide
        frequency parameter vector (alpha_A, alpha_C, alpha_G, alpha_T) based
        on the nucleotide count input data.

        Probabilities are drawn from a Dirichlet multinomial distribution.
        The Dirichlet component of our models uses this frequency parameter
        vector in addition to the mutation rate (theta), nucleotide
        frequencies [alpha_A, alpha_C, alpha_G, alpha_T], and genome
        nucleotide counts [n_A, n_C, n_G, n_T].

        For example: The genome mutation rate (theta) may be the small scalar
        quantity theta = 0.00025, the frequency parameter vector
        (alpha_A, alpha_C, alpha_G, alpha_T) = (0.25, 0.25, 0.25, 0.25),
        the genome nucleotide counts (n_A, n_C, n_G, n_T) = (4, 0, 0, 0), for
        the event that both the mother and the father have genotype AA,
        resulting in N = 4.

        Note: This model does not follow that of the Cartwright paper
        mentioned in other functions.

        Side effect: sets self.geno to the 1 x 16 probability matrix for a
        single parent, obtained by summing the joint matrix over its first
        axis.

        Returns:
            1 x 256 probability matrix in normal (non-log) space -- each log
            probability returned by the Dirichlet multinomial is
            exponentiated before it is stored -- where the (i, j) element in
            the matrix is the probability that the mother has genotype i and
            the father has genotype j where i, j in
            {AA, AC, AG, AT,
             CA, CC, CG, CT,
             GA, GC, GG, GT,
             TA, TC, TG, TT}.

            The matrix is an order-relevant representation of the possible
            events in the sample space that covers all possible parent
            genotype combinations. For example:

            [P(AAAA), P(AAAC), P(AAAG), P(AAAT), P(AACA), P(AACC), P(AACG)...]
        """
        # combine parameters for call to dirichlet multinomial: each of the
        # four nucleotides gets the same weight, 0.25 * pop_muta_rate
        muta_nt_freq = np.full(4, 0.25 * self.pop_muta_rate)
        gt_count = ut.two_parent_counts()
        prob_mat = np.zeros((ut.GENOTYPE_COUNT, ut.GENOTYPE_COUNT))
        for i in range(ut.GENOTYPE_COUNT):
            for j in range(ut.GENOTYPE_COUNT):
                nt_count = gt_count[i, j, :]  # count per 2-allele genotype
                log_proba = ut.dirichlet_multinomial(muta_nt_freq, nt_count)
                # stored in normal space, not log space
                prob_mat[i, j] = np.exp(log_proba)

        self.geno = np.sum(prob_mat, axis=0)  # set one parent prob mat
        return prob_mat.flatten()