Example #1
# requires numpy as np, scipy.linalg, and the project-local markovutil module
def stationary_distn_check_helper(pre_Q, codon_distn, branch_length):
    # build the rate matrix for this branch length and exponentiate it
    # to get the transition probability matrix over the branch
    Q = markovutil.pre_Q_to_Q(pre_Q, codon_distn, branch_length)
    P = scipy.linalg.expm(Q)
    # a stationary distribution should be unchanged by one transition step
    next_distn = np.dot(codon_distn, P)
    if not np.allclose(next_distn, codon_distn):
        raise Exception(next_distn - codon_distn)
    print 'stationary distribution is ok'
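
A minimal self-contained sketch of the same stationarity check, using only numpy and scipy rather than the project-local markovutil helper; the 4-state exchangeability matrix and distribution below are made-up illustrative values, not values from the project.

import numpy as np
import scipy.linalg

distn = np.array([0.1, 0.2, 0.3, 0.4])
# symmetric exchangeabilities for a small time-reversible chain
S = np.array([
    [0.0, 1.0, 2.0, 1.0],
    [1.0, 0.0, 1.0, 2.0],
    [2.0, 1.0, 0.0, 1.0],
    [1.0, 2.0, 1.0, 0.0],
    ])
Q = S * distn                    # off-diagonal rates q_ij = s_ij * pi_j
Q -= np.diag(Q.sum(axis=1))      # make each row of the rate matrix sum to zero
P = scipy.linalg.expm(0.5 * Q)   # transition matrix for branch length 0.5
# the stationary distribution is unchanged by one transition step
assert np.allclose(np.dot(distn, P), distn)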
Example #2
    def get_neg_ll(cls,
            patterns, pattern_weights,
            stationary_distn,
            ts, tv, syn, nonsyn,
            theta,
            ):
        """
        This model has only a single omega parameter.
        @param theta: vector of free variables with sensitivities
        """

        # unpack theta: three per-branch log rates followed by log kappa and log omega
        log_mus = theta[0:3]
        log_kappa = theta[3]
        log_omega = theta[4]

        # construct the transition matrices
        transition_matrices = []
        for i in range(3):
            mu = algopy.exp(log_mus[i])
            kappa = algopy.exp(log_kappa)
            omega = algopy.exp(log_omega)
            pre_Q = codon1994.get_pre_Q(
                    ts, tv, syn, nonsyn,
                    stationary_distn,
                    kappa, omega)
            Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, mu)
            P = algopy.expm(Q)
            transition_matrices.append(P)

        # describe the star tree: vertex 3 is the root
        # with child vertices 0, 1, and 2, and the root prior
        # is the stationary distribution
        ov = range(4)
        v_to_children = {3 : [0, 1, 2]}
        de_to_P = {
                (3, 0) : transition_matrices[0],
                (3, 1) : transition_matrices[1],
                (3, 2) : transition_matrices[2],
                }
        root_prior = stationary_distn

        # compute the log likelihood over the site patterns;
        # alignll.fels takes the same arguments and can be swapped in
        log_likelihood = alignll.fast_fels(
                ov, v_to_children, de_to_P, root_prior,
                patterns, pattern_weights,
                )

        # return the neg log likelihood
        neg_ll = -log_likelihood
        print neg_ll
        return neg_ll
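
As a rough guide to what the pruning call above computes, here is a self-contained numpy sketch of the per-pattern star-tree likelihood: sum over the unobserved root state of the root prior times the probability of each observed leaf state. This is an assumption about the semantics of alignll.fast_fels (and it ignores any missing-data handling that routine may support), not its actual implementation.

import numpy as np

def star_tree_log_likelihood(root_prior, transition_matrices,
        patterns, pattern_weights):
    # patterns[i][k] is the observed state at leaf k for site pattern i
    log_likelihood = 0.0
    for pattern, weight in zip(patterns, pattern_weights):
        # probability of the observed leaf states conditioned on each
        # possible root state, weighted by the root prior
        lhood_per_root_state = np.array(root_prior, dtype=float)
        for P, leaf_state in zip(transition_matrices, pattern):
            lhood_per_root_state = lhood_per_root_state * P[:, leaf_state]
        log_likelihood += weight * np.log(lhood_per_root_state.sum())
    return log_likelihood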
Example #3
def main(args):

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
        indices, aminos, codons = zip(*arr)
        if [int(x) for x in indices] != range(len(indices)):
            raise ValueError

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]
    ncodons = len(codons)

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the empirical counts and detect and trim stop codon counts
    empirical_codon_counts = np.loadtxt(args.empirical_codon_counts)
    if len(empirical_codon_counts) < ncodons:
        raise Exception('expected at least %d empirical codon counts' % ncodons)
    if any(empirical_codon_counts[ncodons:]):
        raise Exception('expected the trailing stop codon counts to be zero')
    empirical_codon_counts = empirical_codon_counts[:ncodons]
    log_empirical_codon_counts = np.log(empirical_codon_counts)
    empirical_codon_distn = empirical_codon_counts / float(
            np.sum(empirical_codon_counts))

    # Precompute the distribution over codon substitutions
    # using the stationary distribution and transition matrix
    # of the simulation model given the simulation parameters.
    sim_theta = np.loadtxt(args.simulation_parameter_values)
    log_sim_theta = np.log(sim_theta)
    log_sim_blen = log_sim_theta[0]
    log_sim_theta_model = log_sim_theta[1:]
    sim_model = g_model_name_to_class[args.simulation_model]
    pre_Q = sim_model.get_pre_Q(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )
    stationary_distn = sim_model.get_distn(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )
    # get the rate matrix
    Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, np.exp(log_sim_blen))
    # get the conditional transition matrix
    P = scipy.linalg.expm(Q)
    # get the joint substitution probability matrix
    J = (P.T * stationary_distn).T

    # use an appropriate sample size
    if args.sample_size:
        sample_size = args.sample_size
    else:
        # If the sample size is unspecified, pick a number of sampled
        # codon substitutions such that the implied number of codon
        # observations (two codons per substitution) matches the sum
        # of the user-provided empirical codon counts.
        sample_size = int(np.sum(empirical_codon_counts) / 2)

    with open(args.table_out, 'w') as fout:

        # write the header of the R table file
        header_row = (
                args.first_inference_model.replace('-', '.'),
                args.second_inference_model.replace('-', '.'),
                )
        print >> fout, '\t'.join(header_row)

        for sample_index in range(args.nsamples):

            # sample an (ncodons, ncodons) ndarray of counts
            # using the precomputed multinomial distribution
            subs_counts = np.random.multinomial(
                    sample_size,
                    J.flat,
                    ).reshape(J.shape).astype(float)

            # compute the neg log likelihoods for the two models
            neg_log_likelihoods = []
            for model_name in (
                    args.first_inference_model,
                    args.second_inference_model,
                    ):
                model = g_model_name_to_class[model_name]
                min_neg_ll = get_min_neg_log_likelihood(
                        model,
                        subs_counts,
                        ts, tv, syn, nonsyn, compo, asym_compo,
                        args.minimization_method,
                        )
                neg_log_likelihoods.append(min_neg_ll)

            # write the data row
            data_row = (
                    sample_index+1,
                    -neg_log_likelihoods[0],
                    -neg_log_likelihoods[1],
                    )
            print >> fout, '\t'.join(str(x) for x in data_row)
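
The resampling step inside the loop above can be isolated into a short sketch: given a stationary distribution and a conditional transition matrix, the joint codon-pair probabilities are J[i, j] = pi[i] * P[i, j], and each replicate is a single multinomial draw over the flattened matrix. The 2-state numbers here are illustrative only.

import numpy as np
import scipy.linalg

pi = np.array([0.3, 0.7])
Q = np.array([
    [-0.7,  0.7],
    [ 0.3, -0.3],
    ])                               # pi is stationary for this rate matrix
P = scipy.linalg.expm(0.5 * Q)       # conditional transition matrix
J = (P.T * pi).T                     # joint probabilities; row i sums to pi[i]
sample_size = 1000
subs_counts = np.random.multinomial(
        sample_size, J.flat).reshape(J.shape).astype(float)
print(subs_counts)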