Ejemplo n.º 1
0
def main(args):
    """Fit a two-taxon codon model by maximum likelihood and report the fit.

    The genetic code is read from ``args.code`` as tab-separated rows of
    (index, amino acid, codon), and the codon substitution counts are read
    from ``args.count_matrix``.  The trailing stop codons are dropped from
    both.  The negative log likelihood of ``args.model`` is then minimized
    with ``scipy.optimize.minimize`` using ``args.minimization_method``,
    and the estimates are printed together with standard errors taken from
    the inverse of the Hessian at the optimum.

    If ``args.neg_log_likelihood_out``, ``args.parameter_estimates_out`` or
    ``args.parameter_errors_out`` is set, the corresponding values are also
    written to that file, one value per line.

    Raises:
        ValueError: if the index column of the genetic code is not
            0, 1, 2, ... in order.
        Exception: if the genetic code does not have 2, 3 or 4 stop codons
            or if they are not the last rows.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)
    # list(range(...)) keeps this comparison meaningful on Python 3,
    # where a list never compares equal to a range object
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code rows to be indexed 0, 1, 2, ...')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    # and trim the stop codons, which occupy the last rows and columns
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)
    subs_counts = subs_counts[:-nstop, :-nstop]

    # compute some summaries of the observed codon substitutions;
    # each codon is counted both as a row state and as a column state
    counts = np.sum(subs_counts, axis=0) + np.sum(subs_counts, axis=1)
    log_counts = np.log(counts)
    empirical_codon_distn = counts / float(np.sum(counts))

    # make a crude guess of the expected number of changes
    log_blen = np.log(guess_branch_length(subs_counts))

    # use the chosen model to construct an initial guess for max likelihood;
    # slot 0 holds the log branch length, the rest the encoded model params
    model_natural_guess = args.model.get_natural_guess()
    encoded_guess = np.empty(len(model_natural_guess) + 1, dtype=float)
    encoded_guess[0] = log_blen
    encoded_guess[1:] = args.model.natural_to_encoded(model_natural_guess)

    # arguments shared by the likelihood and by the model summaries
    design_args = (
            log_counts, empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            )

    # construct the neg log likelihood non-free params
    neg_ll_args = (args.model, subs_counts) + design_args

    # define the objective function and the gradient and hessian
    f_encoded_theta = functools.partial(
            get_two_taxon_neg_ll_encoded_theta, *neg_ll_args)
    g_encoded_theta = functools.partial(eval_grad, f_encoded_theta)
    h_encoded_theta = functools.partial(eval_hess, f_encoded_theta)

    # do the search, using information about the gradient and hessian
    results = scipy.optimize.minimize(
            f_encoded_theta,
            encoded_guess,
            method=args.minimization_method,
            jac=g_encoded_theta,
            hess=h_encoded_theta,
            )

    # extract and decode the maximum likelihood estimates
    encoded_xopt = results.x
    mle_blen = np.exp(encoded_xopt[0])
    model_xopt = args.model.encoded_to_natural(encoded_xopt[1:])
    xopt = np.empty_like(encoded_xopt)
    xopt[0] = mle_blen
    xopt[1:] = model_xopt

    # check that the stationary distribution is ok
    model_args = design_args + (model_xopt,)
    mle_distn = args.model.get_distn(*model_args)
    mle_pre_Q = args.model.get_pre_Q(*model_args)
    stationary_distn_check_helper(mle_pre_Q, mle_distn, mle_blen)

    # define the hessian of the neg log likelihood in natural parameters
    f = functools.partial(get_two_taxon_neg_ll, *neg_ll_args)
    h = functools.partial(eval_hess, f)

    # report a summary of the maximum likelihood search;
    # print('') emits a blank line on both Python 2 and Python 3
    print('raw results from the minimization:')
    print(results)
    print('')
    print('max likelihood branch length (expected number of substitutions):')
    print(mle_blen)
    print('')
    print('max likelihood estimates of other model parameters:')
    print(model_xopt)
    print('')

    # print the hessian matrix at the max likelihood parameter values
    fisher_info = h(xopt)
    cov = scipy.linalg.inv(fisher_info)
    errors = np.sqrt(np.diag(cov))
    print('observed fisher information matrix:')
    print(fisher_info)
    print('')
    print('inverse of fisher information matrix:')
    print(cov)
    print('')
    print('standard error estimates (sqrt of diag of inv of fisher info)')
    print(errors)
    print('')

    # write the neg log likelihood into a separate file
    if args.neg_log_likelihood_out:
        with open(args.neg_log_likelihood_out, 'w') as fout:
            fout.write(str(results.fun) + '\n')

    # write the parameter estimates into a separate file
    if args.parameter_estimates_out:
        with open(args.parameter_estimates_out, 'w') as fout:
            for value in xopt:
                fout.write(str(value) + '\n')

    # write the standard error estimates into a separate file
    if args.parameter_errors_out:
        with open(args.parameter_errors_out, 'w') as fout:
            for value in errors:
                fout.write(str(value) + '\n')
Ejemplo n.º 2
0
def main(args):
    """Simulate substitution counts and compare two inference models.

    The genetic code is read from ``args.code`` and the per-codon counts
    from ``args.empirical_codon_counts``.  The simulation model named by
    ``args.simulation_model`` is evaluated at the parameter values in
    ``args.simulation_parameter_values`` (branch length first) to obtain a
    joint distribution over codon substitutions.  ``args.nsamples``
    count matrices are drawn from that distribution, and for each one the
    minimized negative log likelihood is computed under
    ``args.first_inference_model`` and ``args.second_inference_model``
    using ``args.minimization_method``.

    The log likelihoods are written to ``args.table_out`` as a
    tab-separated table whose header names the two models (with '-'
    replaced by '.') and whose rows start with the 1-based sample number.

    Raises:
        ValueError: if the index column of the genetic code is not
            0, 1, 2, ... in order.
        Exception: if the stop codons are malformed, if fewer counts than
            non-stop codons are provided, or if any count past the
            non-stop codons is nonzero.
        KeyError: presumably, if a model name is not a key of
            ``g_model_name_to_class``.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)
    # list(range(...)) keeps this comparison meaningful on Python 3,
    # where a list never compares equal to a range object
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code rows to be indexed 0, 1, 2, ...')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]
    ncodons = len(codons)

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the empirical counts and detect and trim stop codon counts
    empirical_codon_counts = np.loadtxt(args.empirical_codon_counts)
    if len(empirical_codon_counts) < ncodons:
        raise Exception(
                'expected at least %d empirical codon counts' % ncodons)
    if any(empirical_codon_counts[ncodons:]):
        raise Exception(
                'expected the counts after the first %d codons '
                'to be zero' % ncodons)
    empirical_codon_counts = empirical_codon_counts[:ncodons]
    log_empirical_codon_counts = np.log(empirical_codon_counts)
    empirical_codon_distn = empirical_codon_counts / float(
            np.sum(empirical_codon_counts))

    # Precompute the distribution over codon substitutions
    # using the stationary distribution and transition matrix
    # of the simulation model given the simulation parameters.
    # The first simulation parameter is the branch length.
    sim_theta = np.loadtxt(args.simulation_parameter_values)
    log_sim_theta = np.log(sim_theta)
    log_sim_blen = log_sim_theta[0]
    sim_model = g_model_name_to_class[args.simulation_model]
    sim_model_args = (
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta[1:],
            )
    pre_Q = sim_model.get_pre_Q(*sim_model_args)
    stationary_distn = sim_model.get_distn(*sim_model_args)
    # get the rate matrix
    Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, np.exp(log_sim_blen))
    # get the conditional transition matrix
    P = scipy.linalg.expm(Q)
    # get the joint substitution probability matrix
    # by scaling each row of P by the probability of its initial state
    J = (P.T * stationary_distn).T

    # use an appropriate sample size
    if args.sample_size:
        sample_size = args.sample_size
    else:
        # If the sample size is unspecified,
        # use a sample size whose number of codon counts
        # matches the sum of the user-provided empirical codon counts.
        # NOTE(review): the division by 2 presumably reflects that each
        # sampled substitution contributes two codon observations.
        sample_size = int(np.sum(empirical_codon_counts) / 2)

    # look up both inference models once, before creating the output file,
    # so that an unknown model name fails early
    inference_model_names = (
            args.first_inference_model,
            args.second_inference_model,
            )
    inference_models = [
            g_model_name_to_class[name] for name in inference_model_names]

    with open(args.table_out, 'w') as fout:

        # write the header of the R table file;
        # it has one field fewer than the data rows
        header_row = [
                name.replace('-', '.') for name in inference_model_names]
        fout.write('\t'.join(header_row) + '\n')

        for sample_index in range(args.nsamples):

            # sample an (ncodons, ncodons) ndarray of counts
            # using the precomputed multinomial distribution
            subs_counts = np.random.multinomial(
                    sample_size,
                    J.flat,
                    ).reshape(J.shape).astype(float)

            # compute the neg log likelihoods for the two models
            neg_log_likelihoods = [
                    get_min_neg_log_likelihood(
                        model,
                        subs_counts,
                        ts, tv, syn, nonsyn, compo, asym_compo,
                        args.minimization_method,
                        )
                    for model in inference_models]

            # write the data row
            data_row = (
                    sample_index+1,
                    -neg_log_likelihoods[0],
                    -neg_log_likelihoods[1],
                    )
            fout.write('\t'.join(str(x) for x in data_row) + '\n')
def main(args):
    """Iteratively fit a two-component mixture codon model and print progress.

    The genetic code is read from ``args.code`` and the codon substitution
    counts from ``args.count_matrix``; the trailing stop codons are dropped
    from both.  Ten iterations are then run.  Each iteration computes
    posterior mixture probabilities and distributions from the current
    guesses, minimizes the negative log likelihood of ``args.model`` with
    ``scipy.optimize.minimize`` using ``args.minimization_method``, and
    prints the results.  The branch length and the mixture priors are
    carried over to the next iteration.

    Raises:
        ValueError: if the index column of the genetic code is not
            0, 1, 2, ... in order.
        Exception: if the genetic code does not have 2, 3 or 4 stop codons
            or if they are not the last rows.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)
    # list(range(...)) keeps this comparison meaningful on Python 3,
    # where a list never compares equal to a range object
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code rows to be indexed 0, 1, 2, ...')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))
    design_args = (ts, tv, syn, nonsyn, compo, asym_compo)

    # read the (nstates, nstates) array of observed codon substitutions
    # and trim the stop codons, which occupy the last rows and columns
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)
    subs_counts = subs_counts[:-nstop, :-nstop]

    # compute some summaries of the observed codon substitutions
    counts = np.sum(subs_counts, axis=0) + np.sum(subs_counts, axis=1)
    log_counts = np.log(counts)
    empirical_codon_distn = counts / float(np.sum(counts))

    # make crude guesses about parameter values
    blen = markovutil.guess_branch_length(subs_counts)
    theta = args.model.get_natural_guess()

    # Get the initial guesses for the EM parameters:
    # two equally weighted components that both start
    # at the empirical codon distribution.
    prior_probs = np.array([0.5, 0.5], dtype=float)
    prior_em_distns = np.vstack((
        empirical_codon_distn,
        empirical_codon_distn,
        ))

    # iteratively compute parameter estimates
    for em_iteration_index in range(10):

        # given parameter guesses, compute the pre-rate matrices
        mixture_args = (prior_probs, prior_em_distns) + design_args + (theta,)
        pre_Qs = args.model.get_pre_Qs(*mixture_args)

        # compute the appropriately scaled transition matrices
        eq_distns = args.model.get_distns(*mixture_args)
        Ps = markovutil.get_branch_mix(
                prior_probs, pre_Qs, eq_distns, blen)

        # given parameter guesses, compute posterior expectations
        post_probs, post_em_distns = get_posterior_expectations(
                subs_counts, Ps, prior_probs, eq_distns)

        # given posterior expectations, optimize the parameter guesses;
        # slot 0 holds the log branch length, the rest the encoded params
        encoded_theta = np.empty(len(theta) + 1, dtype=float)
        encoded_theta[0] = np.log(blen)
        encoded_theta[1:] = args.model.natural_to_encoded(theta)

        # construct the neg log likelihood non-free params
        neg_ll_args = (
                args.model,
                post_probs, post_em_distns,
                subs_counts,
                ) + design_args

        # define the objective function and the gradient and hessian
        f_encoded_theta = functools.partial(
                get_two_taxon_neg_ll_encoded_theta, *neg_ll_args)
        g_encoded_theta = functools.partial(eval_grad, f_encoded_theta)
        h_encoded_theta = functools.partial(eval_hess, f_encoded_theta)

        # do the search, using information about the gradient and hessian
        results = scipy.optimize.minimize(
                f_encoded_theta,
                encoded_theta,
                method=args.minimization_method,
                jac=g_encoded_theta,
                hess=h_encoded_theta,
                )

        # extract and decode the maximum likelihood estimates
        encoded_xopt = results.x
        mle_blen = np.exp(encoded_xopt[0])
        model_xopt = args.model.encoded_to_natural(encoded_xopt[1:])

        # report a summary of the maximum likelihood search;
        # print('') emits a blank line on both Python 2 and Python 3
        print('raw results from the minimization:')
        print(results)
        print('')
        print('max likelihood branch length (expected number of substitutions):')
        print(mle_blen)
        print('')
        print('max likelihood estimates of other model parameters:')
        print(model_xopt)
        print('')
        print('posterior mixture probabilities:')
        print(post_probs)
        print('')

        # get ready for the next iteration if we continue
        # NOTE(review): theta is not updated from model_xopt, so every
        # iteration reuses the initial natural guess for the model
        # parameters -- confirm that this is intended.
        blen = mle_blen
        prior_probs = post_probs
        prior_em_distns = post_em_distns
def main(args):
    """Profile the constrained min neg log likelihood over a grid of Kimura D.

    The genetic code is read from ``args.code`` and the codon substitution
    counts from ``args.count_matrix``; the trailing stop codons are dropped
    from both.  For every value in
    ``np.linspace(args.linspace_start, args.linspace_stop, args.linspace_num)``
    a ``FMutSelG_F_partial`` model is built and its constrained minimum
    negative log likelihood, the slope and the full optimum are computed
    with ``args.minimization_method``.  The results are written to
    ``args.table_out`` as a tab-separated table.

    Raises:
        ValueError: if the index column of the genetic code is not
            0, 1, 2, ... in order.
        Exception: if the genetic code does not have 2, 3 or 4 stop codons
            or if they are not the last rows.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter="\t"))
    indices, aminos, codons = zip(*arr)
    # list(range(...)) keeps this comparison meaningful on Python 3,
    # where a list never compares equal to a range object
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError("expected the genetic code rows to be indexed 0, 1, 2, ...")

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count("stop")
    if nstop not in (2, 3, 4):
        raise Exception("expected 2 or 3 or 4 stop codons")
    if any(x == "stop" for x in aminos[:-nstop]):
        raise Exception("expected stop codons at the end of the genetic code")

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    # and trim the stop codons, which occupy the last rows and columns
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)
    subs_counts = subs_counts[:-nstop, :-nstop]

    # do the constrained log likelihood maximizations,
    # collecting one table row per grid value
    space = np.linspace(args.linspace_start, args.linspace_stop, num=args.linspace_num)
    table_rows = []
    for position, kimura_d in enumerate(space, 1):

        # define the model
        model = FMutSelG_F_partial(kimura_d)

        # compute the constrained min negative log likelihood
        min_ll, min_ll_slope, xopt_full = get_min_neg_ll_and_slope_and_junk(
            model, subs_counts, ts, tv, syn, nonsyn, compo, asym_compo, args.minimization_method
        )

        # each row has
        # position, kimura_d, min_ll, min_ll_slope, then the full optimum
        table_rows.append([position, kimura_d, min_ll, min_ll_slope] + list(xopt_full))

    # write the R table
    with open(args.table_out, "w") as fout:

        # write the R header; it does not name the leading position column
        header_row = ["Kimura.D", "min.neg.ll", "min.neg.ll.slope", "branch.length"] + list(
            FMutSelG_F.get_names()
        )
        fout.write("\t".join(header_row) + "\n")

        # write each row of the R table
        for row in table_rows:
            fout.write("\t".join(str(x) for x in row) + "\n")