def main(args):
    """Fit a codon model to two-taxon substitution counts by max likelihood.

    Reads the genetic code from args.code and the square matrix of observed
    codon substitution counts from args.count_matrix, trims the stop codons
    from both, and minimizes the negative log likelihood over the log branch
    length and the encoded parameters of args.model using
    args.minimization_method.  The estimates, the hessian of the negative
    log likelihood at the estimates, its inverse, and the implied standard
    errors are printed to stdout.

    If args.neg_log_likelihood_out, args.parameter_estimates_out or
    args.parameter_errors_out is set, the corresponding values are also
    written to that path, one value per line.

    Raises ValueError if the genetic code indices are not 0, 1, 2, ...
    and Exception if the stop codons are not the last 2, 3 or 4 entries.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)

    # Compare against a real list so that the check also works
    # when range() returns a lazy sequence instead of a list.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code indices to count up from zero')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)

    # trim the stop codons
    subs_counts = subs_counts[:-nstop, :-nstop]

    # Compute some summaries of the observed codon substitutions.
    # Each codon is counted once per row and once per column it appears in.
    counts = np.sum(subs_counts, axis=0) + np.sum(subs_counts, axis=1)
    log_counts = np.log(counts)
    empirical_codon_distn = counts / float(np.sum(counts))

    # make a crude guess of the expected number of changes
    log_blen = np.log(guess_branch_length(subs_counts))

    # Use the chosen model to construct an initial guess for max likelihood.
    # Entry 0 is the log branch length and the rest are the model parameters.
    model_natural_guess = args.model.get_natural_guess()
    model_nparams = len(model_natural_guess)
    encoded_guess = np.empty(model_nparams + 1, dtype=float)
    encoded_guess[0] = log_blen
    encoded_guess[1:] = args.model.natural_to_encoded(model_natural_guess)

    # construct the neg log likelihood non-free params
    neg_ll_args = (
            args.model,
            subs_counts,
            log_counts, empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            )

    # define the objective function and the gradient and hessian
    f_encoded_theta = functools.partial(
            get_two_taxon_neg_ll_encoded_theta, *neg_ll_args)
    g_encoded_theta = functools.partial(eval_grad, f_encoded_theta)
    h_encoded_theta = functools.partial(eval_hess, f_encoded_theta)

    # do the search, using information about the gradient and hessian
    results = scipy.optimize.minimize(
            f_encoded_theta,
            encoded_guess,
            method=args.minimization_method,
            jac=g_encoded_theta,
            hess=h_encoded_theta,
            )

    # extract and decode the maximum likelihood estimates
    encoded_xopt = results.x
    mle_log_blen = encoded_xopt[0]
    mle_blen = np.exp(mle_log_blen)
    model_encoded_xopt = encoded_xopt[1:]
    model_xopt = args.model.encoded_to_natural(model_encoded_xopt)
    xopt = np.empty_like(encoded_xopt)
    xopt[0] = mle_blen
    xopt[1:] = model_xopt

    # check that the stationary distribution is ok
    mle_distn = args.model.get_distn(
            log_counts, empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            model_xopt,
            )
    mle_pre_Q = args.model.get_pre_Q(
            log_counts, empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            model_xopt,
            )
    stationary_distn_check_helper(mle_pre_Q, mle_distn, mle_blen)

    # Define the function for computing the hessian.
    # This one is evaluated at the decoded estimates (xopt),
    # unlike the encoded objective that was used for the search.
    f = functools.partial(get_two_taxon_neg_ll, *neg_ll_args)
    h = functools.partial(eval_hess, f)

    # report a summary of the maximum likelihood search
    print('raw results from the minimization:')
    print(results)
    print('')
    print('max likelihood branch length (expected number of substitutions):')
    print(mle_blen)
    print('')
    print('max likelihood estimates of other model parameters:')
    print(model_xopt)
    print('')

    # print the hessian matrix at the max likelihood parameter values
    fisher_info = h(xopt)
    cov = scipy.linalg.inv(fisher_info)
    errors = np.sqrt(np.diag(cov))
    print('observed fisher information matrix:')
    print(fisher_info)
    print('')
    print('inverse of fisher information matrix:')
    print(cov)
    print('')
    print('standard error estimates (sqrt of diag of inv of fisher info)')
    print(errors)
    print('')

    # write the neg log likelihood into a separate file
    if args.neg_log_likelihood_out:
        with open(args.neg_log_likelihood_out, 'w') as fout:
            fout.write(str(results.fun) + '\n')

    # write the parameter estimates into a separate file
    if args.parameter_estimates_out:
        with open(args.parameter_estimates_out, 'w') as fout:
            for value in xopt:
                fout.write(str(value) + '\n')

    # write the parameter standard errors into a separate file
    if args.parameter_errors_out:
        with open(args.parameter_errors_out, 'w') as fout:
            for value in errors:
                fout.write(str(value) + '\n')
def main(args):
    """Simulate codon substitution counts and compare two inference models.

    Reads the genetic code from args.code, the empirical codon counts from
    args.empirical_codon_counts and the simulation parameter values from
    args.simulation_parameter_values.  The model named by
    args.simulation_model is used to build a joint distribution over codon
    substitutions, from which args.nsamples count matrices are drawn.
    For every sampled matrix the minimum negative log likelihood is computed
    under args.first_inference_model and args.second_inference_model using
    args.minimization_method, and the negated values are written as one row
    of an R table to args.table_out.

    Raises ValueError if the genetic code indices are not 0, 1, 2, ...
    and Exception if the stop codons are not the last 2, 3 or 4 entries
    or if the empirical counts do not line up with the non-stop codons.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)

    # Compare against a real list so that the check also works
    # when range() returns a lazy sequence instead of a list.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code indices to count up from zero')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]
    ncodons = len(codons)

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the empirical counts and detect and trim stop codon counts
    empirical_codon_counts = np.loadtxt(args.empirical_codon_counts)
    if len(empirical_codon_counts) < ncodons:
        raise Exception(
                'expected an empirical count for each non-stop codon')
    if any(empirical_codon_counts[ncodons:]):
        raise Exception(
                'expected the counts after the non-stop codons to be zero')
    empirical_codon_counts = empirical_codon_counts[:ncodons]
    log_empirical_codon_counts = np.log(empirical_codon_counts)
    empirical_codon_distn = empirical_codon_counts / float(
            np.sum(empirical_codon_counts))

    # Precompute the distribution over codon substitutions
    # using the stationary distribution and transition matrix
    # of the simulation model given the simulation parameters.
    # The first parameter value is the branch length
    # and the remaining ones are handed to the simulation model.
    sim_theta = np.loadtxt(args.simulation_parameter_values)
    log_sim_theta = np.log(sim_theta)
    log_sim_blen = log_sim_theta[0]
    log_sim_theta_model = log_sim_theta[1:]
    sim_model = g_model_name_to_class[args.simulation_model]
    pre_Q = sim_model.get_pre_Q(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )
    stationary_distn = sim_model.get_distn(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )

    # get the rate matrix
    Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, np.exp(log_sim_blen))

    # get the conditional transition matrix
    P = scipy.linalg.expm(Q)

    # Get the joint substitution probability matrix,
    # which scales row i of P by entry i of the stationary distribution.
    J = (P.T * stationary_distn).T

    # use an appropriate sample size
    if args.sample_size:
        sample_size = args.sample_size
    else:
        # If the sample size is unspecified,
        # use a sample size whose number of codon counts
        # matches the sum of the user-provided empirical codon counts.
        # The sum is halved here; presumably because each sampled
        # substitution involves two codons -- confirm.
        sample_size = int(np.sum(empirical_codon_counts) / 2)

    inference_model_names = (
            args.first_inference_model,
            args.second_inference_model,
            )

    with open(args.table_out, 'w') as fout:

        # Write the header of the R table file.
        # The header names only the two likelihood columns,
        # while every data row additionally begins with its 1-based index.
        header_row = tuple(
                name.replace('-', '.') for name in inference_model_names)
        fout.write('\t'.join(header_row) + '\n')

        for sample_index in range(args.nsamples):

            # sample an (ncodons, ncodons) ndarray of counts
            # using the precomputed multinomial distribution
            subs_counts = np.random.multinomial(
                    sample_size,
                    J.flat,
                    ).reshape(J.shape).astype(float)

            # compute the neg log likelihoods for the two models
            neg_log_likelihoods = []
            for model_name in inference_model_names:
                model = g_model_name_to_class[model_name]
                min_neg_ll = get_min_neg_log_likelihood(
                        model,
                        subs_counts,
                        ts, tv, syn, nonsyn, compo, asym_compo,
                        args.minimization_method,
                        )
                neg_log_likelihoods.append(min_neg_ll)

            # write the data row, reporting log likelihoods (not negated)
            data_row = (
                    sample_index+1,
                    -neg_log_likelihoods[0],
                    -neg_log_likelihoods[1],
                    )
            fout.write('\t'.join(str(x) for x in data_row) + '\n')
def main(args):
    """Iteratively estimate branch length and mixture expectations.

    Reads the genetic code from args.code and the square matrix of observed
    codon substitution counts from args.count_matrix, trims the stop codons
    from both, and then runs a fixed number of rounds.  Each round computes
    posterior expectations (mixture probabilities and per-component
    distributions) from the current guesses, minimizes the negative log
    likelihood given those expectations using args.minimization_method,
    and prints the results of the round to stdout.

    Raises ValueError if the genetic code indices are not 0, 1, 2, ...
    and Exception if the stop codons are not the last 2, 3 or 4 entries.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)

    # Compare against a real list so that the check also works
    # when range() returns a lazy sequence instead of a list.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code indices to count up from zero')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)

    # trim the stop codons
    subs_counts = subs_counts[:-nstop, :-nstop]

    # Compute the empirical codon distribution.
    # Each codon is counted once per row and once per column it appears in.
    counts = np.sum(subs_counts, axis=0) + np.sum(subs_counts, axis=1)
    empirical_codon_distn = counts / float(np.sum(counts))

    # make crude guesses about parameter values
    blen = markovutil.guess_branch_length(subs_counts)
    theta = args.model.get_natural_guess()

    # Get the initial guesses for the EM parameters.
    # Both components start equally likely
    # and both start at the empirical codon distribution.
    prior_probs = np.array([0.5, 0.5], dtype=float)
    prior_em_distns = np.vstack((
        empirical_codon_distn,
        empirical_codon_distn,
        ))

    # iteratively compute parameter estimates
    n_em_iterations = 10
    for _ in range(n_em_iterations):

        # given parameter guesses, compute the pre-rate matrices
        pre_Qs = args.model.get_pre_Qs(
                prior_probs, prior_em_distns,
                ts, tv, syn, nonsyn, compo, asym_compo,
                theta)

        # compute the appropriately scaled transition matrices
        eq_distns = args.model.get_distns(
                prior_probs, prior_em_distns,
                ts, tv, syn, nonsyn, compo, asym_compo,
                theta)
        Ps = markovutil.get_branch_mix(
                prior_probs, pre_Qs, eq_distns, blen)

        # given parameter guesses, compute posterior expectations
        post_probs, post_em_distns = get_posterior_expectations(
                subs_counts, Ps, prior_probs, eq_distns)

        # Given posterior expectations, optimize the parameter guesses.
        # Entry 0 is the log branch length and the rest are the
        # encoded model parameters.
        encoded_theta = np.empty(len(theta) + 1, dtype=float)
        encoded_theta[0] = np.log(blen)
        encoded_theta[1:] = args.model.natural_to_encoded(theta)

        # construct the neg log likelihood non-free params
        neg_ll_args = (
                args.model,
                post_probs, post_em_distns,
                subs_counts,
                ts, tv, syn, nonsyn, compo, asym_compo,
                )

        # define the objective function and the gradient and hessian
        f_encoded_theta = functools.partial(
                get_two_taxon_neg_ll_encoded_theta, *neg_ll_args)
        g_encoded_theta = functools.partial(eval_grad, f_encoded_theta)
        h_encoded_theta = functools.partial(eval_hess, f_encoded_theta)

        # do the search, using information about the gradient and hessian
        results = scipy.optimize.minimize(
                f_encoded_theta,
                encoded_theta,
                method=args.minimization_method,
                jac=g_encoded_theta,
                hess=h_encoded_theta,
                )

        # extract and decode the maximum likelihood estimates
        encoded_xopt = results.x
        mle_log_blen = encoded_xopt[0]
        mle_blen = np.exp(mle_log_blen)
        model_encoded_xopt = encoded_xopt[1:]
        model_xopt = args.model.encoded_to_natural(model_encoded_xopt)

        # report a summary of the maximum likelihood search
        print('raw results from the minimization:')
        print(results)
        print('')
        print('max likelihood branch length '
                '(expected number of substitutions):')
        print(mle_blen)
        print('')
        print('max likelihood estimates of other model parameters:')
        print(model_xopt)
        print('')
        print('posterior mixture probabilities:')
        print(post_probs)
        print('')

        # Get ready for the next iteration if we continue.
        # NOTE(review): theta is not carried over between iterations;
        # the assignment `theta = model_xopt` was already disabled here,
        # so every round restarts from the initial model parameter guess.
        # Confirm whether that is intended before enabling it.
        blen = mle_blen
        prior_probs = post_probs
        prior_em_distns = post_em_distns
def main(args):
    """Profile the min negative log likelihood over a grid of Kimura D values.

    Reads the genetic code from args.code and the square matrix of observed
    codon substitution counts from args.count_matrix, and trims the stop
    codons from both.  For each of the args.linspace_num evenly spaced
    values between args.linspace_start and args.linspace_stop, a
    FMutSelG_F_partial model is built with that value and the constrained
    minimum negative log likelihood, its slope, and the full parameter
    vector at the optimum are computed using args.minimization_method.
    The results are written as an R table to args.table_out.

    Raises ValueError if the genetic code indices are not 0, 1, 2, ...
    and Exception if the stop codons are not the last 2, 3 or 4 entries.
    """

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter="\t"))
    indices, aminos, codons = zip(*arr)

    # Compare against a real list so that the check also works
    # when range() returns a lazy sequence instead of a list.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
            "expected the genetic code indices to count up from zero")

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count("stop")
    if nstop not in (2, 3, 4):
        raise Exception("expected 2 or 3 or 4 stop codons")
    if any(x == "stop" for x in aminos[:-nstop]):
        raise Exception("expected stop codons at the end of the genetic code")

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)

    # trim the stop codons
    subs_counts = subs_counts[:-nstop, :-nstop]

    # Do the constrained log likelihood maximizations.
    # All of them are finished before the output file is opened,
    # so a failed search does not leave a partial table behind.
    min_lls = []
    min_ll_slopes = []
    junk_list = []
    space = np.linspace(
        args.linspace_start, args.linspace_stop, num=args.linspace_num)
    for kimura_d in space:

        # define the model
        model = FMutSelG_F_partial(kimura_d)

        # compute the constrained min negative log likelihood
        min_ll, min_ll_slope, xopt_full = get_min_neg_ll_and_slope_and_junk(
            model, subs_counts,
            ts, tv, syn, nonsyn, compo, asym_compo,
            args.minimization_method,
        )

        # add the min log likelihood to the list
        min_lls.append(min_ll)
        min_ll_slopes.append(min_ll_slope)
        junk_list.append(xopt_full)

    # write the R table
    with open(args.table_out, "w") as fout:

        # Write the R header.
        # The header has no name for the leading position column,
        # so it is one field shorter than each data row.
        header_row = [
            "Kimura.D",
            "min.neg.ll",
            "min.neg.ll.slope",
            "branch.length",
        ] + FMutSelG_F.get_names()
        fout.write("\t".join(header_row) + "\n")

        # write each row of the R table,
        # where each row has
        # position, kimura_d, min_ll, min_ll_slope,
        # followed by the entries of the full parameter vector
        table = zip(space, min_lls, min_ll_slopes, junk_list)
        for i, (kimura_d, min_ll, min_ll_slope, xopt_full) in enumerate(table):
            row = [i + 1, kimura_d, min_ll, min_ll_slope] + list(xopt_full)
            fout.write("\t".join(str(x) for x in row) + "\n")