def stationary_distn_check_helper(pre_Q, codon_distn, branch_length):
    Q = markovutil.pre_Q_to_Q(pre_Q, codon_distn, branch_length)
    P = scipy.linalg.expm(Q)
    next_distn = np.dot(codon_distn, P)
    if not np.allclose(next_distn, codon_distn):
        raise Exception(next_distn - codon_distn)
    print 'stationary distribution is ok'
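
# A minimal self-contained sketch of the same stationarity check on a toy
# 2-state chain, assuming (as a guess at the convention) that
# markovutil.pre_Q_to_Q zeroes the row sums of the unnormalized rate matrix
# and rescales it so that the expected number of substitutions per unit time
# equals the branch length.  The toy matrix and distribution are made up for
# illustration and are not project data.
def _toy_stationarity_check_sketch():
    distn = np.array([0.25, 0.75])
    pre_Q = np.array([
        [0.0, 0.75],
        [0.25, 0.0],
        ])
    branch_length = 0.1
    # set the diagonal so that rows sum to zero
    Q = pre_Q - np.diag(np.sum(pre_Q, axis=1))
    # rescale so the expected rate equals the branch length
    expected_rate = -np.dot(distn, np.diag(Q))
    Q = Q * (branch_length / expected_rate)
    P = scipy.linalg.expm(Q)
    next_distn = np.dot(distn, P)
    if not np.allclose(next_distn, distn):
        raise Exception(next_distn - distn)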
def get_neg_ll(cls,
        patterns, pattern_weights,
        stationary_distn,
        ts, tv, syn, nonsyn,
        theta,
        ):
    """
    This model has only a single omega parameter.
    @param theta: vector of free variables with sensitivities
    """

    # unpack theta
    log_mus = theta[0:3]
    log_kappa = theta[3]
    log_omega = theta[4]

    # construct the transition matrices
    transition_matrices = []
    for i in range(3):
        mu = algopy.exp(log_mus[i])
        kappa = algopy.exp(log_kappa)
        omega = algopy.exp(log_omega)
        pre_Q = codon1994.get_pre_Q(
                ts, tv, syn, nonsyn,
                stationary_distn,
                kappa, omega)
        Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, mu)
        P = algopy.expm(Q)
        transition_matrices.append(P)

    # return the neg log likelihood
    ov = range(4)
    v_to_children = {3 : [0, 1, 2]}
    de_to_P = {
            (3, 0) : transition_matrices[0],
            (3, 1) : transition_matrices[1],
            (3, 2) : transition_matrices[2],
            }
    root_prior = stationary_distn
    log_likelihood = alignll.fast_fels(
    #log_likelihood = alignll.fels(
            ov, v_to_children, de_to_P, root_prior,
            patterns, pattern_weights,
            )
    neg_ll = -log_likelihood
    print neg_ll
    return neg_ll
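
# A hedged usage sketch for get_neg_ll above, not part of the pipeline.
# The packing order of theta (three log branch-specific rates mu, then
# log kappa, then log omega) is read off the unpacking code; the starting
# values below are arbitrary guesses.  The arguments patterns,
# pattern_weights, stationary_distn, ts, tv, syn, nonsyn are placeholders
# for precomputed arrays that a caller would already have.  A gradient-based
# method could use algopy to differentiate get_neg_ll; this sketch stays
# derivative-free.
def _get_neg_ll_usage_sketch(cls,
        patterns, pattern_weights,
        stationary_distn,
        ts, tv, syn, nonsyn,
        ):
    import functools
    import scipy.optimize
    theta0 = np.log(np.array([
        0.1, 0.1, 0.1,  # initial guesses for the three branch-specific rates
        2.0,            # initial guess for kappa
        0.5,            # initial guess for omega
        ]))
    f = functools.partial(get_neg_ll, cls,
            patterns, pattern_weights, stationary_distn,
            ts, tv, syn, nonsyn)
    result = scipy.optimize.minimize(f, theta0, method='Nelder-Mead')
    return np.exp(result.x)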
def main(args):

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
        indices, aminos, codons = zip(*arr)
        if [int(x) for x in indices] != range(len(indices)):
            raise ValueError('expected sequentially indexed genetic code rows')

    # look for stop codons
    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]
    ncodons = len(codons)

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the empirical counts and detect and trim stop codon counts
    empirical_codon_counts = np.loadtxt(args.empirical_codon_counts)
    if len(empirical_codon_counts) < ncodons:
        raise Exception('expected at least as many counts as non-stop codons')
    if any(empirical_codon_counts[ncodons:]):
        raise Exception('expected zero counts for stop codons')
    empirical_codon_counts = empirical_codon_counts[:ncodons]
    log_empirical_codon_counts = np.log(empirical_codon_counts)
    empirical_codon_distn = empirical_codon_counts / float(
            np.sum(empirical_codon_counts))

    # Precompute the distribution over codon substitutions
    # using the stationary distribution and transition matrix
    # of the simulation model given the simulation parameters.
    sim_theta = np.loadtxt(args.simulation_parameter_values)
    log_sim_theta = np.log(sim_theta)
    log_sim_blen = log_sim_theta[0]
    log_sim_theta_model = log_sim_theta[1:]
    sim_model = g_model_name_to_class[args.simulation_model]
    pre_Q = sim_model.get_pre_Q(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )
    stationary_distn = sim_model.get_distn(
            log_empirical_codon_counts,
            empirical_codon_distn,
            ts, tv, syn, nonsyn, compo, asym_compo,
            log_sim_theta_model,
            )

    # get the rate matrix
    Q = markovutil.pre_Q_to_Q(pre_Q, stationary_distn, np.exp(log_sim_blen))

    # get the conditional transition matrix
    P = scipy.linalg.expm(Q)

    # get the joint substitution probability matrix
    J = (P.T * stationary_distn).T

    # use an appropriate sample size
    if args.sample_size:
        sample_size = args.sample_size
    else:
        # If the sample size is unspecified,
        # use a sample size whose number of codon counts
        # matches the sum of the user-provided empirical codon counts.
        sample_size = int(np.sum(empirical_codon_counts) / 2)

    with open(args.table_out, 'w') as fout:

        # write the header of the R table file
        header_row = (
                args.first_inference_model.replace('-', '.'),
                args.second_inference_model.replace('-', '.'),
                )
        print >> fout, '\t'.join(header_row)

        for sample_index in range(args.nsamples):

            # sample an (ncodons, ncodons) ndarray of counts
            # using the precomputed multinomial distribution
            subs_counts = np.random.multinomial(
                    sample_size,
                    J.flat,
                    ).reshape(J.shape).astype(float)

            # compute the neg log likelihoods for the two models
            neg_log_likelihoods = []
            for model_name in (
                    args.first_inference_model,
                    args.second_inference_model,
                    ):
                model = g_model_name_to_class[model_name]
                min_neg_ll = get_min_neg_log_likelihood(
                        model, subs_counts,
                        ts, tv, syn, nonsyn, compo, asym_compo,
                        args.minimization_method,
                        )
                neg_log_likelihoods.append(min_neg_ll)

            # write the data row
            data_row = (
                    sample_index+1,
                    -neg_log_likelihoods[0],
                    -neg_log_likelihoods[1],
                    )
            print >> fout, '\t'.join(str(x) for x in data_row)
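
# A small self-contained sketch of the resampling step inside main(),
# using made-up 2-state numbers instead of the codon model; only numpy is
# needed.  It shows how the joint substitution probability matrix J is
# built from a stationary distribution and a conditional transition matrix,
# and how a multinomial sample of substitution counts is drawn from it.
def _joint_sampling_sketch():
    distn = np.array([0.25, 0.75])
    P = np.array([
        [0.9, 0.1],
        [0.2, 0.8],
        ])
    # J[i, j] is the joint probability of ancestral state i and
    # descendant state j, so each row of P is weighted by distn
    # and the entries of J sum to one
    J = (P.T * distn).T
    if not np.allclose(np.sum(J), 1.0):
        raise Exception('the joint probabilities do not sum to one')
    sample_size = 1000
    subs_counts = np.random.multinomial(
            sample_size,
            J.flat,
            ).reshape(J.shape).astype(float)
    return subs_counts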