import csv
import functools

import numpy as np
import scipy.optimize
import algopy

# project-local modules for design matrices and Markov chain utilities
import design
import markovutil


def get_two_taxon_neg_ll(
        model,
        em_probs, em_distns,
        subs_counts,
        ts, tv, syn, nonsyn, compo, asym_compo,
        natural_theta,
        ):
    """
    Get the negative log likelihood.
    This function uses the natural parameters, not their logarithms.
    It is mostly for computing the hessian;
    otherwise the log-transformed version would probably be better behaved.
    The first param group is the model implementation.
    The second param group is expectation-maximization stuff.
    The third param group is the data.
    The next param group consists of design matrices related to the genetic code.
    The last param group consists of the free parameters of the model.
    """
    # unpack some parameters
    branch_length = natural_theta[0]
    natural_model_theta = natural_theta[1:]

    # compute the appropriately scaled transition matrices
    pre_Qs = model.get_pre_Qs(
            em_probs, em_distns,
            ts, tv, syn, nonsyn, compo, asym_compo,
            natural_model_theta)
    eq_distns = model.get_distns(
            em_probs, em_distns,
            ts, tv, syn, nonsyn, compo, asym_compo,
            natural_model_theta)
    Ps = markovutil.get_branch_mix(em_probs, pre_Qs, eq_distns, branch_length)

    # compute the mixture of joint probability matrices
    P_mix = algopy.zeros_like(Ps[0])
    P_mix += em_probs[0] * (Ps[0].T * eq_distns[0]).T
    P_mix += em_probs[1] * (Ps[1].T * eq_distns[1]).T

    # compute the neg log likelihood
    neg_ll = -algopy.sum(algopy.log(P_mix) * subs_counts)
    print neg_ll
    return neg_ll
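
# Illustrative note on the row scaling used above: (P.T * pi).T equals
# dot(diag(pi), P), the matrix of joint probabilities of the two endpoint
# states when the ancestral state is drawn from pi. The toy numbers in
# this check are made up for illustration and are not model output.
def _check_joint_prob_scaling():
    pi = np.array([0.3, 0.7])
    P = np.array([
        [0.9, 0.1],
        [0.2, 0.8]])
    assert np.allclose((P.T * pi).T, np.dot(np.diag(pi), P))
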
def main(args):

    # read the description of the genetic code
    with open(args.code) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
        indices, aminos, codons = zip(*arr)
        if [int(x) for x in indices] != range(len(indices)):
            raise ValueError('expected indices 0, 1, 2, ... in the first column')
        aminos = [x.lower() for x in aminos]
        nstop = aminos.count('stop')
        if nstop not in (2, 3, 4):
            raise Exception('expected 2 or 3 or 4 stop codons')
        if any(x == 'stop' for x in aminos[:-nstop]):
            raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # precompute some numpy ndarrays using the genetic code
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    compo = design.get_compo(codons)
    nt_sinks = design.get_nt_sinks(codons)
    asym_compo = np.transpose(nt_sinks, (1, 2, 0))

    # read the (nstates, nstates) array of observed codon substitutions
    subs_counts = np.loadtxt(args.count_matrix, dtype=float)

    # trim the stop codons
    subs_counts = subs_counts[:-nstop, :-nstop]

    # compute some summaries of the observed codon substitutions
    counts = np.sum(subs_counts, axis=0) + np.sum(subs_counts, axis=1)
    log_counts = np.log(counts)
    empirical_codon_distn = counts / float(np.sum(counts))

    # make crude guesses about parameter values
    blen = markovutil.guess_branch_length(subs_counts)
    theta = args.model.get_natural_guess()

    # get the initial guesses for the EM parameters
    prior_probs = np.array([0.5, 0.5], dtype=float)
    prior_em_distns = np.vstack((
        empirical_codon_distn,
        empirical_codon_distn,
        ))

    # iteratively compute parameter estimates
    for em_iteration_index in range(10):

        # given parameter guesses, compute the pre-rate matrices
        pre_Qs = args.model.get_pre_Qs(
                prior_probs, prior_em_distns,
                ts, tv, syn, nonsyn, compo, asym_compo,
                theta)

        # compute the appropriately scaled transition matrices
        eq_distns = args.model.get_distns(
                prior_probs, prior_em_distns,
                ts, tv, syn, nonsyn, compo, asym_compo,
                theta)
        Ps = markovutil.get_branch_mix(
                prior_probs, pre_Qs, eq_distns, blen)

        # given parameter guesses, compute posterior expectations
        post_probs, post_em_distns = get_posterior_expectations(
                subs_counts, Ps, prior_probs, eq_distns)

        # given posterior expectations, optimize the parameter guesses
        encoded_theta = np.empty(len(theta) + 1, dtype=float)
        encoded_theta[0] = np.log(blen)
        encoded_theta[1:] = args.model.natural_to_encoded(theta)

        # collect the non-free params of the neg log likelihood
        neg_ll_args = (
                args.model,
                post_probs, post_em_distns,
                subs_counts,
                ts, tv, syn, nonsyn, compo, asym_compo,
                )

        # define the objective function and the gradient and hessian
        # (eval_grad and eval_hess are sketched after this function)
        f_encoded_theta = functools.partial(
                get_two_taxon_neg_ll_encoded_theta, *neg_ll_args)
        g_encoded_theta = functools.partial(eval_grad, f_encoded_theta)
        h_encoded_theta = functools.partial(eval_hess, f_encoded_theta)

        # do the search, using information about the gradient and hessian
        results = scipy.optimize.minimize(
                f_encoded_theta,
                encoded_theta,
                method=args.minimization_method,
                jac=g_encoded_theta,
                hess=h_encoded_theta,
                )

        # extract and decode the maximum likelihood estimates
        encoded_xopt = results.x
        mle_log_blen = encoded_xopt[0]
        mle_blen = np.exp(mle_log_blen)
        model_encoded_xopt = encoded_xopt[1:]
        model_xopt = args.model.encoded_to_natural(model_encoded_xopt)
        xopt = np.empty_like(encoded_xopt)
        xopt[0] = mle_blen
        xopt[1:] = model_xopt

        # report a summary of the maximum likelihood search
        print 'raw results from the minimization:'
        print results
        print
        print 'max likelihood branch length (expected number of substitutions):'
        print mle_blen
        print
        print 'max likelihood estimates of other model parameters:'
        print model_xopt
        print
        print 'posterior mixture probabilities:'
        print post_probs
        print

        # get ready for the next iteration if we continue
        blen = mle_blen
        #theta = model_xopt
        prior_probs = post_probs
        prior_em_distns = post_em_distns
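
# The helpers eval_grad and eval_hess used in main are assumed to follow
# the usual algopy forward-mode pattern; the sketch below shows one
# plausible definition under that assumption, not necessarily the exact
# definitions used elsewhere in this codebase.
def eval_grad(f, theta):
    # evaluate the gradient of f at theta using forward-mode AD
    theta = algopy.UTPM.init_jacobian(theta)
    return algopy.UTPM.extract_jacobian(f(theta))

def eval_hess(f, theta):
    # evaluate the hessian of f at theta using forward-mode AD
    theta = algopy.UTPM.init_hessian(theta)
    return algopy.UTPM.extract_hessian(len(theta), f(theta))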