def main(msa_filename, tree_filename, single_model_filename=os.path.join(os.environ['LCODE'],'data/single_model'), \ paired_model_filename=os.path.join(os.environ['LCODE'],'data/pair_model')): from MSA import MSA from EvoModel import SingleModel, PairedModel from Tree import * msa = MSA(msa_filename) single_model = SingleModel(single_model_filename) paired_model = PairedModel(paired_model_filename, single_model) # --------------- using newick --------------------- # acc = list(msa.ids) # post_order_traversal(t, acc) # order = acc[msa.nseq:] # -------------- using dendropy ------------------- t2 = dendropy.Tree.get_from_path(tree_filename, 'newick') msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()]) t = t2 order = postorder_assign_then_traverse(t, list(msa.ids)) single_cols = xrange(msa.aln_len) paired_cols = msa.BP.items() paired_cols.sort() n = msa.nseq S = init_likelihood(msa, single_cols, single_model) g = MyMat.calc_likelihood # NOTE: NO LONGER logs the single model Frequency! # first calculate the null model (joint indep prob at each position) # TODO: this is not the fastest code ever....but will do for now L_null = [sum(sum(exp(S[:msa.nseq, col, :4]) * log(single_model.Frequency))) for col in single_cols] # convert S into 1d nnode, ncol, nbase = S.shape S = scipy.ascontiguousarray(S.reshape(S.size)) P = init_likelihood_paired(msa, paired_cols, paired_model, nnode) nnode_p, ncol_p, nbase_p = P.shape P = scipy.ascontiguousarray(P.reshape(P.size)) like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model) # need to use this to set up S, P for rearr return like_s_n_p
treat_gap_as_missing = options.treat_gap_as_missing assert 0. < options.trim_gap_threshold <= 1. assert 1 <= options.cpu msa = MSA(msa_filename, options.ignore_bp) msa.trim_gaps(removeAmbs=True, threshold=options.trim_gap_threshold) single_model = SingleModel(single_model_filename) paired_model = PairedModel(paired_model_filename, single_model) # -------------- using dendropy ------------------- t = dendropy.Tree.get_from_path(tree_filename, 'newick') # have to call remove_seqs_not_in_tree becuz sometimes I # will manually trim leaves from the tree msa.remove_seqs_not_in_tree([x.taxon.label for x in t.leaf_nodes()]) # edge lengths of 0 will cause calculation problems... # TODO: better way to handle this? for n in t.nodes(): if n.edge_length <= 0: n.edge_length = 1e-3 print >> sys.stderr, "Node {0} has an edge length of 0. Manually padded to 0.001. Remove this node in the future to avoid this".format(n) with open(options.log_filename, 'w') as f: o = TreeLikelihood.TreeLikelihood(msa, t, single_model, paired_model, treat_gap_as_missing) # this must be called to initialize the S, P arrays # TODO: maybe incorporate it in _init__? o.calc_likelihood() f.write("Before full tree optimization: {0}\n".format(o.like)) try: o.optimize_branch()