Example #1
0
def main(msa_filename, tree_filename, single_model_filename=os.path.join(os.environ['LCODE'],'data/single_model'), \
		paired_model_filename=os.path.join(os.environ['LCODE'],'data/pair_model')):
	from MSA import MSA
	from EvoModel import SingleModel, PairedModel
	from Tree import *

	msa = MSA(msa_filename)

	single_model = SingleModel(single_model_filename)
	paired_model = PairedModel(paired_model_filename, single_model)

	# --------------- using newick ---------------------
#	acc = list(msa.ids)
#	post_order_traversal(t, acc)
#	order = acc[msa.nseq:]
	# -------------- using dendropy -------------------
	t2 = dendropy.Tree.get_from_path(tree_filename, 'newick')
	msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()])
	t = t2
	order = postorder_assign_then_traverse(t, list(msa.ids))
	
	single_cols = xrange(msa.aln_len)
	paired_cols = msa.BP.items()
	paired_cols.sort()
	n = msa.nseq

	S = init_likelihood(msa, single_cols, single_model)

	g = MyMat.calc_likelihood
	# NOTE: NO LONGER logs the single model Frequency!
	# first calculate the null model (joint indep prob at each position)
	# TODO: this is not the fastest code ever....but will do for now
	L_null = [sum(sum(exp(S[:msa.nseq, col, :4]) * log(single_model.Frequency))) for col in single_cols]

	# convert S into 1d
	nnode, ncol, nbase = S.shape
	S = scipy.ascontiguousarray(S.reshape(S.size))

	P = init_likelihood_paired(msa, paired_cols, paired_model, nnode)
	nnode_p, ncol_p, nbase_p = P.shape
	P = scipy.ascontiguousarray(P.reshape(P.size))

	like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model) # need to use this to set up S, P for rearr
	return like_s_n_p
Example #2
0
	treat_gap_as_missing  = options.treat_gap_as_missing

	assert 0. < options.trim_gap_threshold <= 1.
	assert 1 <= options.cpu

	msa = MSA(msa_filename, options.ignore_bp)
	msa.trim_gaps(removeAmbs=True, threshold=options.trim_gap_threshold) 

	single_model = SingleModel(single_model_filename)
	paired_model = PairedModel(paired_model_filename, single_model)

	# -------------- using dendropy -------------------
	t = dendropy.Tree.get_from_path(tree_filename, 'newick')
	# have to call remove_seqs_not_in_tree because sometimes I
	# will manually trim leaves from the tree
	msa.remove_seqs_not_in_tree([x.taxon.label for x in t.leaf_nodes()])
	# edge lengths of 0 will cause calculation problems...
	# TODO: better way to handle this?
	for n in t.nodes():
		if n.edge_length <= 0:
			n.edge_length = 1e-3
			print >> sys.stderr, "Node {0} has an edge length of 0. Manually padded to 0.001. Remove this node in the future to avoid this".format(n)

	with open(options.log_filename, 'w') as f:
		o = TreeLikelihood.TreeLikelihood(msa, t, single_model, paired_model, treat_gap_as_missing)
		# this must be called to initialize the S, P arrays
		# TODO: maybe incorporate it in __init__?
		o.calc_likelihood()
		f.write("Before full tree optimization: {0}\n".format(o.like))
		try:
			o.optimize_branch()