def ref_lh():
    """
    Reference likelihood - LH values for all possible variants of the
    internal node sequences.

    Builds a five-record FASTA alignment: leaves A, B and D take their
    sequences from the enclosing scope (A_seq, B_seq, D_seq), while the
    nodes C and E get fixed 16-character sequences. The alignment is handed
    to a TreeAnc built on ``tiny_tree`` with ``mygtr``.

    Returns whatever ``TreeAnc.ancestral_likelihood()`` returns for that
    setup.
    """
    fasta_text = (
        ">A\n" + A_seq + "\n"
        ">B\n" + B_seq + "\n"
        ">D\n" + D_seq + "\n"
        ">C\nAAAACCCCGGGGTTTT\n"
        ">E\nACGTACGTACGTACGT\n"
    )
    alignment = AlignIO.read(StringIO(fasta_text), 'fasta')
    anc_tree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=alignment, verbose=4)
    return anc_tree.ancestral_likelihood()
def ref_lh():
    """
    Reference likelihood - LH values for all possible variants of the
    internal node sequences.

    The alignment is assembled in FASTA format from the sequences A_seq,
    B_seq and D_seq of the enclosing scope plus two fixed sequences for the
    nodes C and E, and is attached to a TreeAnc instance together with
    ``tiny_tree`` and ``mygtr``.

    Returns the result of ``ancestral_likelihood()`` on that instance.
    """
    # One entry per FASTA record; the order (A, B, D, C, E) is kept as is.
    records = [
        ">A\n" + A_seq + "\n",
        ">B\n" + B_seq + "\n",
        ">D\n" + D_seq + "\n",
        ">C\nAAAACCCCGGGGTTTT\n",
        ">E\nACGTACGTACGTACGT\n",
    ]
    handle = StringIO("".join(records))
    tiny_aln = AlignIO.read(handle, 'fasta')

    reference_tree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln,
                             verbose=4)
    return reference_tree.ancestral_likelihood()
def test_seq_joint_reconstruction_correct():
    """
    Check the joint ancestral reconstruction against simulated sequences.

    A random root sequence is evolved along a small fixed tree, the leaf
    sequences are given to TreeAnc as the alignment and the sequences of the
    internal nodes are reconstructed (joint). Two properties are asserted:

    * the fraction of sites, summed over all non-root nodes, at which the
      inferred sequence differs from the simulated one is below
      ``2 * (mut_count / seq_len) ** 2``, where ``mut_count`` is the number
      of simulated mutations;
    * the likelihood returned by ``ancestral_likelihood()`` agrees (to 1e-9
      in the summed difference) with the ``sequence_LH`` stored on the tree
      by the joint reconstruction.

    Returns the TreeAnc instance so that the result can be inspected.
    """
    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(
        StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    # NOTE(review): pi does not sum to 1 -- presumably GTR.custom normalizes
    # it; confirm against the GTR implementation.
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4, 4)))
    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi,
                                         size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        # children need the link to their parent before they are visited
        for c in node.clades:
            c.up = node
        # only the root carries a ref_seq at this point; it is not evolved
        if hasattr(node, 'ref_seq'):
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize profile so that every row can be used as probabilities
        p = (p.T / p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc, der) in enumerate(izip(node.up.ref_seq,
                                                  node.ref_seq))
            if anc != der
        ]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(
        ">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
        for leaf in tree.get_terminals())
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        n_diff = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count mismatching sites (not the nodes without mismatches)
            diff_count += n_diff
            if n_diff:
                print("%s: True sequence does not equal inferred sequence. parent %s"
                      % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s"
                      % (node.name, node.up.name))
        print(node.name, n_diff, np.where(node.sequence != node.ref_seq),
              len(node.mutations), node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence
    # some differences are expected. The 1.0 factors force float division
    # on python2 as well.
    assert 1.0 * diff_count / seq_len < 2 * (1.0 * mut_count / seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)

    print("Difference between reference and inferred LH:", (LH - LH_p).sum())
    # two-sided check: a large negative difference is a failure as well
    assert abs((LH - LH_p).sum()) < 1e-9

    return myTree
def test_seq_joint_reconstruction_correct():
    """
    Check the joint ancestral reconstruction against simulated sequences.

    A random root sequence is evolved along a small fixed tree, the leaf
    sequences are given to TreeAnc as the alignment and the sequences of the
    internal nodes are reconstructed (joint). Two properties are asserted:

    * the fraction of sites, summed over all non-root nodes, at which the
      inferred sequence differs from the simulated one is below
      ``2 * (mut_count / seq_len) ** 2``, where ``mut_count`` is the number
      of simulated mutations;
    * the likelihood returned by ``ancestral_likelihood()`` agrees (to 1e-9
      in the summed difference) with the ``sequence_LH`` stored on the tree
      by the joint reconstruction.

    Returns the TreeAnc instance so that the result can be inspected.
    """
    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    try:
        from StringIO import StringIO
    except ImportError:  # python3.x
        from io import StringIO
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(
        StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    # NOTE(review): pi does not sum to 1 -- presumably GTR.custom normalizes
    # it; confirm against the GTR implementation.
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4, 4)))
    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi,
                                         size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        # children need the link to their parent before they are visited
        for c in node.clades:
            c.up = node
        # only the root carries a ref_seq at this point; it is not evolved
        if hasattr(node, 'ref_seq'):
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize profile so that every row can be used as probabilities
        p = (p.T / p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc, der) in enumerate(izip(node.up.ref_seq,
                                                  node.ref_seq))
            if anc != der
        ]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(
        ">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
        for leaf in tree.get_terminals())
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        n_diff = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count mismatching sites (not the nodes without mismatches)
            diff_count += n_diff
            if n_diff:
                print(
                    "%s: True sequence does not equal inferred sequence. parent %s"
                    % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s"
                      % (node.name, node.up.name))
        print(node.name, n_diff, np.where(node.sequence != node.ref_seq),
              len(node.mutations), node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence
    # some differences are expected. The 1.0 factors force float division
    # on python2 as well.
    assert 1.0 * diff_count / seq_len < 2 * (1.0 * mut_count / seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)

    print("Difference between reference and inferred LH:", (LH - LH_p).sum())
    # two-sided check: a large negative difference is a failure as well
    assert abs((LH - LH_p).sum()) < 1e-9

    return myTree