Beispiel #1
0
def compute_lh(tree, verbose=0):
    """
    Propagate the leaf profiles up to the root and store, on the root, the
    log-likelihood of the pattern held in the leaf sequences.

    Parameters
    ----------
    tree : object
        Must expose ``tree`` (the phylogeny, with ``get_terminals``,
        ``get_nonterminals`` and ``root``) and ``gtr`` (the model, with
        ``alphabet``, ``profile_map`` and ``propagate_profile``).
    verbose : int
        A progress message is printed when this is larger than 2.

    Nothing is returned; the result is written onto the nodes:

    * every node gets ``profile`` and ``lh_prefactor``,
    * every child of an internal node gets ``seq_msg_to_parent``,
    * the root gets ``pattern_profile_lh`` (log of its normalized profile
      plus its accumulated log-prefactor, one row per site).
    """

    # branch lengths below this floor are replaced by it before propagation
    shortest_branch = 1e-10
    phylo = tree.tree
    model = tree.gtr
    leaves = phylo.get_terminals()
    n_sites = leaves[0].sequence.shape[0]
    n_states = model.alphabet.shape[0]

    if verbose > 2:
        print(
            "Walking up the tree, computing likelihoods for the pattern in the leaves..."
        )

    # leaves: the profile is derived directly from the sequence, and no
    # normalization has been factored out yet
    for tip in leaves:
        tip.profile = seq_utils.seq2prof(tip.sequence, model.profile_map)
        tip.lh_prefactor = np.zeros(n_sites)

    # internal nodes, children before parents (leaves -> root)
    for parent in phylo.get_nonterminals(order='postorder'):
        # start from ones because the child messages are multiplied in
        combined = np.ones((n_sites, n_states))
        log_scale = np.zeros(n_sites)
        for child in parent.clades:
            # linear-scale (return_log=False) message from child to parent
            message = model.propagate_profile(
                child.profile,
                max(child.branch_length, shortest_branch),
                return_log=False)
            child.seq_msg_to_parent = message
            combined *= message
            log_scale += child.lh_prefactor

        # sum over the states of each site; dividing by it makes every row
        # sum to one, and its log is kept so nothing is lost
        site_totals = combined.sum(axis=1)
        log_scale += np.log(site_totals)
        parent.profile = combined / site_totals[:, np.newaxis]
        parent.lh_prefactor = log_scale

    root = phylo.root
    root.pattern_profile_lh = (np.log(root.profile) +
                               root.lh_prefactor[:, np.newaxis])
Beispiel #2
0
def test_seq_joint_reconstruction_correct():
    """
    Evolve a random sequence along a small tree, then check the joint
    ancestral reconstruction against the simulated truth.

    Steps:
      1. draw a random root sequence and sample a sequence for every other
         node from the propagated profile of its parent,
      2. hand only the leaf sequences to TreeAnc as an alignment,
      3. run the joint reconstruction and compare the inferred sequences of
         all non-root nodes with the simulated ones,
      4. check that the likelihood returned by ``ancestral_likelihood()``
         matches ``tree.sequence_LH``.

    Returns the TreeAnc instance that was used.
    """

    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    try:
        from StringIO import StringIO  # python2.x
    except ImportError:  # python3.x
        from io import StringIO
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    # NOTE(review): pi does not sum to one; presumably GTR.custom
    # normalizes it -- confirm.
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]), W=np.ones((4, 4)))

    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        # link the children to their parent so they can find its sequence
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            # only the root already has a sequence
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize the profile so that every row is a probability vector
        p = (p.T / p.sum(axis=1)).T
        # sample the state of every site from its row of the profile
        ref_seq_idxs = np.array([int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
                                 for k in np.arange(p.shape[0])])

        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [(anc, pos, der) for pos, (anc, der) in
                              enumerate(izip(node.up.ref_seq, node.ref_seq)) if anc != der]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
                     for leaf in tree.get_terminals())
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0  # sites where the inferred state differs from the truth
    mut_count = 0   # simulated mutations, summed over all branches
    for node in myTree.tree.find_clades():
        n_mismatch = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count mismatching sites; `np.sum(...) == 0` would instead add
            # one for every node that was reconstructed perfectly
            diff_count += n_mismatch
            if n_mismatch:
                print("%s: True sequence does not equal inferred sequence. parent %s" % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s" % (node.name, node.up.name))
        print(node.name, n_mismatch, np.where(node.sequence != node.ref_seq), len(node.mutations), node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected.
    # float() keeps the ratios fractional under python2 integer division.
    assert float(diff_count) / seq_len <= 2 * (float(mut_count) / seq_len) ** 2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = myTree.tree.sequence_LH
    lh_diff = (LH - LH_p).sum()

    print("Difference between reference and inferred LH:", lh_diff)
    # compare the magnitude: a large negative difference must fail as well
    assert np.abs(lh_diff) < 1e-9

    return myTree
Beispiel #3
0
def test_seq_joint_reconstruction_correct():
    """
    Evolve a random sequence along a small tree, then check the joint
    ancestral reconstruction against the simulated truth.

    Steps:
      1. draw a random root sequence and sample a sequence for every other
         node from the propagated profile of its parent,
      2. hand only the leaf sequences to TreeAnc as an alignment,
      3. run the joint reconstruction and compare the inferred sequences of
         all non-root nodes with the simulated ones,
      4. check that the likelihood returned by ``ancestral_likelihood()``
         matches ``tree.sequence_LH``.

    Returns the TreeAnc instance that was used.
    """

    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    try:
        from StringIO import StringIO  # python2.x
    except ImportError:  # python3.x
        from io import StringIO
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(
        StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    # NOTE(review): pi does not sum to one; presumably GTR.custom
    # normalizes it -- confirm.
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4, 4)))

    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet,
                                         p=mygtr.Pi,
                                         size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        # link the children to their parent so they can find its sequence
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            # only the root already has a sequence
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize the profile so that every row is a probability vector
        p = (p.T / p.sum(axis=1)).T
        # sample the state of every site from its row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])

        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc,
                      der) in enumerate(izip(node.up.ref_seq, node.ref_seq))
            if anc != der
        ]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
                     for leaf in tree.get_terminals())
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0  # sites where the inferred state differs from the truth
    mut_count = 0  # simulated mutations, summed over all branches
    for node in myTree.tree.find_clades():
        n_mismatch = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count mismatching sites; `np.sum(...) == 0` would instead add
            # one for every node that was reconstructed perfectly
            diff_count += n_mismatch
            if n_mismatch:
                print(
                    "%s: True sequence does not equal inferred sequence. parent %s"
                    % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s" %
                      (node.name, node.up.name))
        print(node.name, n_mismatch,
              np.where(node.sequence != node.ref_seq), len(node.mutations),
              node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected.
    # float() keeps the ratios fractional under python2 integer division.
    assert float(diff_count) / seq_len <= 2 * (float(mut_count) /
                                               seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = myTree.tree.sequence_LH
    lh_diff = (LH - LH_p).sum()

    print("Difference between reference and inferred LH:", lh_diff)
    # compare the magnitude: a large negative difference must fail as well
    assert np.abs(lh_diff) < 1e-9

    return myTree
from __future__ import print_function, division
import numpy as np
from Bio import Phylo

if __name__ == '__main__':
    # Driver that calls the profile/sequence helpers once each on a random
    # profile so they can be timed. The durations quoted in the comments
    # below are the original author's notes, not measured here.
    from treetime.seq_utils import normalize_profile, prof2seq, seq2prof
    from treetime.gtr import GTR

    n_sites, n_states = 10000, 5
    branch_len = 0.1

    gtr = GTR.standard('JC69')
    dummy_prof = np.random.random(size=(n_sites, n_states))

    # normalize_profile: used a lot (300us); only its first return value
    # is needed
    norm_prof = normalize_profile(dummy_prof)[0]

    # evolve: used less but still a lot (50us)
    gtr.evolve(norm_prof, branch_len)

    # propagate_profile: used less but still a lot (50us)
    gtr.propagate_profile(norm_prof, branch_len)

    # prof2seq: used only in final, sample_from_prof=False speeds it up
    # (600us or 300us)
    sampled = prof2seq(norm_prof,
                       gtr,
                       sample_from_prof=True,
                       normalize=False)
    seq, p, seq_ii = sampled

    # seq2prof: used only initially (slow, 5ms)
    tmp_prof = seq2prof(seq, gtr.profile_map)
Beispiel #5
0
def evolve_seq(treefile,
               basename,
               mu=0.0001,
               L=1000,
               mygtr=None):
    """
    Generate a random sequence of a given length, and evolve it on the tree

    Args:
     - treefile: filename for the tree, on which a sequence should be evolved.
     - basename: filename prefix to save alignments.
     - mu: mutation rate. The units of the mutation rate should be consistent with
     the tree branch length
     - L: sequence length.
     - mygtr: GTR model for sequence evolution. When omitted (None), a fresh
     ``treetime.GTR.standard('jc')`` model is created for this call, so that
     no model instance is shared between calls.

    Returns:
     - (aln, full_aln, mu_real): the two alignments that were written and
     the realized mutation rate, i.e. the per-site fraction of changed
     positions summed over all branches, divided by the total branch
     length (NaN when the total branch length is zero).

    Side effects:
     - ``mygtr.mu`` is overwritten with ``mu`` (also on a model passed in
     by the caller).
     - ``basename + '.aln.ev.fasta'`` and ``basename + '.aln.ev_full.fasta'``
     are written.
    """
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip

    # build the default model per call: a default created in the signature
    # would be one shared object whose `mu` is overwritten on every call
    if mygtr is None:
        mygtr = treetime.GTR.standard('jc')
    mygtr.mu = mu

    tree = Phylo.read(treefile, 'newick')
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=L)
    print("Started sequence evolution...")
    mu_real = 0.0
    total_branch_length = 0.0
    for node in tree.find_clades():
        # link the children to their parent so they can find its sequence
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            # only the root already has a sequence
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize the profile so that every row is a probability vector
        p = (p.T / p.sum(axis=1)).T
        # sample the state of every site from its row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])
        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc,
                      der) in enumerate(izip(node.up.ref_seq, node.ref_seq))
            if anc != der
        ]
        mu_real += 1.0 * (node.ref_seq != node.up.ref_seq).sum() / L
        total_branch_length += t

    # a tree without any branch length gives no rate estimate
    if total_branch_length > 0:
        mu_real /= total_branch_length
    else:
        mu_real = float('nan')
    print("Mutation rate is {}".format(mu_real))

    def _leaf_records():
        # one record per terminal node, holding its simulated sequence
        return [
            SeqRecord(Seq("".join(k.ref_seq)), id=k.name, name=k.name)
            for k in tree.get_terminals()
        ]

    records = _leaf_records()
    # NOTE(review): the "full" alignment is built from the terminal nodes
    # only, exactly like `records`; possibly it was meant to cover the
    # internal nodes too -- confirm before changing.
    full_records = _leaf_records()

    aln = MultipleSeqAlignment(records)
    full_aln = MultipleSeqAlignment(full_records)
    print("Sequence evolution done...")

    # save results
    AlignIO.write(aln, basename + '.aln.ev.fasta', 'fasta')
    AlignIO.write(full_aln, basename + '.aln.ev_full.fasta', 'fasta')

    return aln, full_aln, mu_real
Beispiel #6
0
    S = -np.sum(np.log(gtr.Pi) * gtr.Pi, axis=0)
    print('Entropy:', S.mean(), S.std())

    # make a replica of the model with frequencies averaged over the sequence
    gtr.mu /= gtr.average_rate().mean()
    gtr_flat = GTR_site_specific.custom(alphabet=gtr.alphabet,
                                        mu=gtr.mu,
                                        W=gtr.W,
                                        pi=gtr.Pi.mean(axis=1))
    gtr_flat.mu /= gtr_flat.average_rate().mean()

    branch_length = []
    for r in range(20):  # repeat 20 times
        # sample a random sequence from the true profile
        s1 = prof2seq(gtr.Pi.T, gtr, sample_from_prof=True)
        s1_prof = seq2prof(s1[0], profile_map=gtr.profile_map)
        tmp_branch_length = []
        for t in np.linspace(0, 1, 31):
            # evolve the ancestral sequence for time t
            s2_prof = gtr.evolve(s1_prof, t)
            s2 = prof2seq(s2_prof, gtr, sample_from_prof=True)
            s2_prof_q = seq2prof(s2[0], profile_map=gtr.profile_map)

            # compute ML estimate for true model, flat model, and hamming distance
            tmp_branch_length.append(
                (t,
                 gtr.optimal_t_compressed((s1_prof, s2_prof_q),
                                          multiplicity=np.ones_like(gtr.mu),
                                          profiles=True),
                 gtr_flat.optimal_t_compressed(
                     (s1_prof, s2_prof_q),