Example #1
0
def build_tree(focal_alignment, outgroup='Wuhan/Hu-1/2019'):
    """
    Build a tree from the focal alignment with the ``fasttree`` command,
    root it, and post-process it with ``TreeAnc``.

    Parameters
    ----------
    focal_alignment : path-like
        path to a fasta file containing the sequences to build the
        focal-alignment tree.
    outgroup : str or None, optional
        name of the tip used to root the tree. Defaults to
        'Wuhan/Hu-1/2019'. Pass None to root the tree at its midpoint
        instead.

    Returns
    -------
    Bio.Phylo.Newick.Tree
        phylogenetic tree of focal-alignment.

    Raises
    ------
    subprocess.CalledProcessError
        if the ``fasttree`` command exits with a non-zero status.

    """
    tree_cmd = [
        "fasttree", "-nt", "-noml", "-nome", "-nosupport", focal_alignment
    ]

    # the stdout of the fasttree call is parsed as a newick string
    newick = subprocess.check_output(tree_cmd).decode()
    T = Phylo.read(io.StringIO(newick), 'newick')
    if outgroup is None:
        T.root_at_midpoint()
    else:
        T.root_with_outgroup(outgroup)

    # ancestral reconstruction has to run before short branches are pruned
    # and the tree is optimized (same call order as before)
    tt = TreeAnc(tree=T, aln=focal_alignment)
    tt.infer_ancestral_sequences(reconstruct_tip_states=False)
    tt.prune_short_branches()
    tt.optimize_tree()
    return tt.tree
def run_tree(fname, out_prefix, alphabet, model, n_iter=20):
    """
    Load the tree and alignment that belong to `fname`, perturb the branch
    lengths of the tree and re-optimize them with a fixed substitution model.

    Parameters
    ----------
    fname : str
        alignment file name. It is parsed with `parse_alignment_name` and
        its directory is used as prefix to locate the tree and alignment.
    out_prefix : str
        not used by this function; kept in the signature for callers.
    alphabet
        passed through to `TreeAnc` as `alphabet`.
    model
        passed through to `TreeAnc` as `gtr`.
    n_iter : int, optional
        passed to `TreeAnc.optimize_tree` as `max_iter`.

    Returns
    -------
    TreeAnc
        the `TreeAnc` instance after branch length optimization.
    """
    # local import: only the directory part of `fname` is needed here
    import os

    params = parse_alignment_name(fname)
    # `prefix` used to be referenced without ever being defined (NameError);
    # derive it from the directory of the alignment file
    prefix = os.path.dirname(fname)
    m = params['m']

    tree = Phylo.read(tree_name(prefix, params), 'newick')
    # give the root a branch length so the in-place rescaling below
    # has a number to work with on every clade
    tree.root.branch_length = 0.001
    tree.ladderize()
    for n in tree.find_clades():
        # rescale by m and a random factor between 0.6 and 1.0
        n.branch_length *= m * (0.6 + 0.4 * np.random.random())

    with gzip.open(alignment_name(prefix, params), 'rt') as fh:
        aln = AlignIO.read(fh, 'fasta')

    tt = TreeAnc(tree=tree, aln=aln, gtr=model, compress=False,
                 alphabet=alphabet, verbose=3)

    # the model is kept fixed (infer_gtr=False); only the tree is optimized
    tt.optimize_tree(branch_length_mode='marginal', max_iter=n_iter,
                     infer_gtr=False)

    return tt
Example #3
0
def run_tree(fname,
             out_prefix,
             alphabet,
             true_tree=False,
             true_model=False,
             pc=0.1,
             true_rates=False,
             n_iter=20):
    """
    Read a tree and an alignment and optimize its branch lengths using
    different types of models.

    The user can specify to either use the true model for optimization, just
    the true rates, or infer the entire model from the data. Either the true
    or an inferred tree topology can be used. The re-optimized tree is
    midpoint-rooted and written to a newick file; several diagnostics are
    printed along the way.

    Parameters
    ----------
    fname : str
        alignment file name. It is parsed with `parse_alignment_name` and
        its directory is used as prefix to locate the input files.
    out_prefix : str
        prefix handed to the helpers that build the output tree file name.
    alphabet
        passed through to `TreeAnc` as `alphabet`.
    true_tree : bool, optional
        if True, read the tree from `tree_name(...)` and randomize its branch
        lengths; otherwise read it from `reconstructed_tree_name(...)`.
    true_model : bool, optional
        if True, use the loaded model for the optimization instead of
        starting from 'JC69' and inferring a model.
    pc : float, optional
        stored in `params['pc']` and passed to `optimize_tree` as `pc`.
    true_rates : bool, optional
        if True, replace the rates of the inferred model by the loaded ones
        and optimize again.
    n_iter : int, optional
        passed to `optimize_tree` as `max_iter`.

    Returns
    -------
    None
    """
    params = parse_alignment_name(fname)
    params['pc'] = pc
    prefix = os.path.dirname(fname)
    m = params['m']

    tree_file = (tree_name(prefix, params) if true_tree
                 else reconstructed_tree_name(prefix, params))
    tree = Phylo.read(tree_file, 'newick')
    tree.root.branch_length = 0.001

    # only the true tree gets rescaled by m below, so scale its diagnostics too
    scale = m if true_tree else 1.0
    print(
        np.mean([x for c, x in tree.depths().items() if c.is_terminal()]) *
        scale)
    print(tree.root.clades[0].branch_length /
          tree.root.clades[1].branch_length)

    # remember the branch lengths before they are modified
    old_bl = [n.branch_length for n in tree.find_clades()]

    # randomize branch length of true tree to allow fair comparison
    if true_tree:
        for n in tree.find_clades():
            # rescale with mutation rate and multiply by a random number
            # between 0.6 and 1.0
            n.branch_length *= m * (0.6 + 0.4 * np.random.random())

    print(np.sum(old_bl) * scale, m)

    # load true GTR model. Use this for inference if true_model=True,
    # else start with Jukes Cantor
    true_GTR = load_model(model_name(prefix, params))
    if true_model:
        model = true_GTR
        model.mu /= m
    else:
        model = 'JC69'

    with gzip.open(alignment_name(prefix, params), 'rt') as fh:
        aln = AlignIO.read(fh, 'fasta')

    tt = TreeAnc(tree=tree,
                 aln=aln,
                 gtr=model,
                 compress=False,
                 alphabet=alphabet,
                 verbose=3)

    # run the tree optimization of treetime. the damping parameter slows down
    # the iterative branch length optimization to avoid oscillations and
    # run-away solutions. a site-specific GTR model is inferred if true_model
    # is False
    tt.optimize_tree(branch_length_mode='marginal',
                     max_iter=n_iter,
                     infer_gtr=not true_model,
                     site_specific_gtr=True,
                     pc=pc,
                     damping=0.75)

    # if the true rates are to be used, replace those in the model and
    # re-optimize
    if true_rates:
        # NOTE(review): with true_model=True, `model` aliases `true_GTR`, so
        # `true_GTR.mu` was already divided by m above and is divided again
        # here -- confirm whether combining both flags is intended.
        tt.gtr.mu = true_GTR.mu / m
        tt.optimize_tree(branch_length_mode='marginal',
                         max_iter=n_iter,
                         infer_gtr=False,
                         site_specific_gtr=True,
                         pc=pc,
                         damping=0.75)

    # collected before re-rooting, in the same traversal as old_bl
    new_bl = [n.branch_length for n in tt.tree.find_clades()]

    # save new tree to file
    tt.tree.root_at_midpoint()
    # use the function argument here, not a module-level `args` object
    if true_model:
        tfname = reoptimized_tree_true_model(out_prefix, params)
    else:
        tfname = reoptimized_tree(out_prefix, params, true_rates=true_rates)
    Phylo.write(tt.tree, tfname, 'newick')

    print(tt.tree.total_branch_length(), tt.gtr.average_rate().mean())
    print(np.mean([x for c, x in tt.tree.depths().items() if c.is_terminal()]),
          tt.tree.total_branch_length())
    print(tt.tree.root.clades[0].branch_length /
          tt.tree.root.clades[1].branch_length)
    print(np.corrcoef(old_bl, new_bl)[0, 1])
Example #4
0
from treetime import TreeAnc
from Bio import Phylo

# Read the raw tree, root it at its midpoint and attach the alignment.
T = Phylo.read('tree_raw.nwk', 'newick')
T.root_at_midpoint()
tt = TreeAnc(tree=T, aln='subsampled_alignment.fasta')
tt.optimize_tree(prune_short=True)

# Select the tips carrying more than 5 mutations whose third entry is not
# 'N' (presumably the derived state -- verify against TreeAnc). The list is
# built completely before any tip is removed from the tree.
to_prune = [
    leaf for leaf in tt.tree.get_terminals()
    if sum(1 for mut in leaf.mutations if mut[2] != 'N') > 5
]
for leaf in to_prune:
    tt.tree.prune(leaf)

# The pruned tree is written back to the file it was read from.
Phylo.write(tt.tree, 'tree_raw.nwk', 'newick')