Example 1
def ancestral_reconstruction(params):
    """
    Implements the treetime ancestral subcommand.
    """

    # set up
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    treeanc = TreeAnc(params.tree,
                      aln=aln,
                      ref=ref,
                      gtr=gtr,
                      verbose=1,
                      fill_overhangs=not params.keep_overhangs)

    try:
        ndiff = treeanc.infer_ancestral_sequences(
            'ml',
            infer_gtr=params.gtr == 'infer',
            marginal=params.marginal,
            fixed_pi=fixed_pi,
            reconstruct_tip_states=params.reconstruct_tip_states)
    except TreeTimeError as e:
        print(
            "\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n"
        )
        raise e

    ###########################################################################
    ### OUTPUT and saving of results
    ###########################################################################
    if params.gtr == 'infer':
        fname = outdir + '/sequence_evolution_model.txt'
        with open(fname, 'w', encoding='utf-8') as ofile:
            ofile.write(str(treeanc.gtr) + '\n')
        print('\nInferred sequence evolution model (saved as %s):' % fname)
        print(treeanc.gtr)

    export_sequences_and_tree(
        treeanc,
        basename,
        is_vcf,
        params.zero_based,
        report_ambiguous=params.report_ambiguous,
        reconstruct_tip_states=params.reconstruct_tip_states)

    return 0
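
A minimal stand-alone sketch of the TreeAnc calls this wrapper makes, assuming a plain FASTA alignment (no VCF), hypothetical file names, and that TreeTimeError is importable from the treetime package as in recent releases:

from treetime import TreeAnc, TreeTimeError

treeanc = TreeAnc("my_tree.nwk", aln="my_alignment.fasta", gtr="JC69",
                  verbose=1, fill_overhangs=True)
try:
    # infer_gtr=True refines the starting Jukes-Cantor model from the data
    treeanc.infer_ancestral_sequences('ml', infer_gtr=True, marginal=False,
                                      reconstruct_tip_states=False)
except TreeTimeError:
    print("Ancestral reconstruction failed, rerun with higher verbosity for details")
    raise
print(treeanc.gtr)  # the (possibly inferred) sequence evolution model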
Example 2
def test_ancestral():
    import os
    from Bio import AlignIO
    import numpy as np
    from treetime import TreeAnc, GTR
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.fasta'))
    nwk = str(os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.nwk'))

    for marginal in [True, False]:
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        print('ancestral reconstruction ' + ("marginal" if marginal else "joint"))
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    print('testing LH normalization')
    from io import StringIO
    from Bio import Phylo,AlignIO
    tiny_tree = Phylo.read(StringIO("((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"), 'newick')
    tiny_aln = AlignIO.read(StringIO(">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                                     ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                                     ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n"), 'fasta')

    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']), pi = np.array([0.9, 0.06, 0.02, 0.02]), W=np.ones((4,4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)
    lhsum =  (t.tree.root.marginal_profile.sum(axis=1) * np.exp(t.tree.root.marginal_subtree_LH_prefactor)).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    t.optimize_branch_len()
def infer_gene_gain_loss(path, rates=[1.0, 1.0]):
    # initialize GTR model with default parameters
    mu = np.sum(rates)
    gene_pi = np.array(rates) / mu
    gain_loss_model = GTR.custom(pi=gene_pi,
                                 mu=mu,
                                 W=np.ones((2, 2)),
                                 alphabet=np.array(['0', '1']))
    # add "unknown" state to profile
    gain_loss_model.profile_map['-'] = np.ones(2)
    root_dir = os.path.dirname(os.path.realpath(__file__))

    # define file names for pseudo alignment of presence/absence patterns as in 001001010110
    sep = '/'
    fasta = sep.join([path.rstrip(sep), 'geneCluster', 'genePresence.aln'])
    # strain tree based on core gene SNPs
    nwk = sep.join([path.rstrip(sep), 'geneCluster', 'strain_tree.nwk'])

    # instantiate treetime with custom GTR
    t = TreeAnc(nwk, gtr=gain_loss_model, verbose=2)
    # fix leaf names since Bio.Phylo interprets numeric leaf names as confidence values
    for leaf in t.tree.get_terminals():
        if leaf.name is None:
            leaf.name = str(leaf.confidence)
    t.aln = fasta
    t.tree.root.branch_length = 0.0001
    t.reconstruct_anc(method='ml')

    for n in t.tree.find_clades():
        n.genepresence = n.sequence

    return t
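
A hypothetical usage sketch; the directory name is made up, but it must contain geneCluster/genePresence.aln and geneCluster/strain_tree.nwk as assumed by the function above:

t = infer_gene_gain_loss('/path/to/pangenome_run', rates=[0.5, 1.0])
for node in t.tree.find_clades():
    # genepresence is the reconstructed 0/1 pattern over gene clusters
    n_present = sum(1 for state in node.genepresence if state == '1')
    print(node.name, n_present)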
Example 4
def ancestral_sequence_inference(tree=None,
                                 aln=None,
                                 ref=None,
                                 infer_gtr=True,
                                 marginal=False,
                                 fill_overhangs=True,
                                 infer_tips=False):
    """infer ancestral sequences using TreeTime

    Parameters
    ----------
    tree : Bio.Phylo tree or str
        tree or filename of tree
    aln : Bio.Align.MultipleSeqAlignment or str
        alignment or filename of alignment
    infer_gtr : bool, optional
        if True, infer a GTR model from the data instead of keeping the default Jukes-Cantor model
    marginal : str, optional
        reconstruction mode passed from the command line, either "joint" or "marginal"
    fill_overhangs : bool
       In some cases, the missing data on both ends of the alignment is
       filled with the gap character ('-'). If set to True, these end-gaps are
       converted to "ambiguous" characters ('N' for nucleotides, 'X' for
       aminoacids). Otherwise, the alignment is treated as-is
    infer_tips : bool
        Since v0.7, TreeTime does not reconstruct tip states by default.
        This is only relevant when tip states are not exactly specified, e.g. via
        characters that signify ambiguous states. To replace those with the
        most-likely state, set infer_tips=True

    Returns
    -------
    TreeAnc
        treetime.TreeAnc instance
    """

    from treetime import TreeAnc
    tt = TreeAnc(tree=tree,
                 aln=aln,
                 ref=ref,
                 gtr='JC69',
                 fill_overhangs=fill_overhangs,
                 verbose=1)

    # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
    bool_marginal = (marginal == "marginal")

    # only infer ancestral sequences, leave branch length untouched
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr,
                                 marginal=bool_marginal,
                                 reconstruct_tip_states=infer_tips)

    print(
        "\nInferred ancestral sequence states using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )

    return tt
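
A hypothetical call of the function above with made-up file names; note that marginal is the string passed from the command line ('joint' or 'marginal'), which the function converts to a bool, and get_reconstructed_alignment is the TreeAnc helper used elsewhere in these examples:

tt = ancestral_sequence_inference(tree="results/tree.nwk",
                                  aln="results/aligned.fasta",
                                  infer_gtr=True,
                                  marginal="joint",
                                  fill_overhangs=True,
                                  infer_tips=False)
anc_aln = tt.get_reconstructed_alignment()  # alignment including internal nodes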
Example 5
def test_ancestral():
    import os
    from Bio import AlignIO
    import numpy as np
    from treetime import TreeAnc, GTR
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.fasta'))
    nwk = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.nwk'))

    for marginal in [True, False]:
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        print('ancestral reconstruction ' + ("marginal" if marginal else "joint"))
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    print('testing LH normalization')
    from io import StringIO
    from Bio import Phylo, AlignIO
    tiny_tree = Phylo.read(StringIO("((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"), 'newick')
    tiny_aln = AlignIO.read(StringIO(">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                                     ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                                     ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n"), 'fasta')

    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']), pi = np.array([0.9, 0.06, 0.02, 0.02]), W=np.ones((4,4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)
    lhsum =  np.exp(t.sequence_LH(pos=np.arange(4**3))).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    t.optimize_branch_len()
Example 6
def build_tree(focal_alignment):
    """
    Parameters
    ----------
    focal_alignment : path-like
        path to a fasta file containing the sequences to build the
        focal-alignment tree.

    Returns
    -------
    Bio.Phylo.Newick.Tree
        phylogenetic tree of the focal alignment.

    """
    tree_cmd = [
        "fasttree", "-nt", "-noml", "-nome", "-nosupport", focal_alignment
    ]

    T = Phylo.read(io.StringIO(subprocess.check_output(tree_cmd).decode()),
                   'newick')
    T.root_with_outgroup('Wuhan/Hu-1/2019')
    #T.root_at_midpoint()

    tt = TreeAnc(tree=T, aln=focal_alignment)
    tt.infer_ancestral_sequences(reconstruct_tip_states=False)
    tt.prune_short_branches()
    tt.optimize_tree()
    return tt.tree
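
A hypothetical usage of build_tree; it assumes fasttree is on the PATH and that the outgroup 'Wuhan/Hu-1/2019' is present in the alignment:

from Bio import Phylo

tree = build_tree("focal_alignment.fasta")
Phylo.write(tree, "focal_tree.nwk", "newick")
print("built a tree with %d tips" % tree.count_terminals())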
Example 7
    def real_lh():
        """
        Likelihood of the sequences calculated by the joint ancestral
        sequence reconstruction
        """
        tiny_aln_1 = AlignIO.read(StringIO(">A\n"+A_char+"\n"
                                           ">B\n"+B_char+"\n"
                                           ">D\n"+D_char+"\n"), 'fasta')

        myTree_1 = TreeAnc(gtr=mygtr, tree = tiny_tree,
                            aln=tiny_aln_1, verbose = 4)

        myTree_1.reconstruct_anc(method='ml', marginal=False, debug=True)
        logLH = myTree_1.tree.sequence_LH
        return logLH
Example 9
def ancestral_reconstruction(params):
    """
    Implements the treetime ancestral subcommand.
    """

    # set up
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    treeanc = TreeAnc(params.tree,
                      aln=aln,
                      ref=ref,
                      gtr=gtr,
                      verbose=1,
                      fill_overhangs=not params.keep_overhangs)
    ndiff = treeanc.infer_ancestral_sequences('ml',
                                              infer_gtr=params.gtr == 'infer',
                                              marginal=params.marginal,
                                              fixed_pi=fixed_pi)
    if ndiff == ttconf.ERROR:  # if reconstruction failed, exit
        return 1

    ###########################################################################
    ### OUTPUT and saving of results
    ###########################################################################
    if params.gtr == "infer":
        print('\nInferred GTR model:')
        print(treeanc.gtr)

    export_sequences_and_tree(treeanc,
                              basename,
                              is_vcf,
                              params.zero_based,
                              report_ambiguous=params.report_ambiguous)

    return 0
Example 10
    def ref_lh():
        """
        reference likelihood - LH values for all possible variants
        of the internal node sequences
        """

        tiny_aln = AlignIO.read(
            StringIO(">A\n" + A_seq + "\n"
                     ">B\n" + B_seq + "\n"
                     ">D\n" + D_seq + "\n"
                     ">C\nAAAACCCCGGGGTTTT\n"
                     ">E\nACGTACGTACGTACGT\n"), 'fasta')

        myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln, verbose=4)

        logLH_ref = myTree.ancestral_likelihood()

        return logLH_ref
Example 11
    def ref_lh():
        """
        reference likelihood - LH values for all possible variants
        of the internal node sequences
        """

        tiny_aln = AlignIO.read(StringIO(">A\n" + A_seq + "\n"
                                         ">B\n" + B_seq + "\n"
                                         ">D\n" + D_seq + "\n"
                                         ">C\nAAAACCCCGGGGTTTT\n"
                                         ">E\nACGTACGTACGTACGT\n"), 'fasta')

        myTree = TreeAnc(gtr=mygtr, tree = tiny_tree,
                         aln =tiny_aln, verbose = 4)

        logLH_ref = myTree.ancestral_likelihood()

        return logLH_ref
def run_tree(fname, out_prefix, alphabet, model, n_iter=20):
    params = parse_alignment_name(fname)
    m = params['m']
    tree = Phylo.read(tree_name(prefix, params), 'newick')
    tree.root.branch_length = 0.001
    tree.ladderize()
    for n in tree.find_clades():
        n.branch_length *= m*(0.6+0.4*np.random.random())

    with gzip.open(alignment_name(prefix, params), 'rt') as fh:
        aln = AlignIO.read(fh, 'fasta')

    tt = TreeAnc(tree=tree, aln=aln, gtr = model, compress=False,
                 alphabet=alphabet, verbose=3)

    tt.optimize_tree(branch_length_mode='marginal', max_iter=n_iter,
                     infer_gtr=False)

    return tt
Example 13
def ancestral_sequence_inference(tree=None,
                                 aln=None,
                                 ref=None,
                                 infer_gtr=True,
                                 marginal=False):
    from treetime import TreeAnc
    tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', verbose=1)

    # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
    bool_marginal = (marginal == "marginal")

    # only infer ancestral sequences, leave branch length untouched
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal)

    print(
        "\nInferred ancestral sequence states using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )

    return tt
Example 14
def ancestral_reconstruction(params):
    """
    Implements the treetime ancestral subcommand.
    """

    # set up
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    treeanc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                      fill_overhangs=not params.keep_overhangs)
    ndiff = treeanc.infer_ancestral_sequences('ml', infer_gtr=params.gtr=='infer',
                                              marginal=params.marginal, fixed_pi=fixed_pi)
    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1

    ###########################################################################
    ### OUTPUT and saving of results
    ###########################################################################
    if params.gtr=="infer":
        print('\nInferred GTR model:')
        print(treeanc.gtr)

    export_sequences_and_tree(treeanc, basename, is_vcf, params.zero_based,
                              report_ambiguous=params.report_ambiguous)

    return 0
Example 15
def getAge(start, end, species, mytree, margin=50):
    from io import StringIO
    from Bio import Phylo, AlignIO
    from treetime import TreeAnc
    # analysis_start, home and id are module-level globals of the original script
    myfasta = ''
    for spec in species.keys():
        myfasta += '>'+spec+'\nA'+str(''.join(species[spec][(start-margin-analysis_start):(end+margin-analysis_start)]))+'A\n'
    my_aln = AlignIO.read(StringIO(myfasta), 'fasta')
    ta = TreeAnc(tree=mytree, aln=my_aln, gtr='JC69')
    ta.infer_ancestral_sequences(method='ml', infer_gtr=True, marginal=False)
    seq = ta.get_reconstructed_alignment()
    print('ancestral seq inferred')
    # for i in range(len(seq)):
    #     print(seq[i].id+' '+seq[i].seq[(margin+1):(margin+1+end-start)])
    Phylo.write(ta.tree, home+'/Annotation/Conservation/SEqAlign/test_'+id+'.nwk', 'newick')
    mytreeAnnot = Phylo.read(home+'/Annotation/Conservation/SEqAlign/test_'+id+'.nwk', "newick")
    AncestralSeq = {}
    for y in seq:
        AncestralSeq[y.id] = str(y.seq[(margin+1):(margin+1+end-start)])
    AppearedInCommonAncestor = ['human','chimp','gorilla','orangutan','macaque','marmoset','tarsier','lemur','treeShrew','mouse','cow','elephant','opossum','platypus','chicken','frog','zebrafish','lamprey']
    AppearedInCommonAncestorScientific = ['hg19','panTro2','gorGor1','ponAbe2','rheMac2','calJac1','tarSyr1','micMur1','tupBel1','mm9','bosTau4','loxAfr3','monDom5','ornAna1','galGal3','xenTro2','danRer6','petMar1']
    CommonAncestor = ['hg19','NODE_0000044','NODE_0000043','NODE_0000042','NODE_0000040','NODE_0000039','NODE_0000038','NODE_0000036','NODE_0000035','NODE_0000028','NODE_0000018','NODE_0000013','NODE_0000011','NODE_0000010','NODE_0000007','NODE_0000006','NODE_0000001','NODE_0000000']
    CommonAncestorName = ['Humans','Chimpanzees','Gorillas','Orangutans','Gibbons','Monkeys','Tarsiers','Lemurs','Shrews','Rodents','Boroeutheria','Afroeutheria','Marsupials','Monotremes','Sauropsids','Amphibians','Vertebrates','Chordates']
    PhyloGeneticDistance = [mytreeAnnot.distance('hg19', CommonAncestor[i]) for i in range(0, len(CommonAncestor))]
Example 16
def run_beast(args):
    '''
    BEAST MCC tree to newick and node-data JSON for further augur processing / export
    '''
    verbose = args.verbose
    print("importing from BEAST MCC tree", args.mcc)

    if args.recursion_limit:
        print("Setting recursion limit to %d" % (args.recursion_limit))
        sys.setrecursionlimit(args.recursion_limit)

    # node data is the dict that will be exported as json
    node_data = {
        'comment': "Imported from a BEAST MCC tree using `augur import beast`",
        'mcc_file': args.mcc
    }

    tree = parse_nexus(tree_path=args.mcc, verbose=args.verbose)
    summarise_parsed_traits(tree)
    # Phylo.draw_ascii(tree)
    # instantiate treetime for the sole reason to name internal nodes (!)
    # note that tt.tree = tree, and this is modified in-place by this function
    tt = TreeAnc(tree=tree,
                 aln=fake_alignment(tree),
                 ref=None,
                 gtr='JC69',
                 verbose=1)

    # extract date information from the tree
    root_date_offset, most_recent_tip = calc_tree_dates(
        tree, args.most_recent_tip_date, args.tip_date_regex,
        args.tip_date_format, args.tip_date_delimeter)
    compute_entropies_for_discrete_traits(tree)

    node_data['nodes'] = collect_node_data(tree, root_date_offset,
                                           most_recent_tip)

    tree_success = Phylo.write(tree,
                               args.output_tree,
                               'newick',
                               format_branch_length='%1.8f')
    json_success = write_json(node_data, args.output_node_data)

    print_what_to_do_next(nodes=node_data['nodes'],
                          mcc_path=args.mcc,
                          tree_path=args.output_tree,
                          node_data_path=args.output_node_data)
Example 17
def ancestral_sequence_inference(tree=None, aln=None, infer_gtr=True,
                                 optimize_branch_length=True):
    from treetime import TreeAnc
    tt = TreeAnc(tree=tree, aln=aln, gtr='JC69')

    if optimize_branch_length:
        tt.optimize_seq_and_branch_len(infer_gtr=infer_gtr)
    else: # only infer ancestral sequences, leave branch length untouched
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr)

    return tt
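
A hypothetical call of this variant with branch-length optimization switched off, so only ancestral states are inferred and the input branch lengths are kept:

from Bio import Phylo

T = Phylo.read("tree.nwk", "newick")
tt = ancestral_sequence_inference(tree=T, aln="aligned.fasta",
                                  infer_gtr=True, optimize_branch_length=False)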
Example 18
def assure_tree(params, tmp_dir='treetime_tmp'):
    """
    Function that attempts to load a tree and build it from the alignment
    if no tree is provided.
    """
    if params.tree is None:
        params.tree = os.path.basename(params.aln)+'.nwk'
        print("No tree given: inferring tree")
        utils.tree_inference(params.aln, params.tree, tmp_dir = tmp_dir)

    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)

    try:
        tt = TreeAnc(params.tree)
    except:
        print("Tree loading/building failed.")
        return 1
    return 0
    def build(self, root='midpoint', raxml=True, fasttree_program='fasttree', raxml_time_limit=0.5, treetime_used=True):
        '''
        build a phylogenetic tree using fasttree and raxML (optional)
        based on nextflu tree building pipeline
        '''
        import subprocess
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        AlignIO.write(self.aln, 'origin.fasta', 'fasta')
        name_translation = make_strains_unique(self.aln)
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = [fasttree_program]
        if self.nuc: tree_cmd.append("-nt")
        tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"

        if raxml==False:
            #shutil.copy('initial_tree.newick', out_fname)
            polytomies_midpointRooting('initial_tree.newick',out_fname, self.clusterID)
        elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')]))>3:
        ## only for tree with >3 strains
            if raxml_time_limit>0:
                tmp_tree = Phylo.read('initial_tree.newick','newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                    resolve_iter+=1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
                AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
                print( "RAxML tree optimization with time limit", raxml_time_limit,  "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit*3600)
                process = subprocess.Popen("exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()

                checkpoint_files = glob.glob("RAxML_checkpoint*")
                if os.path.isfile('RAxML_result.topology'):
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            try:
                print("RAxML branch length optimization")
                os.system("raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                shutil.copy('RAxML_result.branches', out_fname)
            except:
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)

        if treetime_used:
            # load the resulting tree as a treetime instance
            from treetime import TreeAnc
            self.tt = TreeAnc(tree=out_fname, aln=self.aln, gtr='Jukes-Cantor', verbose=0)
            # provide short cut to tree and revert names that conflicted with newick format
            self.tree = self.tt.tree
        else:
            self.tree = Phylo.read(out_fname,'newick')
        self.tree.root.branch_length=0.0001
        restore_strain_name(name_translation, self.aln)
        restore_strain_name(name_translation, self.tree.get_terminals())

        for node in self.tree.find_clades():
            if node.name is not None:
                if node.name.startswith('NODE_')==False:
                    node.ann=node.name
            else:
                node.name='NODE_0'

        os.chdir(cwd)
        remove_dir(self.run_dir)
        self.is_timetree=False
Example 20
def scan_homoplasies(params):
    """
    Implements the treetime homoplasy scan.
    """
    if assure_tree(params, tmp_dir='homoplasy_tmp'):
        return 1

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    treeanc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                      fill_overhangs=True)
    if treeanc.aln is None: # if alignment didn't load, exit
        return 1

    if is_vcf:
        L = len(ref) + params.const
    else:
        L = treeanc.aln.get_alignment_length() + params.const

    N_seq = len(treeanc.aln)
    N_tree = treeanc.tree.count_terminals()
    if params.rescale!=1.0:
        for n in treeanc.tree.find_clades():
            n.branch_length *= params.rescale
            n.mutation_length = n.branch_length

    print("read alignment from file %s with %d sequences of length %d"%(params.aln,N_seq,L))
    print("read tree from file %s with %d leaves"%(params.tree,N_tree))
    print("\ninferring ancestral sequences...")

    ndiff = treeanc.infer_ancestral_sequences('ml', infer_gtr=params.gtr=='infer',
                                      marginal=False, fixed_pi=fixed_pi)
    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        print("Something went wrong during ancestral reconstruction, please check your input files.", file=sys.stderr)
        return 1
    else:
        print("...done.")

    if is_vcf:
        treeanc.recover_var_ambigs()

    ###########################################################################
    ### analysis of reconstruction
    ###########################################################################
    from collections import defaultdict
    from scipy.stats import poisson
    offset = 0 if params.zero_based else 1

    if params.drms:
        DRM_info = read_in_DRMs(params.drms, offset)
        drms = DRM_info['DRMs']

    # construct dictionaries gathering mutations and positions
    mutations = defaultdict(list)
    positions = defaultdict(list)
    terminal_mutations = defaultdict(list)
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue

        if len(n.mutations):
            for (a,pos, d) in n.mutations:
                if '-' not in [a,d] and 'N' not in [a,d]:
                    mutations[(a,pos+offset,d)].append(n)
                    positions[pos+offset].append(n)
            if n.is_terminal():
                for (a,pos, d) in n.mutations:
                    if '-' not in [a,d] and 'N' not in [a,d]:
                        terminal_mutations[(a,pos+offset,d)].append(n)

    # gather homoplasic mutations by strain
    mutation_by_strain = defaultdict(list)
    for n in treeanc.tree.get_terminals():
        for a,pos,d in n.mutations:
            if pos+offset in positions and len(positions[pos+offset])>1:
                if '-' not in [a,d] and 'N' not in [a,d]:
                    mutation_by_strain[n.name].append([(a,pos+offset,d), len(positions[pos+offset])])


    # total_branch_length is the expected number of substitutions
    # corrected_branch_length is the expected number of observable substitutions
    # (probability of an odd number of substitutions at a particular site)
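    # (an even number of substitutions at a site returns it to the same state and is
    #  invisible in the reconstruction; with Poisson-distributed substitution counts
    #  along a branch of length t, P(odd) = sum_{k odd} e^{-t} t^k / k! = e^{-t}*sinh(t),
    #  which is what is summed over branches below)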
    total_branch_length = treeanc.tree.total_branch_length()
    corrected_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                      for x in treeanc.tree.find_clades()])
    corrected_terminal_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                      for x in treeanc.tree.get_terminals()])
    expected_mutations = L*corrected_branch_length
    expected_terminal_mutations = L*corrected_terminal_branch_length

    # make histograms and sum mutations in different categories
    multiplicities = np.bincount([len(x) for x in mutations.values()])
    total_mutations = np.sum([len(x) for x in mutations.values()])

    multiplicities_terminal = np.bincount([len(x) for x in terminal_mutations.values()])
    terminal_mutation_count = np.sum([len(x) for x in terminal_mutations.values()])

    multiplicities_positions = np.bincount([len(x) for x in positions.values()])
    multiplicities_positions[0] = L - np.sum(multiplicities_positions)

    ###########################################################################
    ### Output the distribution of times particular mutations are observed
    ###########################################################################
    print("\nThe TOTAL tree length is %1.3e and %d mutations were observed."
          %(total_branch_length,total_mutations))
    print("Of these %d mutations,"%total_mutations
            +"".join(['\n\t - %d occur %d times'%(n,mi)
                      for mi,n in enumerate(multiplicities) if n]))
    # additional optional output this for terminal mutations only
    if params.detailed:
        print("\nThe TERMINAL branch length is %1.3e and %d mutations were observed."
              %(corrected_terminal_branch_length,terminal_mutation_count))
        print("Of these %d mutations,"%terminal_mutation_count
                +"".join(['\n\t - %d occur %d times'%(n,mi)
                          for mi,n in enumerate(multiplicities_terminal) if n]))


    ###########################################################################
    ### Output the distribution of times mutations at particular positions are observed
    ###########################################################################
    print("\nOf the %d positions in the genome,"%L
            +"".join(['\n\t - %d were hit %d times (expected %1.2f)'%(n,mi,L*poisson.pmf(mi,1.0*total_mutations/L))
                      for mi,n in enumerate(multiplicities_positions) if n]))


    # compare that distribution to a Poisson distribution with the same mean
    p = poisson.pmf(np.arange(10*multiplicities_positions.max()),1.0*total_mutations/L)
    print("\nlog-likelihood difference to Poisson distribution with same mean: %1.3e"%(
            - L*np.sum(p*np.log(p+1e-100))
            + np.sum(multiplicities_positions*np.log(p[:len(multiplicities_positions)]+1e-100))))


    ###########################################################################
    ### Output the mutations that are observed most often
    ###########################################################################
    if params.drms:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
        mutations_sorted = sorted(mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                    " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
            else:
                break
    else:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity")
        mutations_sorted = sorted(mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
            else:
                break

    # optional output specifically for mutations on terminal branches
    if params.detailed:
        if params.drms:
            print("\n\nThe ten most homoplasic mutations on terminal branches are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
            terminal_mutations_sorted = sorted(terminal_mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                        " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
                else:
                    break
        else:
            print("\n\nThe ten most homoplasic mutations on terminal branches are:\n\tmut\tmultiplicity")
            terminal_mutations_sorted = sorted(terminal_mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
                else:
                    break

    ###########################################################################
    ### Output strains that have many homoplasic mutations
    ###########################################################################
    # TODO: add statistical criterion
    if params.detailed:
        if params.drms:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations\t# DRM")
            mutation_by_strain_sorted = sorted(mutation_by_strain.items(), key=lambda x:len(x[1]), reverse=True)
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d\t%d"%(name, len(val),
                        len([mut for mut,l in val if mut[1] in drms])))
        else:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations")
            mutation_by_strain_sorted = sorted(mutation_by_strain.items(), key=lambda x:len(x[1]), reverse=True)
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d"%(name, len(val)))


    return 0
Example 21
def mugration(params):
    """
    Implements the treetime mugration subcommand.
    """

    ###########################################################################
    ### Parse states
    ###########################################################################
    if os.path.isfile(params.states):
        states = pd.read_csv(params.states, sep='\t' if params.states[-3:]=='tsv' else ',',
                             skipinitialspace=True)
    else:
        print("file with states does not exist")
        return 1

    outdir = get_outdir(params, '_mugration')

    taxon_name = 'name' if 'name' in states.columns else states.columns[0]
    if params.attribute:
        if params.attribute in states.columns:
            attr = params.attribute
        else:
            print("The specified attribute was not found in the metadata file "+params.states, file=sys.stderr)
            print("Available columns are: "+", ".join(states.columns), file=sys.stderr)
            return 1
    else:
        attr = states.columns[1]
        print("Attribute for mugration inference was not specified. Using "+attr, file=sys.stderr)

    leaf_to_attr = {x[taxon_name]:x[attr] for xi, x in states.iterrows()
                    if x[attr]!=params.missing_data}
    unique_states = sorted(set(leaf_to_attr.values()))
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i,state in enumerate(unique_states)]
    missing_char = chr(65+nc)
    letter_to_state = {a:unique_states[i] for i,a in enumerate(alphabet)}
    letter_to_state[missing_char]=params.missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if params.weights:
        params.infer_gtr = True
        tmp_weights = pd.read_csv(params.weights, sep='\t' if params.weights[-3:]=='tsv' else ',',
                             skipinitialspace=True)
        weights = {row[0]:row[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        weights = np.array([weights[c] if c in weights else mean_weight for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = np.ones(nc, dtype=float)/nc

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(params.tree, gtr=mugration_GTR, verbose=params.verbose,
                      convert_upper=False, one_mutation=0.001)
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[leaf_to_attr[n.name]]
                           if n.name in leaf_to_attr else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=params.pc, marginal=True, normalized_rate=False,
            fixed_pi=weights if params.weights else None)
    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1


    ###########################################################################
    ### output
    ###########################################################################
    print("\nCompleted mugration model inference of attribute '%s' for"%attr,params.tree)

    basename = get_basename(params, outdir)
    gtr_name = basename + 'GTR.txt'
    with open(gtr_name, 'w') as ofile:
        ofile.write('Character to attribute mapping:\n')
        for state in unique_states:
            ofile.write('  %s: %s\n'%(reverse_alphabet[state], state))
        ofile.write('\n\n'+str(treeanc.gtr)+'\n')
        print("\nSaved inferred mugration model as:", gtr_name)

    terminal_count = 0
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue
        n.confidence=None
        # due to a bug in older versions of biopython that truncated filenames in nexus export
        # we truncate them by hand and make them unique.
        if n.is_terminal() and len(n.name)>40 and bioversion<"1.69":
            n.name = n.name[:35]+'_%03d'%terminal_count
            terminal_count+=1
        n.comment= '&%s="'%attr + letter_to_state[n.sequence[0]] +'"'

    if params.confidence:
        conf_name = basename+'confidence.csv'
        with open(conf_name, 'w') as ofile:
            ofile.write('#name, '+', '.join(unique_states)+'\n')
            for n in treeanc.tree.find_clades():
                ofile.write(n.name + ', '+', '.join([str(x) for x in n.marginal_profile[0]])+'\n')
        print("Saved table with ancestral state confidences as:", conf_name)

    # write tree to file
    outtree_name = basename+'annotated_tree.nexus'
    Phylo.write(treeanc.tree, outtree_name, 'nexus')
    print("Saved annotated tree as:",outtree_name)

    return 0
Example 22
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    unique_states = sorted(set(traits.values()))
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return None, None, None
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i,state in enumerate(unique_states)]
    missing_char = chr(65+nc)
    letter_to_state = {a:unique_states[i] for i,a in enumerate(alphabet)}
    letter_to_state[missing_char]=missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if type(weights)==str:
        tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                             skipinitialspace=True)
        weights = {row[0]:row[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        weights = np.array([weights[c] if c in weights else mean_weight for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[traits[n.name]]
                           if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError as e:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise e

    for i in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False, reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length' assumption. "
          "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
          "along which multiple changes accumulated. This is expected to affect estimates of the "
          "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
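
A hypothetical usage sketch with made-up tip names and states; reading the ancestral state from node.sequence[0] mirrors the mugration wrapper above:

traits = {"tip_1": "Europe", "tip_2": "Asia", "tip_3": "Europe"}
treeanc, letter_to_state, reverse_alphabet = reconstruct_discrete_traits(
    "tree.nwk", traits, missing_data='?', pc=1.0, sampling_bias_correction=2.5)
if treeanc is not None:
    for node in treeanc.tree.find_clades():
        # each node carries a one-character pseudo-sequence; map it back to the state
        print(node.name, letter_to_state[node.sequence[0]])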
Example 23
import matplotlib.pyplot as plt
from Bio import Phylo
from treetime import TreeAnc

if __name__ == '__main__':
    plt.ion()

    # load data
    base_name = 'data/H3N2_NA_allyears_NA.20'
    T = Phylo.read(base_name + ".nwk", "newick")
    T.root_with_outgroup(
        next(T.find_clades(lambda x: x.name and x.name.startswith('A/Scot'))))

    # instantiate treetime
    myTree = TreeAnc(gtr='Jukes-Cantor',
                     tree=T,
                     aln=base_name + '.fasta',
                     verbose=0)

    # optimize branch lengths, infer a GTR model, and reoptimize branch lengths
    myTree.optimize_sequences_and_branch_length(infer_gtr=True)

    # let's examine the properties of a node in the tree after ancestral inference
    node = myTree.tree.get_nonterminals()[7]
    # each node now has an inferred sequence
    print("the inferred sequence is an array of states:", node.sequence)

    # in addition, each node of the tree now has a list of inferred mutations attached
    # note that mutation positions are 0-based rather than 1-based
    print("mutations of node %s:" % node.name, node.mutations)

    # we can readily verify these mutations by checking the inferred sequences
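    # a hedged sketch of that check, assuming TreeTime attaches parent links as
    # node.up and that node.mutations lists (ancestral, position, derived) tuples
    # with 0-based positions, as used elsewhere in these examples
    parent = node.up
    for anc, pos, der in node.mutations:
        assert parent.sequence[pos] == anc
        assert node.sequence[pos] == der
        print("position %d: %s -> %s" % (pos, anc, der))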
Example 24
    args = parser.parse_args()

    genes = args.genes if type(args.genes)==list else [args.genes]
    translations = args.translations if type(args.translations)==list else [args.translations]

    T = Phylo.read(args.tree, 'newick')
    leafs = {n.name for n in T.get_terminals()}

    node_data = {}
    for gene, translation in zip(genes, translations):
        seqs = []
        for s in SeqIO.parse(translation, 'fasta'):
            if s.id in leafs:
                seqs.append(s)


        tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa')

        tt.infer_ancestral_sequences(reconstruct_tip_states=True)

        with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh:
            for n in tt.tree.find_clades():
                if n.name not in node_data:
                    node_data[n.name] = {"aa_muts":{}}
                node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations]
                fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n")


    with open(args.output, 'w') as fh:
        json.dump({"nodes":node_data}, fh)
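
For orientation, the node-data JSON written above has roughly this shape (node and gene names are hypothetical):

example_node_data = {
    "nodes": {
        "NODE_0000001": {"aa_muts": {"HA1": ["K160T", "S193D"]}},
        "A/strain/2017": {"aa_muts": {"HA1": []}},
    }
}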
Example 25
                    "GTR params are not specified. Creating GTR model with default parameters"
                )

            gtr = GTR.standard(model, **kwargs)
        except:
            print(
                "Could not create GTR model from input arguments. Using default (Jukes-Cantor 1969)"
            )
            gtr = GTR.standard('jc')

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    treeanc = TreeAnc(params.tree,
                      aln=params.aln,
                      gtr=gtr,
                      verbose=1,
                      fill_overhangs=True)
    L = treeanc.aln.get_alignment_length()
    N_seq = len(treeanc.aln)
    N_tree = treeanc.tree.count_terminals()

    print("read alignment from file %s with %d sequences of length %d" %
          (params.aln, N_seq, L))
    print("read tree from file %s with %d leaves" % (params.tree, N_tree))
    print("\ninferring ancestral sequences...")

    treeanc.infer_ancestral_sequences('ml',
                                      infer_gtr=infer_gtr,
                                      marginal=False)
    print("...done.")
class mpm_tree(object):
    '''
    class that aligns a set of sequences and infers a tree
    '''
    def __init__(self, cluster_seq_filepath, **kwarks):
        self.clusterID = cluster_seq_filepath.split('/')[-1].split('.fna')[0]
        if 'speciesID' in kwarks:
            folderID = kwarks['speciesID']
        else:
            folderID = cluster_seq_filepath.split('/')[-3]
        self.seqs = {
            x.id: x
            for x in SeqIO.parse(cluster_seq_filepath, 'fasta')
        }
        if 'run_dir' not in kwarks:
            import random
            #self.run_dir = '_'.join(['tmp', self.clusterID])
            self.run_dir = 'tmp/'
            self.run_dir += '_'.join([
                folderID, 'tmp',
                time.strftime('%H%M%S', time.gmtime()),
                str(random.randint(0, 100000000))
            ])
        else:
            self.run_dir = kwarks['run_dir']
        self.nuc = True

    def codon_align(self,
                    alignment_tool="mafft",
                    prune=True,
                    discard_premature_stops=False):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        for seq in self.seqs.values():
            tempseq = seq.seq.translate(table="Bacterial")
            # use only sequences that translate without trouble
            if not discard_premature_stops or '*' not in str(
                    tempseq)[:-1] or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
            else:
                print(seq.id, "has premature stops, discarding")

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'mafft':
            os.system(
                'mafft --reorder --amino temp_in.fasta 1> temp_out.fasta')
            aln_aa = AlignIO.read('temp_out.fasta', "fasta")
        elif alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname,
                                      out=tmpfname[:-5] + 'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")
        else:
            print('Alignment tool not supported: ' + alignment_tool)
            #return

        #generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        os.chdir(cwd)
        remove_dir(self.run_dir)

    def align(self):
        '''
        align sequences in self.seqs using mafft
        '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        os.system(
            'mafft --reorder --anysymbol temp_in.fasta 1> temp_out.fasta 2> mafft.log'
        )

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        os.chdir(cwd)
        remove_dir(self.run_dir)

    def build(self,
              root='midpoint',
              raxml=True,
              fasttree_program='fasttree',
              raxml_time_limit=0.5,
              treetime_used=True):
        '''
        build a phylogenetic tree using fasttree and raxML (optional)
        based on nextflu tree building pipeline
        '''
        import subprocess
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        AlignIO.write(self.aln, 'origin.fasta', 'fasta')
        name_translation = make_strains_unique(self.aln)
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = [fasttree_program]
        if self.nuc: tree_cmd.append("-nt")
        tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"

        if raxml == False:
            #shutil.copy('initial_tree.newick', out_fname)
            polytomies_midpointRooting('initial_tree.newick', out_fname,
                                       self.clusterID)
        elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')])) > 3:
            ## only for tree with >3 strains
            if raxml_time_limit > 0:
                tmp_tree = Phylo.read('initial_tree.newick', 'newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter < 10):
                    resolve_iter += 1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree, 'initial_tree.newick', 'newick')
                AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
                print("RAxML tree optimization with time limit",
                      raxml_time_limit, "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit * 3600)
                process = subprocess.Popen(
                    "exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick",
                    shell=True)
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()

                checkpoint_files = glob.glob("RAxML_checkpoint*")
                if os.path.isfile('RAxML_result.topology'):
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            try:
                print("RAxML branch length optimization")
                os.system(
                    "raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick"
                )
                shutil.copy('RAxML_result.branches', out_fname)
            except:
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)

        if treetime_used:
            # load the resulting tree as a treetime instance
            from treetime import TreeAnc
            self.tt = TreeAnc(tree=out_fname,
                              aln=self.aln,
                              gtr='Jukes-Cantor',
                              verbose=0)
            # provide short cut to tree and revert names that conflicted with newick format
            self.tree = self.tt.tree
        else:
            self.tree = Phylo.read(out_fname, 'newick')
        self.tree.root.branch_length = 0.0001
        restore_strain_name(name_translation, self.aln)
        restore_strain_name(name_translation, self.tree.get_terminals())

        for node in self.tree.find_clades():
            if node.name is not None:
                if node.name.startswith('NODE_') == False:
                    node.ann = node.name
            else:
                node.name = 'NODE_0'

        os.chdir(cwd)
        remove_dir(self.run_dir)
        self.is_timetree = False

    def ancestral(self, translate_tree=False):
        '''
        infer ancestral nucleotide sequences using maximum likelihood
        and translate the resulting sequences (+ terminals) to amino acids
        '''
        try:
            self.tt.reconstruct_anc(method='ml')
        except:
            print("trouble at self.tt.reconstruct_anc(method='ml')")

        if translate_tree:
            for node in self.tree.find_clades():
                node.aa_sequence = np.fromstring(str(
                    self.translate_seq("".join(node.sequence))),
                                                 dtype='S1')

    def refine(self, CDS=True):
        '''
        determine mutations on each branch and attach as string to the branches
        '''
        for node in self.tree.find_clades():

            if node.up is not None:
                node.muts = ",".join([
                    "".join(map(str, x)) for x in node.mutations
                    if '-' not in x
                ])
                if CDS == True:
                    node.aa_muts = ",".join([
                        anc + str(pos + 1) + der
                        for pos, (anc, der) in enumerate(
                            zip(node.up.aa_sequence, node.aa_sequence))
                        if anc != der and '-' not in anc and '-' not in der
                    ])

    def translate_seq(self, seq):
        '''
        custom translation sequence that handles gaps
        '''
        if type(seq) != str:
            str_seq = str(seq.seq)
        else:
            str_seq = seq
        try:
            # soon not needed as future biopython version will translate --- into -
            tmp_seq = Seq(
                str(
                    Seq(str_seq.replace('---', 'NNN')).translate(
                        table="Bacterial")).replace('X', '-'))
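            # in-frame gaps: '---' codons are first rewritten as 'NNN', which
            # translate to 'X'; the 'X' residues are then mapped back to '-'
            # so that gaps survive the translation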
        except:
            tmp_seq = Seq(
                str(
                    Seq(str_seq.replace(
                        '-',
                        'N')).translate(table="Bacterial")).replace('X', '-'))
        return tmp_seq

    def translate(self):
        '''
        translate the nucleotide alignment to an amino acid alignment
        '''
        aa_seqs = []
        for seq in self.aln:
            aa_seqs.append(
                SeqRecord(seq=self.translate_seq(seq),
                          id=seq.id,
                          name=seq.name,
                          description=seq.description))
        self.aa_aln = MultipleSeqAlignment(aa_seqs)

    def mean_std_seqLen(self):
        """ returen mean and standard deviation of sequence lengths """
        seqLen_arr = np.array([len(seq) for seq in self.seqs.values()])
        return np.mean(seqLen_arr, axis=0), np.std(seqLen_arr, axis=0)

    def paralogy_statistics(self):
        best_split = find_best_split(self.tree)
        return len(best_split.para_nodes), best_split.branch_length

    def diversity_statistics_nuc(self):
        ''' calculate the per-site diversity of the nucleotide alignment '''
        TINY = 1e-10
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_nuc = calc_af(self.aln, nuc_alpha)
        is_valid = self.af_nuc[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_nuc[:-2, is_valid] / self.af_nuc[:-2,
                                                          is_valid].sum(axis=0)
        #self.entropy_nuc = np.mean(-(tmp_af*np.log(tmp_af+TINY)).sum(axis=0))
        self.diversity_nuc = np.mean(1.0 - (tmp_af**2).sum(axis=0))
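        # 1 - sum_i p_i**2 is the per-column Gini-Simpson index, i.e. the
        # probability that two randomly drawn sequences differ at that site;
        # diversity_nuc averages it over columns covered by valid states in
        # more than half of the sequences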

    def diversity_statistics_aa(self):
        ''' calculate the per-site diversity of the amino acid alignment '''
        TINY = 1e-10
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_aa = calc_af(self.aa_aln, aa_alpha)
        is_valid = self.af_aa[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_aa[:-2, is_valid] / self.af_aa[:-2,
                                                        is_valid].sum(axis=0)
        #self.entropy_aa = np.mean(-(tmp_af*np.log(tmp_af+TINY)).sum(axis=0))
        self.diversity_aa = np.mean(1.0 - (tmp_af**2).sum(axis=0))

    def mutations_to_branch(self):
        self.mut_to_branch = defaultdict(list)
        for node in self.tree.find_clades():
            if node.up is not None:
                for mut in node.mutations:
                    self.mut_to_branch[mut].append(node)

    def reduce_alignments(self, RNA_specific=False):
        if RNA_specific:
            self.aa_aln = None
            self.af_aa = None
        else:
            self.af_aa = calc_af(self.aa_aln, aa_alpha)
        for attr, aln, alpha, freq in [[
                "aln_reduced", self.aln, nuc_alpha, self.af_nuc
        ], ["aa_aln_reduced", self.aa_aln, aa_alpha, self.af_aa]]:
            try:
                if RNA_specific and attr == "aa_aln_reduced":
                    pass  #** no reduced amino alignment for RNA
                else:
                    consensus = np.array(list(alpha))[freq.argmax(axis=0)]
                    aln_array = np.array(aln)
                    aln_array[aln_array == consensus] = '.'
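                    # the "reduced" alignment stores the column consensus once
                    # and replaces every character that matches it with '.',
                    # so only deviations from the consensus remain visible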
                    new_seqs = [
                        SeqRecord(seq=Seq("".join(consensus)),
                                  name="consensus",
                                  id="consensus")
                    ]
                    for si, seq in enumerate(aln):
                        new_seqs.append(
                            SeqRecord(seq=Seq("".join(aln_array[si])),
                                      name=seq.name,
                                      id=seq.id,
                                      description=seq.description))
                    self.__setattr__(attr, MultipleSeqAlignment(new_seqs))
            except:
                print(
                    "sf_geneCluster_align_MakeTree: aligment reduction failed")

    #def export(self, path = '', extra_attr = ['aa_muts','ann','branch_length','name','longName'], RNA_specific=False):
    def export(self,
               path='',
               extra_attr=[
                   'aa_muts', 'annotation', 'branch_length', 'name',
                   'accession'
               ],
               RNA_specific=False):
        ## write tree
        Phylo.write(self.tree, path + self.clusterID + '.nwk', 'newick')

        ## processing node name
        for node in self.tree.get_terminals():
            #node.name = node.ann.split('|')[0]
            node.accession = node.ann.split('|')[0]
            #node.longName = node.ann.split('-')[0]
            node.name = node.ann.split('-')[0]
            #NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
            annotation = node.ann.split('-', 2)
            if len(annotation) == 3:
                node.annotation = annotation[2]
            else:
                node.annotation = annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            if n.branch_length < 1e-6:
                n.branch_length = 1e-6
        timetree_fname = path + self.clusterID + '_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for i_aln in self.aln:
            i_aln.id = i_aln.id.replace('|', '-', 1)
        for i_alnr in self.aln_reduced:
            i_alnr.id = i_alnr.id.replace('|', '-', 1)

        AlignIO.write(self.aln, path + self.clusterID + '_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced,
                      path + self.clusterID + '_na_aln_reduced.fa', 'fasta')

        if RNA_specific == False:
            for i_aa_aln in self.aa_aln:
                i_aa_aln.id = i_aa_aln.id.replace('|', '-', 1)
            for i_aa_alnr in self.aa_aln_reduced:
                i_aa_alnr.id = i_aa_alnr.id.replace('|', '-', 1)

            AlignIO.write(self.aa_aln, path + self.clusterID + '_aa_aln.fa',
                          'fasta')
            AlignIO.write(self.aa_aln_reduced,
                          path + self.clusterID + '_aa_aln_reduced.fa',
                          'fasta')

        ## write seq json
        write_seq_json = 0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if hasattr(node, "longName") == False:
                        node.longName = node.name
                    elems[node.longName] = {}
                    nuc_dt = {
                        pos: state
                        for pos, (state, ancstate) in enumerate(
                            izip(node.sequence.tostring(),
                                 self.tree.root.sequence.tostring()))
                        if state != ancstate
                    }
                    nodeseq = node.sequence.tostring()
                    nodeseq_len = len(nodeseq)
                    elems[node.longName]['nuc'] = nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = self.tree.root.sequence.tostring()

            self.sequences_fname = path + self.clusterID + '_seq.json'
            write_json(elems, self.sequences_fname, indent=None)
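# ---------------------------------------------------------------------------
# Usage sketch (not part of the original example): a minimal end-to-end run of
# the mpm_tree pipeline. The cluster file path, speciesID and output directory
# are hypothetical placeholders; mafft and fasttree are assumed to be on PATH.
# ---------------------------------------------------------------------------
# cluster = mpm_tree('clusters/speciesX/geneCluster/GC00000001.fna', speciesID='speciesX')
# cluster.align()                         # or cluster.codon_align() for coding sequences
# cluster.build(raxml=False)              # FastTree only; raxml=True refines with RAxML
# cluster.ancestral(translate_tree=True)  # ML ancestral sequences + amino-acid translation
# cluster.refine(CDS=True)                # attach nucleotide and amino-acid mutations to branches
# cluster.translate()                     # amino-acid alignment needed by export()
# cluster.export(path='output/')          # writes <clusterID>.nwk, *_tree.json and alignments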
Esempio n. 27
0
                           dtype=float)
        weights /= weights.sum()
    else:
        weights = np.ones(nc, dtype=float) / nc

    # set up dummy matrix
    W = np.ones((nc, nc), dtype=float)

    mugration_GTR = GTR.custom(pi=weights, W=W, alphabet=np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous = missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(params.tree, gtr=mugration_GTR, verbose=params.verbose)
    pseudo_seqs = [
        SeqRecord(id=n.name,
                  name=n.name,
                  seq=Seq(reverse_alphabet[leaf_to_attr[n.name]] if n.name in
                          leaf_to_attr else missing))
        for n in treeanc.tree.get_terminals()
    ]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    treeanc.infer_ancestral_sequences(
        method='ml',
        infer_gtr=True,
        store_compressed=False,
        pc=params.pc,
        marginal=True,
Esempio n. 28
0
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################

    unique_states = set(traits.values())
    n_observed_states = len(unique_states)

    # load weights from file and convert to dict if supplied as string
    if type(weights)==str:
        try:
            tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                                 skipinitialspace=True)
            weight_dict = {row[0]:row[1] for ri,row in tmp_weights.iterrows() if not np.isnan(row[1])}
        except:
            raise ValueError("Loading of weights file '%s' failed!"%weights)
    elif type(weights)==dict:
        weight_dict = weights
    else:
        weight_dict = None

    # add weights to unique states for alphabet construction
    if weight_dict is not None:
        unique_states.update(weight_dict.keys())
        missing_weights = [c for c in unique_states if c not in weight_dict and c != missing_data]
        if len(missing_weights):
            print("Missing weights for values: " + ", ".join(missing_weights))

        if len(missing_weights)>0.5*n_observed_states:
            print("More than half of discrete states missing from the weights file")
            print("Weights read from file are:", weights)
            raise TreeTimeError("More than half of discrete states missing from the weights file")

    unique_states=sorted(unique_states)
    # make a map from states (excluding missing data) to characters in the alphabet
    # note that gap character '-' is chr(45) and will never be included here
    reverse_alphabet = {state:chr(65+i) for i,state in enumerate(unique_states) if state!=missing_data}
    alphabet = list(reverse_alphabet.values())
    # construct a look up from alphabet character to states
    letter_to_state = {v:k for k,v in reverse_alphabet.items()}

    # construct the vector with weights to be used as equilibrium frequency
    if weight_dict is not None:
        mean_weight = np.mean(list(weight_dict.values()))
        weights = np.array([weight_dict[letter_to_state[c]] if letter_to_state[c] in weight_dict else mean_weight
                            for c in alphabet], dtype=float)
        weights/=weights.sum()

    # consistency checks
    if len(alphabet)<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    n_states = len(alphabet)
    missing_char = chr(65+n_states)
    reverse_alphabet[missing_data]=missing_char
    letter_to_state[missing_char]=missing_data

    ###########################################################################
    ### construct gtr model
    ###########################################################################

    # set up dummy matrix
    W = np.ones((n_states,n_states), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(n_states)
    mugration_GTR.ambiguous=missing_char


    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[traits[n.name]]
                           if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    valid_seq = np.array([str(s.seq)!=missing_char for s in pseudo_seqs])
    print("Assigned discrete traits to %d out of %d taxa.\n"%(np.sum(valid_seq),len(valid_seq)))
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError as e:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise e

    for i in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc, fixed_pi=weights)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False,
                                 reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short branch length' assumption. "
          "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
          "along which multiple changes accumulated. This is expected to affect estimates of the "
          "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
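# ---------------------------------------------------------------------------
# Usage sketch (not part of the original snippet): reconstruct a discrete trait
# such as sampling region. The tree file and trait values are hypothetical;
# `weights` may alternatively be passed as a dict {state: weight} or as a
# csv/tsv file with one "state,weight" row per state.
# ---------------------------------------------------------------------------
# traits = {'strainA': 'europe', 'strainB': 'asia', 'strainC': '?'}
# treeanc, letter_to_state, reverse_alphabet = reconstruct_discrete_traits(
#     'tree.nwk', traits, missing_data='?', pc=1.0)
# if treeanc is not None:
#     print(treeanc.gtr)  # inferred transition model between the trait states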
Esempio n. 29
0
def scan_homoplasies(params):
    """
    the function implementing the treetime homoplasy scan
    """
    if assure_tree(params, tmp_dir='homoplasy_tmp'):
        return 1

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    treeanc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                      fill_overhangs=True)
    if treeanc.aln is None: # if alignment didn't load, exit
        return 1

    if is_vcf:
        L = len(ref) + params.const
    else:
        L = treeanc.data.full_length + params.const

    N_seq = len(treeanc.aln)
    N_tree = treeanc.tree.count_terminals()
    if params.rescale!=1.0:
        for n in treeanc.tree.find_clades():
            n.branch_length *= params.rescale
            n.mutation_length = n.branch_length

    print("read alignment from file %s with %d sequences of length %d"%(params.aln,N_seq,L))
    print("read tree from file %s with %d leaves"%(params.tree,N_tree))
    print("\ninferring ancestral sequences...")

    ndiff = treeanc.infer_ancestral_sequences('ml', infer_gtr=params.gtr=='infer',
                                      marginal=False, fixed_pi=fixed_pi)
    print("...done.")

    if is_vcf:
        treeanc.recover_var_ambigs()

    ###########################################################################
    ### analysis of reconstruction
    ###########################################################################
    from collections import defaultdict
    from scipy.stats import poisson
    offset = 0 if params.zero_based else 1

    if params.drms:
        DRM_info = read_in_DRMs(params.drms, offset)
        drms = DRM_info['DRMs']

    # construct dictionaries gathering mutations and positions
    mutations = defaultdict(list)
    positions = defaultdict(list)
    terminal_mutations = defaultdict(list)
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue

        if len(n.mutations):
            for (a,pos, d) in n.mutations:
                if '-' not in [a,d] and 'N' not in [a,d]:
                    mutations[(a,pos+offset,d)].append(n)
                    positions[pos+offset].append(n)
            if n.is_terminal():
                for (a,pos, d) in n.mutations:
                    if '-' not in [a,d] and 'N' not in [a,d]:
                        terminal_mutations[(a,pos+offset,d)].append(n)

    # gather homoplasic mutations by strain
    mutation_by_strain = defaultdict(list)
    for n in treeanc.tree.get_terminals():
        for a,pos,d in n.mutations:
            if pos+offset in positions and len(positions[pos+offset])>1:
                if '-' not in [a,d] and 'N' not in [a,d]:
                    mutation_by_strain[n.name].append([(a,pos+offset,d), len(positions[pos+offset])])


    # total_branch_length is the expected number of substitutions
    # corrected_branch_length is the expected number of observable substitutions
    # (probability of an odd number of substitutions at a particular site)
    total_branch_length = treeanc.tree.total_branch_length()
    corrected_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                      for x in treeanc.tree.find_clades()])
    corrected_terminal_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                      for x in treeanc.tree.get_terminals()])
    expected_mutations = L*corrected_branch_length
    expected_terminal_mutations = L*corrected_terminal_branch_length
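    # note: for a Poisson number of substitutions with mean t per site,
    # P(odd number of events) = sum over odd k of exp(-t)*t**k/k! = exp(-t)*sinh(t),
    # which is used above as the per-site probability that a substitution on a
    # branch of length t remains observable (an even number of hits cancels out)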

    # make histograms and sum mutations in different categories
    multiplicities = np.bincount([len(x) for x in mutations.values()])
    total_mutations = np.sum([len(x) for x in mutations.values()])

    multiplicities_terminal = np.bincount([len(x) for x in terminal_mutations.values()])
    terminal_mutation_count = np.sum([len(x) for x in terminal_mutations.values()])

    multiplicities_positions = np.bincount([len(x) for x in positions.values()])
    multiplicities_positions[0] = L - np.sum(multiplicities_positions)

    ###########################################################################
    ### Output the distribution of times particular mutations are observed
    ###########################################################################
    print("\nThe TOTAL tree length is %1.3e and %d mutations were observed."
          %(total_branch_length,total_mutations))
    print("Of these %d mutations,"%total_mutations
            +"".join(['\n\t - %d occur %d times'%(n,mi)
                      for mi,n in enumerate(multiplicities) if n]))
    # additional optional output this for terminal mutations only
    if params.detailed:
        print("\nThe TERMINAL branch length is %1.3e and %d mutations were observed."
              %(corrected_terminal_branch_length,terminal_mutation_count))
        print("Of these %d mutations,"%terminal_mutation_count
                +"".join(['\n\t - %d occur %d times'%(n,mi)
                          for mi,n in enumerate(multiplicities_terminal) if n]))


    ###########################################################################
    ### Output the distribution of times mutations at particular positions are observed
    ###########################################################################
    print("\nOf the %d positions in the genome,"%L
            +"".join(['\n\t - %d were hit %d times (expected %1.2f)'%(n,mi,L*poisson.pmf(mi,1.0*total_mutations/L))
                      for mi,n in enumerate(multiplicities_positions) if n]))


    # compare that distribution to a Poisson distribution with the same mean
    p = poisson.pmf(np.arange(10*multiplicities_positions.max()),1.0*total_mutations/L)
    print("\nlog-likelihood difference to Poisson distribution with same mean: %1.3e"%(
            - L*np.sum(p*np.log(p+1e-100))
            + np.sum(multiplicities_positions*np.log(p[:len(multiplicities_positions)]+1e-100))))
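    # here sum(n_m*log(p_m)) is the log-likelihood of the observed hit-count
    # histogram under the Poisson model and L*sum(p*log(p)) its expectation;
    # a strongly negative difference indicates that positions are hit more
    # unevenly than a Poisson model with the same mean would predict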


    ###########################################################################
    ### Output the mutations that are observed most often
    ###########################################################################
    if params.drms:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
        mutations_sorted = sorted(mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                    " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
            else:
                break
    else:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity")
        mutations_sorted = sorted(mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
            else:
                break

    # optional output specifically for mutations on terminal branches
    if params.detailed:
        if params.drms:
            print("\n\nThe ten most homoplasic mutations on terminal branches are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
            terminal_mutations_sorted = sorted(terminal_mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                        " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
                else:
                    break
        else:
            print("\n\nThe ten most homoplasic mutations on terminal branches are:\n\tmut\tmultiplicity")
            terminal_mutations_sorted = sorted(terminal_mutations.items(), key=lambda x:len(x[1])-0.1*x[0][1]/L, reverse=True)
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
                else:
                    break

    ###########################################################################
    ### Output strains that have many homoplasic mutations
    ###########################################################################
    # TODO: add statistical criterion
    if params.detailed:
        if params.drms:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations\t# DRM")
            mutation_by_strain_sorted = sorted(mutation_by_strain.items(), key=lambda x:len(x[1]), reverse=True)
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d\t%d"%(name, len(val),
                        len([mut for mut,l in val if mut[1] in drms])))
        else:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations")
            mutation_by_strain_sorted = sorted(mutation_by_strain.items(), key=lambda x:len(x[1]), reverse=True)
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d"%(name, len(val)))


    return 0
Esempio n. 31
0
def test_seq_joint_reconstruction_correct():
    """
    evolve the random sequence, get the alignment at the leaf nodes.
    Reconstruct the sequences of the internal nodes (joint)
    and prove the reconstruction is correct.
    In addition, compute the likelihood of the particular realization of the
    sequences on the tree and prove that this likelihood is exactly the same
    as calculated in the joint reconstruction
    """

    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  #python3.x
        izip = zip
    from collections import defaultdict
    from io import StringIO
    def exclusion(a, b):
        """
        Elements of a that are not in b (set difference)
        """
        return list(set(a) - set(b))

    tiny_tree = Phylo.read(StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']),
                       pi = np.array([0.15, 0.95, 0.05, 0.3]), W=np.ones((4,4)))
    seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=400)


    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=seq_len)
    print ("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            continue
        t = node.branch_length
        p = mygtr.propagate_profile( seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize the profile so that each row sums to one
        p=(p.T/p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([int(np.random.choice(np.arange(p.shape[1]), p=p[k])) for k in np.arange(p.shape[0])])

        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [(anc, pos, der) for pos, (anc, der) in
                            enumerate(izip(node.up.ref_seq, node.ref_seq)) if anc!=der]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print (node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = ""
    i = 1
    for leaf in tree.get_terminals():
        alnstr += ">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
        i += 1
    print (alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            diff_count += np.sum(node.sequence != node.ref_seq)
            if np.sum(node.sequence != node.ref_seq):
                print("%s: True sequence does not equal inferred sequence. parent %s"%(node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s"%(node.name, node.up.name))
        print (node.name, np.sum(node.sequence != node.ref_seq), np.where(node.sequence != node.ref_seq), len(node.mutations), node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected
    assert diff_count/seq_len<2*(1.0*mut_count/seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)

    print ("Difference between reference and inferred LH:", (LH - LH_p).sum())
    assert ((LH - LH_p).sum())<1e-9

    return myTree
Esempio n. 32
0
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True,
                        infer_gtr=True, root_state=None, missing='?'):
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {n.name:n for n in T.get_terminals()}

    # Determine alphabet only counting tips in the tree
    places = set()
    for name, meta in seq_meta.items():
        if field in meta and name in nodes:
            places.add(meta[field])
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now). The missing DATA symbol is a '-' (ord('-')==45)
    places = sorted(places)
    nc = len(places)
    if nc>180:
        print("ERROR: geo_inference: can't have more than 180 places!", file=sys.stderr)
        return None,None
    elif nc==1:
        print("WARNING: geo_inference: only one place found -- set every internal node to %s!"%places[0], file=sys.stderr)
        return None,None
    elif nc==0:
        print("ERROR: geo_inference: list of places is empty!", file=sys.stderr)
        return None,None
    else:
        # set up model
        alphabet = {chr(65+i):place for i,place in enumerate(places)}
        model = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)),
                          alphabet = np.array(sorted(alphabet.keys())))

        missing_char = chr(65+nc)
        alphabet[missing_char]=missing
        model.profile_map[missing_char] = np.ones(nc)
        model.ambiguous = missing_char
        alphabet_rev = {v:k for k,v in alphabet.items()}

        # construct pseudo alignment
        pseudo_seqs = []
        for name, meta in seq_meta.items():
            if name in nodes:
                s=alphabet_rev[meta[field]] if field in meta else missing_char
                pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
        aln = MultipleSeqAlignment(pseudo_seqs)

        # set up treetime and infer
        from treetime import TreeAnc
        tt = TreeAnc(tree=tree, aln=aln, gtr=model, convert_upper=False, verbose=0)
        tt.use_mutation_length=False
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=5.0,
                                     marginal=True, normalized_rate=False)

        # attach inferred states as e.g. node.region = 'africa'
        for node in tt.tree.find_clades():
            node.__setattr__(field, alphabet[node.sequence[0]])

        # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
        if confidence:
            for node in tt.tree.find_clades():
                pdis = node.marginal_profile[0]
                S = -np.sum(pdis*np.log(pdis+TINY))

                marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))]
                marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods
                marginal = [(a, b) for a, b in marginal if b > 0.001][:4] #only take stuff over .1% and the top 4 elements
                conf = {a:b for a,b in marginal}
                node.__setattr__(field + "_entropy", S)
                node.__setattr__(field + "_confidence", conf)

        return tt, alphabet
Esempio n. 33
0
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True,
                        infer_gtr=True, root_state=None, missing='?', sampling_bias_correction=None):
    """
    Infer likely ancestral states of a discrete character assuming a time reversible model.

    Parameters
    ----------
    tree : str
        name of tree file
    seq_meta : dict
        meta data associated with sequences
    field : str, optional
        meta data field to use
    confidence : bool, optional
        calculate confidence values for inferences
    infer_gtr : bool, optional
        infer a GTR model for trait transitions (otherwise uses a flat model with rate 1)
    root_state : None, optional
        force the state of the root node (currently not implemented)
    missing : str, optional
        character that is to be interpreted as missing data, default='?'

    Returns
    -------
    T : Phylo.Tree
        Biopython tree
    gtr : treetime.GTR
        GTR model
    alphabet : dict
        mapping of internal alphabet characters to the discrete character states
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {n.name:n for n in T.get_terminals()}

    # Determine alphabet only counting tips in the tree
    places = set()
    for name, meta in seq_meta.items():
        if field in meta and name in nodes:
            places.add(meta[field])
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now). The missing DATA symbol is a '-' (ord('-')==45)
    places = sorted(places)
    nc = len(places)
    if nc>180:
        print("ERROR: geo_inference: can't have more than 180 places!", file=sys.stderr)
        return None,None,None
    elif nc==0:
        print("ERROR: geo_inference: list of places is empty!", file=sys.stderr)
        return None,None,None
    elif nc==1:
        print("WARNING: geo_inference: only one place found -- set every internal node to %s!"%places[0], file=sys.stderr)
        alphabet = {'A':places[0]}
        alphabet_values = ['A']
        gtr = None
        for node in T.find_clades():
            node.sequence=['A']
            node.marginal_profile=np.array([[1.0]])
    else:
        # set up model
        alphabet = {chr(65+i):place for i,place in enumerate(places)}
        model = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)),
                          alphabet = np.array(sorted(alphabet.keys())))

        missing_char = chr(65+nc)
        alphabet[missing_char]=missing
        model.profile_map[missing_char] = np.ones(nc)
        model.ambiguous = missing_char
        alphabet_rev = {v:k for k,v in alphabet.items()}

        # construct pseudo alignment
        pseudo_seqs = []
        for name, meta in seq_meta.items():
            if name in nodes:
                s=alphabet_rev[meta[field]] if field in meta else missing_char
                pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
        aln = MultipleSeqAlignment(pseudo_seqs)

        # set up treetime and infer
        from treetime import TreeAnc
        tt = TreeAnc(tree=tree, aln=aln, gtr=model, convert_upper=False, verbose=0)
        tt.use_mutation_length = False
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=1.0,
                                     marginal=True, normalized_rate=False)

        if sampling_bias_correction:
            tt.gtr.mu *= sampling_bias_correction
            tt.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                         marginal=True, normalized_rate=False)

        T = tt.tree
        gtr = tt.gtr
        alphabet_values = tt.gtr.alphabet


    # attach inferred states as e.g. node.region = 'africa'
    for node in T.find_clades():
        node.__setattr__(field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        for node in T.find_clades():
            pdis = node.marginal_profile[0]
            S = -np.sum(pdis*np.log(pdis+TINY))
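            # Shannon entropy of the marginal state distribution at this node;
            # TINY (assumed to be a small positive module-level constant)
            # guards against log(0)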

            marginal = [(alphabet[alphabet_values[i]], pdis[i]) for i in range(len(alphabet_values))]
            marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods
            marginal = [(a, b) for a, b in marginal if b > 0.001][:4] #only take stuff over .1% and the top 4 elements
            conf = {a:b for a,b in marginal}
            node.__setattr__(field + "_entropy", S)
            node.__setattr__(field + "_confidence", conf)

    return T, gtr, alphabet
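# ---------------------------------------------------------------------------
# Usage sketch (not part of the original snippet): infer ancestral countries
# for the tips of a tree. The file name and metadata are hypothetical.
# ---------------------------------------------------------------------------
# seq_meta = {'strainA': {'country': 'germany'}, 'strainB': {'country': 'brazil'}}
# T, gtr, alphabet = mugration_inference(tree='tree.nwk', seq_meta=seq_meta,
#                                        field='country', confidence=True)
# if T is not None:
#     for node in T.find_clades():
#         print(node.name, node.country, getattr(node, 'country_entropy', None))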
Esempio n. 34
0
def run(args):
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check that the tree is provided and can be read
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            node_data['input_tree'] = args.tree
            break
        except:
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment


    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    else:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'

    if args.timetree:
        # load metadata and convert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                      reroot=args.root or 'best',
                      Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
                      use_marginal = args.date_inference == 'marginal',
                      branch_length_inference = args.branch_length_inference or 'auto',
                      clock_rate=args.clock_rate, clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept/tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    import json
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to",tree_fname, file=sys.stdout)
    if args.output_node_data:
        node_data_fname = args.output_node_data
    else:
        node_data_fname = '.'.join(args.alignment.split('.')[:-1]) + '.node_data.json'

    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to",node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
Esempio n. 35
0
def run(args):
    if args.seed is not None:
        np.random.seed(args.seed)

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    if not args.alignment:
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference",
                file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print(
                "ERROR: alignment is required for divergence in units of mutations",
                file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x)
              for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
    else:
        tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

    if args.root and len(args.root) == 1:  #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root:  # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load metadata and convert dates to numeric
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction",
                file=sys.stderr)
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root,  # or 'best', # We now have a default in param spec - this just adds confusion.
            Tc=0.01 if args.coalescent is None else args.coalescent,  #use 0.01 as default coalescent time scale
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            precision='auto' if args.precision is None else args.precision,
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(
                    gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x],
                                        [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y],
                                        [float(y) for y in conf[1]]]
            except:
                print("ERROR: skyline optimization by TreeTime has failed.",
                      file=sys.stderr)
                return 1

        attributes.extend(
            ['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print(
                    "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
                )
                print(
                    "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
                )
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)
    if args.divergence_units == 'mutations-per-site':  #default
        pass
    elif args.divergence_units == 'mutations':
        if not args.timetree:
            tt.infer_ancestral_sequences()
        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = len([
                position for ancestral, position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            ])

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit",
              args.divergence_units,
              "not supported!",
              file=sys.stderr)
        return 1

    # Export refined tree and node data
    import json
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = '.'.join(
            args.alignment.split('.')[:-1]) + '.node_data.json'
    else:
        node_data_fname = '.'.join(
            args.tree.split('.')[:-1]) + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1
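# A minimal sketch (separate from the function above) of the profile-overlap test behind
# `are_sequence_states_different`: two states count as a substitution only if their
# nucleotide profiles share no compatible state, so ambiguity codes are never counted.
# It assumes `profile_maps` comes from treetime.seq_utils, as used via profile_maps['nuc'] above.
import numpy as np
from treetime.seq_utils import profile_maps

nuc_map = profile_maps['nuc']
print(np.sum(nuc_map['A'] * nuc_map['G']) == 0)  # True: A vs G is an unambiguous substitution
print(np.sum(nuc_map['A'] * nuc_map['R']) == 0)  # False: R (A or G) is compatible with A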
Esempio n. 36
0
def test_seq_joint_reconstruction_correct():
    """
    evolve the random sequence, get the alignment at the leaf nodes.
    Reconstruct the sequences of the internal nodes (joint)
    and prove the reconstruction is correct.
    In addition, compute the likelihood of the particular realization of the
    sequences on the tree and prove that this likelihood is exactly the same
    as calculated in the joint reconstruction
    """

    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    from io import StringIO
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  #python3.x
        izip = zip
    from collections import defaultdict

    def exclusion(a, b):
        """
        Elements of a that are not in b (set difference)
        """
        return list(set(a) - set(b))

    tiny_tree = Phylo.read(
        StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4, 4)))
    seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=400)

    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet,
                                         p=mygtr.Pi,
                                         size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            continue
        t = node.branch_length
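        # propagate the parent's per-site profile along a branch of length t: each row of p
        # becomes the state distribution of that site after time t under the GTR model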
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize profile
        p = (p.T / p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])

        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc,
                      der) in enumerate(izip(node.up.ref_seq, node.ref_seq))
            if anc != der
        ]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set the simulated sequences as the alignment at the terminal nodes:
    alnstr = ""
    i = 1
    for leaf in tree.get_terminals():
        alnstr += ">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
        i += 1
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            diff_count += np.sum(node.sequence != node.ref_seq)
            if np.sum(node.sequence != node.ref_seq):
                print(
                    "%s: True sequence does not equal inferred sequence. parent %s"
                    % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s" %
                      (node.name, node.up.name))
        print(node.name, np.sum(node.sequence != node.ref_seq),
              np.where(node.sequence != node.ref_seq), len(node.mutations),
              node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected
    assert diff_count / seq_len < 2 * (1.0 * mut_count / seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)

    print("Difference between reference and inferred LH:", (LH - LH_p).sum())
    assert ((LH - LH_p).sum()) < 1e-9

    return myTree
class mpm_tree(object):
    '''
    class that aligns a set of sequences and infers a tree
    '''

    def __init__(self, cluster_seq_filepath, **kwarks):
        self.clusterID= cluster_seq_filepath.split('/')[-1].split('.fna')[0]
        if 'speciesID' in kwarks:
            folderID=kwarks['speciesID']
        else:
            folderID= cluster_seq_filepath.split('/')[-3]
        self.seqs = {x.id:x for x in SeqIO.parse(cluster_seq_filepath, 'fasta')}
        if 'run_dir' not in kwarks:
            import random
            #self.run_dir = '_'.join(['tmp', self.clusterID])
            self.run_dir = 'tmp/'
            self.run_dir += '_'.join([folderID, 'tmp', time.strftime('%H%M%S',time.gmtime()), str(random.randint(0,100000000))])
        else:
            self.run_dir = kwarks['run_dir']
        self.nuc=True

    def codon_align(self, alignment_tool="mafft", prune=True, discard_premature_stops=False):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        for seq in self.seqs.values():
            tempseq = seq.seq.translate(table="Bacterial")
            # use only sequences that translate without trouble
            if not discard_premature_stops or '*' not in str(tempseq)[:-1] or prune==False:
                aa_seqs[seq.id]=SeqRecord(tempseq,id=seq.id)
            else:
                print(seq.id,"has premature stops, discarding")

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname,'fasta')

        if alignment_tool=='mafft':
            os.system('mafft --reorder --amino temp_in.fasta 1> temp_out.fasta')
            aln_aa = AlignIO.read('temp_out.fasta', "fasta")
        elif alignment_tool=='muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname, out=tmpfname[:-5]+'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5]+'aligned.fasta', "fasta")
        else:
            print('Alignment tool not supported: ' + alignment_tool)
            #return

        #generate nucleotide alignment
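        # pad_nucleotide_sequences (helper assumed from the surrounding package) is expected to map
        # each aligned amino acid back to its source codon and expand amino-acid gaps to '---',
        # so the returned nucleotide alignment stays in frame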
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        os.chdir(cwd)
        remove_dir(self.run_dir)

    def align(self):
        '''
        align sequences in self.seqs using mafft
        '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        os.system('mafft --reorder --anysymbol temp_in.fasta 1> temp_out.fasta 2> mafft.log')

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        os.chdir(cwd)
        remove_dir(self.run_dir)


    def build(self, root='midpoint', raxml=True, fasttree_program='fasttree', raxml_time_limit=0.5, treetime_used=True):
        '''
        build a phylogenetic tree using fasttree and raxML (optional)
        based on nextflu tree building pipeline
        '''
        import subprocess
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        AlignIO.write(self.aln, 'origin.fasta', 'fasta')
        name_translation = make_strains_unique(self.aln)
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = [fasttree_program]
        if self.nuc: tree_cmd.append("-nt")
        tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"

        if raxml==False:
            #shutil.copy('initial_tree.newick', out_fname)
            polytomies_midpointRooting('initial_tree.newick',out_fname, self.clusterID)
        elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')]))>3:
            ## only for tree with >3 strains
            if raxml_time_limit>0:
                tmp_tree = Phylo.read('initial_tree.newick','newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                    resolve_iter+=1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
                AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
                print( "RAxML tree optimization with time limit", raxml_time_limit,  "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit*3600)
                process = subprocess.Popen("exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()

                checkpoint_files = glob.glob("RAxML_checkpoint*")
                if os.path.isfile('RAxML_result.topology'):
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            try:
                print("RAxML branch length optimization")
                os.system("raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                shutil.copy('RAxML_result.branches', out_fname)
            except:
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)

        if treetime_used:
            # load the resulting tree as a treetime instance
            from treetime import TreeAnc
            self.tt = TreeAnc(tree=out_fname, aln=self.aln, gtr='Jukes-Cantor', verbose=0)
            # provide short cut to tree and revert names that conflicted with newick format
            self.tree = self.tt.tree
        else:
            self.tree = Phylo.read(out_fname,'newick')
        self.tree.root.branch_length=0.0001
        restore_strain_name(name_translation, self.aln)
        restore_strain_name(name_translation, self.tree.get_terminals())

        for node in self.tree.find_clades():
            if node.name is not None:
                if not node.name.startswith('NODE_'):
                    node.ann=node.name
            else:
                node.name='NODE_0'

        os.chdir(cwd)
        remove_dir(self.run_dir)
        self.is_timetree=False

    def ancestral(self, translate_tree = False):
        '''
        infer ancestral nucleotide sequences using maximum likelihood
        and translate the resulting sequences (+ terminals) to amino acids
        '''
        try:
            self.tt.reconstruct_anc(method='ml')
        except:
            print "trouble at self.tt.reconstruct_anc(method='ml')"

        if translate_tree:
            for node in self.tree.find_clades():
                node.aa_sequence = np.fromstring(str(self.translate_seq("".join(node.sequence))), dtype='S1')

    def refine(self, CDS = True):
        '''
        determine mutations on each branch and attach as string to the branches
        '''
        for node in self.tree.find_clades():

            if node.up is not None:
                node.muts = ",".join(["".join(map(str, x)) for x in node.mutations if '-' not in x])
                if CDS == True:
                    node.aa_muts = ",".join([anc+str(pos+1)+der for pos, (anc, der)
                                    in enumerate(zip(node.up.aa_sequence, node.aa_sequence))
                                    if anc!=der and '-' not in anc and '-' not in der])


    def translate_seq(self, seq):
        '''
        custom translation sequence that handles gaps
        '''
        if not isinstance(seq, str):
            str_seq = str(seq.seq)
        else:
            str_seq = seq
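        # first attempt: treat in-frame gaps ('---') as unknown codons ('NNN') so they translate to X,
        # then map X back to '-'; the except-branch handles frame-breaking gaps by replacing every '-' with N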
        try:
            # soon not needed as future biopython version will translate --- into -
            tmp_seq = Seq(str(Seq(str_seq.replace('---', 'NNN')).translate(table="Bacterial")).replace('X','-'))
        except:
            tmp_seq = Seq(str(Seq(str_seq.replace('-', 'N')).translate(table="Bacterial")).replace('X','-'))
        return tmp_seq

    def translate(self):
        '''
        translate the nucleotide alignment to an amino acid alignment
        '''
        aa_seqs = []
        for seq in self.aln:
            aa_seqs.append(SeqRecord(seq=self.translate_seq(seq), id=seq.id,
                                     name=seq.name, description=seq.description))
        self.aa_aln = MultipleSeqAlignment(aa_seqs)

    def mean_std_seqLen(self):
        """ returen mean and standard deviation of sequence lengths """
        seqLen_arr = np.array([ len(seq) for seq in self.seqs.values()])
        return np.mean(seqLen_arr, axis=0), np.std(seqLen_arr, axis=0)

    def paralogy_statistics(self):
        best_split = find_best_split(self.tree)
        return len(best_split.para_nodes), best_split.branch_length

    def diversity_statistics_nuc(self):
        ''' calculate the diversity (1 - sum of squared allele frequencies) of the nucleotide alignment '''
        TINY = 1e-10
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_nuc = calc_af(self.aln, nuc_alpha)
        is_valid = self.af_nuc[:-2].sum(axis=0)>0.5
        tmp_af = self.af_nuc[:-2,is_valid]/self.af_nuc[:-2,is_valid].sum(axis=0)
        #self.entropy_nuc = np.mean(-(tmp_af*np.log(tmp_af+TINY)).sum(axis=0))
        self.diversity_nuc = np.mean(1.0-(tmp_af**2).sum(axis=0))

    def diversity_statistics_aa(self):
        ''' calculate the diversity (1 - sum of squared allele frequencies) of the amino acid alignment '''
        TINY = 1e-10
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_aa = calc_af(self.aa_aln, aa_alpha)
        is_valid = self.af_aa[:-2].sum(axis=0)>0.5
        tmp_af = self.af_aa[:-2,is_valid]/self.af_aa[:-2,is_valid].sum(axis=0)
        #self.entropy_aa = np.mean(-(tmp_af*np.log(tmp_af+TINY)).sum(axis=0))
        self.diversity_aa = np.mean(1.0-(tmp_af**2).sum(axis=0))

    def mutations_to_branch(self):
        self.mut_to_branch = defaultdict(list)
        for node in self.tree.find_clades():
            if node.up is not None:
                for mut in node.mutations:
                    self.mut_to_branch[mut].append(node)

    def reduce_alignments(self,RNA_specific=False):
        if RNA_specific:
            self.aa_aln=None
            self.af_aa =None
        else:
            self.af_aa= calc_af(self.aa_aln, aa_alpha)
        for attr, aln, alpha, freq in [["aln_reduced", self.aln, nuc_alpha, self.af_nuc],
                                 ["aa_aln_reduced", self.aa_aln, aa_alpha, self.af_aa]]:
            try:
                if RNA_specific and attr=="aa_aln_reduced":
                    pass #** no reduced amino alignment for RNA
                else:
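                    # build the column-wise consensus from allele frequencies, then mask positions that
                    # match the consensus with '.' to produce a compact "reduced" alignment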
                    consensus = np.array(list(alpha))[freq.argmax(axis=0)]
                    aln_array = np.array(aln)
                    aln_array[aln_array==consensus]='.'
                    new_seqs = [SeqRecord(seq=Seq("".join(consensus)), name="consensus", id="consensus")]
                    for si, seq in enumerate(aln):
                        new_seqs.append(SeqRecord(seq=Seq("".join(aln_array[si])), name=seq.name,
                                           id=seq.id, description=seq.description))
                    self.__setattr__(attr, MultipleSeqAlignment(new_seqs))
            except:
                print("sf_geneCluster_align_MakeTree: aligment reduction failed")


    #def export(self, path = '', extra_attr = ['aa_muts','ann','branch_length','name','longName'], RNA_specific=False):
    def export(self, path = '', extra_attr = ['aa_muts','annotation','branch_length','name','accession'], RNA_specific=False):
        ## write tree
        Phylo.write(self.tree, path+self.clusterID+'.nwk', 'newick')

        ## processing node name
        for node in self.tree.get_terminals():
            #node.name = node.ann.split('|')[0]
            node.accession = node.ann.split('|')[0]
            #node.longName = node.ann.split('-')[0]
            node.name = node.ann.split('-')[0]
            #NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
            annotation=node.ann.split('-',2)
            if len(annotation)==3:
                node.annotation= annotation[2]
            else:
                node.annotation= annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            if n.branch_length<1e-6:
                n.branch_length = 1e-6
        timetree_fname = path+self.clusterID+'_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for i_aln in self.aln:
            i_aln.id=i_aln.id.replace('|','-',1)
        for i_alnr in self.aln_reduced:
            i_alnr.id=i_alnr.id.replace('|','-',1)

        AlignIO.write(self.aln, path+self.clusterID+'_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced, path+self.clusterID+'_na_aln_reduced.fa', 'fasta')

        if RNA_specific==False:
            for i_aa_aln in self.aa_aln:
                i_aa_aln.id=i_aa_aln.id.replace('|','-',1)
            for i_aa_alnr in self.aa_aln_reduced:
                i_aa_alnr.id=i_aa_alnr.id.replace('|','-',1)

            AlignIO.write(self.aa_aln, path+self.clusterID+'_aa_aln.fa', 'fasta')
            AlignIO.write(self.aa_aln_reduced, path+self.clusterID+'_aa_aln_reduced.fa', 'fasta')

        ## write seq json
        write_seq_json=0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if hasattr(node, "longName")==False:
                        node.longName=node.name
                    elems[node.longName] = {}
                    nuc_dt = {pos: state for pos, (state, ancstate) in
                              enumerate(zip(''.join(node.sequence), ''.join(self.tree.root.sequence)))
                              if state != ancstate}
                    nodeseq = ''.join(node.sequence); nodeseq_len = len(nodeseq)
                    elems[node.longName]['nuc']=nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = ''.join(self.tree.root.sequence)

            self.sequences_fname=path+self.clusterID+'_seq.json'
            write_json(elems, self.sequences_fname, indent=None)
    parser.add_argument('--infer_gtr',
                        default=False,
                        action='store_true',
                        help='infer substitution model')
    parser.add_argument('--keep_overhangs',
                        default=False,
                        action='store_true',
                        help='do not fill terminal gaps')
    params = parser.parse_args()

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    model = 'aa' if params.prot else 'Jukes-Cantor'
    treeanc = TreeAnc(params.tree,
                      aln=params.aln,
                      gtr=model,
                      verbose=4,
                      fill_overhangs=not params.keep_overhangs)
    treeanc.infer_ancestral_sequences('ml',
                                      infer_gtr=params.infer_gtr,
                                      marginal=params.marginal)

    ###########################################################################
    ### OUTPUT and saving of results
    ###########################################################################
    if params.infer_gtr:
        print('\nInferred GTR model:')
        print(treeanc.gtr)

    outaln_name = '.'.join(
        params.aln.split('/')[-1].split('.')[:-1]) + '_ancestral.fasta'
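    # The example is truncated here. A minimal sketch (not necessarily the original continuation)
    # of writing the reconstructed sequences to outaln_name, assuming each node carries a
    # `sequence` array after infer_ancestral_sequences, as in the other examples in this collection.
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records = [SeqRecord(Seq(''.join(n.sequence)), id=n.name, description='')
               for n in treeanc.tree.find_clades()]
    SeqIO.write(records, outaln_name, 'fasta')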
Esempio n. 39
0
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0):
    unique_states = sorted(set(traits.values()))
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i,state in enumerate(unique_states)]
    missing_char = chr(65+nc)
    letter_to_state = {a:unique_states[i] for i,a in enumerate(alphabet)}
    letter_to_state[missing_char]=missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}
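    # for example, three observed traits {'asia', 'europe', 'oceania'} map to 'A', 'B', 'C',
    # and the missing-data character (e.g. '?') maps to the extra letter 'D'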

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if type(weights)==str:
        tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                             skipinitialspace=True)
        weights = {row[0]:row[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        weights = np.array([weights[c] if c in weights else mean_weight for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char
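    # the all-ones profile makes the missing-data character compatible with every state,
    # so tips without trait information do not constrain the reconstruction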

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[traits[n.name]]
                           if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights)

    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction
        treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                     marginal=True, normalized_rate=False)
    return treeanc, letter_to_state, reverse_alphabet
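# A minimal usage sketch for the function above, with hypothetical inputs ('example_tree.nwk' and
# the traits dict are placeholders). It assumes the reconstructed one-character pseudo-sequence is
# available as node.sequence, as in the other examples in this collection.
traits = {'tip_1': 'asia', 'tip_2': 'europe', 'tip_3': 'asia'}  # keys must match tip names in the tree
result = reconstruct_discrete_traits('example_tree.nwk', traits)
if result != 1:  # the function returns 1 on error
    treeanc, letter_to_state, reverse_alphabet = result
    inferred = {n.name: letter_to_state[n.sequence[0]] for n in treeanc.tree.find_clades()}
    print(inferred)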
Esempio n. 40
0
def run(args):
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']
    # check if the tree is provided and can be read
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except:
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x)
              for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
        aln = sequences
    else:
        aln = args.alignment

    if args.output:
        tree_fname = args.output
    else:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'

    if args.timetree and T:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:  # a single root is given as a string, not a list
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            Tc=args.coalescent if args.coalescent is not None else 0.01,  # explicit None check so a user-supplied 0 is kept
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            attributes.extend(['sequence'])  # don't add sequences if VCF - huge!
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            attributes.extend(['sequence'])  # don't add sequences if VCF - huge!
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        #TreeTime overwrites ambig sites on tips during ancestral reconst.
        #Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    if T:
        import json
        tree_success = Phylo.write(T,
                                   tree_fname,
                                   'newick',
                                   format_branch_length='%1.8f')
        if args.node_data:
            node_data_fname = args.node_data
        else:
            node_data_fname = '.'.join(
                args.alignment.split('.')[:-1]) + '.node_data'

        with open(node_data_fname, 'w') as ofile:
            json.dump(tree_meta, ofile)
        meta_success = True  # json.dump returns None, so record success explicitly

    #If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    return 0 if (tree_success and meta_success) else -1