def ancestral_reconstruction(params):
    """
    implementing treetime ancestral

    Makes sure a tree is available, reconstructs ancestral sequences with
    ``TreeAnc.infer_ancestral_sequences`` and exports sequences and tree.

    Parameters
    ----------
    params : object
        Parsed options. Attributes read directly here: ``tree``, ``gtr``,
        ``keep_overhangs``, ``marginal``, ``reconstruct_tip_states``,
        ``zero_based`` and ``report_ambiguous``. The object is also handed
        to the set-up helpers.

    Returns
    -------
    int
        1 if ``assure_tree`` reports a failure, 0 after the export.

    Raises
    ------
    TreeTimeError
        Re-raised, after printing a hint, when the reconstruction fails.
    """
    # set up: stop right away when no tree could be provided
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)
    gtr = create_gtr(params)

    # read_if_vcf sets ref and fixed_pi to None if the input is not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = ref is not None

    tt = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                 fill_overhangs=not params.keep_overhangs)

    try:
        tt.infer_ancestral_sequences(
            'ml',
            infer_gtr=params.gtr == 'infer',
            marginal=params.marginal,
            fixed_pi=fixed_pi,
            reconstruct_tip_states=params.reconstruct_tip_states)
    except TreeTimeError as e:
        print(
            "\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n"
        )
        raise e

    # output: the inferred model is written to disk and echoed to stdout
    if params.gtr == 'infer':
        model_path = outdir + '/sequence_evolution_model.txt'
        with open(model_path, 'w', encoding='utf-8') as handle:
            handle.write(str(tt.gtr) + '\n')
        print('\nInferred sequence evolution model (saved as %s):' % model_path)
        print(tt.gtr)

    export_sequences_and_tree(
        tt, basename, is_vcf, params.zero_based,
        report_ambiguous=params.report_ambiguous,
        reconstruct_tip_states=params.reconstruct_tip_states)
    return 0
def test_ancestral():
    """
    Test ancestral reconstruction on the H3N2 NA example and on a tiny tree.

    Part 1 reconstructs the root sequence of the 20-sequence flu example
    with marginal and joint ML reconstruction and compares it to a fixed
    expected sequence. Part 2 checks on a three-taxon alignment that the
    root marginal profile, rescaled by the stored log prefactor, sums to 1.
    """
    import os
    from Bio import AlignIO, Phylo
    import numpy as np
    from treetime import TreeAnc, GTR
    # The StringIO module only exists on Python 2; io.StringIO is the
    # Python 3 equivalent. Try the legacy location first to keep Python 2
    # behaviour unchanged.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.fasta'))
    nwk = str(os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.nwk'))

    for marginal in [True, False]:
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        print('ancestral reconstruction' + ("marginal" if marginal else "joint"))
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    print('testing LH normalization')
    tiny_tree = Phylo.read(StringIO("((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"), 'newick')
    tiny_aln = AlignIO.read(StringIO(">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                                     ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                                     ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n"), 'fasta')

    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']),
                       pi = np.array([0.9, 0.06, 0.02, 0.02]), W=np.ones((4,4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)
    # the per-site root profile times exp(log prefactor), summed over the
    # alignment, is expected to equal 1
    lhsum = (t.tree.root.marginal_profile.sum(axis=1) * np.exp(t.tree.root.marginal_subtree_LH_prefactor)).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    t.optimize_branch_len()
def infer_gene_gain_loss(path, rates=(1.0, 1.0)):
    """
    Reconstruct gene presence/absence states on the strain tree.

    Parameters
    ----------
    path : str
        Directory that contains ``geneCluster/genePresence.aln`` (pseudo
        alignment of presence/absence patterns such as 001001010110) and
        ``geneCluster/strain_tree.nwk`` (strain tree based on core gene SNPs).
    rates : sequence of two numbers, optional
        Their sum is used as the overall rate ``mu`` of the two-state model
        and their normalised values as the equilibrium frequencies of the
        states '0' and '1'.

    Returns
    -------
    TreeAnc
        The TreeAnc instance after ML reconstruction; every clade carries a
        ``genepresence`` attribute referring to its reconstructed sequence.
    """
    # convert once to a float array: this avoids a mutable default argument
    # and integer division when integer rates are supplied
    rates = np.asarray(rates, dtype=float)
    mu = np.sum(rates)
    gene_pi = rates / mu
    gain_loss_model = GTR.custom(pi=gene_pi, mu=mu, W=np.ones((2, 2)),
                                 alphabet=np.array(['0', '1']))
    # add "unknown" state to profile
    gain_loss_model.profile_map['-'] = np.ones(2)

    # define file names for the pseudo alignment and the strain tree
    sep = '/'
    base = path.rstrip(sep)
    fasta = sep.join([base, 'geneCluster', 'genePresence.aln'])
    nwk = sep.join([base, 'geneCluster', 'strain_tree.nwk'])

    # instantiate treetime with custom GTR
    t = TreeAnc(nwk, gtr=gain_loss_model, verbose=2)
    # fix leaves names since Bio.Phylo interprets numeric leaf names as confidence
    for leaf in t.tree.get_terminals():
        if leaf.name is None:
            leaf.name = str(leaf.confidence)

    # the alignment is attached only after the leaf names have been repaired
    t.aln = fasta
    t.tree.root.branch_length = 0.0001
    t.reconstruct_anc(method='ml')

    for n in t.tree.find_clades():
        n.genepresence = n.sequence

    return t
def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                 marginal=False, fill_overhangs=True, infer_tips=False):
    """infer ancestral sequences using TreeTime

    Parameters
    ----------
    tree : Bio.Phylo tree or str
        tree or filename of tree
    aln : Bio.Align.MultipleSeqAlignment or str
        alignment or filename of alignment
    ref : optional
        Passed unchanged to ``TreeAnc`` as ``ref``.
    infer_gtr : bool, optional
        Forwarded to ``TreeAnc.infer_ancestral_sequences`` as ``infer_gtr``.
    marginal : bool or str, optional
        ``True`` or the string ``"marginal"`` selects marginal
        reconstruction; any other value (e.g. ``False`` or ``"joint"``)
        selects joint reconstruction.
    fill_overhangs : bool
       In some cases, the missing data on both ends of the alignment is
       filled with the gap character ('-'). If set to True, these end-gaps are
       converted to "ambiguous" characters ('N' for nucleotides, 'X' for
       aminoacids). Otherwise, the alignment is treated as-is
    infer_tips : bool
        Since v0.7, TreeTime does not reconstruct tip states by default.
        This is only relevant when tip-state are not exactly specified, e.g. via
        characters that signify ambiguous states. To replace those with the
        most-likely state, set infer_tips=True

    Returns
    -------
    TreeAnc
        treetime.TreeAnc instance
    """
    from treetime import TreeAnc

    tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69',
                 fill_overhangs=fill_overhangs, verbose=1)

    # convert marginal (from args.inference) from 'joint' or 'marginal' to
    # True or False; a literal True is honoured as well so that the boolean
    # form advertised by the signature default is not silently ignored
    bool_marginal = marginal is True or marginal == "marginal"

    # only infer ancestral sequences, leave branch length untouched
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
                                 reconstruct_tip_states=infer_tips)

    print(
        "\nInferred ancestral sequence states using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )

    return tt
def test_ancestral():
    """
    Exercise ancestral reconstruction on the H3N2 NA example data and check
    the likelihood normalisation on a three-taxon toy alignment.
    """
    import os
    from Bio import AlignIO, Phylo
    import numpy as np
    from treetime import TreeAnc, GTR

    here = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(here, 'treetime_examples/data/h3n2_na/h3n2_na_20.fasta'))
    nwk = str(os.path.join(here, 'treetime_examples/data/h3n2_na/h3n2_na_20.nwk'))

    # root sequence that both reconstruction modes have to reproduce
    expected_root = 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    for marginal in (True, False):
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        mode_label = "marginal" if marginal else "joint"
        print('ancestral reconstruction' + mode_label)
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == expected_root

    print('testing LH normalization')
    newick_text = "((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"
    tiny_tree = Phylo.read(StringIO(newick_text), 'newick')
    fasta_text = (">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                  ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                  ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n")
    tiny_aln = AlignIO.read(StringIO(fasta_text), 'fasta')

    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.9, 0.06, 0.02, 0.02]),
                       W=np.ones((4, 4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)

    # exponentiated per-position log-likelihoods over the 4**3 alignment
    # columns are expected to add up to one
    lhsum = np.exp(t.sequence_LH(pos=np.arange(4**3))).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    t.optimize_branch_len()
def build_tree(focal_alignment):
    """
    Build, root and refine the tree of the focal alignment.

    Parameters
    ----------
    focal_alignment : path-like
        path to a fasta file containing the sequences to build the
        focal-alignment tree.

    Returns
    -------
    Bio.Phylo.Newick.Tree
        phylogenetic tree of the focal alignment.
    """
    fasttree_cmd = ["fasttree", "-nt", "-noml", "-nome", "-nosupport", focal_alignment]
    # fasttree writes the newick string to stdout
    newick_text = subprocess.check_output(fasttree_cmd).decode()
    focal_tree = Phylo.read(io.StringIO(newick_text), 'newick')

    # rooted on the named outgroup (midpoint rooting is not used)
    focal_tree.root_with_outgroup('Wuhan/Hu-1/2019')

    anc = TreeAnc(tree=focal_tree, aln=focal_alignment)
    anc.infer_ancestral_sequences(reconstruct_tip_states=False)
    anc.prune_short_branches()
    anc.optimize_tree()
    return anc.tree
def real_lh():
    """
    Likelihood of the sequences calculated by the joint ancestral
    sequence reconstruction

    Builds a three-sequence alignment (A, B, D) from the enclosing-scope
    strings, runs joint ML reconstruction and returns the tree's
    ``sequence_LH`` attribute.
    """
    fasta_text = (">A\n" + A_char + "\n>B\n" + B_char + "\n>D\n" + D_char + "\n")
    alignment = AlignIO.read(StringIO(fasta_text), 'fasta')

    anc = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=alignment, verbose=4)
    anc.reconstruct_anc(method='ml', marginal=False, debug=True)
    return anc.tree.sequence_LH
def ancestral_reconstruction(params):
    """
    implementing treetime ancestral

    Returns 1 when no tree is available or the reconstruction reports
    ``ttconf.ERROR``; returns 0 once sequences and tree have been exported.
    """
    # set up
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)
    gtr = create_gtr(params)

    # read_if_vcf sets ref and fixed_pi to None if the input is not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = ref is not None

    anc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                  fill_overhangs=not params.keep_overhangs)
    status = anc.infer_ancestral_sequences(
        'ml',
        infer_gtr=params.gtr == 'infer',
        marginal=params.marginal,
        fixed_pi=fixed_pi)
    if status == ttconf.ERROR:
        # reconstruction failed, exit
        return 1

    # output and saving of results
    if params.gtr == "infer":
        print('\nInferred GTR model:')
        print(anc.gtr)

    export_sequences_and_tree(anc, basename, is_vcf, params.zero_based,
                              report_ambiguous=params.report_ambiguous)
    return 0
def ref_lh():
    """
    reference likelihood - LH values for all possible variants
    of the internal node sequences

    The alignment holds the tips A, B and D (taken from the enclosing
    scope) plus fixed sequences for C and E.
    """
    fasta_text = (">A\n" + A_seq + "\n>B\n" + B_seq + "\n>D\n" + D_seq
                  + "\n>C\nAAAACCCCGGGGTTTT\n>E\nACGTACGTACGTACGT\n")
    alignment = AlignIO.read(StringIO(fasta_text), 'fasta')

    anc = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=alignment, verbose=4)
    return anc.ancestral_likelihood()
def ref_lh():
    """
    reference likelihood - LH values for all possible variants
    of the internal node sequences

    Tips A, B and D come from the enclosing scope; C and E are fixed.
    """
    records = (">A\n" + A_seq + "\n>B\n" + B_seq + "\n>D\n" + D_seq
               + "\n>C\nAAAACCCCGGGGTTTT\n>E\nACGTACGTACGTACGT\n")
    five_tip_aln = AlignIO.read(StringIO(records), 'fasta')

    reference_tree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=five_tip_aln, verbose=4)
    return reference_tree.ancestral_likelihood()
def run_tree(fname, out_prefix, alphabet, model, n_iter=20):
    """
    Load tree and alignment belonging to ``fname``, perturb the branch
    lengths and re-optimise them with TreeAnc.

    Parameters
    ----------
    fname : str
        Alignment file name; parsed with ``parse_alignment_name``.
    out_prefix : str
        Not referenced in the body.
    alphabet, model
        Passed to ``TreeAnc`` as ``alphabet`` and ``gtr``.
    n_iter : int
        ``max_iter`` of ``optimize_tree``.

    Returns
    -------
    TreeAnc
        The instance after ``optimize_tree``.
    """
    # NOTE(review): `prefix` is not a parameter of this function while
    # `out_prefix` is never used -- confirm `prefix` is an intended global.
    params = parse_alignment_name(fname)
    m = params['m']

    tree = Phylo.read(tree_name(prefix, params), 'newick')
    tree.root.branch_length = 0.001
    tree.ladderize()
    # scale each branch by m times a random factor drawn from [0.6, 1.0)
    for clade in tree.find_clades():
        factor = m*(0.6+0.4*np.random.random())
        clade.branch_length *= factor

    with gzip.open(alignment_name(prefix, params), 'rt') as handle:
        aln = AlignIO.read(handle, 'fasta')

    tt = TreeAnc(tree=tree, aln=aln, gtr=model, compress=False,
                 alphabet=alphabet, verbose=3)
    tt.optimize_tree(branch_length_mode='marginal', max_iter=n_iter,
                     infer_gtr=False)
    return tt
def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                 marginal=False):
    """infer ancestral sequences using TreeTime

    Parameters
    ----------
    tree, aln, ref
        Passed unchanged to ``TreeAnc``.
    infer_gtr : bool, optional
        Forwarded to ``TreeAnc.infer_ancestral_sequences``.
    marginal : bool or str, optional
        ``True`` or the string ``"marginal"`` selects marginal
        reconstruction; any other value (e.g. ``False`` or ``"joint"``)
        selects joint reconstruction.

    Returns
    -------
    TreeAnc
        treetime.TreeAnc instance
    """
    from treetime import TreeAnc

    tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', verbose=1)

    # convert marginal (from args.inference) from 'joint' or 'marginal' to
    # True or False; a literal True is honoured as well so that the boolean
    # form suggested by the default value is not silently ignored
    bool_marginal = marginal is True or marginal == "marginal"

    # only infer ancestral sequences, leave branch length untouched
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal)

    print(
        "\nInferred ancestral sequence states using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )

    return tt
def ancestral_reconstruction(params):
    """
    implementing treetime ancestral

    Exit codes: 1 if the tree is unavailable or the reconstruction returns
    ``ttconf.ERROR``, 0 after sequences and tree were exported.
    """
    # set up
    if assure_tree(params, tmp_dir='ancestral_tmp'):
        return 1

    outdir = get_outdir(params, '_ancestral')
    basename = get_basename(params, outdir)
    gtr = create_gtr(params)

    # ref and fixed_pi come back as None if the input is not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = ref is not None

    reconstruction = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                             fill_overhangs=not params.keep_overhangs)
    outcome = reconstruction.infer_ancestral_sequences(
        'ml', infer_gtr=params.gtr == 'infer',
        marginal=params.marginal, fixed_pi=fixed_pi)
    if outcome == ttconf.ERROR:
        # reconstruction failed, exit
        return 1

    # output and saving of results
    if params.gtr == "infer":
        print('\nInferred GTR model:')
        print(reconstruction.gtr)

    export_sequences_and_tree(reconstruction, basename, is_vcf, params.zero_based,
                              report_ambiguous=params.report_ambiguous)
    return 0
def getAge(start,end,species,mytree,margin=50):
    """
    Reconstruct ancestral sequences for a window of a multi-species
    alignment and measure tree distances from 'hg19' to a fixed list of
    ancestral nodes.

    NOTE: this block uses Python 2 syntax (print statement, ``StringIO``
    module). It reads the names ``analysis_start``, ``home`` and ``id``,
    none of which are defined in this function -- presumably module-level
    globals; verify (``id`` would otherwise resolve to the builtin).

    Parameters
    ----------
    start, end : int
        Window coordinates; shifted by ``analysis_start`` before slicing.
    species : dict
        Maps a species name to an indexable sequence of characters.
    mytree
        Tree handed to ``TreeAnc``.
    margin : int
        Number of extra positions included on both sides of the window.

    NOTE(review): no value is returned in the visible code; the results
    stay in locals (``AncestralSeq``, ``PhyloGeneticDistance``). The
    function may continue beyond what is visible here -- confirm.
    """
    from StringIO import StringIO
    from Bio import Phylo,AlignIO
    from treetime import TreeAnc
    # build one fasta record per species; every sequence is padded with a
    # leading and a trailing 'A' around the [start-margin, end+margin) slice
    myfasta=''
    for spec in species.keys():
        myfasta=myfasta+'>'+spec+'\nA'+str(''.join(species[spec][(start-margin-analysis_start):(end+margin-analysis_start)]))+'A\n'
    my_aln = AlignIO.read(StringIO(myfasta), 'fasta')
    ta = TreeAnc(tree=mytree, aln=my_aln, gtr='JC69')
    # joint (marginal=False) ML reconstruction with GTR inference
    ta.infer_ancestral_sequences(method = 'ml', infer_gtr=True, marginal=False)
    seq=ta.get_reconstructed_alignment()
    print 'ancestral seq inferred'
    # for i in range(len(seq)):
    #     print seq[i].id+' '+seq[i].seq[(margin+1):(margin+1+end-start)]
    # round-trip the tree through a newick file and read it back
    Phylo.write(ta.tree, home+'/Annotation/Conservation/SEqAlign/test_'+id+'.nwk', 'newick')
    mytreeAnnot=Phylo.read(home+'/Annotation/Conservation/SEqAlign/test_'+id+'.nwk', "newick")
    # keep only the window itself: skip the 'A' pad plus the left margin
    AncestralSeq={}
    for y in seq:
        AncestralSeq[y.id]=str(y.seq[(margin+1):(margin+1+end-start)])
    # four parallel lists with 18 entries each; only CommonAncestor is
    # used in the visible code
    AppearedInCommonAncestor=['human','chimp','gorilla','orangutan','macaque','marmoset','tarsier','lemur','treeShrew','mouse','cow','elephant','opossum','platypus','chicken','frog','zebrafish','lamprey']
    AppearedInCommonAncestorScientific=['hg19','panTro2','gorGor1','ponAbe2','rheMac2','calJac1','tarSyr1','micMur1','tupBel1','mm9','bosTau4','loxAfr3','monDom5','ornAna1','galGal3','xenTro2','danRer6','petMar1']
    CommonAncestor=['hg19','NODE_0000044','NODE_0000043','NODE_0000042','NODE_0000040','NODE_0000039','NODE_0000038','NODE_0000036','NODE_0000035','NODE_0000028','NODE_0000018','NODE_0000013','NODE_0000011','NODE_0000010','NODE_0000007','NODE_0000006','NODE_0000001','NODE_0000000']
    CommonAncestorName=['Humans','Chimpanzees','Gorillas','Orangutans','Gibbons','Monkeys','Tarsiers','Lemurs','Shrews','Rodents','Boroeutheria','Afroeutheria','Marsupials','Monotremes','Sauropsids','Amphibians','Vertebrates','Chordates']
    # tree distance from the 'hg19' tip to each listed node of the re-read tree
    PhyloGeneticDistance=[ mytreeAnnot.distance('hg19', CommonAncestor[i]) for i in range(0,len(CommonAncestor))]
def run_beast(args):
    '''
    BEAST MCC tree to newick and node-data JSON for further augur processing / export

    Reads the MCC tree named by ``args.mcc``, writes the tree in newick
    format to ``args.output_tree`` and the collected node data as JSON to
    ``args.output_node_data``.
    '''
    verbose = args.verbose
    print("importing from BEAST MCC tree", args.mcc)

    limit = args.recursion_limit
    if limit:
        print("Setting recursion limit to %d" % (limit))
        sys.setrecursionlimit(limit)

    # node data is the dict that will be exported as json
    node_data = {
        'comment': "Imported from a BEAST MCC tree using `augur import beast`",
        'mcc_file': args.mcc
    }

    tree = parse_nexus(tree_path=args.mcc, verbose=verbose)
    summarise_parsed_traits(tree)

    # TreeAnc is instantiated for the sole reason of naming internal nodes;
    # it works on `tree` in place, so the instance itself is not used again
    _tt = TreeAnc(tree=tree, aln=fake_alignment(tree), ref=None, gtr='JC69', verbose=1)

    # extract date information from the tree
    root_date_offset, most_recent_tip = calc_tree_dates(
        tree,
        args.most_recent_tip_date,
        args.tip_date_regex,
        args.tip_date_format,
        args.tip_date_delimeter)
    compute_entropies_for_discrete_traits(tree)

    node_data['nodes'] = collect_node_data(tree, root_date_offset, most_recent_tip)

    Phylo.write(tree, args.output_tree, 'newick', format_branch_length='%1.8f')
    write_json(node_data, args.output_node_data)

    print_what_to_do_next(nodes=node_data['nodes'],
                          mcc_path=args.mcc,
                          tree_path=args.output_tree,
                          node_data_path=args.output_node_data)
def ancestral_sequence_inference(tree=None, aln=None, infer_gtr=True,
                                 optimize_branch_length=True):
    """
    Infer ancestral sequences with TreeTime and return the TreeAnc instance.

    With ``optimize_branch_length`` set, sequences and branch lengths are
    optimised together; otherwise only the ancestral sequences are inferred
    and branch lengths are left untouched. ``infer_gtr`` is forwarded to
    whichever of the two TreeAnc methods is called.
    """
    from treetime import TreeAnc

    tt = TreeAnc(tree=tree, aln=aln, gtr='JC69')

    if not optimize_branch_length:
        # only infer ancestral sequences, leave branch length untouched
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr)
    else:
        tt.optimize_seq_and_branch_len(infer_gtr=infer_gtr)

    return tt
def assure_tree(params, tmp_dir='treetime_tmp'):
    """
    Function that attempts to load a tree and build it from the alignment
    if no tree is provided.

    Parameters
    ----------
    params : object
        Options object. ``params.tree`` is read and, when it is None, set
        to ``<basename of params.aln>.nwk`` before the tree is inferred
        from ``params.aln``.
    tmp_dir : str
        Scratch directory handed to the tree inference; removed afterwards
        if it exists.

    Returns
    -------
    int
        0 if the tree could be loaded by ``TreeAnc``, 1 otherwise.
    """
    if params.tree is None:
        params.tree = os.path.basename(params.aln)+'.nwk'
        print("No tree given: inferring tree")
        utils.tree_inference(params.aln, params.tree, tmp_dir = tmp_dir)

    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)

    # Loading is only a validity check, the instance is discarded.
    # `except Exception` (instead of a bare except) lets KeyboardInterrupt
    # and SystemExit propagate rather than being reported as a bad tree.
    try:
        TreeAnc(params.tree)
    except Exception as err:
        print("Tree loading/building failed.")
        print("Reason: %s" % err)
        return 1
    return 0
def build(self, root='midpoint', raxml=True, fasttree_program='fasttree', raxml_time_limit=0.5, treetime_used=True):
    '''
    build a phylogenetic tree using fasttree and raxML (optional)
    based on nextflu tree building pipeline

    The work happens inside ``self.run_dir``: the directory is created,
    made the current working directory, and removed again at the end after
    switching back to the original working directory.

    Parameters
    ----------
    root : str
        Not referenced in this method body.
    raxml : bool
        If False, only the fasttree result is used (post-processed by
        ``polytomies_midpointRooting``); otherwise RAxML is run when the
        alignment has more than 3 distinct sequence ids.
    fasttree_program : str
        Name of the fasttree executable placed at the start of the command.
    raxml_time_limit : float
        Time limit in hours for the RAxML topology search; values <= 0
        skip the topology search.
    treetime_used : bool
        If True the resulting tree is loaded through ``TreeAnc`` (stored
        as ``self.tt``); otherwise it is read with ``Phylo.read``.

    Reads ``self.run_dir``, ``self.aln``, ``self.nuc``, ``self.clusterID``;
    sets ``self.tree``, ``self.is_timetree`` and (optionally) ``self.tt``.
    '''
    import subprocess
    cwd = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    # NOTE(review): the working directory is only restored on the normal
    # path at the end; an exception in between leaves the process in
    # self.run_dir -- confirm whether callers rely on that.
    AlignIO.write(self.aln, 'origin.fasta', 'fasta')
    # the alignment is written once before and once after
    # make_strains_unique, which returns the name translation that is
    # applied in reverse further below
    name_translation = make_strains_unique(self.aln)
    AlignIO.write(self.aln, 'temp.fasta', 'fasta')

    # initial tree with fasttree; stdout/stderr are redirected by the shell
    tree_cmd = [fasttree_program]
    if self.nuc: tree_cmd.append("-nt")
    tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
    os.system(" ".join(tree_cmd))

    out_fname = "tree_infer.newick"

    if raxml==False:
        #shutil.copy('initial_tree.newick', out_fname)
        polytomies_midpointRooting('initial_tree.newick',out_fname, self.clusterID)
    elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')]))>3:
    ## only for tree with >3 strains
        # NOTE(review): with raxml truthy and <= 3 distinct ids neither
        # branch runs and nothing here writes out_fname -- confirm this
        # case cannot occur or is handled by the caller.
        if raxml_time_limit>0:
            # resolve polytomies (at most 11 passes) before handing the
            # tree to RAxML as starting topology
            tmp_tree = Phylo.read('initial_tree.newick','newick')
            resolve_iter = 0
            resolve_polytomies(tmp_tree)
            while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                resolve_iter+=1
                resolve_polytomies(tmp_tree)
            Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
            AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
            print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
            # using exec to be able to kill process
            end_time = time.time() + int(raxml_time_limit*3600)
            process = subprocess.Popen("exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
            # poll every 10 seconds until the result file appears or the
            # time limit is reached, then terminate the process either way
            while (time.time() < end_time):
                if os.path.isfile('RAxML_result.topology'):
                    break
                time.sleep(10)
            process.terminate()

            # the final result, if present, is appended last and therefore
            # preferred; otherwise the last entry of the glob result is used
            # NOTE(review): glob order is not sorted here -- presumably the
            # latest checkpoint is intended; verify.
            checkpoint_files = glob.glob("RAxML_checkpoint*")
            if os.path.isfile('RAxML_result.topology'):
                checkpoint_files.append('RAxML_result.topology')
            if len(checkpoint_files) > 0:
                last_tree_file = checkpoint_files[-1]
                shutil.copy(last_tree_file, 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')
        else:
            # NOTE(review): temp.phyx is only written in the branch above,
            # yet the branch-length step below reads it -- confirm.
            shutil.copy("initial_tree.newick", 'raxml_tree.newick')

        try:
            print("RAxML branch length optimization")
            os.system("raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
            shutil.copy('RAxML_result.branches', out_fname)
        except:
            # os.system's return status is not checked above, so this
            # fallback is reached when the copy of the result file fails
            print("RAxML branch length optimization failed")
            shutil.copy('raxml_tree.newick', out_fname)

    if treetime_used:
        # load the resulting tree as a treetime instance
        from treetime import TreeAnc
        self.tt = TreeAnc(tree=out_fname, aln=self.aln, gtr='Jukes-Cantor', verbose=0)
        # provide short cut to tree and revert names that conflicted with newick format
        self.tree = self.tt.tree
    else:
        self.tree = Phylo.read(out_fname,'newick')
    self.tree.root.branch_length=0.0001
    restore_strain_name(name_translation, self.aln)
    restore_strain_name(name_translation, self.tree.get_terminals())

    # named nodes that are not auto-named ('NODE_' prefix) keep their name
    # in `ann`; unnamed nodes are all given the name 'NODE_0'
    for node in self.tree.find_clades():
        if node.name is not None:
            if node.name.startswith('NODE_')==False:
                node.ann=node.name
        else:
            node.name='NODE_0'

    os.chdir(cwd)
    remove_dir(self.run_dir)
    self.is_timetree=False
def scan_homoplasies(params):
    """
    the function implementing treetime homoplasies

    Reconstructs ancestral sequences (joint ML), collects the mutations on
    every branch and prints summary statistics about mutations and
    positions that were hit more than once in the tree.

    Parameters
    ----------
    params : object
        Parsed options. Attributes read directly here: ``tree``, ``aln``,
        ``gtr``, ``const``, ``rescale``, ``zero_based``, ``drms``,
        ``detailed`` and ``n``. The object is also handed to the set-up
        helpers.

    Returns
    -------
    int
        1 if the tree or alignment could not be loaded or the
        reconstruction failed, 0 otherwise.
    """
    if assure_tree(params, tmp_dir='homoplasy_tmp'):
        return 1

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    # sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = ref is not None

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    treeanc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                      fill_overhangs=True)
    if treeanc.aln is None:  # if alignment didn't load, exit
        return 1

    # params.const is added to the sequence length in both cases
    if is_vcf:
        L = len(ref) + params.const
    else:
        L = treeanc.aln.get_alignment_length() + params.const

    N_seq = len(treeanc.aln)
    N_tree = treeanc.tree.count_terminals()
    if params.rescale != 1.0:
        for n in treeanc.tree.find_clades():
            n.branch_length *= params.rescale
            n.mutation_length = n.branch_length

    print("read alignment from file %s with %d sequences of length %d"%(params.aln,N_seq,L))
    print("read tree from file %s with %d leaves"%(params.tree,N_tree))
    print("\ninferring ancestral sequences...")

    ndiff = treeanc.infer_ancestral_sequences('ml', infer_gtr=params.gtr=='infer',
                                              marginal=False, fixed_pi=fixed_pi)
    # printed exactly once (the original repeated it on success)
    print("...done.")
    if ndiff == ttconf.ERROR:  # if reconstruction failed, exit
        print("Something went wrong during ancestral reconstruction, please check your input files.", file=sys.stderr)
        return 1

    if is_vcf:
        treeanc.recover_var_ambigs()

    ###########################################################################
    ### analysis of reconstruction
    ###########################################################################
    from collections import defaultdict
    from scipy.stats import poisson

    # reported positions are 1-based unless zero_based was requested
    offset = 0 if params.zero_based else 1

    if params.drms:
        DRM_info = read_in_DRMs(params.drms, offset)
        drms = DRM_info['DRMs']

    # construct dictionaries gathering mutations and positions
    mutations = defaultdict(list)
    positions = defaultdict(list)
    terminal_mutations = defaultdict(list)
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue

        if len(n.mutations):
            for (a, pos, d) in n.mutations:
                # mutations from or to gap / N are not counted
                if '-' not in [a,d] and 'N' not in [a,d]:
                    mutations[(a,pos+offset,d)].append(n)
                    positions[pos+offset].append(n)
            if n.is_terminal():
                for (a, pos, d) in n.mutations:
                    if '-' not in [a,d] and 'N' not in [a,d]:
                        terminal_mutations[(a,pos+offset,d)].append(n)

    # gather homoplasic mutations by strain
    mutation_by_strain = defaultdict(list)
    for n in treeanc.tree.get_terminals():
        for a, pos, d in n.mutations:
            if pos+offset in positions and len(positions[pos+offset])>1:
                if '-' not in [a,d] and 'N' not in [a,d]:
                    # `positions` is keyed by pos+offset. Looking up the
                    # bare `pos` reported the count of a different position
                    # and, because `positions` is a defaultdict, inserted
                    # empty entries that distorted the histogram below.
                    mutation_by_strain[n.name].append([(a,pos+offset,d), len(positions[pos+offset])])

    # total_branch_length is the expected number of substitutions
    # corrected_branch_length is the expected number of observable substitutions
    # (probability of an odd number of substitutions at a particular site)
    total_branch_length = treeanc.tree.total_branch_length()
    corrected_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                      for x in treeanc.tree.find_clades()])
    corrected_terminal_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                               for x in treeanc.tree.get_terminals()])
    expected_mutations = L*corrected_branch_length
    expected_terminal_mutations = L*corrected_terminal_branch_length

    # make histograms and sum mutations in different categories
    multiplicities = np.bincount([len(x) for x in mutations.values()])
    total_mutations = np.sum([len(x) for x in mutations.values()])

    multiplicities_terminal = np.bincount([len(x) for x in terminal_mutations.values()])
    terminal_mutation_count = np.sum([len(x) for x in terminal_mutations.values()])

    multiplicities_positions = np.bincount([len(x) for x in positions.values()])
    # positions that were never hit: all positions minus those hit at least once
    multiplicities_positions[0] = L - np.sum(multiplicities_positions)

    ###########################################################################
    ### Output the distribution of times particular mutations are observed
    ###########################################################################
    print("\nThe TOTAL tree length is %1.3e and %d mutations were observed."
          %(total_branch_length,total_mutations))
    print("Of these %d mutations,"%total_mutations
          +"".join(['\n\t - %d occur %d times'%(n,mi)
                    for mi,n in enumerate(multiplicities) if n]))
    # additional optional output this for terminal mutations only
    if params.detailed:
        print("\nThe TERMINAL branch length is %1.3e and %d mutations were observed."
              %(corrected_terminal_branch_length,terminal_mutation_count))
        print("Of these %d mutations,"%terminal_mutation_count
              +"".join(['\n\t - %d occur %d times'%(n,mi)
                        for mi,n in enumerate(multiplicities_terminal) if n]))

    ###########################################################################
    ### Output the distribution of times mutations at particular positions are observed
    ###########################################################################
    print("\nOf the %d positions in the genome,"%L
          +"".join(['\n\t - %d were hit %d times (expected %1.2f)'%(n,mi,L*poisson.pmf(mi,1.0*total_mutations/L))
                    for mi,n in enumerate(multiplicities_positions) if n]))

    # compare that distribution to a Poisson distribution with the same mean
    p = poisson.pmf(np.arange(10*multiplicities_positions.max()),1.0*total_mutations/L)
    print("\nlog-likelihood difference to Poisson distribution with same mean: %1.3e"%(
          - L*np.sum(p*np.log(p+1e-100))
          + np.sum(multiplicities_positions*np.log(p[:len(multiplicities_positions)]+1e-100))))

    ###########################################################################
    ### Output the mutations that are observed most often
    ###########################################################################
    # rank by multiplicity; the small position-dependent term breaks ties in
    # favour of lower positions
    rank_key = lambda x: len(x[1])-0.1*x[0][1]/L

    mutations_sorted = sorted(mutations.items(), key=rank_key, reverse=True)
    if params.drms:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                      " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
            else:
                break
    else:
        print("\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity")
        for mut, val in mutations_sorted[:params.n]:
            if len(val)>1:
                print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
            else:
                break

    # optional output specifically for mutations on terminal branches
    if params.detailed:
        terminal_mutations_sorted = sorted(terminal_mutations.items(), key=rank_key, reverse=True)
        if params.drms:
            print("\n\nThe ten most homoplasic mutation on terminal branches are:\n\tmut\tmultiplicity\tDRM details (gene drug AAmut)")
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val),
                          " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'], drms[mut[1]]['alt_base'][mut[2]]]) if mut[1] in drms else ""))
                else:
                    break
        else:
            print("\n\nThe ten most homoplasic mutation on terminal branches are:\n\tmut\tmultiplicity")
            for mut, val in terminal_mutations_sorted[:params.n]:
                if len(val)>1:
                    print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
                else:
                    break

    ###########################################################################
    ### Output strains that have many homoplasic mutations
    ###########################################################################
    # TODO: add statistical criterion
    if params.detailed:
        mutation_by_strain_sorted = sorted(mutation_by_strain.items(),
                                           key=lambda x:len(x[1]), reverse=True)
        if params.drms:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations\t# DRM")
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d\t%d"%(name, len(val),
                          len([mut for mut,l in val if mut[1] in drms])))
        else:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations")
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d"%(name, len(val)))

    return 0
def mugration(params):
    """
    implementing treetime mugration

    Reads a table of discrete states for the tips of a tree, encodes each state
    as a single character, infers a GTR model together with the ancestral
    states, and writes the model, an annotated nexus tree and (optionally) a
    table of marginal state confidences.

    Parameters
    ----------
    params : argparse.Namespace
        parsed command line arguments. The attributes read here are
        ``states``, ``attribute``, ``missing_data``, ``weights``, ``tree``,
        ``verbose``, ``pc`` and ``confidence``; ``infer_gtr`` is set to True
        when weights are supplied.

    Returns
    -------
    int
        0 on success, 1 if the input is unusable or the reconstruction failed
    """
    ###########################################################################
    ### Parse states
    ###########################################################################
    if os.path.isfile(params.states):
        states = pd.read_csv(params.states,
                             sep='\t' if params.states[-3:]=='tsv' else ',',
                             skipinitialspace=True)
    else:
        print("file with states does not exist")
        return 1

    outdir = get_outdir(params, '_mugration')

    taxon_name = 'name' if 'name' in states.columns else states.columns[0]
    if params.attribute:
        if params.attribute in states.columns:
            attr = params.attribute
        else:
            print("The specified attribute was not found in the metadata file "+params.states, file=sys.stderr)
            print("Available columns are: "+", ".join(states.columns), file=sys.stderr)
            return 1
    else:
        attr = states.columns[1]
        print("Attribute for mugration inference was not specified. Using "+attr, file=sys.stderr)

    leaf_to_attr = {x[taxon_name]:x[attr] for xi, x in states.iterrows()
                    if x[attr]!=params.missing_data}
    unique_states = sorted(set(leaf_to_attr.values()))
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i,state in enumerate(unique_states)]
    missing_char = chr(65+nc)
    letter_to_state = {a:unique_states[i] for i,a in enumerate(alphabet)}
    letter_to_state[missing_char]=params.missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if params.weights:
        params.infer_gtr = True
        # the separator has to follow the extension of the weights file itself,
        # not that of the states file
        tmp_weights = pd.read_csv(params.weights,
                                  sep='\t' if params.weights[-3:]=='tsv' else ',',
                                  skipinitialspace=True)
        # first column: state, second column: weight (explicitly positional)
        weights = {row.iloc[0]:row.iloc[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        # states without an entry in the file get the average weight
        weights = np.array([weights[c] if c in weights else mean_weight for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = np.ones(nc, dtype=float)/nc

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(params.tree, gtr=mugration_GTR, verbose=params.verbose,
                      convert_upper=False, one_mutation=0.001)
    # every tip becomes a one-letter pseudo sequence encoding its state
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                             seq=Seq(reverse_alphabet[leaf_to_attr[n.name]]
                                     if n.name in leaf_to_attr else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=params.pc, marginal=True, normalized_rate=False,
            fixed_pi=weights if params.weights else None)
    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1

    ###########################################################################
    ### output
    ###########################################################################
    print("\nCompleted mugration model inference of attribute '%s' for"%attr,params.tree)

    basename = get_basename(params, outdir)
    gtr_name = basename + 'GTR.txt'
    with open(gtr_name, 'w') as ofile:
        ofile.write('Character to attribute mapping:\n')
        for state in unique_states:
            ofile.write(' %s: %s\n'%(reverse_alphabet[state], state))
        ofile.write('\n\n'+str(treeanc.gtr)+'\n')
    print("\nSaved inferred mugration model as:", gtr_name)

    terminal_count = 0
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue
        n.confidence=None
        # due to a bug in older versions of biopython that truncated filenames in nexus export
        # we truncate them by hand and make them unique.
        if n.is_terminal() and len(n.name)>40 and bioversion<"1.69":
            n.name = n.name[:35]+'_%03d'%terminal_count
            terminal_count+=1
        # str() keeps numeric attribute columns from breaking the annotation
        n.comment= '&%s="'%attr + str(letter_to_state[n.sequence[0]]) +'"'

    if params.confidence:
        conf_name = basename+'confidence.csv'
        with open(conf_name, 'w') as ofile:
            ofile.write('#name, '+', '.join([str(s) for s in unique_states])+'\n')
            for n in treeanc.tree.find_clades():
                ofile.write(n.name + ', '+', '.join([str(x) for x in n.marginal_profile[0]])+'\n')
        print("Saved table with ancestral state confidences as:", conf_name)

    # write tree to file
    outtree_name = basename+'annotated_tree.nexus'
    Phylo.write(treeanc.tree, outtree_name, 'nexus')
    print("Saved annotated tree as:",outtree_name)

    return 0
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies; any non-string value is
        ignored and the frequencies are inferred
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets.
        ``(None, None, None)`` if fewer than 2 or more than 180 distinct
        states are found.

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    unique_states = sorted(set(traits.values()))
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return None, None, None
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i,state in enumerate(unique_states)]
    missing_char = chr(65+nc)
    letter_to_state = {a:unique_states[i] for i,a in enumerate(alphabet)}
    letter_to_state[missing_char]=missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                                  skipinitialspace=True)
        weights = {row[0]:row[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        # states without an entry in the file get the average weight
        weights = np.array([weights[c] if c in weights else mean_weight for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    # every tip becomes a one-letter pseudo sequence encoding its state
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError as e:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise e

    for i in range(iterations):
        # keep the user supplied equilibrium frequencies fixed in every
        # refinement round, not only in the initial inference
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc, fixed_pi=weights)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False,
                                 reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
               "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
               "along which multiple changes accumulated. This is expected to affect estimates of the "
               "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
from Bio import Phylo
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Example: ancestral sequence inference on a small influenza data set.
    plt.ion()

    # load data
    base_name = 'data/H3N2_NA_allyears_NA.20'
    T = Phylo.read(base_name + ".nwk", "newick")
    # root on the first clade whose name starts with 'A/Scot'.
    # next() works on Python 2 and 3 (the generator method .next() is
    # Python 2 only); unnamed clades are skipped instead of raising.
    T.root_with_outgroup(
        next(T.find_clades(lambda x: x.name is not None and x.name.startswith('A/Scot'))))

    # instantiate treetime
    myTree = TreeAnc(gtr='Jukes-Cantor', tree=T, aln=base_name + '.fasta', verbose=0)

    # infer optimize branch length, infer a GTR model, and reoptimize branch length
    myTree.optimize_sequences_and_branch_length(infer_gtr=True)

    # lets examine the properties of a node in the tree after ancestral inference
    node = myTree.tree.get_nonterminals()[7]
    # each node now has an inferred sequence
    print("the inferred sequences is an array of states:", node.sequence)

    # in addition, each node of the tree now has an mutation object attached
    # note that the mutation numbering starts at 0 rather than 1
    print("mutations of node %s:" % node.name, node.mutations)

    # we can readily verify these mutations by checking the inferred sequences
args = parser.parse_args()

# a single gene/translation may arrive as a bare string: normalise to lists
genes = args.genes if isinstance(args.genes, list) else [args.genes]
translations = args.translations if isinstance(args.translations, list) else [args.translations]

T = Phylo.read(args.tree, 'newick')
leafs = {n.name for n in T.get_terminals()}

node_data = {}
for gene, translation in zip(genes, translations):
    # only sequences that belong to tips of the tree take part in the reconstruction
    seqs = [s for s in SeqIO.parse(translation, 'fasta') if s.id in leafs]

    tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa')
    tt.infer_ancestral_sequences(reconstruct_tip_states=True)

    # Derive the output name from the LAST '.fasta' in the path only. If the
    # input name does not contain '.fasta' at all, append the suffix so that
    # the input file is never opened for writing (and thereby truncated).
    stem, ext, tail = translation.rpartition('.fasta')
    if ext:
        out_fasta = stem + '_withInternalNodes.fasta' + tail
    else:
        out_fasta = translation + '_withInternalNodes.fasta'

    with open(out_fasta, 'w') as fh:
        for n in tt.tree.find_clades():
            if n.name not in node_data:
                node_data[n.name] = {"aa_muts":{}}
            # mutations are stored 0-based; report them 1-based
            node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations]
            fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n")

with open(args.output, 'w') as fh:
    json.dump({"nodes":node_data}, fh)
"GTR params are not specified. Creating GTR model with default parameters" ) gtr = GTR.standard(model, **kwargs) except: print( "Could not create GTR model from input arguments. Using default (Jukes-Cantor 1969)" ) gtr = GTR.standard('jc') ########################################################################### ### ANCESTRAL RECONSTRUCTION ########################################################################### treeanc = TreeAnc(params.tree, aln=params.aln, gtr=gtr, verbose=1, fill_overhangs=True) L = treeanc.aln.get_alignment_length() N_seq = len(treeanc.aln) N_tree = treeanc.tree.count_terminals() print("read alignment from file %s with %d sequences of length %d" % (params.aln, N_seq, L)) print("read tree from file %s with %d leaves" % (params.tree, N_tree)) print("\ninferring ancestral sequences...") treeanc.infer_ancestral_sequences('ml', infer_gtr=infer_gtr, marginal=False) print("...done.")
class mpm_tree(object):
    '''
    class that aligns a set of sequences and infers a tree
    '''

    def __init__(self, cluster_seq_filepath, **kwarks):
        '''
        load the sequences of one gene cluster
        Parameters:
        - cluster_seq_filepath: path of the fasta file (``<clusterID>.fna``)
        - speciesID (keyword): prefix of the temporary run directory; defaults
          to the third-to-last component of cluster_seq_filepath
        - run_dir (keyword): working directory to use instead of a freshly
          named one below ``tmp/``
        '''
        self.clusterID = cluster_seq_filepath.split('/')[-1].split('.fna')[0]
        if 'speciesID' in kwarks:
            folderID = kwarks['speciesID']
        else:
            folderID = cluster_seq_filepath.split('/')[-3]
        self.seqs = {
            x.id: x
            for x in SeqIO.parse(cluster_seq_filepath, 'fasta')
        }
        if 'run_dir' not in kwarks:
            import random
            # time stamp plus random number keep concurrent runs apart
            self.run_dir = 'tmp/'
            self.run_dir += '_'.join([
                folderID, 'tmp',
                time.strftime('%H%M%S', time.gmtime()),
                str(random.randint(0, 100000000))
            ])
        else:
            self.run_dir = kwarks['run_dir']
        self.nuc = True

    def codon_align(self, alignment_tool="mafft", prune=True,
                    discard_premature_stops=False):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use

        Raises ValueError for any other alignment_tool (checked before any
        file or directory is touched).
        '''
        if alignment_tool not in ('mafft', 'muscle'):
            message = 'Alignment tool not supported:' + str(alignment_tool)
            print(message)
            raise ValueError(message)

        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            # translate
            aa_seqs = {}
            for seq in self.seqs.values():
                tempseq = seq.seq.translate(table="Bacterial")
                # use only sequences that translate without trouble
                if not discard_premature_stops or '*' not in str(
                        tempseq)[:-1] or prune == False:
                    aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                else:
                    print(seq.id, "has premature stops, discarding")

            tmpfname = 'temp_in.fasta'
            SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

            if alignment_tool == 'mafft':
                os.system(
                    'mafft --reorder --amino temp_in.fasta 1> temp_out.fasta')
                aln_aa = AlignIO.read('temp_out.fasta', "fasta")
            else:  # 'muscle'
                from Bio.Align.Applications import MuscleCommandline
                cline = MuscleCommandline(input=tmpfname,
                                          out=tmpfname[:-5] + 'aligned.fasta')
                cline()
                aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")

            #generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        finally:
            # never leave the process inside the temporary directory
            os.chdir(cwd)
        remove_dir(self.run_dir)

    def align(self):
        '''
        align sequencences in self.seqs using mafft
        '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
            os.system(
                'mafft --reorder --anysymbol temp_in.fasta 1> temp_out.fasta 2> mafft.log'
            )
            self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # never leave the process inside the temporary directory
            os.chdir(cwd)
        remove_dir(self.run_dir)

    def build(self, root='midpoint', raxml=True, fasttree_program='fasttree',
              raxml_time_limit=0.5, treetime_used=True):
        '''
        build a phylogenetic tree using fasttree and raxML (optional)
        based on nextflu tree building pipeline

        Parameters:
        - root: not used by this method
        - raxml: refine the fasttree topology with RAxML (only attempted for
          more than 3 distinct strains)
        - fasttree_program: name of the fasttree executable
        - raxml_time_limit: time limit in hours for the RAxML topology search;
          0 skips the search
        - treetime_used: load the tree as a treetime TreeAnc (self.tt),
          otherwise as a plain Bio.Phylo tree
        '''
        import subprocess
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            AlignIO.write(self.aln, 'origin.fasta', 'fasta')
            name_translation = make_strains_unique(self.aln)
            AlignIO.write(self.aln, 'temp.fasta', 'fasta')

            tree_cmd = [fasttree_program]
            if self.nuc:
                tree_cmd.append("-nt")
            tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
            os.system(" ".join(tree_cmd))

            out_fname = "tree_infer.newick"

            if raxml == False:
                polytomies_midpointRooting('initial_tree.newick', out_fname,
                                           self.clusterID)
            elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')])) > 3:
                ## only for tree with >3 strains
                if raxml_time_limit > 0:
                    tmp_tree = Phylo.read('initial_tree.newick', 'newick')
                    resolve_iter = 0
                    resolve_polytomies(tmp_tree)
                    while (not tmp_tree.is_bifurcating()) and (resolve_iter < 10):
                        resolve_iter += 1
                        resolve_polytomies(tmp_tree)
                    Phylo.write(tmp_tree, 'initial_tree.newick', 'newick')
                    AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
                    print("RAxML tree optimization with time limit",
                          raxml_time_limit, "hours")
                    # using exec to be able to kill process
                    end_time = time.time() + int(raxml_time_limit * 3600)
                    process = subprocess.Popen(
                        "exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick",
                        shell=True)
                    while (time.time() < end_time):
                        if os.path.isfile('RAxML_result.topology'):
                            break
                        time.sleep(10)
                    process.terminate()

                    # glob returns files in arbitrary order: sort by
                    # modification time so that the last entry really is the
                    # most recent checkpoint
                    checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"),
                                              key=os.path.getmtime)
                    if os.path.isfile('RAxML_result.topology'):
                        checkpoint_files.append('RAxML_result.topology')
                    if len(checkpoint_files) > 0:
                        last_tree_file = checkpoint_files[-1]
                        shutil.copy(last_tree_file, 'raxml_tree.newick')
                    else:
                        shutil.copy("initial_tree.newick", 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')

                try:
                    print("RAxML branch length optimization")
                    os.system(
                        "raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick"
                    )
                    shutil.copy('RAxML_result.branches', out_fname)
                except Exception:
                    print("RAxML branch length optimization failed")
                    shutil.copy('raxml_tree.newick', out_fname)
            else:
                # too few strains for RAxML: keep the fasttree result so that
                # the output tree file always exists
                shutil.copy('initial_tree.newick', out_fname)

            if treetime_used:
                # load the resulting tree as a treetime instance
                from treetime import TreeAnc
                self.tt = TreeAnc(tree=out_fname, aln=self.aln,
                                  gtr='Jukes-Cantor', verbose=0)
                # provide short cut to tree and revert names that conflicted with newick format
                self.tree = self.tt.tree
            else:
                self.tree = Phylo.read(out_fname, 'newick')
            self.tree.root.branch_length = 0.0001
            restore_strain_name(name_translation, self.aln)
            restore_strain_name(name_translation, self.tree.get_terminals())

            for node in self.tree.find_clades():
                if node.name is not None:
                    # keep the original label of every node not named NODE_*
                    if node.name.startswith('NODE_') == False:
                        node.ann = node.name
                else:
                    node.name = 'NODE_0'
        finally:
            # never leave the process inside the temporary directory
            os.chdir(cwd)
        remove_dir(self.run_dir)
        self.is_timetree = False

    def ancestral(self, translate_tree=False):
        '''
        infer ancestral nucleotide sequences using maximum likelihood
        and translate the resulting sequences (+ terminals) to amino acids
        '''
        try:
            self.tt.reconstruct_anc(method='ml')
        except Exception:
            # best effort: report and carry on
            print("trouble at self.tt.reconstruct_anc(method='ml')")

        if translate_tree:
            for node in self.tree.find_clades():
                node.aa_sequence = np.fromstring(str(
                    self.translate_seq("".join(node.sequence))), dtype='S1')

    def refine(self, CDS=True):
        '''
        determine mutations on each branch and attach as string to the branches
        '''
        for node in self.tree.find_clades():
            if node.up is not None:
                node.muts = ",".join([
                    "".join(map(str, x)) for x in node.mutations
                    if '-' not in x
                ])
                if CDS == True:
                    # amino acid differences to the parent, 1-based, gaps ignored
                    node.aa_muts = ",".join([
                        anc + str(pos + 1) + der
                        for pos, (anc, der) in enumerate(
                            zip(node.up.aa_sequence, node.aa_sequence))
                        if anc != der and '-' not in anc and '-' not in der
                    ])

    def translate_seq(self, seq):
        '''
        custom translation sequence that handles gaps
        '''
        if not isinstance(seq, str):
            str_seq = str(seq.seq)
        else:
            str_seq = seq
        try:
            # soon not needed as future biopython version will translate --- into -
            tmp_seq = Seq(
                str(
                    Seq(str_seq.replace('---', 'NNN')).translate(
                        table="Bacterial")).replace('X', '-'))
        except Exception:
            # fall back to masking every single gap character
            tmp_seq = Seq(
                str(
                    Seq(str_seq.replace(
                        '-', 'N')).translate(table="Bacterial")).replace('X', '-'))
        return tmp_seq

    def translate(self):
        '''
        translate the nucleotide alignment to an amino acid alignment
        '''
        aa_seqs = []
        for seq in self.aln:
            aa_seqs.append(
                SeqRecord(seq=self.translate_seq(seq), id=seq.id,
                          name=seq.name, description=seq.description))
        self.aa_aln = MultipleSeqAlignment(aa_seqs)

    def mean_std_seqLen(self):
        """ return mean and standard deviation of sequence lengths """
        seqLen_arr = np.array([len(seq) for seq in self.seqs.values()])
        return np.mean(seqLen_arr, axis=0), np.std(seqLen_arr, axis=0)

    def paralogy_statistics(self):
        ''' return number of paralogous nodes and branch length of the best split '''
        best_split = find_best_split(self.tree)
        return len(best_split.para_nodes), best_split.branch_length

    def diversity_statistics_nuc(self):
        ''' calculate the diversity of the nucleotide alignment (self.diversity_nuc) '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_nuc = calc_af(self.aln, nuc_alpha)
        # the last two rows of the frequency table are left out of the statistic
        is_valid = self.af_nuc[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_nuc[:-2, is_valid] / self.af_nuc[:-2, is_valid].sum(axis=0)
        self.diversity_nuc = np.mean(1.0 - (tmp_af**2).sum(axis=0))

    def diversity_statistics_aa(self):
        ''' calculate the diversity of the amino acid alignment (self.diversity_aa) '''
        # this method reads the amino acid alignment, so that is what has to exist
        if not hasattr(self, "aa_aln"):
            print("calculate alignment first")
            return
        self.af_aa = calc_af(self.aa_aln, aa_alpha)
        # the last two rows of the frequency table are left out of the statistic
        is_valid = self.af_aa[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_aa[:-2, is_valid] / self.af_aa[:-2, is_valid].sum(axis=0)
        self.diversity_aa = np.mean(1.0 - (tmp_af**2).sum(axis=0))

    def mutations_to_branch(self):
        ''' map every mutation to the list of branches (nodes) it occurs on '''
        self.mut_to_branch = defaultdict(list)
        for node in self.tree.find_clades():
            if node.up is not None:
                for mut in node.mutations:
                    self.mut_to_branch[mut].append(node)

    def reduce_alignments(self, RNA_specific=False):
        '''
        build reduced alignments (aln_reduced, aa_aln_reduced): a consensus
        sequence followed by all sequences with consensus states replaced by '.'
        '''
        if RNA_specific:
            self.aa_aln = None
            self.af_aa = None
        else:
            self.af_aa = calc_af(self.aa_aln, aa_alpha)
        for attr, aln, alpha, freq in [[
                "aln_reduced", self.aln, nuc_alpha, self.af_nuc
        ], ["aa_aln_reduced", self.aa_aln, aa_alpha, self.af_aa]]:
            try:
                if RNA_specific and attr == "aa_aln_reduced":
                    pass  #** no reduced amino alignment for RNA
                else:
                    consensus = np.array(list(alpha))[freq.argmax(axis=0)]
                    aln_array = np.array(aln)
                    aln_array[aln_array == consensus] = '.'
                    new_seqs = [
                        SeqRecord(seq=Seq("".join(consensus)),
                                  name="consensus", id="consensus")
                    ]
                    for si, seq in enumerate(aln):
                        new_seqs.append(
                            SeqRecord(seq=Seq("".join(aln_array[si])),
                                      name=seq.name, id=seq.id,
                                      description=seq.description))
                    self.__setattr__(attr, MultipleSeqAlignment(new_seqs))
            except Exception:
                print(
                    "sf_geneCluster_align_MakeTree: aligment reduction failed")

    def export(self, path='', extra_attr=None, RNA_specific=False):
        '''
        write tree (newick and json) and alignments of the cluster to files
        prefixed with path + clusterID

        Parameters:
        - path: prefix of all output files
        - extra_attr: node attributes exported to the tree json; defaults to
          ['aa_muts', 'annotation', 'branch_length', 'name', 'accession']
        - RNA_specific: skip all amino acid output
        '''
        if extra_attr is None:
            # built per call: a list default would be shared between calls
            extra_attr = [
                'aa_muts', 'annotation', 'branch_length', 'name', 'accession'
            ]
        ## write tree
        Phylo.write(self.tree, path + self.clusterID + '.nwk', 'newick')

        ## processing node name
        for node in self.tree.get_terminals():
            node.accession = node.ann.split('|')[0]
            node.name = node.ann.split('-')[0]
            #NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
            annotation = node.ann.split('-', 2)
            if len(annotation) == 3:
                node.annotation = annotation[2]
            else:
                node.annotation = annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            if n.branch_length < 1e-6:
                n.branch_length = 1e-6
        timetree_fname = path + self.clusterID + '_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for i_aln in self.aln:
            i_aln.id = i_aln.id.replace('|', '-', 1)
        for i_alnr in self.aln_reduced:
            i_alnr.id = i_alnr.id.replace('|', '-', 1)

        AlignIO.write(self.aln, path + self.clusterID + '_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced,
                      path + self.clusterID + '_na_aln_reduced.fa', 'fasta')

        if RNA_specific == False:
            for i_aa_aln in self.aa_aln:
                i_aa_aln.id = i_aa_aln.id.replace('|', '-', 1)
            for i_aa_alnr in self.aa_aln_reduced:
                i_aa_alnr.id = i_aa_alnr.id.replace('|', '-', 1)

            AlignIO.write(self.aa_aln, path + self.clusterID + '_aa_aln.fa',
                          'fasta')
            AlignIO.write(self.aa_aln_reduced,
                          path + self.clusterID + '_aa_aln_reduced.fa', 'fasta')

        ## write seq json
        # NOTE: disabled by the hard-coded flag below; the branch is kept as is
        write_seq_json = 0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if hasattr(node, "longName") == False:
                        node.longName = node.name
                    elems[node.longName] = {}
                    nuc_dt = {
                        pos: state
                        for pos, (state, ancstate) in enumerate(
                            izip(node.sequence.tostring(),
                                 self.tree.root.sequence.tostring()))
                        if state != ancstate
                    }
                    elems[node.longName]['nuc'] = nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = self.tree.root.sequence.tostring()

            self.sequences_fname = path + self.clusterID + '_seq.json'
            write_json(elems, self.sequences_fname, indent=None)
dtype=float) weights /= weights.sum() else: weights = np.ones(nc, dtype=float) / nc # set up dummy matrix W = np.ones((nc, nc), dtype=float) mugration_GTR = GTR.custom(pi=weights, W=W, alphabet=np.array(alphabet)) mugration_GTR.profile_map[missing_char] = np.ones(nc) mugration_GTR.ambiguous = missing_char ########################################################################### ### set up treeanc ########################################################################### treeanc = TreeAnc(params.tree, gtr=mugration_GTR, verbose=params.verbose) pseudo_seqs = [ SeqRecord(id=n.name, name=n.name, seq=Seq(reverse_alphabet[leaf_to_attr[n.name]] if n.name in leaf_to_attr else missing)) for n in treeanc.tree.get_terminals() ] treeanc.aln = MultipleSeqAlignment(pseudo_seqs) treeanc.infer_ancestral_sequences( method='ml', infer_gtr=True, store_compressed=False, pc=params.pc, marginal=True,
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str or dict, optional
        name of file with equilibrium frequencies, or a dictionary mapping
        states to their weights
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets.
        ``(None, None, None)`` if fewer than two states (not counting
        missing data) are found.

    Raises
    ------
    ValueError
        if the weights file can not be loaded
    TreeTimeError
        raise error if ancestral reconstruction errors out, or if weights are
        missing for more than half of the observed states
    """
    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################

    unique_states = set(traits.values())
    n_observed_states = len(unique_states)

    # load weights from file and convert to dict if supplied as string
    if isinstance(weights, str):
        try:
            tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                                      skipinitialspace=True)
            weight_dict = {row[0]:row[1] for ri,row in tmp_weights.iterrows() if not np.isnan(row[1])}
        except Exception as err:
            raise ValueError("Loading of weights file '%s' failed!"%weights) from err
    elif isinstance(weights, dict):
        weight_dict = weights
    else:
        weight_dict = None

    # add weights to unique states for alphabet construction
    if weight_dict is not None:
        unique_states.update(weight_dict.keys())
        # compare by value: identity ('is not') is unreliable for strings
        missing_weights = [c for c in unique_states
                           if c not in weight_dict and c != missing_data]
        if len(missing_weights):
            print("Missing weights for values: " + ", ".join(str(c) for c in missing_weights))

        if len(missing_weights)>0.5*n_observed_states:
            print("More than half of discrete states missing from the weights file")
            print("Weights read from file are:", weights)
            raise TreeTimeError("More than half of discrete states missing from the weights file")

    # Drop the missing-data marker BEFORE numbering the states. Numbering
    # first and filtering afterwards leaves a gap in the alphabet, and the
    # character chosen for missing data below can then coincide with the
    # letter of a real state.
    unique_states = [state for state in sorted(unique_states) if state!=missing_data]
    # make a map from states (excluding missing data) to characters in the alphabet
    # note that gap character '-' is chr(45) and will never be included here
    reverse_alphabet = {state:chr(65+i) for i,state in enumerate(unique_states)}
    alphabet = list(reverse_alphabet.values())
    # construct a look up from alphabet character to states
    letter_to_state = {v:k for k,v in reverse_alphabet.items()}

    # construct the vector with weights to be used as equilibrium frequency
    if weight_dict is not None:
        mean_weight = np.mean(list(weight_dict.values()))
        # states without a weight get the average weight
        weights = np.array([weight_dict[letter_to_state[c]] if letter_to_state[c] in weight_dict else mean_weight
                            for c in alphabet], dtype=float)
        weights/=weights.sum()

    # consistency checks
    if len(alphabet)<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    n_states = len(alphabet)
    # first character after the contiguous alphabet stands for missing data
    missing_char = chr(65+n_states)
    reverse_alphabet[missing_data]=missing_char
    letter_to_state[missing_char]=missing_data

    ###########################################################################
    ### construct gtr model
    ###########################################################################

    # set up dummy matrix
    W = np.ones((n_states,n_states), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(n_states)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    # every tip becomes a one-letter pseudo sequence encoding its state
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    valid_seq = np.array([str(s.seq)!=missing_char for s in pseudo_seqs])
    print("Assigned discrete traits to %d out of %d taxa.\n"%(np.sum(valid_seq),len(valid_seq)))
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError as e:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise e

    for i in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc, fixed_pi=weights)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False,
                                 reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
               "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
               "along which multiple changes accumulated. This is expected to affect estimates of the "
               "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
def _print_top_homoplasies(header, table, L, n_top, drms=None):
    """
    Print the most frequently observed mutations contained in ``table``.

    Parameters
    ----------
    header : str
        text printed before the table
    table : dict
        maps ``(ancestral, position, derived)`` to the list of branches on
        which this mutation was observed
    L : int
        sequence length, only used to break ties between equal multiplicities
    n_top : int
        maximal number of rows to print
    drms : dict, optional
        if given, DRM details (gene, drug, amino acid change) are appended to
        rows whose position is a key of this dictionary
    """
    print(header)
    # rank by multiplicity; the small position dependent term orders
    # mutations of equal multiplicity by increasing position
    ranked = sorted(table.items(), key=lambda x: len(x[1]) - 0.1*x[0][1]/L, reverse=True)
    for mut, val in ranked[:n_top]:
        if len(val) <= 1:
            # everything that follows was observed only once
            break
        if drms is None:
            print("\t%s%d%s\t%d"%(mut[0], mut[1], mut[2], len(val)))
        else:
            details = ""
            if mut[1] in drms:
                details = " ".join([drms[mut[1]]['gene'], drms[mut[1]]['drug'],
                                    drms[mut[1]]['alt_base'][mut[2]]])
            print("\t%s%d%s\t%d\t%s"%(mut[0], mut[1], mut[2], len(val), details))


def scan_homoplasies(params):
    """
    the function implementing treetime homoplasies

    Ancestral sequences are reconstructed (joint ML) on the tree given by
    ``params`` and the mutations found on the branches are tabulated to report
    mutations and positions that changed repeatedly.

    Parameters
    ----------
    params : object
        parsed command line arguments. The attributes read here are ``tree``,
        ``aln``, ``const``, ``rescale``, ``gtr``, ``zero_based``, ``drms``,
        ``detailed`` and ``n``; the helpers called may read further ones.

    Returns
    -------
    int
        1 if the tree could not be assured or the alignment did not load,
        0 otherwise
    """
    if assure_tree(params, tmp_dir='homoplasy_tmp'):
        return 1

    gtr = create_gtr(params)

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = ref is not None

    ###########################################################################
    ### ANCESTRAL RECONSTRUCTION
    ###########################################################################
    treeanc = TreeAnc(params.tree, aln=aln, ref=ref, gtr=gtr, verbose=1,
                      fill_overhangs=True)
    if treeanc.aln is None: # if alignment didn't load, exit
        return 1

    if is_vcf:
        L = len(ref) + params.const
    else:
        L = treeanc.data.full_length + params.const

    N_seq = len(treeanc.aln)
    N_tree = treeanc.tree.count_terminals()
    if params.rescale!=1.0:
        for n in treeanc.tree.find_clades():
            n.branch_length *= params.rescale
            n.mutation_length = n.branch_length

    print("read alignment from file %s with %d sequences of length %d"%(params.aln,N_seq,L))
    print("read tree from file %s with %d leaves"%(params.tree,N_tree))
    print("\ninferring ancestral sequences...")

    treeanc.infer_ancestral_sequences('ml', infer_gtr=params.gtr=='infer',
                                      marginal=False, fixed_pi=fixed_pi)
    print("...done.")

    if is_vcf:
        treeanc.recover_var_ambigs()

    ###########################################################################
    ### analysis of reconstruction
    ###########################################################################
    from collections import defaultdict
    from scipy.stats import poisson
    # positions are reported one-based unless zero-based output was requested
    offset = 0 if params.zero_based else 1

    drms = None
    if params.drms:
        DRM_info = read_in_DRMs(params.drms, offset)
        drms = DRM_info['DRMs']

    # construct dictionaries gathering mutations and positions
    mutations = defaultdict(list)
    positions = defaultdict(list)
    terminal_mutations = defaultdict(list)
    for n in treeanc.tree.find_clades():
        if n.up is None:
            continue

        for (a, pos, d) in n.mutations:
            # changes from or to gaps and N are not counted
            if '-' in [a,d] or 'N' in [a,d]:
                continue
            mutations[(a,pos+offset,d)].append(n)
            positions[pos+offset].append(n)
            if n.is_terminal():
                terminal_mutations[(a,pos+offset,d)].append(n)

    # gather homoplasic mutations by strain
    mutation_by_strain = defaultdict(list)
    for n in treeanc.tree.get_terminals():
        for a,pos,d in n.mutations:
            if '-' in [a,d] or 'N' in [a,d]:
                continue
            # look up with the same offset that was used when filling `positions`.
            # .get() is used on purpose: indexing the defaultdict would insert
            # empty entries and distort the per-position histogram below.
            hits = positions.get(pos+offset, [])
            if len(hits)>1:
                mutation_by_strain[n.name].append([(a,pos+offset,d), len(hits)])

    # total_branch_length is the expected number of substitutions
    # corrected_terminal_branch_length is the expected number of observable
    # substitutions on terminal branches (probability of an odd number of
    # substitutions at a particular site)
    total_branch_length = treeanc.tree.total_branch_length()
    corrected_terminal_branch_length = np.sum([np.exp(-x.branch_length)*np.sinh(x.branch_length)
                                               for x in treeanc.tree.get_terminals()])

    # make histograms and sum mutations in different categories
    multiplicities = np.bincount([len(x) for x in mutations.values()])
    total_mutations = np.sum([len(x) for x in mutations.values()])

    multiplicities_terminal = np.bincount([len(x) for x in terminal_mutations.values()])
    terminal_mutation_count = np.sum([len(x) for x in terminal_mutations.values()])

    multiplicities_positions = np.bincount([len(x) for x in positions.values()])
    # positions that were never hit do not show up in `positions`
    multiplicities_positions[0] = L - np.sum(multiplicities_positions)

    ###########################################################################
    ### Output the distribution of times particular mutations are observed
    ###########################################################################
    print("\nThe TOTAL tree length is %1.3e and %d mutations were observed."
          %(total_branch_length,total_mutations))
    print("Of these %d mutations,"%total_mutations
          +"".join(['\n\t - %d occur %d times'%(n,mi)
                    for mi,n in enumerate(multiplicities) if n]))
    # additional optional output this for terminal mutations only
    if params.detailed:
        print("\nThe TERMINAL branch length is %1.3e and %d mutations were observed."
              %(corrected_terminal_branch_length,terminal_mutation_count))
        print("Of these %d mutations,"%terminal_mutation_count
              +"".join(['\n\t - %d occur %d times'%(n,mi)
                        for mi,n in enumerate(multiplicities_terminal) if n]))

    ###########################################################################
    ### Output the distribution of times mutations at particular positions are observed
    ###########################################################################
    print("\nOf the %d positions in the genome,"%L
          +"".join(['\n\t - %d were hit %d times (expected %1.2f)'%(n,mi,L*poisson.pmf(mi,1.0*total_mutations/L))
                    for mi,n in enumerate(multiplicities_positions) if n]))

    # compare that distribution to a Poisson distribution with the same mean
    p = poisson.pmf(np.arange(10*multiplicities_positions.max()),1.0*total_mutations/L)
    print("\nlog-likelihood difference to Poisson distribution with same mean: %1.3e"%(
          - L*np.sum(p*np.log(p+1e-100))
          + np.sum(multiplicities_positions*np.log(p[:len(multiplicities_positions)]+1e-100))))

    ###########################################################################
    ### Output the mutations that are observed most often
    ###########################################################################
    drm_header = "\tDRM details (gene drug AAmut)" if params.drms else ""
    _print_top_homoplasies(
        "\n\nThe ten most homoplasic mutations are:\n\tmut\tmultiplicity" + drm_header,
        mutations, L, params.n, drms=drms)

    # optional output specifically for mutations on terminal branches
    if params.detailed:
        _print_top_homoplasies(
            "\n\nThe ten most homoplasic mutation on terminal branches are:\n\tmut\tmultiplicity" + drm_header,
            terminal_mutations, L, params.n, drms=drms)

    ###########################################################################
    ### Output strains that have many homoplasic mutations
    ###########################################################################
    # TODO: add statistical criterion
    if params.detailed:
        mutation_by_strain_sorted = sorted(mutation_by_strain.items(),
                                           key=lambda x:len(x[1]), reverse=True)
        if params.drms:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations\t# DRM")
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d\t%d"%(name, len(val),
                          len([mut for mut,l in val if mut[1] in drms])))
        else:
            print("\n\nTaxons that carry positions that mutated elsewhere in the tree:\n\ttaxon name\t#of homoplasic mutations")
            for name, val in mutation_by_strain_sorted[:params.n]:
                if len(val):
                    print("\t%s\t%d"%(name, len(val)))

    return 0
def build(self, root='midpoint', raxml=True, fasttree_program='fasttree',
          raxml_time_limit=0.5, treetime_used=True):
    '''
    build a phylogenetic tree using fasttree and raxML (optional)
    based on nextflu tree building pipeline

    All intermediate files are written into ``self.run_dir``, which is made
    the working directory for the duration of the call and removed after a
    successful build.

    Parameters
    ----------
    root : str, optional
        not used by this method
    raxml : bool, optional
        if False, only the fasttree result is used (passed through
        ``polytomies_midpointRooting``); otherwise RAxML refines topology and
        branch lengths when the alignment holds more than 3 distinct ids
    fasttree_program : str, optional
        name of the fasttree executable
    raxml_time_limit : float, optional
        time limit in hours for the RAxML topology search; if not positive,
        the topology search is skipped
    treetime_used : bool, optional
        if True, load the result as a ``TreeAnc`` instance stored in
        ``self.tt``, otherwise read it with ``Phylo.read``

    Side effects
    ------------
    sets ``self.tree``, ``self.is_timetree`` (False) and, if requested,
    ``self.tt``
    '''
    import subprocess
    cwd = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        AlignIO.write(self.aln, 'origin.fasta', 'fasta')
        name_translation = make_strains_unique(self.aln)
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = [fasttree_program]
        if self.nuc:
            tree_cmd.append("-nt")
        tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"

        if raxml == False:
            #shutil.copy('initial_tree.newick', out_fname)
            polytomies_midpointRooting('initial_tree.newick', out_fname, self.clusterID)
        elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')])) > 3:
            ## only for tree with >3 strains
            if raxml_time_limit > 0:
                tmp_tree = Phylo.read('initial_tree.newick', 'newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter < 10):
                    resolve_iter += 1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree, 'initial_tree.newick', 'newick')
                AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
                print("RAxML tree optimization with time limit", raxml_time_limit, "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit * 3600)
                process = subprocess.Popen(
                    "exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick",
                    shell=True)
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()
                # reap the child so that it has stopped writing before its
                # output files are inspected
                process.wait()

                # glob returns files in arbitrary order, hence sort the
                # checkpoints by modification time to find the latest one
                checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"),
                                          key=os.path.getmtime)
                if os.path.isfile('RAxML_result.topology'):
                    # the final result takes precedence over any checkpoint
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            try:
                print("RAxML branch length optimization")
                os.system(
                    "raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                shutil.copy('RAxML_result.branches', out_fname)
            except Exception:
                # os.system does not raise when raxml fails; the failure shows
                # up as a missing result file in shutil.copy
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)

        if treetime_used:
            # load the resulting tree as a treetime instance
            from treetime import TreeAnc
            self.tt = TreeAnc(tree=out_fname, aln=self.aln, gtr='Jukes-Cantor', verbose=0)
            # provide short cut to tree and revert names that conflicted with newick format
            self.tree = self.tt.tree
        else:
            self.tree = Phylo.read(out_fname, 'newick')
        self.tree.root.branch_length = 0.0001
        restore_strain_name(name_translation, self.aln)
        restore_strain_name(name_translation, self.tree.get_terminals())

        for node in self.tree.find_clades():
            if node.name is not None:
                if not node.name.startswith('NODE_'):
                    node.ann = node.name
            else:
                node.name = 'NODE_0'
    finally:
        # never leave the process in the run directory, even if a step failed
        os.chdir(cwd)

    remove_dir(self.run_dir)
    self.is_timetree = False
def test_seq_joint_reconstruction_correct():
    """
    evolve the random sequence, get the alignment at the leaf nodes.
    Reconstruct the sequences of the internal nodes (joint)
    and prove the reconstruction is correct.
    In addition, compute the likelihood of the particular realization of the
    sequences on the tree and prove that this likelihood is exactly the same
    as calculated in the joint reconstruction

    Returns
    -------
    TreeAnc
        the instance used for the reconstruction
    """
    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    try:
        from StringIO import StringIO
    except ImportError:  #python3.x
        from io import StringIO
    try:
        from itertools import izip
    except ImportError:  #python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']),
                       pi = np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4,4)))
    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=seq_len)
    print ("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            # only the root carries a sequence at this point
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize profile
        p = (p.T/p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
                                 for k in np.arange(p.shape[0])])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])
        node.ref_mutations = [(anc, pos, der) for pos, (anc, der)
                              in enumerate(izip(node.up.ref_seq, node.ref_seq))
                              if anc!=der]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print (node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
                     for leaf in tree.get_terminals())
    print (alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        n_diff = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count the sites at which the reconstruction deviates from the truth
            diff_count += n_diff
            if n_diff:
                print("%s: True sequence does not equal inferred sequence. parent %s"%(node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s"%(node.name, node.up.name))
        print (node.name, n_diff, np.where(node.sequence != node.ref_seq),
               len(node.mutations), node.mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected
    assert 1.0*diff_count/seq_len<2*(1.0*mut_count/seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)

    print ("Difference between reference and inferred LH:", (LH - LH_p).sum())
    # deviations in either direction have to be rejected
    assert abs((LH - LH_p).sum())<1e-9

    return myTree
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True,
                        infer_gtr=True, root_state=None, missing='?'):
    """
    Infer ancestral states of a discrete trait with a GTR-like model.

    Parameters
    ----------
    tree : str
        name of the newick tree file
    seq_meta : dict
        meta data of the sequences, keyed by sequence name
    field : str, optional
        meta data field holding the trait
    confidence : bool, optional
        if True, attach ``<field>_entropy`` and ``<field>_confidence`` to
        every node
    infer_gtr : bool, optional
        passed on to ``TreeAnc.infer_ancestral_sequences``
    root_state : None, optional
        if given, this state is added to the list of possible states
    missing : str, optional
        value that is to be interpreted as missing data, default='?'

    Returns
    -------
    tt : TreeAnc
        treetime instance whose tree nodes carry the inferred trait
    alphabet : dict
        mapping of the one-letter codes used internally to trait values

    ``(None, None)`` is returned if there are no states, only one state, or
    more than 180 states.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {n.name:n for n in T.get_terminals()}

    # Determine alphabet only counting tips in the tree. The value that marks
    # missing data is not a state of its own -- it is mapped to the ambiguous
    # character below.
    places = set()
    for name, meta in seq_meta.items():
        if name in nodes and field in meta and meta[field] != missing:
            places.add(meta[field])
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now). States are encoded as consecutive
    # characters starting at 'A' (chr(65)), i.e. beyond '-' (ord('-')==45)
    places = sorted(places)
    nc = len(places)
    if nc>180:
        print("ERROR: geo_inference: can't have more than 180 places!", file=sys.stderr)
        return None,None
    if nc==1:
        print("WARNING: geo_inference: only one place found -- set every internal node to %s!"%places[0], file=sys.stderr)
        return None,None
    if nc==0:
        print("ERROR: geo_inference: list of places is empty!", file=sys.stderr)
        return None,None

    # set up model
    alphabet = {chr(65+i):place for i,place in enumerate(places)}
    model = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)),
                       alphabet = np.array(sorted(alphabet.keys())))

    # the character following the last state encodes missing data
    missing_char = chr(65+nc)
    alphabet[missing_char]=missing
    model.profile_map[missing_char] = np.ones(nc)
    model.ambiguous = missing_char
    alphabet_rev = {v:k for k,v in alphabet.items()}

    # construct pseudo alignment
    pseudo_seqs = []
    for name, meta in seq_meta.items():
        if name in nodes:
            s=alphabet_rev[meta[field]] if field in meta else missing_char
            pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
    aln = MultipleSeqAlignment(pseudo_seqs)

    # set up treetime and infer
    from treetime import TreeAnc
    tt = TreeAnc(tree=tree, aln=aln, gtr=model, convert_upper=False, verbose=0)
    tt.use_mutation_length=False
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=5.0,
                                 marginal=True, normalized_rate=False)

    # attach inferred states as e.g. node.region = 'africa'
    for node in tt.tree.find_clades():
        setattr(node, field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        for node in tt.tree.find_clades():
            pdis = node.marginal_profile[0]
            S = -np.sum(pdis*np.log(pdis+TINY))

            marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))]
            marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods
            marginal = [(a, b) for a, b in marginal if b > 0.001][:4] #only take stuff over .1% and the top 4 elements
            conf = {a:b for a,b in marginal}
            setattr(node, field + "_entropy", S)
            setattr(node, field + "_confidence", conf)

    return tt, alphabet
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True,
                        infer_gtr=True, root_state=None, missing='?',
                        sampling_bias_correction=None):
    """
    Infer likely ancestral states of a discrete character assuming a time reversible model.

    Parameters
    ----------
    tree : str
        name of tree file
    seq_meta : dict
        meta data associated with sequences
    field : str, optional
        meta data field to use
    confidence : bool, optional
        calculate confidence values for inferences
    infer_gtr : bool, optional
        infer a GTR model for trait transitions (otherwise uses a flat model with rate 1)
    root_state : None, optional
        force the state of the root node (currently not implemented; the
        state is only added to the list of possible states)
    missing : str, optional
        character that is to be interpreted as missing data, default='?'
    sampling_bias_correction : float, optional
        if given (and non-zero), the overall rate of the inferred model is
        multiplied by this factor and the ancestral states are inferred again

    Returns
    -------
    T : Phylo.Tree
        Biopython tree whose nodes carry the inferred trait
    gtr : treetime.GTR
        GTR model (None if only a single state was found)
    alphabet : dict
        mapping of the one-letter codes used internally to character states

    ``(None, None, None)`` is returned if there are no states or more than
    180 states.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {n.name:n for n in T.get_terminals()}

    # Determine alphabet only counting tips in the tree. The value that marks
    # missing data is not a state of its own -- it is mapped to the ambiguous
    # character below.
    places = set()
    for name, meta in seq_meta.items():
        if name in nodes and field in meta and meta[field] != missing:
            places.add(meta[field])
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now). States are encoded as consecutive
    # characters starting at 'A' (chr(65)), i.e. beyond '-' (ord('-')==45)
    places = sorted(places)
    nc = len(places)
    if nc>180:
        print("ERROR: geo_inference: can't have more than 180 places!", file=sys.stderr)
        return None,None,None
    elif nc==0:
        print("ERROR: geo_inference: list of places is empty!", file=sys.stderr)
        return None,None,None
    elif nc==1:
        print("WARNING: geo_inference: only one place found -- set every internal node to %s!"%places[0], file=sys.stderr)
        # nothing to infer: every node is in the single state with certainty
        alphabet = {'A':places[0]}
        alphabet_values = ['A']
        gtr = None
        for node in T.find_clades():
            node.sequence=['A']
            node.marginal_profile=np.array([[1.0]])
    else:
        # set up model
        alphabet = {chr(65+i):place for i,place in enumerate(places)}
        model = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)),
                           alphabet = np.array(sorted(alphabet.keys())))

        # the character following the last state encodes missing data
        missing_char = chr(65+nc)
        alphabet[missing_char]=missing
        model.profile_map[missing_char] = np.ones(nc)
        model.ambiguous = missing_char
        alphabet_rev = {v:k for k,v in alphabet.items()}

        # construct pseudo alignment
        pseudo_seqs = []
        for name, meta in seq_meta.items():
            if name in nodes:
                s=alphabet_rev[meta[field]] if field in meta else missing_char
                pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
        aln = MultipleSeqAlignment(pseudo_seqs)

        # set up treetime and infer
        from treetime import TreeAnc
        tt = TreeAnc(tree=tree, aln=aln, gtr=model, convert_upper=False, verbose=0)
        tt.use_mutation_length = False
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=1.0,
                                     marginal=True, normalized_rate=False)

        if sampling_bias_correction:
            # rescale the overall rate and redo the inference with the fixed model
            tt.gtr.mu *= sampling_bias_correction
            tt.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                         marginal=True, normalized_rate=False)

        T = tt.tree
        gtr = tt.gtr
        alphabet_values = tt.gtr.alphabet

    # attach inferred states as e.g. node.region = 'africa'
    for node in T.find_clades():
        setattr(node, field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        for node in T.find_clades():
            pdis = node.marginal_profile[0]
            S = -np.sum(pdis*np.log(pdis+TINY))

            marginal = [(alphabet[alphabet_values[i]], pdis[i]) for i in range(len(alphabet_values))]
            marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods
            marginal = [(a, b) for a, b in marginal if b > 0.001][:4] #only take stuff over .1% and the top 4 elements
            conf = {a:b for a,b in marginal}
            setattr(node, field + "_entropy", S)
            setattr(node, field + "_confidence", conf)

    return T, gtr, alphabet
def run(args):
    """
    Refine a tree (optionally inferring a time tree) and export tree and node data.

    Parameters
    ----------
    args : object
        parsed command line arguments. The attributes read here are
        ``alignment``, ``tree``, ``timetree``, ``vcf_reference``,
        ``output_tree``, ``output_node_data``, ``metadata``, ``year_bounds``,
        ``date_format``, ``root``, ``date_confidence``, ``coalescent``,
        ``date_inference``, ``branch_length_inference``, ``clock_rate`` and
        ``clock_filter_iqd``.

    Returns
    -------
    int
        0 on success, -1 if the input is unusable, 1 if writing the tree or
        the node data did not succeed
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check if tree is provided an can be read
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            node_data['input_tree'] = args.tree
            break
        except Exception:
            # not readable in this format, try the next one
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return -1

    if not args.alignment:
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return -1
        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # default output names are derived from the alignment, or from the tree
    # if no alignment was given
    default_stem = '.'.join((args.alignment or args.tree).split('.')[:-1])

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    else:
        tree_fname = default_stem + '_tt.nwk'

    if args.timetree:
        # load meta data and covert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                    reroot=args.root or 'best',
                    Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
                    use_marginal = args.date_inference == 'marginal',
                    branch_length_inference = args.branch_length_inference or 'auto',
                    clock_rate=args.clock_rate, clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept/tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to",tree_fname, file=sys.stdout)
    if args.output_node_data:
        node_data_fname = args.output_node_data
    else:
        node_data_fname = default_stem + '.node_data.json'
    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to",node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
def run(args):
    """
    Refine a tree (optionally inferring a time tree) and export tree and node data.

    Parameters
    ----------
    args : object
        parsed command line arguments. Besides the input and output file
        names (``tree``, ``alignment``, ``vcf_reference``, ``metadata``,
        ``output_tree``, ``output_node_data``) this reads ``seed``,
        ``timetree``, ``divergence_units``, ``root``, ``keep_root`` and the
        options that are handed to ``refine``.

    Returns
    -------
    int
        0 on success, 1 if the input is unusable, an inference step failed
        or the tree could not be written

    Raises
    ------
    TypeError
        if a rooting option that requires a time tree is requested without
        inferring a time tree
    """
    if args.seed is not None:
        np.random.seed(args.seed)

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    if not args.alignment:
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference",
                  file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print("ERROR: alignment is required for divergence in units of mutations",
                  file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name,
                                            name=n.name, description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments",
                  file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
    else:
        tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

    if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root: # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load meta data and covert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction",
                  file=sys.stderr)
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(
            tree=T, aln=aln, ref=ref, dates=dates,
            confidence=args.date_confidence,
            reroot=args.root, # or 'best', # We now have a default in param spec - this just adds confusion.
            Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            precision='auto' if args.precision is None else args.precision,
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(
                    gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x],
                                        [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y],
                                        [float(y) for y in conf[1]]]
            except Exception:
                # report failures of the inference, but let KeyboardInterrupt
                # and SystemExit propagate
                print("ERROR: skyline optimization by TreeTime has failed.",
                      file=sys.stderr)
                return 1

        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print("Warning: To root without inferring a timetree, you must specify an explicit outgroup.")
                print("\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n")
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)
    if args.divergence_units == 'mutations-per-site': #default
        pass
    elif args.divergence_units == 'mutations':
        if not args.timetree:
            # mutations are only available after ancestral reconstruction
            tt.infer_ancestral_sequences()
        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                # states differ only if their profiles share no nucleotide
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = len([
                position for ancestral, position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            ])

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit", args.divergence_units, "not supported!",
              file=sys.stderr)
        return 1

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = '.'.join(args.alignment.split('.')[:-1]) + '.node_data.json'
    else:
        node_data_fname = '.'.join(args.tree.split('.')[:-1]) + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1
def test_seq_joint_reconstruction_correct():
    """
    Simulate sequence evolution on a tiny tree and check the joint
    reconstruction against the simulated truth.

    A random root sequence is evolved along every branch, the leaf sequences
    are handed to TreeAnc as alignment and the internal sequences are
    reconstructed with the joint ML method. The test asserts that

     - the fraction of wrongly inferred sites stays below twice the squared
       density of simulated mutations, and
     - the likelihood returned by `ancestral_likelihood()` agrees with
       `tree.sequence_LH` computed during the reconstruction.

    Returns the TreeAnc instance so that it can be inspected interactively.
    """
    from treetime import TreeAnc, GTR
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    try:
        from StringIO import StringIO  # python2.x
    except ImportError:  # python3.x
        from io import StringIO
    import numpy as np
    try:
        from itertools import izip
    except ImportError:  # python3.x
        izip = zip
    from collections import defaultdict

    tiny_tree = Phylo.read(
        StringIO("((A:.060,B:.01200)C:.020,D:.0050)E:.004;"), 'newick')
    # NOTE: the frequencies passed here do not sum to one; all sampling below
    # uses mygtr.Pi rather than this raw vector.
    mygtr = GTR.custom(alphabet=np.array(['A', 'C', 'G', 'T']),
                       pi=np.array([0.15, 0.95, 0.05, 0.3]),
                       W=np.ones((4, 4)))

    myTree = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=None, verbose=4)

    # simulate evolution, set resulting sequence as ref_seq
    tree = myTree.tree
    seq_len = 400
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi,
                                         size=seq_len)
    print("Root sequence: " + ''.join(tree.root.ref_seq))
    mutation_list = defaultdict(list)
    for node in tree.find_clades():
        # link children to their parent so that node.up can be used below
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            # only the root carries a sequence at this point
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize profile so that every row is a probability distribution
        p = (p.T / p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])
        node.ref_mutations = [
            (anc, pos, der) for pos, (anc, der) in enumerate(
                izip(node.up.ref_seq, node.ref_seq)) if anc != der
        ]
        for anc, pos, der in node.ref_mutations:
            print(pos)
            mutation_list[pos].append((node.name, anc, der))
        print(node.name, len(node.ref_mutations), node.ref_mutations)

    # set as the starting sequences to the terminal nodes:
    alnstr = "".join(">" + leaf.name + "\n" + ''.join(leaf.ref_seq) + '\n'
                     for leaf in tree.get_terminals())
    print(alnstr)
    myTree.aln = AlignIO.read(StringIO(alnstr), 'fasta')
    myTree._attach_sequences_to_nodes()
    # reconstruct ancestral sequences:
    myTree._ml_anc_joint(debug=True)

    diff_count = 0
    mut_count = 0
    for node in myTree.tree.find_clades():
        n_diff = np.sum(node.sequence != node.ref_seq)
        if node.up is not None:
            mut_count += len(node.ref_mutations)
            # count the sites that were inferred wrongly (the previous
            # `== 0` added one for every node that matched instead)
            diff_count += n_diff
            if n_diff:
                print(
                    "%s: True sequence does not equal inferred sequence. parent %s"
                    % (node.name, node.up.name))
            else:
                print("%s: True sequence equals inferred sequence. parent %s" %
                      (node.name, node.up.name))
        mutations = getattr(node, 'mutations', [])
        print(node.name, n_diff, np.where(node.sequence != node.ref_seq),
              len(mutations), mutations)

    # the assignment of mutations to the root node is probabilistic. Hence some differences are expected
    assert 1.0 * diff_count / seq_len < 2 * (1.0 * mut_count / seq_len)**2

    # prove the likelihood value calculation is correct
    LH = myTree.ancestral_likelihood()
    LH_p = (myTree.tree.sequence_LH)
    print("Difference between reference and inferred LH:", (LH - LH_p).sum())
    # compare the magnitude: a large negative difference is a failure as well
    assert np.abs((LH - LH_p).sum()) < 1e-9
    return myTree
class mpm_tree(object):
    '''
    class that aligns a set of sequences and infers a tree

    The external tools (mafft, muscle, fasttree, raxml) are run inside the
    scratch directory `self.run_dir`. Each step creates that directory,
    changes into it, always changes back to the previous working directory
    and removes the scratch directory once the step succeeded.
    '''

    def __init__(self, cluster_seq_filepath, **kwarks):
        '''
        Parameters:
         - cluster_seq_filepath: fasta file with the sequences of one cluster;
           the file name up to '.fna' becomes self.clusterID
         - speciesID (keyword, optional): prefix of the scratch directory
           name; defaults to the third-last component of the path
         - run_dir (keyword, optional): scratch directory to use instead of
           a freshly named one below 'tmp/'
        '''
        self.clusterID = cluster_seq_filepath.split('/')[-1].split('.fna')[0]
        if 'speciesID' in kwarks:
            folderID = kwarks['speciesID']
        else:
            folderID = cluster_seq_filepath.split('/')[-3]
        self.seqs = {x.id: x for x in SeqIO.parse(cluster_seq_filepath, 'fasta')}
        if 'run_dir' not in kwarks:
            import random
            # time stamp plus random number keep parallel runs apart
            self.run_dir = 'tmp/' + '_'.join([
                folderID, 'tmp', time.strftime('%H%M%S', time.gmtime()),
                str(random.randint(0, 100000000))])
        else:
            self.run_dir = kwarks['run_dir']
        self.nuc = True

    def _enter_run_dir(self):
        ''' create the scratch directory, change into it and return the previous cwd '''
        cwd = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        return cwd

    def codon_align(self, alignment_tool="mafft", prune=True, discard_premature_stops=False):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
         - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
         - prune, discard_premature_stops: sequences with a stop codon before
           the last position are dropped only if discard_premature_stops is
           set and prune is not False

        Raises ValueError for any other alignment_tool (this used to surface
        as a NameError after the scratch directory had been entered).
        '''
        if alignment_tool not in ('mafft', 'muscle'):
            print('Alignment tool not supported:'+alignment_tool)
            raise ValueError('Alignment tool not supported:'+alignment_tool)

        cwd = self._enter_run_dir()
        try:
            # translate
            aa_seqs = {}
            for seq in self.seqs.values():
                tempseq = seq.seq.translate(table="Bacterial")
                # use only sequences that translate without trouble
                if not discard_premature_stops or '*' not in str(tempseq)[:-1] or prune==False:
                    aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                else:
                    print(seq.id,"has premature stops, discarding")

            tmpfname = 'temp_in.fasta'
            SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

            if alignment_tool == 'mafft':
                os.system('mafft --reorder --amino temp_in.fasta 1> temp_out.fasta')
                aln_aa = AlignIO.read('temp_out.fasta', "fasta")
            else:  # muscle
                from Bio.Align.Applications import MuscleCommandline
                aligned_fname = tmpfname[:-5]+'aligned.fasta'
                cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
                cline()
                aln_aa = AlignIO.read(aligned_fname, "fasta")

            # generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        finally:
            # never leave the process inside the scratch directory
            os.chdir(cwd)
        remove_dir(self.run_dir)

    def align(self):
        '''
        align sequences in self.seqs using mafft and store the result in self.aln
        '''
        cwd = self._enter_run_dir()
        try:
            SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
            os.system('mafft --reorder --anysymbol temp_in.fasta 1> temp_out.fasta 2> mafft.log')
            self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            os.chdir(cwd)
        remove_dir(self.run_dir)

    def _raxml_refine(self, raxml_time_limit, out_fname):
        '''
        refine 'initial_tree.newick' with RAxML inside the current directory
        and write the result to out_fname
        '''
        import subprocess
        if raxml_time_limit > 0:
            tmp_tree = Phylo.read('initial_tree.newick', 'newick')
            resolve_iter = 0
            resolve_polytomies(tmp_tree)
            while (not tmp_tree.is_bifurcating()) and (resolve_iter < 10):
                resolve_iter += 1
                resolve_polytomies(tmp_tree)
            Phylo.write(tmp_tree, 'initial_tree.newick', 'newick')
            AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
            print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
            # using exec to be able to kill process
            end_time = time.time() + int(raxml_time_limit*3600)
            process = subprocess.Popen("exec raxml -f d -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
            while time.time() < end_time:
                if os.path.isfile('RAxML_result.topology'):
                    break
                time.sleep(10)
            process.terminate()

            # glob returns files in arbitrary order; sort by modification
            # time so that the last entry really is the latest checkpoint
            checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"),
                                      key=os.path.getmtime)
            if os.path.isfile('RAxML_result.topology'):
                checkpoint_files.append('RAxML_result.topology')
            if checkpoint_files:
                shutil.copy(checkpoint_files[-1], 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')
        else:
            shutil.copy("initial_tree.newick", 'raxml_tree.newick')

        try:
            print("RAxML branch length optimization")
            os.system("raxml -f e -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
            shutil.copy('RAxML_result.branches', out_fname)
        except EnvironmentError:
            # os.system does not raise; a missing result file shows up here
            print("RAxML branch length optimization failed")
            shutil.copy('raxml_tree.newick', out_fname)

    def build(self, root='midpoint', raxml=True, fasttree_program='fasttree', raxml_time_limit=0.5, treetime_used=True):
        '''
        build a phylogenetic tree using fasttree and raxML (optional)
        based on nextflu tree building pipeline

        Parameters:
         - root: not used by this method
         - raxml: if False the fasttree tree is passed to
           polytomies_midpointRooting; otherwise RAxML refines it when the
           alignment holds more than 3 distinct sequence ids
         - fasttree_program: name of the fasttree executable
         - raxml_time_limit: hours granted to the RAxML topology search;
           values <= 0 skip that search
         - treetime_used: load the tree into a TreeAnc instance (self.tt)
           instead of reading it with Bio.Phylo

        Sets self.tree (and self.tt) and self.is_timetree.
        '''
        cwd = self._enter_run_dir()
        try:
            AlignIO.write(self.aln, 'origin.fasta', 'fasta')
            name_translation = make_strains_unique(self.aln)
            AlignIO.write(self.aln, 'temp.fasta', 'fasta')

            tree_cmd = [fasttree_program]
            if self.nuc:
                tree_cmd.append("-nt")
            tree_cmd.append("temp.fasta 1> initial_tree.newick 2> fasttree.log")
            os.system(" ".join(tree_cmd))

            out_fname = "tree_infer.newick"
            if raxml==False:
                polytomies_midpointRooting('initial_tree.newick', out_fname, self.clusterID)
            elif len(set([x.id for x in SeqIO.parse('temp.fasta', 'fasta')])) > 3:
                ## only for tree with >3 strains
                self._raxml_refine(raxml_time_limit, out_fname)
            # NOTE(review): with raxml enabled and <= 3 strains nothing writes
            # out_fname, so reading the tree below fails -- confirm intended.

            if treetime_used:
                # load the resulting tree as a treetime instance
                from treetime import TreeAnc
                self.tt = TreeAnc(tree=out_fname, aln=self.aln, gtr='Jukes-Cantor', verbose=0)
                # provide short cut to tree
                self.tree = self.tt.tree
            else:
                self.tree = Phylo.read(out_fname, 'newick')
            self.tree.root.branch_length = 0.0001
            # revert names that conflicted with newick format
            restore_strain_name(name_translation, self.aln)
            restore_strain_name(name_translation, self.tree.get_terminals())

            for node in self.tree.find_clades():
                if node.name is not None:
                    if node.name.startswith('NODE_')==False:
                        node.ann = node.name
                else:
                    node.name = 'NODE_0'
        finally:
            os.chdir(cwd)
        remove_dir(self.run_dir)
        self.is_timetree = False

    def ancestral(self, translate_tree = False):
        '''
        infer ancestral nucleotide sequences using maximum likelihood
        and translate the resulting sequences (+ terminals) to amino acids
        (stored as node.aa_sequence) if translate_tree is set
        '''
        try:
            self.tt.reconstruct_anc(method='ml')
        except Exception:
            # best effort: report and carry on with whatever is on the tree
            print("trouble at self.tt.reconstruct_anc(method='ml')")

        if translate_tree:
            for node in self.tree.find_clades():
                node.aa_sequence = np.fromstring(str(self.translate_seq("".join(node.sequence))), dtype='S1')

    def refine(self, CDS = True):
        '''
        determine mutations on each branch and attach as string to the branches

        node.muts joins the entries of node.mutations that contain no gap;
        with CDS set, node.aa_muts lists the amino acid changes relative to
        the parent (1-based positions, gaps skipped). The root is left alone.
        '''
        for node in self.tree.find_clades():
            if node.up is None:
                continue
            node.muts = ",".join(["".join(map(str, x)) for x in node.mutations if '-' not in x])
            if CDS == True:
                node.aa_muts = ",".join([
                    anc+str(pos+1)+der
                    for pos, (anc, der) in enumerate(zip(node.up.aa_sequence, node.aa_sequence))
                    if anc != der and '-' not in anc and '-' not in der])

    def translate_seq(self, seq):
        '''
        custom translation sequence that handles gaps

        seq is either a plain string or an object with a .seq attribute;
        returns a Seq in which untranslatable codons ('X') appear as '-'
        '''
        try:
            text_types = (str, unicode)
        except NameError:  # python3.x has no separate unicode type
            text_types = (str,)
        if isinstance(seq, text_types):
            str_seq = seq
        else:
            str_seq = str(seq.seq)
        try:
            # soon not needed as future biopython version will translate --- into -
            tmp_seq = Seq(str(Seq(str_seq.replace('---', 'NNN')).translate(table="Bacterial")).replace('X','-'))
        except Exception:
            # fall back to masking every single gap character
            tmp_seq = Seq(str(Seq(str_seq.replace('-', 'N')).translate(table="Bacterial")).replace('X','-'))
        return tmp_seq

    def translate(self):
        '''
        translate the nucleotide alignment to an amino acid alignment (self.aa_aln)
        '''
        aa_seqs = [SeqRecord(seq=self.translate_seq(seq), id=seq.id,
                             name=seq.name, description=seq.description)
                   for seq in self.aln]
        self.aa_aln = MultipleSeqAlignment(aa_seqs)

    def mean_std_seqLen(self):
        '''
        return mean and standard deviation of the lengths of self.seqs
        '''
        seqLen_arr = np.array([len(seq) for seq in self.seqs.values()])
        return np.mean(seqLen_arr, axis=0), np.std(seqLen_arr, axis=0)

    def paralogy_statistics(self):
        '''
        return the number of paralog nodes and the branch length of the best split
        '''
        best_split = find_best_split(self.tree)
        return len(best_split.para_nodes), best_split.branch_length

    def diversity_statistics_nuc(self):
        '''
        calculate the mean per-site diversity (1 - sum of squared frequencies)
        of the nucleotide alignment; sets self.af_nuc and self.diversity_nuc
        '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af_nuc = calc_af(self.aln, nuc_alpha)
        # the last two rows of the frequency table are left out
        # (presumably gap/ambiguous states -- confirm against calc_af)
        is_valid = self.af_nuc[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_nuc[:-2,is_valid]/self.af_nuc[:-2,is_valid].sum(axis=0)
        self.diversity_nuc = np.mean(1.0-(tmp_af**2).sum(axis=0))

    def diversity_statistics_aa(self):
        '''
        calculate the mean per-site diversity (1 - sum of squared frequencies)
        of the amino acid alignment; sets self.af_aa and self.diversity_aa
        '''
        # this method reads self.aa_aln, so that is what has to exist
        if not hasattr(self, "aa_aln"):
            print("calculate alignment first")
            return
        self.af_aa = calc_af(self.aa_aln, aa_alpha)
        is_valid = self.af_aa[:-2].sum(axis=0) > 0.5
        tmp_af = self.af_aa[:-2,is_valid]/self.af_aa[:-2,is_valid].sum(axis=0)
        self.diversity_aa = np.mean(1.0-(tmp_af**2).sum(axis=0))

    def mutations_to_branch(self):
        '''
        build self.mut_to_branch, mapping every mutation to the list of
        non-root nodes whose branch carries it
        '''
        self.mut_to_branch = defaultdict(list)
        for node in self.tree.find_clades():
            if node.up is not None:
                for mut in node.mutations:
                    self.mut_to_branch[mut].append(node)

    def reduce_alignments(self,RNA_specific=False):
        '''
        store consensus-reduced alignments as self.aln_reduced and
        self.aa_aln_reduced: the first record is the consensus, all other
        records show '.' wherever they agree with it. With RNA_specific the
        amino acid alignment is reset to None and not reduced.
        '''
        if RNA_specific:
            self.aa_aln = None
            self.af_aa = None
        else:
            self.af_aa = calc_af(self.aa_aln, aa_alpha)

        for attr, aln, alpha, freq in [["aln_reduced", self.aln, nuc_alpha, self.af_nuc],
                                       ["aa_aln_reduced", self.aa_aln, aa_alpha, self.af_aa]]:
            if RNA_specific and attr == "aa_aln_reduced":
                continue  # no reduced amino alignment for RNA
            try:
                consensus = np.array(list(alpha))[freq.argmax(axis=0)]
                aln_array = np.array(aln)
                aln_array[aln_array==consensus] = '.'
                new_seqs = [SeqRecord(seq=Seq("".join(consensus)), name="consensus", id="consensus")]
                for si, seq in enumerate(aln):
                    new_seqs.append(SeqRecord(seq=Seq("".join(aln_array[si])), name=seq.name,
                                              id=seq.id, description=seq.description))
                setattr(self, attr, MultipleSeqAlignment(new_seqs))
            except Exception:
                print("sf_geneCluster_align_MakeTree: aligment reduction failed")

    def export(self, path = '', extra_attr = None, RNA_specific=False):
        '''
        write tree (newick and json) and the full and reduced alignments,
        using path+self.clusterID as file name prefix

        Parameters:
         - path: prefix prepended to every output file name
         - extra_attr: node attributes passed on to tree_to_json; defaults to
           ['aa_muts','annotation','branch_length','name','accession']
         - RNA_specific: skip the amino acid alignments
        '''
        if extra_attr is None:
            extra_attr = ['aa_muts','annotation','branch_length','name','accession']

        ## write tree
        Phylo.write(self.tree, path+self.clusterID+'.nwk', 'newick')

        ## processing node name
        for node in self.tree.get_terminals():
            # e.g. NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
            node.accession = node.ann.split('|')[0]
            node.name = node.ann.split('-')[0]
            annotation = node.ann.split('-',2)
            if len(annotation) == 3:
                node.annotation = annotation[2]
            else:
                node.annotation = annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            # a missing branch length is clamped as well
            if n.branch_length is None or n.branch_length < 1e-6:
                n.branch_length = 1e-6
        timetree_fname = path+self.clusterID+'_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for i_aln in self.aln:
            i_aln.id = i_aln.id.replace('|','-',1)
        for i_alnr in self.aln_reduced:
            i_alnr.id = i_alnr.id.replace('|','-',1)

        AlignIO.write(self.aln, path+self.clusterID+'_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced, path+self.clusterID+'_na_aln_reduced.fa', 'fasta')

        if RNA_specific==False:
            for i_aa_aln in self.aa_aln:
                i_aa_aln.id = i_aa_aln.id.replace('|','-',1)
            for i_aa_alnr in self.aa_aln_reduced:
                i_aa_alnr.id = i_aa_alnr.id.replace('|','-',1)

            AlignIO.write(self.aa_aln, path+self.clusterID+'_aa_aln.fa', 'fasta')
            AlignIO.write(self.aa_aln_reduced, path+self.clusterID+'_aa_aln_reduced.fa', 'fasta')

        ## write seq json (switched off by the hard-coded flag)
        write_seq_json = 0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if hasattr(node, "longName")==False:
                        node.longName = node.name
                    elems[node.longName] = {}
                    nuc_dt = {pos:state for pos, (state, ancstate) in
                              enumerate(izip(node.sequence.tostring(), self.tree.root.sequence.tostring()))
                              if state != ancstate}
                    elems[node.longName]['nuc'] = nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = self.tree.root.sequence.tostring()

            self.sequences_fname = path+self.clusterID+'_seq.json'
            write_json(elems, self.sequences_fname, indent=None)
default=False, action='store_true', help='infer substitution model') parser.add_argument('--keep_overhangs', default=False, action='store_true', help='do not fill terminal gaps') params = parser.parse_args() ########################################################################### ### ANCESTRAL RECONSTRUCTION ########################################################################### model = 'aa' if params.prot else 'Jukes-Cantor' treeanc = TreeAnc(params.tree, aln=params.aln, gtr=model, verbose=4, fill_overhangs=not params.keep_overhangs) treeanc.infer_ancestral_sequences('ml', infer_gtr=params.infer_gtr, marginal=params.marginal) ########################################################################### ### OUTPUT and saving of results ########################################################################### if params.infer_gtr: print('\nInferred GTR model:') print(treeanc.gtr) outaln_name = '.'.join( params.aln.split('/')[-1].split('.')[:-1]) + '_ancestral.fasta'
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0):
    """
    Reconstruct a discrete trait on a tree by treating it as a one-letter
    sequence and running the marginal ancestral reconstruction of TreeAnc.

    Parameters
    ----------
    tree
        tree (or tree file) handed to TreeAnc
    traits : dict
        maps leaf names to their trait value; leaves that are absent and
        values equal to `missing_data` are treated as unknown
    missing_data
        marker of an unknown trait value
    pc
        pseudo count passed on to the GTR inference
    sampling_bias_correction
        if given, the inferred rate `mu` is multiplied by this factor and
        the reconstruction is repeated with the rescaled model
    weights : str, optional
        name of a tsv/csv file whose first two columns give state and weight;
        the normalized weights serve as fixed equilibrium frequencies, states
        without entry receive the mean weight. Anything that is not a file
        name is ignored.
    verbose
        verbosity handed to TreeAnc

    Returns
    -------
    (treeanc, letter_to_state, reverse_alphabet) on success, 1 if there are
    fewer than 2 or more than 180 states or if the reconstruction failed.
    """
    # the missing-data marker is no state of its own: counting it would add
    # a spurious state to the model and let single-state data pass the check
    unique_states = sorted(set(traits.values()) - {missing_data})
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i in range(nc)]
    missing_char = chr(65+nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char] = missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        sep = '\t' if weights.endswith('tsv') else ','
        tmp_weights = pd.read_csv(weights, sep=sep, skipinitialspace=True)
        # first column: state, second column: weight (by position)
        weights = dict(zip(tmp_weights.iloc[:, 0], tmp_weights.iloc[:, 1]))
        mean_weight = np.mean(list(weights.values()))
        weights = np.array([weights.get(c, mean_weight) for c in unique_states], dtype=float)
        weights /= weights.sum()
    else:
        # NOTE(review): weights supplied as anything but a file name are dropped
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    # the missing character is compatible with every state
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous = missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name, name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights)

    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction
        treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                          marginal=True, normalized_rate=False)

    return treeanc, letter_to_state, reverse_alphabet
def run(args):
    """
    Refine a tree with TreeTime and write the tree plus a JSON file with
    per-node data.

    Depending on `args` a time tree is inferred (`args.timetree`), ancestral
    sequences are reconstructed (`args.ancestral` in 'joint'/'marginal') or
    TreeAnc is only instantiated to name the internal nodes. The alignment
    may be a fasta file or, together with `args.vcf_reference`, a VCF file;
    without an alignment a dummy alignment is generated.

    Default output names are derived from the alignment name, or from the
    tree name if no alignment was given.

    Returns 0 on success and -1 if the tree cannot be read, a required input
    is missing or the tree could not be written.
    """
    import json

    def default_fname(suffix):
        # input name (alignment, else tree) without its last extension
        return '.'.join((args.alignment or args.tree).split('.')[:-1]) + suffix

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # check if tree is provided and can be read
    T = None  # stays None if no format matches
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # try the next format
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = [SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name,
                                    name=n.name, description='')
                for n in T.get_terminals()]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    if args.output:
        tree_fname = args.output
    else:
        tree_fname = default_fname('_tt.nwk')

    if args.timetree and T:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:
            # if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = timetree(
            tree=T, aln=aln, ref=ref, dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            Tc=args.coalescent if args.coalescent is not None else 0.01,  # Otherwise can't set to 0
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate, n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            attributes.extend(['sequence'])  # don't add sequences if VCF - huge!
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        # NOTE(review): `marginal` receives the string 'joint'/'marginal';
        # confirm that ancestral_sequence_inference expects the string.
        tt = ancestral_sequence_inference(
            tree=T, aln=aln, ref=ref, marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            attributes.extend(['sequence'])  # don't add sequences if VCF - huge!
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    if not T:
        return -1

    tree_success = Phylo.write(T, tree_fname, 'newick',
                               format_branch_length='%1.8f')

    if args.node_data:
        node_data_fname = args.node_data
    else:
        node_data_fname = default_fname('.node_data')
    # json.dump returns None, so its return value cannot signal success;
    # a failure to write raises instead
    with open(node_data_fname, 'w') as ofile:
        json.dump(tree_meta, ofile)

    # If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = default_fname('.vcf')
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    return 0 if tree_success else -1