def run(args):
    """Infer ancestral sequences on a tree and write the mutations as JSON.

    The tree is read from ``args.tree``. The alignment ``args.alignment`` is
    either passed through as a path or, when its name ends in ``.vcf`` /
    ``.vcf.gz``, loaded with ``read_vcf`` together with ``args.vcf_reference``.
    Per-node results and the reference sequence are written to a JSON file;
    optionally the node sequences are written as FASTA
    (``args.output_sequences``) and, for VCF input, a VCF containing the
    inferred sequences is written as well.

    Returns
    -------
    int
        0 on success; 1 if the tree cannot be read, a VCF alignment is given
        without a reference, or the installed TreeTime is older than 0.7.0.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    # Warn (but carry on) when not a single internal node carries a name.
    if all(n.name is None for n in T.get_nonterminals()):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Enforce treetime 0.7 or later
    # NOTE(review): distutils is deprecated (PEP 632) and absent from
    # Python 3.12+; this import needs a replacement before upgrading.
    from distutils.version import StrictVersion
    import treetime
    if StrictVersion(treetime.version) < StrictVersion('0.7.0'):
        print("ERROR: this version of augur requires TreeTime 0.7 or later.")
        return 1

    # Infer ambiguous bases if the user has requested that we infer them (either
    # explicitly or by default) and the user has not explicitly requested that
    # we keep them.
    infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs),
                                      infer_tips=infer_ambiguous)

    # TreeTime treats all characters that are not valid IUPAC nucleotide chars
    # as fully ambiguous (their profile sums to the number of states).
    # To clean up auspice output, we map all those to 'N'.
    n_states = tt.gtr.n_states
    character_map = {
        char: 'N' if profile.sum() == n_states else char
        for char, profile in tt.gtr.profile_map.items()
    }

    anc_seqs['nodes'] = collect_mutations_and_sequences(
        tt,
        full_sequences=not is_vcf,
        infer_tips=infer_ambiguous,
        character_map=character_map)

    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": compress_seq['reference']}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    out_name = get_json_name(
        args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        # Full sequences are only requested for FASTA input
        # (full_sequences=not is_vcf above), so the guard must also cover VCF
        # input given without --output-vcf; otherwise the "sequence" lookup
        # below is attempted on node records that presumably lack that key.
        if is_vcf or args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
def run(args):
    """Infer ancestral sequences on a tree and write the mutations as JSON.

    The tree is read from ``args.tree``. The alignment ``args.alignment`` is
    either passed through as a path or, when its name ends in ``.vcf`` /
    ``.vcf.gz``, loaded with ``read_vcf`` together with ``args.vcf_reference``.
    Per-node results and the reference sequence are written to a JSON file;
    optionally the node sequences are written as FASTA
    (``args.output_sequences``) and, for VCF input, a VCF containing the
    inferred sequences is written as well.

    Returns
    -------
    int
        0 on success; 1 if the tree cannot be read, a VCF alignment is given
        without a reference, or ``--keep-ambiguous`` is requested for FASTA
        input with a TreeTime older than 0.5.6.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    # Warn (but carry on) when not a single internal node carries a name.
    if all(n.name is None for n in T.get_nonterminals()):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils is deprecated (PEP 632) and absent from
    # Python 3.12+; this import needs a replacement before upgrading.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(
            treetime.version) < StrictVersion('0.5.6'):
        print(
            "ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."
            + "\nYour version is " + treetime.version +
            "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
        return 1

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": compress_seq['reference']}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    out_name = get_json_name(
        args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        # The warning states that sequence output is FASTA-only, so the guard
        # has to cover every VCF run, not only those that pass --output-vcf.
        # NOTE(review): presumably node records of a VCF run carry no
        # "sequence" entry, which would make the lookup below fail - confirm
        # against collect_sequences_and_mutations.
        if is_vcf or args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False,
                              report_ambiguous=False, timetree=False,
                              confidence=False):
    """Write reconstructed sequences, node dates and annotated trees to disk.

    All output names are formed by appending a fixed suffix to ``basename``.
    The nodes of ``tt.tree`` are modified in place: ``confidence`` is cleared,
    ``comment`` is rewritten, long tip names may be truncated and, when
    ``timetree`` is set, ``branch_length`` is finally replaced by
    ``mutation_length``.

    Parameters
    ----------
    tt
        Inference object providing ``tree``, ``aln``, ``gtr`` and ``seq_len``
        (presumably a TreeTime ``TreeAnc``/``TreeTime`` instance).
    basename : str
        Prefix prepended to every output file name.
    is_vcf : bool
        Write the sequences as VCF (after ``tt.recover_var_ambigs()``) instead
        of FASTA.
    zero_based : bool
        Report mutation positions starting at 0 instead of 1.
    report_ambiguous : bool
        Also list mutations from or to ``tt.gtr.ambiguous`` in node comments.
    timetree : bool
        Additionally write ``dates.tsv``, a time tree and a divergence tree.
    confidence : bool
        With ``timetree``, add the 90% max posterior region to ``dates.tsv``.
    """
    seq_info = is_vcf or tt.aln
    if is_vcf:
        tt.recover_var_ambigs()
        outaln_name = basename + 'ancestral_sequences.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), outaln_name)
    elif tt.aln:
        outaln_name = basename + 'ancestral_sequences.fasta'
        AlignIO.write(tt.get_reconstructed_alignment(), outaln_name, 'fasta')
    if seq_info:
        print("\n--- alignment including ancestral nodes saved as  \n\t %s\n"%outaln_name)

    # decorate tree with inferred mutations
    terminal_count = 0
    offset = 0 if zero_based else 1
    fh_dates = None
    # try/finally guarantees the dates file is flushed and closed, even when
    # decorating a node fails half way through the tree.
    try:
        if timetree:
            dates_fname = basename + 'dates.tsv'
            fh_dates = open(dates_fname, 'w')
            if confidence:
                fh_dates.write('#Lower and upper bound delineate the 90% max posterior region\n')
                fh_dates.write('#node\tdate\tnumeric date\tlower bound\tupper bound\n')
            else:
                fh_dates.write('#node\tdate\tnumeric date\n')

        for n in tt.tree.find_clades():
            if timetree:
                # dates are written before the node name is possibly truncated
                if confidence:
                    conf = tt.get_max_posterior_region(n, fraction=0.9)
                    fh_dates.write('%s\t%s\t%f\t%f\t%f\n'%(n.name, n.date, n.numdate, conf[0], conf[1]))
                else:
                    fh_dates.write('%s\t%s\t%f\n'%(n.name, n.date, n.numdate))

            n.confidence = None
            # due to a bug in older versions of biopython that truncated filenames in nexus export
            # we truncate them by hand and make them unique.
            # NOTE(review): bioversion is compared as a string, i.e.
            # lexicographically - confirm this is intended.
            if n.is_terminal() and len(n.name)>40 and bioversion<"1.69":
                n.name = n.name[:35]+'_%03d'%terminal_count
                terminal_count += 1

            n.comment = ''
            if seq_info and len(n.mutations):
                if report_ambiguous:
                    reported = n.mutations
                else:
                    reported = [(a, pos, d) for (a, pos, d) in n.mutations
                                if tt.gtr.ambiguous not in [a, d]]
                n.comment = '&mutations="' + ','.join([a+str(pos + offset)+d for (a, pos, d) in reported])+'"'
            if timetree:
                n.comment += (',' if n.comment else '&') + 'date=%1.2f'%n.numdate
    finally:
        if fh_dates is not None:
            fh_dates.close()

    # write tree to file
    fmt_bl = "%1.6f" if tt.seq_len<1e6 else "%1.8e"
    if timetree:
        outtree_name = basename + 'timetree.nexus'
        print("--- saved divergence times in \n\t %s\n"%dates_fname)
        Phylo.write(tt.tree, outtree_name, 'nexus')
    else:
        outtree_name = basename + 'annotated_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
    print("--- tree saved in nexus format as  \n\t %s\n"%outtree_name)

    if timetree:
        # second export: same tree with branch lengths in units of mutations
        for n in tt.tree.find_clades():
            n.branch_length = n.mutation_length
        outtree_name = basename + 'divergence_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
        print("--- divergence tree saved in nexus format as  \n\t %s\n"%outtree_name)
def run(args):
    """Refine a tree: name internal nodes and optionally infer times/sequences.

    Reads the tree from ``args.tree`` (newick, then nexus). Depending on
    ``args.timetree`` / ``args.ancestral`` it runs ``timetree``,
    ``ancestral_sequence_inference`` or only instantiates ``TreeAnc`` so that
    internal nodes get names. It then writes the tree as newick, the node
    data as JSON and, for VCF input after a reconstruction, a VCF that
    includes the inferred sequences.

    Returns
    -------
    int
        0 if the tree file was written, -1 on any reported error (unreadable
        tree, missing alignment/reference/metadata, or tree write failure).
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # check if tree is provided and can be read
    # T must start out as None: if both formats fail it would otherwise be
    # unbound and the check below would raise instead of reporting the error.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # best effort: fall through to the next format
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                id=n.name,
                                name=n.name,
                                description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Default output names are derived from the alignment path; without an
    # alignment (node-naming only) fall back to the tree path instead of
    # failing on None.
    default_stem = '.'.join((args.alignment or args.tree).split('.')[:-1])

    if args.output:
        tree_fname = args.output
    else:
        tree_fname = default_stem + '_tt.nwk'

    if args.timetree:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:
            # if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            # explicit None check, otherwise a coalescent rate of 0 can't be set
            Tc=args.coalescent if args.coalescent is not None else 0.01,
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    import json
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    if args.node_data:
        node_data_fname = args.node_data
    else:
        node_data_fname = default_stem + '.node_data'

    # json.dump returns None and signals failure by raising, so its return
    # value must not take part in the success check below.
    with open(node_data_fname, 'w') as ofile:
        json.dump(tree_meta, ofile)

    # If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    return 0 if tree_success else -1
def run(args):
    """Infer ancestral sequences on a tree and write them as JSON.

    Reads the tree from ``args.tree`` (newick, then nexus) and the alignment
    from ``args.alignment`` (a path, or a VCF loaded with ``read_vcf`` and
    ``args.vcf_reference``), runs ``ancestral_sequence_inference`` and writes
    the collected node data to ``args.output`` or
    ``<alignment stem>.anc_seqs.json``. For VCF input a VCF including the
    inferred sequences is written as well.

    Returns
    -------
    int
        0 on success; 1 if the tree cannot be read, a VCF alignment is given
        without a reference, or ``--keep-ambiguous`` is requested for FASTA
        input with a TreeTime older than 0.5.6.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    # check if tree is provided and can be read
    # T must start out as None: if both formats fail it would otherwise be
    # unbound and the check below would raise instead of reporting the error.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # best effort: fall through to the next format
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return 1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils is deprecated (PEP 632) and absent from
    # Python 3.12+; this import needs a replacement before upgrading.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(treetime.version) < StrictVersion('0.5.6'):
        print("ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."+
              "\nYour version is "+treetime.version+
              "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
        return 1

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs = not(args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    if args.output:
        anc_seqs_fname = args.output
    else:
        anc_seqs_fname = '.'.join(args.alignment.split('.')[:-1]) + '.anc_seqs.json'

    write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to",anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)

    return 0
def run(args):
    """Infer ancestral sequences on a tree and write them as JSON.

    Reads the tree from ``args.tree`` (newick, then nexus) and the alignment
    from ``args.alignment`` (a path, or a VCF loaded with ``read_vcf`` and
    ``args.vcf_reference``), runs ``ancestral_sequence_inference`` and writes
    the collected node data to ``args.output`` or
    ``<alignment stem>.anc_seqs.json``. For VCF input a VCF including the
    inferred sequences is written as well.

    Returns
    -------
    int
        0 if ``write_json`` reported success, 1 if it did not, and -1 if the
        tree cannot be read or a VCF alignment is given without a reference.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    # check if tree is provided and can be read
    # T must start out as None: if both formats fail it would otherwise be
    # unbound and the check below would raise instead of reporting the error.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # best effort: fall through to the next format
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    if args.output:
        anc_seqs_fname = args.output
    else:
        anc_seqs_fname = '.'.join(
            args.alignment.split('.')[:-1]) + '.anc_seqs.json'

    # NOTE(review): the exit code below relies on write_json returning a
    # truthy value on success - confirm against its definition.
    anc_seqs_success = write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to", anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    if anc_seqs_success:
        return 0
    else:
        return 1