def read_if_vcf(params):
    """
    Check whether the input alignment is a VCF file and read it in if so.

    Parameters
    ----------
    params : object
        Argument namespace. ``aln`` is read if present; for VCF input
        ``vcf_reference`` and ``aa`` are read as well, and ``gtr`` if present.

    Returns
    -------
    tuple or int
        ``(aln, ref, fixed_pi)``:

        * ``aln`` -- ``params.aln`` unchanged for non-VCF input (``None`` if
          the attribute is missing), otherwise the ``'sequences'`` entry of
          the ``read_vcf`` result.
        * ``ref`` -- the ``'reference'`` entry of the ``read_vcf`` result, or
          ``None`` for non-VCF input.
        * ``fixed_pi`` -- per-character frequencies of the reference, computed
          only for VCF input when no ``gtr`` is given or it is ``"infer"``;
          otherwise ``None``.

        ``-1`` is returned (after printing an error) when a VCF is given
        without a reference. This return value is kept for backward
        compatibility with existing callers.
    """
    ref = None
    fixed_pi = None
    # getattr instead of ``params.aln``: the original read the attribute
    # before its own ``hasattr`` guard, so a namespace without ``aln`` raised
    # AttributeError instead of falling through to the non-VCF result.
    aln = getattr(params, 'aln', None)

    if aln is None or not aln.lower().endswith(('.vcf', '.vcf.gz')):
        return aln, ref, fixed_pi

    if not params.vcf_reference:
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return -1

    compress_seq = read_vcf(aln, params.vcf_reference)
    aln = compress_seq['sequences']
    ref = compress_seq['reference']

    # if no GTR model was specified (or it is to be inferred), derive the
    # character frequencies from the reference sequence
    if not hasattr(params, 'gtr') or params.gtr == "infer":
        alpha = alphabets['aa'] if params.aa else alphabets['nuc']
        fixed_pi = [ref.count(base) / len(ref) for base in alpha]
        if fixed_pi[-1] == 0:
            # The last alphabet character does not occur in the reference:
            # give it a frequency of 0.05 and lower every entry by 0.01.
            # NOTE(review): presumably this avoids a zero frequency for the
            # last state while keeping the sum roughly constant -- confirm.
            fixed_pi[-1] = 0.05
            fixed_pi = [v - 0.01 for v in fixed_pi]

    return aln, ref, fixed_pi
def run(args):
    '''
    Annotate strains with sequence features and write them to a JSON file.

    Only VCF input is supported. ``args.ancestral_sequences`` (nucleotide
    VCF) and/or ``args.translations`` (translated VCF) are read together with
    their reference Fasta files, the features listed in ``args.features`` are
    looked up in them, and the result is written to ``args.output`` as
    ``{"nodes": ...}``.

    This should be modified to work on Fasta-input files!!

    Returns 1 if the input is not VCF or a required reference is missing;
    otherwise falls off the end (returns None) after writing the JSON.
    '''
    print("This method may change in future! Please use 'augur sequence-traits -h' to check the latest options.")

    vcf_suffixes = ('.vcf', '.vcf.gz')

    def is_vcf_path(path):
        # unset (None / empty) paths are never VCF
        return bool(path) and path.lower().endswith(vcf_suffixes)

    ## check file format and read in sequences
    if not (is_vcf_path(args.ancestral_sequences) or is_vcf_path(args.translations)):
        # TO-DO fill in fasta-format processing
        # Previously this branch fell through with ``compress_seq`` unbound
        # and crashed with a NameError further down; fail cleanly instead.
        print("\nERROR: Unfortunately this feature currently only works with VCF input! It will be expanded to work with Fasta-input soon.")
        return 1

    if ((args.ancestral_sequences and not args.vcf_reference) or
            (args.translations and not args.vcf_translate_reference)):
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return 1

    compress_seq = defaultdict(dict)
    if args.translations:
        compress_seq = read_in_translate_vcf(args.translations, args.vcf_translate_reference)
    if args.ancestral_sequences:
        compress_seq["nuc"] = read_vcf(args.ancestral_sequences, args.vcf_reference)

    features = read_in_features(args.features)
    annotations = annotate_strains(features, compress_seq)
    #convert the annotations into string label that auspice can display
    seq_features = attach_features(annotations, args.label, args.count)

    #write out json
    with open(args.output, 'w') as results:
        json.dump({"nodes": seq_features}, results, indent=1, sort_keys=True)
def run(args):
    '''
    Annotate strains with sequence features; VCF input only.

    Reads the nucleotide VCF (``args.ancestral_sequences``) and/or the
    translated VCF (``args.translations``), annotates the strains with the
    features from ``args.features`` and writes ``{"nodes": ...}`` to the file
    name provided by ``get_json_name``.

    This should be modified to work on Fasta-input files!!

    Returns 1 when the input is not VCF or a reference Fasta is missing;
    otherwise returns None after writing the JSON.
    '''
    print("This method may change in future! Please use 'augur sequence-traits -h' to check the latest options.")
    print("Unfortunately this method currently only works with VCF input.")

    vcf_endings = ('.vcf', '.vcf.gz')

    def names_vcf(path):
        # Falsy paths are handed back as-is so the ``or`` below short-circuits
        # in the same order as a plain ``path and ...`` test would.
        return path and path.lower().endswith(vcf_endings)

    ## check file format
    if not (names_vcf(args.ancestral_sequences) or names_vcf(args.translations)):
        # TO-DO fill in fasta-format processing
        print("\nERROR: Unfortunately this feature currently only works with VCF input! It will be expanded to work with Fasta-input soon.")
        return 1

    nuc_reference_missing = args.ancestral_sequences and not args.vcf_reference
    if nuc_reference_missing or (args.translations and not args.vcf_translate_reference):
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return 1

    ## read in sequences
    if args.translations:
        compress_seq = read_in_translate_vcf(args.translations, args.vcf_translate_reference)
    else:
        compress_seq = defaultdict(dict)
    if args.ancestral_sequences:
        compress_seq["nuc"] = read_vcf(args.ancestral_sequences, args.vcf_reference)

    annotations = annotate_strains(read_in_features(args.features), compress_seq)
    #convert the annotations into string label that auspice can display
    seq_features = attach_features(annotations, args.label, args.count)

    #write out json
    out_name = get_json_name(args)
    write_json({"nodes": seq_features}, out_name)
    print("sequence traits written to", out_name, file=sys.stdout)
def run(args):
    """
    Build a tree from a Fasta alignment or from a VCF plus reference Fasta.

    For VCF input a reduced Fasta is written with
    ``write_out_informative_fasta`` and handed to the tree builder; it is
    deleted afterwards unless ``args.keep_vcf_fasta`` is set. The builder is
    chosen by ``args.method`` ('raxml', 'iqtree', anything else: fasttree).
    The tree is written in newick format to ``args.output`` or, if that is
    unset, to the alignment name with its last extension replaced by '.nwk'.

    Returns -1 if the VCF reference is missing or the builder returned no
    tree; otherwise returns None after writing the tree.
    """
    # check alignment type, read in if VCF
    vcf_input = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if vcf_input:
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1
        compressed = read_vcf(args.alignment, args.vcf_reference)
        # both entries are looked up right away, so a result lacking them
        # fails here rather than later
        _sequences, _reference = compressed['sequences'], compressed['reference']

    started = time.time()

    tree_fname = args.output
    if not tree_fname:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '.nwk'

    # construct reduced alignment if needed
    if vcf_input:
        reduced_fasta = write_out_informative_fasta(
            compressed, args.alignment, stripFile=args.strip_sites)
        builder_input = reduced_fasta
    else:
        builder_input = args.alignment

    if args.iqmodel and args.method != 'iqtree':
        print("Cannot specify model unless using IQTree. Model specification ignored.")

    method = args.method
    if method == 'raxml':
        T = build_raxml(builder_input, tree_fname, args.nthreads)
    elif method == 'iqtree':
        T = build_iqtree(builder_input, tree_fname, args.iqmodel, args.nthreads)
    else:
        #use fasttree - if add more options, put another check here
        T = build_fasttree(builder_input, tree_fname)

    finished = time.time()
    print("Building original tree took {} seconds".format(str(finished - started)))

    if vcf_input and not args.keep_vcf_fasta:
        os.remove(reduced_fasta)

    if not T:
        return -1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
def run(args):
    """
    Build a tree from a multiple sequence alignment or a VCF plus reference.

    Input handling:
      * VCF (``.vcf`` / ``.vcf.gz``): read with ``read_vcf`` and reduced to a
        Fasta via ``write_out_informative_fasta`` (``args.exclude_sites`` is
        passed as ``stripFile``).
      * Fasta with ``args.exclude_sites``: sites are masked via
        ``mask_sites_in_multiple_sequence_alignment``.
      * Fasta otherwise: used as is.

    The builder is selected by ``args.method`` ('raxml', 'iqtree' or
    'fasttree'). The tree is written in newick format to ``args.output`` or
    to the alignment name with its last extension replaced by '.nwk'.

    Returns 1 if the VCF reference is missing, the method is unknown or no
    tree was built; otherwise returns None after writing the tree.
    """
    # check alignment type, read in if VCF
    vcf_input = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if vcf_input:
        # Prepare a multiple sequence alignment from the given variants VCF and
        # reference FASTA.
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        compressed = read_vcf(args.alignment, args.vcf_reference)
        # both entries are looked up right away, so a result lacking them
        # fails here rather than later
        _sequences, _reference = compressed['sequences'], compressed['reference']
    elif args.exclude_sites:
        # Mask excluded sites from the given multiple sequence alignment.
        builder_input = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
    else:
        # Use the multiple sequence alignment as is.
        builder_input = args.alignment

    started = time.time()

    tree_fname = args.output
    if not tree_fname:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '.nwk'

    # construct reduced alignment if needed
    if vcf_input:
        builder_input = write_out_informative_fasta(
            compressed, args.alignment, stripFile=args.exclude_sites)

    if args.substitution_model and args.method != 'iqtree':
        print("Cannot specify model unless using IQTree. Model specification ignored.")

    # one deferred call per supported builder; only the selected one is run
    builders = {
        'raxml': lambda: build_raxml(
            builder_input, tree_fname, nthreads=args.nthreads,
            tree_builder_args=args.tree_builder_args),
        'iqtree': lambda: build_iqtree(
            builder_input, tree_fname, args.substitution_model,
            nthreads=args.nthreads, tree_builder_args=args.tree_builder_args),
        'fasttree': lambda: build_fasttree(
            builder_input, tree_fname, nthreads=args.nthreads,
            tree_builder_args=args.tree_builder_args),
    }
    build = builders.get(args.method)
    if build is None:
        print("ERROR: unknown tree builder provided to --method: %s" % args.method, file = sys.stderr)
        return 1
    T = build()

    finished = time.time()
    print("\nBuilding original tree took {} seconds".format(str(finished - started)))

    if not T:
        return 1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
def run(args):
    """
    Build a tree from a multiple sequence alignment or a VCF plus reference.

    VCF input is read with ``read_vcf`` and reduced to a Fasta with
    ``write_out_informative_fasta``; Fasta input is optionally masked with
    ``mask_sites_in_multiple_sequence_alignment`` when ``args.exclude_sites``
    is given. ``args.method`` selects 'raxml', 'iqtree' or 'fasttree'.
    The tree is written in newick format to ``args.output`` or to the
    alignment name with its last extension replaced by '.nwk'.

    Returns 1 if the VCF reference is missing, the method is unknown or no
    tree was built; otherwise returns None after writing the tree.
    """
    # check alignment type, read in if VCF
    source_is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    vcf_data = None
    fasta = None

    if source_is_vcf:
        # Prepare a multiple sequence alignment from the given variants VCF and
        # reference FASTA.
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        vcf_data = read_vcf(args.alignment, args.vcf_reference)
        # both entries are looked up right away, so a result lacking them
        # fails here rather than later
        _sequences, _reference = vcf_data['sequences'], vcf_data['reference']
    elif args.exclude_sites:
        # Mask excluded sites from the given multiple sequence alignment.
        fasta = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
    else:
        # Use the multiple sequence alignment as is.
        fasta = args.alignment

    t0 = time.time()

    if args.output:
        tree_fname = args.output
    else:
        stem = args.alignment.split('.')[:-1]
        tree_fname = '.'.join(stem) + '.nwk'

    # construct reduced alignment if needed
    if source_is_vcf:
        fasta = write_out_informative_fasta(vcf_data, args.alignment, stripFile=args.exclude_sites)

    chosen = args.method
    if args.substitution_model and chosen != 'iqtree':
        print("Cannot specify model unless using IQTree. Model specification ignored.")

    if chosen == 'raxml':
        T = build_raxml(fasta, tree_fname, nthreads=args.nthreads)
    elif chosen == 'iqtree':
        T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads)
    elif chosen == 'fasttree':
        T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads)
    else:
        print("ERROR: unknown tree builder provided to --method: %s" % args.method, file = sys.stderr)
        return 1

    t1 = time.time()
    print("Building original tree took {} seconds".format(str(t1 - t0)))

    if not T:
        return 1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
def run(args):
    """
    Refine a tree: name internal nodes and optionally infer a time tree.

    Steps, in order:
      1. Seed numpy's RNG if ``args.seed`` is given and read the tree.
      2. Prepare the alignment: a placeholder alignment when none is given, a
         VCF read together with its reference Fasta, or the Fasta path as is.
      3. Either run ``refine`` (``args.timetree``) or instantiate ``TreeAnc``,
         optionally re-rooting on an explicit outgroup first.
      4. Collect node attributes; for ``divergence_units == 'mutations'``
         replace branch lengths by mutation counts.
      5. Write the tree (newick) and the node data (JSON).

    ``args.root`` is modified in place (unwrapped from a one-element list,
    reset to None by ``args.keep_root``) and ``args.year_bounds`` is sorted
    in place.

    Returns 0 on success and 1 on any reported error.

    Raises
    ------
    TypeError
        If a timetree-only rooting option ('least-squares', 'min_dev',
        'oldest') is requested without ``args.timetree``.
    """
    if args.seed is not None:
        np.random.seed(args.seed)

    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    # check alignment type, read in if VCF
    if not args.alignment:
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference", file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print("ERROR: alignment is required for divergence in units of mutations", file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments", file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
    else:
        tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

    if args.root and len(args.root) == 1:  #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root:  # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load meta data and covert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr)
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                    reroot=args.root,  # or 'best', # We now have a default in param spec - this just adds confusion.
                    Tc=0.01 if args.coalescent is None else args.coalescent,  #use 0.01 as default coalescent time scale
                    use_marginal=args.date_inference == 'marginal',
                    branch_length_inference=args.branch_length_inference or 'auto',
                    precision='auto' if args.precision is None else args.precision,
                    clock_rate=args.clock_rate, clock_std=args.clock_std_dev,
                    clock_filter_iqd=args.clock_filter_iqd,
                    covariance=args.covariance, resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate}

        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x], [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y], [float(y) for y in conf[1]]]
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not reported as a
                # failed optimisation.
                print("ERROR: skyline optimization by TreeTime has failed.", file=sys.stderr)
                return 1

        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print("Warning: To root without inferring a timetree, you must specify an explicit outgroup.")
                print("\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n")
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError("The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup." % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    if args.divergence_units == 'mutations-per-site':  #default
        pass
    elif args.divergence_units == 'mutations':
        if not args.timetree:
            tt.infer_ancestral_sequences()
        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                # the states differ only if their profiles share no state
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = sum(
                1 for ancestral, _position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            )

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit", args.divergence_units, "not supported!", file=sys.stderr)
        return 1

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = '.'.join(args.alignment.split('.')[:-1]) + '.node_data.json'
    else:
        node_data_fname = '.'.join(args.tree.split('.')[:-1]) + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1
def run(args):
    """
    Infer ancestral sequences on a tree and export them.

    Reads the tree, warns if no internal node carries a name, reads the
    alignment (VCF plus reference Fasta, or a Fasta path), runs
    ``ancestral_sequence_inference`` and writes:

      * a JSON file with per-node sequences/mutations and the reference,
      * optionally a FASTA of node sequences (``args.output_sequences``),
      * for VCF input, a VCF including the ancestral sequences.

    Returns 0 on success and 1 on any reported error.
    """
    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    import numpy as np
    unnamed_internal = [clade.name is None for clade in T.get_nonterminals()]
    if np.all(unnamed_internal):
        print("\n*** WARNING: Tree has no internal node names!")
        print("*** Without internal node names, ancestral sequences can't be linked up to the correct node later.")
        print("*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'.")
        print("*** If you haven't run 'augur refine', you can add node names to your tree by running:")
        print("*** augur refine --tree %s --output-tree <filename>.nwk" % (args.tree))
        print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")

    # check alignment type, set flags, read in if VCF
    is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if is_vcf:
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment
        ref = None

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 -- this import needs replacing before running there.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf:
        if StrictVersion(treetime.version) < StrictVersion('0.5.6'):
            print("ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."
                  + "\nYour version is " + treetime.version
                  + "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
            return 1

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs = {'nodes': collect_sequences_and_mutations(T, is_vcf)}

    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        reference_nuc = compress_seq['reference']
    elif hasattr(T.root, 'sequence'):
        reference_nuc = "".join(T.root.sequence)
    else:
        reference_nuc = ''
    anc_seqs['reference'] = {"nuc": reference_nuc}

    default_json = '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json'
    out_name = get_json_name(args, default_json)
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        if args.output_vcf:
            print("WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.", file=sys.stderr)
        else:
            records = []
            for name, entry in anc_seqs["nodes"].items():
                records.append(SeqRecord(Seq(entry["sequence"]), id=name, description=""))
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to", args.output_sequences, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        vcf_fname = args.output_vcf
        if not vcf_fname:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to", vcf_fname, file=sys.stdout)

    return 0
def run(args):
    """
    Infer ancestral sequences on a tree and export them (TreeTime >= 0.7).

    Reads the tree, warns if no internal node carries a name, reads the
    alignment (VCF plus reference Fasta, or a Fasta path), runs
    ``ancestral_sequence_inference`` and writes:

      * a JSON file with per-node mutations (and full sequences for Fasta
        input) plus the reference sequence,
      * optionally a FASTA of node sequences (``args.output_sequences``),
      * for VCF input, a VCF including the ancestral sequences.

    Returns 0 on success and 1 on any reported error.
    """
    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    import numpy as np
    unnamed_internal = [clade.name is None for clade in T.get_nonterminals()]
    if np.all(unnamed_internal):
        print("\n*** WARNING: Tree has no internal node names!")
        print("*** Without internal node names, ancestral sequences can't be linked up to the correct node later.")
        print("*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'.")
        print("*** If you haven't run 'augur refine', you can add node names to your tree by running:")
        print("*** augur refine --tree %s --output-tree <filename>.nwk" % (args.tree))
        print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")

    # check alignment type, set flags, read in if VCF
    is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if is_vcf:
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment
        ref = None

    # Enforce treetime 0.7 or later
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 -- this import needs replacing before running there.
    from distutils.version import StrictVersion
    import treetime
    if StrictVersion(treetime.version) < StrictVersion('0.7.0'):
        print("ERROR: this version of augur requires TreeTime 0.7 or later.")
        return 1

    # Infer ambiguous bases if the user has requested that we infer them (either
    # explicitly or by default) and the user has not explicitly requested that
    # we keep them.
    infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs),
                                      infer_tips=infer_ambiguous)

    # TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
    # To clean up auspice output, we map all those to 'N'
    character_map = {
        char: ('N' if profile.sum() == tt.gtr.n_states else char)
        for char, profile in tt.gtr.profile_map.items()
    }

    anc_seqs = {
        'nodes': collect_mutations_and_sequences(
            tt, full_sequences=not is_vcf, infer_tips=infer_ambiguous,
            character_map=character_map)
    }

    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        reference_nuc = compress_seq['reference']
    elif hasattr(T.root, 'sequence'):
        reference_nuc = "".join(T.root.sequence)
    else:
        reference_nuc = ''
    anc_seqs['reference'] = {"nuc": reference_nuc}

    default_json = '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json'
    out_name = get_json_name(args, default_json)
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        if args.output_vcf:
            print("WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.", file=sys.stderr)
        else:
            records = []
            for name, entry in anc_seqs["nodes"].items():
                records.append(SeqRecord(Seq(entry["sequence"]), id=name, description=""))
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to", args.output_sequences, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        vcf_fname = args.output_vcf
        if not vcf_fname:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to", vcf_fname, file=sys.stdout)

    return 0
def run(args):
    """
    Refine a tree: time tree inference, ancestral reconstruction, or naming
    of internal nodes only.

    Steps, in order:
      1. Read the tree as newick, falling back to nexus.
      2. Prepare the alignment: a placeholder alignment when none is given, a
         VCF read together with its reference Fasta, or the Fasta path as is.
      3. Run ``timetree`` (``args.timetree``), ``ancestral_sequence_inference``
         (``args.ancestral`` in 'joint'/'marginal') or just ``TreeAnc``.
      4. Write the tree (newick), the node data (JSON) and, for VCF input
         with a reconstruction, a VCF including the ancestral sequences.

    ``args.root`` is unwrapped from a one-element list in place and
    ``args.year_limit`` is sorted in place.

    Returns 0 on success and -1 on any reported error.
    """
    import json

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # check if tree is provided an can be read
    # T must start out as None: without this the check below raised
    # UnboundLocalError instead of reporting the unreadable tree.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # try the next format; ``Exception`` (not a bare ``except``) so
            # that KeyboardInterrupt / SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        if args.ancestral or args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return -1
        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Default output names derive from the alignment name, or from the tree
    # name when no alignment was given (that mode previously crashed on
    # ``None.split`` unless every output name was passed explicitly).
    default_stem = '.'.join((args.alignment or args.tree).split('.')[:-1])

    if args.output:
        tree_fname = args.output
    else:
        tree_fname = default_stem + '_tt.nwk'

    if args.timetree and T:
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        # keep the raw date string of each tip for export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:  #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = timetree(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                      reroot=args.root or 'best',
                      Tc=args.coalescent if args.coalescent is not None else 0.01,  #Otherwise can't set to 0
                      use_marginal=args.time_marginal or False,
                      branch_length_mode=args.branch_length_mode or 'auto',
                      clock_rate=args.clock_rate, n_iqd=args.n_iqd)

        tree_meta['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'mutations', 'raw_date', 'date'])
        if not is_vcf:
            attributes.extend(['sequence'])  #don't add sequences if VCF - huge!
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.ancestral,
                                          optimize_branch_length=args.branchlengths,
                                          branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            attributes.extend(['sequence'])  #don't add sequences if VCF - huge!
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        #TreeTime overwrites ambig sites on tips during ancestral reconst.
        #Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    if not T:
        return -1

    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')

    if args.node_data:
        node_data_fname = args.node_data
    else:
        node_data_fname = default_stem + '.node_data'
    with open(node_data_fname, 'w') as ofile:
        # json.dump returns None and signals failure by raising, so its
        # return value must not take part in the exit status (doing so made
        # this function return -1 even on success).
        json.dump(tree_meta, ofile)

    #If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = default_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    return 0 if tree_success else -1
def run(args):
    """
    Infer ancestral sequences on a tree and export them.

    Reads the tree (newick, falling back to nexus), reads the alignment (VCF
    plus reference Fasta, or a Fasta path), runs
    ``ancestral_sequence_inference`` and writes a JSON file with the per-node
    sequences/mutations; for VCF input a VCF including the ancestral
    sequences is written as well.

    Returns 0 on success and 1 on any reported error.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    # check if tree is provided and can be read
    # T must start out as None: without this the check below raised
    # UnboundLocalError instead of reporting the unreadable tree.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # try the next format; ``Exception`` (not a bare ``except``) so
            # that KeyboardInterrupt / SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return 1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 -- this import needs replacing before running there.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(treetime.version) < StrictVersion('0.5.6'):
        print("ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer." +
              "\nYour version is " + treetime.version +
              "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
        return 1

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    if args.output:
        anc_seqs_fname = args.output
    else:
        anc_seqs_fname = '.'.join(args.alignment.split('.')[:-1]) + '.anc_seqs.json'

    write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to", anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to", vcf_fname, file=sys.stdout)

    return 0
def run(args):
    """
    Refine a tree: name internal nodes and optionally infer a time tree.

    Steps, in order:
      1. Read the tree as newick, falling back to nexus.
      2. Prepare the alignment: a placeholder alignment when none is given, a
         VCF read together with its reference Fasta, or the Fasta path as is.
      3. Run ``refine`` (``args.timetree``) or just instantiate ``TreeAnc``.
      4. Write the tree (newick) and the node data (JSON).

    ``args.root`` is unwrapped from a one-element list in place and
    ``args.year_bounds`` is sorted in place.

    Returns 0 on success and 1 on any reported error.
    """
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check if tree is provided an can be read
    # T must start out as None: without this the check below raised
    # UnboundLocalError instead of reporting the unreadable tree.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            node_data['input_tree'] = args.tree
            break
        except Exception:
            # try the next format; ``Exception`` (not a bare ``except``) so
            # that KeyboardInterrupt / SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return 1

    # check alignment type, read in if VCF
    if not args.alignment:
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return 1
        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment

    # Default output names derive from the alignment name, or from the tree
    # name when no alignment was given (that mode previously crashed on
    # ``None.split`` unless every output name was passed explicitly).
    default_stem = '.'.join((args.alignment or args.tree).split('.')[:-1])

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    else:
        tree_fname = default_stem + '_tt.nwk'

    if args.timetree:
        # load meta data and covert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:  #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                    reroot=args.root or 'best',
                    Tc=0.01 if args.coalescent is None else args.coalescent,  #use 0.01 as default coalescent time scale
                    use_marginal=args.date_inference == 'marginal',
                    branch_length_inference=args.branch_length_inference or 'auto',
                    clock_rate=args.clock_rate, clock_std=args.clock_std_dev,
                    clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    else:
        node_data_fname = default_stem + '.node_data.json'
    # NOTE(review): the exit status relies on write_json returning a truthy
    # value on success; if it returns None this function always returns 1 --
    # confirm against write_json.
    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
def run(args):
    """
    Refine a tree with TreeTime and export the tree and per-node data.

    The tree in ``args.tree`` is read as newick, then as nexus. If
    ``args.timetree`` is set, a time tree is inferred via ``refine`` using the
    dates found in ``args.metadata``; otherwise TreeAnc is only instantiated
    so that internal nodes get names. The tree is written as newick and the
    node data as JSON.

    Parameters
    ----------
    args : argparse.Namespace
        Attributes read here: tree, alignment, vcf_reference, output_tree,
        output_node_data, timetree, metadata, year_bounds, date_format, root,
        date_confidence, coalescent, date_inference, branch_length_inference,
        clock_rate, clock_filter_iqd.
        Note that ``args.year_bounds`` is sorted in place and a one-element
        ``args.root`` list is replaced by its single element.

    Returns
    -------
    int
        0 if both output files were written successfully, 1 if writing
        failed, and -1 for any input error detected before the analysis.
    """
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check if tree is provided and can be read.
    # T has to exist before the loop: if every format fails, the check below
    # must see None instead of hitting an unbound local.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
        except Exception:
            # Unreadable file or wrong format - the parsers may raise a
            # variety of exception types, so fall through to the next format.
            # (Exception rather than a bare except keeps Ctrl-C working.)
            continue
        node_data['input_tree'] = args.tree
        break
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = [SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
                for n in T.get_terminals()]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment

    # Default output names are derived from the alignment name, or from the
    # tree name when no alignment was given (that case is explicitly supported
    # above, so args.alignment may be None here).
    default_stem = '.'.join((args.alignment or args.tree).split('.')[:-1])

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    else:
        tree_fname = default_stem + '_tt.nwk'

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return -1
        metadata, _ = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1:
            # if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates,
                    confidence=args.date_confidence,
                    reroot=args.root or 'best',
                    # use 0.01 as default coalescent time scale
                    Tc=0.01 if args.coalescent is None else args.coalescent,
                    use_marginal=args.date_inference == 'marginal',
                    branch_length_inference=args.branch_length_inference or 'auto',
                    clock_rate=args.clock_rate,
                    clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept/tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to",tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    else:
        node_data_fname = default_stem + '.node_data.json'

    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to",node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
def run(args):
    """
    Refine a tree with TreeTime and export the tree and per-node data.

    Reads the tree via ``read_tree``, prepares the alignment (a placeholder
    alignment, a VCF read together with its reference, or the alignment path
    itself), then either infers a time tree with ``refine`` or only
    instantiates TreeAnc so that internal nodes get names. The tree is written
    as newick and the collected node data as JSON.

    Side effects on ``args``: a one-element ``args.root`` list is unwrapped,
    ``args.root`` is reset to None when ``args.keep_root`` is set, and
    ``args.year_bounds`` is sorted in place.

    Returns 0 when the tree was written successfully and 1 otherwise
    (including input errors). Raises TypeError when a timetree-only rooting
    option is requested without ``args.timetree``.
    """
    def strip_extension(path):
        # drop everything from the last '.' onwards
        return '.'.join(path.split('.')[:-1])

    reference = None
    # everything stored here ends up in the exported JSON
    node_data = {'alignment': args.alignment}
    # node attributes to export; extended below when a time tree is inferred
    exported_attrs = ['branch_length']

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1
    else:
        node_data['input_tree'] = args.tree

    alignment_path = args.alignment
    if not alignment_path:
        # without an alignment treetime is only used to name nodes, so a
        # placeholder alignment is enough - but not for a time tree
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return 1
        from Bio import SeqRecord, Seq, Align
        aln = Align.MultipleSeqAlignment([
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=tip.name,
                                name=tip.name, description='')
            for tip in T.get_terminals()
        ])
    elif alignment_path.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1
        vcf_data = read_vcf(alignment_path, args.vcf_reference)
        aln = vcf_data['sequences']
        reference = vcf_data['reference']
    else:
        aln = alignment_path

    # explicit output name wins; otherwise derive it from the alignment name,
    # or from the tree name when there is no alignment
    tree_fname = args.output_tree or (
        strip_extension(alignment_path or args.tree) + '_tt.nwk')

    # a single root entry is passed on as a scalar, not as a list
    if args.root and len(args.root) == 1:
        args.root = args.root[0]
    # --keep-root overrides anything specified by 'root'
    if args.keep_root:
        args.root = None

    if args.timetree:
        # dates are mandatory for time tree inference
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return 1
        metadata, _ = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # remember the date string as given in the metadata for later export
        for tip in T.get_terminals():
            if tip.name not in metadata:
                continue
            if 'date' in metadata[tip.name]:
                tip.raw_date = metadata[tip.name]['date']

        refine_options = {
            'tree': T,
            'aln': aln,
            'ref': reference,
            'dates': dates,
            'confidence': args.date_confidence,
            # the parameter spec supplies the default rooting
            'reroot': args.root,
            # 0.01 is the default coalescent time scale
            'Tc': 0.01 if args.coalescent is None else args.coalescent,
            'use_marginal': args.date_inference == 'marginal',
            'branch_length_inference': args.branch_length_inference or 'auto',
            'clock_rate': args.clock_rate,
            'clock_std': args.clock_std_dev,
            'clock_filter_iqd': args.clock_filter_iqd,
            'covariance': args.covariance,
            'resolve_polytomies': (not args.keep_polytomies),
        }
        tt = refine(**refine_options)

        clock = tt.date2dist
        node_data['clock'] = {
            'rate': clock.clock_rate,
            'intercept': clock.intercept,
            'rtt_Tmrca': -clock.intercept / clock.clock_rate
        }
        exported_attrs += ['numdate', 'clock_length', 'mutation_length',
                           'raw_date', 'date']
        if args.date_confidence:
            exported_attrs.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        requested_root = args.root
        if requested_root:
            if requested_root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % requested_root)
            if requested_root == 'best':
                print(
                    "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
                )
                print(
                    "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
                )
            else:
                T.root_with_outgroup(requested_root)

        # treetime is instantiated only so that internal nodes get names
        tt = TreeAnc(tree=T, aln=aln, ref=reference, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, exported_attrs)

    # write the refined tree, then the node data
    tree_success = Phylo.write(T, tree_fname, 'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    node_data_fname = args.output_node_data or (
        strip_extension(args.alignment or args.tree) + '.node_data.json')
    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    if tree_success:
        return 0
    return 1
def run(args):
    """
    Translate the features of a reference and determine amino acid mutations.

    Reads the newick tree in ``args.tree`` and the nucleotide sequences in
    ``args.ancestral_sequences`` (a VCF plus ``args.vcf_reference``, or node
    data containing sequences), translates every requested feature, assigns
    amino acid mutations to each node and writes annotations and mutations as
    JSON to ``args.output``. If ``args.alignment_output`` is set, the
    translations are additionally written as VCF (VCF input) or as one fasta
    file per gene (the name must contain '%GENE').

    Parameters
    ----------
    args : argparse.Namespace
        Attributes read here: tree, genes, ancestral_sequences, vcf_reference,
        reference_sequence, output, alignment_output.

    Returns
    -------
    int or None
        -1 when an input could not be read, None otherwise.
    """
    ## read tree and data, if reading data fails, return with error code
    tree = Phylo.read(args.tree, 'newick')

    # If genes is a file, read in the genes to translate
    if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
        genes = get_genes_from_file(args.genes[0])
    else:
        genes = args.genes

    ## check file format and read in sequences
    is_vcf = False
    ref = None  # only set (and only used) for VCF input
    if args.ancestral_sequences.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format input")
            return -1
        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        node_data = read_node_data(args.ancestral_sequences, args.tree)
        if node_data is None:
            print("ERROR: could not read node data (incl sequences)")
            return -1
        # extract sequences from node meta data
        sequences = {k: v['sequence']
                     for k, v in node_data['nodes'].items() if 'sequence' in v}

    ## load features; only requested features if genes given
    features = load_features(args.reference_sequence, genes)
    # The None check has to come before len(features) is evaluated, otherwise
    # the error path dies with a TypeError instead of printing the message.
    if features is None:
        print("ERROR: could not read features of reference sequence file")
        return -1
    print("Read in {} features from reference sequence file".format(len(features)))

    ### translate every feature - but not 'nuc'!
    translations = {}
    deleted = []
    for fname, feat in features.items():
        if is_vcf:
            trans = translate_vcf_feature(sequences, ref, feat)
            if trans:
                translations[fname] = trans
            else:
                deleted.append(fname)
        elif feat.type != 'source':
            translations[fname] = translate_feature(sequences, feat)

    if deleted:
        print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))

    ## glob the annotations for later auspice export
    annotations = {}
    for fname, feat in features.items():
        # 'nuc' goes to 0, unsure why - make 1
        increment = 0 if feat.type != 'source' else 1
        annotations[fname] = {'start': int(feat.location.start)+increment,
                              'end': int(feat.location.end),
                              'strand': feat.location.strand}
    if is_vcf:
        # need to add our own nuc
        annotations['nuc'] = {'start': 0, 'end': len(ref), 'strand': 1}

    ## determine amino acid mutations for each node
    if is_vcf:
        aa_muts = assign_aa_vcf(tree, translations)
    else:
        aa_muts = {}
        for n in tree.get_nonterminals():
            # every child gets an entry, even if no gene has a sequence pair
            for c in n:
                aa_muts[c.name] = {"aa_muts": {}}
            for fname, aln in translations.items():
                for c in n:
                    if c.name in aln and n.name in aln:
                        # compare parent and child position by position (1-based)
                        aa_muts[c.name]["aa_muts"][fname] = [
                            construct_mut(a, pos, d)
                            for pos, (a, d) in enumerate(zip(aln[n.name], aln[c.name]), start=1)
                            if a != d]
                    else:
                        print("no sequence pair for nodes %s-%s"%(c.name, n.name))

    write_json({'annotations':annotations, 'nodes':aa_muts}, args.output)
    print("amino acid mutations written to",args.output, file=sys.stdout)

    ## write alignments to file is requested
    if args.alignment_output:
        if is_vcf:
            ## write VCF-style output if requested
            # strip '.vcf' or '.vcf.gz' to build the name of the reference file
            fileEndings = -2 if args.alignment_output.lower().endswith('.gz') else -1
            vcf_out_ref = '.'.join(args.alignment_output.split('.')[:fileEndings]) + '_reference.fasta'
            write_VCF_translation(translations, args.alignment_output, vcf_out_ref)
        else:
            ## write fasta-style output if requested
            if '%GENE' in args.alignment_output:
                for fname, seqs in translations.items():
                    SeqIO.write([SeqRecord.SeqRecord(seq=Seq.Seq(s), id=sname,
                                                     name=sname, description='')
                                 for sname, s in seqs.items()],
                                args.alignment_output.replace('%GENE', fname), 'fasta')
            else:
                print("ERROR: alignment output file does not contain '%GENE', so will not be written.")
def run(args):
    """
    Translate the features of a reference and determine amino acid mutations.

    Reads the newick tree in ``args.tree`` and the nucleotide sequences in
    ``args.ancestral_sequences`` (a VCF plus ``args.vcf_reference``, or node
    data containing sequences), translates every requested feature, assigns
    amino acid mutations to each node and writes annotations, mutations and
    reference translations as JSON. If ``args.alignment_output`` is set, the
    translations are additionally written as VCF (VCF input) or as one fasta
    file per gene (the name must contain '%GENE').

    Parameters
    ----------
    args : argparse.Namespace
        Attributes read here: tree, genes, ancestral_sequences, vcf_reference,
        reference_sequence, alignment_output, vcf_reference_output; ``args``
        is also handed to ``get_json_name`` to pick the JSON output name.

    Returns
    -------
    int or None
        1 when an input could not be read or node names are missing or
        mismatched, None otherwise.
    """
    ## read tree and data, if reading data fails, return with error code
    tree = Phylo.read(args.tree, 'newick')

    # If genes is a file, read in the genes to translate
    if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
        genes = get_genes_from_file(args.genes[0])
    else:
        genes = args.genes

    ## check file format and read in sequences
    is_vcf = False
    ref = None  # only set (and only used) for VCF input
    if args.ancestral_sequences.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format input")
            return 1
        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        node_data = read_node_data(args.ancestral_sequences, args.tree)
        if node_data is None:
            print("ERROR: could not read node data (incl sequences)")
            return 1
        # extract sequences from node meta data
        sequences = {k: v['sequence']
                     for k, v in node_data['nodes'].items() if 'sequence' in v}

    ## load features; only requested features if genes given
    features = load_features(args.reference_sequence, genes)
    # The None check has to come before len(features) is evaluated, otherwise
    # the error path dies with a TypeError instead of printing the message.
    if features is None:
        print("ERROR: could not read features of reference sequence file")
        return 1
    print("Read in {} features from reference sequence file".format(
        len(features)))

    ### translate every feature - but not 'nuc'!
    translations = {}
    deleted = []
    for fname, feat in features.items():
        if is_vcf:
            trans = translate_vcf_feature(sequences, ref, feat)
            if trans:
                translations[fname] = trans
            else:
                deleted.append(fname)
        elif feat.type != 'source':
            translations[fname] = translate_feature(sequences, feat)

    if deleted:
        print("{} genes had no mutations and so have been be excluded.".format(
            len(deleted)))

    ## glob the annotations for later auspice export
    #
    # Note that BioPython FeatureLocations use
    # "Pythonic" coordinates: [zero-origin, half-open)
    # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
    annotations = {}
    for fname, feat in features.items():
        annotations[fname] = {
            'seqid': args.reference_sequence,
            'type': feat.type,
            'start': int(feat.location.start) + 1,
            'end': int(feat.location.end),
            # Compare against 1 explicitly: a reverse strand of -1 is truthy,
            # so a plain truth test would label it '+'. Unknown/unset strands
            # keep mapping to '-' as before.
            'strand': '+' if feat.location.strand == 1 else '-'
        }
    if is_vcf:  # need to add our own nuc
        annotations['nuc'] = {
            'seqid': args.reference_sequence,
            # The whole-sequence entry is the 'source' feature. Do not reuse
            # the loop variable of the loop above: it holds whichever feature
            # came last and does not exist at all when there are no features.
            'type': 'source',
            'start': 1,
            'end': len(ref),
            'strand': '+'
        }

    ## determine amino acid mutations for each node
    try:
        if is_vcf:
            aa_muts = assign_aa_vcf(tree, translations)
        else:
            aa_muts = assign_aa_fasta(tree, translations)
    except MissingNodeError:
        print("\n*** ERROR: Some/all nodes have no node names!")
        print(
            "*** Please check you are providing the tree output by 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', please add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )
        return 1
    except MismatchNodeError:
        print("\n*** ERROR: Mismatch between node names in %s and in %s" %
              (args.tree, args.ancestral_sequences))
        print(
            "*** Ensure you are using the same tree you used to run 'ancestral' as input here."
        )
        print(
            "*** Or, re-run 'ancestral' using %s, then use the new %s as input here."
            % (args.tree, args.ancestral_sequences))
        return 1

    output_data = {'annotations': annotations, 'nodes': aa_muts}
    if is_vcf:
        # each VCF translation carries its own reference translation
        output_data['reference'] = {
            fname: translations[fname]['reference'] for fname in translations}
    else:
        # for fasta input the root's translations serve as the reference
        output_data['reference'] = aa_muts[tree.root.name]['aa_sequences']

    out_name = get_json_name(
        args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
    write_json(output_data, out_name)
    print("amino acid mutations written to", out_name, file=sys.stdout)

    ## write alignments to file is requested
    if args.alignment_output:
        if is_vcf:
            ## write VCF-style output if requested
            # strip '.vcf' or '.vcf.gz' to build the name of the reference file
            fileEndings = -2 if args.alignment_output.lower().endswith('.gz') else -1
            vcf_out_ref = args.vcf_reference_output or '.'.join(
                args.alignment_output.split('.')
                [:fileEndings]) + '_reference.fasta'
            write_VCF_translation(translations, args.alignment_output,
                                  vcf_out_ref)
        else:
            ## write fasta-style output if requested
            if '%GENE' in args.alignment_output:
                for fname, seqs in translations.items():
                    SeqIO.write([
                        SeqRecord.SeqRecord(seq=Seq.Seq(s),
                                            id=sname,
                                            name=sname,
                                            description='')
                        for sname, s in seqs.items()
                    ], args.alignment_output.replace('%GENE', fname), 'fasta')
            else:
                print(
                    "ERROR: alignment output file does not contain '%GENE', so will not be written."
                )
def run(args):
    """
    Infer ancestral sequences on a tree and export them.

    The tree in ``args.tree`` is read as newick, then as nexus. The alignment
    is either a VCF (read together with ``args.vcf_reference``) or passed on
    by file name. Sequences and mutations of all nodes are written as JSON;
    for VCF input the reconstructed sequences are additionally written as VCF.

    Parameters
    ----------
    args : argparse.Namespace
        Attributes read here: tree, alignment, vcf_reference, inference,
        output, output_vcf.

    Returns
    -------
    int
        0 if the JSON was written successfully, 1 if writing it failed, and
        -1 for any input error detected before the inference.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    # check if tree is provided and can be read.
    # T has to exist before the loop: if every format fails, the check below
    # must see None instead of hitting an unbound local.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
        except Exception:
            # Unreadable file or wrong format - the parsers may raise a
            # variety of exception types, so fall through to the next format.
            # (Exception rather than a bare except keeps Ctrl-C working.)
            continue
        break
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref,
                                      marginal=args.inference)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # default output names are derived from the alignment file name
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    anc_seqs_fname = args.output or alignment_stem + '.anc_seqs.json'
    anc_seqs_success = write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to", anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        # NOTE(review): for an input named '<name>.vcf' this default resolves
        # to the input file itself, so running without --output-vcf overwrites
        # the input alignment. Behaviour kept as is - confirm whether a
        # distinct default name is wanted.
        vcf_fname = args.output_vcf or alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to", vcf_fname,
              file=sys.stdout)

    return 0 if anc_seqs_success else 1