def export_sequence_fasta(T, path):
    """Write the sequences attached to the tree ``T`` to a FASTA file.

    The output file name is obtained from ``tree_sequence_alignment(path, 'nuc')``.
    The first record holds ``T.root.sequence`` under the fixed label ``'root'``;
    it is followed by one record per clade yielded by ``T.find_clades()``,
    labelled with that clade's ``name``.

    NOTE(review): if ``find_clades()`` also yields the root clade, the root
    sequence is written twice (once as 'root', once under its own name) --
    confirm this is intended.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment
    from Bio import AlignIO

    def as_record(sequence, label):
        # Node sequences are iterables of single characters; join them into
        # one string and use the same label for both name and id.
        return SeqRecord(Seq(''.join(sequence)), name=label, id=label)

    fname = tree_sequence_alignment(path, 'nuc')

    records = [as_record(T.root.sequence, 'root')]
    records.extend(as_record(clade.sequence, clade.name)
                   for clade in T.find_clades())

    AlignIO.write(MultipleSeqAlignment(records), fname, 'fasta')


if __name__ == '__main__':
    # Command line set-up for the tree-building step.
    # generic_argparse is defined elsewhere; the parsed namespace is expected
    # to carry a ``path`` attribute (read below).
    parser = generic_argparse("Build the tree from the prepared sequence data")
    parser.add_argument(
        '--nthreads', type=int, default=2,
        help='number of threads')
    parser.add_argument(
        '--ancestral', action='store_true', default=False,
        help='calculate ancestral sequences')
    parser.add_argument(
        '--timetree', action='store_true', default=False,
        help='infer time stamped phylogeny')
    parser.add_argument(
        '--confidence', action='store_true', default=False,
        help='estimate confidence intervals for node timing')
    parser.add_argument(
        '--Tc', type=float, default=0.0,
        help='coalescence time scale measured in substitution rate units')
    parser.add_argument(
        '--keeproot', action='store_true', default=False,
        help="don't reroot the tree")

    args = parser.parse_args()
    path = args.path
def tree_layout(T):
    """Assign plotting coordinates to every clade of the tree ``T``.

    Clades are visited in post-order, so children are processed before their
    parent.  Each clade receives two attributes:

    * ``yvalue`` -- terminals count down from ``T.count_terminals()`` in visit
      order; an internal clade gets the midpoint between the smallest and the
      largest ``yvalue`` of its direct children.
    * ``xvalue`` -- copied from ``clade.attr['div']``.

    Nothing is returned; the clades are modified in place.
    """
    next_leaf_y = T.count_terminals()

    for clade in T.find_clades(order='postorder'):
        if not clade.is_terminal():
            # Children already carry a yvalue thanks to the post-order walk.
            child_ys = [child.yvalue for child in clade]
            clade.yvalue = 0.5 * (np.min(child_ys) + np.max(child_ys))
        else:
            clade.yvalue = next_leaf_y
            next_leaf_y -= 1

        clade.xvalue = clade.attr['div']


if __name__ == '__main__':
    # Command line set-up for the auspice export step.
    # generic_argparse, Phylo and tree_newick come from elsewhere in the file;
    # the parsed namespace is expected to carry a ``path`` attribute.
    parser = generic_argparse("Export precomputed data as auspice jsons")
    parser.add_argument(
        '--prefix', required=True,
        help="prefix for json files that are passed on to auspice (e.g., zika.fasta)")
    parser.add_argument(
        '--reference', required=True,
        help="reference sequence needed for entropy feature export")

    args = parser.parse_args()
    path = args.path

    T = Phylo.read(tree_newick(path), 'newick')
ref_array = np.array(seqs[reference]) ungapped = ref_array != '-' ref_aln_array = np.array(aln)[:, ungapped] else: print("reference", reference, "not found in alignment") return out_seqs = [] for seq, seq_array in zip(aln, ref_aln_array): seq.seq = Seq.Seq(''.join(seq_array)) if keep_reference or seq.name != reference: out_seqs.append(seq) return out_seqs if __name__ == '__main__': parser = generic_argparse( "strip out all positions that don't align to the reference") parser.add_argument('--reference', required=True, help='the name of the reference sequence') parser.add_argument('--keep_reference', action='store_true', default=False, help='keep the reference as part of the alignment') args = parser.parse_args() seqs = strip_non_reference(args.path, args.reference) write_fasta(seqs, ref_alignment(args.path))
seqs[seq.name] = seq muts = {} muts[T.root.name]='' for node in T.get_nonterminals(): pseq = seqs[node.name] for c in node: cseq = seqs[c.name] muts[c.name]=','.join([anc+str(pos+1)+der for pos, (anc, der) in enumerate(zip(pseq, cseq)) if anc!=der]) return muts if __name__ == '__main__': parser = generic_argparse("Assign amino acid mutations to the tree") args = parser.parse_args() path = args.path tree_meta = read_tree_meta_data(path) T = Phylo.read(tree_newick(path), 'newick') for gene, aln_fname in get_genes_and_alignments(path, tree=True): if gene!='nuc': muts = get_amino_acid_mutations(T, aln_fname) for node_name in tree_meta: tree_meta[node_name][gene+'_mutations'] = muts[node_name] write_tree_meta_data(path, tree_meta)
#N were causing problems later. Removing all variance and allowing these #regions to be the same as Ref should be the same, anyway. #with open(ref_fasta(path), "w") as output_handle: # SeqIO.write(maskedRef_seqRec, output_handle, "fasta") return maskRefFile if __name__ == '__main__': #to do - add so can pass vcf file instead of gzvcf file? import time start = time.time() parser = generic_argparse( "parse vcf/vcf.gz file and meta_data to drop samples") parser.add_argument("--gzvcf", required=True, type=str, help="file with input sequences as gunzipped vcf") parser.add_argument( "--ref", required=True, type=str, help="fasta file with reference sequence that vcf is mapped to") parser.add_argument("--strip_loci", required=False, type=str, help="file that contains loci to strip from analysis") args = parser.parse_args() path = args.path
dropped_strains = [] if os.path.isfile(fname): with open(fname) as ifile: for line in ifile: fields = line.strip().split('#') if fields[0].strip(): dropped_strains.append(fields[0].strip()) else: print("File with dropped strains not found. Looking for", fname) return dropped_strains if __name__ == '__main__': parser = generic_argparse("parse fasta file and separate meta_data into table") parser.add_argument("--sequences", required=True, type=str, help = "file with input sequences as fasta") args = parser.parse_args() path = args.path header_fields = {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country', 6:'division', 8:'db', 10:'authors', 11:'url', 12:'title', 13: 'journal', 14: 'paper_url'} sequences, meta = parse_fasta(args.sequences, header_fields) dropped_strains = get_dropped_strains(path) sequences = {k:v for k,v in sequences.items() if k not in dropped_strains} meta = {k:v for k,v in meta.items() if k not in dropped_strains}
if numResist not in drugMuts["Drug_Resistance"]: drugMuts["Drug_Resistance"].append(numResist) #for any with no resistance, add a 0 to tree_meta for seq, v in tree_meta.iteritems(): if 'Drug_Resistance' not in tree_meta[seq]: tree_meta[seq]["Drug_Resistance"] = '0' write_tree_meta_data(path, tree_meta) return drugMuts if __name__ == '__main__': parser = generic_argparse( "Find drug resistance mutations according to supplied file. ONLY WORKS FOR VCF FILES." ) parser.add_argument('--drm', type=str, help="file of DRMs to find") args = parser.parse_args() path = args.path import time start = time.time() compress_seq = read_in_vcf(tree_vcf_alignment(path), ref_fasta(path)) sequences = compress_seq['sequences'] positions = compress_seq['positions'] ref = compress_seq['reference']
for line in ifile: fields = line.strip().split('#') if fields[0].strip(): genes.append(fields[0].strip()) else: print("File with genes not found. Looking for", fname) featN = np.array(genes) if len(np.unique(featN)) != len(genes): print "You have duplicates in your genes file. They are being ignored." return genes if __name__ == '__main__': parser = generic_argparse("Translate the nucleotide alignments") parser.add_argument('--reference', required=True, help='genbank file containing the annotation') parser.add_argument('--genes', nargs='+', help="genes to translate") #EBH 11 Dec 17 parser.add_argument('--vcf', action='store_true', default=False, help="sequence is in VCF format") parser.add_argument('--assignMuts', action='store_true', default=False, help="write amino acid mutations onto the tree") args = parser.parse_args() path = args.path #The original way of doing this called load_features twice! if not args.genes: genes = None #if load_features is passed None it loads all
pdis = node.marginal_profile[0] S = -np.sum(pdis*np.log(pdis+TINY)) marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))] marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods marginal = [(a, b) for a, b in marginal if b > 0.01][:4] #only take stuff over 1% and the top 4 elements conf = {a:b for a,b in marginal} node.__setattr__(field + "_entropy", S) node.__setattr__(field + "_confidence", conf) return tt, alphabet if __name__ == '__main__': parser = generic_argparse("Infer ancestral states for a discrete character") parser.add_argument('--field', default='region', help='meta data field to perform discrete reconstruction on') parser.add_argument('--confidence',action="store_true", help='record the distribution of subleading mugration states') parser.add_argument('--vcf', action='store_true', default=False, help="sequence is in VCF format") args = parser.parse_args() path = args.path T = tree_newick(path) import time start = time.time() seq_meta = read_sequence_meta_data(path)
import os
import subprocess

from filenames import sequence_input, raw_alignment
from util import generic_argparse

if __name__ == '__main__':
    # Align the input sequences of an analysis and upper-case the result.
    #
    # Input and output file names are derived from ``args.path`` through
    # sequence_input() and raw_alignment(); the aligner's stderr is captured
    # in a file called ``mafft_stderr`` in the current working directory.
    parser = generic_argparse("Align sequences")
    parser.add_argument('--nthreads', type=int, default=2,
                        help="number of threads used by mafft")
    # The help text previously read "analysis path, e.g. zika", which
    # described a different option.
    parser.add_argument('--aligner', default='mafft',
                        help="alignment program to use (only 'mafft' is implemented)")
    args = parser.parse_args()

    in_file = sequence_input(args.path)
    out_file = raw_alignment(args.path)

    if args.aligner != 'mafft':
        # Stop here: there is no fresh alignment to post-process, and reading
        # out_file would either fail or silently rewrite a stale file.
        raise SystemExit('not implemented')

    # Pass the arguments as a list and redirect through file handles instead
    # of building a shell string, so paths containing spaces or shell
    # metacharacters are handled correctly.  check_call raises
    # CalledProcessError if mafft exits with a non-zero status.
    command = ['mafft', '--anysymbol', '--thread', str(args.nthreads), in_file]
    with open(out_file, 'w') as aligned, open('mafft_stderr', 'w') as errors:
        subprocess.check_call(command, stdout=aligned, stderr=errors)

    # Re-read the alignment and rewrite it with upper-case sequences.
    from Bio import AlignIO
    aln = AlignIO.read(out_file, 'fasta')
    for seq in aln:
        seq.seq = seq.seq.upper()
    AlignIO.write(aln, out_file, 'fasta')