def main():
    """Write bootstrap replicates of the input alignments to output files.

    Options come from ``interface()`` and the alignments from
    ``make_align_list``. The alignments are resampled with
    ``phylo.bootstrap``; within every replicate the columns of each alignment
    are resampled once more, and the result is appended to
    ``bootrep_<n>.nex`` or ``bootrep_<n>.phylip`` in ``options.output_dir``.

    Raises:
        ValueError: if ``options.output_file_format`` is neither ``'nexus'``
            nor ``'phylip'`` (previously this surfaced as an unbound or stale
            file name inside the loop).
    """
    options = interface()

    # Resolve the extension up front so an unsupported format fails before
    # any work is done instead of leaving ``fname`` undefined later.
    extensions = {'nexus': 'nex', 'phylip': 'phylip'}
    if options.output_file_format not in extensions:
        raise ValueError(
            "unsupported output format: %r" % (options.output_file_format,))
    extension = extensions[options.output_file_format]

    array_alignments = make_align_list(options.input_dir, options.input_file_format)

    # DO BOOTSTRAPPING AND WRITE REPLICATES TO OUTFILES
    boot_alignments = phylo.bootstrap(array_alignments, options.bootstrap_reps)
    for rep_count, boot_rep in enumerate(boot_alignments):
        fname = 'bootrep_%s.%s' % (rep_count, extension)
        final_path = os.path.join(options.output_dir, fname)
        # Append mode is kept from the original: re-running adds to
        # existing replicate files rather than truncating them.
        with open(final_path, 'a') as fout:
            for align in boot_rep:
                seqs = copy(align[0])  # lots of copying to be 'safe'
                ids = copy(align[1])
                bases_by_col = np.column_stack(seqs)  # flip rows and columns
                # bootstrap the bases within the bootstrapped alignments
                bs_bases = phylo.bootstrap(bases_by_col, 1)
                # [0] corrects for the extra level of nesting in the result
                bs_bases = np.column_stack(bs_bases[0])
                bs_bases = bs_bases.copy()  # copy modified replicate
                pair = [bs_bases, ids]
                biopy_align = [strarray2biopy(pair)]
                AlignIO.write(biopy_align, fout, options.output_file_format)
                fout.write('\n')
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Convert alignments by piping them through EMBOSS seqret.

    The alignments are written to seqret's stdin in ``old_format`` and its
    stdout is parsed as ``new_format``.

    Returns:
        list: the parsed alignments (fully materialised, so the pipe can be
        closed before returning).

    Any exception raised while writing or parsing propagates unchanged; the
    pipes are closed and the child process is reaped on every exit path.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # Closing stdin signals end-of-input to seqret; stderr is never
            # read, so it is closed at the same point as before.
            child.stdin.close()
            child.stderr.close()
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
        # Reap the child so it does not linger as a zombie process.
        child.wait()
def taxit_create(taxit_executable_loc, aln_fasta, hmm_file, tree_file, tree_stats, pfam_acc, output_location, aln_stockholm):
    """Run ``taxit create`` and write a Stockholm copy of the alignment.

    Args:
        taxit_executable_loc: command used to start taxit (split with shell
            quoting rules, so a quoted path or a short prefix command works).
        aln_fasta: FASTA alignment passed as ``--aln-fasta``.
        hmm_file: profile passed as ``--profile``.
        tree_file: tree passed as ``--tree-file``.
        tree_stats: statistics file passed as ``--tree-stats``.
        pfam_acc: value passed as ``--locus``.
        output_location: value passed as ``--package-name``.
        aln_stockholm: path that receives ``aln_fasta`` converted to
            Stockholm format.

    Raises:
        subprocess.CalledProcessError: if taxit exits with a non-zero status.
    """
    import shlex

    # Example:
    # taxit create --clobber --aln-fasta ./PF14424.dedup.fasta --profile ./PF14424.wholefam.hmm --tree-file ./PF14424.dedup.nh --locus PF14424 --package-name PF14424.pplacer
    #
    # An argument list (no shell) keeps file names containing spaces or
    # shell metacharacters from being re-interpreted.
    cmd = shlex.split(taxit_executable_loc) + [
        "create", "--clobber",
        "--aln-fasta", aln_fasta,
        "--profile", hmm_file,
        "--tree-file", tree_file,
        "--tree-stats", tree_stats,
        "--locus", pfam_acc,
        "--package-name", output_location,
    ]
    subprocess.check_call(cmd)

    # NOTE(review): the "rU" mode is kept from the original; it was removed
    # in Python 3.11, so switch to "r" if this must run there.
    input_handle = open(aln_fasta, "rU")
    try:
        with open(aln_stockholm, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "stockholm")
    finally:
        input_handle.close()
def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
    """Infer a tree for ``self.aln`` with fasttree and, optionally, RAxML.

    All work happens inside ``self.run_dir``, which is created, entered and
    removed again at the end. The final tree file is handed to
    ``self.tt_from_file`` together with ``root``.

    Parameters
    ----------
    root : str
        passed through unchanged to ``self.tt_from_file``.
    raxml : bool
        when true, refine the fasttree result with RAxML; otherwise the
        fasttree tree is used directly.
    raxml_time_limit : float
        hours allowed for the RAxML topology search; a value <= 0 skips
        the topology search and only runs branch-length optimization.
    """
    from Bio import Phylo, AlignIO
    import subprocess, glob, shutil
    make_dir(self.run_dir)
    # NOTE(review): the working directory is not restored if anything
    # below raises.
    os.chdir(self.run_dir)
    for seq in self.aln: seq.name=seq.id
    AlignIO.write(self.aln, 'temp.fasta', 'fasta')
    # Build "fasttree [-nt] temp.fasta > initial_tree.newick"; the ">" is a
    # shell redirect, hence the os.system call on the joined string.
    tree_cmd = ["fasttree"]
    if self.nuc: tree_cmd.append("-nt")
    tree_cmd.append("temp.fasta")
    tree_cmd.append(">")
    tree_cmd.append("initial_tree.newick")
    os.system(" ".join(tree_cmd))
    out_fname = "tree_infer.newick"
    if raxml:
        if raxml_time_limit>0:
            # Resolve polytomies (at most 11 passes) before using the tree
            # as the RAxML starting tree.
            tmp_tree = Phylo.read('initial_tree.newick','newick')
            resolve_iter = 0
            resolve_polytomies(tmp_tree)
            while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                resolve_iter+=1
                resolve_polytomies(tmp_tree)
            Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
            # NOTE(review): temp.phyx is only written in this branch, yet
            # the branch-length step below reads it even when
            # raxml_time_limit <= 0 -- confirm this is intended.
            AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
            print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
            # using exec to be able to kill process
            end_time = time.time() + int(raxml_time_limit*3600)
            process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
            # Poll every 10 s until RAxML writes its result or time runs out.
            while (time.time() < end_time):
                if os.path.isfile('RAxML_result.topology'):
                    break
                time.sleep(10)
            process.terminate()
            # Prefer the finished result, otherwise fall back to a checkpoint.
            # NOTE(review): glob.glob returns names in arbitrary order, so
            # [-1] is not guaranteed to be the most recent checkpoint.
            checkpoint_files = glob.glob("RAxML_checkpoint*")
            if os.path.isfile('RAxML_result.topology'):
                checkpoint_files.append('RAxML_result.topology')
            if len(checkpoint_files) > 0:
                last_tree_file = checkpoint_files[-1]
                shutil.copy(last_tree_file, 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')
        else:
            shutil.copy("initial_tree.newick", 'raxml_tree.newick')
        try:
            print("RAxML branch length optimization")
            os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
            # os.system does not raise on a failed run; the fallback below
            # is triggered when this copy cannot find the result file.
            shutil.copy('RAxML_result.branches', out_fname)
        except:
            print("RAxML branch length optimization failed")
            shutil.copy('raxml_tree.newick', out_fname)
    else:
        shutil.copy('initial_tree.newick', out_fname)
    self.tt_from_file(out_fname, root)
    # NOTE(review): assumes self.run_dir is a direct child of the
    # directory that was current on entry.
    os.chdir('..')
    remove_dir(self.run_dir)
    self.is_timetree=False
def AlignClean(file, out):
    """Write the alignment in ``file`` to ``out`` keeping only the column
    ranges between the merged runs of ``N``.

    For every record, gaps enclosed by ``N`` are first rewritten through the
    module-level ``repl`` callback, then the spans of consecutive ``N`` are
    collected. The spans of all records are merged with ``combinelists`` and
    flattened with ``flatten``; the columns between consecutive boundaries
    are concatenated and written as FASTA.

    Side effects: sets the module globals ``total_len`` (length of the last
    record seen) and ``countN`` (number of merged spans).
    """
    global countN, total_len
    # search pattern: a run of one or more N
    n_run = re.compile(r'(N)\1*')
    # start/stop of every N run, across all records
    Ns = []
    # NOTE(review): "rU" is kept from the original; it was removed in
    # Python 3.11.
    with open(file, 'rU') as handle:
        alignment = AlignIO.read(handle, 'fasta')
    for rec in alignment:
        total_len = len(rec.seq)
        sequence = str(rec.seq).upper()
        # replace gaps between N's with N's for the next regex step
        filled = re.sub('N[-]*N', repl, sequence)
        for m in n_run.finditer(filled):
            Ns.append([m.start(), m.end()])
    Ns.sort(key=lambda x: x[0])
    # merge overlapping spans
    run1 = combinelists(Ns)
    flat = flatten(run1)
    flat.insert(0, 0)
    flat.append(total_len)
    # Pair consecutive boundaries into (start, end) column ranges to keep;
    # presumably flat is [s1, e1, s2, e2, ...] before padding -- confirm
    # against flatten().
    bounds = list(zip(*[iter(flat)] * 2))
    # Concatenate the slices directly instead of eval()-ing a generated
    # expression string.
    edited = alignment[:, bounds[0][0]:bounds[0][1]]
    for start, end in bounds[1:]:
        edited = edited + alignment[:, start:end]
    with open(out, 'w') as outhandle:
        AlignIO.write(edited, outhandle, 'fasta')
    countN = len(run1)
def main():
    """Rewrite nexus alignments with abbreviated sequence names.

    Every sequence name found in the files returned by
    ``get_files(args.nexus)`` is collected and passed to ``abbreviator``,
    which supplies the old-name -> new-name mapping. Each file is then
    rewritten under ``args.output`` (same base name) using the mapped names.
    """
    args = get_args()
    files = get_files(args.nexus)
    # first pass: collect every sequence name used in any input file
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                old_names.add(seq.name)
    name_map = abbreviator(old_names)
    # second pass: write each file again with the mapped names
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # the context manager flushes and closes the output file
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): debugging hook kept from the original; it
            # blocks non-interactive runs.
            pdb.set_trace()
        print(count)
def mcmc(alignment, num_imp, dem_ratios, directory, length, burnin):
    """Run a Metropolis chain over imputed alignments.

    The first state is ``impute.imp_align(num_imp, alignment, dem_ratios)``.
    Each step proposes a new state by re-imputing one sequence of
    ``minoneimp(current, num_imp)`` and accepts it when the log-likelihood
    improves, or otherwise with probability ``exp(proposal - current)``.
    States after the burn-in are written to ``<directory>/<n>.fasta``.

    Args:
        alignment: input alignment.
        num_imp: number of sequences to impute.
        dem_ratios: passed through to ``impute.imp_align``.
        directory: output directory for the sampled alignments.
        length: number of chain steps.
        burnin: number of initial steps that are not written out.

    Returns:
        float: fraction of proposals accepted (``acceptances / length``).
    """
    acceptances = 0
    # Build first state of Markov chain
    print('Imputing first alignment...')
    current = impute.imp_align(num_imp, alignment, dem_ratios)
    current.loglik = loglik(current)
    print('\t Log likelihood %2f' % current.loglik)
    if not burnin:
        AlignIO.write(current, '%s/%d.fasta' % (directory, 0), 'fasta')
    # Run chain
    for i in xrange(1, length + 1):
        proposal = impute.imp_align(1, minoneimp(current, num_imp), dem_ratios)
        proposal.loglik = loglik(proposal)
        p = proposal.loglik - current.loglik
        # math.exp raises OverflowError for large positive p (roughly
        # p > 709); treat that as an infinite ratio instead of crashing.
        try:
            ratio = math.exp(p)
        except OverflowError:
            ratio = float('inf')
        print('Current LLH: %2f; Proposed LLH: %2f' % (current.loglik, proposal.loglik))
        print('\tAcceptance probability %e' % ratio)
        # Short-circuit keeps the original behaviour of not drawing a
        # random number when the proposal is an improvement.
        if p > 0 or random.random() < ratio:
            current = proposal
            acceptances += 1
            print('\tAccepted')
        else:
            print('\tNot accepted')
        if i > burnin:
            AlignIO.write(current, '%s/%d.fasta' % (directory, i - burnin), 'fasta')
    return float(acceptances) / length
def mcmc_ks(alignment, num_imp, dem_ratios, directory, length, burnin):
    """Run a Metropolis chain whose score adds a distance term.

    The score of a state is ``loglik(state)`` plus
    ``log(dist_ks(state, num_imp, mins, 1000))``, where ``mins`` holds the
    second-smallest entry of every row of ``pdn(alignment)``. Proposals come
    from ``propose`` and are accepted with probability
    ``exp(proposal - current)``. States after the burn-in are written to
    ``<directory>/<n>.fasta``.

    Returns:
        tuple: ``(acceptance_rate, start)`` where ``start`` is the first
        imputed alignment.
    """
    acceptances = 0
    d = transprobs(TRANSITIONS, MARGINAL)
    pd = pdn(alignment)
    mins = np.array([sorted(i)[1] for i in pd])
    # Build first state of Markov chain
    print('Imputing first alignment...')
    start = impute.imp_align(num_imp, alignment, dem_ratios)
    current = deepcopy(start)
    current.loglik = loglik(current) + math.log(dist_ks(current, num_imp, mins, 1000))
    print('\t Log likelihood %2f' % current.loglik)
    if not burnin:
        AlignIO.write(current, '%s/%d.fasta' % (directory, 0), 'fasta')
    # Run chain
    for i in xrange(1, length + 1):
        proposal = propose(current, num_imp, max(norm(loc=2, scale=1).rvs(), 1), d)
        l1 = loglik(proposal)
        l2 = math.log(dist_ks(proposal, num_imp, mins, 1000))
        proposal.loglik = l1 + l2
        p = proposal.loglik - current.loglik
        # math.exp raises OverflowError for large positive p (roughly
        # p > 709); treat that as an infinite ratio instead of crashing.
        try:
            ratio = math.exp(p)
        except OverflowError:
            ratio = float('inf')
        print('Current LLH: %2f; Proposed LLH: %2f' % (current.loglik, proposal.loglik))
        print('\tPhylogeny component: %2f; Distance component: %2f' % (l1, l2))
        print('\tAcceptance probability %e' % ratio)
        if random.random() < ratio:
            current = proposal
            acceptances += 1
            print('\tAccepted')
        else:
            print('\tNot accepted')
        if i > burnin:
            AlignIO.write(current, '%s/%d.fasta' % (directory, i - burnin), 'fasta')
    return float(acceptances) / length, start
def filter_alignment(args):
    """Keep only the biallelic columns of an alignment and write the result.

    Reads ``args.input`` (format ``args.type``). Every column that passes
    ``is_biallelic`` is expanded with ``diploidify`` and screened by
    ``filter_cv``; surviving columns are appended to each sequence either
    unchanged (``args.skip``) or as the output of ``combine_cv``. Duplicate
    sequences are removed with ``remove_duplicate_seqs`` and the alignment
    is written to ``args.output`` in format ``args.outtype``.
    """
    source_path = args.input
    target_path = args.output
    source_format = args.type
    target_format = args.outtype
    variablesites = args.variable  # read but currently unused
    keep_raw_column = args.skip

    seqs = AlignIO.read(source_path, source_format)
    rows = [''] * len(seqs)
    for col in range(seqs.get_alignment_length()):
        baselist = list(seqs[:, col])
        if not is_biallelic(baselist):
            continue
        # With at most one mutation, an ambiguous site splits into a
        # constant and a variable base list (IUPAC expansion).
        c, v = diploidify(baselist)
        c, v = filter_cv(c, v, args)
        if c == v == None:
            continue
        combined = baselist if keep_raw_column else combine_cv(c, v)
        rows = [rows[j] + combined[j] for j in range(len(combined))]

    records = []
    for idx in range(len(seqs)):
        records.append(SeqRecord(Seq(rows[idx], IUPAC.unambiguous_dna),
                                 id=seqs[idx].id, description=''))
    deduplicated = remove_duplicate_seqs(MultipleSeqAlignment(records))
    AlignIO.write(deduplicated, target_path, target_format)
def biolikplot(alignment, num_imp, dem_ratios, length, threshold):
    """Track log-likelihood and clustering while re-imputing sequences.

    Statistics are recorded for the input alignment, for a copy with
    ``num_imp`` randomly chosen sequences removed, for the first imputed
    alignment, and then after every step of a Metropolis chain of
    ``length - 1`` proposals. The final state is written to
    ``<random int>.fasta`` in the current directory.

    Returns a 2-row array: log-likelihoods stacked on clustering values.
    """
    # NOTE(review): acceptances is counted but never returned or printed.
    acceptances = 0
    seq_len = len(alignment[0])
    al_len = len(alignment)
    clusters, logliks = [], []
    d = transprobs(TRANSITIONS, MARGINAL)
    #Get statistics for input alignment
    pd = pdn(alignment)
    # second-smallest entry of every row of pd, sorted ascending
    mins = np.array(sorted([sorted(i)[1] for i in pd]))
    clusters.append(clustering(seq_len, mins, threshold))
    logliks.append(loglik(alignment))
    print 'Original alignment (len %dx%d) has clustering %.2f and LLH %2f' % (len(alignment), len(alignment[0]), clusters[-1], logliks[-1])
    #Delete some sequences so we can re-impute for xval
    alignment = MultipleSeqAlignment(random.sample(alignment,len(alignment)-num_imp))
    #Get statistics for "deletions" alignment
    pd = pdn(alignment)
    mins = np.array(sorted([sorted(i)[1] for i in pd]))
    clusters.append(clustering(seq_len, mins, threshold))
    logliks.append(loglik(alignment))
    print 'Deleted alignment (len %dx%d) has clustering %.2f and LLH %2f' % (len(alignment), len(alignment[0]), clusters[-1], logliks[-1])
    pssm = SummaryInfo(alignment).pos_specific_score_matrix()
    # NOTE(review): al_len is the size of the alignment before the
    # deletion above -- confirm that is the intended denominator.
    probs = 1-np.array([max(pssm[i].values()) for i in xrange(seq_len)])/al_len
    #Weight site selection by empirical probability of mutation at that site
    probs /= sum(probs)
    # Build first state of Markov chain
    print 'Imputing first alignment...'
    current = impute.imp_align(num_imp, alignment, dem_ratios)
    current.loglik = loglik(current)
    current.distarray = np.array([list(s.seq) for s in current])
    current.pd = pdn(current)
    curmins = np.array(sorted([sorted(i)[1] for i in current.pd]))
    clusters.append(clustering(seq_len, curmins, threshold))
    logliks.append(current.loglik)
    print '\t Log likelihood %2f' % current.loglik
    # if not burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,0), 'fasta')
    # Run chain
    for i in xrange(1,length):
        proposal = propmat(current,num_imp,max(norm(loc=2,scale=1).rvs(),1), d, probs)[1]
        proposal.loglik = loglik(proposal)
        # Any off-diagonal entry of proposal.pd below 10 forces the
        # proposal's score to the most negative integer.
        for m,n in itertools.product(range(proposal.pd.shape[0]), range(proposal.pd.shape[1])):
            if (proposal.pd[m][n] < 10) and m!=n: proposal.loglik = -sys.maxint-1; print m,n, proposal.pd[m][n]
        p = proposal.loglik-current.loglik
        # NOTE(review): math.exp(p) raises OverflowError for large
        # positive p.
        print 'Current LLH: %2f; Proposed LLH: %2f; Acceptance probability %e' % (current.loglik, proposal.loglik, math.exp(p))
        if random.random()<math.exp(p):
            current = proposal
            acceptances += 1
            print '\tAccepted'
        else:
            print '\tNot accepted'
        # record the statistics of whichever state is current after the step
        curmins = np.array(sorted([sorted(i)[1] for i in current.pd]))
        clusters.append(clustering(seq_len, curmins, threshold))
        logliks.append(current.loglik)
        # if i > burnin:
        # AlignIO.write(current, '%s/%d.fasta' % (directory,i-burnin), 'fasta')
    # random file name for the final state
    r=random.randint(0,1000000)
    print r
    AlignIO.write(current, '%d.fasta'%r, 'fasta')
    return np.vstack((logliks,clusters))
def main():
    """Split every FASTA alignment in ``args.in_dir`` into two row subsets.

    For each non-hidden file, rows 0, 1, 6, 7, 8, 9, 14 and 15 are written
    with the prefix ``args.out_dir1`` + ``trans_`` and rows 2-5 and 10-13
    with the prefix ``args.out_dir2`` + ``cis_``. Both subsets are also
    printed.
    """
    args = get_args()
    # Resolve the input directory before changing into it; otherwise a
    # relative in_dir would be looked up again relative to itself.
    in_dir = os.path.abspath(args.in_dir)
    # skip hidden files
    names = [name for name in os.listdir(in_dir) if not name.startswith('.')]
    # Kept from the original: relative output prefixes resolve against
    # the input directory.
    os.chdir(in_dir)

    trans_extra_rows = (6, 7, 8, 9, 14, 15)
    cis_extra_rows = (10, 11, 12, 13)
    for name in names:
        print(name)
        alignment = AlignIO.read(os.path.join(in_dir, name), "fasta")
        alignment1 = alignment[:2, :]
        for row in trans_extra_rows:
            alignment1.append(alignment[row, :])
        alignment2 = alignment[2:6, :]
        for row in cis_extra_rows:
            alignment2.append(alignment[row, :])
        print(alignment1)
        print(alignment2)
        # NOTE(review): the output prefixes are concatenated as given, so
        # they need a trailing path separator to name a directory.
        AlignIO.write(alignment1, "{0}trans_{1}".format(args.out_dir1, name), "fasta")
        AlignIO.write(alignment2, "{0}cis_{1}".format(args.out_dir2, name), "fasta")
def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
    """Copy ``self.input_filename`` to ``output_filename`` without the
    excluded taxa.

    Taxa reported by ``self.taxa_missing_too_much_data()`` are always
    dropped; when ``remove_identical_sequences`` is at least 1 the taxa from
    ``self.taxa_of_duplicate_sequences()`` are dropped as well. Exits the
    process if one or zero sequences remain.

    Returns the list of taxa that were excluded.
    """
    if remove_identical_sequences < 1:
        taxa_to_remove = self.taxa_missing_too_much_data()
    else:
        taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()

    with open(self.input_filename) as input_handle, \
            open(output_filename, "w+") as output_handle:
        kept_records = []
        for alignment in AlignIO.parse(input_handle, "fasta"):
            for record in alignment:
                if record.id in taxa_to_remove:
                    continue
                kept_records.append(record)
        if len(kept_records) <= 1:
            sys.exit("Not enough sequences are left after removing duplicates.Please check you input data.")
        AlignIO.write(MultipleSeqAlignment(kept_records), output_handle, "fasta")
    return taxa_to_remove
def main():
    """Drop sequences that fail the terminal-gap checks, trim, and write.

    Records of the module-level ``infile`` are kept only when
    ``leftGapOperations`` returns ``True`` and ``rightGapOperations`` does
    not return ``False``. The ids of rejected records go to
    ``<outfile stem>.rem``; the kept records are trimmed with
    ``trimSelection`` and written as FASTA to the module-level ``outfile``.
    """
    alignment = AlignIO.read(infile, 'fasta')
    kept_records = []
    removed_ids = []
    for record in alignment:
        # The two checks are deliberately compared exactly as in the
        # original (`!= True` on the left, `== False` on the right).
        if leftGapOperations(record) != True:
            removed_ids.append(record.id)
            continue
        if rightGapOperations(record) == False:
            removed_ids.append(record.id)
            continue
        kept_records.append(record)

    removed_outfile_name = outfile.split('.')[0] + ".rem"
    with open(removed_outfile_name, 'w') as removed_handle:
        removed_handle.write('\n'.join(removed_ids))

    new_align = MultipleSeqAlignment(kept_records, alphabet=IUPAC.extended_dna)
    trim_align = trimSelection(new_align)
    print("Trimmed %i left and %i right" % (getLeftTerminalCutoff(), getRightTerminalCutoff()*-1))
    print("Removed %i sequences due to exceeding gap limits" % (len(removed_ids)))
    AlignIO.write(trim_align, outfile, 'fasta')
def convert(infile, type, outtype, outfile):
    """Convert an alignment file between formats with Bio.AlignIO.

    ``infile`` is parsed as format ``type`` and every alignment found is
    written to ``outfile`` in format ``outtype``.
    """
    from Bio import AlignIO

    AlignIO.write(AlignIO.parse(infile, type), outfile, outtype)
def constructor(self, recalculate):
    """Write the MSA restricted to the columns where QUERY has a residue.

    The alignment is read from the file supplied by the file manager for
    ``pdb_chain_msa_file_wrapper(self.params)``. Every column in which the
    sequence with id ``'QUERY'`` is a gap is dropped, and the remaining
    columns are written as FASTA to ``self.get_file_location()``.

    Raises:
        ValueError: if no sequence has the id ``'QUERY'`` or that sequence
            consists only of gaps.
    """
    f = global_stuff.the_file_manager.get_file_handle(pdb_chain_msa_file_wrapper(self.params), recalculate)
    msa = AlignIO.read(f.name, 'fasta')
    # search for the query sequence
    idx = -1
    for row, record in enumerate(msa):
        if record.id == 'QUERY':
            idx = row
            break
    if idx == -1:
        # Previously idx stayed -1 and the last sequence was silently used
        # as the query.
        raise ValueError("no sequence with id 'QUERY' in %s" % f.name)
    # find the first non-insertion column
    length = msa.get_alignment_length()
    i = 0
    while i < length and msa[idx, i] == '-':
        i = i + 1
    if i == length:
        raise ValueError("QUERY sequence in %s contains only gaps" % f.name)
    to_return = msa[:, i:(i+1)]
    print('EEEEEEEEEEEEEEE')
    # add in all the other columns
    for k in range(i+1, length):
        if msa[idx, k] != '-':
            to_return = to_return + msa[:, k:(k+1)]
    # the context manager flushes and closes the output file
    with open(self.get_file_location(), 'w') as out_handle:
        AlignIO.write(to_return, out_handle, 'fasta')
def save(cls, alignments, filename, schema=None):
    """Write ``alignments`` to ``filename``; report failure instead of raising.

    The output format is ``cls.schema(filename, schema)``.

    Returns:
        bool: ``True`` on success; ``False`` if anything raised, in which
        case the error is printed.
    """
    try:
        AlignIO.write(alignments, filename, cls.schema(filename, schema))
        return True
    # "except ... as e" is accepted by Python 2.6+ and Python 3; the old
    # comma form is a syntax error on Python 3.
    except Exception as e:
        print('Unable to save alignments to: %s\n%s' % (filename, str(e)))
        return False
def write_alignment(self, filename, file_format, interleaved=None):
    """Write ``self._msa`` to ``filename`` with Bio.AlignIO.

    A request for ``'phylip'`` is written as ``'phylip-relaxed'``; every
    other format name is passed through. ``interleaved`` is accepted but
    not used.
    """
    target_format = 'phylip-relaxed' if file_format == 'phylip' else file_format
    AlignIO.write(self._msa, filename, target_format)
def main():
    """Relabel Clustal alignments with full FASTA descriptions.

    Each non-blank line of ``indexfile.txt`` names a FASTA file and a
    Clustal alignment file. Alignment rows whose id matches a FASTA record
    id get that record's description as their new id, and the alignment is
    written to ``new_<alignment file>``.
    """
    with open('indexfile.txt', 'r') as indexfile:
        for line in indexfile:
            files = line.split()
            if not files:
                # a blank line used to raise IndexError
                continue
            seqs = SeqIO.to_dict(SeqIO.parse(files[0], 'fasta'))
            align = AlignIO.read(files[1], 'clustal')
            # record id -> full description line
            name_idx = {}
            for s in seqs:
                name_idx[s] = seqs[s].description
            # alignment row id -> row index (the last row wins on duplicates)
            aln_dict = {}
            for x in range(len(align)):
                aln_dict[align[x].id] = x
            for sname in name_idx:
                if sname in aln_dict:
                    align[aln_dict[sname]].id = name_idx[sname]
            with open('new_' + files[1], "w") as newalign:
                AlignIO.write(align, newalign, 'clustal')
def tree(alignment, run_id = 'T%05i' % (0,), bionj = False):
    """Build a tree for ``alignment`` by running phyml.

    The alignment is written in phylip format to ``infile<run_id>`` inside
    ``config.dataPath('phyml')`` (created if missing), phyml is run on it,
    and ``<infile>_phyml_tree.txt`` is read back as newick.

    Args:
        alignment: alignment to pass to phyml.
        run_id: suffix that makes the working file names unique.
        bionj: selects ``-o n`` instead of ``-o tlr`` on the phyml command
            line.

    Returns:
        the tree returned by ``phylo.read``.
    """
    old_cwd = os.getcwd()
    new_wd = config.dataPath('phyml')
    if not os.path.isdir(new_wd):
        os.mkdir(new_wd)
    os.chdir(new_wd)
    try:
        infilepath = 'infile{0}'.format(run_id)
        with open(infilepath, 'w') as infile:
            aio.write(alignment, infile, 'phylip')
        command = 'phyml --quiet -i {0} -o {1} '.format(infilepath, 'n' if bionj else 'tlr')
        print(command)
        # Discard phyml's stdout. An unread PIPE with subprocess.call
        # blocks the child once the pipe buffer fills.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(command, shell = True, stdout = devnull)
        treefilepath = infilepath + '_phyml_tree.txt'
        with open(treefilepath) as treefile:
            result = phylo.read(treefile, 'newick')
    finally:
        # restore the caller's working directory even when phyml fails
        os.chdir(old_cwd)
    return result
def get_newick_tree(self):
    """Return the output of ``quicktree`` for this alignment.

    quicktree expects a stockholm format input file. If ``self.local_file``
    has a name and ``self.format`` is ``"stockholm"`` that file is used
    directly; otherwise ``self.biopy_alignment`` is first written to a
    temporary stockholm file.

    Returns:
        str: whatever quicktree wrote to its standard output.
    """
    temp = None
    try:
        if self.local_file.name and self.format == "stockholm":
            fname = self.local_file.path
        else:
            temp = tempfile.NamedTemporaryFile()
            print("writing stockholm format file...")
            AlignIO.write([self.biopy_alignment], temp, "stockholm")
            temp.flush()
            fname = temp.name
        print("opening quicktree on stockholm format file %s" % fname)
        # subprocess.Popen hangs the Django dev server, hence os.popen
        quicktree_out = os.popen('quicktree %s' % fname)
        # there should be some elementary error checking here...
        try:
            newick_tree = quicktree_out.read()
        finally:
            quicktree_out.close()
        print("quicktree finished")
    finally:
        # The temporary file is deleted when closed, so it is closed only
        # after quicktree's output has been read (or an error occurred).
        if temp is not None:
            temp.close()
    return newick_tree
def main():
    """Strip the file-name prefix from taxon names in nexus alignments.

    For every file from ``get_files(args.input)`` the leading
    ``<file stem>`` plus any following underscores is removed from each
    sequence name; the renamed sequences are written as nexus to
    ``args.output`` under the same base name. After each file the number of
    distinct taxon names seen so far must equal ``args.taxa``.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set()
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        fname = os.path.splitext(os.path.basename(f))[0]
        # Escape the stem: file names routinely contain regex
        # metacharacters such as '.' or '+'.
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # the context manager flushes and closes the output file
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): debugging hook kept from the original; it
            # blocks non-interactive runs.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
def main(args):
    """Remove gap-only columns from the FASTA alignment ``args.fasta``.

    The result is written to ``<basename of args.fasta>_degapped.fasta`` in
    the current directory; progress is reported through ``logging``.
    """
    with open(args.fasta, 'r') as handle:
        align = AlignIO.read(handle, "fasta")

    def is_gap_only(column):
        return column == '-' * len(column)

    old_length = align.get_alignment_length()
    logging.info('Examining {} columns of aligned fasta file'.format(old_length))
    to_delete = [pos for pos in range(old_length) if is_gap_only(align[:, pos])]

    if to_delete:
        logging.info('Removing {} gap-only columns: {}'.format(len(to_delete), to_delete))
    # Work from the right so earlier positions stay valid after each cut.
    for pos in reversed(to_delete):
        align = align[:, :pos] + align[:, pos+1:]

    new_length = align.get_alignment_length()
    logging.info('Done! Old length: {} New length: {} Difference: {}'.
                 format(old_length, new_length, old_length-new_length))

    output_filename = os.path.basename(args.fasta) + '_degapped.fasta'
    with open(output_filename, 'w') as handle:
        AlignIO.write(align, handle, "fasta")
def add(alignment, sequence, timeout, logger, wd, threads):
    """Align sequence(s) to an alignment using mafft (external program).

    The alignment and the sequence(s) are written to FASTA files inside
    ``wd``, ``mafft --add`` is run there with a timeout, and the result is
    read back.

    Args:
        alignment: existing alignment.
        sequence: sequence record(s) to add.
        timeout: passed to ``TerminationPipe``.
        logger: receives mafft's output on failure.
        wd: working directory for the temporary files.
        threads: value for mafft's ``--thread`` option.

    Returns:
        the new alignment, or the result of ``genNonAlignment`` if the
        command was reported as failed by the pipe.

    Raises:
        MafftError: if mafft's output cannot be read as FASTA.
    """
    alignment_file = "alignment_in.fasta"
    sequence_file = "sequence_in.fasta"
    output_file = "alignment_out.fasta" + ".fasta"
    command_line = "{0} --auto --thread {1} --add {2} {3} > {4}".format(
        mafft, threads, sequence_file, alignment_file, output_file
    )
    alignment_path = os.path.join(wd, alignment_file)
    sequence_path = os.path.join(wd, sequence_file)
    output_path = os.path.join(wd, output_file)

    with open(sequence_path, "w") as handle:
        SeqIO.write(sequence, handle, "fasta")
    with open(alignment_path, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")

    pipe = TerminationPipe(command_line, timeout=timeout, cwd=wd)
    pipe.run()
    os.remove(alignment_path)
    os.remove(sequence_path)

    if pipe.failure:
        logger.debug(".... add timeout ....")
        # get_alignment_length() already returns the column count; taking
        # len() of it raised TypeError on this path.
        return genNonAlignment(len(alignment) + 1,
                               alignment.get_alignment_length())
    try:
        res = AlignIO.read(output_path, "fasta")
    except Exception:
        logger.info(pipe.output)
        raise MafftError()
    os.remove(output_path)
    return res
def split_family_seqs():
    """Split ``rfam/Rfam.seed`` into one FASTA and one pickle per family.

    For every alignment in the seed file, the leading ``#`` lines are
    collected into a dict (characters 5-6 of the line are the key, the text
    from column 8 onward is appended to the value). The alignment is written
    to ``rfam/family_alis/<AC>.fa`` and the dict is pickled to
    ``rfam/family_metas/<AC>.pickle``, where ``<AC>`` is the stripped value
    collected under the key ``'AC'``.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')
    fopen = open(cfg.dataPath('rfam/Rfam.seed'))
    try:
        alis = aio.parse(fopen, 'stockholm')
        while 1:
            infos = {}
            # Peek at the header lines of the next record, then rewind so
            # the parser sees the record from its start.
            start = fopen.tell()
            while 1:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                else:
                    if l.strip() != '':
                        break
            fopen.seek(start)
            # next(..., None) ends the loop cleanly at end of input; the
            # bare .next() call raised StopIteration instead.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # pickle streams are binary data
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'wb') as metafile:
                pickle.dump(infos, metafile)
    finally:
        fopen.close()
def load_tree(seqfname):
    """Load an alignment, build & prep a tree, return the tree object.

    ``.aln`` input is read as a Clustal alignment; ``.fasta`` input is
    aligned with MAFFT first. The tree is built by fasttree from the blocks
    selected by ``alnutils.blocks(aln, 0.4)``. Internal splits whose
    confidence is below the mean confidence are collapsed, the tree is
    ladderized and the root branch length is set to 0.

    Raises:
        ValueError: if ``seqfname`` ends in neither ``.aln`` nor ``.fasta``.
    """
    if seqfname.endswith('.aln'):
        aln = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Run MAFFT quickly
        alndata = subprocess.check_output(['mafft', '--quiet', '--auto',
                                           seqfname])
        aln = AlignIO.read(StringIO(alndata), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")

    # Use conserved (less-gappy) blocks to build the tree
    aln = alnutils.blocks(aln, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        AlignIO.write(aln, tmp, 'fasta')
        # fasttree opens the file by name, so the data must be on disk
        tmp.flush()
        treedata = subprocess.check_output(['fasttree', '-pseudo', '-gamma',
                                            '-wag', tmp.name])
    tree = Phylo.read(StringIO(treedata), 'newick')

    # Collapse weakly supported splits
    confs = [c.confidence for c in tree.find_clades()
             if c.confidence is not None]
    # A tree without any confidence values has nothing to average;
    # dividing by len(confs) would raise ZeroDivisionError.
    if confs:
        # ENH: accept min_confidence as an option
        min_confidence = math.fsum(confs) / len(confs)
        # NOTE(review): clades whose confidence is None are still passed
        # to this comparison -- confirm how they should be treated.
        tree.collapse_all(lambda c: c.confidence < min_confidence)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written as relaxed phylip to ``<work_dir>/work/aln.phy``
    and RAxML is run in ``<work_dir>/work/raxml_work``, which is removed
    afterwards. Extra keyword arguments go to ``RaxmlCommandline``.

    Returns ``(outfile, phy_file)`` on success and ``None`` if RAxML did
    not produce ``RAxML_bipartitions.aln``.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close the file explicitly so the data is on disk before RAxML
    # opens it by name.
    with open(phy_file, "w") as phy_handle:
        AlignIO.write(alignment, phy_handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"),
                                sequences=phy_file, algorithm="a",
                                model="GTRGAMMA",
                                parsimony_seed=12345,
                                rapid_bootstrap_seed=12345,
                                num_replicates=100, name="aln",
                                working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
def writing(seqs,seq_descs,seq_ids, filename):
    """Write the given sequences as a FASTA alignment.

    The file is ``<sys.argv[3]>/<sys.argv[1]>/<filename>.output``; missing
    directories are created.

    Args:
        seqs: sequence strings.
        seq_descs: description for each sequence (same order as ``seqs``).
        seq_ids: id for each sequence (same order as ``seqs``).
        filename: stem of the output file.
    """
    outdir = sys.argv[3]  # Output directory
    if os.path.isdir(outdir):
        print("Directory exists. New directory not created")
    # outpath defines path of the subfolder we want to store results in
    outpath = outdir + '/' + sys.argv[1]
    # Create the directories directly instead of shelling out to `mkdir`
    # with a concatenated command string.
    try:
        os.makedirs(outpath)
    except OSError:
        # already existing is fine; anything else is a real error
        if not os.path.isdir(outpath):
            raise
    # write the result to output
    output_file = outpath + '/' + filename + '.' + 'output'
    align = MultipleSeqAlignment([])
    for i, seq in enumerate(seqs):
        align.append(SeqRecord(Seq(seq, generic_protein), id=seq_ids[i], description=seq_descs[i]))
    AlignIO.write(align, output_file, "fasta")
def __init__(self,aln,treef,cmd=None):
    """Prepare the alignment and tree files for a rate4site run.

    Args:
        aln: path of a FASTA alignment, or an alignment object (which is
            written to a temporary FASTA file).
        treef: a ``dendropy.Tree`` or the path of a newick tree file.
        cmd: executable name; defaults to ``rate4site64`` when the
            platform integer size is 64 bit, else ``rate4site``.

    Raises:
        ValueError: if ``treef`` is neither a tree nor an existing file.
    """
    # os.path.isfile raises TypeError for objects that are not path-like,
    # which used to make passing an alignment object impossible.
    try:
        aln_is_path = os.path.isfile(aln)
    except TypeError:
        aln_is_path = False
    if aln_is_path:
        self.alnfile = aln
        with open(self.alnfile) as handle:
            self.aln = AlignIO.read(handle, 'fasta')
    else:
        self.aln = aln
        self.alnfile = tempfile.NamedTemporaryFile()
        AlignIO.write(aln, self.alnfile, 'fasta')
        self.alnfile.flush()
    if not cmd:
        import sys
        # sys.maxint only exists on Python 2; fall back to sys.maxsize
        if getattr(sys, 'maxint', sys.maxsize) == 9223372036854775807:  # 64 bit
            cmd = 'rate4site64'
        else:
            cmd = 'rate4site'
    if isinstance(treef, dendropy.Tree):
        parent_tree = treef
    elif os.path.isfile(treef):
        parent_tree = dendropy.Tree.get_from_path(treef, 'newick')
    else:
        # previously fell through and failed later on an unbound name
        raise ValueError("treef must be a dendropy.Tree or the path of an "
                         "existing newick file: %r" % (treef,))
    self.tree = narrow_tree(parent_tree, self.aln)
    self.treefile = tempfile.NamedTemporaryFile()
    # The first five characters of dendropy's newick string are dropped
    # (presumably the rooting prefix such as "[&R] " -- confirm).
    self.treefile.write(self.tree.as_string('newick', internal_labels=False)[5:])
    self.treefile.flush()
    self.cmd = cmd
def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
    """Copy a FASTA alignment, leaving out sequences with too many gaps.

    A sequence is excluded when it is empty or when the percentage of
    ``n``, ``N`` and ``-`` characters exceeds ``filter_percentage``. Exits
    the process if one or zero sequences remain. ``verbose`` is accepted
    but not used.

    Returns the ids of the excluded sequences.
    """
    input_handle = open(input_filename, "rU")
    output_handle = open(output_filename, "w+")
    kept_records = []
    taxa_removed = []
    for alignment in AlignIO.parse(input_handle, "fasta"):
        for record in alignment:
            number_of_gaps = sum(record.seq.count(symbol) for symbol in ('n', 'N', '-'))
            sequence_length = len(record.seq)
            if sequence_length == 0:
                taxa_removed.append(record.id)
                print("Excluded sequence " + record.id + " because there werent enough bases in it")
                continue
            if (number_of_gaps*100/sequence_length) <= filter_percentage:
                kept_records.append(record)
                continue
            taxa_removed.append(record.id)
            print("Excluded sequence " + record.id + " because it had " + str(number_of_gaps*100/sequence_length) +" percentage gaps while a maximum of "+ str(filter_percentage) +" is allowed")

    if len(kept_records) <= 1:
        sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

    AlignIO.write(MultipleSeqAlignment(kept_records), output_handle, "fasta")
    output_handle.close()
    input_handle.close()
    return taxa_removed
def read_alignment(alignment, informat, outformat, start, stop):
    """Extract columns ``start:stop`` of an alignment file into a new file.

    The sub-alignment is written next to the input as
    ``<input stem>_pos<start>to<end>.<outformat>``, where ``<end>`` is
    ``stop`` capped at the alignment length, and a preview is printed.
    Exits the process when ``stop < start`` or ``start < 0``.
    """
    align = AlignIO.read(alignment, informat, alphabet=generic_dna)
    out_basename = os.path.splitext(alignment)[0]
    algn_length = align.get_alignment_length()
    print("\nInput alignment is " + str(algn_length) + " characters.")

    end_pos = stop
    beyond_end = stop > algn_length
    if beyond_end:
        print("\nNB: you have requested an end position beyond the "
              "length of the alignment. ")
        end_pos = algn_length

    bad_range = stop < start or start < 0
    if bad_range:
        print("\nFatal: your begin and end positions need re-assessment."
              " Exiting now.")
        print("")
        sys.exit()

    outname = "%s_pos%sto%s.%s" % (out_basename, str(start), str(end_pos), outformat)
    with open(outname, "w") as output_handle:
        # slicing past the end is clamped, so `stop` can be used as given
        algn = align[:, start:stop]
        AlignIO.write(algn, output_handle, outformat)

    summary = ("\nExtracted " + outformat + "-formatted sub-alignment from "
               "positions " + str(start) + " to " + str(end_pos)
               + " and written it to " + outname + ". Here is a preview:")
    print(summary)
    print("")
    print(algn)
    print("")
# Expected - check the error assert "Repeated name 'longsequen'" in str(e) check_phylip_reject_duplicate() #Check parsers can cope with an empty file for t_format in AlignIO._FormatToIterator: handle = StringIO() alignments = list(AlignIO.parse(handle, t_format)) assert len(alignments) == 0 #Check writers can cope with no alignments for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter): handle = StringIO() assert 0 == AlignIO.write([], handle, t_format), \ "Writing no alignments to %s format should work!" \ % t_format #Check writers reject non-alignments list_of_records = list(AlignIO.read(open("Clustalw/opuntia.aln"),"clustal")) for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter): handle = StringIO() try: AlignIO.write([list_of_records], handle, t_format) assert False, "Writing non-alignment to %s format should fail!" \ % t_format except (TypeError, AttributeError, ValueError): pass del handle del list_of_records, t_format
# Run ClustalW; it returns the text it wrote to stdout and stderr.
clustaltextout, clustaltexterr = clustalw_cline()
if len(clustaltexterr) > 0:
    print("error:\n%s"%(clustaltexterr))
    # Exit with a failure status. The bare exit() helper is provided by
    # the site module for interactive use and reported success here.
    raise SystemExit(1)
print ("clustalw output:\n %s"%(clustaltextout))

# read in the alignment file and create a MultipleSeqAlignment object
clustalalignment = AlignIO.read("p53_homologous.aln", "clustal")

# write the alignment to a format that can be read by PhyML
alignout_filename = "p53_homologous.out"
AlignIO.write(clustalalignment,alignout_filename,"phylip-relaxed")

print("Making tree (takes around 75 seconds): ")
# specify the location of the phyml executable (this depends on your machine)
phyml_exe_path = r"D:\SCHOOL\fall 2020\Biological Models in Python\Week 7\PhyML-3.1_win32.exe"
# optional check to see if that path exists
assert os.path.isfile(phyml_exe_path), "PhyML executable missing"

# create an instance of a Bio.AlignApplication that can be called like a
# function and runs phyml
phymlcmd = PhymlCommandline(cmd=phyml_exe_path,input=alignout_filename)
phymltextout,phymltexterr = phymlcmd()
print ("PhyML output:\n%s"%(phymltextout))
def simplex(params, out_prefix=None, yule=True, n_model=5, n_seqgen=5, JC=False, alphabet='nuc_nogap', alpha=1.0, rate_alpha=1.5, W_dirichlet_alpha=2.0):
    """Generate a tree and random GTR model with frequency parameters
    sampled from a Dirichlet distribution on the simplex

    One tree is generated; for it ``n_model`` models are drawn and for each
    model sequences are evolved ``n_seqgen`` times. ``params`` is modified
    in place: the keys ``'model'`` and ``'seqgen'`` are set to the current
    loop indices (they are passed on to the file-name helpers).

    Parameters
    ----------
    params : dict
        dictionary with parameters of the evolutionary process, sample size
        etc. The keys read here are ``'n'``, ``'L'`` and ``'m'``.
    out_prefix : None, optional
        save the generated data using this prefix and otherwise
        standardized file names
    yule : bool, optional
        generate a Yule tree instead of a Kingman Coalescent tree
    n_model : int, optional
        number of distinct models to draw for each tree
    n_seqgen : int, optional
        number of times sequences are evolved for each tree/model
        combination
    JC : bool, optional
        Use a Jukes Cantor model for the preference but include rate
        variation
    alphabet : str, optional
        alphabet of the GTR model
    alpha : float, optional
        parameter of the Dirichlet distribution for frequencies
    rate_alpha : float, optional
        parameter of the rate distribution (Gamma)
    W_dirichlet_alpha : float, optional
        parameter of the Dirichlet distribution of W matrix elements
    """
    from Bio import AlignIO
    # generate a model
    # NOTE: betatree is called with a fixed alpha=2.0; the `alpha`
    # argument of this function is only used for pi_dirichlet_alpha below.
    T = betatree(params['n'], alpha=2.0)
    T.yule = yule
    T.coalesce()
    # ladderize the tree and name internal nodes via loading into TreeAnc
    T.BioTree.ladderize()
    tt = TreeAnc(tree=T.BioTree)
    if out_prefix:
        Phylo.write(tt.tree, tree_name(out_prefix, params), 'newick')
    for mi in range(n_model):
        params['model'] = mi
        if JC:
            # Dirichlet sampling of pi and W is switched off; only the
            # rate variation is drawn.
            myGTR = GTR_site_specific.random(L=params['L'], alphabet=alphabet, pi_dirichlet_alpha=False, W_dirichlet_alpha=False, mu_gamma_alpha=rate_alpha)
        else:
            myGTR = GTR_site_specific.random( L=params['L'], alphabet=alphabet, pi_dirichlet_alpha=alpha, mu_gamma_alpha=rate_alpha, W_dirichlet_alpha=W_dirichlet_alpha)
        # scale the rates by params['m']
        myGTR.mu *= params['m']
        if out_prefix:
            save_model(myGTR, model_name(out_prefix, params))
        for si in range(n_seqgen):
            params['seqgen'] = si
            # generate sequences
            mySeq = SeqGen(params['L'], gtr=myGTR, tree=T.BioTree)
            mySeq.evolve()
            if out_prefix:
                save_mutation_count(mySeq, mutation_count_name(out_prefix, params))
                with open(alignment_name_raw(out_prefix, params), 'wt') as fh:
                    AlignIO.write(mySeq.get_aln(), fh, 'fasta')
                reconstruct_tree(out_prefix, params, aa='aa' in alphabet)
                # compress the raw alignment once the tree has been
                # reconstructed from it
                os.system('gzip ' + alignment_name_raw(out_prefix, params))
from Bio import AlignIO
import sys

# Convert the FASTA alignment(s) in sys.argv[1] to phylip in sys.argv[2].
# The context managers close both files even if the conversion raises.
# NOTE(review): the "rU" mode is kept from the original; it was removed in
# Python 3.11, so use "r" if this must run there.
with open(sys.argv[1], "rU") as input_handle:
    with open(sys.argv[2], "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "fasta")
        AlignIO.write(alignments, output_handle, "phylip")