def align(seqs):
    '''
    Multiple sequence alignment using ClustalW.

    The sequences are written to a scratch FASTA file in the current
    working directory, ClustalW is run on it through ``MultipleAlignCL`` /
    ``do_alignment``, and the aligned rows are returned as plain strings.
    The scratch files are removed afterwards, whether or not the
    alignment succeeded.

    @param seqs: list of biological sequences
    @type seqs: list of strs

    @return: aligned sequences
    @rtype: list of strs
    '''
    # NOTE(review): the base name only has one-second resolution, so two
    # calls within the same second (or from two processes sharing a working
    # directory) would use the same scratch files.
    name = 'tmpFoo%d' % time.time()
    fasta_path = name + '.fasta'
    try:
        tfile = open(fasta_path, 'w')
        try:
            # Records are named 1..N; only their order matters to the caller.
            for i, seq in enumerate(seqs):
                tfile.write('>%d\n' % (i + 1))
                tfile.write('%s\n' % seq)
        finally:
            # Close (and thereby flush) before ClustalW reads the file.
            tfile.close()

        cline = MultipleAlignCL(fasta_path)
        cline.set_output(name + '.aln')
        alignment = do_alignment(cline)
        return [rec.seq.tostring() for rec in alignment.get_all_seqs()]
    finally:
        # Best-effort cleanup: when ClustalW fails early the .dnd/.aln files
        # may not exist, and a failing unlink here must not hide the error
        # that is already propagating.
        for ext in ('.fasta', '.dnd', '.aln'):
            try:
                os.unlink(name + ext)
            except OSError:
                pass
def align(self):
    """Fetch batches of sequence pairs from the server, align, report scores.

    Repeatedly asks ``self.phamServer`` for up to ``self.numSeqs`` pairs.
    Each pair is written to ``temp<i>.fasta`` under ``self.rootdir`` and
    submitted to a Parallel Python job server that runs ClustalW on it.
    The score reported for a pair is the number of ``*`` columns in the
    alignment's star line divided by the alignment length.  The method
    returns once the server hands back an empty batch.
    """
    # One job server for the whole session: creating a new pp.Server for
    # every batch starts a fresh set of worker processes each time.
    ppservers = ()  # no remote parallel python servers; local workers only
    job_server = pp.Server(ppservers=ppservers)

    seqs = True
    while seqs:
        seqs = self.phamServer.request_seqs(self.server, self.numSeqs,
                                            self.client)

        # Desktop notification is optional; never let it break the client.
        try:
            import pynotify
            if pynotify.init("Phamerator"):
                n = pynotify.Notification(
                    "Phamerator Update",
                    "doing clustalw alignments",
                    "file:///home/steve/Applications/git/PhamDB/phageManager_logo.png")
                n.show()
        except Exception:
            pass

        if not seqs:
            self._logger.log('server returned no sequences to align.')
            return
        self._logger.log('server returned ' + str(len(seqs)) + ' to align')
        self._logger.log('aligning sequences')

        numcpus = job_server.get_ncpus()
        print("numcpus = %s" % (numcpus,))

        # Submit every pair first so the workers can run concurrently ...
        jobs = []
        for i, currentseq in enumerate(seqs):
            pair_id, querySeq, subjectSeq = currentseq
            fasta_path = os.path.join(self.rootdir, 'temp' + str(i) + '.fasta')
            f = open(fasta_path, 'w')
            try:
                f.write('>' + 'a' + '\n' + querySeq + '\n>' + 'b' + '\n'
                        + subjectSeq + '\n')
            finally:
                f.close()
            cline = MultipleAlignCL(fasta_path)
            cline.set_output(
                os.path.join(self.rootdir, 'temp' + str(i) + '.aln'))
            jobs.append(job_server.submit(
                clustalwAligner.run_clustalw,
                (pair_id, cline),
                (),
                ("clustalwAligner.run_clustalw", "Bio.Clustalw",)))

        # ... then collect; calling a job blocks until its result is ready.
        results = []
        for job in jobs:
            pair_id, alignment = job()
            length = alignment.get_alignment_length()
            star = alignment._star_info.count('*')
            score = float(star) / length
            results.append((pair_id, score))

        self._logger.log('reporting scores back to server')
        self.phamServer.report_scores(results, self.server, self.client)
def Align(self, s1, s2, result):
    """Align two sequences with ClustalW and record the matched positions.

    ``result`` is cleared, then for every alignment column in which neither
    row holds ``N``, ``n`` or a gap, ``result.addPair(x1, x2, 0)`` is called
    with the ungapped 0-based positions of that column in ``s1`` and ``s2``.

    Both scratch files are removed before returning, including on the early
    return and when the alignment raises.

    Returns ``result``; it is left empty when ClustalW does not produce
    exactly two aligned sequences.
    """
    result.clear()

    handle_tmpfile1, filename_tmpfile1 = tempfile.mkstemp()
    handle_tmpfile2, filename_tmpfile2 = tempfile.mkstemp()
    try:
        try:
            os.write(handle_tmpfile1, ">s1\n%s\n" % (s1))
            os.write(handle_tmpfile1, ">s2\n%s\n" % (s2))
        finally:
            # The second file is only a reserved output path for ClustalW.
            os.close(handle_tmpfile1)
            os.close(handle_tmpfile2)

        cline = MultipleAlignCL(filename_tmpfile1)
        cline.set_output(filename_tmpfile2)
        align = do_alignment(cline)

        seqs = align.get_all_seqs()
        if len(seqs) != 2:
            return result

        a1 = seqs[0].seq.tostring()
        a2 = seqs[1].seq.tostring()

        # x1 / x2 track the position in the ungapped s1 / s2.
        x1 = 0
        x2 = 0
        for c1, c2 in zip(a1, a2):
            if c1 not in "Nn-" and c2 not in "Nn-":
                result.addPair(x1, x2, 0)
                x1 += 1
                x2 += 1
                continue
            # An N still consumes a residue of its sequence; a gap does not.
            if c1 != "-":
                x1 += 1
            if c2 != "-":
                x2 += 1

        return result
    finally:
        # Best-effort removal so a cleanup failure cannot mask an error
        # that is already propagating.
        for path in (filename_tmpfile1, filename_tmpfile2):
            try:
                os.remove(path)
            except OSError:
                pass
def align_seq_clustalw(sequences, mat='BLOSUM', output_order='INPUT'):
    """Return a clustal alignment of an arbitrary number of sequences.

    Requires clustalw on path.

    sequences    -- SeqRecord objects (at least two).  Records whose id is
                    "<unknown id>" are renamed in place, so the caller's
                    objects are modified.
    mat          -- protein matrix name; must be one of
                    MultipleAlignCL.PROTEIN_MATRIX (ValueError otherwise).
    output_order -- must be one of MultipleAlignCL.OUTPUT_ORDER
                    (ValueError otherwise).

    Raises subprocess.CalledProcessError when clustalw exits non-zero and
    AssertionError when the output does not hold exactly one alignment.
    """
    assert(len(sequences) > 1)

    import os
    import subprocess
    import tempfile
    from Bio.Clustalw import MultipleAlignCL
    # Imported by name: ``from Bio import ...`` does not bind ``Bio`` itself,
    # so spelling out Bio.AlignIO.ClustalIO.ClustalIterator relies on some
    # other import having done so.
    from Bio.AlignIO.ClustalIO import ClustalIterator
    from Bio.SeqIO.FastaIO import FastaWriter

    def write(f):
        # NOTE(review): the counter is taken to advance once per record
        # (positional ids); confirm against the original layout if ids of
        # partially-named inputs matter.
        for i, seqrecord in enumerate(sequences):
            if seqrecord.id == "<unknown id>":
                seqrecord.id = str(i)
        writer = FastaWriter(f)
        writer.write_file(sequences)
        f.flush()  # IMPORTANT: clustalw reads the file by name while it is open

    matrices = MultipleAlignCL.PROTEIN_MATRIX
    orders = MultipleAlignCL.OUTPUT_ORDER

    with tempfile.NamedTemporaryFile("w") as fasta_in:
        write(fasta_in)
        cline = MultipleAlignCL(fasta_in.name)
        # .index() raises ValueError for a name that is not in the list.
        cline.set_protein_matrix(matrices[matrices.index(mat)])
        cline.is_quick = False

        with tempfile.NamedTemporaryFile("w") as aln_out:
            cline.set_output(aln_out.name,
                             output_order=orders[orders.index(output_order)])

            # The command string is built from our own temp-file names and
            # validated option values, not from external input.
            devnull = open(os.devnull, "w")
            try:
                subprocess.check_call(str(cline), shell=True, stdout=devnull)
            finally:
                devnull.close()

            f = open(aln_out.name)
            try:
                alignments = list(ClustalIterator(f))
            finally:
                f.close()

            # There should only be one alignment in the file
            assert(len(alignments) == 1)
            return alignments[0]
def Align_Results(OutputFileName, results_dir=None):
    """Run ClustalW on ``<results_dir>/<OutputFileName>.FASTA``.

    The alignment is written to ``<results_dir>/<OutputFileName>.ALN`` and
    both paths are printed before the run.

    OutputFileName -- base name (no extension) of the FASTA file to align.
    results_dir    -- directory holding the FASTA file; defaults to the
                      original hard-coded Consensus_Results directory.

    Returns the alignment object produced by Clustalw.do_alignment.
    """
    import os
    from Bio import Clustalw
    from Bio.Clustalw import MultipleAlignCL

    if results_dir is None:
        results_dir = """/users/rwbarrettemac/bioinformatics/pythonfolders/FMDanalysisScript/FMDserotypingARRAY/Consensus_Results"""

    FileIN_Name = "%s/%s.FASTA" % (results_dir, OutputFileName)
    FileOUT_ALN = "%s/%s.ALN" % (results_dir, OutputFileName)
    print(FileIN_Name)
    print(FileOUT_ALN)

    # os.path.join leaves an absolute FileIN_Name untouched.
    cline = MultipleAlignCL(os.path.join(os.curdir, FileIN_Name))
    cline.set_output(FileOUT_ALN)

    # The command-line object holds no open resource, so there is nothing to
    # close afterwards; the former cline.close() call has been dropped.
    alignment = Clustalw.do_alignment(cline)
    return alignment
and parse the results into an object that can be dealt with easily.""" # standard library import os # biopython from Bio.Alphabet import IUPAC from Bio import Clustalw from Bio.Clustalw import MultipleAlignCL from Bio.Align import AlignInfo from Bio.SubsMat import FreqTable # create the command line to run clustalw # this assumes you've got clustalw somewhere on your path, otherwise # you need to pass a second argument to MultipleAlignCL with the complete # path to clustalw cline = MultipleAlignCL(os.path.join(os.curdir, 'opuntia.fasta')) cline.set_output('test.aln') # actually perform the alignment and get back an alignment object alignment = Clustalw.do_alignment(cline) # get the records in the alignment all_records = alignment.get_all_seqs() print 'description:', all_records[0].description print 'sequence:', all_records[0].seq # get the length of the alignment print 'length', alignment.get_alignment_length() print alignment
if "not found" not in output and "CLUSTAL" in output.upper() : clustalw_exe = "clustalw" if not clustalw_exe : raise MissingExternalDependencyError(\ "Install clustalw or clustalw2 if you want to use Bio.Clustalw.") ################################################################# print "Checking error conditions" print "=========================" print "Empty file" input_file = "does_not_exist.fasta" assert not os.path.isfile(input_file) cline = MultipleAlignCL(input_file, command=clustalw_exe) try : align = Clustalw.do_alignment(cline) assert False, "Should have failed, returned %s" % repr(align) except IOError, err : print "Failed (good)" #Python 2.3 on Windows gave (0, 'Error') #Python 2.5 on Windows gives [Errno 0] Error assert "Cannot open sequence file" in str(err) \ or "not produced" in str(err) \ or str(err) == "[Errno 0] Error" \ or str(err) == "(0, 'Error')", str(err) print print "Single sequence" input_file = "Fasta/f001"
def align(self):
    """Run the ClustalW client loop: fetch a work unit, align, report.

    Loops until the process is stopped.  Each work unit pairs one query
    translation with every record in ``clustalw_work_unit.database``.
    Each pair is written to a FASTA file under ``self.rootdir`` and
    submitted to the Parallel Python job server.  Jobs are handed to
    ``self.process_jobs`` in batches, together with the scratch files that
    belong to them.  All scores for a work unit are reported in one call;
    if that report fails the process exits.
    """
    ppservers = ()  # no remote parallel python servers; local workers only
    job_server = pp.Server(ppservers=ppservers, secret="secret")
    logo = os.path.join(os.path.dirname(__file__), "pixmaps/phamerator.png")

    def notify(message):
        # Desktop notification is optional; never let it break the client.
        try:
            import pynotify
            if pynotify.init("Phamerator"):
                n = pynotify.Notification("Phamerator Update", message,
                                          "file:///%s" % logo)
                n.show()
        except Exception:
            pass

    while True:
        print('getting work unit')
        try:
            clustalw_work_unit = self.phamServer.request_seqs(self.client)
            if not clustalw_work_unit.query_id:
                print('no work units available...sleeping')
                notify("No Clustalw alignments left to do...sleeping")
                time.sleep(30)
                continue
        except Exception as x:
            print(''.join(Pyro.util.getPyroTraceback(x)))
            # Without a fresh work unit there is nothing valid to align;
            # falling through would hit an unbound name on the first pass
            # or silently redo the previous unit.  Wait and ask again.
            time.sleep(30)
            continue

        # Only consumed by db_conf code that is currently disabled; the
        # request itself is kept.
        server, db = self.phamServer.request_db_info()
        print('got it')
        notify("Clustalw alignments in progress for id %s"
               % clustalw_work_unit.query_id)
        self._logger.log('aligning sequences')

        numcpus = job_server.get_ncpus()
        print("numcpus = %s" % (numcpus,))

        # NOTE(review): float() assumes a plain "X.YY" version string.
        new_biopython = float(Bio.__version__) >= 1.56

        results = []
        jobs = []
        open_files = []  # scratch files belonging to the pending jobs
        query_id = clustalw_work_unit.query_id
        query_translation = clustalw_work_unit.query_translation

        for record in clustalw_work_unit.database:
            subject_id, subject_translation = record.id, record.translation

            # Build the path once, always via str(), so the file that is
            # written is the file that is aligned even for non-string ids.
            base = os.path.join(
                self.rootdir,
                'temp' + str(query_id) + '_' + str(subject_id))
            clustalw_infile = base + '.fasta'

            f = open(clustalw_infile, 'w')
            try:
                f.write('>%s\n%s\n>%s\n%s\n' % (query_id, query_translation,
                                                subject_id,
                                                subject_translation))
            finally:
                f.close()
            open_files.append(clustalw_infile)
            open_files.append(base + '.dnd')
            open_files.append(base + '.aln')

            # pass the query id (qid) and the subject id (sid) to the runner
            if new_biopython:
                jobs.append(job_server.submit(
                    clustalwAligner.run_clustalw,
                    (clustalw_infile, query_id, subject_id), (), ()))
            else:
                cline = MultipleAlignCL(clustalw_infile)
                cline.set_output(base + '.aln')
                jobs.append(job_server.submit(
                    clustalwAligner.run_clustalw_old,
                    (query_id, subject_id, cline), (), ("Bio.Clustalw",)))

            # Three scratch files per job: drain once more than 50 pend.
            if len(open_files) > 50:
                results = self.process_jobs(jobs, results, open_files)
                jobs = []
                open_files = []

        # Drain whatever is left from the last partial batch.
        results = self.process_jobs(jobs, results, open_files)

        # must report everything back in atomic transaction
        print('reporting scores back to server')
        try:
            self.phamServer.report_scores(clustalw_work_unit, results,
                                          self.client)
        except Exception as x:
            print(''.join(Pyro.util.getPyroTraceback(x)))
            print('exiting on pyro traceback')
            sys.exit()
and "Multiple Sequence Alignments" in output: clustalw_exe = "clustalw" if not clustalw_exe: raise MissingExternalDependencyError(\ "Install clustalw or clustalw2 if you want to use Bio.Clustalw.") ################################################################# print "Checking error conditions" print "=========================" print "Empty file" input_file = "does_not_exist.fasta" assert not os.path.isfile(input_file) cline = MultipleAlignCL(input_file, command=clustalw_exe) try: align = Clustalw.do_alignment(cline) assert False, "Should have failed, returned %s" % repr(align) except IOError, err: print "Failed (good)" #Python 2.3 on Windows gave (0, 'Error') #Python 2.5 on Windows gives [Errno 0] Error assert "Cannot open sequence file" in str(err) \ or "not produced" in str(err) \ or str(err) == "[Errno 0] Error" \ or str(err) == "(0, 'Error')", str(err) print print "Single sequence" input_file = "Fasta/f001"
def align(self):
    """Run the ClustalW client loop: fetch a work unit, align, report.

    Loops until the process is stopped.  Each work unit pairs one query
    translation with every record in ``clustalw_work_unit.database``.
    Each pair is written to a FASTA file under ``self.rootdir`` and
    submitted to the Parallel Python job server.  Jobs are handed to
    ``self.process_jobs`` in batches, together with the scratch files that
    belong to them.  All scores for a work unit are reported in one call;
    if that report fails the process exits.
    """
    ppservers = ()  # no remote parallel python servers; local workers only
    job_server = pp.Server(ppservers=ppservers, secret="secret")
    logo = os.path.join(os.path.dirname(__file__), "pixmaps/phamerator.png")

    def notify(message):
        # Desktop notification is optional; never let it break the client.
        try:
            import pynotify
            if pynotify.init("Phamerator"):
                n = pynotify.Notification("Phamerator Update", message,
                                          "file:///%s" % logo)
                n.show()
        except Exception:
            pass

    while True:
        print('getting work unit')
        try:
            clustalw_work_unit = self.phamServer.request_seqs(self.client)
            if not clustalw_work_unit.query_id:
                print('no work units available...sleeping')
                notify("No Clustalw alignments left to do...sleeping")
                time.sleep(30)
                continue
        except Exception as x:
            print(''.join(Pyro.util.getPyroTraceback(x)))
            # Without a fresh work unit there is nothing valid to align;
            # falling through would hit an unbound name on the first pass
            # or silently redo the previous unit.  Wait and ask again.
            time.sleep(30)
            continue

        # Only consumed by db_conf code that is currently disabled; the
        # request itself is kept.
        server, db = self.phamServer.request_db_info()
        print('got it')
        notify("Clustalw alignments in progress for id %s"
               % clustalw_work_unit.query_id)
        self._logger.log('aligning sequences')

        numcpus = job_server.get_ncpus()
        print("numcpus = %s" % (numcpus,))

        # NOTE(review): float() assumes a plain "X.YY" version string.
        new_biopython = float(Bio.__version__) >= 1.56

        results = []
        jobs = []
        open_files = []  # scratch files belonging to the pending jobs
        query_id = clustalw_work_unit.query_id
        query_translation = clustalw_work_unit.query_translation

        for record in clustalw_work_unit.database:
            subject_id, subject_translation = record.id, record.translation

            # Build the path once, always via str(), so the file that is
            # written is the file that is aligned even for non-string ids.
            base = os.path.join(
                self.rootdir,
                'temp' + str(query_id) + '_' + str(subject_id))
            clustalw_infile = base + '.fasta'

            f = open(clustalw_infile, 'w')
            try:
                f.write('>%s\n%s\n>%s\n%s\n' % (query_id, query_translation,
                                                subject_id,
                                                subject_translation))
            finally:
                f.close()
            open_files.append(clustalw_infile)
            open_files.append(base + '.dnd')
            open_files.append(base + '.aln')

            # pass the query id (qid) and the subject id (sid) to the runner
            if new_biopython:
                jobs.append(job_server.submit(
                    clustalwAligner.run_clustalw,
                    (clustalw_infile, query_id, subject_id), (), ()))
            else:
                cline = MultipleAlignCL(clustalw_infile)
                cline.set_output(base + '.aln')
                jobs.append(job_server.submit(
                    clustalwAligner.run_clustalw_old,
                    (query_id, subject_id, cline), (), ("Bio.Clustalw",)))

            # Three scratch files per job: drain once more than 50 pend.
            if len(open_files) > 50:
                results = self.process_jobs(jobs, results, open_files)
                jobs = []
                open_files = []

        # Drain whatever is left from the last partial batch.
        results = self.process_jobs(jobs, results, open_files)

        # must report everything back in atomic transaction
        print('reporting scores back to server')
        try:
            self.phamServer.report_scores(clustalw_work_unit, results,
                                          self.client)
        except Exception as x:
            print(''.join(Pyro.util.getPyroTraceback(x)))
            print('exiting on pyro traceback')
            sys.exit()
and parse the results into an object that can be dealt with easily.""" # standard library import os # biopython from Bio.Alphabet import IUPAC from Bio import Clustalw from Bio.Clustalw import MultipleAlignCL from Bio.Align import AlignInfo from Bio.SubsMat import FreqTable # create the command line to run clustalw # this assumes you've got clustalw somewhere on your path, otherwise # you need to pass a second argument to MultipleAlignCL with the complete # path to clustalw cline = MultipleAlignCL(os.path.join(os.curdir, "opuntia.fasta")) cline.set_output("test.aln") # actually perform the alignment and get back an alignment object alignment = Clustalw.do_alignment(cline) print alignment print "first description:", alignment[0].description print "first sequence:", alignment[0].seq # get the length of the alignment print "length", alignment.get_alignment_length() print alignment