def runBlast(self, result_handle=None):
    """Parse BLAST output, running a new BLAST first when no handle is given.

    result_handle -- optional handle on existing BLAST output.  When it is
        None, blastall is started with this object's blast_exe, blast_prog,
        blast_db, blast_query and blast_processors settings and the handle
        it returns is used instead.

    Returns the result of NCBIXML.parse() on the chosen handle.
    """
    # Identity test rather than "== None": a handle object that defines its
    # own __eq__ must never be mistaken for "no handle supplied".
    if result_handle is None:
        # kdrew: if we want to pre-run blast, just run this line
        # NOTE(review): the error handle is neither read nor closed here.
        result_handle, error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db, self.blast_query,
            nprocessors=self.blast_processors)

    return NCBIXML.parse(result_handle)
def blast_2_files(input_filename, input_db):
    """Run BLAST for a stored query file against a stored database.

    Both names are appended to fastafile.PERMANENT_STORE to obtain the
    paths.  The database is passed to fastafile.formatdb() first when its
    ".nin" file does not exist.  Returns the output handle from blastall.
    """
    db_path = fastafile.PERMANENT_STORE + input_db
    query_path = fastafile.PERMANENT_STORE + input_filename

    # A missing ".nin" file is taken to mean the database is unformatted.
    already_formatted = os.path.exists(db_path + ".nin")
    if not already_formatted:
        fastafile.formatdb(db_path)

    output_handle, _stderr_handle = NCBIStandalone.blastall(
        BLAST_EXE, BLAST_PROGRAM, db_path, query_path)
    return output_handle
def blastfile(self, filename):
    """Run BLAST on filename and return everything it wrote to its output.

    Uses this object's blastexe, mode and dbname settings.  Raises
    ValueError, carrying the text read from the error handle, when the
    output is empty.
    """
    out_handle, err_handle = NCBIStandalone.blastall(
        self.blastexe, self.mode, self.dbname, filename)

    output = out_handle.read()
    if output:
        return output

    # Empty output: surface whatever BLAST reported on its error stream.
    raise ValueError('BLAST error: %s' % err_handle.read())
def __find_partials(self, minimum_blast_length=0):
    """find partial IS elements by blasting the sequences against the genome

    Writes the genome and one sample IS per annotation set to temporary
    FASTA files, formats the genome as a BLAST database, runs blastn and
    rebuilds self.annotations from the hits that pass the filters
    (expect < E_VALUE_CUTOFF, subject length >= MIN_PARTIAL_LEN and
    subject length > minimum_blast_length).

    Does nothing when there are no annotations.  The temporary files are
    removed even when formatting, BLAST or parsing fails.
    """
    #if there are no IS elements, skip this step
    if len(self.annotations) == 0:
        return

    blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

    try:
        #write a temporary genome fasta file
        outf = open(blast_db, "w")
        try:
            SeqIO.write(self.as_records(), outf, "fasta")
        finally:
            outf.close()

        #turn it into a database
        # NOTE(review): the command is built by string concatenation and run
        # through the shell; safe only while the temp path has no shell
        # metacharacters.
        os.system(FORMAT_EXE + " -p F -i " + blast_db)

        #write a temporary IS fasta file
        self.__write_singles(blast_file)

        #get the directions of these sample IS's
        directions = [is_set.lst[0].direction for is_set in self.annotations]

        #clear annotations
        self.annotations = []

        #perform a blast
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastn", blast_db, blast_file)
        blast_records = NCBIXML.parse(result_handle)

        #iterate over the results and the directions of the queries
        for record, sample_direction in zip(blast_records, directions):
            ISlist = []
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    hit_length = len(hsp.sbjct)
                    if (hsp.expect < E_VALUE_CUTOFF
                            and hit_length >= MIN_PARTIAL_LEN
                            and hit_length > minimum_blast_length):
                        chromosome = alignment.title.split(" ")[1]
                        # convert the 1-based subject start to 0-based
                        start = hsp.sbjct_start - 1
                        end = start + hit_length

                        #find out what the gene is
                        f = self.get_feature(chromosome, start, end)

                        thisdir = hsp.frame[1] * sample_direction
                        ISlist.append(IS.IS(f, chromosome, start, end, self,
                                            dir=thisdir))
            if ISlist:
                self.annotations.append(ISSet.ISSet(ISlist, self.profile))
    finally:
        #clean up- remove the temporary files, whatever happened above;
        #only existing paths are removed so cleanup never masks the
        #original error
        leftovers = [blast_db, blast_file]
        leftovers.extend(glob.glob(blast_db + ".n*"))
        leftovers.append("formatdb.log")
        for path in leftovers:
            if os.path.exists(path):
                os.remove(path)
def blast(self):
    '''aligns sequences using blast

    Runs blastp with the queries in <blastDataDir>/filetoblast.txt against
    <blastDataDir>/blastDB.fasta, asking for XML output (align_view=7).

    Returns a list of (query, accession, expect, bits) tuples, one per HSP
    with expect below 0.0001 whose alignment accession differs from the
    query name.  Progress is printed to stdout.
    '''
    blastAppDir = self.blastAppDir
    blastDB = os.path.join(self.blastDataDir, 'blastDB.fasta')
    blastQueryFile = os.path.join(self.blastDataDir, 'filetoblast.txt')
    print('path to filetoblast.txt: %s' % (blastQueryFile,))

    on_windows = sys.platform == 'win32'
    if on_windows:
        blastall_name = 'Blastall.exe'
    else:
        blastall_name = 'blastall'
    blast_exe = os.path.join(blastAppDir, blastall_name)

    if on_windows:
        import win32api
        # Fixed: this used to pass the undefined name ``blast_db`` and
        # raised NameError on Windows.
        blastDB = win32api.GetShortPathName(blastDB)
        blastQueryFile = win32api.GetShortPathName(blastQueryFile)
        blast_exe = win32api.GetShortPathName(blast_exe)

    blast_out, error_info = NCBIStandalone.blastall(
        blast_exe, 'blastp', blastDB, blastQueryFile, align_view=7)

    e_value_thresh = 0.0001
    results = []
    for b_record in NCBIXML.parse(blast_out):
        if not b_record:
            continue
        print('query: %s' % (b_record.query,))
        significant = False
        for alignment in b_record.alignments:
            # e-value of the first HSP of this alignment; HSPs with a
            # larger e-value are skipped.
            first_expect = None
            for hsp in alignment.hsps:
                # "is None", not truthiness: an e-value of exactly 0.0 is a
                # valid (and the best possible) value, not "unset".
                if first_expect is None:
                    first_expect = hsp.expect
                elif first_expect < hsp.expect:
                    continue
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">", "")
                    # ignore a query matching itself
                    if b_record.query != alignment.accession:
                        significant = True
                        print('adding %s and %s to matches (e value:  %s , bit score:  %s )'
                              % (b_record.query, alignment.accession,
                                 hsp.expect, hsp.bits))
                        results.append((b_record.query, alignment.accession,
                                        hsp.expect, hsp.bits))
        print('%s %s' % (b_record.query, significant))
    return results
def __find_partials(self, minimum_blast_length=0):
    """find partial IS elements by blasting the sequences against the genome

    Writes the genome and one sample IS per annotation set to temporary
    FASTA files, formats the genome as a BLAST database, runs blastn and
    rebuilds self.annotations from the hits that pass the filters
    (expect < E_VALUE_CUTOFF, subject length >= MIN_PARTIAL_LEN and
    subject length > minimum_blast_length).

    Does nothing when there are no annotations.  The temporary files are
    removed even when formatting, BLAST or parsing fails.
    """
    #if there are no IS elements, skip this step
    if len(self.annotations) == 0:
        return

    blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

    try:
        #write a temporary genome fasta file
        outf = open(blast_db, "w")
        try:
            SeqIO.write(self.as_records(), outf, "fasta")
        finally:
            outf.close()

        #turn it into a database
        # NOTE(review): the command is built by string concatenation and run
        # through the shell; safe only while the temp path has no shell
        # metacharacters.
        os.system(FORMAT_EXE + " -p F -i " + blast_db)

        #write a temporary IS fasta file
        self.__write_singles(blast_file)

        #get the directions of these sample IS's
        directions = [is_set.lst[0].direction for is_set in self.annotations]

        #clear annotations
        self.annotations = []

        #perform a blast
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastn", blast_db, blast_file)
        blast_records = NCBIXML.parse(result_handle)

        #iterate over the results and the directions of the queries
        for record, sample_direction in zip(blast_records, directions):
            ISlist = []
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    hit_length = len(hsp.sbjct)
                    if (hsp.expect < E_VALUE_CUTOFF
                            and hit_length >= MIN_PARTIAL_LEN
                            and hit_length > minimum_blast_length):
                        chromosome = alignment.title.split(" ")[1]
                        # convert the 1-based subject start to 0-based
                        start = hsp.sbjct_start - 1
                        end = start + hit_length

                        #find out what the gene is
                        f = self.get_feature(chromosome, start, end)

                        thisdir = hsp.frame[1] * sample_direction
                        ISlist.append(IS.IS(f, chromosome, start, end, self,
                                            dir=thisdir))
            if ISlist:
                self.annotations.append(ISSet.ISSet(ISlist, self.profile))
    finally:
        #clean up- remove the temporary files, whatever happened above;
        #only existing paths are removed so cleanup never masks the
        #original error
        leftovers = [blast_db, blast_file]
        leftovers.extend(glob.glob(blast_db + ".n*"))
        leftovers.append("formatdb.log")
        for path in leftovers:
            if os.path.exists(path):
                os.remove(path)
def runBlast(self, result_handle=None):
    """Parse BLAST output, running a new BLAST first when no handle is given.

    If a filehandle is given as input, its contents are parsed into BLAST
    records.  If no filehandle is given (or it is None), a new BLAST is run
    with this object's blast_exe, blast_prog, blast_db, blast_query and
    blast_processors settings and its output is parsed instead.

    Output: an iterator over a sequence of Record objects, as returned by
    NCBIXML.parse().
    """
    # Identity test rather than "== None": a handle object that defines its
    # own __eq__ must never be mistaken for "no handle supplied".
    if result_handle is None:
        # NOTE(review): the error handle is neither read nor closed here.
        result_handle, error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db, self.blast_query,
            nprocessors=self.blast_processors)

    # Parse and return blast records from given filehandle or new blast run.
    return NCBIXML.parse(result_handle)
def runBlast(self, result_handle=None):
    """Parse BLAST output, running a new BLAST first when no handle is given.

    If a filehandle is given as input, its contents are parsed into BLAST
    records.  If no filehandle is given (or it is None), a new BLAST is run
    with this object's blast_exe, blast_prog, blast_db, blast_query and
    blast_processors settings and its output is parsed instead.

    Output: an iterator over a sequence of Record objects, as returned by
    NCBIXML.parse().
    """
    # Identity test rather than "== None": a handle object that defines its
    # own __eq__ must never be mistaken for "no handle supplied".
    if result_handle is None:
        # NOTE(review): the error handle is neither read nor closed here.
        result_handle, error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db, self.blast_query,
            nprocessors=self.blast_processors)

    # Parse and return blast records from given filehandle or new blast run.
    return NCBIXML.parse(result_handle)
def identify_family(self, aaseq):
    """given an amino acid sequence, identify its family

    The sequence is written to a temporary FASTA file and searched with
    blastp against self.tpase_file.  Among HSPs with expect below
    TPASE_MAX_E_VALUE the one with the highest score is kept, and the
    family and group are read from the "|"-separated fields (indices 2 and
    3) of the second whitespace-separated token of its alignment title.

    Returns (family, group); both are None when no HSP qualifies.
    Raises Exception, carrying BLAST's error output, when reading the first
    parsed record raises ValueError.  The temporary file is removed on
    every exit path.
    """
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
    try:
        outf = open(blast_file, "w")
        try:
            temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
            SeqIO.write([temp_record], outf, "fasta")
        finally:
            outf.close()

        #perform blast
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastp", self.tpase_file, blast_file)
        try:
            record = NCBIXML.parse(result_handle).next()
        except ValueError:
            raise Exception("BLAST Exception: " + error_handle.read())

        #keep the highest-scoring HSP under the e-value cutoff
        best_hsp = None
        best_alignment = None
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < TPASE_MAX_E_VALUE:
                    if best_hsp is None or hsp.score > best_hsp.score:
                        best_alignment = alignment
                        best_hsp = hsp

        #find family and group
        family = None
        group = None
        if best_hsp is not None:
            fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
            family, group = fields[2], fields[3]

        return family, group
    finally:
        #clean up by removing temporary blast file, also on failure
        if os.path.exists(blast_file):
            os.remove(blast_file)
def identify_family(self, aaseq):
    """given an amino acid sequence, identify its family

    The sequence is written to a temporary FASTA file and searched with
    blastp against self.tpase_file.  Among HSPs with expect below
    TPASE_MAX_E_VALUE the one with the highest score is kept, and the
    family and group are read from the "|"-separated fields (indices 2 and
    3) of the second whitespace-separated token of its alignment title.

    Returns (family, group); both are None when no HSP qualifies.
    Raises Exception, carrying BLAST's error output, when reading the first
    parsed record raises ValueError.  The temporary file is removed on
    every exit path.
    """
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
    try:
        outf = open(blast_file, "w")
        try:
            temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
            SeqIO.write([temp_record], outf, "fasta")
        finally:
            outf.close()

        #perform blast
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastp", self.tpase_file, blast_file)
        try:
            record = NCBIXML.parse(result_handle).next()
        except ValueError:
            raise Exception("BLAST Exception: " + error_handle.read())

        #keep the highest-scoring HSP under the e-value cutoff
        best_hsp = None
        best_alignment = None
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < TPASE_MAX_E_VALUE:
                    if best_hsp is None or hsp.score > best_hsp.score:
                        best_alignment = alignment
                        best_hsp = hsp

        #find family and group
        family = None
        group = None
        if best_hsp is not None:
            fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
            family, group = fields[2], fields[3]

        return family, group
    finally:
        #clean up by removing temporary blast file, also on failure
        if os.path.exists(blast_file):
            os.remove(blast_file)
def blastOneBatchProbes(self, probe_id_seq_ls, blast_bin_path, database_fname, \
        tmp_blast_infname, min_no_of_identities=15, node_rank=0):
    """
    2010-4-14
        Write the (probe_id, probe_seq) pairs to tmp_blast_infname in FASTA
        form, blastn them against database_fname (XML output) and return
        one list per HSP with at least min_no_of_identities identities:
        [query, alignment title, query_start, query_end, identities,
        sbjct_start, sbjct_end].  Only the first 1000 alignments of each
        record are examined.  Progress goes to stderr when self.report is
        set; node_rank only labels those messages.
    """
    fasta_out = open(tmp_blast_infname, 'w')
    for probe_id, probe_seq in probe_id_seq_ls:
        fasta_out.write(">%s\n" % probe_id)  # write the probe id
        fasta_out.write("%s\n" % probe_seq)
    fasta_out.close()
    if self.report:
        sys.stderr.write("I'm %s, finished generating blast file for %s probes.\n"%\
            (node_rank, len(probe_id_seq_ls)))

    result_handle, error_info = NCBIStandalone.blastall(
        blast_bin_path, "blastn", database_fname, tmp_blast_infname,
        align_view=7)
    # 2010-4-14: error_info.read() made the program hang forever, so the
    # error stream is deliberately left unread.
    blast_records = NCBIXML.parse(result_handle)
    if self.report:
        sys.stderr.write("I'm %s, finished blasting.\n" % node_rank)

    result_ls = []
    for blast_record in blast_records:
        # top 1000 or the number of available alignments
        for alignment in blast_record.alignments[:1000]:
            alignment_title = alignment.title
            # 2010-4-25 hsp.strand is always (None, None), hsp.frame is
            # either (1,1) or (1, -1) when the query's end < start
            result_ls.extend(
                [blast_record.query, alignment_title, hsp.query_start,
                 hsp.query_end, hsp.identities, hsp.sbjct_start,
                 hsp.sbjct_end]
                for hsp in alignment.hsps
                if hsp.identities >= min_no_of_identities)

    if self.report:
        sys.stderr.write("I'm %s, finished with %s blasts, got %s returns.\n"%\
            (node_rank, len(probe_id_seq_ls), len(result_ls)))
    return result_ls
def seqBlast(self, seqFile, blastType="blastn", scoreMin=1e-3, logFile=None):
    '''
    command line blast
    blastall -d database -i query -p blastn -o blastout

    Runs blastall (self.blastExe) of type blastType with seqFile as query
    against self.blastDB and returns the parsed records as a list.

    A missing query file or a missing "<blastDB>.nsq" file is only reported
    on stdout; BLAST is started regardless.  scoreMin and logFile are
    accepted but not used.  Both BLAST handles are closed before returning,
    also when parsing fails.
    '''
    if not os.path.exists(os.path.expanduser(seqFile)):
        print("(ignore) %s file not found" % (seqFile))
    if not os.path.exists(os.path.expanduser(self.blastDB + ".nsq")):
        print("(ignore) %s file not found" % (self.blastDB))

    (resultHandle, errorHandle) = NCBIStandalone.blastall(
        self.blastExe, blastType, self.blastDB, seqFile)
    try:
        try:
            # NOTE(review): purpose of this fixed delay is not visible
            # here; kept so timing is unchanged.
            time.sleep(5)
            # Materialise the records while the handle is still open.
            return list(NCBIXML.parse(resultHandle))
        finally:
            resultHandle.close()
    finally:
        errorHandle.close()
# Register every query sequence read from ``handle``, keyed by its
# description, with a hit counter starting at zero.
query_sequences = {}
it = Bio.Fasta.Iterator(handle, Bio.Fasta.SequenceParser())
seq = it.next()
# The loop stops at the first falsy value returned by the iterator.
while seq:
    query_sequences[seq.description] = {}
    query_sequences[seq.description]["number_of_hits"] = 0
    print seq.description
    print query_sequences[seq.description]["number_of_hits"]
    seq = it.next()
handle.close()

# Run BLAST and obtain the parsed records; nothing in this fragment consumes
# ``records`` or reads ``error_handle``.
blast_out, error_handle = NCBIStandalone.blastall(blast_exe, blast_program, blast_db, blast_file)
#print error_handle
records = NCBIXML.parse(blast_out)
#b_record = records.next()
#
#
#E_VALUE_THRESH = 0.000000004
#
#print dir(b_record)
#print b_record.num_sequences
#print "Query = %s" % b_record.query
#b_record = records.next()
documentation. """ # standard library import os import sys # biopython from Bio.Blast import NCBIStandalone my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta') my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast', 'sorghum_est-test.fasta') my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall') print 'Running blastall...' blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn', my_blast_db, my_blast_file) b_parser = NCBIStandalone.BlastParser() b_iterator = NCBIStandalone.Iterator(blast_out, b_parser) while 1: b_record = b_iterator.next() if b_record is None: break E_VALUE_THRESH = 0.04 for alignment in b_record.alignments: for hsp in alignment.hsps:
def __init__(self, blastcmd, program, database, infile, **kargs):
    """Run blastall and initialise the BlastOutputReader base on its output.

    Any 'align_view' keyword is discarded before the call; every other
    keyword is forwarded to NCBIStandalone.blastall unchanged.
    """
    # Callers may not choose the output format.
    kargs.pop('align_view', None)
    handles = NCBIStandalone.blastall(blastcmd, program, database, infile,
                                      **kargs)
    blastout, blasterr = handles
    BlastOutputReader.__init__(self, blastout)
def runBlast(self, inputFname=None, databaseFname=None, outputFname=None, outputFnamePrefix=None, \
        blastallPath=None, minNoOfIdentities=None, \
        maxNoOfMismatches=None,\
        minIdentityPercentage=None, maxNoOfHits=10):
    """
    2012.8.19 output xml dump if outputFnamePrefix is given.
    2012.5.23
        blastn inputFname against databaseFname and write the HSPs that
        pass the filters to outputFname as a tab-delimited table.

        Only the first maxNoOfHits alignments of each record are examined.
        A filter is applied only when its argument is not None:
            minNoOfIdentities:     hsp.identities >= minNoOfIdentities
            maxNoOfMismatches:     query_length - identities <= maxNoOfMismatches
            minIdentityPercentage: identities/query_length >= minIdentityPercentage
        If outputFnamePrefix is given, the raw XML is also saved to
        <outputFnamePrefix>.xml.  Returns None.

        blastall options used here:
            -p  Program Name [String]
            -d  Database [String]  default = nr
            -i  Query File [File In]  default = stdin
            -e  Expectation value (E) [Real]  default = 10.0
        blastall align_view option values:
            0 = pairwise,
            1 = query-anchored showing identities,
            2 = query-anchored no identities,
            3 = flat query-anchored, show identities,
            4 = flat query-anchored, no identities,
            5 = query-anchored no identities and blunt ends,
            6 = flat query-anchored, no identities and blunt ends,
            7 = XML Blast output,
            8 = tabular,
            9 = tabular with comment lines,
            10 = ASN, text,
            11 = ASN, binary
            [Integer] default = 0, range from 0 to 11
    """
    result_handle, error_info = NCBIStandalone.blastall(
        blastallPath, "blastn", databaseFname, inputFname, align_view=7)
    # 2010-4-14: error_info.read() made the program hang forever, so the
    # error stream is deliberately left unread.
    if outputFnamePrefix:
        # Keep a copy of the XML on disk, then parse from memory.
        outf = open('%s.xml' % (outputFnamePrefix), 'w')
        blastContent = result_handle.read()
        outf.write(blastContent)
        outf.close()
        result_handle = cStringIO.StringIO(blastContent)
    blast_records = NCBIXML.parse(result_handle)

    if self.report:
        sys.stderr.write("finished blasting.\n")

    counter = 0
    # Counted during iteration: the parsed records are consumed by the loop
    # below and cannot be passed to len() afterwards.
    no_of_records = 0
    header = ['queryID', "queryStart", "queryEnd", 'queryLength', 'targetChr', 'targetStart', 'targetStop', \
        'targetLength', 'noOfIdentities', \
        'noOfMismatches', 'identityPercentage']
    outputFile = open(outputFname, 'w')
    try:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)
        for blast_record in blast_records:
            no_of_records += 1
            # top maxNoOfHits or the number of available alignments;
            # each alignment is one chromosome (=one fasta record).
            for alignment in blast_record.alignments[:maxNoOfHits]:
                targetChr = alignment.hit_def
                targetLength = alignment.length
                for hsp in alignment.hsps:
                    hitIsGood = True
                    noOfMismatches = blast_record.query_length - hsp.identities
                    identityPercentage = float(hsp.identities) / float(
                        blast_record.query_length)
                    if minNoOfIdentities is not None and hsp.identities < minNoOfIdentities:
                        hitIsGood = False
                    if maxNoOfMismatches is not None and noOfMismatches > maxNoOfMismatches:
                        hitIsGood = False
                    if minIdentityPercentage is not None and identityPercentage < minIdentityPercentage:
                        hitIsGood = False
                    if hitIsGood:
                        counter += 1
                        # 2010-4-25 hsp.strand is always (None, None),
                        # hsp.frame is either (1,1) or (1, -1) when the
                        # query's end < start
                        result_entry = [blast_record.query, hsp.query_start, hsp.query_end, blast_record.query_length,\
                            targetChr, hsp.sbjct_start, hsp.sbjct_end, targetLength, hsp.identities, noOfMismatches,\
                            identityPercentage]
                        writer.writerow(result_entry)
    finally:
        # Close explicitly instead of relying on garbage collection.
        outputFile.close()

    if self.report:
        sys.stderr.write("%s blast records, %s pass the filter.\n"%\
            (no_of_records, counter))
def do_blast_search(self):
    """Start a blastp search and keep both of its handles on the instance.

    Uses self.blast_exe, self.blast_db and self.blast_file; the handles
    returned by blastall are stored as self.result_handle and
    self.error_handle.
    """
    from Bio.Blast import NCBIStandalone

    handles = NCBIStandalone.blastall(
        self.blast_exe, "blastp", self.blast_db, self.blast_file)
    self.result_handle, self.error_handle = handles
def localBlast( self, seqFile, db, method='blastp', resultOut=None, e='0.01', **kw ):
    """
    Perform a local blast search (requires that the blast binaries
    and databases are installed locally).
    Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

    The raw output is copied to resultOut (default: outFolder +
    F_BLAST_RAW_OUT), the error output to outFolder + F_BLAST_ERROR, and
    the first parsed record is handed to __blast2dict together with db.

    @param seqFile: file name with search sequence in FASTA format
    @type  seqFile: str
    @param db: database(s) to search, e.g. ['swissprot', 'pdb']
    @type  db: [str]
    @param method: search program to use, e.g. 'blastp', 'fasta'
                   (default: blastp)
    @type  method: str
    @param e: expectation value cutoff (converted with str() before use)
    @type  e: float
    @param resultOut: save blast output to this new file
    @type  resultOut: str
    @param kw: optional keywords::
            --- Scoring ---
            matrix         Matrix to use (default BLOSUM62).
            gap_open       Gap open penalty (default 0).
            gap_extend     Gap extension penalty (default 0).

            --- Algorithm ---
            gapped         Whether to do a gapped alignment. T/F
                            (default T)
            wordsize       Word size (blastp default 11).
            keep_hits      Number of best hits from a region to keep
                            (default off).
            xdrop          Dropoff value (bits) for gapped alignments
                            (blastp default 25).
            hit_extend     Threshold for extending hits (blastp default 11)

            --- Processing ---
            filter         Filter query sequence? (T/F, default F)
            restrict_gi    Restrict search to these GI's.
            believe_query  Believe the query defline? (T/F, default F)
            nprocessors    Number of processors to use (default 1).

            --- Formatting ---
            alignments     Number of alignments. (default 250)
    @type  kw: any

    @raise BlastError: if anything in the search, copy or parse steps
                       raises an Exception
    """
    results = err = p = None
    resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
    # keyword values are passed through __dictvalues2str before the call
    kw = self.__dictvalues2str( kw )
    e = str(e)

    try:
        if self.verbose:
            self.log.add('running blast...')

        results, err = NCBIStandalone.blastall( settings.blast_bin,
                                                method, db, seqFile,
                                                expectation=e,
                                                align_view='7', ## XML output
                                                **kw)

        # both names are rebound to whatever __copyFileHandle returns
        results = self.__copyFileHandle(results, resultOut)
        err = self.__copyFileHandle(err, self.outFolder+self.F_BLAST_ERROR)

        if self.verbose:
            self.log.writeln('Raw blast output copied to: ' + resultOut )

        # only the first parsed record is used
        parsed = NCBIXML.parse( results ).next()

        self.__blast2dict( parsed, db )

    except Exception, why:
        self.log.add( T.lastErrorTrace() )
        # Debugging aid: copies every local variable of this call into the
        # module globals, so the local names above are externally visible.
        globals().update( locals() )
        self.log.writeln('local namespace is pushed into global ')
        raise BlastError( str(why) )
def getUniverseFromMSA(fname, propid, blastdb, ecutoff, use_subfamily_seeds):
    """Collect the sequences in blastdb that BLAST finds for a set of seeds.

    Steps, all relative to the current working directory:
      1. Build gufmsa.seeds.fa.  With use_subfamily_seeds it is the
         concatenation of the files listed for the current directory's
         name in the pickled dictionary; otherwise uniqueseq is run on
         fname with propid as -percentid and the resulting gufmsa.a2m is
         rewritten without '-' and '.' characters, upper-cased.
      2. blastp the seeds against blastdb (expectation=ecutoff) and write
         the distinct hit ids to gufmsa.blasthits.ids.
      3. Run fastacmd on those ids, writing TheUniverse.fa.

    Exits the process with status 1 when use_subfamily_seeds is set and
    the current directory name is not a key of the dictionary.
    Returns None.

    NOTE(review): the external commands are built by string interpolation
    and run through the shell ("&>" is a bash-ism); fname and blastdb must
    not contain shell metacharacters.
    """
    # executables
    uniqueseq = '/usr/bin/uniqueseq'
    blastall = '/usr/bin/blastall'
    fastacmd = '/usr/bin/fastacmd'

    if use_subfamily_seeds:
        # NOTE(review): unpickling is only safe for trusted files.
        f = open("/home/ruchira/pfam_subdir_dict.pkl")
        try:
            pfam_subdir_dict = cPickle.load(f)
        finally:
            f.close()
        # the family is the name of the current working directory
        fam = os.path.split(os.getcwd())[1]
        if fam not in pfam_subdir_dict:
            print("Unknown PFAM family %s, exiting..." % fam)
            sys.exit(1)
        seq_path_str = ' '.join(pfam_subdir_dict[fam])
        os.system('cat %s > gufmsa.seeds.fa' % seq_path_str)
    else:
        # remove redundant sequences
        sys.stdout.write('making %s unique at %.4f proportion identical... '
                         % (fname, propid))
        sys.stdout.flush()
        cmd = '%s gufmsa -alignfile %s -percentid %f &> /dev/null' % (
            uniqueseq, fname, propid)
        os.system(cmd)
        seq_count = 0
        handle = open('gufmsa.a2m', 'r')
        try:
            outf = open('gufmsa.seeds.fa', 'w')
            try:
                for x in SeqIO.parse(handle, 'fasta'):
                    seq_count += 1
                    # strip alignment gap characters and upper-case
                    sequence = x.seq.tostring()
                    sequence = sequence.replace('-', '')
                    sequence = sequence.replace('.', '')
                    sequence = sequence.upper()
                    outf.write('>%s\n' % x.description)
                    outf.write('%s\n' % sequence)
            finally:
                outf.close()
        finally:
            handle.close()
        print('done; %d sequences remaining.' % seq_count)

    # blast non-redundant sequences
    sys.stdout.write('blasting remaining sequences against %s... ' % blastdb)
    sys.stdout.flush()
    result_handle, error_handle = NCBIStandalone.blastall(
        blastall, 'blastp', blastdb, 'gufmsa.seeds.fa',
        expectation=ecutoff, descriptions=10000, alignments=10000,
        filter=False)
    blasthit_ids = set()
    for blast_record in NCBIXML.parse(result_handle):
        for alignment in blast_record.alignments:
            blasthit_ids.add(alignment.hit_id)
    outf = open('gufmsa.blasthits.ids', 'w')
    try:
        for x in blasthit_ids:
            outf.write('%s\n' % x)
    finally:
        outf.close()
    print('done; %d potential homologs found.' % len(blasthit_ids))

    # retrieve full-length sequences from the db
    sys.stdout.write('Retrieving full-length potential homolog sequences... ')
    sys.stdout.flush()
    cmd = '%s -i gufmsa.blasthits.ids -d %s > TheUniverse.fa' % (fastacmd,
                                                                 blastdb)
    os.system(cmd)
    # Fixed: the message used to name "universe.fa", a file never written.
    print('done; universe is in TheUniverse.fa')
def blast(blastRootDirectory):
    """Run blastp for filetoblast.txt against blastDB.fasta and sort queries.

    Paths: on win32 the database and the executable (Blastall.exe) are
    taken from blastRootDirectory; elsewhere the database is
    /tmp/BLAST/blastDB.fasta and the executable is ../../BLAST/bin/blastall
    relative to the current working directory.  The query file is always
    blastRootDirectory/filetoblast.txt.

    Returns (nonmatchingQueries, results): results holds
    (query, alignment title, expect) tuples for HSPs with expect below
    0.001 whose title differs from the query; nonmatchingQueries holds the
    queries for which no such HSP was found.
    """
    if sys.platform == 'win32':
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        # make sure /tmp/BLAST exists and holds a copy of formatdb
        if not os.path.isdir('/tmp/BLAST'):
            print "making directory '/tmp/BLAST'"
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'), '/tmp/BLAST')
            print "copying 'formatdb' to '/tmp/BLAST/'"
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
    #print 'path to blastDB.fasta:', blast_db
    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
    #print 'path to filetoblast.txt:', blast_file
    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
        blast_exe = os.path.join(blastRootDirectory, blastall_name)
    else:
        blastall_name = 'blastall'
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', blastall_name)
    #print 'path to blastall:', blast_exe
    if sys.platform == 'win32':
        # replace all three paths with their Windows short-path form
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)
    # align_view=7 requests XML output
    blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blast_db, blast_file, align_view=7)
    # NOTE(review): both handles are read to the end here for printing, so
    # the parser calls below presumably see an already-consumed stream -
    # confirm before relying on the returned lists.
    print 'errors:', error_info.read()
    print 'blast output:', blast_out.read()
    b_parser = NCBIXML.BlastParser()
    #print 'got parser'
    # this value of b_record is overwritten inside the loop below
    b_record = b_parser.parse(blast_out)
    b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
    #print 'got iterator'
    results = []
    recordnumber = 0
    nonmatchingQueries = []
    while 1:
        recordnumber += 1
        b_record = b_iterator.next()
        # a falsy record ends the loop
        if not b_record:
            break
        print 'query:', b_record.query
        # never true here: a None record already left the loop above
        if b_record is None:
            break
        e_value_thresh = 0.001
        print 'number of alignments:', len(b_record.alignments)
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">", "")
                    # ignore a query matching itself
                    if b_record.query != alignment.title:
                        significant = True
                        print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
                        results.append(
                            (b_record.query, alignment.title, hsp.expect))
        print b_record.query, significant
        if not significant:
            print 'adding', b_record.query, 'to the list of queries without matches'
            nonmatchingQueries.append(b_record.query)
    return nonmatchingQueries, results
def blast(blastRootDirectory):
    """Run blastp for filetoblast.txt against blastDB.fasta and sort queries.

    Paths: on win32 the database and the executable (Blastall.exe) are
    taken from blastRootDirectory; elsewhere the database is
    /tmp/BLAST/blastDB.fasta and the executable is ../../BLAST/bin/blastall
    relative to the current working directory.  The query file is always
    blastRootDirectory/filetoblast.txt.

    Returns (nonmatchingQueries, results): results holds
    (query, alignment title, expect) tuples for HSPs with expect below
    0.001 whose title differs from the query; nonmatchingQueries holds the
    queries for which no such HSP was found.
    """
    if sys.platform == 'win32':
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        # make sure /tmp/BLAST exists and holds a copy of formatdb
        if not os.path.isdir('/tmp/BLAST'):
            print "making directory '/tmp/BLAST'"
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory,'formatdb'), '/tmp/BLAST')
            print "copying 'formatdb' to '/tmp/BLAST/'"
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
    #print 'path to blastDB.fasta:', blast_db
    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
    #print 'path to filetoblast.txt:', blast_file
    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
        blast_exe = os.path.join(blastRootDirectory, blastall_name)
    else:
        blastall_name = 'blastall'
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', blastall_name)
    #print 'path to blastall:', blast_exe
    if sys.platform == 'win32':
        # replace all three paths with their Windows short-path form
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)
    # align_view=7 requests XML output
    blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blast_db, blast_file, align_view=7)
    # NOTE(review): both handles are read to the end here for printing, so
    # the parser calls below presumably see an already-consumed stream -
    # confirm before relying on the returned lists.
    print 'errors:', error_info.read()
    print 'blast output:', blast_out.read()
    b_parser = NCBIXML.BlastParser()
    #print 'got parser'
    # this value of b_record is overwritten inside the loop below
    b_record = b_parser.parse(blast_out)
    b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
    #print 'got iterator'
    results = []
    recordnumber = 0
    nonmatchingQueries = []
    while 1:
        recordnumber += 1
        b_record = b_iterator.next()
        # a falsy record ends the loop
        if not b_record:
            break
        print 'query:', b_record.query
        # never true here: a None record already left the loop above
        if b_record is None:
            break
        e_value_thresh = 0.001
        print 'number of alignments:', len(b_record.alignments)
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">","")
                    # ignore a query matching itself
                    if b_record.query != alignment.title:
                        significant = True
                        print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
                        results.append((b_record.query, alignment.title, hsp.expect))
        print b_record.query, significant
        if not significant:
            print 'adding', b_record.query, 'to the list of queries without matches'
            nonmatchingQueries.append(b_record.query)
    return nonmatchingQueries, results
def localBlast(self, seqFile, db, method='blastp', resultOut=None, e='0.01', **kw):
    """
    Perform a local blast search (requires that the blast binaries
    and databases are installed locally).
    Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

    The raw output is copied to resultOut (default: outFolder +
    F_BLAST_RAW_OUT), the error output to outFolder + F_BLAST_ERROR, and
    the first parsed record is handed to __blast2dict together with db.

    @param seqFile: file name with search sequence in FASTA format
    @type  seqFile: str
    @param db: database(s) to search, e.g. ['swissprot', 'pdb']
    @type  db: [str]
    @param method: search program to use, e.g. 'blastp', 'fasta'
                   (default: blastp)
    @type  method: str
    @param e: expectation value cutoff (converted with str() before use)
    @type  e: float
    @param resultOut: save blast output to this new file
    @type  resultOut: str
    @param kw: optional keywords::
            --- Scoring ---
            matrix         Matrix to use (default BLOSUM62).
            gap_open       Gap open penalty (default 0).
            gap_extend     Gap extension penalty (default 0).

            --- Algorithm ---
            gapped         Whether to do a gapped alignment. T/F
                            (default T)
            wordsize       Word size (blastp default 11).
            keep_hits      Number of best hits from a region to keep
                            (default off).
            xdrop          Dropoff value (bits) for gapped alignments
                            (blastp default 25).
            hit_extend     Threshold for extending hits (blastp default 11)

            --- Processing ---
            filter         Filter query sequence? (T/F, default F)
            restrict_gi    Restrict search to these GI's.
            believe_query  Believe the query defline? (T/F, default F)
            nprocessors    Number of processors to use (default 1).

            --- Formatting ---
            alignments     Number of alignments. (default 250)
    @type  kw: any

    @raise BlastError: if anything in the search, copy or parse steps
                       raises an Exception
    """
    results = err = p = None
    resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
    # keyword values are passed through __dictvalues2str before the call
    kw = self.__dictvalues2str(kw)
    e = str(e)

    try:
        if self.verbose:
            self.log.add('running blast...')

        results, err = NCBIStandalone.blastall(
            settings.blast_bin,
            method,
            db,
            seqFile,
            expectation=e,
            align_view='7',  ## XML output
            **kw)

        # both names are rebound to whatever __copyFileHandle returns
        results = self.__copyFileHandle(results, resultOut)
        err = self.__copyFileHandle(err, self.outFolder + self.F_BLAST_ERROR)

        if self.verbose:
            self.log.writeln('Raw blast output copied to: ' + resultOut)

        # only the first parsed record is used
        parsed = NCBIXML.parse(results).next()

        self.__blast2dict(parsed, db)

    except Exception, why:
        self.log.add(T.lastErrorTrace())
        # Debugging aid: copies every local variable of this call into the
        # module globals, so the local names above are externally visible.
        globals().update(locals())
        self.log.writeln('local namespace is pushed into global ')
        raise BlastError(str(why))
def dazheMpiBlast():
    """
    2010-4-14
        wrap all of dazhe's old code into this function

        MPI job that blasts every probe against the Ler database.  The
        probe list is split into equal slices, one per non-zero rank.
        Each worker writes its slice to a FASTA file, runs blastn (XML
        output), keeps [query, alignment title, identities, sbjct_start]
        for every HSP with at least 15 identities among the first 1000
        alignments of each record, and sends the pickled list to rank 0.
        Rank 0 receives one list per worker and writes the entries to
        ler_raw.csv.

        Parsing in a worker is best-effort: any error while fetching the
        next record ends that worker's loop, and what was collected so far
        is still sent, so rank 0 is never left waiting.
    """
    blast_bin_path = '/home/cmb-01/yuhuang/bin/blast/bin/blastall'
    database_fname = '/home/cmb-01/dazhemen/Data/Ler/Cereon_Ath_Ler.fasta'
    probes = vardata.vardata()
    comm = MPI.world.duplicate()
    genome = ref_genome()
    genome.load_chr()
    probes.readfromfile("/home/cmb-01/dazhemen/CNV/CNV_probelist.csv", format=2)
    # probes per processor; rank 0 only collects, hence size - 1 workers
    ppp = len(probes.data) // (comm.size - 1) + 1
    print("I'm %s of %s, finished loading\n" % (comm.rank, comm.size))

    if comm.rank == 0:
        #Master node, collects the results and writes the file
        outf = open("/home/cmb-01/dazhemen/CNV/ler_raw.csv", "w")
        try:
            outf.write(
                "Chromosome,Position,Probe_ID,Alignment_title,Number_matches\n")
            for dest in range(1, comm.size):
                data, source, tag = comm.receiveString(dest, None)
                print("I'm 0, collected data from %s\n" % source)
                # NOTE(review): unpickling is only safe because the data
                # comes from this job's own worker ranks.
                partial_result_list = cPickle.loads(data)
                for result_entry in partial_result_list:
                    # the query name is "<chr>_<pos>_<probe id>"; split it
                    # into separate columns
                    outf.write(result_entry[0].replace("_", ",") + "," +
                               ",".join([str(a) for a in result_entry[1:]]) +
                               "\n")
        finally:
            outf.close()
    else:
        probe_index_start = (comm.rank - 1) * ppp
        result_ls = []
        # the last slice(s) may be shorter, or empty
        if probe_index_start + ppp >= len(probes.data):
            ppp = len(probes.data) - probe_index_start
        # "> 0" rather than "!= 0": a negative slice size holds no probes,
        # so there is nothing to blast.
        if ppp > 0:
            tmp_blast_infname = '/home/cmb-01/dazhemen/CNV/tmp_blast/' + str(
                comm.rank)
            inf = open(tmp_blast_infname, 'w')
            try:
                for i in xrange(probe_index_start, probe_index_start + ppp):
                    inf.write(">%s_%s_%s\n" %
                              (probes.data[i][0], probes.data[i][1],
                               probes.data[i][2][0]))  # write the probe id
                    inf.write("%s\n" % genome.readprobe(probes.data[i][0],
                                                        probes.data[i][1]))
            finally:
                inf.close()
            print("I'm %s, finished with generating blast file from probes %s to %s\n" % (
                comm.rank, probe_index_start, probe_index_start + ppp))
            result_handle, error_info = NCBIStandalone.blastall(
                blast_bin_path, "blastn", database_fname, tmp_blast_infname,
                align_view=7)
            blast_records = NCBIXML.parse(result_handle)
            print("I'm %s, finished with blasting\n" % comm.rank)
            while 1:
                try:
                    blast_record = blast_records.next()
                except Exception:
                    # End of records, or a parse failure: stop either way
                    # (best-effort).  The message used to be a bare string
                    # expression and was never printed.
                    print("I'm %s, finished with all records\n" % comm.rank)
                    break
                # top 1000 or the number of available alignments
                for alignment in blast_record.alignments[:1000]:
                    alignment_title = alignment.title
                    for hsp in alignment.hsps:
                        if hsp.identities >= 15:
                            #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                            result_ls.append([blast_record.query,
                                              alignment_title,
                                              hsp.identities,
                                              hsp.sbjct_start])
            print("I'm %s, finished with parsing blast result\n" % comm.rank)
        # always report back, even with an empty list
        result_data = cPickle.dumps(result_ls)
        comm.send(result_data, 0, 0)
#!/usr/bin/python
# Run blastn for Record1.fasta against the EST_Clade_A database and print
# the alignments of the first parsed record.
from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML

my_blast_exe = "/usr/bin/blastall"
my_blast_db = "/home/kenglish/Data/Genomes/Databases/EST_Clade_A"
my_blast_file = "Record1.fasta"

result_handle, error_handle = NCBIStandalone.blastall(
    my_blast_exe, "blastn", my_blast_db, my_blast_file)

# Only the first record is looked at.
blast_records = NCBIXML.parse(result_handle)
blast_record = blast_records.next()
print(blast_record.alignments)
def __init__(self, blastcmd, program, database, infile, **kargs):
    """Run blastall and initialise the BlastOutputReader base on its output.

    Any 'align_view' keyword is discarded before the call; every other
    keyword is forwarded to NCBIStandalone.blastall unchanged.
    """
    # Callers may not choose the output format.
    kargs.pop('align_view', None)
    handles = NCBIStandalone.blastall(blastcmd, program, database, infile,
                                      **kargs)
    blastout, blasterr = handles
    BlastOutputReader.__init__(self, blastout)