def blast2data(filehandle):
    """Collect BLAST hits per reference from plain-text BLAST output.

    Every HSP whose e-value does not exceed the module-level
    ``EVALUE_CUTOFF`` is stored under the reference id taken from its
    alignment title.  The first time a reference id is stored, the
    module-level ``references`` list and ``refLengths`` mapping are brought
    up to date through ``refgenome2json`` and ``gi2length``.

    :param filehandle: open handle given to ``NCBIStandalone.Iterator``
        (the original author's note: this should be blast-txt output).
    :return: dict mapping reference id to a list of
        ``[start, identity, end, readname]`` entries, where ``start <= end``
        and ``identity`` is a percentage rounded to one decimal.
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)
    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]
        for alignment in blast_record.alignments:
            # The first title character is dropped before splitting
            # (presumably the leading '>' -- TODO confirm).
            title = alignment.title[1:].strip()
            if "|" in alignment.title:
                # Pipe-delimited title: the id is the second field.
                refgi = title.split('|')[1]
            else:
                refgi = title.split(' ')[0]
            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue
                identity = round(
                    float(hsp.identities[0]) * 100 / hsp.identities[1], 1)
                # Store subject coordinates in ascending order.
                start = min(hsp.sbjct_start, hsp.sbjct_end)
                end = max(hsp.sbjct_start, hsp.sbjct_end)
                if refgi not in data:
                    data[refgi] = []
                    if refgi not in references:
                        references.append(refgi)
                        refgenome2json(refgi)
                    if refgi not in refLengths:
                        refLengths[refgi] = gi2length(refgi)
                data[refgi].append([start, identity, end, readname])
    return data
def getCoordinatesFromBlo(bloFname, padding):
    """Extract subject coordinates per hit title from a BLAST text report.

    NOTE(review): the report is always read from the hard-coded path
    ``'temp.blo'``; ``bloFname`` and ``padding`` are accepted but never
    used.  The path is kept as is so existing callers see no change --
    confirm whether ``bloFname`` was meant to be opened instead.

    :param bloFname: unused (see note above).
    :param padding: unused.
    :return: dict mapping the alignment title (with every ``'>'`` removed)
        to ``[sbjct_start, sbjct_end]``.  An HSP replaces the stored pair
        unless it lies entirely inside it.
    """
    coord = {}
    blast_parser = NCBIStandalone.BlastParser()
    # The iterator is fully consumed inside the ``with`` block, so the
    # handle can be closed as soon as the loop ends.
    with open('temp.blo') as blo_handle:
        for hit in NCBIStandalone.Iterator(blo_handle, blast_parser):
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    if (fullName in coord
                            and hsp.sbjct_start >= coord[fullName][0]
                            and hsp.sbjct_end <= coord[fullName][1]):
                        # Already covered by the stored interval.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]
    return coord
def getHits(gene):
    """Write classified BLAST hits for the queries listed in ``gene``.

    Reads the BLAST report named by the module-level ``o.blast`` and, for
    every record whose first query word is a key of ``gene``, writes the
    mapped name followed by up to three tab-separated hit rows to the
    module-level ``out`` handle.  Rows are labelled ``High`` (e < 0.0001),
    ``Low`` (e < 1.0) or ``Scare`` (e < 5.0); a single ``NA`` row is
    written when the record has no alignments or its first HSP has
    e >= 5.0.

    :param gene: mapping from query id to the name written to ``out``.
        It is mutated: each reported query is deleted from it, and whatever
        remains is listed on stderr as not found.
    """
    inf = open(o.blast, 'rU')
    try:
        # The error-aware parser is the one actually handed to the
        # iterator, as in the original code.
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)
        lg = len(gene)
        if o.verbose == True:
            sys.stderr.write("\nGetting hits...\n")
        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue
            name = gene[query]
            out.write("%s\n" % name)
            # Fixed: ``record.alignments is []`` was always False, so the
            # NA row for hit-less records was never written.
            if not record.alignments:
                out.write("%s\tNA\tNA\tNA\n" % name)
            else:
                flag = 0  # number of rows written for this query
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        expect = float(hsp.expect)
                        if expect < 0.0001 and flag < 3:
                            label = "High"
                        elif expect < 1.0 and flag < 3:
                            label = "Low"
                        elif expect < 5.0 and flag < 3:
                            label = "Scare"
                        elif expect > 1.0 and flag < 1:
                            out.write("%s\tNA\tNA\tNA\n" % name)
                            flag += 1
                            continue
                        else:
                            continue
                        out.write("%s\t%s\t%s\t%s\n" % (
                            name, alignment.title.split(">")[1], expect,
                            label))
                        flag += 1
            del gene[query]
            if o.verbose == True:
                # Carriage return rewrites the progress figure in place.
                sys.stderr.write('\r')
                sys.stderr.write(str(int((lg - len(gene)) * 100 / lg)) + '%')
                sys.stderr.flush()
    finally:
        inf.close()
    # Fixed: the original compared "found" with "remaining" counts, which
    # printed an empty list when everything was found and stayed silent
    # when exactly half was missing.
    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene.keys()))
def runBlast(self, result_handle=None):
    """Return an iterator over parsed BLAST XML records.

    :param result_handle: handle with BLAST XML output to parse.  When it is
        ``None`` a new ``blastall`` run is started with ``self.blast_exe``,
        ``self.blast_prog``, ``self.blast_db``, ``self.blast_query`` and
        ``self.blast_processors``, and its output handle is parsed instead.
    :return: whatever ``NCBIXML.parse`` returns for the handle.
    """
    # Identity test: only a genuinely missing handle triggers a new run.
    if result_handle is None:
        result_handle, _error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db,
            self.blast_query, nprocessors=self.blast_processors)
    # kdrew: if we want to pre-run blast, just run this line
    return NCBIXML.parse(result_handle)
def blast_2_files(input_filename, input_db):
    """BLAST a stored query file against a stored database file.

    Both names are appended to ``fastafile.PERMANENT_STORE``.  The database
    is formatted with ``fastafile.formatdb`` first when its ``.nin`` file
    does not exist yet.

    :param input_filename: query file name relative to the store.
    :param input_db: database file name relative to the store.
    :return: the output handle returned by ``NCBIStandalone.blastall``.
    """
    blast_db = fastafile.PERMANENT_STORE + input_db
    blast_file = fastafile.PERMANENT_STORE + input_filename
    index_path = blast_db + ".nin"
    if not os.path.exists(index_path):
        fastafile.formatdb(blast_db)
    blast_out, _unused_errors = NCBIStandalone.blastall(
        BLAST_EXE, BLAST_PROGRAM, blast_db, blast_file)
    return blast_out
def blastfile(self, filename):
    """Run BLAST on ``filename`` and return the raw output.

    :param filename: query file handed to ``NCBIStandalone.blastall``
        together with ``self.blastexe``, ``self.mode`` and ``self.dbname``.
    :return: everything read from the BLAST output handle.
    :raises ValueError: when BLAST produced no output; the message carries
        the contents of the error handle.
    """
    b_out, e_info = NCBIStandalone.blastall(self.blastexe, self.mode,
                                            self.dbname, filename)
    data = b_out.read()
    if not data:
        # Call syntax instead of the Python-2-only ``raise X, msg`` form.
        raise ValueError('BLAST error: %s' % e_info.read())
    return data
def __find_partials(self, minimum_blast_length=0):
    """Find partial IS elements by blasting the sequences against the genome.

    The genome records are written to a temporary FASTA file and formatted
    as a nucleotide database, the file produced by ``__write_singles`` is
    blasted against it with blastn, and ``self.annotations`` is rebuilt
    from the HSPs that pass the e-value and length filters.

    :param minimum_blast_length: an HSP is kept only when its subject string
        is strictly longer than this (and at least ``MIN_PARTIAL_LEN``).
    """
    # If there are no IS elements, skip this step.
    if not self.annotations:
        return

    # Write a temporary genome fasta file; ``with`` closes it even when
    # writing fails.
    blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
    with open(blast_db, "w") as outf:
        SeqIO.write(self.as_records(), outf, "fasta")

    # Turn it into a database.
    os.system(FORMAT_EXE + " -p F -i " + blast_db)

    # Write a temporary IS fasta file.
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")
    self.__write_singles(blast_file)

    # Remember the direction of the first member of every set before the
    # annotations are cleared; they are paired with the BLAST records below.
    directions = [is_set.lst[0].direction for is_set in self.annotations]
    self.annotations = []

    # Perform a blast.
    result_handle, error_handle = NCBIStandalone.blastall(
        BLAST_EXE, "blastn", blast_db, blast_file)
    blast_records = NCBIXML.parse(result_handle)

    # Iterate over the results and the directions of the queries.
    for record, sample_direction in zip(blast_records, directions):
        ISlist = []
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                hit_length = len(hsp.sbjct)
                if not (hsp.expect < E_VALUE_CUTOFF
                        and hit_length >= MIN_PARTIAL_LEN
                        and hit_length > minimum_blast_length):
                    continue
                chromosome = alignment.title.split(" ")[1]
                start = hsp.sbjct_start - 1
                end = start + hit_length
                # Find out what the gene is.
                f = self.get_feature(chromosome, start, end)
                thisdir = hsp.frame[1] * sample_direction
                ISlist.append(
                    IS.IS(f, chromosome, start, end, self, dir=thisdir))
        if ISlist:
            self.annotations.append(ISSet.ISSet(ISlist, self.profile))

    # Clean up: remove the temporary files.
    os.remove(blast_db)
    os.remove(blast_file)
    for f in glob.glob(blast_db + ".n*"):
        os.remove(f)
    os.remove("formatdb.log")
def extract_sequences(file):
    """Scan a BLAST report and return the sequences the consumer collected.

    :param file: path of the report fed to ``NCBIStandalone._Scanner``.
    :return: ``sequences_list`` of the ``SequencesExtractor`` consumer after
        the scan.
    """
    scanner = NCBIStandalone._Scanner()
    consumer = SequencesExtractor()
    # ``with`` closes the report even when the scanner raises.
    with open(file, 'r') as file_to_parse:
        scanner.feed(file_to_parse, consumer)
    return consumer.sequences_list
def __find_partials(self, minimum_blast_length=0):
    """Find partial IS elements by blasting the sequences against the genome.

    The genome records are written to a temporary FASTA file and formatted
    as a nucleotide database, the file produced by ``__write_singles`` is
    blasted against it with blastn, and ``self.annotations`` is rebuilt
    from the HSPs that pass the e-value and length filters.

    :param minimum_blast_length: an HSP is kept only when its subject string
        is strictly longer than this (and at least ``MIN_PARTIAL_LEN``).
    """
    # If there are no IS elements, skip this step.
    if not self.annotations:
        return

    # Write a temporary genome fasta file; ``with`` closes it even when
    # writing fails.
    blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
    with open(blast_db, "w") as outf:
        SeqIO.write(self.as_records(), outf, "fasta")

    # Turn it into a database.
    os.system(FORMAT_EXE + " -p F -i " + blast_db)

    # Write a temporary IS fasta file.
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")
    self.__write_singles(blast_file)

    # Remember the direction of the first member of every set before the
    # annotations are cleared; they are paired with the BLAST records below.
    directions = [is_set.lst[0].direction for is_set in self.annotations]
    self.annotations = []

    # Perform a blast.
    result_handle, error_handle = NCBIStandalone.blastall(
        BLAST_EXE, "blastn", blast_db, blast_file)
    blast_records = NCBIXML.parse(result_handle)

    # Iterate over the results and the directions of the queries.
    for record, sample_direction in zip(blast_records, directions):
        ISlist = []
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                hit_length = len(hsp.sbjct)
                if not (hsp.expect < E_VALUE_CUTOFF
                        and hit_length >= MIN_PARTIAL_LEN
                        and hit_length > minimum_blast_length):
                    continue
                chromosome = alignment.title.split(" ")[1]
                start = hsp.sbjct_start - 1
                end = start + hit_length
                # Find out what the gene is.
                f = self.get_feature(chromosome, start, end)
                thisdir = hsp.frame[1] * sample_direction
                ISlist.append(
                    IS.IS(f, chromosome, start, end, self, dir=thisdir))
        if ISlist:
            self.annotations.append(ISSet.ISSet(ISlist, self.profile))

    # Clean up: remove the temporary files.
    os.remove(blast_db)
    os.remove(blast_file)
    for f in glob.glob(blast_db + ".n*"):
        os.remove(f)
    os.remove("formatdb.log")
def blast(self):
    """Align sequences using blastp and collect the significant hits.

    Runs ``blastall`` from ``self.blastAppDir`` with XML output on
    ``filetoblast.txt`` against ``blastDB.fasta``; both files are looked up
    in ``self.blastDataDir``.  Progress is printed to stdout.

    :return: list of ``(query, accession, e_value, bit_score)`` tuples, one
        per accepted HSP with an e-value below 0.0001 whose alignment
        accession differs from the query name.
    """
    blastAppDir = self.blastAppDir
    blastDB = os.path.join(self.blastDataDir, 'blastDB.fasta')
    blastQueryFile = os.path.join(self.blastDataDir, 'filetoblast.txt')
    print('path to filetoblast.txt: %s' % blastQueryFile)
    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
    else:
        blastall_name = 'blastall'
    blast_exe = os.path.join(blastAppDir, blastall_name)
    if sys.platform == 'win32':
        import win32api
        # Fixed: the original passed the undefined name ``blast_db`` here,
        # which raised NameError on every Windows run.
        blastDB = win32api.GetShortPathName(blastDB)
        blastQueryFile = win32api.GetShortPathName(blastQueryFile)
        blast_exe = win32api.GetShortPathName(blast_exe)
    blast_out, error_info = NCBIStandalone.blastall(
        blast_exe, 'blastp', blastDB, blastQueryFile, align_view=7)
    blast_records = NCBIXML.parse(blast_out)

    e_value_thresh = 0.0001
    results = []
    for b_record in blast_records:
        if not b_record:
            continue
        print('query: %s' % b_record.query)
        significant = False
        for alignment in b_record.alignments:
            # E-value of the first HSP of this alignment; later HSPs with a
            # larger e-value are skipped.
            bestHsp = None
            for hsp in alignment.hsps:
                # ``is None`` (not truthiness) so that a first e-value of
                # exactly 0.0 is remembered as the best one.
                if bestHsp is None:
                    bestHsp = hsp.expect
                elif bestHsp < hsp.expect:
                    continue
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">", "")
                    if b_record.query != alignment.accession:
                        significant = True
                        print('adding %s and %s to matches (e value:  %s , '
                              'bit score:  %s )' % (
                                  b_record.query, alignment.accession,
                                  hsp.expect, hsp.bits))
                        results.append((b_record.query, alignment.accession,
                                        hsp.expect, hsp.bits))
        print('%s %s' % (b_record.query, significant))
    return results
def __init__(self, dbname=None, blastexe=None, mode=None, parser=None):
    """Store the BLAST settings, falling back to the module defaults.

    :param dbname: database name; ``DEFAULT_BLAST_DB`` when ``None``.
    :param blastexe: executable; ``DEFAULT_BLAST_EXE`` when ``None``.
    :param mode: BLAST mode; ``DEFAULT_BLAST_MODE`` when ``None``.
    :param parser: result parser; a new ``NCBIStandalone.BlastParser``
        when ``None``.
    """
    chosen_db = DEFAULT_BLAST_DB if dbname is None else dbname
    chosen_exe = DEFAULT_BLAST_EXE if blastexe is None else blastexe
    chosen_mode = DEFAULT_BLAST_MODE if mode is None else mode
    chosen_parser = (NCBIStandalone.BlastParser()
                     if parser is None else parser)
    self.dbname = chosen_db
    self.blastexe = chosen_exe
    self.parser = chosen_parser
    self.mode = chosen_mode
def runBlast(self, result_handle=None):
    """Return an iterator over parsed BLAST XML records.

    If a filehandle is given, BLAST results are read and parsed from it.
    If none is given (or it is ``None``), a new ``blastall`` run is started
    with the local arguments ``blast_exe``, ``blast_prog``, ``blast_db``,
    ``blast_query`` and ``blast_processors`` and its output is parsed.

    :param result_handle: handle with BLAST XML output, or ``None``.
    :return: whatever ``NCBIXML.parse`` returns for the handle.
    """
    # Identity test: only a genuinely missing handle triggers a new run.
    if result_handle is None:
        result_handle, _error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db,
            self.blast_query, nprocessors=self.blast_processors)
    # Parse and return blast records from given filehandle or new blast run.
    return NCBIXML.parse(result_handle)
def blast_parse(file, e, output):
    """Write a tab-separated summary of the HSPs below an e-value cutoff.

    :param file: path of the plain-text BLAST report to parse.
    :param e: only HSPs with ``expect < e`` are written.
    :param output: path of the summary file to create.  Each row holds the
        first 18 characters of the query, the alignment title, the
        alignment length and the e-value.
    """
    blast_parser = NCBIStandalone.BlastParser()
    with open(file) as result_handle:
        with open(output, 'w') as report:
            report.write('query title\tdescription\tlength\te value' + '\n')
            # Fixed: the original called ``next(blast_iterator)`` before
            # this loop, which silently dropped the first record (and raised
            # StopIteration on an empty report).
            for blast_record in NCBIStandalone.Iterator(result_handle,
                                                        blast_parser):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            report.write(
                                str(blast_record.query[:18]) + ' \t'
                                + str(alignment.title) + '\t'
                                + str(alignment.length) + '\t'
                                + str(hsp.expect) + '\n')
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={'F': 'F', 'e': '10'}):
    """BLAST a single sequence against a database with ``blastall``.

    The sequence is written to a temporary FASTA file in the current
    working directory, which is removed again before returning.

    :param header: FASTA header of the query (also part of the file name).
    :param sequence: query sequence.
    :param dbname: database passed to ``-d``.
    :param blastprogram: one of blastp, tblastn, blastn, blastx.
    :param output: ``"ncbiparsed"`` to return the parsed record; anything
        else returns the raw output text.
    :param extra_blastp_params: extra ``-flag value`` pairs.  The default
        dict is shared between calls and must not be mutated.
    :return: parsed BLAST record or raw text; ``False`` when running or
        parsing BLAST failed.
    :raises ValueError: for an unsupported ``blastprogram``.
    """
    supported = ['blastp', 'tblastn', 'blastn', 'blastx']
    if blastprogram not in supported:
        # Fixed: string exceptions raise TypeError since Python 2.6.
        raise ValueError("unsupported blastprogram %r; expected one of %s"
                         % (blastprogram, ", ".join(supported)))
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag, str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    with open(fname, 'w') as fh:
        fh.write(">%s\n%s\n" % (header, sequence))
    # NOTE(review): the command is a shell string containing text derived
    # from ``header``; confirm callers never pass untrusted headers.
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except Exception:
        # Blast or parsing of the blast record failed.
        # No debugging here; just cleanup and return False.
        print("BLAST CRASHED::")
        print(command)
        blastallout = False
    finally:
        # Remove the created query file, also on KeyboardInterrupt.
        osRemove(fname)
    return blastallout
def runBlast(self, result_handle=None):
    """Return an iterator over parsed BLAST XML records.

    If a filehandle is given, BLAST results are read and parsed from it.
    If none is given (or it is ``None``), a new ``blastall`` run is started
    with the local arguments ``blast_exe``, ``blast_prog``, ``blast_db``,
    ``blast_query`` and ``blast_processors`` and its output is parsed.

    :param result_handle: handle with BLAST XML output, or ``None``.
    :return: whatever ``NCBIXML.parse`` returns for the handle.
    """
    # Identity test: only a genuinely missing handle triggers a new run.
    if result_handle is None:
        result_handle, _error_handle = NCBIStandalone.blastall(
            self.blast_exe, self.blast_prog, self.blast_db,
            self.blast_query, nprocessors=self.blast_processors)
    # Parse and return blast records from given filehandle or new blast run.
    return NCBIXML.parse(result_handle)
def identify_family(self, aaseq):
    """Given an amino acid sequence, identify its family.

    The sequence is written to a temporary FASTA file and blasted with
    blastp against ``self.tpase_file``.  Among the HSPs with an e-value
    below ``TPASE_MAX_E_VALUE`` the one with the highest score decides.

    :param aaseq: sequence stored in the temporary ``SeqRecord``.
    :return: ``(family, group)`` taken from fields 2 and 3 of the
        ``|``-separated second word of the best alignment title, or
        ``(None, None)`` when no HSP qualifies.
    :raises Exception: when the XML parser raises ``ValueError``; the
        message carries the BLAST error output.
    """
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
    with open(blast_file, "w") as outf:
        temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
        SeqIO.write([temp_record], outf, "fasta")

    try:
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastp", self.tpase_file, blast_file)
        try:
            record = next(NCBIXML.parse(result_handle))
        except ValueError:
            raise Exception("BLAST Exception: " + error_handle.read())

        # Pick the highest-scoring HSP below the e-value cutoff.
        best_hsp = None
        best_alignment = None
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect >= TPASE_MAX_E_VALUE:
                    continue
                if best_hsp is None or hsp.score > best_hsp.score:
                    best_alignment = alignment
                    best_hsp = hsp

        # Find family and group.
        family = None
        group = None
        if best_hsp:
            fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
            family, group = fields[2], fields[3]
    finally:
        # Clean up the temporary query file, also when BLAST failed.
        os.remove(blast_file)

    return family, group
def identify_family(self, aaseq):
    """Given an amino acid sequence, identify its family.

    The sequence is written to a temporary FASTA file and blasted with
    blastp against ``self.tpase_file``.  Among the HSPs with an e-value
    below ``TPASE_MAX_E_VALUE`` the one with the highest score decides.

    :param aaseq: sequence stored in the temporary ``SeqRecord``.
    :return: ``(family, group)`` taken from fields 2 and 3 of the
        ``|``-separated second word of the best alignment title, or
        ``(None, None)`` when no HSP qualifies.
    :raises Exception: when the XML parser raises ``ValueError``; the
        message carries the BLAST error output.
    """
    blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
    with open(blast_file, "w") as outf:
        temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
        SeqIO.write([temp_record], outf, "fasta")

    try:
        result_handle, error_handle = NCBIStandalone.blastall(
            BLAST_EXE, "blastp", self.tpase_file, blast_file)
        try:
            record = next(NCBIXML.parse(result_handle))
        except ValueError:
            raise Exception("BLAST Exception: " + error_handle.read())

        # Pick the highest-scoring HSP below the e-value cutoff.
        best_hsp = None
        best_alignment = None
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect >= TPASE_MAX_E_VALUE:
                    continue
                if best_hsp is None or hsp.score > best_hsp.score:
                    best_alignment = alignment
                    best_hsp = hsp

        # Find family and group.
        family = None
        group = None
        if best_hsp:
            fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
            family, group = fields[2], fields[3]
    finally:
        # Clean up the temporary query file, also when BLAST failed.
        os.remove(blast_file)

    return family, group
def blast(self, fasta, output):
    """Run blastpgp on ``fasta`` and return the alignment output filename.

    The run uses ``self.alignment`` as input alignment, ``self.expect`` for
    both the expectation and the model threshold, three passes and one
    processor; ``self.kwargs`` is forwarded as extra keyword arguments.
    The result handle is drained with ``consume``.

    :param fasta: query file.
    :param output: file name passed as ``align_outfile``.
    :return: ``output`` unchanged.
    """
    runtime().debug("Blasting %s with alignment %s using %s"
                    % (fasta, self.alignment, self.blast_exe))
    result_handle, _error_handle = NCBIStandalone.blastpgp(
        self.blast_exe,
        self.db,
        fasta,
        align_infile=self.alignment,
        align_outfile=output,
        expectation=self.expect,
        model_threshold=self.expect,
        npasses=3,
        nprocessors=1,
        **self.kwargs)
    consume(result_handle)
    return output
def blast(self, fasta, output):
    """Run blastpgp on ``fasta`` and return the alignment output filename.

    The run uses ``self.alignment`` as input alignment, ``self.expect`` for
    both the expectation and the model threshold, three passes and one
    processor; ``self.kwargs`` is forwarded as extra keyword arguments.
    The result handle is drained with ``consume``.

    :param fasta: query file.
    :param output: file name passed as ``align_outfile``.
    :return: ``output`` unchanged.
    """
    runtime().debug("Blasting %s with alignment %s using %s"
                    % (fasta, self.alignment, self.blast_exe))
    result_handle, _error_handle = NCBIStandalone.blastpgp(
        self.blast_exe,
        self.db,
        fasta,
        align_infile=self.alignment,
        align_outfile=output,
        expectation=self.expect,
        model_threshold=self.expect,
        npasses=3,
        nprocessors=1,
        **self.kwargs)
    consume(result_handle)
    return output
def blastOneBatchProbes(self, probe_id_seq_ls, blast_bin_path, database_fname, \
        tmp_blast_infname, min_no_of_identities=15, node_rank=0):
    """
    2010-4-14
        Write one batch of probes to a FASTA file, blastn it against
        ``database_fname`` (XML output) and collect the HSPs with at least
        ``min_no_of_identities`` identities.  Only the first 1000
        alignments of every record are inspected.

    :param probe_id_seq_ls: iterable of ``(probe_id, probe_seq)`` pairs.
    :param blast_bin_path: blastall executable.
    :param database_fname: BLAST database.
    :param tmp_blast_infname: path of the query file to (over)write.
    :param min_no_of_identities: minimum ``hsp.identities`` to keep a hit.
    :param node_rank: only used in the progress messages.
    :return: list of ``[query, alignment_title, query_start, query_end,
        identities, sbjct_start, sbjct_end]`` entries.
    """
    result_ls = []
    with open(tmp_blast_infname, 'w') as inf:
        for probe_id, probe_seq in probe_id_seq_ls:
            inf.write(">%s\n" % probe_id)  # write the probe id
            inf.write("%s\n" % probe_seq)
    if self.report:
        sys.stderr.write("I'm %s, finished generating blast file for %s probes.\n" %
                         (node_rank, len(probe_id_seq_ls)))
    result_handle, error_info = NCBIStandalone.blastall(
        blast_bin_path, "blastn", database_fname, tmp_blast_infname,
        align_view=7)
    # 2010-4-14 reading error_info here caused the program to hang forever,
    # so the error handle is deliberately left untouched.
    blast_records = NCBIXML.parse(result_handle)
    if self.report:
        sys.stderr.write("I'm %s, finished blasting.\n" % node_rank)
    for blast_record in blast_records:
        # top 1000 or the number of available alignments
        for alignment in blast_record.alignments[:1000]:
            alignment_title = alignment.title
            for hsp in alignment.hsps:
                if hsp.identities >= min_no_of_identities:
                    # 2010-4-25 hsp.strand is always (None, None); hsp.frame
                    # is either (1, 1) or (1, -1) when the query's
                    # end < start.
                    result_ls.append([
                        blast_record.query, alignment_title,
                        hsp.query_start, hsp.query_end, hsp.identities,
                        hsp.sbjct_start, hsp.sbjct_end])
    if self.report:
        sys.stderr.write("I'm %s, finished with %s blasts, got %s returns.\n" %
                         (node_rank, len(probe_id_seq_ls), len(result_ls)))
    return result_ls
def seqBlast(self, seqFile, blastType="blastn", scoreMin=1e-3, logFile=None):
    '''
    Command line blast: blastall -d database -i query -p blastn -o blastout

    Missing input or database files are only reported on stdout; the run is
    attempted anyway.

    :param seqFile: query file.
    :param blastType: BLAST program name handed to ``blastall``.
    :param scoreMin: unused.
    :param logFile: unused.
    :return: list with every record produced by ``NCBIXML.parse``.
    '''
    if not os.path.exists(os.path.expanduser(seqFile)):
        print("(ignore) %s file not found" % (seqFile))
    if not os.path.exists(os.path.expanduser(self.blastDB + ".nsq")):
        print("(ignore) %s file not found" % (self.blastDB))
    (resultHandle, errorHandle) = NCBIStandalone.blastall(
        self.blastExe, blastType, self.blastDB, seqFile)
    try:
        # NOTE(review): fixed five-second pause kept from the original; its
        # purpose is not visible here.
        time.sleep(5)
        blastRecords = list(NCBIXML.parse(resultHandle))
    finally:
        # Close both pipes even when parsing fails.
        resultHandle.close()
        errorHandle.close()
    return blastRecords
def blastall_file2db(fname, dbname="", blastprogram="blastp",
                     output="ncbiparsed",
                     extra_blastp_params={'F': 'F', 'e': '10'}):
    """BLAST an existing query file against a database with ``blastall``.

    The input file is never removed.

    :param fname: query file passed to ``-i``.
    :param dbname: database passed to ``-d``.
    :param blastprogram: one of blastp, tblastn, blastn, tblastx.
    :param output: ``"ncbiparsed"`` to return the parsed record; anything
        else returns the raw output text.
    :param extra_blastp_params: extra ``-flag value`` pairs.  The default
        dict is shared between calls and must not be mutated.
    :return: parsed BLAST record or raw text.
    :raises ValueError: for an unsupported ``blastprogram``.
    :raises RuntimeError: when reading or parsing the BLAST output fails.
    """
    supported = ['blastp', 'tblastn', 'blastn', 'tblastx']
    if blastprogram not in supported:
        # Fixed: string exceptions raise TypeError since Python 2.6.
        raise ValueError("unsupported blastprogram %r; expected one of %s"
                         % (blastprogram, ", ".join(supported)))
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    # Opened outside the ``try`` so the handler below can rely on the pipes
    # existing (the original handler hit NameError when popen failed).
    ci, co, ce = osPopen3(command)
    ci.close()
    try:
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
    except Exception:
        error = ce.read().strip()
        print(command)
        print("ERROR: '%s'" % error)
        raise RuntimeError("BLAST CRASHED....")
    finally:
        co.close()
        ce.close()
    return blastallout
def __init__(self, blastcmd, program, database, infile, **kargs):
    """Run ``blastall`` and initialise the reader with its output handle.

    Any ``align_view`` keyword is discarded, so ``blastall`` always runs
    with its own default output format.
    """
    # ``pop`` with a default replaces the membership test.
    kargs.pop('align_view', None)
    handles = NCBIStandalone.blastall(blastcmd, program, database, infile,
                                      **kargs)
    blastout, blasterr = handles
    BlastOutputReader.__init__(self, blastout)
def __init__(self, blastcmd, program, database, infile, **kargs):
    """Run ``blastall`` and initialise the reader with its output handle.

    Any ``align_view`` keyword is discarded, so ``blastall`` always runs
    with its own default output format.
    """
    # ``pop`` with a default replaces the membership test.
    kargs.pop('align_view', None)
    handles = NCBIStandalone.blastall(blastcmd, program, database, infile,
                                      **kargs)
    blastout, blasterr = handles
    BlastOutputReader.__init__(self, blastout)
def localPSIBlast(self, seqFile, db, method='blastp', resultOut=None,
                  e='0.001', **kw):
    """
    Perform a local psi-blast search (requires that the blast binaries
    and databases are installed locally).
    Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) for the search.
    The raw output and the error stream are copied to files in
    ``self.outFolder``; the first parsed record is handed to
    ``__blast2dict``.

    @param seqFile: file name with search sequence in FASTA format
    @type  seqFile: str
    @param db: database(s) to search e.g. ['swissprot', 'pdb']
    @type  db: [str]
    @param method: not used by this method
    @type  method: str
    @param e: expectation value cutoff (default: 0.001); converted with
              str() before the call
    @type  e: float
    @param resultOut: save blast output to this new file
    @type  resultOut: str
    @param kw: optional keywords::

            --- New Blast+ routine ---
            (see NcbipsiblastCommandline)

            num_iterations  Number of passes (default 1).
            matrix          Matrix to use (default BLOSUM62).

            --- old blastall routine ---
            --- Scoring ---
            matrix          Matrix to use (default BLOSUM62).
            gap_open        Gap open penalty (default 11).
            gap_extend      Gap extension penalty (default 1).
            window_size     Multiple hits window size (default 40).
            npasses         Number of passes (default 1).
            passes          Hits/passes (Integer 0-2, default 1).

            --- Algorithm ---
            gapped          Whether to do a gapped alignment
                            (T/F, default T).
            wordsize        Word size (default 3).
            keep_hits       Number of best hits from a region to keep
                            (def 0)
            xdrop           Dropoff value (bits) for gapped alignments
                            (def 15)
            hit_extend      Threshold for extending hits (default 11).
            nbits_gapping   Number of bits to trigger gapping (default 22).
            pseudocounts    Pseudocounts constants for multiple passes
                            (def 9).
            xdrop_final     X dropoff for final gapped alignment
                            (default 25).
            xdrop_extension Dropoff for blast extensions (default 7).
            model_threshold E-value threshold to include in multipass
                            model (default 0.005).
            required_start  Start of required region in query (default 1).
            required_end    End of required region in query (default -1).

            --- Processing ---
            filter          Filter query sequence with SEG?
                            (T/F, default F)
            believe_query   Believe the query defline? (T/F, default F)
            nprocessors     Number of processors to use (default 1).

            --- Formatting ---
            alignments      Number of alignments (default 250).
    @type  kw: any

    @raise BlastError: if program call fails
    """
    # NOTE: the author's sketch for the new Blast+ tools used
    # Bio.Blast.Applications.NcbipsiblastCommandline(query=seqFile, db=db,
    # evalue=e, outfmt=5, out=resultOut, **kw) followed by the same
    # NCBIXML parsing as below.
    results = err = None
    resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
    kw = self.__dictvalues2str(kw)
    e = str(e)

    try:
        results, err = NCBIStandalone.blastpgp(settings.psi_blast_bin,
                                               db, seqFile,
                                               program='blastpgp',
                                               align_view='7',  ## XML output
                                               expectation=e, **kw)

        results = self.__copyFileHandle(results, resultOut)
        err = self.__copyFileHandle(err, self.outFolder + self.F_BLAST_ERROR)

        if self.verbose:
            self.log.writeln('Raw blast output copied to: ' + resultOut)

        parsed = next(NCBIXML.parse(results))

        self.__blast2dict(parsed, db)

    except Exception as why:
        self.log.add(T.lastErrorTrace())
        # Debugging aid kept from the original: exposes this frame's
        # locals as module globals before re-raising.
        globals().update(locals())
        self.log.writeln('local namespace is pushed into global ')
        raise BlastError(str(why))
handle = func("/somewhere/blast", "blastz", "nr", "/tmp/example.fasta", \ nprocessors=4, expectation="0.001", filter= "F > /etc/passwd'") assert False, "Attempted output redirection not caught!" except ValueError, e: assert str(e) == "Rejecting suspicious argument for filter" #Good ### _Scanner print "Running tests on _Scanner" scanner = NCBIStandalone._Scanner() for test in all_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) scanner.feed(open(datafile), ParserSupport.AbstractConsumer()) for test in detailed_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) scanner.feed(open(datafile), ParserSupport.TaggingConsumer()) ### BlastParser print "Running tests on BlastParser" parser = NCBIStandalone.BlastParser()
def localBlast(self, seqFile, db, method='blastp', resultOut=None,
               e='0.01', **kw):
    """
    Perform a local blast search (requires that the blast binaries
    and databases are installed locally).
    Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.
    The raw output and the error stream are copied to files in
    ``self.outFolder``; the first parsed record is handed to
    ``__blast2dict``.

    @param seqFile: file name with search sequence in FASTA format
    @type  seqFile: str
    @param db: database(s) to search, e.g. ['swissprot', 'pdb']
    @type  db: [str]
    @param method: search program to use, e.g. 'blastp', 'fasta'
                   (default: blastp)
    @type  method: str
    @param e: expectation value cutoff; converted with str() before the
              call
    @type  e: float
    @param resultOut: save blast output to this new file
    @type  resultOut: str
    @param kw: optional keywords::
            --- Scoring ---
            matrix        Matrix to use (default BLOSUM62).
            gap_open      Gap open penalty (default 0).
            gap_extend    Gap extension penalty (default 0).

            --- Algorithm ---
            gapped        Whether to do a gapped alignment. T/F
                          (default T)
            wordsize      Word size (blastp default 11).
            keep_hits     Number of best hits from a region to keep
                          (default off).
            xdrop         Dropoff value (bits) for gapped alignments
                          (blastp default 25).
            hit_extend    Threshold for extending hits (blastp default 11)

            --- Processing ---
            filter        Filter query sequence? (T/F, default F)
            restrict_gi   Restrict search to these GI's.
            believe_query Believe the query defline? (T/F, default F)
            nprocessors   Number of processors to use (default 1).

            --- Formatting ---
            alignments    Number of alignments. (default 250)
    @type  kw: any

    @raise BlastError: if program call fails
    """
    results = err = None
    resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
    kw = self.__dictvalues2str(kw)
    e = str(e)

    try:
        if self.verbose:
            self.log.add('running blast...')

        results, err = NCBIStandalone.blastall(settings.blast_bin,
                                               method, db, seqFile,
                                               expectation=e,
                                               align_view='7',  ## XML output
                                               **kw)

        results = self.__copyFileHandle(results, resultOut)
        err = self.__copyFileHandle(err, self.outFolder + self.F_BLAST_ERROR)

        if self.verbose:
            self.log.writeln('Raw blast output copied to: ' + resultOut)

        parsed = next(NCBIXML.parse(results))

        self.__blast2dict(parsed, db)

    except Exception as why:
        self.log.add(T.lastErrorTrace())
        # Debugging aid kept from the original: exposes this frame's
        # locals as module globals before re-raising.
        globals().update(locals())
        self.log.writeln('local namespace is pushed into global ')
        raise BlastError(str(why))
def __init__(self, handle):
    """Keep ``handle`` and wrap it in a plain-text BLAST record iterator."""
    self.handle = handle
    self.blast_iter = NCBIStandalone.Iterator(
        handle, NCBIStandalone.BlastParser())
def __init__(self, handle):
    """Keep ``handle`` and wrap it in a plain-text BLAST record iterator."""
    self.handle = handle
    self.blast_iter = NCBIStandalone.Iterator(
        handle, NCBIStandalone.BlastParser())
import time
from multiprocessing import Process, Queue
import string
from Bio.Seq import Seq
from Bio.Blast import NCBIStandalone
import fileinput
import glob

# Script: list every HSP of a plain-text BLAST report that contains at least
# one mismatch, together with its mismatch and gap counts.
OutFile = r'Blast\out\02.blast_result_total.txt'

blast_parser = NCBIStandalone.BlastParser()
print(blast_parser)

# Both files are closed when the block ends; the original never closed the
# report it was writing.
with open(OutFile, 'r') as f:
    with open('Mismatch_total_LOD v3_171103.txt', 'w') as Mismatch_total_file:
        iterator = NCBIStandalone.Iterator(f, blast_parser)
        for record in iterator:
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    # Treat a missing gap count as zero so ungapped HSPs do
                    # not break the arithmetic below (presumably None when
                    # the report has no "Gaps =" field -- TODO confirm).
                    Gaps_Number = hsp.gaps[0] or 0
                    # aligned length minus identical and gapped positions
                    mismatch_Number = (hsp.identities[1]
                                       - (hsp.identities[0] + Gaps_Number))
                    if mismatch_Number != 0:
                        Mismatch_total_file.write('%s\t%s\t%s\t%s\n' % (
                            record.query, alignment.title,
                            mismatch_Number, Gaps_Number))
# Script section: run PSI-BLAST for the seed sequence and collect the ids of
# all hits; the script stops when fewer than three distinct hits are found.
starttime = time.time()
# Trailing comma: no newline yet, so later output continues on this line.
print 'retrieving homologous sequences from UniProt using PSI-BLAST...',
sys.stdout.flush()

## execute blastpgp ##
blastexe = '/usr/bin/blastpgp'
blastdb = '/clusterfs/ohana/external/UniProt/current/protein'
iters = 4
# NOTE(review): this name shadows the builtin ``eval`` for the rest of the
# module.
eval = 0.0001
maxseqs = 1000
results, errors = NCBIStandalone.blastpgp(blastexe,
                                          blastdb,
                                          seedfname,
                                          expectation=eval,
                                          alignments=maxseqs,
                                          npasses=iters)

## parse psiblast hits to get ids ##
# A set, so an id reported by several records is counted once.
blasthits = set([])
for blast_record in NCBIXML.parse(results):
    for alignment in blast_record.alignments:
        blasthits.add(alignment.hit_id)
if len(blasthits) < 3:
    print 'Sorry, only %d homologs were retrieved from UniProt, too few sequences to determine patterns of evolutionary conservation.' % len(blasthits)
    # Exits with status 0 even though the analysis cannot continue.
    sys.exit(0)
def dazheMpiBlast():
    """
    2010-4-14
        wrap all of dazhe's old code into this function

    Every rank loads the probe list and the reference genome.  Rank 0
    receives one pickled result list from each other rank and writes the
    entries to ``ler_raw.csv``.  Every other rank writes its slice of probes
    to a FASTA file, blastn's it (XML output), keeps the HSPs with at least
    15 identities from the first 1000 alignments of each record, and sends
    the pickled list to rank 0.
    """
    blast_bin_path = '/home/cmb-01/yuhuang/bin/blast/bin/blastall'
    database_fname = '/home/cmb-01/dazhemen/Data/Ler/Cereon_Ath_Ler.fasta'
    probes = vardata.vardata()
    comm = MPI.world.duplicate()
    genome = ref_genome()
    genome.load_chr()
    probes.readfromfile("/home/cmb-01/dazhemen/CNV/CNV_probelist.csv",
                        format=2)
    # probes per processor; rank 0 does no blasting, hence ``size - 1``.
    ppp = len(probes.data) // (comm.size - 1) + 1
    print("I'm %s of %s, finished loading\n" % (comm.rank, comm.size))
    if comm.rank == 0:
        # Master node: collects the results and writes the output file.
        outf = open("/home/cmb-01/dazhemen/CNV/ler_raw.csv", "w")
        try:
            outf.write(
                "Chromosome,Position,Probe_ID,Alignment_title,Number_matches\n")
            for dest in range(1, comm.size):
                data, source, tag = comm.receiveString(dest, None)
                print("I'm 0, collected data from %s\n" % source)
                partial_result_list = cPickle.loads(data)
                for result_entry in partial_result_list:
                    outf.write(result_entry[0].replace("_", ",") + "," +
                               ",".join([str(a) for a in result_entry[1:]]) +
                               "\n")
        finally:
            # The original never closed the output file.
            outf.close()
    else:
        probe_index_start = (comm.rank - 1) * ppp
        result_ls = []
        # The last slice may be shorter than ``ppp``.
        if probe_index_start + ppp >= len(probes.data):
            ppp = len(probes.data) - probe_index_start
        if ppp != 0:
            tmp_blast_infname = '/home/cmb-01/dazhemen/CNV/tmp_blast/' + str(
                comm.rank)
            with open(tmp_blast_infname, 'w') as inf:
                for i in range(probe_index_start, probe_index_start + ppp):
                    probe = probes.data[i]
                    # write the probe id
                    inf.write(">%s_%s_%s\n" % (probe[0], probe[1],
                                               probe[2][0]))
                    inf.write("%s\n" % genome.readprobe(probe[0], probe[1]))
            print("I'm %s, finished with generating blast file from probes %s to %s\n" % (
                comm.rank, probe_index_start, probe_index_start + ppp))
            result_handle, error_info = NCBIStandalone.blastall(
                blast_bin_path, "blastn", database_fname, tmp_blast_infname,
                align_view=7)
            blast_records = NCBIXML.parse(result_handle)
            print("I'm %s, finished with blasting\n" % comm.rank)
            while True:
                try:
                    blast_record = next(blast_records)
                except Exception:
                    # Best effort, as before: any failure to fetch the next
                    # record ends the loop.  Fixed: the message below was a
                    # bare string expression and was never printed.
                    print("I'm %s, finished with all records\n" % comm.rank)
                    break
                for alignment in blast_record.alignments[:1000]:
                    alignment_title = alignment.title
                    for hsp in alignment.hsps:
                        # [query name (probe id and pos), alignment title,
                        #  number of matches, pos in contig]
                        result_entry = [blast_record.query, alignment_title,
                                        0, 0]
                        if hsp.identities >= 15:
                            result_entry[2] = hsp.identities
                            result_entry[3] = hsp.sbjct_start
                            # NOTE(review): the flattened source does not
                            # show whether this append sat inside the
                            # threshold check; kept inside to match
                            # blastOneBatchProbes -- confirm.
                            result_ls.append(result_entry)
            print("I'm %s, finished with parsing blast result\n" % comm.rank)
        # Always report back, even with an empty list, so rank 0 is not left
        # waiting for this worker.
        result_data = cPickle.dumps(result_ls)
        comm.send(result_data, 0, 0)
def blastall_seq2seq(fastadata=(),
                     filenames=(),
                     output="ncbiparsed",
                     blastprogram="blastp",
                     remove_files=True,
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """
    Blast a single query sequence against a single subject sequence.

    choose proper input:
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    The subject file is formatted with formatdb and the query is then run
    against it with blastall.

    @param fastadata: 2-tuple of (header, sequence) pairs; written to
        temporary fasta files named after a random tag and the headers
    @param filenames: 2-tuple of existing fasta file names
    @param output: "ncbiparsed" returns the result of
        NCBIStandalone.BlastParser().parse(); anything else returns the raw
        blastall stdout text
    @param blastprogram: one of blastp, blastx, tblastn, tblastx
    @param remove_files: when true, delete the formatdb index files and both
        fasta files afterwards (this includes files passed via `filenames`)
    @param extra_blastp_params: mapping of blastall option letter -> value;
        only read, never modified, so the shared default dict is safe
    @raise ValueError: unsupported program, or not exactly one of
        fastadata / filenames given as a 2-tuple
    """
    # String exceptions (`raise "text"`) are themselves a TypeError on
    # Python >= 2.6, so real exception objects are raised here.
    if blastprogram not in ('blastp', 'tblastn', 'tblastx', 'blastx'):
        raise ValueError(
            "only blastp, blastx, tblastn and tblastx are supported")
    # formatdb -p: "F" = nucleotide database, "T" = protein database
    dna_or_prot = "F" if blastprogram in ('tblastn', 'tblastx') else "T"

    if (fastadata and isinstance(fastadata, tuple) and len(fastadata) == 2
            and not filenames):
        # input is fasta headers and sequence: write input files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        for fname, entry in ((fname_q, fastadata[0]), (fname_s, fastadata[1])):
            with open(fname, 'w') as fh:
                fh.write(">%s\n%s" % (entry[0], entry[1]))
    elif (filenames and isinstance(filenames, tuple) and len(filenames) == 2
          and not fastadata):
        # input is (supposed to be) filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    # formatdb
    # NOTE(review): file names are interpolated into shell command strings;
    # headers containing shell metacharacters would break (or abuse) them.
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))

    # and blastall!
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    ci, co, ce = osPopen3(
        "%s -p %s %s -i %s -d %s " %
        (BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
    ci.close()
    try:
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
    finally:
        # close the pipes even when parsing fails
        co.close()
        ce.close()

    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)

    # and return!
    return blastallout
# starttime = time.time() print 'retrieving homologous sequences from UniProt using PSI-BLAST...', sys.stdout.flush() ## execute blastpgp ## blastexe = '/usr/bin/blastpgp' blastdb = '/clusterfs/ohana/external/UniProt/current/protein' iters = 4 eval = 0.0001 maxseqs = 1000 results,errors = NCBIStandalone.blastpgp(blastexe, blastdb, seedfname, expectation=eval, alignments=maxseqs, npasses=iters) ## parse psiblast hits to get ids ## blasthits = set([]) for blast_record in NCBIXML.parse(results): for alignment in blast_record.alignments: blasthits.add(alignment.hit_id) if len(blasthits) < 3: print 'Sorry, only %d homologs were retrieved from UniProt, too few sequences to determine patterns of evolutionary conservation.' % len(blasthits) sys.exit(0) handle = open('intrepid-psiblast-ids.txt', 'w')
documentation. """ # standard library import os import sys # biopython from Bio.Blast import NCBIStandalone my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta') my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast', 'sorghum_est-test.fasta') my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall') print 'Running blastall...' blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn', my_blast_db, my_blast_file) b_parser = NCBIStandalone.BlastParser() b_iterator = NCBIStandalone.Iterator(blast_out, b_parser) while 1: b_record = b_iterator.next() if b_record is None: break E_VALUE_THRESH = 0.04 for alignment in b_record.alignments: for hsp in alignment.hsps:
def predict(input_sequence):
    """
    Run the SeRenDIP prediction pipeline for one sequence and return the
    parsed result.

    Results are cached in settings["RESULTS_DIR"] under the sequence hash;
    a file lock on "<results_path>.lock" is held for the whole run. On a
    cache miss the pipeline runs in a fresh temporary directory, which is
    always removed afterwards:

      1. NetSurfP on the input sequence
      2. hit ids from the last PSI-BLAST round -> fastacmd -> hit sequences
      3. NetSurfP on every hit (netsurf_hit subtasks)
      4. MUSCLE alignment of hits + input, per-position entropies
      5. DynaMine on every sequence
      6. the R prediction script, whose "<input_id>.out" file is copied to
         the cache

    @param input_sequence: sequence to predict for (written as-is under a
        ">input" fasta header)
    @return: whatever parse_serendip_results() returns for the result file
    @raise Exception: no PSI-BLAST rounds, fastacmd failed to fetch the
        hits, or the R script produced no output file
    """
    if settings["SERENDIP_DIR"] not in sys.path:
        # needed for the import below; avoid growing sys.path on every call
        sys.path.append(settings["SERENDIP_DIR"])
    from sequence.entropy.lib.seq_lib import FastaParser
    from Bio.Blast import NCBIStandalone

    sequence_hash = get_sequence_hash(input_sequence)
    results_path = os.path.join(settings["RESULTS_DIR"], sequence_hash)
    lock_path = results_path + ".lock"

    with FileLock(lock_path):
        if os.path.isfile(results_path):
            # cached result from an earlier run
            with open(results_path, 'r') as f:
                return parse_serendip_results(f.read())

        input_id = 'input'
        out_dir = tempfile.mkdtemp()
        try:
            out_file = os.path.join(out_dir, 'output.myrsa')
            input_fasta_path = os.path.join(out_dir, input_id + '.fa')

            # Netsurf
            with open(input_fasta_path, 'w') as f:
                f.write(">%s\n%s" % (input_id, input_sequence))
            cmd = [settings["NETSURF_EXE"], "-i", input_fasta_path,
                   "-d", settings["NR70_DB"], "-a", "-k",
                   "-T", out_dir, "-o", out_file]
            _log.info(cmd)
            subprocess.call(cmd)

            blast_parser = NCBIStandalone.PSIBlastParser()
            blastout_path = os.path.join(out_dir, input_id + '.blastout')
            with open(blastout_path, 'r') as f:
                blast_record = blast_parser.parse(f)
            # `rounds` is indexed as a sequence below; the former
            # `rounds <= 0` compared a list with an int and never fired.
            if not blast_record.rounds:
                raise Exception("no netsurf hits")
            hit_titles = [alignment.title[1:]
                          for alignment in blast_record.rounds[-1].alignments]

            id_path = os.path.join(out_dir, input_id + '.blastout_id')
            with open(id_path, 'w') as f:
                for hit_title in hit_titles:
                    f.write(hit_title + '\n')

            blast_hits_path = os.path.join(out_dir, 'output_seqs.fa')
            cmd = [settings["FASTACMD_EXE"], "-d", settings["NR70_DB"],
                   '-i', id_path, '-o', blast_hits_path]
            _log.info(cmd)
            result = subprocess.call(cmd)
            if result != 0:
                raise Exception("No blast hits for input sequence")

            # We have blast hits
            # Netsurf on hits
            netsurf_append_path = os.path.join(out_dir, 'output_other.myrsa')
            with open(blast_hits_path, 'r') as hits_file:
                subtasks = [netsurf_hit.delay(str(seq))
                            for seq in FastaParser(hits_file)]
            with open(netsurf_append_path, 'w') as f:
                for subtask in subtasks:
                    f.write(subtask.get())

            # Append input sequence to blast hits for alignment as input
            # for entropy and DynaMine
            with open(blast_hits_path, 'a') as f:
                f.write('>input\n' + input_sequence + '\n')

            # Make alignment using muscle
            alignment_path = os.path.join(out_dir, "output.ali")
            cmd = [settings["MUSCLE_EXE"], "-in", blast_hits_path,
                   "-out", alignment_path]
            _log.info(cmd)
            subprocess.call(cmd)

            # Alignment position entropies
            entropy_path = os.path.join(out_dir, "output.entropy")
            with open(alignment_path, 'r') as alignment_file:
                hit_sequences = FastaParser(alignment_file)
                # NOTE(review): the result of normalize() is discarded and
                # frequencies() is called again below - confirm that
                # normalize() works in place.
                hit_sequences.frequencies().normalize()
                hit_entropies = hit_sequences.frequencies().entropies()
                # written while the alignment is still open, in case the
                # entropies are computed lazily
                with open(entropy_path, 'w') as f:
                    for n, entropy in enumerate(hit_entropies, 1):
                        f.write(str(n) + ' ' + str(entropy) + '\n')

            # Run dynamine on each sequence
            dynamine_fasta_path = os.path.join(out_dir, "output_seq.fasta")
            dynamine_env = dict(
                os.environ,
                **{"PYTHONPATH": "/usr/local/lib/python2.7/site-packages/"})
            with open(blast_hits_path, 'r') as hits_file:
                for seq in FastaParser(hits_file):
                    # We use this file name, to avoid confusing the rest of
                    # the script:
                    with open(dynamine_fasta_path, 'w') as f:
                        f.write(str(seq))
                    cmd = [settings["DYNAMINE_EXE"], "-a", dynamine_fasta_path]
                    _log.info(cmd)
                    subprocess.call(cmd, env=dynamine_env)

            # Run prediction script
            result_testing_path = os.path.join(
                settings["SERENDIP_DIR"], "sequence", "Result_Testing")
            combined_path = os.path.join(
                settings["SERENDIP_DIR"], "sequence", "five_models_combined")
            dynamine_path = os.path.splitext(dynamine_fasta_path)[0]
            cmd = [settings["RSCRIPT_EXE"], settings["RF_SCRIPT"], input_id,
                   alignment_path, entropy_path, netsurf_append_path,
                   dynamine_path, out_file, result_testing_path,
                   combined_path]
            _log.info(cmd)
            # Run the script inside out_dir via cwd= instead of os.chdir():
            # chdir left the whole process sitting in a directory that the
            # finally-clause below deletes.
            subprocess.call(cmd, cwd=out_dir)

            output_result_path = os.path.join(out_dir, input_id + '.out')
            if not os.path.isfile(output_result_path):
                raise Exception("No ouput generated")
            shutil.copyfile(output_result_path, results_path)
            with open(results_path, 'r') as f:
                data = parse_serendip_results(f.read())

            # Start making the scene:
            yasara_scene.delay(data)
            return data
        finally:
            if os.path.isdir(out_dir):
                shutil.rmtree(out_dir)
# Command-line arguments: argv[2] = nucleotide fasta to use as BLAST
# database, argv[3] = output file. queryFile is defined outside this fragment.
dbFile = argv[2]
outFile = argv[3]
#padding = int(argv[4])

# Format dbFile
# A "<dbFile>.nin" file is taken as the sign that formatdb has already run.
if os.path.exists("%s.nin" % dbFile):
    print "--[WARNING]blastdb already formated, using the existing one."
else:
    print "Formatting database..."
    # NOTE(review): file names are interpolated into shell commands unquoted;
    # paths with spaces or shell metacharacters will break these calls.
    os.system('formatdb -i %s -p F' % dbFile)

# Run BLAST
# blastn, e-value cutoff 1e-10, pairwise text output (-m 0) written to
# temp.blo in the current working directory.
os.system(
    'blastall -p blastn -i %s -d %s -e 1e-10 -v 100000 -b 100000 -m 0 -o temp.blo -q -2'
    % (queryFile, dbFile))

# Extract sequences from blo file
outf = open(outFile, 'w')
blast_parser = NCBIStandalone.BlastParser()
blast_iterator = NCBIStandalone.Iterator(open('temp.blo'), blast_parser)
#blast_iterator = SearchIO.parse(open('temp.blo'),'blast-txt') #if switch to SearchIO, this is the way to go (not working yet)
# One output entry per HSP: "<title>_<sbjct start>-<sbjct end>", then the
# aligned subject string, then a blank line.
# NOTE(review): outf and the temp.blo handle are not closed within this
# fragment - confirm that the surrounding script does so.
for hit in blast_iterator:
    for alignment in hit.alignments:
        for hsp in alignment.hsps:
            #print alignment.title
            #print hsp.sbjct_start, hsp.sbjct_end
            #print hsp.sbjct
            outf.write(
                "%s_%s-%s\n%s\n\n" %
                (alignment.title, hsp.sbjct_start, hsp.sbjct_end, hsp.sbjct))
# CL Structure: ./blastpquerydb.py [QUERY FASTA FILE] [DATABASE TO SEARCH (MAKE WITH MAKEBLASTDB)]
#
# For every record in the query fasta: write it to a temporary one-sequence
# fasta file, blastp it against the database, and print the best match
# (description, scores and the query/match/subject alignment strings).

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.SubsMat.MatrixInfo import blosum62
from Bio.Blast import NCBIStandalone
from sys import argv

queryList = argv[1]

# Scratch files. Both the BLAST `out` argument and the later open() use the
# same absolute path, so the script reads exactly the file BLAST wrote
# (previously BLAST was given '~/temp/output.txt', relying on shell expansion).
query_path = '/home/underasail/temp/seq.fa'
output_path = '/home/underasail/temp/output.txt'

for seq_record in SeqIO.parse(queryList, "fasta"):
    SeqIO.write(SeqRecord(seq_record.seq, id = seq_record.id),
                query_path, 'fasta')
    blastp_cli = NcbiblastpCommandline(
        cmd = "/home/underasail/ncbi-blast/ncbi-blast-2.7.1+/bin/blastp",
        query = query_path,
        db = argv[2],
        matrix = 'blosum62',
        evalue = 0.01,
        num_descriptions = 1,
        num_alignments = 1,
        out = output_path)
    blastp_cli()

    # Close the result file after parsing; one handle used to leak per query.
    with open(output_path, 'r') as result_handle:
        blast_parser = NCBIStandalone.BlastParser()
        blast_record = blast_parser.parse(result_handle)

    print('Human ID: ', seq_record.id)
    for description in blast_record.descriptions:
        print('Mouse Match ID: ', description.title)
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            print('Score: ', hsp.score)
            print('Bits: ', hsp.bits)
            print('E-value: ', hsp.expect)
            print('Alignment (Query/Match): ')
            print(hsp.query)
            print(hsp.match)
            # the blank-line separator must be a string literal; the bare
            # \n\n that stood here was a syntax error
            print(hsp.sbjct, '\n\n')
def do_blast_search(self):
    """
    Run blastall in blastp mode for this object's query file.

    Reads self.blast_exe, self.blast_db and self.blast_file, and stores the
    two handles returned by NCBIStandalone.blastall as self.result_handle
    and self.error_handle.
    """
    from Bio.Blast import NCBIStandalone

    handles = NCBIStandalone.blastall(
        self.blast_exe,
        "blastp",
        self.blast_db,
        self.blast_file,
    )
    self.result_handle, self.error_handle = handles
def blast(blastRootDirectory):
    """
    blastp 'filetoblast.txt' against 'blastDB.fasta' and sort the queries
    into those with and without a significant non-self match.

    On win32 the database and the Blastall.exe binary are taken from
    blastRootDirectory. Elsewhere the database is expected in /tmp/BLAST
    (the directory is created and 'formatdb' copied there if missing) and
    blastall is looked up under ../../BLAST/bin/ relative to the current
    working directory.

    A match counts when an HSP has an e-value below 0.001 and the alignment
    title (with ">" removed) differs from the query name.

    @param blastRootDirectory: directory holding filetoblast.txt (and, on
        win32, blastDB.fasta and Blastall.exe)
    @return: (nonmatchingQueries, results) where nonmatchingQueries is a
        list of query names and results a list of
        (query, alignment title, e-value) tuples, one per qualifying HSP
    """
    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO

    on_windows = sys.platform == 'win32'

    if on_windows:
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        if not os.path.isdir('/tmp/BLAST'):
            print("making directory '/tmp/BLAST'")
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'),
                        '/tmp/BLAST')
            print("copying 'formatdb' to '/tmp/BLAST/'")
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')

    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')

    if on_windows:
        blast_exe = os.path.join(blastRootDirectory, 'Blastall.exe')
    else:
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', 'blastall')

    if on_windows:
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)

    blast_out, error_info = NCBIStandalone.blastall(
        blast_exe, 'blastp', blast_db, blast_file, align_view=7)

    # Read each pipe exactly once. The output used to be read() for the
    # debug print and then handed, already exhausted, to the parser, so no
    # record was ever seen. Stdout is drained first, matching the order in
    # which blastall produces its data.
    blast_text = blast_out.read()
    error_text = error_info.read()
    print("errors: %s" % error_text)
    print("blast output: %s" % blast_text)

    # Parse from an in-memory copy. The iterator hands the parser one record
    # at a time; the former extra b_parser.parse(blast_out) call would have
    # consumed the whole stream before the iterator got to it.
    b_parser = NCBIXML.BlastParser()
    b_iterator = NCBIStandalone.Iterator(StringIO(blast_text), b_parser)

    e_value_thresh = 0.001
    results = []
    nonmatchingQueries = []
    while True:
        b_record = b_iterator.next()
        if not b_record:
            # the iterator returns None once the output is exhausted
            break
        print("query: %s" % b_record.query)
        print("number of alignments: %s" % len(b_record.alignments))
        significant = False
        for alignment in b_record.alignments:
            title = alignment.title.replace(">", "")
            if b_record.query == title:
                # self hit
                continue
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    significant = True
                    print("adding %s and %s to the list of matches" %
                          (b_record.query, title))
                    results.append((b_record.query, title, hsp.expect))
        print("%s %s" % (b_record.query, significant))
        if not significant:
            print("adding %s to the list of queries without matches" %
                  b_record.query)
            nonmatchingQueries.append(b_record.query)
    return nonmatchingQueries, results
# Index every query sequence by its fasta description line, each starting
# with a hit count of zero. `handle` (the open query fasta) is defined
# outside this fragment.
query_sequences = {}
it = Bio.Fasta.Iterator(handle, Bio.Fasta.SequenceParser())
seq = it.next()
# The loop relies on it.next() returning a false value once the file is
# exhausted.
while seq:
    query_sequences[seq.description] = {}
    query_sequences[seq.description]["number_of_hits"] = 0
    print seq.description
    print query_sequences[seq.description]["number_of_hits"]
    seq = it.next()
handle.close()

# Run the search; blast_exe, blast_program, blast_db and blast_file are
# defined outside this fragment.
blast_out, error_handle = NCBIStandalone.blastall(blast_exe, blast_program, blast_db, blast_file)
#print error_handle
# NOTE(review): NCBIXML.parse needs XML output, and no align_view is passed
# above - confirm that the installed Biopython defaults blastall to XML.
records = NCBIXML.parse(blast_out)
#b_record = records.next()
#
#
#E_VALUE_THRESH = 0.000000004
#
#print dir(b_record)
#print b_record.num_sequences
#print "Query = %s" % b_record.query
#b_record = records.next()
def main():
    """
    Chain sequences by repeated best-hit BLAST searches.

    Starting from the germline sequence, each round builds a BLAST database
    from the remaining input sequences (workable.fasta), blastp's the
    current query (germ.fasta) against it, appends the best hit's
    query/match/subject alignment strings to the output file, makes that hit
    the next query and removes it from the remaining sequences. The loop
    ends when no sequences are left, or when BLAST returns no alignment.

    Command-line options: -i/--input, -g/--germline, -o/--output.
    Works on the files workable.fasta, germ.fasta and blastout in the
    current working directory.
    """
    parser = OptionParser()
    parser.add_option("-i", "--input", action="store", dest="input",
                      help="input file to make phylotree")
    parser.add_option("-g", "--germline", action="store", dest="germline",
                      help="germline fasta")
    parser.add_option("-o", "--output", action="store", dest="output",
                      help="the file where you want all your data")
    (options, args) = parser.parse_args()
    if len(sys.argv) < 2:
        dowhat()
        parser.print_help()
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive prompt and is not guaranteed to exist
        sys.exit()

    with open(options.output, 'w') as header_file:
        header_file.write("Your Sequence Results:\n\n")
    copy(options.input, "workable.fasta")
    copy(options.germline, "germ.fasta")

    list_of_database_files = SeqIO.to_dict(
        SeqIO.parse("workable.fasta", "fasta"))
    while list_of_database_files:
        list_of_database_files = SeqIO.to_dict(
            SeqIO.parse("workable.fasta", "fasta"))
        populate_database("workable.fasta")
        print("***DatabasePopulated***")
        # (germ.fasta used to be opened for reading here, never read, and
        #  "closed" with `.close` without parentheses - dropped.)
        cline = NcbiblastpCommandline(matrix="PAM30", evalue="20",
                                      word_size="2", query="germ.fasta",
                                      cmd='blastp', db="temporary_database",
                                      out="blastout")
        print("****Cline = *** ---> %s" % cline)
        call_blast(cline)
        print("***Call_blast_successful***")
        with open('blastout') as result_handle:
            print("***result handle successful***")
            blast_parser = NCBIStandalone.BlastParser()
            print("***blast_parser****")
            blast_record = blast_parser.parse(result_handle)
        print("***blast_record***")

        if not blast_record.alignments:
            # Nothing left that BLAST can link to the current query; stop
            # here rather than fail with an IndexError on alignments[0].
            print("***no alignment found, %d sequence(s) left unplaced***" %
                  len(list_of_database_files))
            break
        best_alignment = blast_record.alignments[0]
        best_hsp = best_alignment.hsps[0]
        current_object = best_alignment.title[2:]

        # The best hit becomes the query of the next round. The file is
        # closed right away so the next blastp run reads the complete record.
        with open("germ.fasta", 'w') as newsequence_search:
            newsequence_search.write(">" + str(current_object) + "\n" +
                                     str(best_hsp.sbjct))
        print(current_object)

        with open(options.output, 'a') as newfile:
            newfile.write(str(best_hsp.query) + "----> Query\n")
            newfile.write(str(best_hsp.match) + "----> Score of: " +
                          str(best_hsp.score) + "\n")
            newfile.write(str(best_hsp.sbjct) + "----> Template\n\n")

        list_of_database_files.pop(current_object)
        SeqIO.write(list_of_database_files.values(), "workable.fasta",
                    "fasta")
def localPSIBlast(self, seqFile, db, method='blastp', resultOut=None, e='0.001', **kw): """ Performa a local psi-blast search (requires that the blast binaries and databases are installed localy). Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) for the search @param seqFile: file name with search sequence in FASTA format @type seqFile: str @param db: database(s) to search e.g. ['swissprot', 'pdb'] @type db: [str] @param e: expectation value cutoff (default: 0.001) @type e: float @param resultOut: save blast output to this new file @type resultOut: str @param kw: optional keywords:: --- New Blast+ routine --- (see NcbipsiblastCommandline) num_iterations Number of passes (default 1). matrix Matrix to use (default BLOSUM62). --- old blastall routine --- --- Scoring --- matrix Matrix to use (default BLOSUM62). gap_open Gap open penalty (default 11). gap_extend Gap extension penalty (default 1). window_size Multiple hits window size (default 40). npasses Number of passes (default 1). passes Hits/passes (Integer 0-2, default 1). --- Algorithm --- gapped Whether to do a gapped alignment (T/F, default T). wordsize Word size (default 3). keep_hits Number of beset hits from a region to keep (def 0) xdrop Dropoff value (bits) for gapped alignments (def 15) hit_extend Threshold for extending hits (default 11). nbits_gapping Number of bits to trigger gapping (default 22). pseudocounts Pseudocounts constants for multiple passes (def 9). xdrop_final X dropoff for final gapped alignment (default 25). xdrop_extension Dropoff for blast extensions (default 7). model_threshold E-value threshold to include in multipass model (default 0.005). required_start Start of required region in query (default 1). required_end End of required region in query (default -1). --- Processing --- filter Filter query sequence with SEG? (T/F, default F) believe_query Believe the query defline? (T/F, default F) nprocessors Number of processors to use (default 1). 
--- Formatting --- alignments Number of alignments (default 250). @type kw: any @raise BlastError: if program call failes """ ## the following should work for new Blast+ tools: #from Bio.Blast.Applications import NcbipsiblastCommandline #resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT #blastx_cline = NcbipsiblastCommandline(query=seqFile, #db=db, #evalue=e, #outfmt=5, #out=resultOut, #**kw) #stdout, stderr = blastx_cline() #parsed = NCBIXML.parse( results ).next() #self.__blast2dict( parsed, db ) results = err = None resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT kw = self.__dictvalues2str(kw) e = str(e) try: results, err = NCBIStandalone.blastpgp( settings.psi_blast_bin, db, seqFile, program='blastpgp', align_view='7', ## XML output expectation=e, **kw) results = self.__copyFileHandle(results, resultOut) err = self.__copyFileHandle(err, self.outFolder + self.F_BLAST_ERROR) if self.verbose: self.log.writeln('Raw blast output copied to: ' + resultOut) parsed = NCBIXML.parse(results).next() self.__blast2dict(parsed, db) except Exception, why: self.log.add(T.lastErrorTrace()) globals().update(locals()) self.log.writeln('local namespace is pushed into global ') raise BlastError(str(why))
def blast(blastRootDirectory):
    """
    blastp 'filetoblast.txt' against 'blastDB.fasta' and sort the queries
    into those with and without a significant non-self match.

    On win32 the database and the Blastall.exe binary are taken from
    blastRootDirectory. Elsewhere the database is expected in /tmp/BLAST
    (the directory is created and 'formatdb' copied there if missing) and
    blastall is looked up under ../../BLAST/bin/ relative to the current
    working directory.

    A match counts when an HSP has an e-value below 0.001 and the alignment
    title (with ">" removed) differs from the query name.

    @param blastRootDirectory: directory holding filetoblast.txt (and, on
        win32, blastDB.fasta and Blastall.exe)
    @return: (nonmatchingQueries, results) where nonmatchingQueries is a
        list of query names and results a list of
        (query, alignment title, e-value) tuples, one per qualifying HSP
    """
    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO

    on_windows = sys.platform == 'win32'

    if on_windows:
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        if not os.path.isdir('/tmp/BLAST'):
            print("making directory '/tmp/BLAST'")
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'),
                        '/tmp/BLAST')
            print("copying 'formatdb' to '/tmp/BLAST/'")
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')

    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')

    if on_windows:
        blast_exe = os.path.join(blastRootDirectory, 'Blastall.exe')
    else:
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', 'blastall')

    if on_windows:
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)

    blast_out, error_info = NCBIStandalone.blastall(
        blast_exe, 'blastp', blast_db, blast_file, align_view=7)

    # Read each pipe exactly once. The output used to be read() for the
    # debug print and then handed, already exhausted, to the parser, so no
    # record was ever seen. Stdout is drained first, matching the order in
    # which blastall produces its data.
    blast_text = blast_out.read()
    error_text = error_info.read()
    print("errors: %s" % error_text)
    print("blast output: %s" % blast_text)

    # Parse from an in-memory copy. The iterator hands the parser one record
    # at a time; the former extra b_parser.parse(blast_out) call would have
    # consumed the whole stream before the iterator got to it.
    b_parser = NCBIXML.BlastParser()
    b_iterator = NCBIStandalone.Iterator(StringIO(blast_text), b_parser)

    e_value_thresh = 0.001
    results = []
    nonmatchingQueries = []
    while True:
        b_record = b_iterator.next()
        if not b_record:
            # the iterator returns None once the output is exhausted
            break
        print("query: %s" % b_record.query)
        print("number of alignments: %s" % len(b_record.alignments))
        significant = False
        for alignment in b_record.alignments:
            title = alignment.title.replace(">", "")
            if b_record.query == title:
                # self hit
                continue
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    significant = True
                    print("adding %s and %s to the list of matches" %
                          (b_record.query, title))
                    results.append((b_record.query, title, hsp.expect))
        print("%s %s" % (b_record.query, significant))
        if not significant:
            print("adding %s to the list of queries without matches" %
                  b_record.query)
            nonmatchingQueries.append(b_record.query)
    return nonmatchingQueries, results
#!/usr/bin/python
# Run blastn for Record1.fasta against the EST_Clade_A database and print
# the alignments of the first record in the output.

from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML

my_blast_exe = "/usr/bin/blastall"
my_blast_db = "/home/kenglish/Data/Genomes/Databases/EST_Clade_A"
my_blast_file = "Record1.fasta"

result_handle, error_handle = NCBIStandalone.blastall(
    my_blast_exe,
    "blastn",
    my_blast_db,
    my_blast_file,
)

# Only the first record is looked at.
blast_records = NCBIXML.parse(result_handle)
blast_record = next(blast_records)
print(blast_record.alignments)
def localBlast( self, seqFile, db, method='blastp',
                resultOut=None, e='0.01', **kw ):
    """
    Perform a local blast search (requires that the blast binaries
    and databases are installed locally).
    Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

    The raw XML output is copied to resultOut, the error stream to
    outFolder + F_BLAST_ERROR, and the first parsed record is handed to
    __blast2dict.

    @param seqFile: file name with search sequence in FASTA format
    @type  seqFile: str
    @param db: database(s) to search, e.g. ['swissprot', 'pdb']
    @type  db: [str]
    @param method: search program to use, e.g. 'blastp', 'fasta'
                   (default: blastp); passed to blastall as the program
    @type  method: str
    @param e: expectation value cutoff (default: 0.01); converted with
              str() before it is passed on
    @type  e: float or str
    @param resultOut: save blast output to this new file
                      (default: outFolder + F_BLAST_RAW_OUT)
    @type  resultOut: str
    @param kw: optional keywords, passed through __dictvalues2str and on
               to blastall::
        --- Scoring ---
        matrix         Matrix to use (default BLOSUM62).
        gap_open       Gap open penalty (default 0).
        gap_extend     Gap extension penalty (default 0).

        --- Algorithm ---
        gapped         Whether to do a gapped alignment. T/F
                        (default T)
        wordsize       Word size (blastp default 11).
        keep_hits      Number of best hits from a region to keep
                        (default off).
        xdrop          Dropoff value (bits) for gapped alignments
                        (blastp default 25).
        hit_extend     Threshold for extending hits (blastp default 11)

        --- Processing ---
        filter         Filter query sequence? (T/F, default F)
        restrict_gi    Restrict search to these GI's.
        believe_query  Believe the query defline? (T/F, default F)
        nprocessors    Number of processors to use (default 1).

        --- Formatting ---
        alignments     Number of alignments. (default 250)
    @type  kw: any

    @raise BlastError: if program call fails (any exception raised while
                       running or parsing is converted)
    """
    # p is never assigned anything else in this method
    results = err = p = None
    resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
    kw = self.__dictvalues2str( kw )
    e = str(e)

    try:
        if self.verbose:
            self.log.add('running blast...')

        results, err = NCBIStandalone.blastall( settings.blast_bin,
                                                method, db, seqFile,
                                                expectation=e,
                                                align_view='7', ## XML output
                                                **kw)

        # __copyFileHandle returns a handle that replaces the original one
        # and is used for parsing below.
        results = self.__copyFileHandle(results, resultOut)
        err = self.__copyFileHandle(err, self.outFolder+self.F_BLAST_ERROR)

        if self.verbose:
            self.log.writeln('Raw blast output copied to: ' + resultOut )

        # only the first record of the XML output is used
        parsed = NCBIXML.parse( results ).next()

        self.__blast2dict( parsed, db )

    except Exception, why:
        self.log.add( T.lastErrorTrace() )
        # NOTE(review): debugging aid - copies every local variable of this
        # call (including self) into the module's global namespace.
        globals().update( locals() )
        self.log.writeln('local namespace is pushed into global ')
        raise BlastError( str(why) )