def getHits(gene):
    '''
    Parse a standard-format BLAST report with Biopython and write the hits.

    The report path is taken from the module-level options object
    (``o.blast``) and rows are written to the module-level ``out`` handle.

    Input:
        gene -- dict mapping a query identifier (first space-separated token
                of the BLAST query line) to the name written in the output.
                Entries are DELETED from this dict as their query is seen,
                so on return it holds only the genes that were not found.

    Output (per matching query, written to ``out``):
        a line with the gene name, followed by up to three tab-separated
        rows ``name, subject, e-value, High|Low|Scare``, or a single
        ``name, NA, NA, NA`` row when there is nothing to report.
    '''
    lg = len(gene)
    if o.verbose:
        sys.stderr.write("\nGetting hits...\n")

    inf = open(o.blast, 'rU')
    try:
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)

        ## *** Parsing *** ##
        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue

            name = gene[query]
            out.write("%s\n" % name)

            # An empty alignment list must be tested by truthiness;
            # ``is []`` compares identity with a fresh list and never matches.
            if not record.alignments:
                out.write("%s\tNA\tNA\tNA\n" % name)
            else:
                flag = 0  # number of rows already written for this query
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        ## ** Selection Process ** ##
                        evalue = float(hsp.expect)
                        if flag < 3 and evalue < 5.0:
                            if evalue < 0.0001:
                                label = "High"
                            elif evalue < 1.0:
                                label = "Low"
                            else:
                                label = "Scare"
                            out.write("%s\t%s\t%s\t%s\n" %
                                      (name, alignment.title.split(">")[1],
                                       evalue, label))
                            flag += 1
                        elif flag < 1 and evalue > 1.0:
                            # Nothing written yet and this HSP is too weak.
                            out.write("%s\tNA\tNA\tNA\n" % name)
                            flag += 1

            del gene[query]

            if o.verbose:
                # Carriage return so the percentage overwrites itself.
                sys.stderr.write('\r')
                sys.stderr.write(str(int((lg - len(gene)) * 100 / lg)) + '%')
                sys.stderr.flush()
                sys.stdout.flush()
    finally:
        inf.close()

    # Whatever is left in ``gene`` never appeared as a query in the report.
    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene))
def blast2data(filehandle):  ###This should be for blast-txt
    """BLAST output to data dict.

    Parses the plain-text BLAST report read from ``filehandle`` and groups
    the accepted HSPs by reference identifier.

    The reference identifier is taken from the alignment title with its
    first character dropped: the second ``|``-separated field when the title
    contains ``|``, otherwise the first space-separated token.

    HSPs whose expect value exceeds the module-level ``EVALUE_CUTOFF`` are
    skipped.

    Side effects: the first time a reference is seen it is appended to the
    module-level ``references`` list (and ``refgenome2json`` is called for
    it) if not already present, and its length is stored in the module-level
    ``refLengths`` via ``gi2length`` if missing.

    Returns:
        dict mapping reference id -> list of
        ``[start, identity, end, readname]`` with ``start <= end`` and
        ``identity`` a percentage rounded to one decimal.
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)

    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]

        for alignment in blast_record.alignments:
            title = alignment.title[1:].strip()
            # Test the same text that gets split, so a lone leading '|'
            # cannot produce a one-element list and an IndexError.
            if '|' in title:
                refgi = title.split('|')[1]
            else:
                refgi = title.split(' ')[0]

            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue

                identity = round(
                    float(hsp.identities[0]) * 100 / hsp.identities[1], 1)

                # Normalise so that start <= end regardless of strand.
                start, end = hsp.sbjct_start, hsp.sbjct_end
                if start > end:
                    start, end = end, start

                if refgi not in data:
                    data[refgi] = []
                    if refgi not in references:
                        references.append(refgi)
                        refgenome2json(refgi)
                    if refgi not in refLengths:
                        refLengths[refgi] = gi2length(refgi)

                data[refgi].append([start, identity, end, readname])

    return data
def getCoordinatesFromBlo(bloFname, padding):
    '''
    # Extract coordinates from blo file

    Reads the plain-text BLAST report ``bloFname`` (falling back to
    'temp.blo' when the name is empty) and records, per subject title with
    every '>' removed, one ``[sbjct_start, sbjct_end]`` pair.

    An HSP is ignored when it lies entirely inside the pair already stored
    for that subject; otherwise it REPLACES the stored pair (ranges are not
    merged).

    ``padding`` is accepted for interface compatibility and is not used.

    Returns a dict: subject title -> [sbjct_start, sbjct_end].
    '''
    coord = {}
    blast_parser = NCBIStandalone.BlastParser()

    # TODO: SearchIO.parse(handle, 'blast-txt') is the way to go if this is
    # ever switched to SearchIO (not working yet).
    with open(bloFname or 'temp.blo') as blo:
        for hit in NCBIStandalone.Iterator(blo, blast_parser):
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    known = coord.get(fullName)
                    if (known is not None
                            and hsp.sbjct_start >= known[0]
                            and hsp.sbjct_end <= known[1]):
                        # Fully covered by what we already have.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]

    return coord
def blast_parse(file, e, output):
    """Write a tab-separated table of BLAST hits below an e-value cutoff.

    Args:
        file: path of the plain-text BLAST report to read.
        e: expect-value threshold; only HSPs with ``hsp.expect < e`` are
            written.
        output: path of the table to create (overwritten if it exists).

    Each row holds the first 18 characters of the query followed by a
    space, then the alignment title, alignment length and expect value.
    Every record in the report is processed, including the first one.
    """
    blast_parser = NCBIStandalone.BlastParser()

    with open(file) as result_handle:
        with open(output, 'w') as table:
            table.write('query title\tdescription\tlength\te value' + '\n')

            # Iterate from the very first record: pulling one record with
            # next() beforehand would drop that query from the table.
            for blast_record in NCBIStandalone.Iterator(result_handle,
                                                        blast_parser):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            table.write('%s \t%s\t%s\t%s\n' % (
                                blast_record.query[:18],
                                alignment.title,
                                alignment.length,
                                hsp.expect,
                            ))
def __init__(self, handle):
    """Keep the input handle and wrap it in a BLAST record iterator.

    ``self.handle`` is the handle as passed in; ``self.blast_iter`` is an
    ``NCBIStandalone.Iterator`` over it using a fresh ``BlastParser``.
    """
    self.handle = handle
    self.blast_iter = NCBIStandalone.Iterator(
        handle,
        NCBIStandalone.BlastParser(),
    )
dbFile = argv[2]
outFile = argv[3]
#padding = int(argv[4])

# Format dbFile (skipped when the .nin index file is already present).
if os.path.exists("%s.nin" % dbFile):
    print("--[WARNING]blastdb already formated, using the existing one.")
else:
    print("Formatting database...")
    # NOTE: the file name is interpolated into a shell command unquoted.
    os.system('formatdb -i %s -p F' % dbFile)

# Run BLAST; the report is written to temp.blo in the working directory.
os.system(
    'blastall -p blastn -i %s -d %s -e 1e-10 -v 100000 -b 100000 -m 0 -o temp.blo -q -2'
    % (queryFile, dbFile))

# Extract sequences from the blo file: one entry per HSP, made of the
# alignment title with the subject range, then the subject sequence.
# Both files are closed (and outFile flushed) even if parsing fails.
blast_parser = NCBIStandalone.BlastParser()
with open('temp.blo') as blo:
    with open(outFile, 'w') as outf:
        # TODO: SearchIO.parse(blo, 'blast-txt') is the way to go if this
        # is ever switched to SearchIO (not working yet).
        blast_iterator = NCBIStandalone.Iterator(blo, blast_parser)
        for hit in blast_iterator:
            for alignment in hit.alignments:
                for hsp in alignment.hsps:
                    outf.write("%s_%s-%s\n%s\n\n" %
                               (alignment.title, hsp.sbjct_start,
                                hsp.sbjct_end, hsp.sbjct))
# biopython
from Bio.Blast import NCBIStandalone

# All paths are resolved relative to the current working directory.
my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta')
my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast',
                             'sorghum_est-test.fasta')
my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall')

# Only HSPs with an expect value below this threshold are printed.
E_VALUE_THRESH = 0.04

print('Running blastall...')
# error_info (the second returned handle) is not read by this script.
blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn',
                                                my_blast_db, my_blast_file)

b_parser = NCBIStandalone.BlastParser()
b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)

# Iterating the Iterator directly replaces the manual
# ``while 1`` / ``.next()`` / ``is None`` loop.
for b_record in b_iterator:
    for alignment in b_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print('****Alignment****')
                print('sequence: %s' % (alignment.title,))
                print('length: %s' % (alignment.length,))
                print('e value: %s' % (hsp.expect,))
def __init__(self, handle):
    """Remember ``handle`` and set up iteration over its BLAST records."""
    self.handle = handle
    record_parser = NCBIStandalone.BlastParser()
    record_stream = NCBIStandalone.Iterator(handle, record_parser)
    self.blast_iter = record_stream
import string
from Bio.Seq import Seq
from Bio.Blast import NCBIStandalone
import fileinput
import glob

# Plain-text BLAST report to read (Windows-style relative path).
OutFile = r'Blast\out\02.blast_result_total.txt'

# Writes one tab-separated row (query, subject title, mismatches, gaps) for
# every HSP that has at least one mismatch. Both files are closed even if
# parsing fails part-way through.
with open(OutFile, 'r') as f:
    with open('Mismatch_total_LOD v3_171103.txt', 'w') as Mismatch_total_file:
        blast_parser = NCBIStandalone.BlastParser()
        print(blast_parser)
        iterator = NCBIStandalone.Iterator(f, blast_parser)

        for record in iterator:
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    # NOTE(review): Biopython's HSP record appears to leave
                    # ``gaps`` as (None, None) when the report has no
                    # "Gaps =" field; treat that as zero gaps instead of
                    # failing on ``int + None``.
                    Gaps_Number = hsp.gaps[0] or 0
                    # mismatches = aligned length - (identities + gaps)
                    mismatch_Number = (hsp.identities[1] -
                                       (hsp.identities[0] + Gaps_Number))
                    if mismatch_Number != 0:
                        Mismatch_total_file.write(
                            '%s\t%s\t%s\t%s\n' %
                            (record.query, alignment.title, mismatch_Number,
                             Gaps_Number))
def blast(blastRootDirectory):
    """Run blastall (blastp) on 'filetoblast.txt' and collect the matches.

    Locations used:
      * win32: database, query file and 'Blastall.exe' are all taken from
        ``blastRootDirectory`` and converted to short path names.
      * otherwise: the database is '/tmp/BLAST/blastDB.fasta' (the directory
        is created and 'formatdb' copied into it when missing), the query
        file comes from ``blastRootDirectory`` and the executable from
        '../../BLAST/bin/blastall' relative to the current directory.

    Returns:
        (nonmatchingQueries, results) where ``results`` is a list of
        ``(query, alignment title without '>', expect)`` tuples for HSPs
        with expect < 0.001 whose title differs from the query, and
        ``nonmatchingQueries`` lists the queries with no such HSP.
    """
    # --- locate the BLAST database ---
    if sys.platform == 'win32':
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        if not os.path.isdir('/tmp/BLAST'):
            print "making directory '/tmp/BLAST'"
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'), '/tmp/BLAST')
            print "copying 'formatdb' to '/tmp/BLAST/'"
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
    #print 'path to blastDB.fasta:', blast_db

    # --- locate the query file ---
    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
    #print 'path to filetoblast.txt:', blast_file

    # --- locate the blastall executable ---
    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
        blast_exe = os.path.join(blastRootDirectory, blastall_name)
    else:
        blastall_name = 'blastall'
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', blastall_name)
    #print 'path to blastall:', blast_exe

    if sys.platform == 'win32':
        # Imported lazily: only needed (and presumably only available) here.
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)
    #cont = raw_input('blah')
    #try:
    # align_view=7 presumably selects XML output, given the NCBIXML parser
    # used below -- TODO confirm against the blastall options.
    blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blast_db, blast_file, align_view=7)
    #except:
    #    f = open(blast_file, 'r')
    #    s = file.read()
    #    print s
    #print 'done BLASTing'

    # NOTE(review): blast_out.read() is called here, BEFORE blast_out is
    # handed to the parser and iterator below. Confirm the handle still
    # yields data afterwards; these look like leftover debug prints.
    print 'errors:', error_info.read()
    print 'blast output:', blast_out.read()
    b_parser = NCBIXML.BlastParser()
    #print 'got parser'
    # NOTE(review): this first parse result is overwritten inside the loop
    # before it is ever used.
    b_record = b_parser.parse(blast_out)
    b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
    #print 'got iterator'
    results = []
    recordnumber = 0  # incremented per loop pass; not read anywhere else here
    nonmatchingQueries = []
    while 1:
        recordnumber += 1
        b_record = b_iterator.next()
        # A falsy record ends the loop.
        if not b_record:
            break
        print 'query:', b_record.query
        # NOTE(review): unreachable -- a None record already broke out above.
        if b_record is None:
            break
        e_value_thresh = 0.001
        print 'number of alignments:', len(b_record.alignments)
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    # Strip '>' so the title can be compared with the query.
                    alignment.title = alignment.title.replace(">", "")
                    # A hit whose title equals the query is not counted.
                    if b_record.query != alignment.title:
                        significant = True
                        print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
                        results.append(
                            (b_record.query, alignment.title, hsp.expect))
        print b_record.query, significant
        if not significant:
            print 'adding', b_record.query, 'to the list of queries without matches'
            nonmatchingQueries.append(b_record.query)
    return nonmatchingQueries, results
argparser.add_argument('-d', '--dump', type=argparse.FileType('w'),
                       dest='dump_file',
                       help='pickle intermediate results in tempfile')
argparser.add_argument('-l', '--load', type=argparse.FileType('r'),
                       dest='load_file',
                       help='depickle intermediate results from tempfile')
argparser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                       default=sys.stdout)
args = argparser.parse_args()

# Loading and dumping in the same run makes no sense. Report it as a usage
# error rather than an assert, which disappears under ``python -O``.
if args.load_file and args.dump_file:
    argparser.error('--load and --dump cannot be used together')

parser1 = NCBIStandalone.BlastParser()
parser2 = NCBIStandalone.BlastParser()

# PXL: PMZ(Q) x Lamp3(S), LXP: Lamp3(Q) x PMZ(S)
pxl_records = NCBIStandalone.Iterator(args.blast1, parser1)
lxp_records = NCBIStandalone.Iterator(args.blast2, parser2)

pxl_re = re.compile(r'(PMZ_[^\s]+)')
# matching 'not whitespace' is faster and more robust
lxp_re = re.compile(r'(lamp3[^\s]+ [^\s]+ len\d+)')


def pxl_key_fn(k):
    """Return the first PMZ_* identifier found in ``k``."""
    return pxl_re.findall(k)[0]


def lxp_key_fn(k):
    """Return the first 'lamp3... ... lenN' identifier found in ``k``."""
    return lxp_re.findall(k)[0]


if args.load_file:
    # NOTE: unpickling can execute arbitrary code; only load dump files
    # that were produced by this script.
    # The two lookups are stored back to back: PXL first, then LXP.
    pxl_lookup = cPickle.load(args.load_file)
    lxp_lookup = cPickle.load(args.load_file)
else:
    pxl_lookup = make_lookup(pxl_records, pxl_key_fn)
    lxp_lookup = make_lookup(lxp_records, lxp_key_fn)
def ReadBlast(self, file, OUT, iszipped=0, is_psiblast=None):
    """Parse a BLAST / PSI-BLAST text report and write accepted hits to OUT.

    Args:
        file: path of the report; opened with gzip when it ends in '.gz'
            or ``iszipped`` is true.
        OUT: path of the tab-separated output file (overwritten).
        iszipped: force gzip decompression regardless of the file name.
        is_psiblast: parse with ``PSIBlastParser`` and use only the last
            round of each record.

    For every alignment only the first HSP is considered. A hit is written
    when its expect value is <= ``self.HSP_max_evalue``, its percentage of
    positives is >= ``self.HSP_minimal_positives`` and the aligned length
    covers >= ``self.HSP_minimal_coverage`` percent of the query. Each row
    is: query, hit id, % positives, % coverage, score, expect.

    When ``self.cB`` is true, ``self.createblastDict`` is also called with
    the query and the hit fields.

    Sets ``self.selfhits``, ``self.parser``, ``self.iter``,
    ``self.blastDict``, ``self.query`` and ``self.length``.

    Returns:
        None on success, or the string 'Error parsing <file>' when
        fetching a record raises.
    """
    self.selfhits = []
    if is_psiblast:
        sys.stderr.write('Parsing PSI-Blast\n')
        self.parser = NCBIStandalone.PSIBlastParser()
    else:
        self.parser = NCBIStandalone.BlastParser()

    output = open(OUT, "w")
    try:
        if file[-3:] == '.gz' or iszipped:
            handle = gzip.open(file)
        else:
            handle = open(file)
        try:
            self.iter = NCBIStandalone.Iterator(handle=handle,
                                                parser=self.parser)
            self.blastDict = {}

            while True:
                try:
                    rec = next(self.iter)
                except Exception:
                    sys.stderr.write(
                        'Can\'t iterate on blast records anymore. Abort.\n')
                    import traceback
                    traceback.print_exc()
                    return 'Error parsing %s' % file
                if not rec:
                    break

                self.query = rec.query.split(" ")[0]
                self.length = rec.query_letters
                if self.length < self.min_size:
                    self.printer("Does not meet the minimum length " +
                                 str(self.min_size))
                    # NOTE(review): this stops parsing altogether, so any
                    # later records are skipped too -- confirm that
                    # ``continue`` was not intended.
                    break

                if is_psiblast:
                    rec = rec.rounds[-1]

                # each alignment is one potential hit
                for alignment in rec.alignments:
                    hsp = alignment.hsps[0]  # no multiple hsps

                    m = re.search(r"sp\|(.+?)\|(.+?) (.+)?", alignment.title)
                    if m:  # pyphynr blast result
                        hit_sp_ac = m.group(1)
                        hit_sp_id = m.group(2)
                        hit_sp_note = m.group(3)
                    elif alignment.title[0] == '>':
                        # result from additional blast databases
                        hit_sp_ac = None
                        hit_sp_id = alignment.title[1:].split()[0]
                        hit_sp_note = None
                    else:
                        hit_sp_ac = None
                        hit_sp_id = None
                        hit_sp_note = None
                    self.printer(hit_sp_id)

                    if float(hsp.expect) > float(self.HSP_max_evalue):
                        continue
                    similarity = hsp.positives[0] / float(hsp.positives[1]) * 100
                    if float(similarity) < int(self.HSP_minimal_positives):
                        continue
                    coverage = hsp.positives[1] / float(self.length) * 100
                    if float(coverage) < int(self.HSP_minimal_coverage):
                        continue

                    hitlist = [hit_sp_id, similarity, coverage,
                               hsp.score, hsp.expect]
                    if self.cB:
                        # The query lives on the instance; there is no
                        # local name ``query`` in this method.
                        self.createblastDict(self.query, hitlist)

                    # Every field, including the last, is followed by a tab.
                    row = [self.query] + hitlist
                    output.write(
                        "".join("%s\t" % field for field in row) + "\n")
        finally:
            handle.close()
    finally:
        output.close()
    return None
aster = [] for t, pair in enumerate(zip(query, sbjct), 1): q, s = pair if q == 'N': aster.append([t, q, s]) return aster if __name__ == "__main__": blast_parser = NCBIStandalone.BlastParser() GLB = glob.glob('*.nofmt') for glb in GLB: handle = open(glb, 'r') blast_iterator = NCBIStandalone.Iterator(handle, blast_parser) total = 0 E_VALUE_THRESH = 1 with open(glb[:-6] + ".csv", 'w') as outcsv: for blast_record in blast_iterator: print("query:", blast_record.query) for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect < E_VALUE_THRESH: if hsp.align_length < 100: continue if len(str(hsp.sbjct)) == 0: continue print(hsp.query) print(hsp.match) print(hsp.sbjct)