def getCoordinatesFromBlo(bloFname, padding):
    """Collect subject coordinates per hit title from a BLAST text report.

    Every HSP of every alignment is visited.  The alignment title (with all
    '>' characters removed) is used as the key and the value is the list
    ``[sbjct_start, sbjct_end]`` of an HSP.

    An HSP that lies completely inside the interval already stored for its
    title is ignored; any other HSP *replaces* the stored interval (the
    intervals are not merged).

    Args:
        bloFname: path of the BLAST plain-text output file to read.
        padding: accepted for interface compatibility; not used here.

    Returns:
        dict mapping title -> [sbjct_start, sbjct_end].
    """
    coord = {}
    blast_parser = NCBIStandalone.BlastParser()
    # TODO: switching to SearchIO.parse(handle, 'blast-txt') was attempted
    # by the original author but was not working yet.
    #
    # The iterator reads lazily, so the handle must stay open for the whole
    # loop; the ``with`` block guarantees it is closed afterwards.
    with open(bloFname) as blo_handle:
        for hit in NCBIStandalone.Iterator(blo_handle, blast_parser):
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    known = coord.get(fullName)
                    if (known is not None
                            and hsp.sbjct_start >= known[0]
                            and hsp.sbjct_end <= known[1]):
                        # Fully contained in what we already have.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]
    return coord
def blast2data(filehandle):
    """Convert plain-text BLAST output into a dict keyed by reference id.

    For every HSP whose e-value does not exceed the module-level
    ``EVALUE_CUTOFF`` an entry ``[start, identity, end, readname]`` is
    appended to ``data[refgi]`` where

    * ``refgi`` is the second '|'-separated field of the alignment title
      when the title contains '|', otherwise the first space-separated
      token (the leading character of the title is dropped first);
    * ``start``/``end`` are the subject coordinates ordered so that
      ``start <= end``;
    * ``identity`` is the percentage of identical positions, rounded to one
      decimal;
    * ``readname`` is the first whitespace-separated token of the query.

    Side effects: a reference seen for the first time is appended to the
    module-level ``references`` list and passed to ``refgenome2json``; its
    length is cached in the module-level ``refLengths`` via ``gi2length``.

    Args:
        filehandle: open handle on a BLAST text ("blast-txt") report.

    Returns:
        dict mapping reference id -> list of [start, identity, end, readname].
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)
    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]
        for alignment in blast_record.alignments:
            title = alignment.title[1:].strip()
            if '|' in alignment.title:
                refgi = title.split('|')[1]
            else:
                refgi = title.split(' ')[0]
            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue
                identity = round(
                    float(hsp.identities[0]) * 100 / hsp.identities[1], 1)
                # Minus-strand hits report start > end; normalise the order.
                start = min(hsp.sbjct_start, hsp.sbjct_end)
                end = max(hsp.sbjct_start, hsp.sbjct_end)
                if refgi not in data:
                    data[refgi] = []
                if refgi not in references:
                    references.append(refgi)
                    refgenome2json(refgi)
                if refgi not in refLengths:
                    refLengths[refgi] = gi2length(refgi)
                data[refgi].append([start, identity, end, readname])
    return data
def getHits(gene):
    """Write the best BLAST hits of every requested query to ``out``.

    The BLAST report named by the module-level option ``o.blast`` is parsed
    record by record.  For each record whose query id (first space-separated
    token) is a key of *gene*:

    * a header line with ``gene[query]`` is written;
    * if the record has no alignments one ``NA`` row is written;
    * otherwise up to three HSP rows are written, labelled by e-value
      (``High`` < 0.0001, ``Low`` < 1.0, ``Scare`` < 5.0); if the first HSP
      examined has an e-value of 5.0 or more a single ``NA`` row is written
      instead;
    * the query is removed from *gene*.

    All rows go to the module-level handle ``out``; progress (when
    ``o.verbose`` is set) and the list of genes never seen in the report go
    to stderr.

    Args:
        gene: dict mapping query id -> label.  NOTE: it is mutated; entries
            found in the report are deleted, so what remains on return are
            the genes that were not found.
    """
    lg = len(gene)
    if o.verbose:
        sys.stderr.write("\nGetting hits...\n")

    inf = open(o.blast, 'rU')
    try:
        # NOTE(review): BlastErrorParser's argument looks like it is meant
        # to be a handle for dumping unparsable reports, not the input
        # handle -- kept as in the original, confirm against Biopython docs.
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)

        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue
            label = gene[query]
            out.write("%s\n" % label)

            if not record.alignments:
                # Query present in the report but without any hit.
                out.write("%s\tNA\tNA\tNA\n" % label)
            else:
                flag = 0  # number of rows already written for this query
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        expect = float(hsp.expect)
                        if flag < 3 and expect < 5.0:
                            if expect < 0.0001:
                                quality = "High"
                            elif expect < 1.0:
                                quality = "Low"
                            else:
                                quality = "Scare"
                            out.write("%s\t%s\t%s\t%s\n" % (
                                label, alignment.title.split(">")[1],
                                expect, quality))
                            flag += 1
                        elif expect > 1.0 and flag < 1:
                            out.write("%s\tNA\tNA\tNA\n" % label)
                            flag += 1

            del gene[query]
            if o.verbose:
                sys.stderr.write('\r')
                sys.stderr.write(
                    str((lg - len(gene)) * 100 // lg) + '%')
                # Progress is written to stderr, so that is what we flush.
                sys.stderr.flush()
    finally:
        inf.close()

    # Whatever is left in ``gene`` never showed up in the report.
    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene.keys()))
def __init__(self, dbname=None, blastexe=None, mode=None, parser=None):
    """Store the BLAST settings, filling in module defaults for ``None``.

    Args:
        dbname: database name; ``DEFAULT_BLAST_DB`` when None.
        blastexe: BLAST executable; ``DEFAULT_BLAST_EXE`` when None.
        mode: BLAST mode; ``DEFAULT_BLAST_MODE`` when None.
        parser: report parser; a fresh ``NCBIStandalone.BlastParser()``
            when None.
    """
    # Resolve every fallback first (same order as the arguments), then
    # publish the values on the instance.
    resolved_db = DEFAULT_BLAST_DB if dbname is None else dbname
    resolved_exe = DEFAULT_BLAST_EXE if blastexe is None else blastexe
    resolved_mode = DEFAULT_BLAST_MODE if mode is None else mode
    resolved_parser = (NCBIStandalone.BlastParser()
                       if parser is None else parser)

    self.dbname = resolved_db
    self.blastexe = resolved_exe
    self.parser = resolved_parser
    self.mode = resolved_mode
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp",
                    output="ncbiparsed", extra_blastp_params=None):
    """Run ``blastall`` with a single sequence as query against a database.

    The sequence is written to a temporary FASTA file in the current working
    directory, ``blastall`` is started through ``osPopen3`` and the temporary
    file is removed again before returning.

    Args:
        header: FASTA header of the query (also used in the temp file name).
        sequence: query sequence string.
        dbname: BLAST database passed to ``-d``.
        blastprogram: one of 'blastp', 'tblastn', 'blastn', 'blastx'.
        output: "ncbiparsed" to return the record produced by
            ``NCBIStandalone.BlastParser``; anything else returns the raw
            text read from blastall's stdout.
        extra_blastp_params: dict of extra single-letter blastall options;
            defaults to ``{'F': 'F', 'e': '10'}`` when None.

    Returns:
        The parsed record or raw output, or ``False`` when running BLAST or
        parsing its output failed.

    Raises:
        ValueError: *blastprogram* is not supported.
    """
    supported = ('blastp', 'tblastn', 'blastn', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s"
                         % (blastprogram, ", ".join(supported)))
    if extra_blastp_params is None:
        extra_blastp_params = {'F': 'F', 'e': '10'}
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])

    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag, str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    with open(fname, 'w') as fh:
        fh.write(">%s\n%s\n" % (header, sequence))

    # NOTE(review): the command line is assembled by string formatting and
    # the file name embeds ``header``; callers must not pass untrusted text.
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    blastallout = False
    try:
        try:
            ci, co, ce = osPopen3(command)
            try:
                ci.close()
                if output == "ncbiparsed":
                    blastallout = NCBIStandalone.BlastParser().parse(co)
                else:
                    blastallout = co.read()
            finally:
                # Close the pipes whether or not parsing succeeded.
                co.close()
                ce.close()
        except Exception:
            # For some kind of - obvious or freak accident - case, BLAST or
            # the parsing of its output failed.  No debugging here; report
            # the command and signal failure with False.
            print("BLAST CRASHED::")
            print(command)
            blastallout = False
    finally:
        # remove the created query file, even on KeyboardInterrupt
        osRemove(fname)
    return blastallout
def blast_parse(file, e, output):
    """Write a tab-separated summary of all HSPs below an e-value cutoff.

    Output starts with the header line
    ``query title<TAB>description<TAB>length<TAB>e value`` followed by one
    row per qualifying HSP: the first 18 characters of the query plus a
    space, the alignment title, the alignment length and the e-value.

    Every record of the report is processed, including the first one.

    Args:
        file: path of the BLAST plain-text report to read.
        e: e-value cutoff; only HSPs with ``hsp.expect < e`` are written.
        output: path of the summary file to create (overwritten).
    """
    blast_parser = NCBIStandalone.BlastParser()
    with open(file) as result_handle:
        with open(output, 'w') as report:
            report.write('query title\tdescription\tlength\te value' + '\n')
            blast_iterator = NCBIStandalone.Iterator(result_handle,
                                                     blast_parser)
            # Iterate straight away: pulling a record off the iterator
            # before the loop would silently drop the first query.
            for blast_record in blast_iterator:
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            report.write("%s \t%s\t%s\t%s\n" % (
                                blast_record.query[:18], alignment.title,
                                alignment.length, hsp.expect))
def blastall_file2db(fname, dbname="", blastprogram="blastp",
                     output="ncbiparsed", extra_blastp_params=None):
    """Run ``blastall`` with an existing FASTA file as query.

    Unlike creating a query from a sequence, the input file *fname* is left
    in place.

    Args:
        fname: path of the query FASTA file (passed to ``-i``).
        dbname: BLAST database passed to ``-d``.
        blastprogram: one of 'blastp', 'tblastn', 'blastn', 'tblastx'.
        output: "ncbiparsed" to return the record produced by
            ``NCBIStandalone.BlastParser``; anything else returns the raw
            text read from blastall's stdout.
        extra_blastp_params: dict of extra single-letter blastall options;
            defaults to ``{'F': 'F', 'e': '10'}`` when None.

    Returns:
        The parsed BLAST record or the raw output string.

    Raises:
        ValueError: *blastprogram* is not supported.
        RuntimeError: reading or parsing the BLAST output failed; the
            command and blastall's stderr are printed first.
    """
    supported = ('blastp', 'tblastn', 'blastn', 'tblastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s"
                         % (blastprogram, ", ".join(supported)))
    if extra_blastp_params is None:
        extra_blastp_params = {'F': 'F', 'e': '10'}
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)

    # Start the process outside the try block: if this fails there are no
    # pipes to inspect or close.
    ci, co, ce = osPopen3(command)
    try:
        ci.close()
        if output == "ncbiparsed":
            blastallout = NCBIStandalone.BlastParser().parse(co)
        else:
            blastallout = co.read()
    except Exception:
        # Close stdout before draining stderr, as the original did.
        co.close()
        error = ce.read().strip()
        print(command)
        print("ERROR: '%s'" % error)
        raise RuntimeError("BLAST CRASHED....")
    finally:
        co.close()
        ce.close()
    # do NOT remove the input fname
    return blastallout
# Feed every sample report through the low-level scanner.  The plain
# AbstractConsumer is used for all reports, the TaggingConsumer for the
# reports listed in detailed_tests.
scanner = NCBIStandalone._Scanner()
for test in all_tests:
    print("%s TESTING %s" % ("*" * 50, test))
    datafile = os.path.join("Blast", test)
    with open(datafile) as handle:
        scanner.feed(handle, ParserSupport.AbstractConsumer())

for test in detailed_tests:
    print("%s TESTING %s" % ("*" * 50, test))
    datafile = os.path.join("Blast", test)
    with open(datafile) as handle:
        scanner.feed(handle, ParserSupport.TaggingConsumer())

### BlastParser

print("Running tests on BlastParser")
parser = NCBIStandalone.BlastParser()
pb_parser = NCBIStandalone.PSIBlastParser()
for test in all_tests:
    print("%s TESTING %s" % ("*" * 50, test))
    datafile = os.path.join("Blast", test)
    try:
        # First, try parsing it with the normal parser.
        with open(datafile) as handle:
            rec = parser.parse(handle)
    except ValueError as x:
        # If it complains that the input is psiblast data, then
        # parse it with the psiblast parser.  Any other ValueError is a
        # genuine failure and is re-raised.
        if 'PSI-BLAST data' in str(x):
            with open(datafile) as handle:
                rec = pb_parser.parse(handle)
        else:
            raise
def __init__(self, handle):
    """Keep *handle* and build a BLAST record iterator over it.

    ``self.handle`` stores the handle as given; ``self.blast_iter`` is an
    ``NCBIStandalone.Iterator`` reading from it with a fresh
    ``NCBIStandalone.BlastParser``.
    """
    self.handle = handle
    self.blast_iter = NCBIStandalone.Iterator(
        handle, NCBIStandalone.BlastParser())
def main():
    """Chain sequences by repeated best-hit BLAST searches.

    Command line: ``-i`` input FASTA, ``-g`` germline FASTA, ``-o`` report
    file.  The input is copied to ``workable.fasta`` and the germline to
    ``germ.fasta``.  Then, while sequences remain in ``workable.fasta``:

    1. a BLAST database is built from ``workable.fasta``;
    2. ``germ.fasta`` is searched against it with blastp, writing
       ``blastout``;
    3. the first HSP of the first alignment is appended to the report
       (query, match line with score, subject);
    4. that subject becomes the new ``germ.fasta`` and is removed from
       ``workable.fasta``.

    The loop also stops, with a message on stderr, when BLAST reports no
    hit for the current query.
    """
    parser = OptionParser()
    parser.add_option("-i", "--input", action="store", dest="input",
                      help="input file to make phylotree")
    parser.add_option("-g", "--germline", action="store", dest="germline",
                      help="germline fasta")
    parser.add_option("-o", "--output", action="store", dest="output",
                      help="the file where you want all your data")
    (options, args) = parser.parse_args()
    if len(sys.argv) < 2:
        dowhat()
        parser.print_help()
        sys.exit()

    with open(options.output, 'w') as report:
        report.write("Your Sequence Results:\n\n")
    copy(options.input, "workable.fasta")
    copy(options.germline, "germ.fasta")

    list_of_database_files = SeqIO.to_dict(
        SeqIO.parse("workable.fasta", "fasta"))
    while list_of_database_files:
        # Re-read the working set from disk, as the original did.
        list_of_database_files = SeqIO.to_dict(
            SeqIO.parse("workable.fasta", "fasta"))
        populate_database("workable.fasta")
        print("***DatabasePopulated***")
        cline = NcbiblastpCommandline(matrix="PAM30", evalue="20",
                                      word_size="2", query="germ.fasta",
                                      cmd='blastp', db="temporary_database",
                                      out="blastout")
        print("****Cline = *** ---> %s" % (cline,))
        call_blast(cline)
        print("***Call_blast_successful***")

        with open('blastout') as result_handle:
            print("***result handle successful***")
            blast_parser = NCBIStandalone.BlastParser()
            print("***blast_parser****")
            blast_record = blast_parser.parse(result_handle)
        print("***blast_record***")

        if not blast_record.alignments or not blast_record.alignments[0].hsps:
            # Nothing left that resembles the current query; indexing [0]
            # below would raise IndexError.
            sys.stderr.write("No BLAST hit for the current query; stopping "
                             "with %d sequence(s) left.\n"
                             % len(list_of_database_files))
            break

        best_alignment = blast_record.alignments[0]
        best_hsp = best_alignment.hsps[0]
        current_object = best_alignment.title[2:]

        # The best subject becomes the query of the next round.  Closing
        # the file here guarantees BLAST sees the complete sequence.
        with open("germ.fasta", 'w') as newsequence_search:
            newsequence_search.write(
                ">" + str(current_object) + "\n" + str(best_hsp.sbjct))
        print(current_object)

        with open(options.output, 'a') as newfile:
            newfile.write(str(best_hsp.query[:]) + "----> Query\n")
            newfile.write(str(best_hsp.match[:]) + "----> Score of: "
                          + str(best_hsp.score) + "\n")
            newfile.write(str(best_hsp.sbjct[:]) + "----> Template\n\n")

        list_of_database_files.pop(current_object)
        SeqIO.write(list_of_database_files.values(), "workable.fasta",
                    "fasta")
def __init__(self, handle):
    """Remember *handle* and prepare an iterator over its BLAST records."""
    self.handle = handle
    # One parser instance is created per iterator.
    record_parser = NCBIStandalone.BlastParser()
    self.blast_iter = NCBIStandalone.Iterator(handle, record_parser)
def blastall_seq2seq(fastadata=(), filenames=(), output="ncbiparsed",
                     blastprogram="blastp", remove_files=True,
                     extra_blastp_params=None):
    """BLAST one sequence against another via ``formatdb`` + ``blastall``.

    Exactly one kind of input must be given:

    * ``fastadata``: ``((headerQUERY, seqQUERY), (headerSBJCT, seqSBJCT))``
      -- both sequences are written to FASTA files in the current
      directory first;
    * ``filenames``: ``(filenameQUERY, filenameSBJCT)``.

    Args:
        fastadata: 2-tuple of (header, sequence) pairs, see above.
        filenames: 2-tuple of FASTA file names, see above.
        output: "ncbiparsed" to return the record produced by
            ``NCBIStandalone.BlastParser``; anything else returns the raw
            text read from blastall's stdout.
        blastprogram: one of 'blastp', 'tblastn', 'tblastx', 'blastx'.
        remove_files: when true, the formatdb index files (``<subject>.*``)
            and BOTH sequence files are deleted afterwards.  NOTE: this
            also applies to files passed in through ``filenames``.
        extra_blastp_params: dict of extra single-letter blastall options;
            defaults to ``{'F': 'F', 'e': '10'}`` when None.

    Returns:
        The parsed BLAST record or the raw output string.

    Raises:
        ValueError: unsupported *blastprogram*, no input, or improper
            input (both kinds given, or not a 2-tuple).
    """
    supported = ('blastp', 'tblastn', 'tblastx', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s"
                         % (blastprogram, ", ".join(supported)))
    # formatdb -p flag: the subject is nucleotide ("F") for the translated
    # database searches, protein ("T") otherwise.
    dna_or_prot = "F" if blastprogram in ('tblastn', 'tblastx') else "T"

    if extra_blastp_params is None:
        extra_blastp_params = {'F': 'F', 'e': '10'}

    if not filenames and not fastadata:
        raise ValueError("no input!")
    if (fastadata and isinstance(fastadata, tuple)
            and len(fastadata) == 2 and not filenames):
        # input is fasta headers and sequences: write them to files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        with open(fname_q, 'w') as fh:
            fh.write(">%s\n%s" % (fastadata[0][0], fastadata[0][1]))
        with open(fname_s, 'w') as fh:
            fh.write(">%s\n%s" % (fastadata[1][0], fastadata[1][1]))
    elif (filenames and isinstance(filenames, tuple)
            and len(filenames) == 2 and not fastadata):
        # input is (supposed to be) filenames
        fname_q, fname_s = filenames
    else:
        raise ValueError("improper input!")

    # NOTE(review): the shell commands below are built by string
    # formatting from file names / headers; do not pass untrusted text.
    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))

    # and blastall!
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    ci, co, ce = osPopen3(
        "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                      extra_params, fname_q, fname_s))
    try:
        ci.close()
        if output == "ncbiparsed":
            blastallout = NCBIStandalone.BlastParser().parse(co)
        else:
            blastallout = co.read()
    finally:
        # Close the pipes even when parsing raises.
        co.close()
        ce.close()

    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)
    return blastallout
# Command-line interface: two BLAST text reports (A-vs-B and B-vs-A) plus
# optional files for dumping / loading pickled intermediate results.
argparser = argparse.ArgumentParser(
    description='Find reciprocal best hits in two BLAST outputs')
argparser.add_argument('blast1', type=argparse.FileType('r'),
                       help='BLAST results')
argparser.add_argument('blast2', type=argparse.FileType('r'),
                       help='inverted BLAST results')
argparser.add_argument('-d', '--dump', type=argparse.FileType('w'),
                       dest='dump_file',
                       help='pickle intermediate results in tempfile')
argparser.add_argument('-l', '--load', type=argparse.FileType('r'),
                       dest='load_file',
                       help='depickle intermediate results from tempfile')
argparser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                       default=sys.stdout)
args = argparser.parse_args()

# Loading and dumping in the same run makes no sense.  Report it as a usage
# error: an ``assert`` would be stripped when Python runs with -O.
if args.load_file and args.dump_file:
    argparser.error('--load and --dump cannot be used together')

parser1 = NCBIStandalone.BlastParser()
parser2 = NCBIStandalone.BlastParser()

# PXL: PMZ(Q) x Lamp3(S), LXP: Lamp3(Q) x PMZ(S)
pxl_records = NCBIStandalone.Iterator(args.blast1, parser1)
lxp_records = NCBIStandalone.Iterator(args.blast2, parser2)

pxl_re = re.compile(r'(PMZ_[^\s]+)')
# matching 'not whitespace' is faster and more robust
lxp_re = re.compile(r'(lamp3[^\s]+ [^\s]+ len\d+)')


def pxl_key_fn(k):
    """Return the first ``PMZ_...`` token found in *k*.

    Raises IndexError when *k* contains no such token.
    """
    return pxl_re.findall(k)[0]


def lxp_key_fn(k):
    """Return the first ``lamp3... ... len<N>`` key found in *k*.

    Raises IndexError when *k* contains no such key.
    """
    return lxp_re.findall(k)[0]


pxl_lookup, lxp_lookup = None, None
if args.load_file:
    # SECURITY: unpickling executes arbitrary code from the file; only load
    # dumps that this script produced itself.
    pxl_lookup = cPickle.load(args.load_file)
def ReadBlast(self, file, OUT, iszipped=0, is_psiblast=None):
    """Filter the hits of a BLAST / PSI-BLAST report into a tab-separated file.

    For every record the first HSP of each alignment is examined.  A hit is
    written to *OUT* when

    * its e-value is at most ``self.HSP_max_evalue``,
    * its percentage of positives is at least
      ``self.HSP_minimal_positives``, and
    * its coverage (aligned positions / query length, in percent) is at
      least ``self.HSP_minimal_coverage``.

    Each output row is: query id, hit id, % positives, % coverage, score,
    e-value -- every field followed by a tab.  When ``self.cB`` is set the
    hit is also passed to ``self.createblastDict``.

    Processing stops at the first record whose query is shorter than
    ``self.min_size``.
    NOTE(review): that ``break`` also skips all later records -- confirm
    this is intended rather than ``continue``.

    Sets ``self.selfhits``, ``self.parser``, ``self.iter``,
    ``self.blastDict``, ``self.query`` and ``self.length`` as it goes.

    Args:
        file: path of the report; read through gzip when it ends in '.gz'
            or *iszipped* is true.
        OUT: path of the output table (overwritten).
        iszipped: force gzip decoding.
        is_psiblast: parse as PSI-BLAST and use the last round of each
            record.

    Returns:
        None on success, or the string ``'Error parsing <file>'`` when
        fetching the next record raised.
    """
    output = open(OUT, "w")
    try:
        self.selfhits = []
        if is_psiblast:
            sys.stderr.write('Parsing PSI-Blast\n')
            self.parser = NCBIStandalone.PSIBlastParser()
        else:
            self.parser = NCBIStandalone.BlastParser()

        if file[-3:] == '.gz' or iszipped:
            handle = gzip.open(file)
        else:
            handle = open(file)
        try:
            self.iter = NCBIStandalone.Iterator(handle=handle,
                                                parser=self.parser)
            self.blastDict = {}
            while True:
                try:
                    rec = next(self.iter)
                except Exception:
                    sys.stderr.write(
                        'Can\'t iterate on blast records anymore. Abort.\n')
                    import traceback
                    traceback.print_exc()
                    return 'Error parsing %s' % file
                if not rec:
                    # A false record marks the end of the report.
                    break

                self.query = rec.query.split(" ")[0]
                self.length = rec.query_letters
                if self.length < self.min_size:
                    self.printer("Does not meet the minimum length "
                                 + str(self.min_size))
                    break
                if is_psiblast:
                    rec = rec.rounds[-1]

                # each alignment is one potential hit
                for alignment in rec.alignments:
                    hsp = alignment.hsps[0]  # no multiple hsps

                    # "sp|<accession>|<id> <description>" style titles.
                    m = re.search(r"sp\|(.+?)\|(.+?) (.+)?", alignment.title)
                    if m:
                        hit_sp_id = m.group(2)
                    elif alignment.title[0] == '>':
                        # result from additional blast databases
                        hit_sp_id = alignment.title[1:].split()[0]
                    else:
                        hit_sp_id = None
                    self.printer(hit_sp_id)

                    if float(hsp.expect) > float(self.HSP_max_evalue):
                        continue
                    similarity = (hsp.positives[0]
                                  / float(hsp.positives[1]) * 100)
                    if similarity < int(self.HSP_minimal_positives):
                        continue
                    coverage = hsp.positives[1] / float(self.length) * 100
                    if coverage < int(self.HSP_minimal_coverage):
                        continue

                    hitlist = [hit_sp_id, similarity, coverage,
                               hsp.score, hsp.expect]
                    if self.cB:
                        self.createblastDict(self.query, hitlist)
                    row = [self.query] + hitlist
                    output.write(
                        "".join(["%s\t" % field for field in row]) + "\n")
        finally:
            handle.close()
    finally:
        # Runs on the early error return as well, so nothing is leaked.
        output.close()
    return None