def blast_align(fasta,blast_path,miRNA_db,mRNA_db):
    """Run blastn of ``fasta`` against the miRNA and the mRNA database.

    The XML reports are written to ``args.output + "temp_blast_miRNA.xml"``
    and ``args.output + "temp_blast_mRNA.xml"`` (``args`` is a module-level
    object read here, not a parameter), then ``fasta`` is deleted.

    The commands are run from argument lists rather than shell strings, so
    database and query paths containing spaces or shell metacharacters are
    passed through unchanged.

    :param fasta: path of the query FASTA file; removed after both searches.
    :param blast_path: blastn executable, optionally followed by options
        (split with shell-like rules, as the shell did before).
    :param miRNA_db: BLAST database searched with ``-evalue 1e-3``.
    :param mRNA_db: BLAST database searched with ``-evalue 1e-5``.
    :return: ``(miRNA_records, mRNA_records)``, two lazy ``NCBIXML.parse``
        iterators; each keeps its report file open while it is consumed.
    """
    import shlex
    import subprocess

    def run_search(evalue, database, xml_path):
        # One blastn run whose stdout (XML, -outfmt 5) goes to xml_path.
        command = shlex.split(blast_path) + [
            "-task", "blastn",
            "-outfmt", "5",
            "-num_threads", "6",
            "-evalue", evalue,
            "-db", database,
            "-query", fasta,
        ]
        with open(xml_path, "w") as xml_out:
            subprocess.call(command, stdout=xml_out)

    miRNA_xml = args.output + "temp_blast_miRNA.xml"
    mRNA_xml = args.output + "temp_blast_mRNA.xml"
    run_search("1e-3", miRNA_db, miRNA_xml)
    run_search("1e-5", mRNA_db, mRNA_xml)

    try:
        os.remove(fasta)
    except OSError:
        # Same best-effort semantics as the former `rm`: failing to delete
        # the query file must not abort the analysis.
        pass

    miRNA_records = NCBIXML.parse(open(miRNA_xml))
    mRNA_records = NCBIXML.parse(open(mRNA_xml))
    return (miRNA_records, mRNA_records)
def main():
    """Report queries that align to both the miRNA and the mRNA database.

    Reads two BLAST XML reports (``args.mi_xml`` and ``args.m_xml``) record
    by record in lockstep.  For every query it collects the miRNA HSPs whose
    e-value is below the miRNA cutoff, then the mRNA HSPs below the mRNA
    cutoff that start after the earliest miRNA alignment end.  A query is
    written to ``args.output`` only when both kinds of HSP were found.
    Finally prints ``<queries> <with miRNA hit> <with mRNA hit>``.

    Cutoffs default to 1e-5 (miRNA) and 1e-15 (mRNA) when ``args.evalue``
    is 0; otherwise ``args.evalue[0]`` and ``args.evalue[1]`` are used.
    """
    def hsp_row(alignment, hsp):
        # One tab-separated report line describing a single HSP.
        return "\t".join(str(f) for f in [hsp.query_start, hsp.query_end,
                                          alignment.title, hsp.sbjct,
                                          hsp.sbjct_start, hsp.sbjct_end,
                                          hsp.expect, hsp.score])

    n = 0  # total number of query sequences seen
    align_mi = 0  # queries with at least one accepted miRNA HSP
    align_m = 0  # queries with at least one accepted mRNA HSP
    args = ParseArg()
    # itertools.izip on Python 2; the built-in zip is already lazy on Python 3.
    lockstep = getattr(itertools, "izip", zip)

    # The with-block guarantees all three files are closed (and the output
    # flushed) even if parsing fails half-way.
    with open(args.mi_xml) as miRNA_result, \
            open(args.m_xml) as mRNA_result, \
            open(args.output, 'w') as output:
        miRNA_records = NCBIXML.parse(miRNA_result)
        mRNA_records = NCBIXML.parse(mRNA_result)

        # E-values
        if args.evalue == 0:
            evalue_mi = 1e-5
            evalue_m = 1e-15
        else:
            evalue_mi = float(args.evalue[0])
            evalue_m = float(args.evalue[1])

        for mi_record, m_record in lockstep(miRNA_records, mRNA_records):
            n += 1
            if mi_record.query != m_record.query:
                sys.stderr.write("The two query seqs from miRNA and mRNA results are not matched!\n")
                break

            report = [mi_record.query]
            mi_ends = []
            for alignment in mi_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < evalue_mi:
                        report.append(hsp_row(alignment, hsp))
                        mi_ends.append(max(hsp.query_start, hsp.query_end))
            mi_found = bool(mi_ends)
            # Smallest miRNA alignment end on the query, capped at 150
            # (presumably the maximum query length; TODO confirm);
            # 0 when there is no miRNA hit.
            mi_end = min([150] + mi_ends) if mi_found else 0

            m_found = False
            for alignment in m_record.alignments:
                for hsp in alignment.hsps:
                    if (hsp.expect < evalue_m) and (min(hsp.query_start, hsp.query_end) > mi_end):
                        m_found = True
                        report.append(hsp_row(alignment, hsp))

            if mi_found and m_found:
                output.write("\n".join(report) + "\n")
            if mi_found:
                align_mi += 1
            if m_found:
                align_m += 1

    print("%s %s %s" % (n, align_mi, align_m))
def bestrecipblast(org, seed, thresh=5, queue=None):
    '''Search for a reciprocal BLAST hit of accession ``seed`` in organism ``org``.

    Steps performed by the code below:

    1. Fetch the seed FASTA and run a remote ``blastp`` against ``nr``
       restricted to ``org``; keep the accession of every HSP with
       positives/alignment length >= 0.4 and alignment length/query
       length >= 0.25.
    2. For each kept accession, run the reverse search restricted to the
       seed's organism and collect accessions with the same filter.
    3. On the first candidate whose reverse search contains ``seed``, record
       ``{organism: [accession, "rank/total" forward, "rank/total" reverse]}``,
       append it to ``dicts/<seed>`` and stop.

    :param org: organism name used in the Entrez ``[ORGN]`` restriction.
    :param seed: accession whose digits before the first '.' end the name.
    :param thresh: e-value passed to ``blastp -evalue``.
    :param queue: optional object with ``put``; when given the result is
        put on it and the function returns None.
    :return: the result dict when ``queue`` is None.
    '''
    seedorg=FetchUtil.fetch_organism(seed)[0]
    acclist={}
    ac=[]
    FetchUtil.fetch_fasta(seed)
    # Pseudo-random stem for the XML file: the last five characters of the
    # accession (before the first '.') as an int, scaled by random.random().
    dum=str(int(int(seed.split('.')[0][-5:])*random.random()))
    # NOTE(review): command is built by string concatenation and run through
    # the shell; `org` and `seed` are not escaped.
    os.system('blastp -db nr -query Orthos/'+seed+'.fasta -evalue '+str(thresh)+
              ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+org+'[ORGN]\" -use_sw_tback'+
              ' -remote')
    qoutput=open('XML/'+dum+'.xml')
    parser=NCBIXML.parse(qoutput)
    for lin in parser:
        for align in lin.alignments:
            for hsp in align.hsps:
                if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>=.25:
                    # second '|'-separated field of the hit title
                    ac.append(align.title.split('|')[1])
    print("Done. Number of sequences found: "+repr(len(ac)))
    for o in ac:
        print o
        FetchUtil.fetch_fasta(o)
        # NOTE(review): seedorg is already element [0] of fetch_organism(seed);
        # seedorg[0] indexes it again - confirm this is intended.
        # The reverse search overwrites the same XML/<dum>.xml file.
        os.system('blastp -db nr -query Orthos/'+o+'.fasta -evalue '+str(thresh)+
                  ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+seedorg[0]+'[ORGN]\" -use_sw_tback'+
                  ' -remote')
        q1output=open('XML/'+dum+'.xml')
        parse=NCBIXML.parse(q1output)
        acc=[]
        print 'blasted'
        for lin in parse:
            for align in lin.alignments:
                for hsp in align.hsps:
                    # NOTE(review): coverage test is strict (> .25) here but
                    # inclusive (>= .25) in the forward search.
                    if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>.25:
                        acc.append(align.title.split('|')[1])
                    else:
                        continue
        print "Done. Number of sequences found: "+repr(len(acc))
        if seed in acc:
            print 'it\'s twue!'
            name=FetchUtil.fetch_organism(o)[0]
            try:
                acclist[name]=[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]
            except KeyError:
                # NOTE(review): item assignment on a dict does not raise
                # KeyError, so this branch looks unreachable.
                acclist.update({name:[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]})
            open('dicts/'+seed,'a').write(str(acclist)+'\n')
            # NOTE(review): indentation was lost in this file; the break is
            # assumed to belong to the reciprocal-hit branch (stop at the
            # first reciprocal hit) - confirm.
            break
    # Timing code left disabled by the original author:
    #elapsed=time.time()-start
    #print "Time elapsed: "+time.strftime('%M:%S',[elapsed])
    if queue is not None:
        queue.put(acclist)
    else:
        return acclist
def __init__(self, fhand, subj_def_as_accesion=None):
    """Prepare a parser for the BLAST XML report in ``fhand``.

    :param fhand: seekable handle; its first 10 characters must contain
        ``xml`` unless the file is empty.
    :param subj_def_as_accesion: when not None, overrides the automatically
        chosen ``use_subject_def_as_accession`` flag.
    :raises ValueError: if the file is not empty and does not look like XML.

    ``use_subject_def_as_accession`` defaults to True for BLAST+ reports or
    BLAST versions newer than 2.2.21.  The version is compared numerically
    component by component; a plain string comparison ranks ``'2.10.0'``
    below ``'2.2.21'``.

    ``self._blast_parse`` is the Biopython record iterator, or None when the
    file does not start with ``<`` (no results).
    """
    import re

    fhand.seek(0, 0)
    sample = fhand.read(10)
    if sample and 'xml' not in sample:
        raise ValueError('Not a xml file')
    fhand.seek(0, 0)
    self._blast_file = fhand

    metadata = self._get_blast_metadata()
    blast_version = metadata['version']
    plus = metadata['plus']
    self.db_name = metadata['db_name']
    self._blast_file.seek(0, 0)

    newer_than_2_2_21 = False
    if blast_version:
        # '2.2.28+' -> (2, 2, 28); tuples compare component-wise as numbers.
        version_numbers = tuple(
            int(part) for part in re.findall(r'\d+', str(blast_version)))
        newer_than_2_2_21 = version_numbers > (2, 2, 21)

    self.use_query_def_as_accession = True
    self.use_subject_def_as_accession = bool(
        blast_version and (plus or newer_than_2_2_21))

    if subj_def_as_accesion is not None:
        self.use_subject_def_as_accession = subj_def_as_accesion

    # We use the Biopython parser; if there are no results the parser
    # attribute stays None.
    self._blast_parse = None
    if fhand.read(1) == '<':
        fhand.seek(0)
        self._blast_parse = NCBIXML.parse(fhand)
def detection_no_hits_found(files, out_dir="/Volumes/BioSan/Users/dpflieger/GB-3G/Blast_scaffold/"):
    """Print and save the query sequences for which BLAST found no hit.

    The strain name is the name of the directory containing each XML file.
    For every strain with at least one hit-less query a file
    ``<out_dir>/<strain>.no_hit_found.txt`` is written with one query name
    per line.

    :param files: iterable of BLAST XML report paths.
    :param out_dir: directory receiving the per-strain text files (defaults
        to the location that used to be hard-coded).
    :return: None
    """
    # {strain0: [scaffold_0, scaffold_1, ...], strain1: [...], ...}
    dict_no_hit = defaultdict(list)

    for xml_path in files:
        strain = os.path.basename(os.path.dirname(xml_path))
        with open(xml_path) as handle:
            for blast_record in NCBIXML.parse(handle):
                if blast_record.alignments:
                    continue
                # first whitespace-separated word of the query = contig name
                query = blast_record.query.split()[0]
                print(query)
                dict_no_hit[strain].append(query)

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for strain, scaffolds in dict_no_hit.items():
        out_path = os.path.join(out_dir, strain + ".no_hit_found.txt")
        with open(out_path, "w") as outfile:
            outfile.write("\n".join(scaffolds) + "\n")
def blastparse(blast_handle, genome, gene):
    """Flag ``gene`` as present in ``genome`` when a good HSP is found.

    Every HSP of the BLAST XML in ``blast_handle`` is inspected; when
    ``plusdict[genome][gene]`` is still an empty list and
    identities / alignment length >= 0.7, ``"+"`` is appended to it.

    The shared ``plusdict`` is updated under ``threadlock`` using a ``with``
    block, so the lock is released even if the update raises (a manual
    acquire/release pair would leave it held and block the other threads).
    ``threadlock`` is assumed to support the context-manager protocol, as
    ``threading.Lock`` does.

    :param blast_handle: handle on a BLAST XML report.
    :param genome: first-level key of the module-level ``plusdict``.
    :param gene: second-level key of ``plusdict``.
    """
    global plusdict
    records = NCBIXML.parse(blast_handle)
    dotter()  # defined elsewhere; presumably a progress indicator
    for record in records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                with threadlock:
                    # Only the first sufficiently identical match is recorded.
                    # Earlier revisions also stored "-" / allele numbers; that
                    # logic was already disabled and is not reproduced here.
                    if plusdict[genome][gene] == [] and abs(float(hsp.identities) / alignment.length) >= 0.7:
                        plusdict[genome][gene].append("+")
def get_fancy_results_list(self, blast_results, num_results = 20):
    """Convert the first BLAST record into a list of ``b6lib.B6Entry`` objects.

    Only the first HSP of each alignment is used, and at most
    ``num_results`` alignments are converted.

    :param blast_results: handle on a BLAST XML report; closed on a
        best-effort basis before returning.
    :param num_results: maximum number of alignments to convert.
    :return: list of populated ``B6Entry`` objects.
    :raises IndexError: if the report contains no record.
    """
    blast_results_list = []
    blast_record = list(NCBIXML.parse(blast_results))[0]
    num_results = min(len(blast_record.alignments), num_results)
    query_length = int(blast_record.query_length)

    for i in range(0, num_results):
        alignment = blast_record.alignments[i]
        hsp = alignment.hsps[0]

        entry = b6lib.B6Entry()
        entry.q_len = query_length
        entry.query_length = entry.q_len
        entry.hit_def = alignment.hit_def
        entry.subject_id = entry.hit_def
        entry.accession = alignment.accession
        entry.ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % entry.accession
        entry.hsp_query = hsp.query
        entry.hsp_match = hsp.match
        entry.hsp_subject = hsp.sbjct
        # '|' marks an identical position in the HSP midline
        entry.identity = hsp.match.count('|') * 100.0 / len(entry.hsp_query)
        entry.coverage = len(hsp.query) * 100.0 / entry.query_length
        blast_results_list.append(entry)

    try:
        blast_results.close()
    except Exception:
        # Closing is best effort (the object may not be closable), but
        # KeyboardInterrupt / SystemExit must no longer be swallowed.
        pass

    return blast_results_list
def parseBlastResult(fileName):
    """Summarise a BLAST XML report as a list of tuples.

    :param fileName: path of the BLAST XML report.
    :return: list of ``(query, hit_id, max_identity_percent, hit_def)``, one
        per alignment; a query without alignments yields
        ``(query, "-", 0, "-")``.  ``max_identity_percent`` is the best
        identities/alignment-length ratio among the alignment's HSPs,
        truncated to an integer percentage.
    """
    results = []
    # The with-block closes the report; the original handle was never closed.
    with open(fileName) as handle:
        for record in NCBIXML.parse(handle):
            rec_id = str(record.query)
            if len(record.alignments) == 0:
                results.append((rec_id, "-", 0, "-"))
                continue
            for algn in record.alignments:
                ids = [hsp.identities / float(hsp.align_length)
                       for hsp in algn.hsps]
                max_identity = int(max(ids) * 100)
                results.append((rec_id, algn.hit_id, max_identity, algn.hit_def))
    return results
def parsePsiBlast(psiblastfilename, max_evalue):
    """Return the best e-value per subject from a PSI-BLAST XML report.

    :param psiblastfilename: path of the XML report.
    :param max_evalue: HSPs with an e-value above this are ignored.
    :return: dict mapping alignment title to the smallest accepted e-value.

    Any ordinary error is reported through ``dieError``.  The handler
    catches ``Exception`` rather than using a bare ``except`` so that
    Ctrl-C and ``SystemExit`` are no longer turned into a PSI-BLAST
    failure, and the file is closed on the error path as well.
    """
    try:
        results_dict = {}
        with open(psiblastfilename, 'r') as handle:
            for blast_record in NCBIXML.parse(handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect > max_evalue:
                            continue
                        subjid = alignment.title
                        best_so_far = results_dict.get(subjid)
                        if best_so_far is None or hsp.expect < best_so_far:
                            results_dict[subjid] = hsp.expect
        return results_dict
    except Exception:
        dieError('ERROR: PSI-BLAST failed.')
def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh):
    """Print a tab-separated summary of the top hit of every query.

    For each query the first alignment is evaluated: the e-value of its
    first HSP must be below ``e_val_thresh``, the percentage identity
    (sum of identities / sum of HSP alignment lengths) above
    ``ident_thresh`` and the summed alignment length above
    ``align_thresh``.  Queries failing any test, or without hits, are
    printed with empty columns.

    :param result_file: path of the BLAST XML report.
    :param e_val_thresh: exclusive upper e-value limit.
    :param ident_thresh: exclusive lower percentage-identity limit.
    :param align_thresh: exclusive lower alignment-length limit.
    """
    # The with-block closes the report even if parsing raises.
    with open(result_file, 'r') as result_handle:
        print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value')
        for record in NCBIXML.parse(result_handle):  # one record per query
            query_id = record.query
            row = None
            if len(record.alignments) > 0:
                best = record.alignments[0]
                e_val = best.hsps[0].expect
                if e_val < e_val_thresh:
                    tot_ident = sum(hsp.identities for hsp in best.hsps)
                    align_len = sum(hsp.align_length for hsp in best.hsps)
                    pct_ident = tot_ident / float(align_len) * 100
                    if pct_ident > ident_thresh and align_len > align_thresh:
                        top_hit = best.hit_id + best.hit_def
                        row = '%s\t%s\t%f\t%i\t%i\t%s' % (query_id, top_hit, pct_ident, record.query_length, align_len, str(e_val))
            if row is None:
                # no hit, or the top hit failed one of the thresholds
                row = '%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')
            print(row)
def create_rel(self, XMLin):
    """Map each query name to the merged query regions covered by BLAST hits.

    :param XMLin: handle on a BLAST XML report.
    :return: dict ``{query_name: [(start, end), ...]}`` where ``query_name``
        is the query title up to the first space and the intervals are
        sorted and non-overlapping.

    When merging, the end of the merged interval is the larger of the two
    ends.  Taking the end of the later segment unconditionally would
    shrink the region whenever a segment is nested inside the previous
    one, e.g. (1, 100) followed by (10, 20).
    """
    bat1 = {}
    for b_record in NCBIXML.parse(XMLin):
        query_name = b_record.query.split(" ")[0]
        for alin in b_record.alignments:
            for hsp in alin.hsps:
                qs, qe = hsp.query_start, hsp.query_end
                if qs > qe:
                    # reverse-orientation coordinates: store as (low, high)
                    qe, qs = qs, qe
                bat1.setdefault(query_name, set()).add((qs, qe))

    # sort and merge overlapping segments
    for query_name in list(bat1):
        joined_cols = []
        for qs, qe in sorted(bat1[query_name]):
            if joined_cols:
                last_qs, last_qe = joined_cols[-1]
                if last_qe >= qs:
                    joined_cols[-1] = (last_qs, max(last_qe, qe))
                    continue
            joined_cols.append((qs, qe))
        bat1[query_name] = joined_cols
    return bat1
def run_blastp(match, blastdb):
    """Run blastp once per feature of ``match`` and parse the first record.

    Each non-empty protein FASTA is piped to a ``blastp`` process started
    through the shell; the first record of the XML output is parsed.
    Failures to start the process or to parse its output are logged as
    warnings and the feature is skipped.

    :param match: object exposing ``features``; each feature provides
        ``protein_fasta()``.
    :param blastdb: database passed to ``NcbiblastpCommandline(db=...)``.
    """
    from Bio.Blast.Applications import NcbiblastpCommandline

    for feature in match.features:
        rec = None
        fasta = feature.protein_fasta()
        if fasta != "":
            try:
                command = str(NcbiblastpCommandline(db=blastdb, outfmt=5, num_threads=4))
                pipe = subprocess.Popen(
                    command,
                    shell=True,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                # NOTE(review): stderr is piped but never read and the
                # process is never waited on - confirm this is acceptable.
                pipe.stdin.write(fasta)
                pipe.stdin.close()
                rec = next(NCBIXML.parse(pipe.stdout))
                pipe.stdout.close()
                pipe.stderr.close()
            except OSError as err:
                logging.warning("Failed to run blastp: %s" % err)
            except ValueError as err:
                logging.warning("Parsing blast output failed: %s" % err)
def BLAST_to_BRIG(BLASTfile, resultsFile):
    """Convert a BLAST XML report into a 10-column tab-separated file.

    One line is written per alignment, using its first HSP.  Columns are
    query, hit definition, identities / subject length (2 decimals),
    integer score, subject length, subject length - identities, query
    start, query start + subject length, subject start, query start +
    subject length.

    :param BLASTfile: path of the BLAST XML report.
    :param resultsFile: path of the tab-separated file to create.

    The input report is opened in a ``with`` block so it is closed on exit;
    the original handle was never closed.
    """
    with open(BLASTfile) as rec, open(resultsFile, "w") as tabFile:
        for blast_record in NCBIXML.parse(rec):
            for alignment in blast_record.alignments:
                for match in alignment.hsps:
                    # NOTE(review): the last column (subject end) is derived
                    # from query_start, exactly like the query end - this
                    # looks like a copy/paste slip (sbjct_start expected?);
                    # left unchanged pending confirmation.
                    tabFile.write(
                        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                        % (
                            blast_record.query,
                            alignment.hit_def,
                            round(float(match.identities) / float(alignment.length), 2),
                            int(match.score),
                            alignment.length,
                            int(alignment.length) - int(match.identities),
                            match.query_start,
                            (int(match.query_start) + int(alignment.length)),
                            match.sbjct_start,
                            (int(match.query_start) + int(alignment.length)),
                        )
                    )
                    # NOTE(review): indentation was lost in this file; the
                    # break is assumed to limit output to the first HSP of
                    # each alignment - confirm.
                    break
def parse_blast_XML(blast_xml):
    """
    Read a BLAST XML report and yield one ``BlastResult`` per alignment.

    For every hit the protein record is fetched from Entrez in FASTA format
    and its sequence and organism are extracted, so that the results can be
    saved to FASTA files and aligned (the original author mentions ClustalW)
    in a later step.

    Yields ``BlastResult(hit accession field, species, sequence,
    best e-value, coverage)``.
    """
    # NOTE(review): this handle is never closed.
    blast_xml_op = open (blast_xml, 'r')
    for record in NCBIXML.parse(blast_xml_op):
        for align in record.alignments:
            hit_id = align.hit_id.split("|")
            # smallest HSP e-value seen so far (HSPs with e-value >= 1 are ignored)
            prev_eval = 1
            # TODO (original author): fix to use the real sequence length
            # instead of the hard-coded 390.
            # NOTE(review): on Python 2 this is integer division.
            coverage = align.length / 390
            for hsp in align.hsps:
                if hsp.expect < prev_eval:
                    prev_eval = hsp.expect
            # NOTE(review): the whole list of '|'-separated fields is passed
            # as id - confirm that is what Entrez should receive.
            efetch = Entrez.efetch(db="protein", id=hit_id, rettype="fasta")
            for line in efetch:
                line = line.rstrip()
                if line.startswith(">"):
                    id_info = line
                    sequence = ""
                else:
                    sequence += line
            # NOTE(review): appends the last line read a second time; this
            # may be an artefact of the lost indentation or a bug - confirm.
            sequence += line
            # organism name = text between the first '[' and the first ']'
            organism = id_info[id_info.find("[") + 1:id_info.find("]")]
            organism = organism.split()
            if len(organism) != 1:
                species = str(organism[0] + "_" + organism[1])
            # NOTE(review): when the organism has exactly one word, species
            # keeps the value of the previous hit (or is undefined on the
            # first hit).
            yield BlastResult(hit_id[1], species, sequence, prev_eval, coverage)
def get_gb_info(self, resultshandle):
    """Collect hit identifier, subject coordinates and frame per alignment.

    For every alignment of every record, the first HSP provides the
    subject start/end and the frame; the identifier is the second-to-last
    '|'-separated field of the alignment title (the title ends with a
    trailing '|').  The values are appended to ``self.gb_ids``,
    ``self.hit_coords`` and ``self.hit_directions``; the handle is closed
    at the end.
    """
    for record in NCBIXML.parse(resultshandle):
        for alignment in record.alignments:
            # Only the first HSP is inspected.
            # NOTE(review): indentation was lost in this file; appending once
            # per alignment (after the first HSP) is assumed - confirm.
            for first_hsp in alignment.hsps[:1]:
                coords = (first_hsp.sbjct_start, first_hsp.sbjct_end)
                gbid = alignment.title.split('|')[-2]
                directions = first_hsp.frame
            self.gb_ids.append(gbid)
            self.hit_coords.append(coords)
            self.hit_directions.append(directions)
    # Finished with this file
    resultshandle.close()
    return None
def parse_blast_xml(xml_filename, query_filename, output_filename, abundance_filename=None):
    """
    Parse the XML output, looking only at the 1st alignment for each query.

    The query FASTA and the XML report are read in lockstep (one BLAST
    record per FASTA entry, in the same order).

    Write out in format:

    ID \t COUNT \t LENGTH \t AMBIG \t QSTART \t QEND \t IDEN

    :param xml_filename: BLAST XML report.
    :param query_filename: FASTA file that was used as the BLAST query.
    :param output_filename: tab-separated file to create.
    :param abundance_filename: optional two-column (id, count) tab-separated
        file; without it every sequence has count 1.
    :raises StopIteration: if the report has fewer records than the FASTA
        file has sequences.
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        with open(abundance_filename) as abundance_file:
            abundance = dict(line.strip().split('\t') for line in abundance_file)

    # All three files are closed on exit, even on error.
    with open(xml_filename) as xml_file, \
            open(output_filename, 'w') as f, \
            open(query_filename) as h:
        records = NCBIXML.parse(xml_file)
        f.write("ID\tCOUNT\tLENGTH\tAMBIG\tQSTART\tQEND\tIDEN\n")
        for r in SeqIO.parse(h, 'fasta'):
            ambig = r.seq.count('N') + r.seq.count('?')
            # next() works on Python 2.6+ and 3; .next() is Python 2 only
            blastout = next(records)
            if len(blastout.alignments) == 0:
                # no match was found!
                f.write("{id}\t{count}\t{len}\t{ambig}\tNA\tNA\tNA\n".format(
                    id=r.id, count=abundance[r.id], len=len(r.seq), ambig=ambig))
            else:
                hsp = blastout.alignments[0].hsps[0]
                f.write("{id}\t{count}\t{len}\t{ambig}\t{qs}\t{qe}\t{iden}\n".format(
                    id=r.id, len=len(r.seq), qs=hsp.query_start, qe=hsp.query_end,
                    iden=hsp.identities, count=abundance[r.id], ambig=ambig))
def blast_file_opener(filename, evalue, mismatches, outfile):
    """Write the HSPs with an exact number of mismatches to ``outfile``.

    An HSP is reported when its e-value is below ``evalue`` and
    ``align_length - identities`` equals ``mismatches``.  Each reported HSP
    produces a line ``<alignment title>\\t<query>\\t<e-value>``.

    :param filename: BLAST XML report.
    :param evalue: exclusive e-value upper limit (anything ``float`` accepts).
    :param mismatches: required number of mismatches (anything ``int`` accepts).
    :param outfile: path of the text file to create.
    :return: an empty set (nothing is ever added to it; kept for
        interface compatibility).

    The returned set is created before the loop, so a report without any
    record no longer raises ``UnboundLocalError``.
    """
    E_VALUE_THRESH = float(evalue)
    mismatches = int(mismatches)
    alignment_hits = set()
    with open(filename) as result_handle, open(outfile, 'w') as f:
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect >= E_VALUE_THRESH:
                        continue
                    # mismatches (gaps included) in this HSP
                    mmatches = hsp.align_length - hsp.identities
                    if mmatches == mismatches:
                        f.write("%s\t%s\t%s\n" % (alignment.title, blast_record.query, str(hsp.expect)))
    return alignment_hits
def include_check(blast_result_filename,include_line,seq_name_list,max_mismatch):
    """Find the query sequences conserved in every requested organism.

    ``include_line`` is a string of organism terms separated by ``' OR '``,
    each containing ``(taxid:<number>)``.  For every term the report
    ``blast_result_filename + <number>`` is parsed; a query counts as
    conserved in that organism when one HSP is a perfect match
    (identities == alignment length) covering at least
    ``query length - max_mismatch`` positions.

    :param blast_result_filename: common prefix of the per-taxid XML reports.
    :param include_line: organism restriction string (see above).
    :param seq_name_list: unused; kept for interface compatibility.
    :param max_mismatch: number of query positions allowed to be uncovered.
    :return: dict ``{query_name: 1}`` for queries conserved in all organisms.

    Differences from the earlier behaviour:

    * the HSP length is read from ``align_length`` (the attribute Biopython
      provides) instead of ``align_len``;
    * an organism is recorded once per query even when several HSPs
      qualify, so the final "conserved in all organisms" count is not
      inflated.
    """
    strlist = str(include_line).split(' OR ')
    # query name -> taxids in which the query is conserved
    results = {}
    for valist in strlist:
        txid_num = valist[valist.find('(taxid:') + 7:valist.find(')')]
        found = set()
        with open(blast_result_filename + txid_num, "r") as blast_result_file:
            for record in NCBIXML.parse(blast_result_file):
                name = record.query
                if name in found:
                    continue
                min_len = record.query_letters - max_mismatch
                conserved = any(
                    hsp.identities == hsp.align_length and hsp.identities >= min_len
                    for align in record.alignments
                    for hsp in align.hsps
                )
                if conserved:
                    found.add(name)
                    results.setdefault(name, []).append(txid_num)

    len_organ = len(strlist)
    results_final = {}
    for name, taxids in results.items():
        if len(taxids) == len_organ:
            results_final[name] = 1
    return (results_final)
def parse_blast_xml_for_training(xml_filename, bowtie_filename, output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query.

    For every mismatch position (midline is a space and neither query nor
    subject has a gap) from index 2 onwards in the first HSP, write a line:

    Phred Cycle B2 B1 B0 Class

    where Phred is the quality character code minus 33, Cycle is the
    position offset by the read's ``offset`` field, B2/B1/B0 are the two
    preceding and the current query characters and Class is ``-``.

    :param xml_filename: BLAST XML report.
    :param bowtie_filename: file read with ``BowTieReader``; records provide
        the ``ID``, ``offset`` and ``qual`` keys.
    :param output_filename: tab-separated file to create.

    A leftover ``pdb.set_trace()`` that stopped the program at every
    mismatch has been removed, and both files are now closed on exit.
    """
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename))
    with open(output_filename, 'w') as f, open(xml_filename) as xml_file:
        f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
        for blastout in NCBIXML.parse(xml_file):
            if len(blastout.alignments) == 0:
                # no match was found!
                continue
            hsp = blastout.alignments[0].hsps[0]
            record = fa_dict[blastout.query]
            primer_offset = int(record['offset'])
            # TODO (original author): allow for i<2 and still get B2, B1
            for i in range(2, len(hsp.match)):
                is_mismatch = (hsp.match[i] == " " and hsp.query[i] != '-'
                               and hsp.sbjct[i] != '-')
                if not is_mismatch:
                    continue
                # position relative to the read is i + (query_start - 1)
                read_pos = i + hsp.query_start - 1
                fields = [
                    str(ord(record['qual'][read_pos]) - 33),
                    str(read_pos + primer_offset),
                    hsp.query[i - 2],
                    hsp.query[i - 1],
                    hsp.query[i],
                    '-',
                ]
                f.write('\t'.join(fields) + '\n')
def parse_online_blast (seq_list):
    """BLAST ``seq_list`` online and pass every HSP to ``filter_hits``.

    For each HSP a list of strings is built - query, hit title, the 2nd and
    4th '|'-separated title fields (the latter without its '.suffix'),
    percent identity, HSP query length, query length, e-value, bit score -
    followed by the two taxon values returned by ``obtain_tax`` for the 2nd
    title field.  Taxon lookups are cached per identifier.
    """
    blast_handle = online_blast(seq_list)
    taxon_dic = {}

    # use the biopython xml parse module to get the results
    logging.debug('Parsing blast result XML file.')
    blast_list = list(NCBIXML.parse(blast_handle))

    for blast_result in blast_list:
        for alignment in blast_result.alignments:
            for hsp in alignment.hsps:
                # percent identity relative to the HSP midline length
                identity = float(hsp.identities/(len(hsp.match)*0.01))

                # title fields 1 and 3; strip the '.N' suffix of the latter
                id_fields = alignment.title.split('|')[1:4:2]
                id_fields[1] = id_fields[1].split('.')[0]
                lookup_key = id_fields[0]

                # taxon information, looked up once per identifier
                if lookup_key in taxon_dic:
                    taxon = taxon_dic[lookup_key]
                else:
                    taxon = obtain_tax(lookup_key)
                    taxon_dic[lookup_key] = taxon

                # hand the assembled row to the filter function
                row = [
                    str(blast_result.query),
                    str(alignment.title),
                    str(lookup_key),
                    str(id_fields[1]),
                    str(identity),
                    str(len(hsp.query)),
                    str(blast_result.query_length),
                    str(hsp.expect),
                    str(hsp.bits),
                    taxon[0],
                    taxon[1],
                ]
                filter_hits(row)
def _convertCDNA(self, diff_dir, gene, cDNA, refSeqName, refSeqVer):
    """Shift a cDNA variant description to another RefSeq version.

    When ``gene`` already refers to ``refSeqName``/``refSeqVer`` the
    description is returned unchanged.  Otherwise the BLAST XML diff file
    ``<gene[1]>.<refSeqVer>to<gene[2]>.xml`` in ``diff_dir`` is read and the
    subject start of its first HSP (minus one) is added to the variant's
    ``position``, ``range_lower`` and ``range_upper`` (values that are empty
    or contain ``*`` are left alone).

    :param diff_dir: directory holding the BLAST XML diff files.
    :param gene: sequence whose items 1 and 2 are RefSeq name and version.
    :param cDNA: variant description understood by ``Parser().parse``.
    :param refSeqName: RefSeq name the description refers to.
    :param refSeqVer: RefSeq version the description refers to.
    :return: the (possibly shifted) description.
    :raises Exception: if the diff file is missing or has no record,
        alignment or HSP.

    The upper range bound is read from ``variant.range_upper``; the former
    reference to the undefined name ``variant_range_upper`` raised
    ``NameError`` for every range variant.
    """
    if gene[1] == refSeqName and gene[2] == refSeqVer:
        return cDNA

    # Should only really differ by version number
    diff_name = os.path.join(diff_dir, "%s.%sto%s.xml" % (gene[1], refSeqVer, gene[2]))
    if not os.path.isfile(diff_name):
        raise Exception("No BLAST xml diff file for %s from %s to %s" % (gene[1], refSeqVer, gene[2]))

    with open(diff_name) as f:
        blast_records = list(NCBIXML.parse(f))

    if len(blast_records) < 1:
        raise Exception("BLAST xml diff does not have at least one record")
    if len(blast_records[0].alignments) < 1:
        raise Exception("BLAST xml diff does not have at least one alignment")
    if len(blast_records[0].alignments[0].hsps) < 1:
        raise Exception("BLAST xml diff does not have at least one hsps in alignment")

    hsp = blast_records[0].alignments[0].hsps[0]
    offset = hsp.sbjct_start - 1

    variant = Parser().parse("", cDNA)
    for field in ('position', 'range_lower', 'range_upper'):
        value = getattr(variant, field)
        if value != '' and value.find('*') < 0:
            setattr(variant, field, str(int(value) + offset))
    return variant.ToString()
def readNcbiXml(infile, options):
    """Read a BLAST XML report and collect the accepted hits of every seed.

    A query (seed) shorter than ``options.minLen`` is skipped.  An HSP is
    kept when

    * positives / HSP query length >= ``options.minPos``,
    * the hit (last word of the alignment title) is not the seed itself,
      by name or according to ``isSameClone``, and
    * the HSP midline is at least as long as the query.

    :param infile: path of the BLAST XML report.
    :param options: object providing ``minLen`` and ``minPos``.
    :return: dict ``{seed: [(hit_name, positives, query, match, sbjct), ...]}``
        with each list sorted by ``getHitSize(hit_name)``, largest first.

    The report is read inside a ``with`` block so the handle is closed.
    """
    clone2hits = {}  # key = cloneName, val = list of accepted hit tuples
    with open(infile) as rh:
        for record in NCBIXML.parse(rh):  # each seed
            if record.query_length < options.minLen:
                # ignore seeds that are shorter than minimum required length
                continue
            clone = record.query
            hits = []
            for aln in record.alignments:
                for hit in aln.hsps:  # each hit
                    if float(hit.positives) / len(hit.query) < options.minPos:
                        # similarity lower than required
                        continue
                    hit_name = aln.title.split()[-1]
                    if hit_name == clone or isSameClone(clone, hit_name, hit):
                        # hit is the seed itself, ignore
                        continue
                    if len(hit.match) < record.query_length:
                        continue
                    hits.append((hit_name, hit.positives, hit.query, hit.match, hit.sbjct))
            # Sort the hits by size
            clone2hits[clone] = sorted(hits, key=lambda h: getHitSize(h[0]), reverse=True)
    return clone2hits
def load_align_table(file1,file2):
    """Create one ``Align`` row per HSP found in two BLAST XML reports.

    ``init()`` is called first (defined elsewhere; presumably sets up the
    storage layer - TODO confirm).  For each HSP the query description is
    split on '|' and its second field is used as protein id to look up the
    ``Proteome`` entry; the subject id is the fourth '|'-separated field of
    the alignment title.

    :param file1: first BLAST XML report.
    :param file2: second BLAST XML report.

    Each report is opened in a ``with`` block so its handle is closed.
    """
    init()
    for xml_path in (file1, file2):
        with open(xml_path) as result_handle:
            for blast_result in NCBIXML.parse(result_handle):
                for alignment in blast_result.alignments:
                    for hsp in alignment.hsps:
                        title = alignment.title
                        a_len = alignment.length
                        ident = hsp.identities
                        pid = int(blast_result.query.split('|')[1])
                        r = Proteome.byPid(pid)
                        s_row = title.split('|')
                        # Constructing the object is the side effect we want;
                        # the instance itself is not used afterwards.
                        Align(querydesc=blast_result.query,
                              subid=int(s_row[3]),
                              subjectdesc=title,
                              evall=hsp.expect,
                              score=hsp.score,
                              align_length=a_len,
                              Identity=ident,
                              queryseq=hsp.query,
                              match=hsp.match,
                              subjctseq=hsp.sbjct,
                              ident_percent=(ident/float(a_len)),
                              proteome=r)
def get_ids(filename, dir, ethresh = 0.01):
    """Extract hit accessions from a BLAST report and save them as CSV.

    Reads ``<dir>/BLAST/<filename>``, takes its first record and, for every
    HSP with an e-value below ``ethresh`` whose alignment title matches the
    expected ``xx|ACCESSION.version|... [Genus species]`` layout, collects
    ``(accession, short species)`` where the short species is the first
    letter of the genus plus the first three letters of the species.
    The hits are filtered by ``filter_species`` (using the first four
    characters of ``filename``) and the remaining accessions are written,
    one per row, to ``<dir>/accs/<record_name(filename)>.csv``.

    :param filename: name of the XML report inside ``<dir>/BLAST``.
    :param dir: base directory (name kept for interface compatibility).
    :param ethresh: exclusive e-value upper limit.

    The report is opened in a ``with`` block so its handle is closed, and
    the redundant ``close()`` inside the output ``with`` block is gone.
    """
    eValueThresh = ethresh
    title_pattern = re.compile(
        r'.*[A-Z|a-z]{2,3}\|(.*?)\|.*?\[([A-Z])\S* ([A-Z|a-z]{3}).*\].*?')
    accession_pattern = re.compile(r'([A-Z|a-z|_|0-9]*)\..*')

    with open(os.path.join(dir, "BLAST", filename), "r") as result:
        blast_records = list(NCBIXML.parse(result))
    record = blast_records[0]

    hits = []
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect >= eValueThresh:
                continue
            mdata = title_pattern.match(alignment.title)
            if mdata is None:
                continue
            # NOTE(review): an accession without a '.version' suffix makes
            # this match None and the next line raise AttributeError.
            accession = accession_pattern.match(mdata.group(1))
            acc = str(accession.group(1))
            genus = str(mdata.group(2)[0])
            species = str(mdata.group(3)[:3])
            hits.append((acc, genus + species))

    spec = filename[0:4]
    filteredHits = filter_species(hits, spec)

    # Saving results: one CSV of accessions per input report
    with open(os.path.join(dir, "accs", record_name(filename) + ".csv"), 'w') as csvfile:
        blasthits = csv.writer(csvfile)
        for each in filteredHits:
            blasthits.writerow([each[0]])
def blast_reads(blast_string, reads, outfh):
    """BLAST reads against the Ebola reference and write the matched parts.

    ``blast_string`` is fed to blastn on standard input; the XML report is
    written to a fixed temporary file, parsed, and removed afterwards.  For
    every HSP the corresponding slice of the read's sequence and quality is
    written to ``outfh`` as a FASTQ record named ``<read>:<n>``, where ``n``
    numbers the HSPs of one alignment from 1.  Slices of HSPs whose subject
    coordinates run backwards are reversed.

    :param blast_string: query sequences passed to blastn via stdin.
    :param reads: mapping from read name to an object with ``sequence`` and
        ``quality``.
    :param outfh: writable handle receiving the FASTQ records.
    """
    blast_db = '/Users/sw10/Dropbox/Sanger/blastdb/ebola/Zaire_ebolavirus_KM034562'
    blast_binary = '/Applications/ncbi-blast-2.2.29+/bin/blastn'
    xml_outfile = '/tmp/test.xml'
    evalue = 0.01

    cline = NcbiblastnCommandline(cmd=blast_binary, out=xml_outfile, outfmt=5, query="-", db=blast_db, evalue=evalue, max_target_seqs=1, num_threads=1)
    stdout, stderr = cline(blast_string)

    with open(xml_outfile, 'r') as blast_handle:
        for blast_record in NCBIXML.parse(blast_handle):
            name = blast_record.query
            read = reads[name]
            for alignment in blast_record.alignments:
                for count, hsp in enumerate(alignment.hsps, 1):
                    # NOTE(review): BLAST coordinates are presumably 1-based
                    # and inclusive, so this slice may drop the first aligned
                    # base - confirm before relying on exact boundaries.
                    seq = read.sequence[hsp.query_start:hsp.query_end]
                    qual = read.quality[hsp.query_start:hsp.query_end]
                    if hsp.sbjct_start > hsp.sbjct_end:
                        # reversed only (not complemented), as before
                        seq = ''.join(reversed(seq))
                        qual = ''.join(reversed(qual))
                    outfh.write('@%s:%d\n%s\n+\n%s\n' % (name, count, seq, qual))
    os.remove(xml_outfile)
def parse_BLAST(blast_results, tol):
    """
    Using NCBIXML parse the BLAST results, storing & returning good hits

    Here good hits are:
        * hsp.identities/float(record.query_length) >= tol

    The hit name is the second comma-separated field of the query title.
    If the results file does not exist a message is written to stderr and
    the program exits with status 1.

    The ``~``-expanded path is used both for the existence check and for
    opening the file (previously only the check expanded it, so ``~/x.xml``
    passed the check and then failed to open).

    :param blast_results: full path to a blast run output file (in XML
                          format)
    :param tol: the cutoff threshold (see above for explanation)

    :type blast_results: string
    :type tol: float

    :rtype: list of satisfying hit names
    """
    results_path = os.path.expanduser(blast_results)
    if not os.path.isfile(results_path):
        sys.stderr.write("BLAST results do not exist. Exiting.\n")
        sys.exit(1)

    hits = []
    with open(results_path) as handle:
        for record in NCBIXML.parse(handle):
            for align in record.alignments:
                for hsp in align.hsps:
                    hit_name = record.query.split(',')[1].strip()
                    if hsp.identities/float(record.query_length) >= tol:
                        hits.append(hit_name)
    return hits
def main(argv):
    """Convert a BLAST XML report into a tab-separated table.

    Command line: ``-i/--ifile <inputfile> -o/--ofile <outputfile>``
    (``-h`` prints the usage and exits; an unknown option exits with
    status 2).  One line is written per HSP with query, subject and
    alignment statistics; the header line names the columns.

    :param argv: command-line arguments without the program name.

    Both files are handled by ``with`` blocks so the output is flushed and
    closed even when reading the input fails.
    """
    usage = "test.py -i <inputfile> -o <outputfile>"
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    print("Input file is " + inputfile)
    print("Output file is " + outputfile)

    with open(outputfile, 'w') as outfile:
        outfile.write("qseqid\tqseqdef\tqframe\tqlen\tqstart\tqend\tsseqid\tsseqdef\tslen\tsstart\tsend\tidentity\tpositive\tgaps\talign_len\teValue\n")
        try:
            with open(inputfile) as result_handle:
                for record in NCBIXML.parse(result_handle):
                    for alignment in record.alignments:
                        for hsp in alignment.hsps:
                            fields = [
                                record.query_id,
                                record.query,
                                str(hsp.frame),
                                str(record.query_length),
                                str(hsp.query_start),
                                str(hsp.query_end),
                                alignment.hit_id,
                                alignment.hit_def,
                                str(alignment.length),
                                str(hsp.sbjct_start),
                                str(hsp.sbjct_end),
                                str(hsp.identities),
                                str(hsp.positives),
                                str(hsp.gaps),
                                str(hsp.align_length),
                                str(hsp.expect),
                            ]
                            outfile.write("\t".join(fields) + "\n")
        except IOError:
            print("no such file!")
def blast_xml_to_gff3(file_in,file_out,blast_type):
    """Write the best HSP of every query as a GFF3 ``match_part`` line.

    For each record only the first HSP (in report order) with an e-value
    below 0.04 is written.  The strand column is ``hsp.strand[0]`` or ``?``
    when it is None; the phase column is ``hsp.frame[0]`` or ``.`` when it
    is None.  Semicolons and spaces in the alignment title are replaced by
    underscores in the ``ID`` and ``Alias`` attributes.

    :param file_in: BLAST XML report.
    :param file_out: GFF3 file to create.
    :param blast_type: value of the GFF3 source column.

    The four strand/frame combinations produce the same line apart from
    those two columns, so a single write is used; the input report is now
    closed on exit.
    """
    E_VALUE_THRESH = 0.04
    with open(file_in) as result_handle, open(file_out, "w") as f:
        f.write("##gff-version 3" + "\n")
        for blast_record in NCBIXML.parse(result_handle):
            query = blast_record.query
            written = False
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if written or not hsp.expect < E_VALUE_THRESH:
                        continue
                    written = True
                    strand = "?" if hsp.strand[0] is None else str(hsp.strand[0])
                    phase = "." if hsp.frame[0] is None else str(hsp.frame[0])
                    clean_title = alignment.title.replace(";", "_").replace(" ", "_")
                    attributes = ("ID=" + query + ":" + clean_title + ";"
                                  + "Parent=" + query + ";"
                                  + "Name=blast_hsp;"
                                  + "Alias=" + clean_title)
                    # NOTE(review): start/end are written as reported; for
                    # reverse-strand hits start may exceed end - confirm the
                    # consumer accepts that.
                    columns = [
                        query,
                        str(blast_type),
                        "match_part",
                        str(hsp.query_start),
                        str(hsp.query_end),
                        str(hsp.score),
                        strand,
                        phase,
                        attributes,
                    ]
                    f.write("\t".join(columns) + "\n")
def blast_align(fasta,blast_path,linker_db):
    """Run blastn of ``./temp/<fasta>`` against the linker database.

    The XML report is written to ``./temp/<stem>_blast_linker.xml`` where
    ``<stem>`` is ``fasta`` up to its first '.'.  Neither the query nor the
    report is deleted.

    The command is run from an argument list rather than a shell string,
    so database and file names containing spaces or shell metacharacters
    are passed through unchanged.

    :param fasta: file name (inside ``./temp``) of the query FASTA.
    :param blast_path: blastn executable, optionally followed by options
        (split with shell-like rules, as the shell did before).
    :param linker_db: BLAST database searched with ``-evalue 0.1``.
    :return: lazy ``NCBIXML.parse`` iterator over the report; it keeps the
        report file open while it is consumed.
    """
    import shlex
    import subprocess

    fasta_name = fasta.split(".")[0]
    xml_path = "./temp/" + fasta_name + "_blast_linker.xml"
    command = shlex.split(blast_path) + [
        "-task", "blastn",
        "-outfmt", "5",
        "-num_threads", "6",
        "-evalue", "0.1",
        "-db", linker_db,
        "-query", "./temp/" + fasta,
    ]
    with open(xml_path, "w") as xml_out:
        subprocess.call(command, stdout=xml_out)

    linker_records = NCBIXML.parse(open(xml_path))
    return (linker_records)
def main():
    """BLAST the pre-miRNA FASTA against ``Input/gg_db`` and write the hits.

    For every sequence in ``gg_pre_mirna.fasta`` a section headed by the
    sequence id is written to ``results.fasta``, followed by whatever
    ``parsefile`` writes for each BLAST record.  Lines are terminated with
    a carriage return ('\\r').
    """
    # Interactive prompts disabled by the original author; file names are
    # hard-coded instead:
    # query = input('Enter query file name: ') # For the working example, type in 'Test_miRNA.txt'
    # filename = input('What is your desired file name for the top hits file? ') # I used 'Test_miRNA_Results.txt'
    # writer = open(filename, 'w')
    records = SeqIO.parse('gg_pre_mirna.fasta', 'fasta')
    writer = open('results.fasta', 'w')
    writer.write('Organism_name' + '\t' + 'Query_start' + '\t' + 'Query_end' + '\t' + 'Subject_start' + '\t' + 'Subject_end' + '\r') # Writes the header for the results file
    # print('Now BLASTing')
    for record in records:
        # The current record is written to Temp.txt ...
        tempWriter = open('Temp.txt', 'w')
        tempWriter.write('>' + record.id + '\n')
        tempWriter.write(str(record.seq) + '\n')
        # ... but the active command below queries the whole FASTA file, so
        # every iteration repeats the same search.
        # NOTE(review): Temp.txt is not used by the command - confirm whether
        # a per-record search was intended.
        #os.system('blastn -task blastn-short -query '+ str(record.seq) +' -db Input/gg_db -out BLAST_result.xml -outfmt "5" ')
        os.system('blastn -task blastn-short -query gg_pre_mirna.fasta -db Input/gg_db -out BLAST_result.xml -outfmt "5" ')
        # NOTE(review): this handle is never closed.
        result_handle = open('BLAST_result.xml')
        blast_records = NCBIXML.parse(result_handle)
        writer.write('\r' + '*****' + '\r')
        writer.write(record.id + '\r' + '*****' + '\r')
        for blast_record in blast_records:
            parsefile(blast_record,writer)
        # NOTE(review): indentation was lost in this file; closing the temp
        # file once per record is assumed - confirm.
        tempWriter.close()
    writer.close()
    print('Finished!')
def process_blast_data(xml_filename:str, species:str, sequence_length:int):
    """Count, per query, the hits inside and outside the expected species.

    HSPs of each query are collected in report order until the first HSP
    whose e-value is not below 1; from that point on the rest of that
    query's HSPs are ignored.  A collected hit counts as a match when
    ``species`` occurs in the alignment title, otherwise as a mismatch
    (its title and the first ``sequence_length`` characters of the aligned
    query are recorded).

    :param xml_filename: BLAST XML report.
    :param species: text searched for in the alignment titles.
    :param sequence_length: number of leading aligned-query characters kept.
    :return: ``pandas.DataFrame`` with columns ``query``, ``match_count``,
        ``mismatch_count``, ``mismatch_titles`` and ``mismatch_sequences``;
        queries without a collected hit are absent.

    The report is read inside a ``with`` block so the handle is closed.
    """
    E_VALUE_THRESH = 1

    # query -> [(alignment title, truncated aligned query), ...]
    blast_save = {}
    with open(xml_filename, 'r') as result_handle:
        for sequence in NCBIXML.parse(result_handle):
            unique = True
            for alignment in sequence.alignments:
                for hsp in alignment.hsps:
                    if not hsp.expect < E_VALUE_THRESH:
                        # sticky: disables collection for the rest of the query
                        unique = False
                    if unique:
                        blast_save.setdefault(sequence.query, []).append(
                            (alignment.title, hsp.query[0:sequence_length]))

    match_mismatch_list = []
    for query, matches in blast_save.items():
        match_count = 0
        mismatch_titles = []
        mismatch_sequence = []
        for title, aligned_query in matches:
            if species in title:
                match_count += 1
            else:
                mismatch_titles.append(title)
                mismatch_sequence.append(aligned_query)
        match_mismatch_list.append({
            "query": query,
            "match_count": match_count,
            "mismatch_count": len(mismatch_titles),
            "mismatch_titles": mismatch_titles,
            "mismatch_sequences": mismatch_sequence,
        })

    return pd.DataFrame(match_mismatch_list)
def parse_blast_results(blast_xml_files):
    """Print every HSP, with its hit title and e-value, for each BLAST XML file."""
    for xml_path in blast_xml_files:
        print(xml_path)
        with open(xml_path, 'r') as xml_handle:
            for record in NCBIXML.parse(xml_handle):
                # Flatten the alignment/HSP nesting into (alignment, hsp) pairs.
                pairs = (
                    (alignment, hsp)
                    for alignment in record.alignments
                    for hsp in alignment.hsps
                )
                for alignment, hsp in pairs:
                    print(hsp)
                    print("sequence:", alignment.title)
                    print("e value:", hsp.expect)
def net_blast(query_record, program='blastn', database='nr'):
    """Run a BLAST search over the network and return the parsed records.

    The query, the program name, the query alphabet and the database are all
    validated before the search is submitted; any failed check raises
    ``ValueError``.

    Args:
        query_record: A ``SeqRecord`` holding the query sequence (at least
            10 residues long).
        program: BLAST program name; must be a key of ``searches``.
        database: Database name; must be listed in ``protein_db`` or
            ``nucleotide_db`` and be allowed for ``program``.

    Returns:
        The iterator produced by ``NCBIXML.parse`` over the XML results.
    """
    # --- query checks -----------------------------------------------------
    if not isinstance(query_record, SeqRecord):
        raise ValueError(u'Invalid Search Item')
    if len(query_record.seq) < 10:
        raise ValueError(u"Query sequence is too short")

    # --- program check ----------------------------------------------------
    program = program.lower()
    if program not in searches:
        raise ValueError(u"Invalid Program '%s'" % program)

    # --- alphabet / database checks ---------------------------------------
    required_alpha, required_dbs = searches[program]
    if not isinstance(query_record.seq.alphabet, required_alpha):
        raise ValueError(u"Query alphabet for '%s' must be '%s'" %
                         (program, alphabets[program]))
    is_known_database = database in protein_db or database in nucleotide_db
    if not is_known_database:
        raise ValueError(u"Invalid database '%s'" % database)
    if database not in required_dbs:
        raise ValueError(u"Database '%s' cannot be used with program '%s'" %
                         (database, program))

    # All checks passed: submit the search and parse what comes back.
    xml_handle = NCBIWWW.qblast(program, database, query_record.seq,
                                format_type='XML')
    return NCBIXML.parse(xml_handle)
def blast_in_handle(handle, b='guess', log=True):
    """Parse BLAST output from an open handle, guessing the format if asked.

    :param handle: open, seekable handle on the BLAST output
    :param b: ``'plain'``, ``'xml'`` or ``'guess'`` (inspect the first line)
    :param log: when true, report progress and problems through ``ml``
    :returns: list of parsed records, one per query, or ``None`` when the
        format is unknown or cannot be guessed
    """
    if log:
        ml.debug(fname())

    b_type = b
    if b == 'guess':
        # Peek at the first line, then rewind so the parser sees everything.
        first_line = handle.readline()
        handle.seek(0, 0)
        if re.search(r'^BLASTN \d+\.\d+\.\d+', first_line):
            b_type = 'plain'
            if log:
                ml.info('Inferred BLAST format: txt.')
        elif re.search(r'<\?xml version', first_line):
            b_type = 'xml'
            if log:
                ml.info('Inferred BLAST format: xml.')
        else:
            if log:
                ml.error(
                    'Could not guess the BLAST format, preferred format is NCBI xml.'
                )
            return None

    if b_type == 'plain':
        return list(blast_parse_txt(handle))
    if b_type == 'xml':
        return list(NCBIXML.parse(handle))

    if log:
        ml.error('BLAST type not known: allowed types: plain, xml, guess')
    return None
def runBlastParser(cline, bOutFile):
    """Run a blastp command line and return a parser over its XML output.

    Args:
        cline: The command to run; its string form must contain the path of
            the ``blastp`` executable found on ``PATH``.
        bOutFile: Path of the XML file the command writes.

    Returns:
        The lazy iterator returned by ``NCBIXML.parse``.

    Raises:
        Exception: If ``blastp`` is not on ``PATH`` or its path does not
            appear in ``cline``.
    """
    blastp_path = shutil.which("blastp")
    # shutil.which returns None when blastp is missing; comparing str(None)
    # would let any command containing the text "None" through.
    if blastp_path is None or blastp_path not in str(cline):
        raise Exception("Blastp path not found in command line argument")

    os.system(str(cline))
    # The handle is left open on purpose: the parser reads from it lazily
    # after this function has returned.
    rec = open(bOutFile)
    return NCBIXML.parse(rec)
def parse(self, blast_record):
    """Parse the XML file behind ``blast_record`` and attach the results.

    Each record read from the file is converted with ``self._parse_record``.
    A ``ValueError`` raised while reading is printed together with the file
    name, and the records converted so far are kept.

    Returns:
        ``blast_record`` itself, with ``_records`` set to the parsed list.
    """
    parsed = []
    with open(blast_record.get_filename()) as xml_handle:
        try:
            for raw_record in NCBIXML.parse(xml_handle):
                parsed.append(self._parse_record(raw_record))
        except ValueError as error:
            print(error, blast_record.get_filename())
    blast_record._records = parsed
    return blast_record
def parse_secondary(self):
    """Write to ``unique.faa`` the sequences that have no qualifying secondary hit.

    A query from ``secondary.xml`` is "positive" when one of its
    descriptions has an e-value at or below ``self.expect_diff``. Every
    record of the FASTA file ``self.substract`` whose key is not positive is
    written to ``unique.faa``.
    """
    positives = []
    with open('secondary.xml') as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            title = ParseDefline(record.query)
            for desc in record.descriptions:
                if desc.e <= self.expect_diff:
                    positives.append(title.id)
    print(positives)

    with open(self.substract) as fasta_handle:
        hipster = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))

    unique = list(set(hipster.keys()) - set(positives))
    # FASTA output is text, so the file is opened in text mode; the context
    # manager makes sure the records are flushed to disk.
    with open('unique.faa', 'w') as handle:
        for key in unique:
            SeqIO.write(hipster[key], handle, 'fasta')
def conservation_check(blast_result_file, conservation_list, seq_name_list):
    """Score each query against each conservation label from a BLAST XML file.

    Query names are split on ``"_"``: field 1 is a 1-based row number and
    field 2 is a type tag (``HTH``, ``HTT`` or ``TTH``). A cell is updated
    only for HSPs whose hit definition contains the label of that column.

    Args:
        blast_result_file: Path to the BLAST XML results.
        conservation_list: Labels searched for in each hit definition; one
            result column per label.
        seq_name_list: Only its length is used (number of result rows).

    Returns:
        A ``len(seq_name_list)`` x ``len(conservation_list)`` list of lists
        holding 0, 1, -1 or 2.
    """
    results = [[0] * len(conservation_list) for _ in range(len(seq_name_list))]

    with open(blast_result_file) as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            if not record.alignments:
                continue
            fields = record.query.split("_")
            num = int(fields[1], 10)
            seq_type = fields[2]
            row = results[num - 1]

            for align in record.alignments:
                for hsp in align.hsps:
                    for index, label in enumerate(conservation_list):
                        if label not in align.hit_def:
                            continue
                        # s1/e2: HSP starts at the first query position and
                        #        reaches at least position 21.
                        # s2/e1: HSP ends at the last query position and
                        #        starts no later than 21 before the end.
                        s1 = hsp.query_start == 1
                        s2 = hsp.query_start <= record.query_length - 21
                        e1 = hsp.query_end == record.query_length
                        e2 = hsp.query_end >= 21

                        if seq_type == 'HTH':
                            # 1 and -1 mark one side each; seeing the other
                            # side afterwards upgrades the cell to 2.
                            if (s2 and e1) and row[index] == -1:
                                row[index] = 2
                            elif (s1 and e2) and row[index] == 1:
                                row[index] = 2
                            elif (s1 and e2) and row[index] == 0:
                                row[index] = -1
                            elif (s2 and e1) and row[index] == 0:
                                row[index] = 1
                        elif seq_type == 'HTT':
                            if s2 and e1:
                                row[index] = 1
                        elif seq_type == 'TTH':
                            if s1 and e2:
                                row[index] = -1

    print("doneconservation")
    return results
def __enter__(self):
    """Open the BLAST XML for the target and return a parser over it.

    The target's file type is guessed first. A FASTA target is turned into
    a BLAST database and searched; a BLAST database target is searched
    directly; an XML target is simply opened.
    """
    kind = self._guess_filetype(self.target)

    if kind == 'xml':
        self.file_in = open(self.target, 'r')
    elif kind in ('fasta', 'blastdb'):
        if kind == 'fasta':
            self.makeblastdb(self.target, self.dbtype, self.blastdb)
        else:
            # The target already is a BLAST database: point the command at it.
            self.cmd.db = self.target
        self.run_blast(self.cmd)
        self.file_in = open(self.blastout, 'r')

    return NCBIXML.parse(self.file_in)
def blast_file(fasta_path, blast_db='nt', parser=basic_parser):
    """BLAST the contents of a FASTA file remotely and parse every record.

    Args:
        fasta_path: Path of the FASTA file whose text is submitted.
        blast_db: Database name passed to ``NCBIWWW.qblast``.
        parser: Callable applied to each BLAST record.

    Returns:
        A list with ``parser(record)`` for each record in the results.
    """
    logging.info("Running BLAST {}".format(fasta_path))
    # Read through a context manager so the input file is closed promptly.
    with open(fasta_path, 'r') as fasta_handle:
        fasta_string = fasta_handle.read()
    logging.debug(fasta_string)

    result_handle = NCBIWWW.qblast(BLAST_PROG, blast_db, fasta_string,
                                   megablast=True)
    logging.info("BLAST returned")
    try:
        blast_records = NCBIXML.parse(result_handle)
        logging.info("Analyzed BLAST")
        # Each record is the search for one sequence from fasta_path.
        results = [parser(single_record) for single_record in blast_records]
    finally:
        result_handle.close()
    return results
def store_and_parse_blast_results(file_name, blast_output_handle_list):
    """Save BLAST results to a file, then parse that file into a reusable list.

    A BLAST handle can only be iterated once; writing the results to disk
    avoids re-running the same jobs while developing and debugging.

    Args:
        file_name: Path of the file the raw results are written to.
        blast_output_handle_list: Readable handles with raw BLAST output;
            each one is read fully and then closed.

    Returns:
        A list of the BLAST records parsed from ``file_name``.
    """
    with open(file_name, 'w') as blast_output_file:
        for blast_output_handle in blast_output_handle_list:
            try:
                blast_output_file.write(blast_output_handle.read())
            finally:
                blast_output_handle.close()

    # Materialise the records before the handle is closed.
    with open(file_name, 'r') as stored_results:
        return list(NCBIXML.parse(stored_results))
def blastp(self, fasta, pdb_db):
    """Run BLASTP locally and return the first record of the result.

    :param fasta: Path to the query FASTA file
    :param pdb_db: BLASTP database to search
    :return: The first record yielded by ``NCBIXML.parse``
    """
    logger.debug('Running blastp ' + fasta)
    run_blastp = NcbiblastpCommandline(
        cmd='blastp',
        query=fasta,
        db=pdb_db,
        evalue=0.001,
        outfmt=5,
    )
    xml_text, _stderr_text = run_blastp()
    return next(NCBIXML.parse(StringIO(xml_text)))
def special_gene(target_fie, database, gene_list):
    """BLAST a target file against a gene database and report the best hit per gene.

    Args:
        target_fie: Path of the query file (the misspelt parameter name is
            kept so existing callers continue to work).
        database: Database FASTA name; only the part after the last ``/`` is
            used and the file is read from ``database/``.
        gene_list: Gene names looked for in database descriptions and hit
            definitions.

    For every gene a line starting with ``$$$`` is printed: either the best
    scoring hit, or a message that the gene does not exist.
    """
    # The body used an undefined name `target_file`; bind it to the parameter.
    target_file = target_fie
    database = database.split("/")[-1]
    xml_file = database + '_vs_' + target_file + '.xml'

    os.system('makeblastdb -in database/' + database + ' -out ' + database +
              '_db -dbtype nucl')
    os.system('blastn -query ' + target_file + ' -db ' + database +
              '_db -out ' + xml_file + ' -outfmt 5')

    with open(xml_file) as result_handle:
        records = list(NCBIXML.parse(result_handle))

    E_thresh = 1e-10
    for x in gene_list:
        length_list = [len(entry.seq)
                       for entry in SeqIO.parse("database/" + database, "fasta")
                       if x in entry.description]
        if not length_list:
            # No database entry mentions this gene; the average length below
            # would divide by zero.
            print("$$$No  {} exists".format(x))
            continue
        aver_len = float(sum(length_list)) / len(length_list)

        hspbit = []
        alignmentlist = []
        for record in records:
            for alignment in record.alignments:
                if x in alignment.hit_def:  # multi-gene database
                    print("{} got a hit, evaluating the hit quality...".format(x))
                    score = 0
                    for hsp in alignment.hsps:
                        if hsp.expect < E_thresh:
                            score += hsp.bits
                    hspbit.append(score)
                    alignmentlist.append(alignment.hit_def + ':' + str(score))

        scorelist = dict(zip(alignmentlist, hspbit))
        score = 0
        First_Choice = None
        for Htype in scorelist:
            if scorelist[Htype] > score:
                First_Choice = Htype
                score = scorelist[Htype]

        # A hit counts only when its summed bit score reaches a tenth of the
        # average length of the gene's database entries.
        if float(score) >= 0.1 * aver_len:
            print("$$$ {} got a hit, score: {}".format(First_Choice, score))
        else:
            print("$$$No  {} exists".format(x))

    os.system("rm " + database + "_db.*")
    os.system("rm " + xml_file)
def XMLparse(Path):
    """Count strong hits per query in a BLAST XML file and pass them to writeCSV.

    For each query, the count is the number of alignments that have at least
    one HSP with an e-value at or below 1e-40. The value returned by
    ``writeCSV(Path, hitsDict)`` is printed.

    Returns:
        Always an empty list.
    """
    E_VALUE = pow(10, -40)

    hitsDict = {}
    # Closing the parser generator does not close the file, so the handle is
    # managed explicitly.
    with open(Path) as xml_handle:
        for blast_record in NCBIXML.parse(xml_handle):
            hitsDict[blast_record.query] = sum(
                1
                for alignment in blast_record.alignments
                if any(hsp.expect <= E_VALUE for hsp in alignment.hsps)
            )

    print(writeCSV(Path, hitsDict))
    return []
def Filter_Readouts_by_Genome(cand_readout_file='selected_candidates.fasta',
                              genome_db='hg38',
                              readout_folder=_readout_folder,
                              genome_folder=_genome_folder,
                              word_size=10, evalue=1000,
                              save_postfix='genome',
                              verbose=True):
    """Filter readout candidates by BLASTing them against a genome.

    Inputs:
        cand_readout_file: FASTA file of candidates; looked up in
            ``readout_folder`` when it is not an existing path.
        genome_db: name of the BLAST database inside ``genome_folder``.
        readout_folder: folder searched for ``cand_readout_file``.
        genome_folder: folder containing the BLAST database.
        word_size: BLAST word size.
        evalue: currently not used by the search (see note in the body).
        save_postfix: postfix inserted into the output FASTA name.
        verbose: print progress information.
    Outputs:
        record_keeps: list of the FASTA records that passed
            ``filter_readouts_by_blast``; also written to
            ``<cand_readout_file minus .fasta>_<save_postfix>.fasta``.
    """
    if not os.path.isfile(cand_readout_file):
        cand_readout_file = os.path.join(readout_folder, cand_readout_file)
        if not os.path.isfile(cand_readout_file):
            raise IOError(f"Wrong input candidate readout file:{cand_readout_file}, not exist.")
    # Checked on every path: without '.fasta' in the name the replace() calls
    # below are no-ops and the outputs would overwrite the input file.
    if '.fasta' not in cand_readout_file:
        raise IOError(f"Wrong input file type for {cand_readout_file}")

    # blast!
    blast_outfile = cand_readout_file.replace('.fasta', f'_{genome_db}.xml')
    # NOTE(review): the search has always run with evalue=500 although the
    # `evalue` parameter defaults to 1000; kept as-is so existing results do
    # not change -- confirm which value is intended.
    NcbiblastnCommandline(query=cand_readout_file,
                          num_threads=12,
                          db=os.path.join(genome_folder, genome_db),
                          evalue=500,
                          word_size=word_size,
                          out=blast_outfile,
                          outfmt=5)()

    # decide which to keep, reading the file BLAST has just written
    genomeblast_keeps = []
    with open(blast_outfile, 'r') as blast_handle:
        for blast_record in NCBIXML.parse(blast_handle):
            if verbose:
                print(blast_record.query_id, len(blast_record.alignments))
            genomeblast_keeps.append(
                filter_readouts_by_blast(blast_record, verbose=verbose))

    # save all
    with open(cand_readout_file, "r") as handle:
        record_keeps = [record
                        for _i, record in enumerate(SeqIO.parse(handle, "fasta"))
                        if genomeblast_keeps[_i]]

    save_filename = cand_readout_file.replace('.fasta', f'_{save_postfix}.fasta')
    with open(save_filename, "w") as output_handle:
        SeqIO.write(record_keeps, output_handle, "fasta")
    if verbose:
        print(f"-- number of readout candidates kept: {len(record_keeps)}")

    return record_keeps
def blast(seq, binary="blastp", db="blastdb/FPbase_blastdb.fsa", max_hits=30, fmt=15, **kwargs):
    """Run a bundled BLAST binary on one sequence and return its output.

    ``seq`` gets a ``>query`` header when it has none. Extra keyword
    arguments become ``-key value`` options; ``max_target_seqs`` and
    ``outfmt`` override ``max_hits`` and ``fmt``.

    Returns serialized records for ``fmt == 5``, the ``BlastOutput2`` entry
    of the JSON output for ``fmt == 15``, and the raw text otherwise.
    """
    assert binary in ("blastp", "blastx"), "Unrecognized blast binary"

    db_ready = os.path.isfile(db) and (len(os.listdir(os.path.dirname(db))) > 5)
    if not db_ready:
        make_blastdb(db)

    platform_suffix = "osx" if sys.platform == "darwin" else "nix"
    binary = f"bin/{binary}_" + platform_suffix
    max_hits = kwargs.pop("max_target_seqs", max_hits)
    fmt = kwargs.pop("outfmt", fmt)

    with tempfile.NamedTemporaryFile(suffix=".fsa") as tmp:
        fasta_text = seq if seq.startswith(">") else ">query\n" + seq
        tmp.write(fasta_text.encode())
        tmp.seek(0)

        cmd = [binary]
        cmd += ["-query", tmp.name]
        cmd += ["-outfmt", str(fmt)]
        cmd += ["-db", db]
        cmd += ["-max_target_seqs", str(max_hits)]
        cmd += ["-max_hsps", "1"]  # only one alignment for each pair
        for option, option_value in kwargs.items():
            cmd += [f"-{option}", str(option_value)]

        with tempfile.NamedTemporaryFile(suffix=".txt") as outfile:
            cmd += ["-out", outfile.name]
            run(cmd)

            if fmt == 5:
                from Bio.Blast import NCBIXML

                return [serialize_record(r) for r in NCBIXML.parse(outfile.file)]

            raw_text = outfile.file.read().decode()
            if fmt == 15:
                return json.loads(raw_text).get("BlastOutput2")
            return raw_text
def blast_offtarget(fasta_string):
    """Count off-target hits of a sequence using BLAST.

    A local ``blastn`` run against ``refseq_rna`` is tried first; when it
    raises ``ApplicationError`` the search is repeated through
    ``NCBIWWW.qblast`` restricted to taxon 9606.

    Args:
        fasta_string(str): Fasta sequence.

    Returns:
        Offtarget value(int): number of local output lines mentioning
        "Homo sapiens", or number of alignments in the remote result.
    """
    try:
        with blast_path():
            with open('fasta', 'w') as fasta_file:
                fasta_file.write(fasta_string)

            cline = NcbiblastnCommandline(
                query="fasta",
                db="refseq_rna",
                # The space after "staxids" matters: without it the two
                # adjacent literals fuse into one unknown field name.
                outfmt=("'6 qseqid sseqid evalue bitscore sgi sacc staxids "
                        "sscinames scomnames stitle'"))
            stdout, stderr = cline()

        # The species name had been corrupted to 'H**o sapiens', which can
        # never occur in BLAST output.
        blast_lines = [
            line for line in stdout.split('\n') if 'Homo sapiens' in line
        ]
        return len(blast_lines)
    except ApplicationError:
        result_handle = NCBIWWW.qblast("blastn",
                                       "refseq_rna",
                                       fasta_string,
                                       entrez_query="txid9606 [ORGN]",
                                       expect=100,
                                       gapcosts="5 2",
                                       genetic_code=1,
                                       hitlist_size=100,
                                       word_size=len(fasta_string),
                                       megablast=True)
        # Parse the returned handle directly; no intermediate copy needed.
        return sum(len(record.alignments)
                   for record in NCBIXML.parse(result_handle))
def blast(query, subj, minoverlap, logger, wd, threads):
    """Return overlap flags and positions for queries BLASTed against a subject.

    The query and subject sequences are written to temporary FASTA files in
    ``wd`` (always removed afterwards) and compared with ``blastn``.

    Returns:
        ``(bools, positions)``: one boolean per BLAST record telling whether
        its top HSP has more than ``minoverlap`` identities, and the query
        start/end of every such HSP. ``([], [])`` when BLAST fails.
    """
    query_file = os.path.join(wd, 'query.fasta')
    subj_file = os.path.join(wd, 'subj.fasta')
    SeqIO.write(query, query_file, "fasta")
    SeqIO.write(subj, subj_file, "fasta")
    try:
        # options: http://www.ncbi.nlm.nih.gov/books/NBK1763/
        cline = NcbiblastnCommandline(query=query_file, subject=subj_file,
                                      outfmt=5, cmd=blastn, word_size=8,
                                      num_threads=threads)
        logger.debug(cline)
        output = cline()[0]
    except ApplicationError:
        # TODO: work out why this happens; it does not seem to affect the
        # results, so low priority.
        return [], []
    finally:
        for temp_path in (query_file, subj_file):
            os.remove(temp_path)

    bools = []      # success flag per query
    positions = []  # start/end pairs, to avoid composite sequence problems
    for record in NCBIXML.parse(StringIO(output)):
        top_hsp = record.alignments[0].hsps[0] if record.alignments else None
        # Keep the hit only if identities exceed the minimum overlap.
        if top_hsp is not None and top_hsp.identities > minoverlap:
            bools.append(True)
            positions.extend([top_hsp.query_start, top_hsp.query_end])
        else:
            bools.append(False)
    return bools, positions
def getBLAST(arg):
    """Run a remote BLAST and print the first strong hit of the first record.

    ``arg[1]`` .. ``arg[7]`` supply the program, database, sequence, expect
    value, hit-list size, matrix name and alignment count for
    ``NCBIWWW.qblast``. The first HSP with an e-value below 0.0001 is printed
    as ``$``-separated fields; nothing is printed when there is none.
    """
    xml_handle = NCBIWWW.qblast(program=arg[1],
                                database=arg[2],
                                sequence=arg[3],
                                expect=arg[4],
                                hitlist_size=arg[5],
                                matrix_name=arg[6],
                                alignments=arg[7])

    # Only the first BLAST record is ever inspected.
    first_result = next(iter(NCBIXML.parse(xml_handle)), None)
    if first_result is None:
        return

    max_e_value = 0.0001
    for alignment in first_result.alignments:
        for hsp in alignment.hsps:
            if not hsp.expect < max_e_value:
                continue

            # Header of the BLAST result
            header = str(alignment.title)
            # Organism name
            name = header.split('[', 1)[1].split(']')[0].split('>')[0]
            protein = header.split('|')[4].split('[')[0]
            accession = alignment.title.split('|')[3]
            query_cov = float(hsp.identities) / float(len(hsp.query)) * float(100)

            fields = (name, protein, accession, hsp.expect, hsp.identities,
                      query_cov, hsp.score, hsp.bits)
            print("$".join(str(field) for field in fields))
            # A single result is reported, so stop at the first one.
            return
def parse_BLAST(blast_results, tol, careful):
    """
    Using NCBIXML parse the BLAST results, storing & returning good hits

    Here good hits are:
        * hsp.identities/float(record.query_length) >= tol

    Hits scoring below ``tol`` but at or above ``tol - careful`` are shown
    to the user, who decides interactively whether to accept them.

    :param blast_results: full path to a blast run output file (in XML
                          format); a leading ``~`` is expanded
    :param tol: the cutoff threshold (see above for explanation)
    :param careful: width of the band below ``tol`` that triggers a prompt

    :type blast_results: string
    :type tol: float
    :type careful: float

    :rtype: list of satisfying hit names

    Exits with status 1 when the results file does not exist.
    """
    # Prompt function that works under both Python 2 and Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # Open the same expanded path that is checked for existence.
    results_path = os.path.expanduser(blast_results)
    if not os.path.isfile(results_path):
        sys.stderr.write("BLAST results do not exist. Exiting.\n")
        sys.exit(1)

    hits = []
    with open(results_path) as results_handle:
        for record in NCBIXML.parse(results_handle):
            for align in record.alignments:
                for hsp in align.hsps:
                    hit_name = record.query.split(',')[1].strip()
                    cutoff = hsp.identities / float(record.query_length)
                    if cutoff >= tol:
                        hits.append(hit_name)
                    # The --careful option: ask about borderline hits.
                    elif cutoff >= tol - careful:
                        print("Please confirm this hit:")
                        print("Name,SeqFindr score,Len(align),Len(query),Identities,Gaps")
                        print("%s,%f,%i,%i,%i,%i" % (hit_name, cutoff,
                                                     hsp.align_length,
                                                     record.query_length,
                                                     hsp.identities, hsp.gaps))
                        accept = read_line("Should this be considered a hit? (y/N)")
                        answer = accept.lower()
                        if answer == 'y':
                            hits.append(hit_name)
                        elif answer not in ('', 'n'):
                            print("Input must be y, n or enter.")
                            print("Assuming n")
    return hits
def _perform_alignment(self, idx__seq_discrpt): idx, (seq, description) = idx__seq_discrpt pval = float(description.split(':')[1]) final_results = [] if pval <= self.p_value_threshold: FileUtility.create_fasta_file('../tmp/temp' + str(idx) + '.fasta', [seq], ['temp']) blastx_cline = NcbiblastnCommandline( query='../tmp/temp' + str(idx) + '.fasta', db= "/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta", evalue=0.001, outfmt=5, out='../tmp/temp' + str(idx) + '.xml') blastx_cline() f = open('../tmp/temp' + str(idx) + '.xml', 'r') blast_records = NCBIXML.parse(f) flag = False score = -1 alignment_length = -1 results = [] for blast_record in blast_records: for alignment in blast_record.alignments: for hsp in alignment.hsps: if not flag and score == -1: score = hsp.score alignment_length = hsp.align_length flag = True if hsp.score >= score and hsp.align_length >= alignment_length and 'Eukarya' not in self.ez_taxa_dict[ alignment.hit_id]: results.append( (self.ez_taxa_dict[alignment.hit_id], hsp.expect)) if len(results) > 0: res = self.lowest_certain_level(results) if res: final_results = (seq, self.refine_ez_taxonomy(res) + idx[-1], pval) else: final_results = (seq, 'ZZZNOVEL' + idx[-1], pval) else: final_results = (seq, 'ZZZNOVEL' + idx[-1], pval) return final_results
def read_blast(fn):
    """Collect near-full-length hits per query and store them in a pickle.

    The pickle is named ``alts_<fam>.pkl`` where ``<fam>`` is the part of
    the first query name before the first ``.``. An existing pickle of that
    name in the current directory is loaded and extended. For each
    alignment, HSPs are stored as ``(title, aligned query)`` until one is
    shorter than 90% of the query length.

    Nothing is written when the XML file contains no records.
    """
    alt_dict = {}
    pickle_name = None

    with open(fn, "r") as res_handle:
        for record in NCBIXML.parse(res_handle):
            if pickle_name is None:
                # The first record decides which pickle file is used.
                fam = record.query.split('.')[0]
                pickle_name = "alts_" + fam + ".pkl"
                if pickle_name in os.listdir():
                    # NOTE: pickle is only safe for files this tool wrote.
                    with open(pickle_name, "rb") as previous:
                        alt_dict = pickle.load(previous)

            alt_dict[record.query] = []
            for al in record.alignments:
                for hsp in al.hsps:
                    if (hsp.align_length and record.query_length
                            and hsp.align_length < .9 * record.query_length):
                        break
                    alt_dict[record.query].append((al.title, hsp.query))

    if pickle_name is None:
        # No records: there is no family name to build a file name from.
        return

    with open(pickle_name, "wb") as pickle_file:
        pickle.dump(alt_dict, pickle_file)
    return
def get_bsr_for_strain(bsr_records, blast_results_path, strain_name):
    """Add the best BLAST hit of every query to its BSR record.

    Args:
        bsr_records: Dict ``{match_key: BSR_Record}``; updated in place.
        blast_results_path: Path of the BLAST XML results. Records should
            carry the FASTA title as ``alignment.hit_def``.
        strain_name: Strain name passed on to ``add_hit``.

    Returns:
        The same ``bsr_records`` dict.

    Raises:
        KeyError: If a query is not a key of ``bsr_records``.
    """
    print('Getting BSR from this blast results file:', blast_results_path)
    record_count = 0
    records_with_hits = 0

    with open(blast_results_path) as results_handle:
        for record in NCBIXML.parse(results_handle):
            if record.alignments:
                records_with_hits += 1
                # Take the best hit for each query sequence.
                best_alignment = record.alignments[0]
                hsps = best_alignment.hsps[0]
                sbjct_descrpt = best_alignment.hit_def
                sbjct_seq = ''.join([nt for nt in hsps.sbjct if nt != '-'])
                best_score = hsps.score
                query_descrpt = record.query

                if query_descrpt not in bsr_records:
                    print(
                        query_descrpt,
                        'not found in bsr_records, this shouldnt happen but can result from weirdly formated FastA record descriptions.'
                    )
                    raise KeyError(query_descrpt)
                bsr_records[query_descrpt].add_hit(sbjct_descrpt, best_score,
                                                   sbjct_seq, strain_name)
            else:
                # No hit in the subject genome: still create an empty
                # placeholder in the match record.
                bsr_records[record.query].add_hit('', 0, None, strain_name)

            if record_count % 100 == 0:
                print(record_count, '... ', sep='', end='')
            record_count += 1

    print('Of {} reference sequences, {} had hits in the primary proteome.'.
          format(record_count, records_with_hits))
    return bsr_records
def diamondindex(db1, db2, outputdir, evalue=1E-30):
    """Run a DIAMOND search of db1 against db2 and parse the results.

    Input:
        db1: query protein file.
        db2: file from which the DIAMOND database is built.
        outputdir: directory in which ``BLAST_data/`` is created for the
            XML output.
        evalue: currently not used by the search (see note in the body).

    Returns:
        A dictionary keyed by query, each value a dict holding the ID of the
        first hit and the score of its top HSP.
    """
    print("db1 is.... " + db1)
    print("db2 is.... " + db2)

    # Make dir to store results
    blastoutputdir = outputdir + "/BLAST_data"
    if not isdir(blastoutputdir):
        makedirs(abspath(blastoutputdir))

    # Split on path separators and dots so the output file is named after
    # the database file names rather than their full paths.
    species1 = re.split(r'[\\/.]+', db1)[-2]
    species2 = re.split(r'[\\/.]+', db2)[-2]
    outputID = "/" + species1 + "_vs_" + species2 + ".XML"
    blastoutputfile = abspath(blastoutputdir + outputID)

    # Argument lists are run without a shell. The search was previously
    # given as one string without shell=True, which makes subprocess look
    # for a program literally named after the whole command line.
    makediamond = ["diamond", "makedb", "--in", db2, "--db", db2]
    print(" ".join(makediamond))
    subprocess.call(makediamond)

    # NOTE(review): the e-value is fixed at 1e-3 and the `evalue` parameter
    # (default 1E-30) is ignored; kept as-is -- confirm the intended value.
    rundiamond = ["diamond", "blastp", "--db", db2, "--out", blastoutputfile,
                  "--query", db1, "--outfmt", "5", "-e", "1e-3", "--quiet"]
    print(" ".join(rundiamond))
    subprocess.call(rundiamond)

    # Parse the BLAST-format XML
    iddict1 = {}
    with open(blastoutputfile, "r") as result_handle:
        for B in NCBIXML.parse(result_handle):
            if B.alignments:
                ali = B.alignments[0]
                # This is the score of the top HSP only, not of the whole hit.
                iddict1[B.query] = {
                    "hit ID " + species2: ali.hit_def,
                    "score " + species2: ali.hsps[0].score,
                }
    return iddict1
def read_in_xml(xml_inf):
    """Write the HSPs with more than 100 identities, best first, to a report.

    The report is named after ``xml_inf`` (text before ``.out``) and
    ``args.length``. Hits are ordered by descending number of identities
    and numbered from 1.
    """
    with open(xml_inf) as xml, open(
            "{}_parsed_cutoff_{}nt.out".format(
                xml_inf.split(".out")[0], args.length), "w+") as outf:
        scored_hits = []
        for entry in NCBIXML.parse(xml):
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    identities = hsp.identities
                    # NOTE(review): the cutoff is the literal 100 although
                    # the file name advertises args.length -- confirm.
                    if not identities > 100:
                        continue
                    report = "".join([
                        "alignment_length\t{}\n".format(identities),
                        "alignment_evalue\t{}\n".format(hsp.expect),
                        "alignment_score\t{}\n".format(hsp.score),
                        "query\t{}\n".format(entry.query),
                        "query_pos\t{}\t{}\n".format(
                            hsp.query_start, hsp.query_end),
                        "hsp\n",
                        "{}\n{}\n{}\n".format(
                            hsp.query, hsp.match, hsp.sbjct),
                        "match\t{}\n".format(alignment.title),
                        "query_pos\t{}\t{}\n".format(
                            hsp.sbjct_start, hsp.sbjct_end),
                        "#\n",
                    ])
                    scored_hits.append((identities, report))

        ranked = sorted(scored_hits, reverse=True)
        for hit_no, (_identities, report) in enumerate(ranked, 1):
            outf.write("hit_number\t{}\n".format(hit_no))
            outf.write(report)
def _write_row_if_recombinant(outputWriter, blast_record):
    """Write a summary row when the filtered HSPs pass the recombination check."""
    filteredHspStartEnds = FilterBlastRecord(blast_record)
    if filteredHspStartEnds and CheckPossibleRecomb(filteredHspStartEnds):
        WriteARow(outputWriter, blast_record, filteredHspStartEnds)


def BlastFastaXmlIndv(fasta_filename=None, xml_filename=None):
    """Summarise BLAST records into ``<input>.summary.tsv``.

    With ``fasta_filename`` every sequence is BLASTed remotely (blastn, nr,
    restricted to ``KM204118.1``); a ``ValueError`` triggers a retry after a
    doubling wait, and the program exits once the wait exceeds 100 seconds.
    Otherwise, with ``xml_filename``, records are read from that XML file.
    When neither argument is given nothing is done.
    """
    if fasta_filename:
        record_iterator = SeqIO.parse(fasta_filename, "fasta")
        with open(fasta_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            for seq_record in record_iterator:
                wait_time = 1
                while True:
                    print(seq_record.id)
                    try:
                        result_handle = NCBIWWW.qblast(
                            "blastn", "nr", seq_record.seq,
                            entrez_query="KM204118.1")
                        break
                    except ValueError:
                        print("Error encountered")
                        print("Trying again in " + str(wait_time) + " seconds")
                        if wait_time > 100:
                            sys.exit()
                        time.sleep(wait_time)
                        wait_time *= 2
                try:
                    _write_row_if_recombinant(outputWriter,
                                              NCBIXML.read(result_handle))
                finally:
                    result_handle.close()
    elif xml_filename:
        with open(xml_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            with open(xml_filename) as result_handle:
                for blast_record in NCBIXML.parse(result_handle):
                    _write_row_if_recombinant(outputWriter, blast_record)
def readVirusXML1(fname):
    """Find, for every query in a BLAST XML file, the hit with the lowest e-value.

    Args:
        fname: Path of the BLAST XML file.

    Returns:
        ``(virusE, virusID)``: two dicts keyed by query, holding the lowest
        e-value seen and the upper-cased title of the alignment it came
        from. Queries without any HSP are absent.
    """
    virusE = {}
    virusID = {}
    with open(fname, 'r') as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            query = blast_record.query
            for alignment in blast_record.alignments:
                subject = alignment.title.upper()
                for hsp in alignment.hsps:
                    expect = float(hsp.expect)
                    # `in` replaces dict.has_key, which Python 3 removed.
                    if query not in virusE or expect < virusE[query]:
                        virusE[query] = expect
                        virusID[query] = subject  # subject with lowest e-value
    return virusE, virusID
def blastp(self, acc):
    """BLAST an accession remotely and collect the GI numbers of its hits.

    The second ``|``-separated field of every alignment title is read as a
    GI number; those not already in ``self.gis`` are appended to it. On any
    error ``self.status[acc]`` is set to ``False`` unless already present.
    """
    try:
        gis = []
        print('here')
        result_handle = NCBIWWW.qblast("blastp", "nr", acc,
                                       format_type="XML",
                                       expect=self.blast_threshold)
        print('here')
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                gis.append(alignment.title.split("|")[1])
        unique = [int(i.strip()) for i in gis if int(i) not in self.gis]
        self.gis.extend(unique)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt and
        # SystemExit; only ordinary errors should mark the accession failed.
        self.status.setdefault(acc, False)
    return
def filter_blast(blast_result_file_name, pan_protein_file_path,
                 filtered_pan_protein_file_path, length_table):
    '''
    Write a query-length table and a FASTA of the queries longer than 20.

    input 1: blast_result_file_name -- BLAST XML results
    input 2: pan_protein_file_path -- FASTA file indexed by query name
    output 1: filtered_pan_protein_file_path -- path object; receives the
              records whose query length is above 20
    output 2: length_table -- path object; receives "name<TAB>length" for
              every query
    '''
    pan_protein_dic = SeqIO.index(str(pan_protein_file_path), "fasta")
    try:
        with filtered_pan_protein_file_path.open("w+") as out_fl, \
                length_table.open("w") as length_table_fl, \
                open(blast_result_file_name) as blast_handle:
            for record in NCBIXML.parse(blast_handle):
                query_name = record.query
                query_len = record.query_letters
                assert record.query_letters == record.query_length
                length_table_fl.write("{}\t{}\n".format(query_name, query_len))
                if query_len <= 20:
                    continue
                SeqIO.write(pan_protein_dic[query_name], out_fl, "fasta")
    finally:
        # The index keeps the FASTA file open until it is closed.
        pan_protein_dic.close()
def process_rps_output(filepath, evalue):
    """Collect the rpsblast HSPs at or below an e-value as a list of dicts.

    Each dict holds the hit ID, the domain ID, name and description
    returned by ``process_align`` for the alignment, and the HSP's e-value
    and query start/end.
    """
    results = []
    with open(filepath, "r") as fh:
        for record in NCBIXML.parse(fh):
            for align in record.alignments:
                des, d_id, name = process_align(align)
                results.extend(
                    {
                        "HitID": align.hit_id,
                        "DomainID": d_id,
                        "Name": name,
                        "Description": des,
                        "Expect": float(hsp.expect),
                        "QueryStart": int(hsp.query_start),
                        "QueryEnd": int(hsp.query_end),
                    }
                    for hsp in align.hsps
                    if hsp.expect <= evalue
                )
    return results