def blastBACTEUK(arg):
    """BLAST every FASTA record against prokaryotic and eukaryotic subsets of nr.

    For each record in the FASTA file ``arg`` two remote ``blastx`` searches
    are submitted through ``NCBIWWW.qblast``: one restricted to
    Bacteria/Archaea and one restricted to Eukaryota.  For each search the
    e-value of the first HSP of the first alignment (or ``no hit``) is
    appended as a ``name,value`` line to ``bacterial.txt`` and
    ``eukaryotic.txt`` respectively.  A record whose search or parsing fails
    is reported in ``errorlog.txt`` and skipped.

    Args:
        arg: Path of the FASTA file holding the query sequences.
    """
    search_options = dict(ncbi_gi=False, descriptions="1", alignments="1",
                          format_type="XML", hitlist_size="1")

    def result_line(name, blast_record):
        # One CSV-like line: top-hit e-value, or a "no hit" marker.
        if blast_record.descriptions:
            return name + ',' + str(blast_record.alignments[0].hsps[0].expect) + '\n'
        return name + ', no hit' + '\n'

    with open(arg) as fasta_handle, \
            open('bacterial.txt', 'a') as out, \
            open('eukaryotic.txt', 'a') as out2:
        for record in SeqIO.parse(fasta_handle, format="fasta"):
            name = record.id
            try:
                query = record.format("fasta")
                result_handleB = NCBIWWW.qblast(
                    "blastx", "nr", query,
                    entrez_query='(Bacteria[ORGN] OR Archaea[ORGN])',
                    **search_options)
                result_handleE = NCBIWWW.qblast(
                    "blastx", "nr", query,
                    entrez_query='(Eukaryota[ORGN])',
                    **search_options)
                blast_recordsB = NCBIXML.read(result_handleB)
                blast_recordsE = NCBIXML.read(result_handleE)
                if blast_recordsB.descriptions:
                    # Progress output: only records with a prokaryotic hit.
                    print(record.id)
                bacterial_line = result_line(name, blast_recordsB)
                eukaryotic_line = result_line(name, blast_recordsE)
            except Exception:
                # The original wrote `error out.write(...)`, which never
                # reached the error log; write to the log handle instead.
                with open('errorlog.txt', 'a') as errorout:
                    errorout.write('problem blasting ' + record.id + '\n')
                continue
            # Both lines are written only after both searches succeeded, so
            # the two output files stay in step with each other.
            out.write(bacterial_line)
            out2.write(eukaryotic_line)
def blast_align(fasta,blast_path,miRNA_db,mRNA_db):
    """Run blastn for ``fasta`` against the miRNA and mRNA databases.

    The XML reports are written to ``args.output + "temp_blast_miRNA.xml"``
    and ``args.output + "temp_blast_mRNA.xml"`` (``args`` is a module-level
    name), the query FASTA file is deleted, and lazy record iterators over
    both reports are returned.

    The commands are passed to ``subprocess`` as argument lists instead of
    being concatenated into a shell string, so paths containing spaces or
    shell metacharacters are handled safely.

    Args:
        fasta: Path of the query FASTA file; removed after the searches.
        blast_path: Path of the blastn executable (executable only, no
            extra arguments).
        miRNA_db: BLAST database searched with e-value cutoff 1e-3.
        mRNA_db: BLAST database searched with e-value cutoff 1e-5.

    Returns:
        Tuple ``(miRNA_records, mRNA_records)`` of ``NCBIXML.parse``
        iterators.  The underlying files stay open while they are consumed.

    Raises:
        OSError: If the BLAST executable cannot be started.
    """
    import subprocess

    def run_blastn(evalue, database, xml_path):
        # stdout is redirected into the XML file, like `> file` in a shell.
        command = [blast_path, "-task", "blastn", "-outfmt", "5",
                   "-num_threads", "6", "-evalue", evalue,
                   "-db", database, "-query", fasta]
        with open(xml_path, "w") as xml_out:
            subprocess.call(command, stdout=xml_out)

    miRNA_xml = args.output + "temp_blast_miRNA.xml"
    mRNA_xml = args.output + "temp_blast_mRNA.xml"
    run_blastn("1e-3", miRNA_db, miRNA_xml)
    run_blastn("1e-5", mRNA_db, mRNA_xml)

    try:
        os.remove(fasta)
    except OSError:
        # Best effort, like the original `rm`: a missing file is not fatal.
        pass

    miRNA_records = NCBIXML.parse(open(miRNA_xml))
    mRNA_records = NCBIXML.parse(open(mRNA_xml))
    return (miRNA_records, mRNA_records)
def fetch_indentity_from_local(seq):
    """Return protein ids whose local blastp HSPs consist only of identities.

    The sequence is written to ``tmp.fastaa`` and searched with blastp
    against the local human and rodent databases.  For every HSP in which
    the number of positives equals the number of identities, the id
    extracted from the alignment title is collected (human hits first, then
    rodent hits; an id appears once per qualifying HSP).

    Args:
        seq: Protein sequence as a string.

    Returns:
        The collected ids joined with ``";"`` (empty string if none).
    """
    def extract_prot_id(string):
        # Third '|'-separated field of the title, then its second word.
        s = string.split('|')[2]
        s = s.split(' ')[1]
        return s

    searches = (
        ('_data_/_db_/HUMAN_DB', 'blastp_human_output.xml'),
        ('_data_/_db_/RODENTS_DB', 'blastp_rodents_output.xml'),
    )

    record = SeqRecord(Seq(seq), id="tmp", name="", description="")
    SeqIO.write(record, "tmp.fastaa", "fasta")
    for database, xml_path in searches:
        NcbiblastpCommandline(query='tmp.fastaa', db=database, outfmt=5,
                              out=xml_path)()

    result = []
    for _database, xml_path in searches:
        # Close each report as soon as it has been parsed.
        with open(xml_path) as result_handle:
            b_record = NCBIXML.read(result_handle)
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.positives == hsp.identities:
                    result.append(extract_prot_id(alignment.title))
    return ";".join(result)
def bestrecipblast(org, seed, thresh=5, queue=None):
    '''Search for a reciprocal remote BLAST hit between a seed and an organism.

    The seed accession is BLASTed (remote blastp against nr) restricted to
    ``org``; every hit passing the similarity/coverage filter is then BLASTed
    back, restricted via ``seedorg[0]``.  The first hit whose reverse search
    contains ``seed`` is recorded in a dict, appended to ``dicts/<seed>`` and
    ends the search.

    Args:
        org: Organism name used in the ``[ORGN]`` Entrez restriction.
        seed: Seed accession; the part before the first '.' must end in
            five digits, because it is converted with ``int``.
        thresh: E-value passed to blastp.
        queue: Optional object with a ``put`` method that receives the result.

    Returns:
        The result dict when ``queue`` is None; otherwise the dict is put on
        ``queue`` and None is returned.
    '''
    # NOTE(review): the source arrived with its indentation collapsed; the
    # nesting below (in particular the placement of `break`) is reconstructed
    # from syntax and should be confirmed against the original file.
    seedorg=FetchUtil.fetch_organism(seed)[0]
    acclist={}
    ac=[]
    FetchUtil.fetch_fasta(seed)
    # Scratch file name: last five characters of the un-versioned accession,
    # scaled by a random factor.
    dum=str(int(int(seed.split('.')[0][-5:])*random.random()))
    # Forward search: seed against nr restricted to `org`.
    os.system('blastp -db nr -query Orthos/'+seed+'.fasta -evalue '+str(thresh)+
              ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+org+'[ORGN]\" -use_sw_tback'+
              ' -remote')
    qoutput=open('XML/'+dum+'.xml')
    parser=NCBIXML.parse(qoutput)
    for lin in parser:
        for align in lin.alignments:
            for hsp in align.hsps:
                # Keep hits with >= 40% positives over the alignment and an
                # alignment length of at least 25% of len(hsp.query).
                if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>=.25:
                    ac.append(align.title.split('|')[1])
    print("Done. Number of sequences found: "+repr(len(ac)))
    for o in ac:
        print o
        FetchUtil.fetch_fasta(o)
        # Reverse search; the same scratch XML file is overwritten.
        os.system('blastp -db nr -query Orthos/'+o+'.fasta -evalue '+str(thresh)+
                  ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+seedorg[0]+'[ORGN]\" -use_sw_tback'+
                  ' -remote')
        q1output=open('XML/'+dum+'.xml')
        parse=NCBIXML.parse(q1output)
        acc=[]
        print 'blasted'
        for lin in parse:
            for align in lin.alignments:
                for hsp in align.hsps:
                    # Same filter as above, except the coverage bound is
                    # strict (> .25) here.
                    if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>.25:
                        acc.append(align.title.split('|')[1])
                    else:
                        continue
        print "Done. Number of sequences found: "+repr(len(acc))
        if seed in acc:
            print 'it\'s twue!'
            name=FetchUtil.fetch_organism(o)[0]
            # Value: [hit accession, rank of hit in forward list, rank of
            # seed in reverse list], ranks formatted as "position/total".
            try:
                acclist[name]=[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]
            except KeyError:
                # NOTE(review): assigning into a plain dict does not raise
                # KeyError, so this branch looks unreachable.
                acclist.update({name:[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]})
            open('dicts/'+seed,'a').write(str(acclist)+'\n')
            break
    #elapsed=time.time()-start
    #print "Time elapsed: "+time.strftime('%M:%S',[elapsed])
    if queue is not None:
        queue.put(acclist)
    else:
        return acclist
def main():
    """Report queries with a miRNA hit followed by a downstream mRNA hit.

    Reads the two BLAST XML reports named by ``args.mi_xml`` and
    ``args.m_xml`` in lockstep.  For each query, HSPs below the miRNA
    e-value cutoff are collected; mRNA HSPs are collected only if they are
    below the mRNA cutoff and start after the smallest miRNA alignment end
    (initialised to 150).  A query is written to ``args.output`` when it has
    both kinds of hit.  Finally the number of queries, queries with miRNA
    hits and queries with mRNA hits is printed.

    All three files are opened with context managers, so the output is
    flushed and closed even when processing stops early.
    """
    def format_hsp(alignment, hsp):
        # One tab-separated report line per HSP.
        fields = [hsp.query_start, hsp.query_end, alignment.title, hsp.sbjct,
                  hsp.sbjct_start, hsp.sbjct_end, hsp.expect, hsp.score]
        return "\t".join(str(f) for f in fields) + '\n'

    n = 0          # total number of query sequences
    align_mi = 0   # queries with at least one miRNA alignment
    align_m = 0    # queries with at least one accepted mRNA alignment
    args = ParseArg()
    # itertools.izip exists only on Python 2; the builtin zip is already
    # lazy on Python 3.
    zip_records = getattr(itertools, 'izip', zip)

    with open(args.mi_xml) as miRNA_result, \
            open(args.m_xml) as mRNA_result, \
            open(args.output, 'w') as output:
        miRNA_records = NCBIXML.parse(miRNA_result)
        mRNA_records = NCBIXML.parse(mRNA_result)

        # E-values: 0 selects the built-in defaults.
        if args.evalue == 0:
            evalue_mi = 1e-5
            evalue_m = 1e-15
        else:
            evalue_mi = float(args.evalue[0])
            evalue_m = float(args.evalue[1])

        for mi_record, m_record in zip_records(miRNA_records, mRNA_records):
            has_mi = False
            has_m = False
            mi_end = 150  # smallest miRNA alignment end in the query so far
            n = n + 1
            if mi_record.query != m_record.query:
                sys.stderr.write("The two query seqs from miRNA and mRNA "
                                 "results are not matched!\n")
                break

            rows = [mi_record.query + '\n']
            for alignment in mi_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < evalue_mi:
                        has_mi = True
                        rows.append(format_hsp(alignment, hsp))
                        mi_end = min(mi_end,
                                     max(hsp.query_start, hsp.query_end))
            if not has_mi:
                mi_end = 0

            for alignment in m_record.alignments:
                for hsp in alignment.hsps:
                    starts_after_mi = min(hsp.query_start, hsp.query_end) > mi_end
                    if hsp.expect < evalue_m and starts_after_mi:
                        has_m = True
                        rows.append(format_hsp(alignment, hsp))

            if has_mi and has_m:
                output.write(''.join(rows))
            if has_mi:
                align_mi += 1
            if has_m:
                align_m += 1

    print("%s %s %s" % (n, align_mi, align_m))
def parseBlastResult(fileName):
    """Summarise a BLAST XML report as one tuple per (query, hit) pair.

    Args:
        fileName: Path of the BLAST XML file.

    Returns:
        A list of ``(query, hit_id, max_identity, hit_def)`` tuples, where
        ``max_identity`` is the best HSP identity of that hit as an integer
        percentage.  A query without any alignment contributes a single
        ``(query, "-", 0, "-")`` tuple.
    """
    results = []
    # The context manager closes the report even if parsing fails.
    with open(fileName) as handle:
        for record in NCBIXML.parse(handle):
            rec_id = str(record.query)
            if not record.alignments:
                results.append((rec_id, "-", 0, "-"))
                continue
            for algn in record.alignments:
                best_fraction = max(
                    hsp.identities / float(hsp.align_length)
                    for hsp in algn.hsps
                )
                max_identity = int(best_fraction * 100)
                results.append((rec_id, algn.hit_id, max_identity,
                                algn.hit_def))
    return results
def blastdemo(genbankID):
    """BLAST an identifier against SwissProt online and print strong hits.

    Runs a remote ``blastp`` search through ``NCBIWWW.qblast``, prints every
    HSP whose e-value is below a very strict threshold, and finally prints
    the length of every alignment.

    The print calls use a single pre-formatted string so the output is the
    same on Python 2 and Python 3 (``print ("a:", b)`` prints a tuple under
    Python 2).

    Args:
        genbankID: Sequence or identifier accepted by ``NCBIWWW.qblast``.
    """
    # NB: to scale this up the search must be run locally on a cluster.
    result_handle = NCBIWWW.qblast("blastp", "swissprot", genbankID)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    # Deliberately tiny threshold: only near-certain matches are shown.
    E_VALUE_THRESH = 0.00000000000000001

    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print("****Alignment****")
                print("sequence: %s" % (alignment.title,))
                print("length: %s" % (alignment.length,))
                print("e value: %s" % (hsp.expect,))
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print("\n")

    for a in blast_record.alignments:
        print(a.length)
def blastparse(blast_handle, genome, gene):
    """Mark ``gene`` as present in ``genome`` when a good BLAST match exists.

    Iterates over every HSP of the BLAST XML in ``blast_handle``.  While
    ``plusdict[genome][gene]`` is still an empty list, an HSP whose
    identities cover at least 70% of the alignment's ``length`` appends
    ``"+"`` to it.

    Uses the module-level names ``plusdict`` (result store), ``threadlock``
    (guards updates to ``plusdict``) and ``dotter`` (called once per
    invocation).

    Args:
        blast_handle: Open handle on BLAST XML output.
        genome: First-level key into ``plusdict``.
        gene: Second-level key into ``plusdict``.
    """
    global plusdict
    records = NCBIXML.parse(blast_handle)
    dotter()
    for record in records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                threadlock.acquire()
                try:
                    identity = abs(float(hsp.identities) / alignment.length)
                    if plusdict[genome][gene] == [] and identity >= 0.7:
                        plusdict[genome][gene].append("+")
                finally:
                    # Always release: an exception above (missing key, zero
                    # length) must not leave the lock held.
                    threadlock.release()
def __init__(self, fhand, subj_def_as_accesion=None):
    """Prepare a parser for a BLAST XML file.

    Args:
        fhand: Seekable handle on the BLAST XML output.
        subj_def_as_accesion: When not None, overrides the automatically
            chosen ``use_subject_def_as_accession`` flag.

    Raises:
        ValueError: If the first ten characters of a non-empty file do not
            contain ``xml``.
    """
    import re

    fhand.seek(0, 0)
    sample = fhand.read(10)
    if sample and 'xml' not in sample:
        raise ValueError('Not a xml file')
    fhand.seek(0, 0)
    self._blast_file = fhand
    metadata = self._get_blast_metadata()
    blast_version = metadata['version']
    plus = metadata['plus']
    self.db_name = metadata['db_name']
    self._blast_file.seek(0, 0)

    # Compare versions numerically.  A plain string comparison ranks
    # '2.2.9' above '2.2.21' and '2.10.0' below it.
    version_numbers = tuple(
        int(part) for part in re.findall(r'\d+', blast_version or ''))
    newer_than_2_2_21 = version_numbers > (2, 2, 21)

    self.use_query_def_as_accession = True
    self.use_subject_def_as_accession = bool(
        blast_version and (plus or newer_than_2_2_21))

    if subj_def_as_accesion is not None:
        self.use_subject_def_as_accession = subj_def_as_accesion

    # We use the Biopython parser.  If there are no results (the file does
    # not start with '<') the parse result stays None.
    self._blast_parse = None
    if fhand.read(1) == '<':
        fhand.seek(0)
        self._blast_parse = NCBIXML.parse(fhand)
def blast_align(fasta,blast_path,linker_db):
    """Run blastn for ``./temp/<fasta>`` against the linker database.

    The XML report is written to ``./temp/<name>_blast_linker.xml`` where
    ``<name>`` is the part of ``fasta`` before its first ``.``.  The command
    is passed to ``subprocess`` as an argument list instead of a
    concatenated shell string, so unusual file names cannot be
    misinterpreted by the shell.

    Args:
        fasta: File name (inside ``./temp/``) of the query FASTA file.
        blast_path: Path of the blastn executable (executable only, no
            extra arguments).
        linker_db: BLAST database holding the linker sequences.

    Returns:
        A lazy ``NCBIXML.parse`` iterator over the report.  The underlying
        file stays open while the iterator is consumed; neither the query
        nor the report file is deleted.

    Raises:
        OSError: If the BLAST executable cannot be started.
    """
    import subprocess

    fasta_name = fasta.split(".")[0]
    xml_path = "./temp/" + fasta_name + "_blast_linker.xml"
    command = [blast_path, "-task", "blastn", "-outfmt", "5",
               "-num_threads", "6", "-evalue", "0.1",
               "-db", linker_db, "-query", "./temp/" + fasta]
    # stdout is redirected into the XML file, like `> file` in a shell.
    with open(xml_path, "w") as xml_out:
        subprocess.call(command, stdout=xml_out)

    linker_records = NCBIXML.parse(open(xml_path))
    return (linker_records)
def get_fancy_results_list(self, blast_results, num_results = 20):
    """Build ``b6lib.B6Entry`` objects for the top hits of the first query.

    Only the first record of the BLAST XML is read (the rest of the file is
    no longer parsed), and for each hit only its first HSP is used.

    Args:
        blast_results: Open handle on BLAST XML output; it is closed before
            returning, on success and on error.
        num_results: Maximum number of hits to return.

    Returns:
        A list of at most ``num_results`` entries; empty when the report
        contains no records or no alignments.
    """
    blast_results_list = []
    try:
        blast_record = next(iter(NCBIXML.parse(blast_results)), None)
        alignments = [] if blast_record is None else blast_record.alignments
        for alignment in alignments[:max(num_results, 0)]:
            hsp = alignment.hsps[0]

            entry = b6lib.B6Entry()
            entry.q_len = int(blast_record.query_length)
            entry.query_length = entry.q_len
            entry.hit_def = alignment.hit_def
            entry.subject_id = entry.hit_def
            entry.accession = alignment.accession
            entry.ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % entry.accession
            entry.hsp_query = hsp.query
            entry.hsp_match = hsp.match
            entry.hsp_subject = hsp.sbjct
            # '|' marks an identical position in the match line.
            entry.identity = hsp.match.count('|') * 100.0 / len(entry.hsp_query)
            entry.coverage = len(hsp.query) * 100.0 / entry.query_length
            blast_results_list.append(entry)
    finally:
        try:
            blast_results.close()
        except Exception:
            # Closing is best effort; the handle may not be closable.
            pass
    return blast_results_list
def parse_blast_XML(blast_xml, query_length=390):
    """Yield one ``BlastResult`` per hit of a BLAST XML report.

    For every alignment the protein sequence is downloaded from Entrez in
    FASTA format, the organism is taken from the ``[...]`` part of the FASTA
    header, and the smallest HSP e-value below 1 is kept (1 if there is
    none).

    Changes from the previous version: the sequence is no longer corrupted
    by a duplicated ``sequence += line``; coverage is a true fraction
    instead of an integer division by a hard-coded 390; a one-word or
    missing organism no longer raises; file and Entrez handles are closed.

    Args:
        blast_xml: Path of the BLAST XML file.
        query_length: Length of the query sequence used to compute coverage.
            Defaults to 390, the value that used to be hard-coded.

    Yields:
        ``BlastResult(hit_id[1], species, sequence, evalue, coverage)``.
    """
    with open(blast_xml, 'r') as blast_xml_op:
        for record in NCBIXML.parse(blast_xml_op):
            for align in record.alignments:
                hit_id = align.hit_id.split("|")
                coverage = align.length / float(query_length)
                prev_eval = min([hsp.expect for hsp in align.hsps] + [1])

                # NOTE(review): the whole split hit id is passed as `id`,
                # as before; confirm whether only hit_id[1] was intended.
                efetch = Entrez.efetch(db="protein", id=hit_id, rettype="fasta")
                id_info = ""
                sequence = ""
                try:
                    for line in efetch:
                        line = line.rstrip()
                        if line.startswith(">"):
                            # A new header restarts the sequence.
                            id_info = line
                            sequence = ""
                        else:
                            sequence += line
                finally:
                    efetch.close()

                organism = id_info[id_info.find("[") + 1:id_info.find("]")].split()
                if len(organism) >= 2:
                    species = str(organism[0] + "_" + organism[1])
                elif organism:
                    species = str(organism[0])
                else:
                    species = ""
                yield BlastResult(hit_id[1], species, sequence, prev_eval, coverage)
def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh):
    """Print a tab-separated summary of the top hit of every query.

    After a header line, one row is printed per query.  The row carries the
    top hit's id and definition, percentage identity (identities summed over
    all HSPs of the top hit divided by their summed alignment length), query
    length, summed alignment length and the e-value of the first HSP.  When
    the query has no hit, or the top hit fails the e-value, identity or
    alignment-length cutoff, only the query id is printed, followed by empty
    columns.

    Args:
        result_file: Path of the BLAST XML file.
        e_val_thresh: The first HSP's e-value must be below this value.
        ident_thresh: The percentage identity must be above this value.
        align_thresh: The summed alignment length must be above this value.
    """
    result_handle = open(result_file, 'r')
    print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value')
    for record in NCBIXML.parse(result_handle):
        query_id = record.query
        blank_row = '%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')

        # Guard: query without any hit.
        if not len(record.alignments) > 0:
            print(blank_row)
            continue

        best_hit = record.alignments[0]
        e_val = best_hit.hsps[0].expect
        # Guard: top hit not below the e-value cutoff.
        if not e_val < e_val_thresh:
            print(blank_row)
            continue

        tot_ident = sum([hsp.identities for hsp in best_hit.hsps])
        query_len = record.query_length
        align_len = sum([hsp.align_length for hsp in best_hit.hsps])
        pct_ident = tot_ident/float(align_len)*100
        top_hit = best_hit.hit_id + best_hit.hit_def

        if pct_ident > ident_thresh and align_len > align_thresh:
            print('%s\t%s\t%f\t%i\t%i\t%s' % (query_id, top_hit, pct_ident, query_len, align_len, str(e_val)))
        else:
            print(blank_row)
    result_handle.close()
def blast_xml_to_gff3(file_in,file_out,blast_type):
    """Convert a BLAST XML report into a GFF3 file of ``match_part`` rows.

    For every query only the first HSP (in report order) with an e-value
    below 0.04 is written.  The strand column is ``?`` when ``hsp.strand[0]``
    is None and the phase column is ``.`` when ``hsp.frame[0]`` is None;
    otherwise their string values are used.  ``;`` and spaces in the
    alignment title are replaced by ``_`` in the attributes column.

    Args:
        file_in: Path of the BLAST XML file.
        file_out: Path of the GFF3 file to create.
        blast_type: Value written to the GFF3 source column.
    """
    E_VALUE_THRESH = 0.04
    # Both files are closed by the context manager, also on errors.
    with open(file_in) as result_handle, open(file_out, "w") as f:
        f.write("##gff-version 3" + "\n")
        for blast_record in NCBIXML.parse(result_handle):
            first_hit = next(
                ((alignment, hsp)
                 for alignment in blast_record.alignments
                 for hsp in alignment.hsps
                 if hsp.expect < E_VALUE_THRESH),
                None,
            )
            if first_hit is None:
                continue
            alignment, hsp = first_hit

            strand = "?" if hsp.strand[0] is None else str(hsp.strand[0])
            frame = "." if hsp.frame[0] is None else str(hsp.frame[0])
            title = alignment.title.replace(";", "_").replace(" ", "_")
            attributes = ("ID=" + blast_record.query + ":" + title + ";"
                          + "Parent=" + blast_record.query + ";"
                          + "Name=blast_hsp;"
                          + "Alias=" + title)
            columns = [
                blast_record.query,
                str(blast_type),
                "match_part",
                str(hsp.query_start),
                str(hsp.query_end),
                str(hsp.score),
                strand,
                frame,
                attributes,
            ]
            f.write("\t".join(columns) + "\n")
def get_gb_info(self, resultshandle):
    """Collect id, subject coordinates and frame for every BLAST hit.

    Each alignment in the report is described by its first HSP only.  The
    id is the second-to-last ``|``-separated field of the alignment title
    (the title ends with a trailing ``|``).  Results are appended to
    ``self.gb_ids``, ``self.hit_coords`` and ``self.hit_directions``; the
    handle is closed afterwards.
    """
    for blast_record in NCBIXML.parse(resultshandle):
        for hit in blast_record.alignments:
            leading_hsps = hit.hsps[:1]
            if leading_hsps:
                top_hsp = leading_hsps[0]
                # Start and end of the hit on the subject sequence.
                hit_coords = (top_hsp.sbjct_start, top_hsp.sbjct_end)
                hit_gbid = hit.title.split('|')[-2]
                # Relative directions of the sequences.
                hit_directions = top_hsp.frame
            # Tack the id, coordinates and directions onto the lists.
            self.gb_ids.append(hit_gbid)
            self.hit_coords.append(hit_coords)
            self.hit_directions.append(hit_directions)
    # Finished with this file.
    resultshandle.close()
    return
def create_rel(self, XMLin):
    """Relate each query name to the regions that have to be masked.

    Every HSP contributes its query interval, with start and end swapped
    when reversed.  Overlapping or touching intervals of the same query are
    merged.

    Args:
        XMLin: Open handle on BLAST XML output.

    Returns:
        A dict mapping the first word of each query name to a sorted list
        of merged ``(start, end)`` tuples.  Queries without any HSP are
        absent.
    """
    bat1 = {}
    for b_record in NCBIXML.parse(XMLin):
        seq_name = b_record.query.split(" ")[0]
        for alin in b_record.alignments:
            for hsp in alin.hsps:
                qs, qe = sorted((hsp.query_start, hsp.query_end))
                bat1.setdefault(seq_name, set()).add((qs, qe))

    # Sort and merge overlapping segments.
    for seq_name in list(bat1):
        joined_cols = []
        for qs, qe in sorted(bat1[seq_name]):
            if joined_cols and joined_cols[-1][1] >= qs:
                last_qs, last_qe = joined_cols[-1]
                # Keep the larger end: a segment lying inside the previous
                # one must not shrink the merged region.
                joined_cols[-1] = (last_qs, max(last_qe, qe))
            else:
                joined_cols.append((qs, qe))
        bat1[seq_name] = joined_cols
    return bat1
def include_check(blast_result_filename,include_line,seq_name_list,max_mismatch):
    """Find the queries that are conserved in every requested organism.

    ``include_line`` is split on ``' OR '``; each part must contain
    ``(taxid:<number>)``.  For every taxid the BLAST XML file
    ``blast_result_filename + <number>`` is read.  A query counts as
    conserved for that taxid when at least one HSP is a perfect match
    (identities equal to the alignment length) covering at least
    ``query_letters - max_mismatch`` positions.

    Each taxid is now recorded at most once per query (the previous version
    appended it once per matching HSP, which broke the final count), and the
    alignment length is read from ``hsp.align_length``.

    Args:
        blast_result_filename: Common prefix of the per-taxid XML files.
        include_line: Organism expression, e.g. ``"A (taxid:1) OR B (taxid:2)"``.
        seq_name_list: Unused; kept for interface compatibility.
        max_mismatch: Number of query positions allowed to be uncovered.

    Returns:
        A dict mapping each query conserved in all organisms to ``1``.
    """
    strlist = str(include_line).split(' OR ')
    results = {}  # query name -> taxids in which the query is conserved

    for valist in strlist:
        txid_num = valist[valist.find('(taxid:') + 7:valist.find(')')]
        found = set()  # queries already accepted for this taxid
        with open(blast_result_filename + txid_num, "r") as blast_result_file:
            for record in NCBIXML.parse(blast_result_file):
                name = record.query
                if name in found:
                    continue
                min_len = record.query_letters - max_mismatch
                conserved = any(
                    hsp.identities == hsp.align_length
                    and hsp.identities >= min_len
                    for align in record.alignments
                    for hsp in align.hsps
                )
                if conserved:
                    found.add(name)
                    results.setdefault(name, []).append(txid_num)

    len_organ = len(strlist)
    results_final = {}
    for name, txids in results.items():
        if len(txids) == len_organ:
            results_final[name] = 1
    return (results_final)
def BLAST_to_BRIG(BLASTfile, resultsFile):
    """Write a ten-column tab-separated table from a BLAST XML report.

    Columns: query, hit definition, identities / alignment.length (rounded
    to two decimals), score, alignment.length, alignment.length minus
    identities, query start, query start + alignment.length, subject start,
    query start + alignment.length.

    Args:
        BLASTfile: Path of the BLAST XML file.
        resultsFile: Path of the table to create.
    """
    # Both files are closed by the context manager, also on errors.
    with open(BLASTfile) as rec, open(resultsFile, "w") as tabFile:
        for blast_record in NCBIXML.parse(rec):
            for alignment in blast_record.alignments:
                for match in alignment.hsps:
                    length = int(alignment.length)
                    query_end = int(match.query_start) + length
                    tabFile.write(
                        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                        % (
                            blast_record.query,
                            alignment.hit_def,
                            round(float(match.identities) / float(alignment.length), 2),
                            int(match.score),
                            alignment.length,
                            length - int(match.identities),
                            match.query_start,
                            query_end,
                            match.sbjct_start,
                            # NOTE(review): the last column repeats the
                            # query-based end; a subject-based end may have
                            # been intended.  Left unchanged.
                            query_end,
                        )
                    )
                    # NOTE(review): the source lost its indentation; this
                    # `break` is assumed to belong to the HSP loop (first
                    # HSP of each alignment only).  Please confirm.
                    break
def run_blastp(match, blastdb):
    """Run blastp for the protein FASTA of every feature of ``match``.

    Features whose ``protein_fasta()`` is empty are skipped.  For the others
    a blastp process is started, the FASTA text is written to its standard
    input and the first record of its XML output is parsed.  Failures to
    start the process (``OSError``) or to parse the output (``ValueError``)
    are logged as warnings and the feature is skipped.

    NOTE(review): the parsed record ``rec`` is neither used nor returned in
    the visible code.

    Args:
        match: Object with a ``features`` iterable; each feature provides
            ``protein_fasta()``.
        blastdb: BLAST protein database to search.
    """
    from Bio.Blast.Applications import NcbiblastpCommandline

    for feature in match.features:
        rec = None
        fasta = feature.protein_fasta()
        if fasta == "":
            continue
        pipe = None
        try:
            cline = NcbiblastpCommandline(db=blastdb, outfmt=5, num_threads=4)
            pipe = subprocess.Popen(
                str(cline),
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            pipe.stdin.write(fasta)
            pipe.stdin.close()
            recs = NCBIXML.parse(pipe.stdout)
            rec = next(recs)
        except OSError as err:
            logging.warning("Failed to run blastp: %s", err)
            continue
        except ValueError as err:
            logging.warning("Parsing blast output failed: %s", err)
            continue
        finally:
            if pipe is not None:
                # Close every pipe (closing twice is harmless) and reap the
                # child so no zombie processes or descriptors are left.
                for stream in (pipe.stdin, pipe.stdout, pipe.stderr):
                    stream.close()
                pipe.wait()
def blastpSp(sp, db, evalue=0.0001):
    """Run a local blastp for ``sp`` and return the ids of all hits.

    The sequence returned by ``seq(sp)`` is written to a FASTA file in a
    fresh temporary directory and searched with blastp; whatever blastp
    prints on stdout/stderr is echoed.  The temporary directory is always
    removed, also when the search or the parsing fails.

    Args:
        sp: Identifier passed to ``seq`` to obtain the query sequence.
        db: BLAST protein database to search.
        evalue: E-value cutoff passed to blastp.

    Returns:
        For every alignment, the second ``|``-separated field of its title.
    """
    import shutil

    directory = tempfile.mkdtemp()
    try:
        fastaFile = '%s/seq.fasta' % directory
        resultFile = '%s/result.xml' % directory
        with open(fastaFile, 'w') as wf:
            wf.write('>query\n%s' % seq(sp))

        blastp = NcbiblastpCommandline(query=fastaFile, db=db, evalue=evalue,
                                       outfmt=5, out=resultFile)
        stdout, stderr = blastp()
        print(stdout, end='', sep='')
        print(stderr, end='', sep='')

        with open(resultFile) as result_handle:
            blast_record = NCBIXML.read(result_handle)
    finally:
        # rmtree removes exactly this directory; os.removedirs would also
        # prune empty parent directories.
        shutil.rmtree(directory, ignore_errors=True)

    return [align.title.split('|')[1] for align in blast_record.alignments]
def run (self, input_seq):
    """Run BLAST on ``input_seq`` and return the top alignments.

    The command ``<self.blast_path> -db <self.blastdb> -outfmt 5`` is
    started and the sequence is supplied on standard input (through a
    temporary file on Windows, through a pipe elsewhere).  Anything BLAST
    writes to stderr is logged at debug level.

    Args:
        input_seq: Query; ``str(input_seq)`` is sent to BLAST.

    Returns:
        A list of ``(hit_id, alignment)`` tuples for the first
        ``self.top_results`` alignments; empty when BLAST produced no
        output.
    """
    output = []
    command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)

    # Windows has problems with Popen and PIPE for stdin.
    if sys.platform == 'win32':
        # The context manager closes (and thereby deletes) the temp file.
        with tempfile.NamedTemporaryFile() as tmp:
            logger.debug("Running Blast with sequence: {}".format(input_seq))
            tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
            tmp.seek(0)
            blast = Popen(command, universal_newlines=True, stdin=tmp,
                          stdout=PIPE, stderr=PIPE)
            (blast_out, blast_err) = blast.communicate()
    else:
        # Rest of the world.
        blast = Popen(command, universal_newlines=True, shell=True,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (blast_out, blast_err) = blast.communicate(input=str(input_seq))

    if len(blast_err) != 0:
        logger.debug(blast_err)

    # Whitespace-only output (including the empty string produced when
    # BLAST fails) means there is nothing to parse.
    if blast_out.strip():
        result = NCBIXML.read(StringIO(blast_out))
        for aln in result.alignments[:self.top_results]:
            logger.debug("Looping over alignments, current hit: {}".format(aln.hit_id))
            output.append((aln.hit_id, aln))
    return output
def parse_blast_xml(xml_filename, query_filename, output_filename, abundance_filename=None):
    """Write one summary line per query from a BLAST XML report.

    Only the first HSP of the first alignment of each query is used.  The
    queries in ``query_filename`` must be in the same order as the records
    in ``xml_filename``, because both are consumed in lockstep.

    Output format:
        ID \\t COUNT \\t LENGTH \\t AMBIG \\t QSTART \\t QEND \\t IDEN

    Args:
        xml_filename: Path of the BLAST XML file.
        query_filename: Path of the FASTA file used as BLAST query.
        output_filename: Path of the tab-separated file to create.
        abundance_filename: Optional two-column tab-separated file mapping
            sequence ids to counts; without it every count is 1.
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        with open(abundance_filename) as abundance_fh:
            abundance = dict(line.strip().split('\t') for line in abundance_fh)

    # All files are closed by the context manager, also on errors.
    with open(xml_filename) as xml_fh, \
            open(output_filename, 'w') as f, \
            open(query_filename) as h:
        handle = NCBIXML.parse(xml_fh)
        f.write("ID\tCOUNT\tLENGTH\tAMBIG\tQSTART\tQEND\tIDEN\n")
        for r in SeqIO.parse(h, 'fasta'):
            ambig = r.seq.count('N') + r.seq.count('?')
            blastout = next(handle)
            if len(blastout.alignments) == 0:
                # No match was found.
                f.write("{id}\t{count}\t{len}\t{ambig}\tNA\tNA\tNA\n".format(
                    id=r.id, count=abundance[r.id], len=len(r.seq), ambig=ambig))
            else:
                hsp = blastout.alignments[0].hsps[0]
                f.write("{id}\t{count}\t{len}\t{ambig}\t{qs}\t{qe}\t{iden}\n".format(
                    id=r.id, len=len(r.seq), qs=hsp.query_start, qe=hsp.query_end,
                    iden=hsp.identities, count=abundance[r.id], ambig=ambig))
def write_flanks(rbase,flanksfile):
    '''Write flanking regions of 3'-end BLAST hits to a FASTA file.

    Every sub-directory of ``rbase`` whose name starts with ``REL`` or
    ``RM`` must contain ``results.xml`` (BLAST XML for a single query) and
    ``scaffolds.fasta``.  For every HSP with an e-value of at most 1e-10
    whose query end equals the query length, the flank returned by
    ``get_flank`` is stored as a record named ``<directory>_flank``.

    Args:
        rbase: Directory holding the de novo assembly sub-directories.
        flanksfile: Path of the FASTA file to create.
    '''
    flank_record_list = []
    # Iterate over BLASTs against de novo assemblies.
    denovo_dirs = [x for x in listdir(rbase) if x.startswith(('REL', 'RM'))]
    for mygenome in denovo_dirs:
        myfulldir = join(rbase, mygenome)
        subject_seq = join(myfulldir, "scaffolds.fasta")
        # Close each report as soon as it has been parsed.
        with open(join(myfulldir, "results.xml")) as result_h:
            blast_record = NCBIXML.read(result_h)
        query_length = int(blast_record.query_letters)
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect > 0.0000000001:
                    continue  # skip bad hits
                if hsp.query_end != query_length:
                    continue  # hit does not reach the 3' end of the query
                my_flank_seq = get_flank(alignment, hsp, subject_seq)
                flank_record_list.append(
                    SeqRecord(seq=my_flank_seq, id=mygenome + '_flank'))
    with open(flanksfile, 'w') as flanks_outhandle:
        SeqIO.write(flank_record_list, flanks_outhandle, format="fasta")
def main():
    """BLAST every record of ``gg_pre_mirna.fasta`` and collect the hits.

    Each record is written to ``Temp.txt`` and searched on its own with
    ``blastn -task blastn-short`` against ``Input/gg_db``.  For every record
    a separator block with its id is written to ``results.fasta``, followed
    by whatever ``parsefile`` writes for each BLAST record.

    The previous version wrote ``Temp.txt`` but then used the complete FASTA
    file as query on every iteration, so each record's section contained
    the hits of all records.
    """
    records = SeqIO.parse('gg_pre_mirna.fasta', 'fasta')
    with open('results.fasta', 'w') as writer:
        # Header of the results file.
        writer.write('Organism_name' + '\t' + 'Query_start' + '\t' + 'Query_end' + '\t' + 'Subject_start' + '\t' + 'Subject_end' + '\r')
        for record in records:
            # The query file must be closed (flushed) before BLAST reads it.
            with open('Temp.txt', 'w') as tempWriter:
                tempWriter.write('>' + record.id + '\n')
                tempWriter.write(str(record.seq) + '\n')
            os.system('blastn -task blastn-short -query Temp.txt -db Input/gg_db -out BLAST_result.xml -outfmt "5" ')
            with open('BLAST_result.xml') as result_handle:
                writer.write('\r' + '*****' + '\r')
                writer.write(record.id + '\r' + '*****' + '\r')
                for blast_record in NCBIXML.parse(result_handle):
                    parsefile(blast_record, writer)
    print('Finished!')
def findOffTargets (refSeq,sgRNAseq):
    """Return BLAST HSPs of an sgRNA that look like off-target sites.

    ``sgRNAseq`` is written to ``temp.fasta`` and searched with
    ``blastn-short`` against the ``testdb`` database.  Only HSPs of the
    first alignment are considered.  An HSP is kept when its query end is at
    least 20, the last five match characters are all ``|``, and ``refSeq``
    contains ``GG`` (subject end greater than subject start) or ``CC``
    (otherwise) at the checked offset from the subject end.

    Args:
        refSeq: Reference sequence as a string.
        sgRNAseq: Text written to the query file as-is (plus a newline).

    Returns:
        The list of accepted HSP objects (empty if there is no alignment).
    """
    candidates = []
    # Text mode: the payload is a str, which cannot be written to a file
    # opened with 'wb' on Python 3.
    with open('temp.fasta', 'w') as f:
        f.write(sgRNAseq + '\n')

    cline = NcbiblastnCommandline(query="temp.fasta", db="testdb", outfmt=5,
                                  out="temp.xml", task='blastn-short')
    cline()

    with open('temp.xml', 'r') as result:
        records = NCBIXML.read(result)
    if len(records.alignments) == 0:
        return candidates

    for record in records.alignments[0].hsps:
        if record.query_end < 20:
            continue  # require that the match ends at the seed
        if record.match[-5:] != '|' * 5:
            continue  # require a perfect match over 5 bp of the seed
        end = record.sbjct_end
        if record.sbjct_end > record.sbjct_start:
            # On the + strand the sequence runs from start to end.
            if refSeq[end + 2:end + 4] == 'GG':
                candidates.append(record)
        else:
            # On the - strand.
            if refSeq[end - 3:end - 1] == 'CC':
                candidates.append(record)
    return candidates
def parse_online_blast (seq_list):
    """BLAST ``seq_list`` online and pass every HSP to ``filter_hits``.

    For each HSP a list of eleven values is handed to ``filter_hits``: query
    name, alignment title, the two identifiers taken from the title
    (fields 1 and 3 when split on ``|``; the second without its version
    suffix), percentage identity, aligned query length, query length,
    e-value, bit score and the two taxon values returned by ``obtain_tax``.
    Taxon lookups are cached per first identifier.
    """
    blast_handle = online_blast(seq_list)
    taxon_dic = {}

    logging.debug('Parsing blast result XML file.')
    blast_list = list(NCBIXML.parse(blast_handle))

    for blast_result in blast_list:
        for alignment in blast_result.alignments:
            for hsp in alignment.hsps:
                # Percentage identity relative to the match-line length.
                identity = float(hsp.identities/(len(hsp.match)*0.01))

                # Identifiers from the title; drop the version suffix of
                # the second one.
                gb_num = alignment.title.split('|')[1:4:2]
                gb_num[1] = gb_num[1].split('.')[0]

                # Look up the taxon once per identifier.
                try:
                    taxon = taxon_dic[gb_num[0]]
                except KeyError:
                    taxon = obtain_tax(gb_num[0])
                    taxon_dic[gb_num[0]] = taxon

                hit_fields = [
                    str(blast_result.query),
                    str(alignment.title),
                    str(gb_num[0]),
                    str(gb_num[1]),
                    str(identity),
                    str(len(hsp.query)),
                    str(blast_result.query_length),
                    str(hsp.expect),
                    str(hsp.bits),
                    taxon[0],
                    taxon[1],
                ]
                filter_hits(hit_fields)
def blast_file_opener(filename, evalue, mismatches, outfile):
    """Write the BLAST HSPs that have exactly ``mismatches`` mismatches.

    Reads the BLAST XML file ``filename`` and writes one
    ``title<TAB>query<TAB>e-value`` line to ``outfile`` for every HSP whose
    e-value is below ``evalue`` and whose mismatch count
    (``align_length - identities``) equals ``mismatches``.

    Args:
        filename: Path of the BLAST XML file.
        evalue: E-value threshold (anything ``float`` accepts).
        mismatches: Required mismatch count (anything ``int`` accepts).
        outfile: Path of the text file to create.

    Returns:
        An empty set.  The set was never filled by the previous version
        either; it is now always defined, also for a report without
        records.
    """
    E_VALUE_THRESH = float(evalue)
    mismatches = int(mismatches)
    alignment_hits = set()
    # Both files are closed by the context manager, also on errors.
    with open(filename) as result_handle, open(outfile, 'w') as f:
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        # Mismatches = alignment length - identities.
                        mmatches = hsp.align_length - hsp.identities
                        if mmatches == mismatches:
                            data = "%s\t%s\t%s\n" % (alignment.title,
                                                     blast_record.query,
                                                     str(hsp.expect))
                            f.write(data)
    return alignment_hits
def blast_reads(blast_string, reads, outfh):
    """BLAST reads against the Ebola reference and write the aligned parts.

    ``blast_string`` is sent to blastn on standard input.  For every HSP the
    aligned part of the read and of its quality string is written to
    ``outfh`` as a FASTQ record named ``<read>:<n>``, where ``n`` counts the
    HSPs of one alignment starting at 1.  When the subject coordinates run
    backwards both strings are reversed.

    BLAST coordinates are 1-based and inclusive, so the slice starts at
    ``query_start - 1``; the previous version dropped the first aligned
    base.

    NOTE(review): for reverse-strand hits the sequence is only reversed,
    not reverse-complemented; confirm that this is intended.

    Args:
        blast_string: FASTA text of the query reads.
        reads: Mapping from read name to an object with ``sequence`` and
            ``quality`` attributes.
        outfh: Writable handle receiving the FASTQ records.
    """
    blast_db = '/Users/sw10/Dropbox/Sanger/blastdb/ebola/Zaire_ebolavirus_KM034562'
    blast_binary = '/Applications/ncbi-blast-2.2.29+/bin/blastn'
    xml_outfile = '/tmp/test.xml'
    evalue = 0.01
    cline = NcbiblastnCommandline(cmd=blast_binary, out=xml_outfile, outfmt=5,
                                  query="-", db=blast_db, evalue=evalue,
                                  max_target_seqs=1, num_threads=1)
    cline(blast_string)
    try:
        with open(xml_outfile, 'r') as blast_handle:
            for blast_record in NCBIXML.parse(blast_handle):
                name = blast_record.query
                for alignment in blast_record.alignments:
                    count = 1
                    for hsp in alignment.hsps:
                        first = hsp.query_start - 1  # 1-based -> 0-based
                        seq = reads[name].sequence[first:hsp.query_end]
                        qual = reads[name].quality[first:hsp.query_end]
                        if hsp.sbjct_start > hsp.sbjct_end:
                            seq = ''.join(reversed(seq))
                            qual = ''.join(reversed(qual))
                        outfh.write('@%s:%d\n%s\n+\n%s\n' % (name, count, seq, qual))
                        count += 1
    finally:
        # Remove the temporary report even when parsing fails.
        os.remove(xml_outfile)
def parsePsiBlast(psiblastfilename, max_evalue):
    """Return the best e-value per subject from a PSI-BLAST XML report.

    Args:
        psiblastfilename: Path of the BLAST XML file.
        max_evalue: HSPs with a larger e-value are ignored.

    Returns:
        A dict mapping each alignment title to the smallest e-value among
        its accepted HSPs.  On any error ``dieError`` is called instead.
    """
    try:
        results_dict = {}
        # The context manager closes the report even if parsing fails.
        with open(psiblastfilename, 'r') as handle:
            for blast_record in NCBIXML.parse(handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect > max_evalue:
                            continue
                        subjid = alignment.title
                        if (subjid not in results_dict
                                or hsp.expect < results_dict[subjid]):
                            results_dict[subjid] = hsp.expect
        return results_dict
    except Exception:
        # `Exception` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are no longer reported as a PSI-BLAST failure.
        dieError('ERROR: PSI-BLAST failed.')
def get_ids(filename, dir, ethresh = 0.01):
    """Extract hit accessions from a BLAST report and save them as CSV.

    Reads the first record of ``<dir>/BLAST/<filename>``.  For every HSP
    with an e-value below ``ethresh`` whose alignment title matches the
    expected ``xx|ACCESSION|... [Genus species]`` pattern, the accession
    (without version suffix) and a four-letter species code (first letter
    of the genus plus the first three letters of the species) are
    collected.  The hits are filtered with ``filter_species`` using the
    first four characters of ``filename``, and the remaining accessions are
    written one per row to ``<dir>/accs/<record_name(filename)>.csv``.

    Args:
        filename: Name of the BLAST XML file inside ``<dir>/BLAST``.
        dir: Base directory (the name shadows the builtin but is kept for
            interface compatibility).
        ethresh: E-value threshold.

    Raises:
        IndexError: If the report contains no records.
    """
    eValueThresh = ethresh
    xml_path = os.path.join(dir, "BLAST", filename)
    with open(xml_path, "r") as result:
        # Only the first record is needed; do not parse the whole file.
        record = next(iter(NCBIXML.parse(result)), None)
    if record is None:
        raise IndexError("no BLAST records in %s" % xml_path)

    title_pattern = re.compile(
        r'.*[A-Z|a-z]{2,3}\|(.*?)\|.*?\[([A-Z])\S* ([A-Z|a-z]{3}).*\].*?')
    accession_pattern = re.compile(r'([A-Z|a-z|_|0-9]*)\..*')

    hits = []
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < eValueThresh:
                mdata = title_pattern.match(alignment.title)
                if mdata is not None:
                    accession = accession_pattern.match(mdata.group(1))
                    # An identifier without a ".version" suffix is used
                    # as-is instead of raising AttributeError.
                    if accession is not None:
                        acc = str(accession.group(1))
                    else:
                        acc = str(mdata.group(1))
                    genus = str(mdata.group(2)[0])
                    species = str(mdata.group(3)[:3])
                    shortSpecies = (genus + species)
                    hits.append((acc, shortSpecies))

    spec = filename[0:4]
    filteredHits = filter_species(hits, spec)

    # Save the accessions; one file per input report.
    with open(os.path.join(dir, "accs", record_name(filename) + ".csv"), 'w') as csvfile:
        blasthits = csv.writer(csvfile)
        for each in filteredHits:
            blasthits.writerow([each[0]])
from Bio.Blast import NCBIXML
from Bio import SeqIO

# Length of the target sequence; with several records the last one wins.
# ("rU" was removed in Python 3.11; universal newlines are the default.)
with open("../Data/T0860/T0860.fasta", "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        seq_len = len(record.seq)

# NOTE(review): the source lost its indentation; the report is assumed to
# be processed once, after the FASTA file has been read.  Please confirm.
with open('../Data/T0860/95SCU2B4015-Alignment.xml') as result_handle:
    for blast_record in NCBIXML.parse(result_handle):
        for alignment in blast_record.alignments:
            title = alignment.title
            length = alignment.length
            for hsp in alignment.hsps:
                print('****Alignment****')
                print('sequence:', alignment.title)
                print('length:', alignment.length)
                print('e value:', hsp.expect)
                print(hsp.query[0:137])
                print(hsp.match[0:137])
                print(hsp.sbjct[0:137])
                # Values of the most recently visited HSP stay available as
                # module-level names after the loops.
                identities = hsp.identities
                similarity = (100 * hsp.identities / seq_len)
                target = hsp.query
                targetstart = hsp.query_start
                templatestart = hsp.sbjct_start
                match = hsp.match
                template = hsp.sbjct[0:137]
def _labelled_hsps(blast_record, evalue):
    """Yield ``(label, hsp)`` for every HSP whose e-value is <= ``evalue``.

    The label is the second-to-last ``|`` field of the alignment title
    followed by a running count (1, 2, ...) of accepted HSPs carrying that
    same field, so each accepted HSP gets a unique name.
    """
    seen = {}
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if float(hsp.expect) > evalue:
                continue
            sseq = alignment.title.split('|')[-2]
            seen[sseq] = seen.get(sseq, 0) + 1
            yield sseq + str(seen[sseq]), hsp


def parse_blast(blastOut, pdb, qseq, evalue=0.00001):
    '''
    Parse XML Blast outputs.

    Writes two files next to ``pdb``:
      - ``<pdb>.ali``: FASTA-like file with the query (named ``<pdb>q``)
        followed by the subject string of every accepted HSP
      - ``<pdb>.sqc``: one line per query position with the 1-based index,
        the query residue and the entropy of the re-weighted column

    Parameters:
      - blastOut: path to the BLAST XML file (a single record is read)
      - pdb: prefix used for the output files and the query name
      - qseq: query sequence as a string
      - evalue: set an alignment cutoff

    Raises ValueError when the query string of an HSP disagrees with ``qseq``.
    '''
    with open(blastOut) as fh:
        blast_record = NCBIXML.read(fh)

    q = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
         'R', 'S', 'T', 'V', 'Y', 'W', 'Q', '-']
    # Residue -> column of A; replaces a linear q.index() per residue.
    column = dict((residue, j) for j, residue in enumerate(q))
    A = numpy.zeros((len(qseq), 21))

    with open(pdb + '.ali', 'w') as out:
        out.write('>%sq\n' % pdb)
        out.write(qseq + '\n')
        for label, hsp in _labelled_hsps(blast_record, evalue):
            out.write('>' + label + '\n')
            out.write(hsp.sbjct + '\n')

    # --- get A matrix
    # ucluster is given the .ali file written above; each sequence is then
    # weighted by the reciprocal of the value it returns for that name.
    Clusters = ucluster(pdb + '.ali')
    Meff = 0.
    for label, hsp in _labelled_hsps(blast_record, evalue):
        ma = 1. / Clusters[label]
        qi = hsp.query_start - 1
        for i, (qres, sres) in enumerate(zip(hsp.query, hsp.sbjct)):
            if qres == '-':
                # Gap in the query: no query position to credit.
                continue
            if qres != qseq[qi]:
                raise ValueError(
                    "Mismatch in alignment sequence at position "
                    "%d: %s %s" % (i + 1, qres, qseq[qi]))
            j = column.get(sres)
            if j is not None:
                A[qi, j] += ma
            qi += 1
        Meff += ma

    # Add the query itself.
    # NOTE(review): the query weight is added to Meff once per recognised
    # residue, not once per sequence; kept as in the original -- confirm this
    # is intended.
    query_weight = 1. / Clusters[pdb + 'q']
    for i, a in enumerate(qseq):
        j = column.get(a)
        if j is None:
            continue
        A[i, j] += query_weight
        Meff += query_weight

    # --- re-weight the A matrix, correct for lambda factor
    lmbd = Meff
    with open(pdb + '.sqc', 'w') as out:
        for i in range(len(A)):
            Si = 0
            for j in range(len(q)):
                si = (1. / (Meff + lmbd)) * ((lmbd / len(q)) + A[i, j])
                A[i, j] = si
                if si > 0.:
                    Si -= si * numpy.log(si)
            out.write('\t'.join([str(i + 1), qseq[i], str(Si)]) + '\n')
def _residue_columns(aligned_row, n_residues):
    """Return, for each residue index of a sequence, its alignment column.

    ``aligned_row`` is the gapped row of the alignment; gap characters ('-')
    are skipped.  The list starts out as the identity mapping of length
    ``n_residues`` and is overwritten residue by residue.
    """
    columns = list(range(n_residues))
    gaps = 0
    for col, letter in enumerate(aligned_row):
        if letter == '-':
            gaps += 1
        else:
            columns[col - gaps] = col
    return columns


def get_hist_ss(test_seq, type='Unknown', debug=0):
    """Returns sequence elements in histone sequence, all numbers assume first element in seq has number 0!!! Not like in PDB

    The query is aligned globally (EMBOSS needle) against a 1kx5 template
    histone and the template's annotated elements are transferred onto it.

    Parameters:
      - test_seq: query sequence (written to FASTA through SeqRecord)
      - type: 'H3', 'H4', 'H2A' or 'H2B'; with the default 'Unknown' the
        template is chosen by a local BLAST of the query against all four
      - debug: when true, print intermediate results

    Returns ``(hist_identified, ss_test)`` where ``ss_test`` maps each element
    name to ``[start, end]`` (0-based, inclusive) in ``test_seq``, or
    ``[-1, -1]`` when the element could not be placed.

    Raises ValueError when ``type`` is 'Unknown' and BLAST reports no hit.
    Temporary files in ``CONFIG.TEMP_DIR`` are removed even on failure.
    """
    # Let's define 1kx5 sequences
    templ_H3 = Seq(
        "ARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEASEAYLVALFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
        IUPAC.protein)
    templ_H4 = Seq(
        "SGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG",
        IUPAC.protein)
    templ_H2A = Seq(
        "SGRGKQGGKTRAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPKKTESSKSKSK",
        IUPAC.protein)
    templ_H2B = Seq(
        "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
        IUPAC.protein)

    # 'element_name': [start, stop]; start and stop are inclusive as in the
    # PDB file.  Numbering differs between symmetrical chains and between
    # 1kx5 and 1aoi; we simply take the minimum length of the alpha helices
    # over all chains in 1kx5.
    # 1 has been subtracted from the PDB values because these are array
    # indices starting from 0.
    # Docking domain: amino acids 80-119 in the Luger 1aoi paper; the JMB
    # paper defined it as 80-118 (probably to end at the trypsin cleavage
    # site KK), which is what is used here.  HistoneDB follows the Luger
    # convention.
    ss_templ_H3 = {
        'alphaN': [43, 56],
        'alpha1': [62, 76],
        'alpha2': [84, 113],
        'alpha3': [119, 130],
        'loopL1': [78, 83],
        'loopL2': [114, 118],
        'beta1': [82, 83],
        'beta2': [117, 118],
        'mgarg1': [62, 62],
        'mgarg2': [82, 82],
        'mgarg3': [48, 48]
    }
    ss_templ_H4 = {
        'alpha1ext': [23, 28],
        'alpha1': [29, 40],
        'alpha2': [48, 75],
        'alpha3': [81, 92],
        'loopL1': [41, 47],
        'loopL2': [76, 81],
        'beta1': [44, 45],
        'beta2': [79, 80],
        'beta3': [95, 97],
        'mgarg1': [44, 44]
    }
    # Docking domain defined as in Suto, Luger 2000 (an earlier version split
    # it into 'docking domain' [91, 107] and 'docking tail' [108, 116]).
    ss_templ_H2A = {
        'alpha1ext': [15, 21],
        'alpha1': [25, 36],
        'alpha2': [45, 72],
        'alpha3': [78, 88],
        'alpha3ext': [89, 96],
        'loopL1': [37, 44],
        'loopL2': [73, 77],
        'beta1': [41, 42],
        'beta2': [76, 77],
        'beta3': [99, 101],
        'docking domain': [80, 118],
        'mgarg1': [41, 41],
        'mgarg2': [76, 76]
    }
    ss_templ_H2B = {
        'alpha1': [33, 45],
        'alpha2': [51, 80],
        'alpha3': [86, 98],
        'alphaC': [99, 119],
        'loopL1': [46, 50],
        'loopL2': [81, 85],
        'beta1': [49, 50],
        'beta2': [84, 85],
        'mgarg1': [29, 29]
    }
    ss_templ = {
        'H3': ss_templ_H3,
        'H4': ss_templ_H4,
        'H2A': ss_templ_H2A,
        'H2B': ss_templ_H2B
    }
    templ = {
        'H3': templ_H3,
        'H4': templ_H4,
        'H2A': templ_H2A,
        'H2B': templ_H2B
    }

    # Templates used as the BLAST database when the histone type is unknown.
    my_records = [
        SeqRecord(templ_H3, id='H3', name='H3'),
        SeqRecord(templ_H4, id='H4', name='H4'),
        SeqRecord(templ_H2A, id='H2A', name='H2A'),
        SeqRecord(templ_H2B, id='H2B', name='H2B')
    ]

    # Unique temporary file names so that concurrent calls do not collide.
    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())
    faa_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".faa")
    fastan2_file = os.path.join(CONFIG.TEMP_DIR, n2 + ".fasta")
    fastan1_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".fasta")
    db_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db")
    xml_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".xml")
    txt_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".txt")
    phr_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.phr")
    pin_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.pin")
    psq_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.psq")

    identify = (type == 'Unknown')
    if identify:
        temp_files = [faa_file, phr_file, pin_file, psq_file, fastan2_file,
                      xml_file, txt_file, fastan1_file]
    else:
        temp_files = [fastan2_file, txt_file, fastan1_file]

    try:
        SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')],
                    fastan2_file, 'fasta')

        if identify:
            # Let's use BLAST to see which histone our query is.
            SeqIO.write(my_records, faa_file, "fasta")
            os.system('makeblastdb -dbtype prot -in %s -out %s > /dev/null' %
                      (faa_file, db_file))
            blastp_cline = NcbiblastpCommandline(query=fastan2_file,
                                                 db=db_file,
                                                 evalue=100,
                                                 outfmt=5,
                                                 out=xml_file)
            blastp_cline()
            with open(xml_file, 'r') as xml_handle:
                blast_record = NCBIXML.read(xml_handle)

            candidates = []
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    candidates.append((hsp.expect, alignment.title))
            if not candidates:
                raise ValueError(
                    "BLAST found no histone template matching the query")
            # min() returns the first of equally good hits, like the original
            # evalue.index(min(evalue)).  The second word of the title is the
            # template id.
            best_title = min(candidates, key=lambda c: c[0])[1]
            hist_identified = best_title.split()[1]
        else:
            hist_identified = type

        if (debug):
            print('Most likely this is histone:')
            print(hist_identified)
            print('Blast alignment')

        # We need to determine the secondary structure according to the
        # template, so redo the alignment with Needleman-Wunsch (global).
        SeqIO.write([
            SeqRecord(templ[hist_identified],
                      id=hist_identified,
                      name=hist_identified)
        ], fastan1_file, 'fasta')
        needle_cline = NeedleCommandline(asequence=fastan1_file,
                                         bsequence=fastan2_file,
                                         gapopen=20,
                                         gapextend=1,
                                         outfile=txt_file)
        needle_cline()
        align = AlignIO.read(txt_file, "emboss")
        if (debug):
            print(align)

        # Now we will get the correspondence: residue index -> column, for
        # the template (row 0) and for the query (row 1).
        hist = templ[hist_identified]
        corrsp_hist = _residue_columns(align[0], len(hist))
        if (debug):
            print(corrsp_hist)
        corrsp_test = _residue_columns(align[1], len(test_seq))
        if (debug):
            print(corrsp_test)

        ss_test = dict()
        n_columns = len(align[0])
        for key, value in ss_templ[hist_identified].items():
            if (debug):
                print('Checking %s' % key)
            start_in_aln = corrsp_hist[value[0]]
            if (debug):
                print('Start in aln %d' % start_in_aln)
            end_in_aln = corrsp_hist[value[1]]
            if (debug):
                print('End in aln %d' % end_in_aln)

            # If the boundary column is a gap in the query, slide the start
            # to the right until a query residue is found.
            for k in range(n_columns):
                try:
                    start_in_test_seq = corrsp_test.index(start_in_aln + k)
                except ValueError:
                    start_in_test_seq = -1
                    if (debug):
                        print("Trying to move start")
                    continue
                break

            # Likewise slide the end to the left.
            for k in range(n_columns):
                try:
                    end_in_test_seq = corrsp_test.index(end_in_aln - k)
                except ValueError:
                    end_in_test_seq = -1
                    if (debug):
                        print('Trying to move end')
                    continue
                break

            if (start_in_test_seq == -1 or end_in_test_seq == -1
                    or start_in_test_seq > end_in_test_seq):
                ss_test[key] = [-1, -1]
            else:
                ss_test[key] = [start_in_test_seq, end_in_test_seq]
            if (debug):
                print(ss_test[key])
    finally:
        # Remove the temporary files without going through the shell; a file
        # that was never created is not an error.
        for temp_path in temp_files:
            try:
                os.remove(temp_path)
            except OSError:
                pass

    return hist_identified, ss_test
def cazy2class(prefix, F, remote=False):
    '''
    will take the cazy database (dictionary provided) and try to fetch
    subfamilies and place them as classifiers.

    For every name returned by ``get_names(prefix + '.gm')`` a BLAST search is
    run (remotely through ``qblast`` or locally with ``blastp`` against
    ``nr``).  The first hit whose top HSP has at least 90% identity and whose
    accession is found in the CAZY dictionary supplies the class; otherwise
    the name itself is used.  Classes are written ';'-separated on a single
    line to ``<prefix>.cls``.

    Parameters:
      - prefix: path prefix of the ``.gm`` input and ``.cls`` output
      - F: object providing ``seqs`` and ``chains`` (used for local BLAST)
      - remote: run BLAST at NCBI instead of locally
    '''
    print('You chose to use CAZY database to classify GH13 family into subfamilies'
          ' this will take a while, since have to go over BLAST results, etc..')
    # import database
    # NOTE: unpickling can execute arbitrary code; CazyDB.bin must come from a
    # trusted source.
    with open('CazyDB.bin', 'rb') as db_handle:
        db = pickle.load(db_handle)
    names = get_names(prefix + '.gm')

    with open(prefix + '.cls', 'w') as cls:
        for n in names:
            print('Processing %s...' % (n))
            if remote:
                Entrez.email = '*****@*****.**'
                print('\tBlasting (Running remotely)...')
                n = n[:-1] + '_' + n[-1]
                # Retry until NCBI answers.  `Exception` (not a bare except)
                # so that Ctrl-C can still interrupt the loop.
                while 1:
                    try:
                        b = qblast('blastp', 'nr', n, perc_ident=90,
                                   expect=1, gapcosts='11 1')
                        print('\tBlast Done... \n\t\tAnalysing entries...')
                        break
                    except Exception:
                        print('Some problem in NCBI, sleeping for 10...')
                        time.sleep(10)
                blast_record = NCBIXML.read(b)
            else:
                print('\tBlasting (Running locally)...')
                with open('temp.fasta', 'w') as fi:
                    fi.write('>%s\n%s' % (n, F.seqs[F.chains[n[:4]]]))
                bl = Popen('blastp -db nr -outfmt "5" -query temp.fasta -evalue 0.0001 -max_target_seqs 50 '
                           '-seg yes -num_threads 4 -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
                           shell=True)
                bl.wait()
                print('\tBlast Done... \n\t\tAnalysing entries...')
                with open('temp.xml') as b:
                    blast_record = NCBIXML.read(b)
                # Only the local run creates temp.* files.
                rm = Popen('rm temp.*', shell=True)
                rm.wait()

            # for/else: the else branch runs when no hit qualified, which
            # also covers an empty hit list (the original looped forever).
            for a in blast_record.alignments:
                print('\t\t\t%s' % (a.accession))
                h = a.hsps[0]
                if float(h.identities) / float(h.align_length) < 0.9:
                    continue
                ans, k = dict_lookup(a.accession, db)
                if ans:
                    cls.write(str(db[k]) + ';')
                    print('\t\t\t\tAccession number found in CAZY!, Subfamily %s' % (
                        db[k]))
                    break
            else:
                cls.write('%s;' % (n))
                print('\tNo relative found in CAZY')
        cls.write('\n')
def process_one_input_file (input_file , OUT_DIR):
    """Flatten a gzipped BLAST XML file into a gzipped tab-separated table.

    One output line is written per HSP with, in order: query name, query id,
    query length, query letters, UniProt id of the hit, hit accession, hit
    length, and the HSP's align_length, bits, score, expect, query_start,
    query_end, sbjct_start, sbjct_end, query frame, hit frame, identities,
    positives and gaps.

    The UniProt id is the third field of ``alignment.hit_id`` when that id
    has exactly three ``|``-separated fields.  HSPs of any other hit are
    skipped and one line per skipped HSP is written to the error log.

    Parameters:
      - input_file: path to a ``*.xml.gz`` BLAST output
      - OUT_DIR: output prefix; it is concatenated as-is with the base name
        of ``input_file``, so a directory must end with a path separator

    Outputs ``<OUT_DIR><base>.features_tsv.gz`` and
    ``<OUT_DIR><base>.errorlog.txt``.
    """
    base_name = input_file.split("/")[-1].split(".xml.gz")[0]
    output_file = OUT_DIR + base_name + ".features_tsv.gz"
    errlog_file = OUT_DIR + base_name + ".errorlog.txt"

    print("processing : " + input_file)
    print("creating output : " + output_file)
    print("creating errorlog: " + errlog_file)

    # Context managers close all three files even if parsing fails midway.
    with gzip.open(input_file, 'rt') as inp_file_handle, \
            gzip.open(output_file, 'wt') as out_file_handle, \
            open(errlog_file, "wt") as log_file_handle:
        for RECORD in NCBIXML.parse(inp_file_handle):
            # Features of the RECORD (generally there is only one record).
            # <BlastOutput_query-def>T96060004884 DPY30_HUMAN</BlastOutput_query-def>
            query_words = RECORD.query.split()
            query_name = query_words[0] if len(query_words) > 1 else RECORD.query
            record_features = [
                query_name,
                RECORD.query_id,       # <BlastOutput_query-ID>
                RECORD.query_length,   # <Iteration_query-len>
                RECORD.query_letters,  # <BlastOutput_query-len>
            ]

            for alignment in RECORD.alignments:
                # Features of the alignment (<Hit>); each hit may carry
                # several <Hsp> elements.
                hit_fields = alignment.hit_id.split("|")
                uniprot_id = hit_fields[2] if len(hit_fields) == 3 else None
                if uniprot_id is None:
                    # A hit id without '|' has no second field; log an empty
                    # GI instead of raising IndexError.
                    gi = hit_fields[1] if len(hit_fields) > 1 else ""
                    err_line = "%s\t%s\tGI: %s\n" % (
                        RECORD.query_id, alignment.hit_id, gi)
                hit_features = [
                    uniprot_id,
                    alignment.accession,  # <Hit_accession>
                    alignment.length,     # <Hit_len>
                ]

                for hsp in alignment.hsps:
                    if uniprot_id is None:
                        log_file_handle.write(err_line)
                        continue
                    hsp_features = [
                        hsp.align_length,  # <Hsp_align-len>
                        hsp.bits,          # <Hsp_bit-score>
                        hsp.score,         # <Hsp_score>
                        hsp.expect,        # <Hsp_evalue>
                        hsp.query_start,   # <Hsp_query-from>
                        hsp.query_end,     # <Hsp_query-to>
                        hsp.sbjct_start,   # <Hsp_hit-from>
                        hsp.sbjct_end,     # <Hsp_hit-to>
                        hsp.frame[0],      # <Hsp_query-frame>
                        hsp.frame[1],      # <Hsp_hit-frame>
                        hsp.identities,    # <Hsp_identity>
                        hsp.positives,     # <Hsp_positive>
                        hsp.gaps,          # <Hsp_gaps>
                    ]
                    L = record_features + hit_features + hsp_features
                    out_file_handle.write("\t".join(str(x) for x in L) + "\n")
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    """Convert BLAST XML into SeqRecords carrying GFF3-style match features.

    Yields one ``SeqRecord`` per BLAST record.  Every HSP becomes a parent
    feature (``nucleotide_match`` for BLASTN, ``protein_match`` for BLASTP,
    ``match`` otherwise) whose ``sub_features`` are the ``match_part``s
    produced by ``generate_parts``.

    Parameters:
      - blastxml: handle (or path) accepted by ``NCBIXML.parse``
      - min_gap: passed to ``generate_parts`` as ``ignore_under``
      - trim: clamp a parent start below 1 to 0, and clamp the parent end
      - trim_end: clamp the parent end to ``hsp.query_end + 1``
      - include_seq: also store the query/subject/match strings as qualifiers
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP
    match_types = {
        'BLASTN': 'nucleotide_match',
        'BLASTP': 'protein_match',
    }
    hsp_props = ('score', 'bits', 'identities', 'positives', 'gaps',
                 'align_length', 'strand', 'frame', 'query_start',
                 'query_end', 'sbjct_start', 'sbjct_end')

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        match_type = match_types.get(record.application, 'match')

        # The record id is the query definition up to the first space.
        recid = record.query.split(' ', 1)[0]
        rec = SeqRecord(Seq("ACTG"), id=recid)

        for idx_hit, hit in enumerate(record.alignments):
            for idx_hsp, hsp in enumerate(hit.hsps):
                hit_titles = hit.title.split(' >')
                qualifiers = {
                    "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit_titles,
                }
                if include_seq:
                    qualifiers.update({
                        'blast_qseq': hsp.query,
                        'blast_sseq': hsp.sbjct,
                        'blast_mseq': hsp.match,
                    })

                for prop in hsp_props:
                    qualifiers['blast_' + prop] = getattr(hsp, prop, None)

                # Description = first title from its first space onwards.  A
                # title without any space has no description; the original
                # raised ValueError here.
                desc = hit_titles[0]
                first_space = desc.find(' ')
                qualifiers['description'] = desc[first_space:] if first_space != -1 else ''

                # The match_start location must account for queries and
                # subjects that start at locations other than 1.
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the
                # supplied subject/hit length to calculate the real ending of
                # the target protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')

                # If we trim the left end, we need to trim without losing
                # information: match_parts are still placed relative to the
                # untrimmed start.
                used_parent_match_start = parent_match_start
                if trim and parent_match_start < 1:
                    used_parent_match_start = 0
                if (trim or trim_end) and parent_match_end > hsp.query_end:
                    parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(used_parent_match_start, parent_match_end),
                    type=match_type, strand=0,
                    qualifiers=qualifiers
                )

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                parts = generate_parts(hsp.query, hsp.match, hsp.sbjct,
                                       ignore_under=min_gap)
                for idx_part, (start, end, cigar) in enumerate(parts):
                    part_qualifiers['Gap'] = cigar
                    part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)

                    # Account for the subject start's location.
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1
                    # (end - start) is used rather than hsp.align_length,
                    # which includes gaps in the parent sequence.
                    match_part_end = match_part_start + (end - start)

                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part", strand=0,
                            # part_qualifiers is reused across parts, so each
                            # feature gets its own copy.
                            qualifiers=copy.deepcopy(part_qualifiers))
                    )

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
def get_output(display, dcov, overwrite, query, dbname, evalue, coverage, which, e_filter, out, alignment_length):
    # Run blastp, filter the hits, annotate them with Pfam domains/clans via
    # hmmscan, draw hydropathy plots with quod.py and write an HTML report.
    #
    # Files written under `out`: result.xml, good_genome.txt, good_NCBI.txt,
    # good_result.fasta, pfam.out, good_blast_results.tsv,
    # good_blast_results1.tsv, good_pfam_results.tsv, plots/ and result.html.
    #
    # Parameters as used below:
    #   display          - maximum number of hit rows shown in the report
    #   dcov             - passed to hmmParser.filterByCoverage
    #   overwrite        - re-run blastp / hmmscan even if their output exists
    #   query            - query file ('genome' databases) or, otherwise, a
    #                      command prefix placed directly before "blastp"
    #   dbname           - BLAST database; 'genome' in the name selects mode
    #   evalue           - e-value passed to blastp
    #   coverage, which, e_filter, alignment_length - passed to blast_filter
    #   out              - output directory
    #
    # NOTE(review): every command line is assembled by string formatting and
    # run through a shell; arguments are not quoted or escaped.

    # form the blast result first
    if 'genome' in dbname:
        cmdstring = 'blastp -query {q} -db {d} -evalue {e} -max_hsps 1 -out {o}/result.xml -outfmt "5"'.format(
            q=query, d=dbname, e=evalue, o=out)
        # Rows are always built in query/subject order (see xml_list below);
        # in this mode the q*/s* column names are swapped.
        col_list = [
            'sacc', 'qacc', 'slen', 'qlen', 'length', 'gaps', 'sstart', 'send',
            'qstart', 'qend', 'evalue', 'score', 'pident', 'sseq', 'match',
            'qseq'
        ]
    else:
        # blastp reads the query from stdin ("-query -"); `query` is prefixed
        # to the command -- presumably something like "cat file | ", verify
        # against the caller.
        cmdstring = '{q}blastp -query - -db {d} -evalue {e} -max_hsps 1 -out {o}/result.xml -outfmt "5"'.format(
            q=query, d=dbname, e=evalue, o=out)
        col_list = [
            'qacc', 'sacc', 'qlen', 'slen', 'length', 'gaps', 'qstart', 'qend',
            'sstart', 'send', 'evalue', 'score', 'pident', 'qseq', 'match',
            'sseq'
        ]
    if overwrite or not os.path.isfile(out + '/result.xml'):
        os.system(cmdstring)
    # parse the result to the filter
    # NOTE(review): result_handle is never closed.
    result_handle = open('{o}/result.xml'.format(o=out))
    blast_records = NCBIXML.parse(result_handle)
    xml_list = []
    for rec in blast_records:
        for alignment in rec.alignments:
            for hsp in alignment.hsps:
                # Accessions: first word, version suffix dropped, then the
                # second '|' field if any '|' remains.
                qacc = rec.query.split()[0]
                qacc = qacc.split('.')[0]
                if '|' in qacc:
                    qacc = qacc.split('|')[1]
                sacc = alignment.accession.split()[0]
                sacc = sacc.split('.')[0]
                if '|' in sacc:
                    sacc = sacc.split('|')[1]
                xml_list.append([
                    qacc, sacc, rec.query_length, alignment.length,
                    hsp.align_length, hsp.gaps, hsp.query_start, hsp.query_end,
                    hsp.sbjct_start, hsp.sbjct_end, hsp.expect, hsp.score,
                    100 * float(hsp.identities) / float(hsp.align_length),
                    hsp.query, hsp.match, hsp.sbjct
                ])
    df = pd.DataFrame(xml_list, columns=col_list)
    # calculate the query/subject coverage and add column
    '''df['sacc'] = df['sacc'].apply(lambda x: x.split()[0]) df['qacc'] = df['qacc'].apply(lambda x: x.split()[0]) df['sacc'] = df['sacc'].apply(lambda x: x.split('.')[0]) df['qacc'] = df['qacc'].apply(lambda x: x.split('.')[0])'''
    # Percent identity truncated to an integer.
    df['pident'] = df['pident'].apply(lambda x: int(x))
    qcovs = []
    scovs = []
    for index, row in df.iterrows():
        # calculate and insert corresponding values
        qc = ((row["qend"] - row["qstart"] + 1) * 100) / row["qlen"]
        qcovs.append(qc)
        sc = ((row["send"] - row["sstart"] + 1) * 100) / row["slen"]
        scovs.append(sc)
    df.insert(loc=12, column='qcovs', value=qcovs)
    df.insert(loc=13, column='scovs', value=scovs)
    # NOTE(review): the return value of blast_filter is discarded; this only
    # filters if blast_filter modifies df in place -- confirm.
    blast_filter(df, coverage, which, e_filter, alignment_length)
    # sort entries based on e-value, coverages
    df.sort_values(["evalue", "qcovs"], inplace=True, ascending=[True, False])
    # try to run pfam for these results; extract sequence of good hits into one single fasta file
    genome_list = list(set(df['qacc'].tolist()))
    NCBI_list = list(set(df['sacc'].tolist()))
    # check the number of protein in the list, if empty, exit
    if len(genome_list) == 0:
        print >> sys.stderr, "No good results! Try to use other keywords or lower the filter standard."
        sys.exit(1)
    list_to_file('{o}/good_genome.txt'.format(o=out), genome_list)
    list_to_file('{o}/good_NCBI.txt'.format(o=out), NCBI_list)
    # Sequences of both accession lists end up in one FASTA file (the second
    # blastdbcmd appends with ">>").
    cmdstring = 'blastdbcmd -db {o}/blastdb/genome_blastdb -entry_batch {o}/good_genome.txt -out {o}/good_result.fasta;blastdbcmd -db {o}/blastdb/NCBI_blastdb -entry_batch {o}/good_NCBI.txt >> {o}/good_result.fasta'.format(
        o=out)
    os.system(cmdstring)
    # run hmmscan and parse the result as a table
    if overwrite or not os.path.isfile(out + '/pfam.out'):
        cmdstring = 'hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout {o}/pfam.out /ResearchData/pfam/pfamdb/Pfam-A.hmm {o}/good_result.fasta'.format(
            o=out)
        os.system(cmdstring)
    # parse the result in pandas
    hmm_object = hmmParser('{o}/pfam.out'.format(o=out))
    hmm_object.filterByCoverage(dcov)
    df_pfam = pd.DataFrame(hmm_object.matrix)
    # Column names for the domtblout table.  NOTE(review): the eighth name is
    # spelled 'socre' and 'bias' occurs twice; left untouched because
    # downstream helpers may depend on these exact names.
    domtblout_cols = 'target_name t_accession tlen query_name accession qlen evalue socre bias # of cevalue ievalue score bias hmm_from hmm_to ali_from ali_to env_from env_to acc description_of_target'.strip(
    ).split(' ')
    df_pfam.columns = domtblout_cols
    # get the clan accessions and clan info for the existing results
    df_pfam.insert(loc=3, column='clan_acc', value='')
    df_pfam.insert(loc=4, column='clan_info', value='')
    # Drop version suffixes from the protein and Pfam accessions.
    df_pfam['query_name'] = df_pfam['query_name'].apply(
        lambda x: x.split('.')[0])
    df_pfam['t_accession'] = df_pfam['t_accession'].apply(
        lambda x: x.split('.')[0])
    for index, row in df_pfam.iterrows():
        # One zgrep process per row; the second and third whitespace-separated
        # tokens of its output are taken as clan accession and clan info.
        cmd = 'zgrep {s} /ResearchData/pfam/download/Pfam-A.clans.tsv.gz'.format(
            s=row["t_accession"])
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        output = process.stdout.read().split()
        #print output
        # NOTE(review): `row` comes from iterrows(); pandas documents that
        # writing to it may not modify df_pfam, so these assignments possibly
        # never reach the frame -- verify that clan_acc/clan_info get filled.
        # output[1] also raises IndexError when zgrep prints nothing.
        if 'CL' in output[1]:
            row["clan_acc"] = output[1]
            row["clan_info"] = output[2]
        else:
            row["clan_acc"] = 'N/A'
            row["clan_info"] = 'N/A'
    # for each pair of columns in the filtered output, map the value of each cell to the pfam result, check if the domains are the same
    domain_dic = clan_to_dic('query_name', 'target_name', df_pfam)
    clan_dic = clan_to_dic('query_name', 'clan_acc', df_pfam)
    #pprint.pprint(clan_dic)
    # create a list for unqualified results and drop rows according to the list in two dataframes
    # NOTE(review): bad_hits_blast is never used after this line.
    bad_hits_blast = []
    bad_hits_pfam = []
    no_domain = []
    for index, row in df.iterrows():
        '''if '|' in row["qacc"]: row["qacc"] = row["qacc"].split('|')[1] if '|' in row["sacc"]: row["sacc"] = row["sacc"].split('|')[1]'''
        # use set to compare if 2 lists have at least one common element
        try:
            # A hit is dropped only when query and subject share neither a
            # domain nor a clan.  `df` is rebound while the loop keeps
            # iterating over the frame it started with.
            if not set(domain_dic[row["qacc"]]) & set(domain_dic[row["sacc"]]):
                if not set(clan_dic[row["qacc"]]) & set(clan_dic[row["sacc"]]):
                    df = df.drop(index)
                    bad_hits_pfam.append(row["qacc"])
                    bad_hits_pfam.append(row["sacc"])
        except KeyError:
            # in the case that the protein has no protein domains
            # A placeholder row (all 'N/A' except the accession in the sixth
            # column) is added at label -1, then every label is shifted by
            # one so that -1 is free for the next placeholder.
            if not domain_dic.get(row["qacc"]):
                df_pfam.loc[-1] = [
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', row["qacc"], 'N/A',
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
                    'N/A', 'N/A'
                ]
                df_pfam.index = df_pfam.index + 1
                if row["qacc"] not in no_domain:
                    no_domain.append(row["qacc"])
            if not domain_dic.get(row["sacc"]):
                df_pfam.loc[-1] = [
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', row['sacc'], 'N/A',
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
                    'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
                    'N/A', 'N/A'
                ]
                df_pfam.index = df_pfam.index + 1
                if row["sacc"] not in no_domain:
                    no_domain.append(row["sacc"])
    # delete rows according to the two lists
    df_pfam = df_pfam.drop_duplicates()
    df_pfam = df_pfam.set_index('query_name', drop=True)
    df_pfam = df_pfam.drop(bad_hits_pfam)  # drop by rows
    # save new separate results
    df['evalue'] = df['evalue'].apply(lambda x: '%.2e' % x)
    df.to_csv('{o}/good_blast_results.tsv'.format(o=out), sep='\t')
    df_pfam.to_csv('{o}/good_pfam_results.tsv'.format(o=out), sep='\t')
    pfam_dic = pfam_to_dic(df_pfam)
    # create hydropathy plots for these results, and add protein domains
    plot_path = out + '/plots'
    os.system('mkdir {p}'.format(p=plot_path))
    # create a final result and in the same loop finish data insertion
    # NOTE(review): outfile is only closed on the success path, and the header
    # below contains "</head><body>" inside the <style> element.
    outfile = open(out + '/result.html', 'w')
    outfile.write(
        '<!DOCTYPE html><html><head><title>Missing Components Results</title><style type="text/css"></head><body>\n.label {text-align: right;width:50px;}\n.data {text-align:left;padding-left: 8px;width:100px;}\n.seq{border:2px solid black;height:70px; width:100%;overflow-x:auto;overflow-y:auto;margin:1em 0;background:grey;color:white;}tab1 { padding-left: 4em;}</style></head><body>\n'
    )
    # for each pair in the blast results (good one) create two separate hydropathy plots and create a combined one with pfam domain covered, use blastdbcmd to extract sequences from the database and pass to the quod.py
    # Only the first `display` rows are reported.
    row_count = df.shape[0]
    if row_count < display:
        display = row_count
    df = df.head(display)
    df.reset_index(inplace=True)
    df.to_csv('{o}/good_blast_results1.tsv'.format(o=out), sep='\t')
    for index, row in df.iterrows():
        d_str = domain_string(row["qacc"], row["sacc"], pfam_dic, no_domain)
        seq_string1 = 'blastdbcmd -db {o}/blastdb/genome_blastdb -entry {s}'.format(
            o=out, s=row['qacc'])
        seq_string2 = 'blastdbcmd -db {o}/blastdb/NCBI_blastdb -entry {q}'.format(
            o=out, q=row['sacc'])
        process = subprocess.Popen(seq_string1,
                                   stdout=subprocess.PIPE,
                                   shell=True)
        seq1 = process.stdout.read()
        process = subprocess.Popen(seq_string2,
                                   stdout=subprocess.PIPE,
                                   shell=True)
        seq2 = process.stdout.read()
        # create aligner: the three-line text block shown in the report
        alignment = row['qseq'] + '\n' + row['match'] + '\n' + row['sseq']
        # draw separate plots first, draw bars of alignment and common domain
        # get the separate command string first. It should have multiple domains and for same domain they should contain the same color
        os.system(
            'quod.py -l {q} -q -s "{s1}" --width 15 -c red -d {p} -o {q}_vs_{s}.png -w {qs}-{qe} {ds}'
            .format(s1=seq1,
                    p=plot_path,
                    q=row['qacc'],
                    qs=row['qstart'],
                    qe=row['qend'],
                    ds=d_str[0],
                    s=row['sacc']))
        os.system(
            'quod.py -l {s} -q -s "{s2}" --width 15 -c blue -d {p} -o {s}_vs_{q}.png -w {ss}-{se} {ds}'
            .format(s2=seq2,
                    p=plot_path,
                    s=row['sacc'],
                    ss=row['sstart'],
                    se=row['send'],
                    ds=d_str[1],
                    q=row['qacc']))
        # draw the aligned part as the combined graph
        seq = '>\n' + row['qseq'] + '\n>\n' + row['sseq']
        # The aligned pair goes into a temporary FASTA file; it is flushed
        # before quod.py reads it by name and removed on close.
        temp = tempfile.NamedTemporaryFile(delete=True)
        try:
            temp.write(seq)
            temp.flush()
            os.system(
                'quod.py {f} -l {q} -q --width 15 -d {p} -o {q}.png'.format(
                    f=temp.name,
                    p=plot_path,
                    q=row['qacc'] + '_' + row['sacc'] + '_aligned'))
        finally:
            temp.close()
        # insert the blast info
        outfile.write(
            '<br /><hr style="border-style:solid; border-width:5px; color:black;"/><h2 style="text-align:left;">{q}</h2></font><font size = "4"><b>Hit Accession:</b>{s}</font><table width="600px" border="0" cellspacing="0" cellpadding="2"><tr><td class="label"><b>E-value:</b></td><td class="data">{evalue}</td><td class="label"><b>Identity:</b></td><td class="data">{pident}%</td><td class="label"><b>Length:</b></td><td class="data">{length}</td></tr><tr><td class="label"><b>Q_cov:</b></td><td class="data">{qcov}%</td><td class="label"><b>S_cov:</b></td><td class="data">{scov}%</td><td class="label"></td><td class="data"></td></tr></table><p><b>Alignment:</b><tab1>Query:{qstart}-{qend}<tab1>Subject:{sstart}-{send}</p><div class="seq"><pre>{align}</pre></div>'
            .format(q=row['qacc'],
                    s=row['sacc'],
                    length=row['length'],
                    evalue=row['evalue'],
                    pident=row['pident'],
                    align=alignment,
                    qcov=row['qcovs'],
                    scov=row['scovs'],
                    qstart=row['qstart'],
                    qend=row['qend'],
                    sstart=row['sstart'],
                    send=row['send']))
        # insert images to the html file
        outfile.write(
            '<center><table style = "width:100%" border = "0"><tr><td><center><img src = "{p}/{q}.png" style="width:90%; height:90%" /></center></td><td><center><img src = "{p}/{s}.png" style="width:90%; height:90%" /></center></td></tr><tr><td colspan = "2"><center><img src = "{p}/{qs}.png" style="width:50%; height:50%" /></center></td></tr></table></center>'
            .format(p=plot_path,
                    q=row['qacc'] + '_vs_' + row['sacc'],
                    s=row['sacc'] + '_vs_' + row['qacc'],
                    qs=row['qacc'] + '_' + row['sacc'] + '_aligned'))
        # insert the hmm info
        outfile.write(
            '<center><table style = "width:100%" border = "1"><tr><td>Domain</td><td>Domain_acc</td><td>Domain_len</td><td>Protein_acc</td><td>Protein_len</td><td>evalue</td><td>from</td><td>to</td><td>Clan</td><td>Clan_acc</td></tr>'
        )
        # One table row per domain entry of the query, then of the subject;
        # the positional indices refer to the entries built by pfam_to_dic.
        for obj in pfam_dic[row["qacc"]]:
            outfile.write(
                '<tr><td>{domain}</td><td>{dacc}</td><td>{dlen}</td><td>{pacc}</td><td>{plen}</td><td>{evalue}</td><td>{f}</td><td>{t}</td><td>{clan}</td><td>{cacc}</td></tr>'
                .format(domain=obj[6],
                        dacc=obj[2],
                        dlen=obj[7],
                        pacc=obj[8],
                        plen=obj[9],
                        evalue=obj[10],
                        f=obj[0],
                        t=obj[1],
                        clan=obj[11],
                        cacc=obj[3]))
        for obj in pfam_dic[row["sacc"]]:
            outfile.write(
                '<tr><td>{domain}</td><td>{dacc}</td><td>{dlen}</td><td>{pacc}</td><td>{plen}</td><td>{evalue}</td><td>{f}</td><td>{t}</td><td>{clan}</td><td>{cacc}</td></tr>'
                .format(domain=obj[6],
                        dacc=obj[2],
                        dlen=obj[7],
                        pacc=obj[8],
                        plen=obj[9],
                        evalue=obj[10],
                        f=obj[0],
                        t=obj[1],
                        clan=obj[11],
                        cacc=obj[3]))
        outfile.write('</table></center><br>')
    # Eventually create the result.html file
    outfile.write('</body></html>')
    outfile.close()
def readBlast(db, path, compareTo):
    """Read BLAST XML results for one strain and link matching genes.

    The file read is ``<path>/temp/out_<basename of compareTo['path']>.xml``.
    For every BLAST record (one query gene) the HSPs with an e-value of at
    most 1e-6 are mapped to a gene on the hit contig, either through
    ``matchInPV`` or through ``featurefinder.findMatchingFeatures``.  Matches
    hitting the same locus are merged, and a merged match is kept when it
    covers more than ``settings.homologyCutoffTo`` of the hit gene and at
    least ``settings.homologyCutoffFrom`` of the query gene.  Kept matches are
    appended to ``blastMatch[<contig name>]`` of the query gene; when the
    match carries a ``tractNo`` the two genes are also cross-linked through
    their ``links`` sets.

    Parameters:
      - db: strain collection handed to ``findStrain``
      - path: working directory containing ``temp/``
      - compareTo: the strain that was searched (``path`` and ``contigs``)
    """
    outpath = os.path.join(
        path, 'temp/out_' + os.path.basename(compareTo['path']) + ".xml")
    min_evalue = 1e-6
    min_coverage = settings.homologyCutoffTo
    min_query_coverage = settings.homologyCutoffFrom

    # Create a lookup table from contig names to the number of the contigs
    contigLookup = {}
    for i, contig in enumerate(compareTo['contigs']):
        contigLookup[getFastaName(contig)] = i

    with open(outpath) as outHandle:
        # Crawl across all the hits
        for record in NCBIXML.parse(outHandle):
            matches = []
            for alignment in record.alignments:
                # Find the contig of the hit
                contigNo = contigLookup[alignment.hit_def]
                contigHit = compareTo['contigs'][contigNo]
                for hsp in alignment.hsps:
                    # Stop at the first HSP above the cutoff; later HSPs of
                    # this hit are not examined.
                    if hsp.expect > min_evalue:
                        break

                    # find the gene for each hit, using the HSP midpoint
                    location = (hsp.sbjct_start + hsp.sbjct_end) // 2
                    match = matchInPV(location, contigHit)
                    if match:
                        match['expect'] = hsp.expect
                        match['length'] = hsp.identities
                        match['hsp'] = hsp
                        match['contigNo'] = contigNo
                        match['contigName'] = contigHit['record'].description
                        matches.append(match)
                        continue

                    matchRecords = featurefinder.findMatchingFeatures(
                        contigHit['record'], location, ['CDS', 'rRNA', 'gene'])
                    if not matchRecords:
                        continue
                    # store the result
                    match = {
                        'record': matchRecords[0],
                        'expect': hsp.expect,
                        'length': hsp.identities,
                        'hsp': hsp,
                        'contigNo': contigNo,
                        'contigName': contigHit['record'].description
                    }
                    # Match this with an existing gene record if available
                    annotateMatch(match, contigHit)
                    matches.append(match)

            # Now, if we have matches we need to associate them with the
            # right gene in our records
            if not matches:
                continue

            # Group matches to the same gene together in one pass, then
            # screen them out if the TOTAL amount matched is too small.
            matchesByLocus = {}
            for m in matches:
                matchesByLocus.setdefault(m['locus'], []).append(m)

            groupedMatches = []
            for locusMatches in matchesByLocus.values():
                ourMatch = dict(locusMatches[0])
                if len(locusMatches) > 1:
                    ourMatch['multipleHits'] = [
                        duplicate['hsp'] for duplicate in locusMatches]
                    ourMatch['length'] = sum(
                        duplicate['length'] for duplicate in locusMatches)
                    ourMatch['multipleHits'].sort(key=lambda x: x.query_start)
                # NB: geneLength in bp, length in amino acids, hence *3
                ourMatch['coverage'] = (ourMatch['length'] * 3) / ourMatch['geneLength']
                if ourMatch['coverage'] > min_coverage:
                    groupedMatches.append(ourMatch)
            if not groupedMatches:
                continue

            # Get the strain and name out of the record.query; the query gene
            # is the same for every grouped match of this record.
            queryStrainName, queryContigNo, queryTractName = splitQueryName(
                record.query)
            queryContig = findStrain(
                db, queryStrainName)['contigs'][int(queryContigNo)]
            # Default of None: a bare next() raises StopIteration, which made
            # the check below unreachable.
            geneMatch = next((x for x in queryContig['tracts']
                              if x['name'] == queryTractName), None)
            if not geneMatch:
                print("Something has gone wrong, gene '" + record.query +
                      "' not found")
                continue

            for groupedMatch in groupedMatches:
                groupedMatch['queryCoverage'] = (
                    groupedMatch['length'] * 3) / geneMatch['geneLength']
                if groupedMatch['queryCoverage'] < min_query_coverage:
                    continue

                # Create a link from the query match to the subject (hit) match
                contigLinkName = groupedMatch['contigName']
                blastMatch = geneMatch.setdefault('blastMatch', {})
                blastMatch.setdefault(contigLinkName, []).append(groupedMatch)

                # Do we have a link to another PV gene?
                # If so create a bidirectional link
                if 'tractNo' in groupedMatch:
                    otherGene = compareTo['contigs'][groupedMatch[
                        'contigNo']]['tracts'][groupedMatch['tractNo']]
                    geneMatch['links'].add(otherGene['uid'])
                    otherGene['links'].add(geneMatch['uid'])
def summarize_blast_output(blast_out=None,
                           blast_file=None,
                           min_identity=None,
                           expect=None):
    """
    Parse NCBI BLAST XML output and convert to a list of simple summary
    objects. Note that this is very specific to searching the PDB, and returns
    incomplete information (suitable for summarizing in a flat table).

    Exactly one of ``blast_out`` (the XML text) and ``blast_file`` (path to an
    XML file) must be given. Only the first record of the report and only the
    first HSP of every hit are summarized.

    :param blast_out: BLAST XML report as a string, or None
    :param blast_file: path of a BLAST XML report, or None
    :param min_identity: hits whose percent identity is below this value are
      skipped; None disables the filter
    :param expect: not used by this function
    :returns: list of ``blast_hit`` objects, one per retained hit
    :raises Sorry: when the report contains no alignments
    """
    assert ([blast_out, blast_file].count(None) == 1)
    from Bio.Blast import NCBIXML
    import iotbx.pdb.fetch
    if (blast_out is not None):
        blast_in = cStringIO.StringIO(blast_out)
    else:
        assert os.path.isfile(blast_file)
        blast_in = open(blast_file)
    try:
        # next() instead of the Python-2-only generator method .next()
        blast = next(NCBIXML.parse(blast_in))
    finally:
        # The record is fully built at this point, so the input can be closed.
        blast_in.close()
    if (len(blast.alignments) == 0):
        raise Sorry("No matching sequences!")
    results = []
    for i_hit, hit in enumerate(blast.alignments):
        pdb_chain_id = str(hit.accession)
        # hit.accession may only have pdb_id, e.g. 1EMG
        if len(pdb_chain_id.split("_")) > 1:
            pdb_id, chain_id = pdb_chain_id.split("_")
        else:
            pdb_id = pdb_chain_id
            chain_id = None
        hsp = hit.hsps[0]
        assert (hsp.align_length > 0)
        identity = 100 * hsp.identities / hsp.align_length
        if (min_identity is not None) and (identity < min_identity):
            continue
        # XXX this is really appalling, but the NCBI groups together identical
        # sequences in its BLAST output, so I need to parse the accession code
        # strings to extract the individual PDB IDs
        hit_def_fields = hit.hit_def.split("|")
        n_fields = len(hit_def_fields)
        all_ids = []
        for i_field, field in enumerate(hit_def_fields):
            if (field == "pdb") and (i_field < n_fields - 1):
                next_pdb_id = hit_def_fields[i_field + 1]
                next_chain_id = None
                # The chain field is optional: guard the look-ahead so that a
                # "pdb" tag in the second-to-last position cannot raise
                # IndexError.
                if (i_field + 2 < n_fields) and (
                        "Chain" in hit_def_fields[i_field + 2]):
                    next_chain_id = hit_def_fields[i_field + 2].split()[0]
                if (iotbx.pdb.fetch.looks_like_pdb_id(next_pdb_id)):
                    all_ids.append([next_pdb_id, next_chain_id])
        summary = blast_hit(hit_num=i_hit + 1,
                            pdb_id=pdb_id,
                            chain_id=chain_id,
                            evalue=hsp.expect,
                            length=hsp.align_length,
                            identity=identity,
                            positives=100 * hsp.positives / hsp.align_length,
                            hsp=hsp,
                            all_ids=all_ids)
        results.append(summary)
    return results
def search_and_process2(rpsblast, cdd_name, tmp_dir, evalue, translation_id,
                        translation):
    """
    Search one translation against a CDD with rpsblast and collect the hits.

    The query is written to ``<tmp_dir>/<translation_id>.txt`` and the XML
    report is read back from ``<tmp_dir>/<translation_id>.xml``. Every HSP
    whose e-value does not exceed ``evalue`` contributes two consecutive
    entries to the result: a formatted ``INSERT_INTO_DOMAIN`` statement and a
    dict describing the hit.

    :param rpsblast: path to rpsblast binary
    :param cdd_name: CDD database path/name
    :param tmp_dir: path to directory where I/O will take place
    :param evalue: evalue cutoff for rpsblast
    :param translation_id: unique identifier for the translation sequence
    :param translation: protein sequence for gene to query
    :return: list alternating domain INSERT statements and hit dicts
    """
    # Query and report share the translation id as their file stem
    query_path = "{}/{}.txt".format(tmp_dir, translation_id)
    report_path = "{}/{}.xml".format(tmp_dir, translation_id)

    with open(query_path, "w") as query_file:
        query_file.write(">{}\n{}".format(translation_id, translation))

    # Build the command line and run it immediately
    NcbirpsblastCommandline(cmd=rpsblast,
                            db=cdd_name,
                            query=query_path,
                            out=report_path,
                            outfmt=5,
                            evalue=evalue)()

    results = []
    with open(report_path, "r") as report:
        for record in NCBIXML.parse(report):
            if not record.alignments:
                continue
            for align in record.alignments:
                for hsp in align.hsps:
                    if not hsp.expect <= evalue:
                        continue

                    # hit_def is "<id>, <name>, <description...>" with the
                    # leading parts optional; double quotes are swapped for
                    # single quotes before splitting.
                    align.hit_def = align.hit_def.replace("\"", "\'")
                    parts = align.hit_def.split(",")
                    if len(parts) == 1:
                        domain_id = None
                        name = None
                        description = parts[0].strip()
                    elif len(parts) == 2:
                        domain_id = parts[0].strip()
                        name = None
                        description = parts[1].strip()
                    else:
                        domain_id = parts[0].strip()
                        # Name is occasionally longer than permitted in the
                        # database. Truncating avoids a MySQL error.
                        # TODO perhaps the database schema should be changed
                        # to account for this.
                        name = basic.truncate_value(parts[1].strip(), 25,
                                                    "...")
                        description = ",".join(parts[2:]).strip()

                    # Statement that tries to put the domain into the domain
                    # table
                    results.append(
                        INSERT_INTO_DOMAIN.format(align.hit_id, domain_id,
                                                  name, description))

                    # Data for the gene_domain table
                    results.append({
                        "Translation": translation,
                        "HitID": align.hit_id,
                        "Expect": float(hsp.expect),
                        "QueryStart": int(hsp.query_start),
                        "QueryEnd": int(hsp.query_end)
                    })
    return results
def Serotype_BLAST(ProbeID, ProbeSEQ, ProbeScore, ResultTableName,
                   ProbeMFIThreshold):
    """BLAST one probe against the FMD consensus database and store the result.

    The probe is written to ``ProbeBlastSeq.fasta``, searched with
    ``BLASTN_v29template`` and, for at most 1000 HSPs, one row per aligned
    query position is inserted into the table ``RESULTS_<ResultTableName>``
    (columns Position, Probe_ID, Nuc_Score, Nucleotide).

    Args:
        ProbeID: Probe identifier; normalized with ``str(int(float(...)))``.
        ProbeSEQ: Probe nucleotide sequence.
        ProbeScore: Probe score; divided by the length of the aligned subject
            string to give the per-nucleotide score.
        ResultTableName: Suffix of the result table. It is concatenated into
            the statement (identifiers cannot be bound as parameters), so it
            must come from a trusted source.
        ProbeMFIThreshold: Not used by this function.

    Returns:
        None. Rows are committed once, after all HSPs were processed.
    """
    # SeqIO and NCBIStandalone were imported here but never used;
    # NCBIStandalone no longer exists in current Biopython releases.
    from Bio.Blast import NCBIXML
    import MySQLdb

    work_dir = "/users/rwbarrettemac/bioinformatics/pythonfolders/FMDanalysisScript/FMDserotypingARRAY/"
    probe_fasta = work_dir + "ProbeBlastSeq.fasta"
    blast_xml = work_dir + "CurrentFMDBlast.xml"
    Template_DB = work_dir + "FMD_Selected_Template/FMD_FinalConsensusDB/FMDFinalConsensusDB"

    PROBEID = str(int(float(ProbeID)))
    PROBESEQ = ProbeSEQ
    MaxAlignCNT = 1000

    # Values are bound by the driver instead of being pasted into the SQL
    # text; they are still passed as strings, as before.
    insert_sql = ("INSERT INTO RESULTS_" + ResultTableName +
                  " (Position, Probe_ID, Nuc_Score, Nucleotide)"
                  " VALUES (%s, %s, %s, %s)")

    conn = MySQLdb.connect(host=HOSTlocal, user=USER, passwd=PASS, db=DB)
    try:
        cursor = conn.cursor()
        try:
            Save_fasta(probe_fasta, PROBEID, PROBESEQ)
            BLASTN_v29template(blast_xml, probe_fasta, Template_DB)

            with open(blast_xml, "r") as result_handle:
                hsps = (hsp
                        for blast_record in NCBIXML.parse(result_handle)
                        for alignment in blast_record.alignments
                        for hsp in alignment.hsps)
                for AlignCNT, hsp in enumerate(hsps):
                    # Only the first MaxAlignCNT HSPs are stored
                    if AlignCNT >= MaxAlignCNT:
                        break
                    # Score per nucleotide, kept to the first 10 characters
                    NUC_SCORE = str(float(ProbeScore) / len(hsp.sbjct))[0:10]
                    rows = [(str(hsp.sbjct_start + NT), PROBEID, NUC_SCORE,
                             str(hsp.query[NT:NT + 1]))
                            for NT in range(len(hsp.query))]
                    if rows:
                        cursor.executemany(insert_sql, rows)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
output.close()
# Rebuild the protein BLAST database from dups_removed.fasta, then reopen the
# FASTA file for appending.
os.system("makeblastdb -in dups_removed.fasta -dbtype prot -out " + temp_dir +
          "/blast")
output = open("dups_removed.fasta", "a")

# Search the sequence stored in <temp_dir>/seq_file against that database
blast_cline = NcbiblastpCommandline(db=temp_dir + "/blast",
                                    query=temp_dir + "/seq_file",
                                    outfmt=5)
blast_result = blast_cline()

# Write the XML report to disk and rewind, so it can be parsed from the same
# handle.
xml_file = open(temp_dir + "/xml_file", "w+")
xml_file.write(blast_result[0])
xml_file.seek(0, 0)
blast_iterator = NCBIXML.parse(xml_file)
# NCBIXML.parse returns an iterator; the first record is obtained with
# next(), the iterator has no method named i().
record = next(blast_iterator)
alignments = record.alignments[:]
for alignment in alignments:
    hsps = alignment.hsps[0]
    alignment_title = alignment.title.split(" ")
    # Skip the hit whose title carries the id of the sequence itself
    if i.id == alignment_title[1]:
        continue
    # NOTE(review): identities are divided by alignment.length (length of the
    # hit sequence), not by the aligned length - confirm this is intended.
    percent_ident = float(hsps.identities) / float(alignment.length)
    if percent_ident > 0.95:
        print("Deleted!\n")
        break
def _getTopFromBlast(blastXML,
                     TF,
                     top=0,
                     exContaminSpecies=True,
                     outfile=None,
                     newHeader=True):
    '''
    Parses Blast result XML files and writes the best or all results with less information in a tsv file.

    One line is written per HSP and per entry returned by
    `TF.getInfoFromHitDef` for the hit. The last column holds the second
    `|`-separated field of the query title, or the whole title when it has no
    second field.

    :param blastXML: The filename of the Blast output (must be output type 5)
    :param TF: An instance of the TaxFinder class
    :param top: Write only the best `top` hits to file. If `top` is 0 (or negative), all hits are saved.
    :param exContaminSpecies: Shall hits of known contaminated species be excluded?
    :param outfile: The file to write the results to (including path). If it is None, use the basename of `blastXML`
    :param newHeader: Were the Blast results produced with new headers (database from 2016 and newer)?
    :creates: `resulttables/FILE.tsv`
    '''

    contaminantSpecies = {
        118797, 59538, 7213
    }  # Lipotes vexillifer, Pantholops hodgsonii, Ceratitis capitata

    if outfile is None:
        outfile = 'resulttables/{}.tsv'.format(_myGetBasename(blastXML))

    if top < 0:
        top = 0

    header = ('Tax-ID', 'Acc', 'Species', 'Rank', 'e-value', 'Length',
              'Lineage', 'Prot-Name', 'Query-Protein')

    with open(blastXML, 'r') as f, open(outfile, 'w') as out:
        out.write('\t'.join(header) + '\n')

        for record in NCBIXML.parse(f):
            queryFields = record.query.split('|')
            queryName = queryFields[1] if len(queryFields) > 1 else record.query

            for i, alignment in enumerate(record.alignments):
                # i is zero-based: stop once `top` hits have been written
                # (`i > top` let one hit too many through).
                if top and i >= top:
                    break

                infos = TF.getInfoFromHitDef(alignment.hit_id,
                                             alignment.hit_def,
                                             newHeader=newHeader)

                for info in infos:
                    if exContaminSpecies and info['taxid'] in contaminantSpecies:
                        continue

                    lineage = ', '.join(
                        TF.getLineage(info['taxid'], display='name'))

                    for hsp in alignment.hsps:
                        line = '\t'.join(
                            (str(info['taxid']), info['acc'], info['name'],
                             info['rank'], str(hsp.expect),
                             str(hsp.align_length), lineage, info['protname'],
                             queryName))
                        out.write(line + '\n')
def run_qblast(self, program, database, query, e_value, entrez_filter,
               additional_args, expected_hits):
    """Run one online BLAST search and check the parsed result.

    Args:
        program: BLAST program name passed to ``NCBIWWW.qblast``.
        database: Database name passed to ``NCBIWWW.qblast``.
        query: Plain sequence, FASTA record text, or identifier.
        e_value: Expectation cut-off; must be echoed back by the report.
        entrez_filter: Value for the ``entrez_query`` argument.
        additional_args: Dict of extra keyword arguments for ``qblast``.
        expected_hits: Identifiers of which at least one must be present in
            the alignments and in the descriptions, or None when no hit at
            all is expected.

    Raises:
        MissingExternalDependencyError: when the request fails with HTTPError.
    """
    try:
        # The same call is used for every program; extra options (e.g. the
        # megablast parameter for blastn) arrive through additional_args.
        handle = NCBIWWW.qblast(program,
                                database,
                                query,
                                alignments=10,
                                descriptions=10,
                                hitlist_size=10,
                                entrez_query=entrez_filter,
                                expect=e_value,
                                **additional_args)
    except HTTPError:
        # e.g. a proxy error
        raise MissingExternalDependencyError("internet connection failed")

    try:
        record = NCBIXML.read(handle)
    finally:
        handle.close()

    if record.query == "No definition line":
        # We used a sequence as the query
        self.assertEqual(len(query), record.query_letters)
    elif query.startswith(">"):
        # We used a FASTA record as the query
        expected = query[1:].split("\n", 1)[0]
        self.assertEqual(expected, record.query)
    elif record.query_id.startswith("Query_") and len(
            query) == record.query_letters:
        # We used a sequence as the entry and it was given a placeholder name
        pass
    else:
        # We used an identifier as the query
        self.assertIn(
            query, record.query_id.split("|"),
            "Expected %r within query_id %r" % (query, record.query_id))

    # Check the recorded input parameters agree with those requested
    self.assertEqual(float(record.expect), e_value)
    self.assertEqual(record.application.lower(), program)
    self.assertTrue(len(record.alignments) <= 10)
    self.assertTrue(len(record.descriptions) <= 10)

    # Check the expected result(s) are found in the alignments
    if expected_hits is None:
        self.assertEqual(len(record.alignments), 0)  # Expected no alignments!
    else:
        self.assertTrue(len(record.alignments) > 0)  # Expected some alignments!
        found_result = any(
            expected_hit in alignment.hit_id.split("|")
            for expected_hit in expected_hits
            for alignment in record.alignments)
        if len(expected_hits) == 1:
            print("Update this test to have some redundancy...")
            for alignment in record.alignments:
                print(alignment.hit_id)
        self.assertTrue(
            found_result,
            "Missing all expected hits (%s), instead have: %s" %
            (", ".join(expected_hits), ", ".join(
                a.hit_id for a in record.alignments)))

    # Check the expected result(s) are found in the descriptions
    if expected_hits is None:
        self.assertEqual(len(record.descriptions),
                         0)  # Expected no descriptions!
    else:
        self.assertTrue(
            len(record.descriptions) > 0)  # Expected some descriptions!
        found_result = any(
            expected_hit == descr.accession
            or expected_hit in descr.title.split(None, 1)[0].split("|")
            for expected_hit in expected_hits
            for descr in record.descriptions)
        # assertTrue instead of a bare assert: it is not stripped under -O,
        # and the message no longer depends on the loop variable (which is
        # undefined when expected_hits is empty).
        self.assertTrue(
            found_result,
            "Missing all of %s in descriptions" % ", ".join(expected_hits))
get_fasta.close() #BLAST output blastp_cline = NcbiblastpCommandline(query=filename, db="nr", evalue=0.001, outfmt=5, out=filename[:-5] + ".xml", num_alignments=1, entrez_query="human[Organism]", remote=True) stdout, stderr = blastp_cline() #Parse XML output file blastp_cline = open(filename[:-5] + ".xml") blast_record = NCBIXML.read(blastp_cline) if len(blast_record.alignments) > 0: for alignment in blast_record.alignments: hsp = alignment.hsps[0] ident = float(hsp.identities) / hsp.align_length accession = alignment.accession accession_numbers.append(accession) homologies.append(ident) time.sleep(1) else: query_accession_numbers.append("No homologs") accession_numbers.append("No homologs") homologies.append("No homologs") time.sleep(1) continue else:
if tmp[1][0]<=tmp[0][1]: return True elif tmp[1][0]-tmp[0][1]<19: return True else: return False def cmpToList(mTuple,mList): for i in range(len(mList)): if isOverlap(mTuple,mList[i]): return i return None blast_results = NCBIXML.parse(open("/home/wenlei/LUO/ZF/blast/bac_results.xml","r")) for result in blast_results: for alignment in result.alignments: print alignment.hit_def mList = [] for hsp in alignment.hsps: mTuple = (hsp.query_start,hsp.query_end) mList.append(mTuple) mList.sort() print mList tmp_list = [mList[0]] for i in mList: mark = cmpToList(i,tmp_list) if mark != None:
'xbt006.xml', ] for test in detailed_tests: assert test in all_tests ### NCBIXML.BlastParser print "Running tests on NCBIXML.BlastParser" for test in all_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) input = open(datafile) records = NCBIXML.parse(input) for record in records: alignments = record.alignments if not alignments: print '%s - no hits' % record.query_id continue print '%s - %i alignments with a total of %i HSPs' \ % (record.query_id, len(alignments), reduce(lambda a,b: a+b, [len(a.hsps) for a in alignments])) if not test in detailed_tests: continue E_VALUE_THRESH = 10**-10
# BLAST the first record of m_cold.fasta against nr and print the good hits.
f_record = next(SeqIO.parse("m_cold.fasta", "fasta"))

print("Doing the BLAST and retrieving the results...")
result_handle = NCBIWWW.qblast("blastn", "nr", f_record.format("fasta"))

# Download the complete report first and release the handle, so that the
# output file is only created once the search has succeeded.
try:
    blast_results = result_handle.read()
finally:
    result_handle.close()

# save the results for later, in case we want to look at it
with open("m_cold_blast.out", "w") as save_file:
    save_file.write(blast_results)

print("Parsing the results and extracting info...")

# option 1 -- open the saved file to parse it
# option 2 -- create a handle from the string and parse it
string_result_handle = StringIO(blast_results)
b_record = NCBIXML.read(string_result_handle)

# now get the alignment info for all e values below the threshold
E_VALUE_THRESH = 0.1
for alignment in b_record.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print("****Alignment****")
            print("sequence: %s" % alignment.title)
            print("length: %i" % alignment.length)
            print("e value: %f" % hsp.expect)
            print(hsp.query[0:75] + "...")
            print(hsp.match[0:75] + "...")
            print(hsp.sbjct[0:75] + "...")
# BLAST program behind each program choice handled by out().
_OUT_PROGRAMS = {
    'BLASTn': 'blastn',
    'BLASTp': 'blastp',
    'tBLASTn': 'tblastn',
    'BLASTx': 'blastx',
    'tBLASTx': 'tblastx',
}

# NCBI database name behind each database label handled by out().
_OUT_DATABASES = {
    'Nucleotide collection(DNA)': 'nt',
    'NCBI Transcript Ref_Seq(DNA)': 'refseq_rna',
    'PDB nucleotide database(DNA)': 'pdbnt',
    'Non-redundant(Protein)': 'nr',
    'NCBI Protein Ref_Seq(Protein)': 'refseq_protein',
    'Non-redundant UniProtKB/SwissProt(Protein)': 'swissprot',
    'Expressed Sequences tags(DNA)': 'est',
    'RefSeq Representative Genome Database(DNA)':
    'refseq_representative_genomes',
}

# Labels that out() refuses (by calling err()) for BLASTn when aligns == "0".
_OUT_PROTEIN_LABELS = (
    'Non-redundant(Protein)',
    'NCBI Protein Ref_Seq(Protein)',
    'Non-redundant UniProtKB/SwissProt(Protein)',
)

# (aligns, chos) -> (width of the query/sbjct excerpt, width of the match
# excerpt). A match width of None means "as long as the aligned query".
# NOTE(review): the widths differ between modes exactly as they did in the
# previous per-branch code; whether the differences are intended is unclear.
_OUT_VIEW_WIDTHS = {
    ("0", 'BLASTn'): (100, 100),
    ("0", 'BLASTp'): (100, None),
    ("0", 'tBLASTn'): (100, None),
    ("0", 'BLASTx'): (100, None),
    ("0", 'tBLASTx'): (100, 100),
    ("1", 'BLASTn'): (100, 100),
    ("1", 'BLASTp'): (150, 100),
    ("1", 'tBLASTn'): (150, 150),
    ("1", 'BLASTx'): (150, 100),
    ("1", 'tBLASTx'): (150, 100),
}


def out():
    """Run the selected online BLAST search, save the hits and display them.

    Reads the module-level selections ``aligns``, ``chos`` (program) and
    ``db`` (database label), the FASTA path from ``e1`` and the output path
    from ``e2``. Every HSP is printed, written to ``lines.txt`` and copied to
    the output file; finally the content of ``lines.txt`` is shown in a new
    Tk window.

    When ``(aligns, chos)`` is not a known combination no search is run and
    the window shows the (empty) ``lines.txt``. For BLASTn with
    ``aligns == "0"`` and a protein database, ``err()`` is called and the
    function returns.

    Raises:
        ValueError: when ``db`` is not one of the known database labels.
    """
    lines_path = "lines.txt"
    widths = _OUT_VIEW_WIDTHS.get((aligns, chos))

    # lines.txt is always (re)created, even when no search is run
    with open(lines_path, "w") as lines_file:
        if widths is not None:
            record = SeqIO.read(str(e1.get()), "fasta")
            ser = record.seq
            print(ser)

            if db not in _OUT_DATABASES:
                raise ValueError("unknown BLAST database: %r" % (db,))
            if aligns == "0" and chos == 'BLASTn' and db in _OUT_PROTEIN_LABELS:
                # Stop here: continuing would use a search result that was
                # never requested.
                err()
                return

            result_handle = NCBIWWW.qblast(_OUT_PROGRAMS[chos],
                                           _OUT_DATABASES[db], ser)

            if aligns == "0" and chos == 'tBLASTn':
                # This mode also keeps the raw XML report on disk; the
                # record is then parsed from the saved file.
                with open("Blast Result.xml", "w") as out_handle:
                    out_handle.write(result_handle.read())
                result_handle.close()
                with open("Blast Result.xml") as xml_handle:
                    blast_record = NCBIXML.read(xml_handle)
            else:
                blast_record = NCBIXML.read(result_handle)

            seq_width, match_width = widths
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    print("****Alignment****")
                    print("sequence:", alignment.title)
                    print("length:", alignment.length)
                    print("e value:", hsp.expect)
                    print(hsp.query[0:75] + "...")
                    print(hsp.sbjct[0:75] + "...")
                    if match_width is None:
                        match_end = len(hsp.query)
                    else:
                        match_end = match_width
                    view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                        alignment.title, alignment.length, hsp.expect,
                        hsp.query[0:seq_width] + "...",
                        hsp.match[0:match_end] + "...",
                        hsp.sbjct[0:seq_width] + "...")
                    # The text widget does not exist yet at this point; it is
                    # filled from lines.txt once the window is built below.
                    lines_file.write(view)

    if widths is not None:
        filename = e2.get()
        target_dir = os.path.dirname(filename)
        # A bare file name has no directory part; os.makedirs("") would fail
        if target_dir and not os.path.exists(target_dir):
            try:
                os.makedirs(target_dir)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(lines_path) as reads, open(filename, "w") as target:
            for line in reads:
                target.write(line)

    # Result window
    root = Tk()
    S = Scrollbar(root)
    with open(lines_path, "r") as reads:
        dis = reads.read()
    T = Text(root, height=50, width=500)
    S.pack(side=RIGHT, fill=Y)
    T.pack(side=LEFT, fill=Y)
    # The single scrollbar is packed on the right with fill=Y, so it is wired
    # to vertical scrolling only (a second command=T.xview would replace the
    # first and leave the text without vertical scrolling).
    S.config(command=T.yview)
    T.config(yscrollcommand=S.set)
    T.insert(END, dis)
    mainloop()
def getBlastHits(self):
    """Count near-perfect BLAST hits of the handle sequence.

    ``self.sequence`` is written to ``tmp.fa`` in the working directory and
    searched with ``blastn`` against a local nt database (the remote NCBI
    branch is kept but disabled by the hard-coded ``local`` flag).  Every
    HSP with at least 90 % identity that also spans at least 90 % of the
    query counts as one hit; the total is stored in ``self.blastHits``.

    Side effects: replaces ``sys.stdout`` with an ``Unbuffered`` wrapper and
    leaves ``tmp.fa`` behind.
    """
    import sys

    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML

    sys.stdout = Unbuffered(sys.stdout)

    local = True
    if local:
        from Bio.Blast.Applications import NcbiblastnCommandline

        # localdb='/sw/data/uppnex/blast_databases/nt'
        database = '/Users/erikborgstrom/localBioInfo/BLASTnt/nt'
        blastsetting = 'strict'

        # The query has to exist on disk for the blastn command line.
        query_path = 'tmp.fa'
        with open(query_path, 'w') as infile:
            infile.write('>tmp\n' + self.sequence + '\n')

        if blastsetting == 'strict':
            cline = NcbiblastnCommandline(
                query=query_path, db=database, evalue=0.001, outfmt=5)
        elif blastsetting == 'sloppy':
            # NOTE(review): the first command line built here is replaced
            # immediately by the second one -- confirm which is intended.
            cline = NcbiblastnCommandline(
                query=query_path, db=database, evalue=0.001, outfmt=5,
                dust='no', perc_identity=80, task='blastn')
            cline = NcbiblastnCommandline(
                cmd='blastn', outfmt=5, query=query_path, db=database,
                gapopen=5, gapextend=2, culling_limit=2)
        print(str(cline))

        # Calling the command line runs blastn and returns (stdout, stderr);
        # the XML report is on stdout because no ``out`` file was given.
        blast_stdout, _blast_stderr = cline()
        blast_handle = StringIO(blast_stdout)
    else:
        sys.stdout.write('getting blast hits for handle#' + str(self.id) + '\n')
        result_handle = NCBIWWW.qblast(
            "blastn", "nr", '>tmp\n' + self.sequence, format_type='XML')
        sys.stdout.write('start parsing blast for handle#' + str(self.id) + '\n')
        blast_handle = StringIO(result_handle.read())

    hits = 0
    for blast_record in NCBIXML.parse(blast_handle):
        query_length = float(blast_record.query_letters)
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                perc_identity = float(hsp.identities) / float(hsp.align_length) * 100
                perc_coverage = float(hsp.align_length) / query_length * 100
                if perc_identity >= 90 and perc_coverage >= 90:
                    hits += 1
    self.blastHits = hits
def main(argv):
    """Write the query sequences that have no BLAST hit in the subject.

    Options parsed from *argv* (getopt style):

    * ``-q`` / ``--query FILE``   -- FASTA file to screen (required)
    * ``-s`` / ``--subject FILE`` -- FASTA file used as BLAST database (required)
    * ``-o`` / ``--output FILE``  -- FASTA file receiving the unique records (required)
    * ``-x``                      -- do not build the BLAST database first
    * ``-m`` / ``--mute``         -- suppress the progress messages
    * ``-h``                      -- print the usage string and exit

    The query is split into chunks of 10000 records under ``chunks/``; each
    chunk is searched with ``blastn`` and every query having an alignment
    longer than 100 is treated as "found".  All remaining query records are
    written to the output file in query order.

    Exits with status 2 when a required option is missing; a malformed
    option list raises ``getopt.GetoptError``.
    """
    query = ''
    subject = ''
    output = ''
    build_DB = True
    verbal = True
    usage = ('seq_uniq_seek.py -q <queryfile>.fasta -s <subjectfile>.fasta '
             '-o <outputfile> [-x (do not build database)] [-m (mute)]')

    # The long spellings are registered here because the dispatch below
    # already accepts them.
    opts, _ = getopt.getopt(argv, "xmhq:s:o:",
                            ["subject=", "query=", "output=", "mute"])
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt == '-x':
            build_DB = False
        elif opt in ("-q", "--query"):
            query = arg
        elif opt in ("-m", "--mute"):
            verbal = False
        elif opt in ("-s", "--subject"):
            subject = arg
        elif opt in ("-o", "--output"):
            output = arg

    # All three file arguments are needed; optional flags must not be able
    # to stand in for one of them.
    if not (query and subject and output):
        print(usage)
        sys.exit(2)

    if verbal:
        print(
            "\n ---- ==== SEEK UNIQ SEQ ==== ---- \nFinding sequences occuring in "
            + query + " that are not occuring in " + subject
            + " and saving in " + output + ".fasta\n")

    if build_DB:
        if verbal:
            print("Building blast database for subject file (" + subject + ")")
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            input_file=subject,
                                            dbtype='nucl',
                                            parse_seqids=True)
        makedb()
        if verbal:
            print("Done.\n")
    elif verbal:
        print("Not building database. Hoping for the best")

    if verbal:
        print("Blasting query (" + query + ") against subject database ("
              + subject + ")")
        print("Splitting query into multiple files to save memory.")

    shutil.rmtree("chunks", ignore_errors=True)
    os.mkdir("chunks")
    with open(query) as query_handle:
        record_iter = SeqIO.parse(query_handle, "fasta")
        for i, batch in enumerate(batch_iterator(record_iter, 10000)):
            chunk_name = "chunks/chunk_%i.fasta" % (i + 1)
            with open(chunk_name, "w") as handle:
                SeqIO.write(batch, handle, "fasta")

    if verbal:
        print("Building query index dictionary")
    q_dict = SeqIO.index(query, "fasta")

    hits = set()  # ids of query records with at least one long alignment
    chunks = glob.glob('chunks/chunk*')
    for i, chunk in enumerate(chunks):
        dt_string = datetime.now().strftime("%d-%m_%H:%M:%S")
        print("[xenoseq_blast " + dt_string + "] So anyway... I'm busy blasting... "
              + str(round(i / len(chunks) * 100, 2)) + "%")
        blastn_cline = NcbiblastnCommandline(cmd='blastn',
                                             query=chunk,
                                             db=subject,
                                             num_threads=8,
                                             evalue=1e-5,
                                             perc_identity=90,
                                             outfmt=5,
                                             out="reads_all_vs_all.xml")
        blastn_cline()
        # Bit below is from: https://biopython.org/wiki/Retrieve_nonmatching_blast_queries
        with open("reads_all_vs_all.xml") as xml_handle:
            for record in NCBIXML.parse(xml_handle):
                if any(alignment.length > 100 for alignment in record.alignments):
                    hits.add(record.query.split()[0])
        os.remove("reads_all_vs_all.xml")
    shutil.rmtree("chunks")

    if verbal:
        print("Subtracting hits from query dict keys")
    # Walk the index instead of a set difference so the output keeps the
    # order of the query file.
    orphans = [q_dict[name] for name in q_dict.keys() if name not in hits]
    if verbal:
        print("%i out of %i records in query are unique" %
              (len(orphans), len(q_dict)))
        print("Writing to file %s" % (output))
    SeqIO.write(orphans, output, 'fasta')
    if verbal:
        print("Done. Hoping for the best.\n")
# Report, for every query in a BLAST XML file (first argument, or stdin when
# no argument is given), the first alignment whose top HSP has non-negative
# query and subject frames.
import sys

from Bio.Blast import NCBIXML

if len(sys.argv) > 1:
    # Plain 'r': the old 'rU' mode is rejected by current Python versions.
    blast_input = open(sys.argv[1], 'r')
else:
    blast_input = sys.stdin

try:
    blast = NCBIXML.parse(blast_input)
    for record in blast:
        for align in record.alignments:
            top_hsp = align.hsps[0]
            if (top_hsp.frame[0] >= 0) and (top_hsp.frame[1] >= 0):
                # Fields are separated by " \t ", as the original
                # comma-separated print statement produced.
                print("%s \t %s \t %s" % (record.query, align.title, top_hsp.expect))
                break
finally:
    if blast_input is not sys.stdin:
        blast_input.close()
def runMetrics(self):
    """Compute assembly metrics from the two BLAST XML reports.

    First pass (``self.blast_output_rf_as``): every record is scored with
    ``self.identifiedMetrics``.  Records flagged as contiguous
    (``result[3] == 1``) are listed in ``contiguity.txt``; otherwise the
    fragment count ``result[4]`` selects ``fragmentation_1.txt`` ...
    ``fragmentation_5.txt`` (the last one collects counts of 5 and more).

    Second pass (``self.blast_output_as_rf``): every record is classified
    with ``self.chimerismNonMatch`` and listed in ``chimerism.txt`` or
    ``non_match.txt``.

    Each listing row holds the transcript id followed by the first three
    entries of ``self.isoforms[id]`` under the header
    ``ID / Length / FPKM / Expected counts``.

    Returns:
        A pair ``(result_completenessCont, result_chimerism_ord)``:

        * ``(identified, hits_ref_sum, completeness, cov_sum, contiguity,
          cont_sum, fragmented, frag_sum, assem_trpts_used,
          assem_trpts_used_frag, accuracy, [fragmented_1, frag_sum_1, ...,
          fragmented_5, frag_sum_5])``
        * ``(perc_chimaeras, chimaeras, perc_no_hits, no_hits)``

    Raises:
        ZeroDivisionError: if no reference hit was counted, nothing was
            aligned, or ``self.ref`` / ``self.express`` is zero.
    """
    header = "ID\tLength\tFPKM\tExpected counts\n"

    def write_row(handle, transcript_id):
        # One listing row: id plus the first three isoform fields.
        iso = self.isoforms[transcript_id]
        handle.write("%s\t%s\t%s\t%s\n" %
                     (transcript_id, str(iso[0]), str(iso[1]), str(iso[2])))

    cov_sum = 0.0
    align_length_sum = 0.0
    corr_bases_sum = 0.0
    hits_ref_sum = 0.0
    cont_sum = 0.0
    frag_sum = 0.0
    frag_sums = [0.0] * 5  # per class: 1, 2, 3, 4 and >= 5 fragments
    assem_trpts_used = 0.0
    assem_trpts_used_frag = 0.0
    self.check_trpts = {}

    # Context managers close every listing even if a record fails midway.
    with open("contiguity.txt", "w") as cont, \
            open("fragmentation_1.txt", "w") as frag1, \
            open("fragmentation_2.txt", "w") as frag2, \
            open("fragmentation_3.txt", "w") as frag3, \
            open("fragmentation_4.txt", "w") as frag4, \
            open("fragmentation_5.txt", "w") as frag5:
        frag_files = (frag1, frag2, frag3, frag4, frag5)
        for listing in (cont,) + frag_files:
            listing.write(header)

        with open(self.blast_output_rf_as) as handle_rf_as:
            for blast_record in NCBIXML.parse(handle_rf_as):
                result_iter = self.identifiedMetrics(blast_record)
                cov_sum += result_iter[0]
                align_length_sum += result_iter[1]
                corr_bases_sum += result_iter[2]

                if result_iter[3] == 1:
                    hits_ref_sum += 1
                    cont_sum += 1
                    assem_trpts_used += 1
                    write_row(cont, result_iter[5][0])
                    continue

                n_fragments = result_iter[4]
                if n_fragments in (1, 2, 3, 4):
                    frag_class = int(n_fragments) - 1
                elif n_fragments >= 5:
                    frag_class = 4
                else:
                    # Neither contiguous nor fragmented: only the running
                    # sums above are affected.
                    continue

                hits_ref_sum += 1
                frag_sum += 1
                frag_sums[frag_class] += 1
                assem_trpts_used += n_fragments
                assem_trpts_used_frag += n_fragments
                for transcript_id in result_iter[5]:
                    write_row(frag_files[frag_class], transcript_id)

    identified = 100 * (hits_ref_sum / self.ref)
    completeness = 100 * (cov_sum / hits_ref_sum)
    contiguity = 100 * (cont_sum / hits_ref_sum)
    fragmented = 100 * (frag_sum / hits_ref_sum)
    accuracy = 100 * (corr_bases_sum / align_length_sum)

    # Flat list alternating percentage and raw count for each class.
    per_class = []
    for class_sum in frag_sums:
        per_class.append(100 * (class_sum / hits_ref_sum))
        per_class.append(class_sum)

    result_completenessCont = (identified, hits_ref_sum, completeness, cov_sum,
                               contiguity, cont_sum, fragmented, frag_sum,
                               assem_trpts_used, assem_trpts_used_frag,
                               accuracy, per_class)

    chimaeras = 0.0
    no_hits = 0.0
    with open(self.blast_output_as_rf) as handle_as_rf, \
            open("non_match.txt", "w") as non_mat, \
            open("chimerism.txt", "w") as chim:
        non_mat.write(header)
        chim.write(header)
        for blast_record in NCBIXML.parse(handle_as_rf):
            result_iter = self.chimerismNonMatch(blast_record)
            if result_iter[0] == 1:
                chimaeras += 1
                listing = chim
            elif result_iter[1] == 1:
                no_hits += 1
                listing = non_mat
            else:
                continue
            # The transcript id is the first word of the query title.
            write_row(listing, str(blast_record.query).split(" ")[0])

    perc_chimaeras = (chimaeras / float(self.express)) * 100
    perc_no_hits = (no_hits / float(self.express)) * 100
    result_chimerism_ord = perc_chimaeras, chimaeras, perc_no_hits, no_hits
    return result_completenessCont, result_chimerism_ord
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML into ``SeqRecord`` objects carrying match features.

    One record is yielded per hit.  Its single top-level ``match`` feature
    spans the query range reported by ``check_bounds`` over all HSPs of the
    hit and carries one ``match_part`` sub-feature per HSP, sorted by start.

    Args:
        blastxml: handle of a BLAST XML report, passed to ``NCBIXML.parse``.
        include_seq: when true, the query/subject/match strings of all HSPs
            of a hit are concatenated into the hit qualifiers
            ``blast_qseq`` / ``blast_sseq`` / ``blast_mseq``.

    Yields:
        ``SeqRecord`` objects with a placeholder sequence, whose id is the
        query title (cut at the first space and cleaned when it has one).
    """
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # Only the generic SO term is emitted; application-specific terms
        # (nucleotide_match / protein_match) are not used here.
        match_type = "match"
        collected_records = []

        recid = record.query
        if " " in recid:
            recid = clean_string(recid[0:recid.index(" ")])

        for idx_hit, hit in enumerate(record.alignments):
            rec = SeqRecord(Seq("ACTG"), id=recid)
            # All HSPs of the hit have to be inspected to find its bounds.
            parent_match_start = 0
            parent_match_end = 0
            desc = hit.title.split(" >")[0]
            hit_qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": clean_string(hit.hit_id),
                "score": None,
                "length": hit.length,
                "hit_titles": clean_slist(hit.title.split(" >")),
                "hsp_count": len(hit.hsps),
                "Name": desc,
            }
            sub_features = []

            for idx_hsp, hsp in enumerate(hit.hsps):
                if idx_hsp == 0:
                    # Seed bounds and score from the first HSP.
                    parent_match_start = hsp.query_start
                    parent_match_end = hsp.query_end
                    hit_qualifiers["score"] = hsp.expect
                # The hit score is the smallest e-value among its HSPs.
                hit_qualifiers["score"] = min(hit_qualifiers["score"], hsp.expect)

                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.hsp%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": clean_string(hit.hit_id),
                    "length": hit.length,
                    "hit_titles": clean_slist(hit.title.split(" >")),
                }

                if include_seq:
                    # Append to what earlier HSPs of this hit contributed.
                    # (Testing the three names as one tuple against the keys
                    # never matched, so each HSP overwrote the previous one.)
                    for key, segment in (("blast_qseq", hsp.query),
                                         ("blast_sseq", hsp.sbjct),
                                         ("blast_mseq", hsp.match)):
                        hit_qualifiers[key] = hit_qualifiers.get(key, "") + segment

                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # Grow the parent span so that it envelopes this HSP.
                parent_match_start, parent_match_end = check_bounds(
                    parent_match_start, parent_match_end,
                    hsp.query_start, hsp.query_end)

                # add hsp to the gff3 feature as a "match_part"
                sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start - 1, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    ))

            # Build the top level seq feature for the hit
            hit_qualifiers["description"] = "Hit to %s..%s of %s" % (
                parent_match_start,
                parent_match_end,
                desc,
            )
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start - 1, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(
                sorted(sub_features, key=lambda x: int(x.location.start)))
            # Add the hit feature to the record
            rec.features.append(top_feature)
            rec.annotations = {}
            collected_records.append(rec)

        for rec in collected_records:
            yield rec
parser.add_argument(
    '-e', '--e_value',
    help='E-value threshold',
    metavar='float number',
    type=float,
)
args = parser.parse_args()

input_file = args.input
e_value_threshold = args.e_value
app_output_path = "aligned.fasta"
not_app_output_path = "nonaligned.fasta"

# Every input sequence is searched online against nt; for each BLAST result
# the sequence goes to the "aligned" file when at least one alignment was
# reported and to the "nonaligned" file otherwise.
with open(app_output_path, 'w') as out_al_file, \
        open(not_app_output_path, 'w') as out_nonal_file:
    for fasta in SeqIO.parse(input_file, "fasta"):
        query = NCBIWWW.qblast(
            "blastn",
            "nt",
            fasta.seq,
            expect=e_value_threshold,
            format_type="XML",
        )
        blast_result = NCBIXML.parse(query)
        for result in blast_result:
            has_alignment = len(result.alignments) > 0
            destination = out_al_file if has_alignment else out_nonal_file
            SeqIO.write(fasta, destination, "fasta")
# Now we perform a BLAST search for other sequences that are homologous to
# the sequence of the human beta-2 adrenergic receptor obtained above.
# Here we search for a homolog from Bos taurus (cow).
result_handle = NCBIWWW.qblast("blastp",
                               "swissprot",
                               query_seq,
                               hitlist_size=1,
                               entrez_query="Bos taurus[orgn]")

# ...and write the result to an .xml file
with open("Bovine_seq.xml", "w") as blast_result:
    blast_result.write(result_handle.read())

# Now we can read this file at any time and get the sequence, its id, etc.
# The file is parsed once and the best alignment is reused below.
with open("Bovine_seq.xml") as bovine_xml:
    bovine_record = NCBIXML.read(bovine_xml)
bovine_alignment = bovine_record.alignments[0]

# formatted string with the details of this alignment
print(str(bovine_alignment))
print('Seq ID: ', bovine_alignment.hit_id.split("|")[1])
# getting the sequence of an HSP (high scoring segment pair)
print('Protein sequence: ', bovine_alignment.hsps[0].sbjct)

# This loop reads the species list specified in the species.txt file and
# prints the Entrez queries.  Each line is "<latin name>. <common name>".
with open("./data/species.txt", 'r') as species:
    species_lines = species.readlines()

for line in species_lines:
    if not line.strip():
        continue  # tolerate blank lines in the list
    name_fields = line.split(".")
    species_latin = name_fields[0].strip()
    species_common = name_fields[1].strip()
    entrez_q = str(species_latin + "[orgn]")
    print(species_common, ': ', entrez_q)
#### gets the length of query and stores to a variable
total = 0
#for filename in glob.glob(input_file_names):
for filename in glob.glob(sys.argv[2]):
    record = SeqIO.read(filename, "fasta")
    query_length = len(record)

#### compare hits to current input query
#### define the handle
# NOTE(review): the BLAST report name is built from the raw argument, not
# from the file(s) matched by the glob above -- confirm this is intended.
filename = sys.argv[2]
filename2 = filename + ".xml"
# NCBIXML.read consumes the whole report, so the handle can be closed
# as soon as the record has been built.
with open(filename2) as result_handle:
    blast_record = NCBIXML.read(result_handle)

#### write query file name
counter = 0

#### screen blast output records against parameters
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        alignment_length = alignment.length
        identical_residues = hsp.identities
        # identity is expressed relative to the full query length
        percent_identity = float(identical_residues) / float(query_length)
        cond1 = percent_identity <= high_identity
        cond2 = percent_identity > low_identity
        cond3 = alignment_length <= query_length * high_length
        cond4 = alignment_length > query_length * low_length
        #### write blast output that passes screens
def test_calculate_mean_sd():
    """Compare ``local_blast.calculate_mean_sd`` with a hand-computed result.

    A local filter BLAST of the fixture sequence against itself is run, the
    bit scores of all HSPs are collected, and mean and standard deviation
    (square root of the mean squared deviation) computed here must equal
    the values returned by ``calculate_mean_sd`` to four decimals.
    """
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as pickled:
        data_obj = pickle.load(pickled)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as pickled:
        ids.acc_ncbi_dict = pickle.load(pickled)
    filteredScrape = physcraper.FilterBlast(data_obj, ids)

    # test begins
    fn = 'Senecio_scopolii_subsp._scopolii'
    # partly copy of read_local_blast_query
    general_wd = os.getcwd()
    blast_dir = os.path.join(filteredScrape.workdir, "blast")
    if not os.path.exists(blast_dir):
        os.makedirs(blast_dir)
    fn_path = os.path.abspath(
        './tests/data/precooked/fixed/local-blast/{}'.format(fn))
    print(fn_path)
    output_blast = os.path.join(filteredScrape.workdir,
                                "blast/output_{}.xml".format(fn))

    os.chdir(blast_dir)
    try:
        local_blast.run_filter_blast(filteredScrape.workdir,
                                     fn_path,
                                     fn_path,
                                     output=output_blast)
        xml_file = open(output_blast)
    finally:
        # Always go back, otherwise a failing BLAST run leaves later tests
        # in the wrong working directory.
        os.chdir(general_wd)

    hsp_scores = {}
    add_hsp = 0
    with xml_file:
        for record in NCBIXML.parse(xml_file):
            for alignment in record.alignments:
                # the second word of the title is used as integer key
                gi = int(alignment.title.split(" ")[1])
                for hsp in alignment.hsps:
                    hsp_scores[gi] = {
                        "hsp.bits": hsp.bits,
                        "hsp.score": hsp.score,
                        "alignment.length": alignment.length,
                        "hsp.expect": hsp.expect
                    }
                    add_hsp = add_hsp + float(hsp.bits)

    # make values to select for blast search, calculate standard deviation, mean
    mean_sed = local_blast.calculate_mean_sd(hsp_scores)
    sum_hsp = len(hsp_scores)
    mean = (add_hsp / sum_hsp)
    sd_all = 0
    for scores in hsp_scores.values():
        deviation = scores["hsp.bits"] - mean
        sd_all += deviation * deviation
    sd_val = sqrt(sd_all / sum_hsp)

    assert round(sd_val, 4) == round(mean_sed['sd'], 4)
    assert round(mean, 4) == round(mean_sed['mean'], 4)
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML into ``SeqRecord`` objects with gapped match features.

    One record is yielded per BLAST query.  Every hit becomes a top-level
    feature (``nucleotide_match`` for BLASTN, ``protein_match`` for BLASTP,
    ``match`` otherwise) whose ``match_part`` sub-features come from
    ``generate_parts`` applied to each HSP.

    Args:
        blastxml: handle of a BLAST XML report, passed to ``NCBIXML.parse``.
        include_seq: when true, the query/subject/match strings of each HSP
            are added to the qualifiers of its match parts.

    Yields:
        ``SeqRecord`` objects with a placeholder sequence, whose id is the
        query title cut at the first space.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqFeature import FeatureLocation, SeqFeature
    from Bio.SeqRecord import SeqRecord

    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            "BLASTN": "nucleotide_match",
            "BLASTP": "protein_match",
        }.get(record.application, "match")

        recid = record.query
        if " " in recid:
            recid = recid[0:recid.index(" ")]
        rec = SeqRecord(Seq("ACTG"), id=recid)

        for idx_hit, hit in enumerate(record.alignments):
            # gotta check all hsps in a hit to see boundaries
            parent_match_start = 0
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, "0"),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(" >"),
                "hsp_count": len(hit.hsps),
            }

            # Description = first title from its first space onwards; a
            # title without any space is used as it is.
            desc = hit.title.split(" >")[0]
            first_space = desc.find(" ")
            description = desc[first_space:] if first_space != -1 else desc

            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    hsp_qualifiers.update({
                        "blast_qseq": hsp.query,
                        "blast_sseq": hsp.sbjct,
                        "blast_mseq": hsp.match,
                    })
                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)
                hsp_qualifiers["description"] = description

                # check if parent boundary needs to increase; the start is
                # seeded from the first HSP because a start of 0 would
                # never be undercut by a real query position
                if idx_hsp == 0 or hsp.query_start < parent_match_start:
                    parent_match_start = hsp.query_start
                if hsp.query_end > parent_match_end:
                    parent_match_end = hsp.query_end + 1

                # Build out the match_part features for each HSP
                # NOTE(review): every part of an HSP starts at
                # hsp.query_start (the part's own ``start`` only enters
                # its length), and part IDs derive from the hit ID, so they
                # repeat across HSPs -- confirm both are intended.
                for idx_part, (start, end, cigar) in enumerate(
                        generate_parts(hsp.query, hsp.match, hsp.sbjct,
                                       ignore_under=10)):
                    hsp_qualifiers["Gap"] = cigar
                    hsp_qualifiers["ID"] = hit_qualifiers["ID"] + (".%s" % idx_part)

                    match_part_start = hsp.query_start
                    # We used to use hsp.align_length here, but that
                    # includes gaps in the parent sequence and gives
                    # calculation errors in weird places, so (end - start)
                    # is used for simplicity.
                    match_part_end = match_part_start + (end - start)

                    sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(hsp_qualifiers),
                        ))

            # Build the top level seq feature for the hit
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(sub_features)
            # Add the hit feature to the record
            rec.features.append(top_feature)

        rec.annotations = {}
        yield rec
def showBlastMapping():
    '''
    For each protein, create an overview of where the hits were mapped over
    the length of the protein.

    The HSPs of ``blastresults/<name>.xml`` are split into six e-value
    classes (> 1e-15, > 1e-30, > 1e-60, > 1e-90, > 1e-120, <= 1e-120).  For
    every class the per-residue coverage of the query is drawn as a bar
    chart scaled to a maximum of 100 pixels, one 100 pixel band per class.

    :creates: `blastmappings/*.png`
    '''
    os.makedirs('blastmappings', exist_ok=True)
    fnames = sorted(CR.getProteinFiles())
    fnt = ImageFont.load_default()

    # Exclusive lower e-value bounds of classes 0-4; class 5 takes the rest.
    bounds = (1e-15, 1e-30, 1e-60, 1e-90, 1e-120)
    labels = ('> 1e-15', '> 1e-30', '> 1e-60', '> 1e-90', '> 1e-120',
              '<= 1e-120')
    colors = [(0, 0, 0), (0, 0, 200), (0, 200, 0), (200, 0, 200),
              (200, 0, 0), (150, 150, 0)]
    black = (0, 0, 0)

    for fname in fnames:
        print('Mapping {:<50}'.format(fname), end='\r')

        # Query length: all residues after the FASTA header line.
        with open('fastas/{}.fasta'.format(fname), 'r') as f:
            next(f)
            query_length = sum(len(line.rstrip()) for line in f)

        # The builtin int replaces the ``np.int`` alias, which current
        # NumPy releases no longer provide.
        counters = [np.zeros(query_length, int) for _ in range(6)]
        numHsps = [0] * 6

        with open('blastresults/{}.xml'.format(fname), 'r') as f:
            for record in NCBIXML.parse(f):
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        n = next((i for i, bound in enumerate(bounds)
                                  if hsp.expect > bound), 5)
                        # query_start/query_end are 1-based and inclusive,
                        # so the slice has to end at query_end to count the
                        # last aligned residue as well.
                        counters[n][hsp.query_start - 1:hsp.query_end] += 1
                        numHsps[n] += 1

        # Scale every class to a peak of 100; a class without any coverage
        # is drawn as a flat line of height 1.
        ma = [np.amax(counters[n]) * 0.01 for n in range(6)]
        counters = [
            counters[n] / ma[n] if ma[n] != 0 else np.ones(query_length, int)
            for n in range(6)
        ]

        im = Image.new('RGB', (query_length + 60, 600), (255, 255, 255))
        dr = ImageDraw.Draw(im)

        for n, label in enumerate(labels):
            dr.text((2, 40 + 100 * n), label, black, fnt)
            dr.text((2, 60 + 100 * n), 'n = {}'.format(numHsps[n]), black, fnt)

        # vertical grid line every 100 residues
        for n in range(int(query_length / 100)):
            col = 160 + n * 100
            dr.line([(col, 0), (col, 600)], fill=(125, 125, 125), width=1)

        for n in range(6):
            for col, thickness in enumerate(counters[n]):
                dr.line([(col + 60, n * 100), (col + 60, thickness + n * 100)],
                        fill=colors[n],
                        width=1)

        #im.show()
        im.save('blastmappings/{}.png'.format(fname))