def blast(self, filename, seq, seq_id):
    """Run a remote BLAST search for ``seq`` unless ``filename`` already exists.

    The result file doubles as a cache marker: when ``filename`` can be opened
    for reading nothing is sent to NCBI.

    :param filename: Path of the XML result file to look for / to create.
    :param seq: Query sequence handed to ``qblast``.
    :param seq_id: Identifier of the query; not used by this method.
    :return: ``False`` when ``filename`` already exists (no search is run),
        ``True`` after a new search result has been written to ``filename``.
    """
    try:
        # Only probing for existence: close the handle right away instead of
        # leaking it until garbage collection.
        with open(filename, 'r'):
            return False
    except FileNotFoundError:
        print("File not found starting blast")

    result_handle = qblast(self.blastmethod, self.database, seq,
                           format_type=self.format, expect=self.evalue,
                           matrix_name=self.matrix, hitlist_size=10)
    try:
        result_xml = result_handle.read()
    finally:
        result_handle.close()

    with open(filename, "w") as out_file:
        out_file.write(result_xml)
    print("blast complete")
    return True
def coronablast(coronaseqrec, out_path="results.xml"):
    """BLAST a sequence record against NCBI ``nt`` and save the raw result.

    :param coronaseqrec: Object whose ``seq`` attribute is the nucleotide
        query passed to ``qblast`` (``blastn`` program).
    :param out_path: File the result text is written to. Defaults to
        ``"results.xml"``, the location that used to be hard-coded.
    :return: ``None``.
    """
    spikeblast = qblast("blastn", "nt", coronaseqrec.seq)
    try:
        blast_results = spikeblast.read()
    finally:
        # The result handle was previously left open.
        spikeblast.close()

    with open(out_path, "w") as save_file:
        save_file.write(blast_results)
def blastIt(sequence):
    """Run a ``blastp`` search against ``pdb`` and print every match.

    For each query that has alignments the (truncated) query name is printed,
    followed by the truncated title and E-value of every HSP.

    :param sequence: Protein query passed straight to ``qblast``.
    :return: The ``qblast`` result handle, rewound to the start so the caller
        can read or re-parse the XML.
    """
    # Only the keyword arguments that carry a value are kept; the original
    # call also listed every remaining option explicitly as ``None``, which
    # qblast treats the same as leaving it out.
    res = qblast("blastp", "pdb", sequence,
                 entrez_query='(none)',
                 expect=10.0,
                 hitlist_size=50,
                 alignments=500,
                 descriptions=500,
                 format_type='XML',
                 ncbi_gi=True)

    for record in NCBIXML.parse(res):
        if not record.alignments:
            # skip queries with no matches
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("QUERY: %s" % record.query[:60])
        for align in record.alignments:
            for hsp in align.hsps:
                print("MATCH: %s " % align.title[:60])
                print(hsp.expect)

    # Parsing consumed the handle; rewind it so the returned object is usable.
    res.seek(0)
    return res
def blast(sequence, cache_path='cache/blast_tmp2.xml'):
    """Run a ``blastp`` search against ``nr`` and cache the raw result on disk.

    :param sequence: Protein query passed to ``qblast``.
    :param cache_path: File the result is written to. Defaults to the
        previously hard-coded ``'cache/blast_tmp2.xml'``.
    :return: ``None``.
    """
    # Fail fast on an unwritable location, but open in append mode so an
    # existing cache is NOT truncated before the (slow, fallible) search has
    # actually produced a replacement.
    with open(cache_path, mode="a"):
        pass

    print('entering blast function')
    blastStart = time.time()
    blast_results = qblast("blastp", "nr", sequence, hitlist_size=5000)
    blastEnd = time.time()
    print('finished blast. Took ' + str(blastEnd - blastStart) + " seconds.")

    try:
        xml_text = blast_results.read()
    finally:
        blast_results.close()

    with open(cache_path, mode="w") as f:
        f.write(xml_text)
    print('cached blast result')
def bio_blaster(
        input_file: str,
        file_format: str,
        output_file: str,
        index: Optional[str] = None,
        program: str = 'blastn',
        database: str = 'nt',
        gi_format: bool = True,
        size: int = 10
) -> int:
    """Blasting sort of automated.

    :param input_file: Path of the file containing the sequence(s) to BLAST.
    :param file_format: The ``Bio.SeqIO`` format name of ``input_file``.
    :param output_file: Where to store the results; a trailing ``.xml`` is
        optional and is added when missing.
    :param index: If the file contains multiple sequences and only one should
        be analysed, its record key. ``None`` submits every record.
    :param program: What BLAST program to use.
    :param database: Which database to use.
    :param gi_format: Whether to request the gi_format.
    :param size: The amount of results to request (must be positive).
    :return: Number of files made (always 1).
    :raises AssertionError: On an unknown ``file_format`` or ``program``, a
        non-positive ``size``, or an ``index`` missing from ``input_file``.
    """
    # noinspection SpellCheckingInspection
    known_formats = (  # https://biopython.org/wiki/SeqIO
        'abi', 'abi-trim', 'ace', 'cif-atom', 'cif-seqres', 'clustal', 'embl',
        'fasta', 'fasta-2line', 'fastq-sanger', 'fastq', 'fastq-solexa',
        'fastq-illumina', 'gck', 'genbank', 'gb', 'ig', 'imgt', 'nexus',
        'pdb-seqres', 'pdb-atom', 'phd', 'phylip', 'pir', 'seqxml', 'sff',
        'sff-trim', 'snapgene', 'stockholm', 'swiss', 'tab', 'qual',
        'uniprot-xml', 'xdna'
    )
    # noinspection SpellCheckingInspection
    known_programs = (  # https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc92
        # 'tblastn' replaces the former 'tblast', which is not a BLAST program.
        'blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'
    )

    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O`` while callers still see the same exception type.
    if file_format not in known_formats:
        raise AssertionError('unsupported file_format: %r' % (file_format,))
    if program not in known_programs:
        raise AssertionError('unsupported program: %r' % (program,))
    if not size > 0:
        raise AssertionError('size must be positive, got %r' % (size,))

    if output_file.endswith('.xml'):
        output_file = output_file[:-4]

    # Bio = Biopython but Pycharm doesn't know.
    # noinspection PyPackageRequirements
    from Bio import SeqIO
    # noinspection PyPackageRequirements
    from Bio.Blast.NCBIWWW import qblast

    record_dict = SeqIO.index(input_file, format=file_format)
    try:
        if index is not None:
            if index not in record_dict:
                raise AssertionError(
                    'index %r not found in %s' % (index, input_file))
            record: str = record_dict[index].format("fasta")
        else:
            record = ''.join(
                record_dict[key].format("fasta") + '\n' for key in record_dict
            )
    finally:
        # Release the file handle held by the index.
        record_dict.close()
    count: int = 1

    result_handle = qblast(  # The actual blasting see print(help(qblast))
        program=program,
        database=database,
        sequence=record,
        ncbi_gi=gi_format,
        hitlist_size=size,
        megablast=False
    )
    try:
        # NOTE(review): append mode is kept from the original; appending to an
        # existing file yields several XML documents in one file - confirm
        # that this is intended.
        with open(output_file + '.xml', "a") as out_handle:  # Saving the results
            out_handle.write(result_handle.read())
    finally:
        # The result handle is like an open file and must be closed.
        result_handle.close()
    return count
def blaster(protSeq, orgnID="Mus musculus"):
    """take in an amino acid sequence and return the best matching gene name from the organism defined

    The query is sent to NCBI ``blastp`` against ``refseq_protein``,
    restricted to ``orgnID`` through an Entrez ``[organism]`` filter.

    :param protSeq: Amino acid sequence to search with.
    :param orgnID: Organism name used in the Entrez query.
    :return: Accession of the top hit without its version suffix, or ``"-"``
        when the top hit's E-value is not below 0.01.
    :raises SystemExit: When three attempts all return no descriptions.
    """
    from Bio.Blast.NCBIWWW import qblast
    from Bio.Blast import NCBIXML
    from sys import exit

    print("\nconnecting to BLAST server. this will take some time...")
    # BLAST sometimes returns empty results. if so, try again (up to three
    # attempts in total); it happens quite rarely and resending the query
    # seems to fix it.
    for attempt in range(1, 4):
        print("attempt number " + str(attempt))
        resX = qblast("blastp", "refseq_protein", protSeq,
                      entrez_query=orgnID + "[organism]")
        try:
            resO = NCBIXML.read(resX)
        finally:
            resX.close()
        if resO.descriptions:
            break

    if not resO.descriptions:
        print(
            "connection unsuccessful. The BLAST server is acting up. Try again later."
        )
        # Non-zero status: this is a failure, not a clean exit.
        exit(1)

    print("connection successful")
    print(resO.descriptions[0])
    descO = resO.descriptions[0]
    if not descO.e < 0.01:
        return "-"

    fields = descO.title.split("|")
    # Element 3 is tried first, with a fallback to element 1 for shorter
    # titles (same order as the original try/except IndexError).
    descID = fields[3] if len(fields) > 3 else fields[1]
    # Drop the ".<version>" suffix when present.
    return descID.split(".")[0]
def fetch_homologs(refseq_id):
    """
    Query BLAST for homologs, using a blastp search.

    @param refseq_id: The accession number to get homologs for.
                      Either protein or DNA is OK.
    @return: Name of the file ("homologs.protids") the homolog protein
             accession numbers were written to, one per line.
    @raise Exception: If the output file already exists. This is checked
                      before any query is sent.
    """
    sys.stderr.write("\nSTEP: fetch homologs(%s)\n" % refseq_id)

    # Refuse to overwrite earlier results *before* spending minutes on the
    # remote query (the check used to run only after BLAST had finished).
    fname_prot_ids = "homologs.protids"
    if os.path.exists(fname_prot_ids):
        raise Exception("File %s aleady exists" % (fname_prot_ids))

    # Determine if protein or dna
    # http://www.ncbi.nlm.nih.gov/Sitemap/sequenceIDs.html
    if refseq_id[2].isalpha():
        prot_id = refseq_id
    else:
        _, dna_prot_ids = _fetch_dna_records([refseq_id])
        prot_id = dna_prot_ids[0]

    # If I don't do this, I seem to get--
    # ValueError: Error message from NCBI: Message ID#68 Error:
    # Error occurred while trying to set up a Blast Object from CGI context:
    # CFastaReader: Segmented set not properly terminated around line 1
    prot_id = prot_id.split(".")[0]

    # Query BLAST for homologs
    sys.stderr.write("\tQuerying BLAST, please be patient (may take minutes)...\n")
    blast_res = qblast("blastp", "nr", prot_id)
    try:
        blast_nr = NCBIXML.read(blast_res)
    finally:
        blast_res.close()
    prot_ids = [alignment.accession for alignment in blast_nr.alignments]
    sys.stderr.write("\tBLAST query successful, %s homologs found\n" % len(prot_ids))

    # Write to disk.
    with open(fname_prot_ids, "w") as f:
        for accession in prot_ids:
            f.write(accession + "\n")

    return fname_prot_ids
def blaster(protSeq, orgnID="Homo sapiens"):
    """take in an amino acid sequence and return the best matching gene name from the organism defined

    The query is sent to NCBI ``blastp`` against ``refseq_protein``,
    restricted to ``orgnID`` through an Entrez ``[organism]`` filter.

    :param protSeq: Amino acid sequence to search with.
    :param orgnID: Organism name used in the Entrez query. The default used
        to be the censor-mangled string ``"H**o sapiens"``.
    :return: Accession of the top hit without its version suffix, or ``"-"``
        when the top hit's E-value is not below the identification threshold.
    :raises SystemExit: When three attempts all return no descriptions.
    """
    from Bio.Blast.NCBIWWW import qblast
    from Bio.Blast import NCBIXML
    from sys import exit

    print("\nconnecting to BLAST server. this will take some time...")
    # BLAST sometimes returns empty results. if so, try again (up to three
    # attempts in total); it happens quite rarely and resending the query
    # seems to fix it.
    for attempt in range(1, 4):
        print("attempt number " + str(attempt))
        resX = qblast("blastp", "refseq_protein", protSeq,
                      entrez_query=orgnID + "[organism]",
                      descriptions=100, alignments=100)
        try:
            resO = NCBIXML.read(resX)
        finally:
            resX.close()
        if resO.descriptions:
            break

    if not resO.descriptions:
        print(
            "connection unsuccessful. The BLAST server is acting up. Try again later."
        )
        # Non-zero status: this is a failure, not a clean exit.
        exit(1)

    print("connection successful")
    print(resO.descriptions[0])
    descO = resO.descriptions[0]
    # set identification threshold here. 0.01 still returns hits in most
    # cases; the strict value is based on ptpn22 similarity between mouse and
    # human. NOTE(review): the original comment said 1e-140 while the code
    # uses 1e-137 - confirm which one is intended.
    if not descO.e < 1e-137:
        return "-"

    fields = descO.title.split("|")
    # Element 3 first, falling back to element 1 for shorter titles instead
    # of raising IndexError.
    descID = fields[3] if len(fields) > 3 else fields[1]
    # Drop the ".<version>" suffix when present.
    return descID.split(".")[0]
def find_seq_homologues(self, return_raw=False):
    """
    Uses NCBI BLAST to look for structures deposited in the PDB database
    that share __sequence__ homology with the target protein/chain.
    Bridges to Bio.BLAST.NCBIWWW.

    :param return_raw: When true, return the unparsed ``qblast`` result
        handle instead of the parsed hit list.
    :return: The raw result handle, or a list with one tuple per HSP:
        ``(pdb_id, e_value, identities, positives, gaps, alignment)`` where
        the three counts are ``"<n>/<alignment length>"`` strings, ``e_value``
        is formatted with ``%2.5e`` and ``alignment`` is the query, match and
        subject lines joined by newlines.
    """
    # Get sequence from structure/chain
    # We could use Bio.PDB.Polypeptide?
    from Bio.SCOP.Raf import to_one_letter_code

    # Residues whose name has no one-letter code are skipped.
    seq_str = ''.join(
        to_one_letter_code[aa.resname]
        for aa in self.get_residues()
        if aa.resname in to_one_letter_code
    )

    # Use BLAST to find homologous sequences with associated
    # structures in the PDB database.
    # Perhaps include local BLAST?
    from Bio.Blast.NCBIWWW import qblast

    # Adapt for short query sequences if needed
    # From http://www.ncbi.nlm.nih.gov/blast/producttable.shtml#shortp
    if len(seq_str) < 15:
        word_size = 2
        expect = 20000
        matrix_name = 'PAM30'
        seq_filter = None
    else:
        word_size = 3
        expect = 10.0
        matrix_name = 'BLOSUM62'
        seq_filter = 'SEG'

    # These options MUST be passed by keyword: positionally they would be
    # bound to qblast's 4th-7th parameters, which are not word_size / expect /
    # matrix_name / filter.
    query_result = qblast("blastp", "pdb", seq_str,
                          word_size=word_size,
                          expect=expect,
                          matrix_name=matrix_name,
                          filter=seq_filter)

    if return_raw:
        return query_result

    # Parse BLAST result to yield results
    # PDBID : (E-Value, Identity, Positives, Gaps, Alignment)
    from Bio.Blast import NCBIXML
    try:
        blast_records = NCBIXML.read(query_result)
    finally:
        query_result.close()

    results = []
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            id_perc = "%s/%s" % (hsp.identities, alignment.length)
            pos_perc = "%s/%s" % (hsp.positives, alignment.length)
            gaps_perc = "%s/%s" % (hsp.gaps, alignment.length)
            pdb_id = alignment.title.split('|')[3]
            e_value = hsp.expect

            results.append(
                (pdb_id, "%2.5e" % e_value, id_perc, pos_perc, gaps_perc,
                 '\n'.join([hsp.query, hsp.match, hsp.sbjct])))

    return results
def cazy2class(prefix,F,remote=False):
    '''
    will take the cazy database (dictionary provided) and try to fetch
    subfamilies and place them as classifiers.

    For every name returned by get_names(prefix+'.gm') a BLAST search is run
    (remotely through qblast, or locally through the blastp executable) and
    the first alignment with at least 90% identity whose accession is found
    in the CAZY dictionary decides the class written to prefix+'.cls'.
    When no alignment qualifies the name itself is written instead.

    prefix: common prefix of the '.gm' input and the '.cls' output file.
    F:      object providing ``seqs`` and ``chains`` (local mode only).
    remote: run the search on the NCBI servers instead of a local blastp.
    '''
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('You chose to use CAZY database to classify GH13 family into subfamilies'
          ' this will take a while, since have to go over BLAST results, etc..')
    with open(prefix+'.cls','w') as cls:
        # import database
        # NOTE(review): pickle.load can execute arbitrary code; CazyDB.bin
        # must come from a trusted source.
        with open('CazyDB.bin','rb') as db_file:
            db = pickle.load(db_file)
        names = get_names(prefix+'.gm')
        for n in names:
            print('Processing %s...'%(n))
            if remote:
                Entrez.email = '*****@*****.**'
                print('\tBlasting (Running remotely)...')
                n = n[:-1]+'_'+n[-1]
                while 1:
                    try:
                        b = qblast('blastp','nr',n,perc_ident=90,expect=1,gapcosts='11 1')
                    except Exception:
                        # ``except Exception`` (not a bare except) so that
                        # Ctrl-C can still break out of the retry loop.
                        print('Some problem in NCBI, sleeping for 10...')
                        time.sleep(10)
                    else:
                        print('\tBlast Done... \n\t\tAnalysing entries...')
                        break
            else:
                print('\tBlasting (Running locally)...')
                with open('temp.fasta','w') as fi:
                    fi.write('>%s\n%s'%(n,F.seqs[F.chains[n[:4]]]))
                #blastp_cline = NcbiblastpCommandline(query="temp.fasta", db="nr", evalue=0.0001,
                #                                     outfmt=5, out="temp.xml",max_target_seqs=50,
                #                                     num_alignments=50,num_threads=4)
                bl = Popen('blastp -db nr -outfmt "5" -query temp.fasta -evalue 0.0001 -max_target_seqs 50 '
                           '-seg yes -num_threads 4 -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
                           shell=True)
                bl.wait()
                print('\tBlast Done... \n\t\tAnalysing entries...')
                # Only the local run produces temp.xml; the remote run already
                # holds its result handle in ``b``.
                b = open('temp.xml')
            try:
                blast_record = NCBIXML.read(b)
            finally:
                b.close()
            rm = Popen('rm temp.*',shell=True)
            rm.wait()

            # for/else replaces the old ``while nohit`` wrapper, which never
            # terminated when BLAST returned no alignments at all.
            for a in blast_record.alignments:
                print('\t\t\t%s'%(a.accession))
                h = a.hsps[0]
                if float(h.identities)/float(h.align_length) >= 0.9:
                    ans,k = dict_lookup(a.accession,db)
                    if ans:
                        cls.write(str(db[k])+';')
                        print('\t\t\t\tAccession number found in CAZY!, Subfamily %s'%(db[k]))
                        break
            else:
                # No alignment qualified: fall back to the name itself.
                cls.write('%s;'%(n))
                print('\tNo relative found in CAZY')
        cls.write('\n')
from Bio.Blast.NCBIWWW import qblast
from Bio.Blast.NCBIXML import parse
from Bio import SeqIO

# BLAST every sequence in ./apoe.fas against NCBI and print each alignment.
records = SeqIO.parse("./apoe.fas", "fasta")
PROGRAM = 'blastp'
DATABASE = 'nr'

for rec in records:
    # query NCBI Blast API
    xml_result = qblast(PROGRAM, DATABASE, rec.seq)
    try:
        # Parse xml result
        results = parse(xml_result)
        # iterate over each result
        for record in results:
            for alignment in record.alignments:
                print(alignment)
    finally:
        # One handle per query: release it before sending the next one.
        xml_result.close()
def ncbiblast():
    """Controller action: show the NCBI BLAST form and run the submitted job.

    Users without ``session.username`` are redirected to the log-in action of
    the ``account`` controller. On a valid submission the cleaned sequence is
    sent to ``qblast``, the form values and the hit descriptions of the first
    BLAST record are stored in ``session`` and the user is redirected to
    ``ncbiblast_output``.

    :return: ``dict(form=form)`` for the view when nothing was submitted.
    """
    if session.username is None:
        # Address the controller explicitly instead of the '../account/...'
        # path-traversal form.
        redirect(URL(r=request, c='account', f='log_in'))

    # Option labels are kept verbatim: the database and translation-table
    # labels are used below as keys into ncbi_db / genetic_code.
    programs = ("blastn", "blastp", "blastx", "tblastn", "tblastx")
    databases = (
        "Non-redundant GenBank (nr)",
        "NCBI Reference Sequence (refseq)",
        "SWISS-PROT protein sequence (last update) (swissprot)",
        "Patent division of GenPept (pat)",
        "Protein Data Bank (pdb)",
        "Protein - environmental samples (env_nr)",
        "RNA - NCBI Reference Sequence (refseq_rna)",
        "Genomic - NCBI Reference Sequence (refseq_genomic)",
        "ESTs from GenBank + EMBL + DDBJ (est)",
        "Mouse subset of ESTs (est_mouse)",
        "Human subset of ESTs (est_human)",
        "Non-mouse non-human subset of ESTs (est_others)",
        "Genome Survey Sequences (gss)",
        "Complete chromosomes (chromosome)",
        "Whole Genome Shotgun sequences (wgs)",
        "Nucleotide - environmental samples (env_nr)",
    )
    translation_tables = (
        "Standard (1)",
        "Vertebrate Mitochondria (2)",
        "Yeast Mitochondria (3)",
        "Mold, Protozoan, and Coelenterate Mitochondria (4)",
        "Mycoplasma/Spiroplasma (4)",
        "Invertebrate Mitochondria (5)",
        "Ciliate, Dasycladacean and Hexamita Nuclear (6)",
        "Echinoderm and Flatworm Mitochondria (9)",
        "Euplotid Nuclear (10)",
        "Bacterial, Archaeal and Plant Plastid (11)",
        "Alternative Yeast Nuclear (12)",
        "Ascidian Mitochondria (13)",
        "Alternative Flatworm Mitochondria (14)",
        "Blepharisma Nuclear (15)",
        "Chlorophycean Mitochondria (16)",
        "Trematode Mitochondria (21)",
        "Scenedesmus obliquus Mitochondria (22)",
        "Thraustochytrium Mitochondria (23)",
    )
    matrices = ("BLOSUM62", "BLOSUM80", "BLOSUM45", "PAM30", "PAM70")
    hit_limits = ("50", "100", "200", "500", "1000", "2000", "5000",
                  "10000", "20000", "50000")

    form = FORM(TABLE(
        TR("Job Title: ", INPUT(_type="text", _name="title")),
        TR("Sequence: ", TEXTAREA(_type="text", _name="sequence",
                                  requires=IS_NOT_EMPTY())),
        TR("Program: ", SELECT(*programs, _name="program")),
        TR("Database: ", SELECT(*databases, _name="database")),
        TR("Translation Table: ", SELECT(*translation_tables, _name="gcode")),
        TR("Matrix: ", SELECT(*matrices, _name="matrix")),
        TR("Maximum number of hits to return: ",
           SELECT(*hit_limits, _name="hitlist_size")),
        TR("Number of random hits expected: ",
           INPUT(_type="text", _name="expect", value=10)),
        TR("Word size: ", INPUT(_type="text", _name="word_size", value=3)),
        TR("", INPUT(_type="submit", _value="SUBMIT"))))

    if form.accepts(request.vars, session):
        from Bio.Blast.NCBIWWW import qblast
        from Bio.Blast import NCBIXML
        sequence = seqClean(fasta_to_raw(form.vars.sequence.upper()))
        # NOTE(review): expect / word_size come from free-text inputs; a
        # non-numeric value raises ValueError in float()/int() here.
        result_handle = qblast(form.vars.program,
                               ncbi_db[form.vars.database],
                               sequence,
                               matrix_name=form.vars.matrix,
                               hitlist_size=int(form.vars.hitlist_size),
                               expect=float(form.vars.expect),
                               word_size=int(form.vars.word_size),
                               db_genetic_code=genetic_code[form.vars.gcode])
        # Only the first BLAST record is used. next(...) works on Python 2.6+
        # and Python 3, unlike the iterator's Python-2-only .next() method.
        rec = next(NCBIXML.parse(result_handle))
        session['title'] = form.vars.title
        session['sequence'] = sequence
        session['database'] = form.vars.database
        session['program'] = form.vars.program
        session['matrix'] = form.vars.matrix
        session['gcode'] = form.vars.gcode
        session['hitsize'] = form.vars.hitlist_size
        session['expect'] = form.vars.expect
        session['word_size'] = form.vars.word_size
        session['data'] = [{'Title': row.title,
                            'Score': str(row.score),
                            'E-value': str(row.e)}
                           for row in rec.descriptions]
        redirect(URL(r=request, f='ncbiblast_output'))
    return dict(form=form)
if __name__ == '__main__':
    # Command-line entry point: read a protein sequence from the given file,
    # BLAST it (blastp against nr) and print the raw result.
    from common import file_parser
    from Bio import SeqIO
    from Bio.Blast.NCBIWWW import qblast

    parser = file_parser(prog_desc='Perform a BLAST query',
                         file_desc='A file containing a protein sequence')
    args = parser.parse_args()

    # Plain 'r': the 'U' flag was removed in Python 3.11 (ValueError), and
    # text mode already applies universal newlines on Python 3.
    with open(args.file, 'r') as sequence_file:
        sequence = sequence_file.read()

    result_handle = qblast('blastp', 'nr', sequence)
    try:
        print(result_handle.read())
    finally:
        result_handle.close()
from Bio.Blast.NCBIWWW import qblast

# FASTA-formatted protein query: the header line followed by the residue
# lines, joined by newlines (no trailing newline).
query = "\n".join((
    ">s1",
    "MHEIKYITIDEADVLLTEEHEETTRFICQSANRDRQISLFSATTSERLDNFFDKVESSQQ",
    "IEVVAGEAKMPTTIDHIYIQVNPRDKVKTLYRLAQVENMRAIVFVNTIGRLNTVYEKLNH",
    "DGVKISALHGDLSKLQRQESVRDFKKGETSLLLATDVAARGIDLPNLPAIIQFDMAQSLT",
    "QYVHRSGRTGRMGEQGAAISLVTDREARELKQMVKENDVKMIEQIVKFGHLIDPQKTK",
))

# Submit the query (blastp against "nr_v5") and print the complete result text.
r = qblast(program="blastp", database="nr_v5", sequence=query).getvalue()
print(r)
def ncbiblast():
    """Controller action: show the NCBI BLAST form and run the submitted job.

    Users without ``session.username`` are redirected to the log-in action of
    the ``account`` controller. On a valid submission the cleaned sequence is
    sent to ``qblast``, the form values and the hit descriptions of the first
    BLAST record are stored in ``session`` and the user is redirected to
    ``ncbiblast_output``.

    :return: ``dict(form=form)`` for the view when nothing was submitted.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))

    # Option labels are kept verbatim: the database and translation-table
    # labels are used below as keys into ncbi_db / genetic_code.
    programs = ("blastn", "blastp", "blastx", "tblastn", "tblastx")
    databases = (
        "Non-redundant GenBank (nr)",
        "NCBI Reference Sequence (refseq)",
        "SWISS-PROT protein sequence (last update) (swissprot)",
        "Patent division of GenPept (pat)",
        "Protein Data Bank (pdb)",
        "Protein - environmental samples (env_nr)",
        "RNA - NCBI Reference Sequence (refseq_rna)",
        "Genomic - NCBI Reference Sequence (refseq_genomic)",
        "ESTs from GenBank + EMBL + DDBJ (est)",
        "Mouse subset of ESTs (est_mouse)",
        "Human subset of ESTs (est_human)",
        "Non-mouse non-human subset of ESTs (est_others)",
        "Genome Survey Sequences (gss)",
        "Complete chromosomes (chromosome)",
        "Whole Genome Shotgun sequences (wgs)",
        "Nucleotide - environmental samples (env_nr)",
    )
    translation_tables = (
        "Standard (1)",
        "Vertebrate Mitochondria (2)",
        "Yeast Mitochondria (3)",
        "Mold, Protozoan, and Coelenterate Mitochondria (4)",
        "Mycoplasma/Spiroplasma (4)",
        "Invertebrate Mitochondria (5)",
        "Ciliate, Dasycladacean and Hexamita Nuclear (6)",
        "Echinoderm and Flatworm Mitochondria (9)",
        "Euplotid Nuclear (10)",
        "Bacterial, Archaeal and Plant Plastid (11)",
        "Alternative Yeast Nuclear (12)",
        "Ascidian Mitochondria (13)",
        "Alternative Flatworm Mitochondria (14)",
        "Blepharisma Nuclear (15)",
        "Chlorophycean Mitochondria (16)",
        "Trematode Mitochondria (21)",
        "Scenedesmus obliquus Mitochondria (22)",
        "Thraustochytrium Mitochondria (23)",
    )
    matrices = ("BLOSUM62", "BLOSUM80", "BLOSUM45", "PAM30", "PAM70")
    hit_limits = ("50", "100", "200", "500", "1000", "2000", "5000",
                  "10000", "20000", "50000")

    form = FORM(TABLE(
        TR("Job Title: ", INPUT(_type="text", _name="title")),
        TR("Sequence: ", TEXTAREA(_type="text", _name="sequence",
                                  requires=IS_NOT_EMPTY())),
        TR("Program: ", SELECT(*programs, _name="program")),
        TR("Database: ", SELECT(*databases, _name="database")),
        TR("Translation Table: ", SELECT(*translation_tables, _name="gcode")),
        TR("Matrix: ", SELECT(*matrices, _name="matrix")),
        TR("Maximum number of hits to return: ",
           SELECT(*hit_limits, _name="hitlist_size")),
        TR("Number of random hits expected: ",
           INPUT(_type="text", _name="expect", value=10)),
        TR("Word size: ", INPUT(_type="text", _name="word_size", value=3)),
        TR("", INPUT(_type="submit", _value="SUBMIT"))))

    if form.accepts(request.vars, session):
        from Bio.Blast.NCBIWWW import qblast
        from Bio.Blast import NCBIXML
        sequence = seqClean(fasta_to_raw(form.vars.sequence.upper()))
        # NOTE(review): expect / word_size come from free-text inputs; a
        # non-numeric value raises ValueError in float()/int() here.
        result_handle = qblast(form.vars.program,
                               ncbi_db[form.vars.database],
                               sequence,
                               matrix_name=form.vars.matrix,
                               hitlist_size=int(form.vars.hitlist_size),
                               expect=float(form.vars.expect),
                               word_size=int(form.vars.word_size),
                               db_genetic_code=genetic_code[form.vars.gcode])
        # Only the first BLAST record is used. next(...) works on Python 2.6+
        # and Python 3, unlike the iterator's Python-2-only .next() method.
        rec = next(NCBIXML.parse(result_handle))
        session['title'] = form.vars.title
        session['sequence'] = sequence
        session['database'] = form.vars.database
        session['program'] = form.vars.program
        session['matrix'] = form.vars.matrix
        session['gcode'] = form.vars.gcode
        session['hitsize'] = form.vars.hitlist_size
        session['expect'] = form.vars.expect
        session['word_size'] = form.vars.word_size
        session['data'] = [{'Title': row.title,
                            'Score': str(row.score),
                            'E-value': str(row.e)}
                           for row in rec.descriptions]
        redirect(URL(r=request, f='ncbiblast_output'))
    return dict(form=form)
def cazy2class(prefix, F, remote=False):
    '''
    will take the cazy database (dictionary provided) and try to fetch
    subfamilies and place them as classifiers.

    For every name returned by get_names(prefix + '.gm') a BLAST search is
    run (remotely through qblast, or locally through the blastp executable)
    and the first alignment with at least 90% identity whose accession is
    found in the CAZY dictionary decides the class written to
    prefix + '.cls'. When no alignment qualifies the name itself is written
    instead.

    prefix: common prefix of the '.gm' input and the '.cls' output file.
    F:      object providing ``seqs`` and ``chains`` (local mode only).
    remote: run the search on the NCBI servers instead of a local blastp.
    '''
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('You chose to use CAZY database to classify GH13 family into subfamilies'
          ' this will take a while, since have to go over BLAST results, etc..')
    with open(prefix + '.cls', 'w') as cls:
        # import database
        # NOTE(review): pickle.load can execute arbitrary code; CazyDB.bin
        # must come from a trusted source.
        with open('CazyDB.bin', 'rb') as db_file:
            db = pickle.load(db_file)
        names = get_names(prefix + '.gm')
        for n in names:
            print('Processing %s...' % (n))
            if remote:
                Entrez.email = '*****@*****.**'
                print('\tBlasting (Running remotely)...')
                n = n[:-1] + '_' + n[-1]
                while 1:
                    try:
                        b = qblast('blastp', 'nr', n, perc_ident=90,
                                   expect=1, gapcosts='11 1')
                    except Exception:
                        # ``except Exception`` (not a bare except) so that
                        # Ctrl-C can still break out of the retry loop.
                        print('Some problem in NCBI, sleeping for 10...')
                        time.sleep(10)
                    else:
                        print('\tBlast Done... \n\t\tAnalysing entries...')
                        break
            else:
                print('\tBlasting (Running locally)...')
                with open('temp.fasta', 'w') as fi:
                    fi.write('>%s\n%s' % (n, F.seqs[F.chains[n[:4]]]))
                #blastp_cline = NcbiblastpCommandline(query="temp.fasta", db="nr", evalue=0.0001,
                #                                     outfmt=5, out="temp.xml",max_target_seqs=50,
                #                                     num_alignments=50,num_threads=4)
                bl = Popen('blastp -db nr -outfmt "5" -query temp.fasta -evalue 0.0001 -max_target_seqs 50 '
                           '-seg yes -num_threads 4 -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
                           shell=True)
                bl.wait()
                print('\tBlast Done... \n\t\tAnalysing entries...')
                # Only the local run produces temp.xml; the remote run already
                # holds its result handle in ``b``.
                b = open('temp.xml')
            try:
                blast_record = NCBIXML.read(b)
            finally:
                b.close()
            rm = Popen('rm temp.*', shell=True)
            rm.wait()

            # for/else replaces the old ``while nohit`` wrapper, which never
            # terminated when BLAST returned no alignments at all.
            for a in blast_record.alignments:
                print('\t\t\t%s' % (a.accession))
                h = a.hsps[0]
                if float(h.identities) / float(h.align_length) >= 0.9:
                    ans, k = dict_lookup(a.accession, db)
                    if ans:
                        cls.write(str(db[k]) + ';')
                        print('\t\t\t\tAccession number found in CAZY!, Subfamily %s' % (
                            db[k]))
                        break
            else:
                # No alignment qualified: fall back to the name itself.
                cls.write('%s;' % (n))
                print('\tNo relative found in CAZY')
        cls.write('\n')
def main(logginglevel, input, tmp, idthreshold, evalue):
    """Extend each input FASTA file with the sequences of its BLASTP hits.

    For every input file all records are sent as one web BLASTP query against
    "nr_v5"; the accession of every hit (once per HSP query sequence) is then
    fetched through ``entrezRetrieveSequence`` in up to three batches and the
    records are written to ``<name>_ext.fasta``. The paths of the written
    files are printed, one per line.

    Raises ``Exception('No hits found')`` when a query yields no hits.
    """
    logger = setLoggerLevel(createLogger(__file__), logginglevel)
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        inputFiles = [getFile(path, logger=logger, tmp=tmp) for path in input]
        outputFiles = []

        for file in inputFiles:
            fileOut = file.replace(".fasta", "_ext.fasta")
            logInfo(logger, "Extending file {} to {}".format(file, fileOut))

            # Build one multi-FASTA query string from all records of the file.
            with open(file, "r") as handle:
                queryStr = "\n".join(
                    "> {}\n{}".format(rec.id, str(rec.seq))
                    for rec in SeqIO.parse(handle, "fasta"))

            logInfo(logger, "Requesting web BLASTP search")
            blastHandle = qblast("blastp", "nr_v5", queryStr,
                                 expect=float(evalue),
                                 perc_ident=float(idthreshold))
            requestOut = blastHandle.getvalue()

            root = ET.fromstring(requestOut)
            hits = root.findall(
                "./BlastOutput_iterations/Iteration/Iteration_hits/Hit")
            logInfo(logger, "Got {} hits for given query".format(len(hits)))
            if len(hits) <= 0:
                logInfo(logger, requestOut)
                raise Exception('No hits found')

            # One accession entry per HSP query sequence of every hit.
            accessionIDs = []
            for hit in hits:
                accessionIDs.extend(
                    hit.find("./Hit_accession").text
                    for _ in hit.findall("./Hit_hsps/Hsp/Hsp_qseq"))

            # Split the accessions into (at most) requestsLimit batches.
            requestsLimit = 3
            chunkSize = int(len(accessionIDs) / requestsLimit) + 1
            accessionChunks = list(chunks(accessionIDs, chunkSize))
            logDebug(
                logger,
                "Chunked request, will request Entrez for {} batches of size {}."
                .format(len(accessionChunks), chunkSize))

            gatherTasks = [
                loop.run_in_executor(executor, entrezRetrieveSequence,
                                     batch, tmp)
                for batch in accessionChunks
            ]
            gatherFuture = asyncio.ensure_future(asyncio.gather(*gatherTasks))
            loop.run_until_complete(gatherFuture)

            recordsOut = [
                fetched.format("fasta")
                for batchRecords in gatherFuture.result()
                for fetched in batchRecords
            ]
            recordCountOut = len(recordsOut)

            logDebug(logger, "Writing to file {}".format(fileOut))
            with open(fileOut, "w") as fileOutHandle:
                fileOutHandle.write("\n".join(recordsOut))

            # Re-read the output to verify that every record round-trips.
            with open(fileOut, "r") as handle:
                recordCount = sum(1 for _ in SeqIO.parse(handle, "fasta"))
            if recordCount != recordCountOut:
                logError(
                    logger,
                    "Got mismatch records count after writing output file. {} records are present in {} and there should be {} reconds."
                    .format(recordCount, fileOut, recordCountOut))

            outputFiles.append(fileOut)

        logInfo(
            logger,
            "Written {} files in total. Returning them to stdout as plaintext paths list"
            .format(len(outputFiles)))
        print("\n".join(outputFiles))
def main(args):
    """Run program

    Selects the contigs whose GFF3 features match the given gene IDs, BLASTs
    the amino acid sequences of the gene features on those contigs and writes
    one TSV row per HSP to ``args.output``.

    Args:
        args (NameSpace): ArgParse arguments dictating program use
    """
    import time  # local import: only needed for the retry back-off below

    tqdm.write('>>> Starting prokka_blast_pipeline')

    # Get Gene IDs from ID file
    ids = [gene_id.strip() for gene_id in args.id]
    tqdm.write('>>> Found {0} ID(s) in {1}'
               .format(str(len(ids)), args.id.name))
    # '*' as the first ID selects every sequence; guarded so that an empty ID
    # file no longer raises IndexError.
    match_all = bool(ids) and ids[0] == '*'

    # Get contigs that contain a feature ID matching a given Gene ID
    contigs = []
    gff_reader = GFF3Reader(args.gff3)
    for entry in gff_reader.iterate():
        try:  # Ignore features without a gene_feature field
            if entry.attributes['gene_feature'] in ids:
                contigs.append(entry.seqid)
        except KeyError:
            continue
    tqdm.write('>>> Found {0} contig(s) containing gene features matching '
               'given ID(s) in {1}'.format(str(len(contigs)), args.gff3.name))

    # Obtain PROKKA IDs and annotations of genes on contigs obtained earlier
    contig_set = set(contigs)  # O(1) membership for the second pass
    prokka_to_contig = defaultdict(str)
    prokka_to_gene = defaultdict(str)
    args.gff3.seek(0)
    for entry in gff_reader.iterate():
        if entry.seqid in contig_set and 'gene_feature' in entry.attributes:
            prokka_to_contig[entry.attributes['ID']] = entry.seqid
            prokka_to_gene[entry.attributes['ID']] = entry.attributes[
                'gene_feature']
    tqdm.write('>>> Found {0} gene feature(s) on contigs matching given ID(s)'
               .format(str(len(prokka_to_contig))))

    # Get sequences from FAA file if they match a PROKKA ID obtained above
    blast_entries = [entry for entry in fasta_iter(args.faa)
                     if entry.id in prokka_to_contig or match_all]
    tqdm.write('>>> Obtained {0} amino acid sequence(s) from {1}'
               .format(str(len(blast_entries)), args.faa.name))

    # Output header line
    args.output.write('Contig\tPROKKA_ID\tAnnotation\tGene ID\tSubject\t'
                      'Query Coverage(%)\tE-Value\tIdentity(%){0}'.format(
                          os.linesep))

    tqdm.write('>>> BLASTing {0} amino acid sequence(s) against the NCBI {1} '
               'database'.format(str(len(blast_entries)), args.database))

    # BLAST sequences and calculate various summary values
    count = 0
    for entry in tqdm(blast_entries):
        # Continue attempting same blast until it succeeds w/o NCBI errors.
        # Wait between attempts (capped exponential back-off) instead of
        # re-sending the request in a tight loop.
        delay = 1
        while True:
            try:
                result_handle = qblast(args.program, args.database,
                                       entry.sequence,
                                       alignments=args.top,
                                       descriptions=args.top,
                                       hitlist_size=args.top,
                                       expect=args.e_value)
                break
            except (ValueError, socket_error) as err:
                tqdm.write('>>> BLAST request for {0} failed ({1}); retrying '
                           'in {2} s'.format(entry.id, err, delay))
                time.sleep(delay)
                delay = min(delay * 2, 60)

        # Process BLAST results
        query_length = len(entry.sequence)
        try:
            for result in NCBIXML.parse(result_handle):
                for alignment in result.alignments:
                    count += 1
                    # Text between the first '[' and the following ']' of the
                    # hit definition; the whole definition is used when it
                    # has no bracketed part (this used to raise IndexError).
                    hit_def = alignment.hit_def
                    if '[' in hit_def:
                        taxonomy = hit_def.split('[')[1].split(']')[0]
                    else:
                        taxonomy = hit_def
                    for hsp in alignment.hsps:
                        # Convert to float BEFORE dividing so the ratio is
                        # not truncated to 0/1 under Python 2.
                        cov = float(hsp.align_length) / query_length * 100.0
                        perc = float(hsp.identities) / query_length * 100.0

                        # Format and write output to custom TSV
                        output = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}{8}'\
                            .format(prokka_to_contig[entry.id], entry.id,
                                    entry.description,
                                    prokka_to_gene[entry.id], taxonomy,
                                    str(cov), str(hsp.expect), str(perc),
                                    os.linesep)
                        args.output.write(output)
        finally:
            result_handle.close()

    tqdm.write('>>> Wrote {0} total hit(s) to {1}'.format(str(count),
                                                         args.output.name))
    tqdm.write('>>> Exiting prokka_blast_pipeline')
#!/usr/bin/python2.7
from glob import glob
from Bio import SeqIO
from Bio.Blast.NCBIWWW import qblast
from os.path import splitext, basename

# The Entrez query restricting each search is the space-joined list of GIs
# read from archaea_gis.txt (one per line).
with open('archaea_gis.txt') as f:
    gis = [line.strip() for line in f]
query = " ".join(gis)

# BLAST every fasta/<accid>.fasta and store the result as blast/<accid>.xml.
for fn in glob('fasta/*.fasta'):
    try:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("getting BLAST for %s" % (fn))
        rec = SeqIO.read(fn, 'fasta')
        accid = splitext(basename(fn))[0]
        handle = qblast('blastn', 'chromosome', rec.seq, entrez_query=query)
        try:
            xml_text = handle.read()
        finally:
            handle.close()
        # Write only after the result was read completely, so a failed
        # request does not leave an empty .xml file behind.
        with open('blast/' + accid + '.xml', 'w') as alf:
            alf.write(xml_text)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print("failed %s: %s" % (fn, str(e)))