def search(self, fastaRecord, excludelist=[], usecache=True): # Write the query to a tmp file: tmpQueryFileIdent, tmpQueryFileName = tempfile.mkstemp() writeFile(tmpQueryFileName, str(fastaRecord)) # File name used for blast cache fileSuffix = '' for name in excludelist: l = re.split(r'\s+', name) for n in l: fileSuffix += n[0] if fileSuffix: fileSuffix = '_' + fileSuffix blastFileName = os.path.join( self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits, self.options.minsignificance, fileSuffix)) if usecache and os.path.exists( blastFileName) and os.path.getsize(blastFileName) > 0: # Use cached blast result if excludelist: print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join( excludelist), else: print "\n\t\tUsing cached Blast results...", sys.stdout.flush() blastFile = open(blastFileName, 'r') else: if excludelist: print "\n\t\tSearching database (excluding %s)..." % ', '.join( excludelist), else: print "\n\t\tSearching database...", sys.stdout.flush() if excludelist: # Build a blast db of the relevant records: excludeIDs = [] for taxon in excludelist: excludeIDs.extend(self.index[taxon]) includeIDs = self.sequence_index.keys.difference( set(excludeIDs)) if not includeIDs: print "done (database exhausted)\n\t\t\t", sys.stdout.flush() return SearchResult(None) blastDBfileName = os.path.join(self.options.project, "tmpBlastDB.fasta") tmpFastaFile = open(blastDBfileName, 'w') for key in includeIDs: tmpFastaFile.write(str(self.sequence_index.get_entry(key))) tmpFastaFile.close() cmd = "makeblastdb -dbtype nucl -title %s -in %s" % ( blastDBfileName, blastDBfileName) #cmd = "xdformat -n -o %s %s" % (blastDBfileName, blastDBfileName) cmd = self.escape(cmd) systemCall(cmd, stdout='IGNORE', stderr='IGNORE') else: blastDBfileName = self.blastSequenceFileName #blastDBfileName = self.blastDB # Blast: if self.options.blastwordsize: wordSize = '-word_size %s' % self.options.blastwordsize else: wordSize = '' if self.options.nolowcomplexfilter: filterOption = '-dust no' else: filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn cmd = 'blastn -db %s -outfmt 5 %s %s -evalue %s -max_target_seqs %s -query %s' \ % (blastDBfileName, wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, tmpQueryFileName) # cmd = "blastn %s -e %f -p blastn -d %s.fasta -i %s -m 7" % (wordSize, self.options.minsignificance, blastDBfileName, tmpQueryFileName) cmd = self.escape(cmd) STARTUPINFO = None if os.name == 'nt': STARTUPINFO = subprocess.STARTUPINFO() STARTUPINFO.dwFlags |= subprocess.STARTF_USESHOWWINDOW proc = subprocess.Popen(cmd, shell=True, startupinfo=STARTUPINFO, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout_value, stderr_value = proc.communicate() blastContent = str(stdout_value) # stdin, stdout, stderr = os.popen3(cmd) for f in glob.glob( os.path.join(self.options.project, 'tmpBlastDB.*')): os.remove(f) # blastContent = stdout.read() # # stdout.close() # stdin.close() # stderr.close() # This is a hack to remove an xml tag that just confuses things with blastn: blastContent = re.sub(r'<Hit_id>.*?</Hit_id>', '', blastContent) writeFile(blastFileName, blastContent) #blastFile = StringIO.StringIO(blastContent) blastFile = open(blastFileName, 'r') #blastRecord = NCBIXML.read(blastFile) try: blastRecord = NCBIXML.read(blastFile) except: blastRecord = None blastFile.close() os.close(tmpQueryFileIdent) os.unlink(tmpQueryFileName) print "done.\n\t\t\t", sys.stdout.flush() return SearchResult(blastRecord)
def search(self, fastaRecord, excludelist=[], usecache=True): # Write the query to a tmp file: tmpQueryFileIdent, tmpQueryFileName = tempfile.mkstemp() writeFile(tmpQueryFileName, str(fastaRecord)) # File name used for blast cache fileSuffix = '' for name in excludelist: l = re.split(r'\s+', name) for n in l: fileSuffix += n[0] if fileSuffix: fileSuffix = '_' + fileSuffix blastFileName = os.path.join(self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits, self.options.minsignificance, fileSuffix)) if usecache and os.path.exists(blastFileName) and os.path.getsize(blastFileName)>0: # Use cached blast result if excludelist: print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(excludelist), else: print "\n\t\tUsing cached Blast results...", sys.stdout.flush() blastFile = open(blastFileName, 'r') else: if excludelist: print "\n\t\tSearching database (excluding %s)..." % ', '.join(excludelist), else: print "\n\t\tSearching database...", sys.stdout.flush() if excludelist: # Build a blast db of the relevant records: excludeIDs = [] for taxon in excludelist: excludeIDs.extend(self.index[taxon]) includeIDs = self.sequence_index.keys.difference(set(excludeIDs)) if not includeIDs: print "done (database exhausted)\n\t\t\t", sys.stdout.flush() return SearchResult(None) blastDBfileName = os.path.join(self.options.project, "tmpBlastDB.fasta") tmpFastaFile = open(blastDBfileName, 'w') for key in includeIDs: tmpFastaFile.write(str(self.sequence_index.get_entry(key))) tmpFastaFile.close() cmd = "makeblastdb -dbtype nucl -title %s -in %s" % (blastDBfileName, blastDBfileName) #cmd = "xdformat -n -o %s %s" % (blastDBfileName, blastDBfileName) cmd = self.escape(cmd) systemCall(cmd, stdout='IGNORE', stderr='IGNORE') else: blastDBfileName = self.blastSequenceFileName #blastDBfileName = self.blastDB # Blast: if self.options.blastwordsize: wordSize = '-word_size %s' % self.options.blastwordsize else: wordSize = '' if self.options.nolowcomplexfilter: filterOption = '-dust no' else: filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn cmd = 'blastn -db %s -outfmt 5 %s %s -evalue %s -max_target_seqs %s -query %s' \ % (blastDBfileName, wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, tmpQueryFileName) # cmd = "blastn %s -e %f -p blastn -d %s.fasta -i %s -m 7" % (wordSize, self.options.minsignificance, blastDBfileName, tmpQueryFileName) cmd = self.escape(cmd) STARTUPINFO = None if os.name == 'nt': STARTUPINFO = subprocess.STARTUPINFO() STARTUPINFO.dwFlags |= subprocess.STARTF_USESHOWWINDOW proc = subprocess.Popen(cmd, shell=True, startupinfo=STARTUPINFO, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout_value, stderr_value = proc.communicate() blastContent = str(stdout_value) # stdin, stdout, stderr = os.popen3(cmd) for f in glob.glob(os.path.join(self.options.project, 'tmpBlastDB.*')): os.remove(f) # blastContent = stdout.read() # # stdout.close() # stdin.close() # stderr.close() # This is a hack to remove an xml tag that just confuses things with blastn: blastContent = re.sub(r'<Hit_id>.*?</Hit_id>', '', blastContent) writeFile(blastFileName, blastContent) #blastFile = StringIO.StringIO(blastContent) blastFile = open(blastFileName, 'r') #blastRecord = NCBIXML.read(blastFile) try: blastRecord = NCBIXML.read(blastFile) except: blastRecord = None blastFile.close() os.close(tmpQueryFileIdent) os.unlink(tmpQueryFileName) print "done.\n\t\t\t", sys.stdout.flush() return SearchResult(blastRecord)
def _netblast_search(self, fastaRecord, excludelist=[], usecache=True): """ Blast against genbank over web """ ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## if self.options.ONEMISSING: orig_limit_query = self.options.limitquery genus, species = re.search(r'_([^_]+)_([^_]+)$', fastaRecord.title).groups() self.options.limitquery = "barcode[keyword] NOT %s %s[ORGN]" % ( genus, species) print "REFORMATTING LIMIT QUERY TO", self.options.limitquery ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## # Make a query to filter the returned results: if excludelist: entrezQuery = '(' + self.options.limitquery + ') NOT (uncultured[WORD] OR ' + '[ORGN] OR '.join( excludelist) + '[ORGN])' else: entrezQuery = '(' + self.options.limitquery + ') NOT uncultured[WORD]' ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## if self.options.ONEMISSING: self.options.limitquery = orig_limit_query ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## fileSuffix = '' for name in excludelist: l = re.split(r'\s+', name) for n in l: fileSuffix += n[0] if fileSuffix: fileSuffix = '_' + fileSuffix # File name used for blast cache blastFileName = os.path.join( self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits, self.options.minsignificance, fileSuffix)) if usecache and os.path.exists( blastFileName) and os.path.getsize(blastFileName) > 0: # Use cached blast result if excludelist: print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join( excludelist), else: print "\n\t\tUsing cached Blast results...", sys.stdout.flush() else: # Make a query to filter the returned results: if excludelist: print "\n\t\tSearching database (excluding %s)..." % ', '.join( excludelist), else: print "\n\t\tSearching database...", sys.stdout.flush() fastaRecordFileName = os.path.join(self.options.project, utils.randomString(8)) fastaRecordFile = open(fastaRecordFileName, 'w') fastaRecordFile.write(str(fastaRecord)) fastaRecordFile.close() resultHandle = None if self.options.nolowcomplexfilter: filterOption = '-dust no' else: filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn if self.options.blastwordsize: wordSize = '-word_size %s' % self.options.blastwordsize else: wordSize = '' blastCmd = 'blastn -remote -outfmt 5 -db nt %s %s -evalue %s -max_target_seqs %s -entrez_query "%s" -query %s -out %s' \ % (wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, \ entrezQuery, fastaRecordFileName, blastFileName) for i in range(20): time.sleep(2 * i) error = utils.systemCall(blastCmd, stdout='IGNORE', stderr='IGNORE') try: # retval = os.system(blastCmd) # if retval != 0: # print "Netblast failed with return value %d. Trying again..." % retval error = utils.systemCall(blastCmd) if error or not os.path.exists( blastFileName) or os.path.getsize( blastFileName) == 0: print "Netblast failed. Trying again..." continue break except KeyboardInterrupt: sys.exit() except: print "Netblast failed. Trying again..." pass os.remove(fastaRecordFileName) # Read file from cache blastHandle = open(blastFileName, 'r') # Parse the result: try: blastRecord = NCBIXML.read(blastHandle) print "done.\n\t\t\t", sys.stdout.flush() except: blastRecord = None blastHandle.close() return blastRecord
def _netblast_search(self, fastaRecord, excludelist=[], usecache=True): """ Blast against genbank over web """ ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## if self.options.ONEMISSING: orig_limit_query = self.options.limitquery genus, species = re.search(r'_([^_]+)_([^_]+)$', fastaRecord.title).groups() self.options.limitquery = "barcode[keyword] NOT %s %s[ORGN]" % (genus, species) print "REFORMATTING LIMIT QUERY TO", self.options.limitquery ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## # Make a query to filter the returned results: if excludelist: entrezQuery = '(' + self.options.limitquery + ') NOT (uncultured[WORD] OR ' + '[ORGN] OR '.join(excludelist) + '[ORGN])' else: entrezQuery = '(' + self.options.limitquery + ') NOT uncultured[WORD]' ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## if self.options.ONEMISSING: self.options.limitquery = orig_limit_query ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ########################################## fileSuffix = '' for name in excludelist: l = re.split(r'\s+', name) for n in l: fileSuffix += n[0] if fileSuffix: fileSuffix = '_' + fileSuffix # File name used for blast cache blastFileName = os.path.join(self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits, self.options.minsignificance, fileSuffix)) if usecache and os.path.exists(blastFileName) and os.path.getsize(blastFileName)>0: # Use cached blast result if excludelist: print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(excludelist), else: print "\n\t\tUsing cached Blast results...", sys.stdout.flush() else: # Make a query to filter the returned results: if excludelist: print "\n\t\tSearching database (excluding %s)..." % ', '.join(excludelist), else: print "\n\t\tSearching database...", sys.stdout.flush() fastaRecordFileName = os.path.join(self.options.project, utils.randomString(8)) fastaRecordFile = open(fastaRecordFileName, 'w') fastaRecordFile.write(str(fastaRecord)) fastaRecordFile.close() resultHandle = None if self.options.nolowcomplexfilter: filterOption = '-dust no' else: filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn if self.options.blastwordsize: wordSize = '-word_size %s' % self.options.blastwordsize else: wordSize = '' blastCmd = 'blastn -remote -outfmt 5 -db nt %s %s -evalue %s -max_target_seqs %s -entrez_query "%s" -query %s -out %s' \ % (wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, \ entrezQuery, fastaRecordFileName, blastFileName) for i in range(20): time.sleep(2 * i) error = utils.systemCall(blastCmd, stdout='IGNORE', stderr='IGNORE') try: # retval = os.system(blastCmd) # if retval != 0: # print "Netblast failed with return value %d. Trying again..." % retval error = utils.systemCall(blastCmd) if error or not os.path.exists(blastFileName) or os.path.getsize(blastFileName) == 0: print "Netblast failed. Trying again..." continue break except KeyboardInterrupt: sys.exit() except: print "Netblast failed. Trying again..." pass os.remove(fastaRecordFileName) # Read file from cache blastHandle = open(blastFileName, 'r') # Parse the result: try: blastRecord = NCBIXML.read(blastHandle) print "done.\n\t\t\t", sys.stdout.flush() except: blastRecord = None blastHandle.close() return blastRecord