Ejemplo n.º 1
0
    def search(self, fastaRecord, excludelist=[], usecache=True):

        # Write the query to a tmp file:
        tmpQueryFileIdent, tmpQueryFileName = tempfile.mkstemp()
        writeFile(tmpQueryFileName, str(fastaRecord))

        # File name used for blast cache
        fileSuffix = ''
        for name in excludelist:
            l = re.split(r'\s+', name)
            for n in l:
                fileSuffix += n[0]
        if fileSuffix:
            fileSuffix = '_' + fileSuffix

        blastFileName = os.path.join(
            self.options.blastcache,
            "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits,
                                self.options.minsignificance, fileSuffix))

        if usecache and os.path.exists(
                blastFileName) and os.path.getsize(blastFileName) > 0:
            # Use cached blast result
            if excludelist:
                print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(
                    excludelist),
            else:
                print "\n\t\tUsing cached Blast results...",
            sys.stdout.flush()

            blastFile = open(blastFileName, 'r')
        else:
            if excludelist:
                print "\n\t\tSearching database (excluding %s)..." % ', '.join(
                    excludelist),
            else:
                print "\n\t\tSearching database...",
            sys.stdout.flush()

            if excludelist:
                # Build a blast db of the relevant records:
                excludeIDs = []
                for taxon in excludelist:
                    excludeIDs.extend(self.index[taxon])
                includeIDs = self.sequence_index.keys.difference(
                    set(excludeIDs))

                if not includeIDs:
                    print "done (database exhausted)\n\t\t\t",
                    sys.stdout.flush()
                    return SearchResult(None)

                blastDBfileName = os.path.join(self.options.project,
                                               "tmpBlastDB.fasta")
                tmpFastaFile = open(blastDBfileName, 'w')

                for key in includeIDs:
                    tmpFastaFile.write(str(self.sequence_index.get_entry(key)))
                tmpFastaFile.close()
                cmd = "makeblastdb -dbtype nucl -title %s -in %s" % (
                    blastDBfileName, blastDBfileName)
                #cmd = "xdformat -n -o %s %s" % (blastDBfileName, blastDBfileName)
                cmd = self.escape(cmd)
                systemCall(cmd, stdout='IGNORE', stderr='IGNORE')
            else:
                blastDBfileName = self.blastSequenceFileName
                #blastDBfileName = self.blastDB

            # Blast:
            if self.options.blastwordsize:
                wordSize = '-word_size %s' % self.options.blastwordsize
            else:
                wordSize = ''

            if self.options.nolowcomplexfilter:
                filterOption = '-dust no'
            else:
                filterOption = '-dust yes'  # FIXME: Check that this is an ok default... It is not the defalut in blastn

            cmd = 'blastn -db %s -outfmt 5 %s %s -evalue %s -max_target_seqs %s -query %s' \
                       % (blastDBfileName, wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits,
                          tmpQueryFileName)

            #            cmd = "blastn %s -e %f -p blastn -d %s.fasta -i %s -m 7" % (wordSize, self.options.minsignificance, blastDBfileName, tmpQueryFileName)
            cmd = self.escape(cmd)

            STARTUPINFO = None
            if os.name == 'nt':
                STARTUPINFO = subprocess.STARTUPINFO()
                STARTUPINFO.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            proc = subprocess.Popen(cmd,
                                    shell=True,
                                    startupinfo=STARTUPINFO,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            stdout_value, stderr_value = proc.communicate()
            blastContent = str(stdout_value)

            #            stdin, stdout, stderr = os.popen3(cmd)

            for f in glob.glob(
                    os.path.join(self.options.project, 'tmpBlastDB.*')):
                os.remove(f)


#             blastContent = stdout.read()
#
#             stdout.close()
#             stdin.close()
#             stderr.close()

# This is a hack to remove an xml tag that just confuses things with blastn:
            blastContent = re.sub(r'<Hit_id>.*?</Hit_id>', '', blastContent)

            writeFile(blastFileName, blastContent)

            #blastFile = StringIO.StringIO(blastContent)
            blastFile = open(blastFileName, 'r')

        #blastRecord = NCBIXML.read(blastFile)
        try:
            blastRecord = NCBIXML.read(blastFile)
        except:
            blastRecord = None

        blastFile.close()

        os.close(tmpQueryFileIdent)
        os.unlink(tmpQueryFileName)

        print "done.\n\t\t\t",
        sys.stdout.flush()

        return SearchResult(blastRecord)
Ejemplo n.º 2
0
    def search(self, fastaRecord, excludelist=[], usecache=True):

        # Write the query to a tmp file:
        tmpQueryFileIdent, tmpQueryFileName = tempfile.mkstemp()
        writeFile(tmpQueryFileName, str(fastaRecord))

        # File name used for blast cache
        fileSuffix = ''
        for name in excludelist:
            l = re.split(r'\s+', name)
            for n in l:
                fileSuffix += n[0]
        if fileSuffix:
           fileSuffix = '_' + fileSuffix

        blastFileName = os.path.join(self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title,
                                               self.options.maxblasthits, self.options.minsignificance, fileSuffix))

        if usecache and os.path.exists(blastFileName) and os.path.getsize(blastFileName)>0:
            # Use cached blast result
            if excludelist:
               print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(excludelist),
            else:
                print "\n\t\tUsing cached Blast results...", 
            sys.stdout.flush()

            blastFile = open(blastFileName, 'r')
        else:
            if excludelist:
                print "\n\t\tSearching database (excluding %s)..." % ', '.join(excludelist), 
            else:
                print "\n\t\tSearching database...", 
            sys.stdout.flush()

            if excludelist:
                # Build a blast db of the relevant records:
                excludeIDs = []
                for taxon in excludelist:
                    excludeIDs.extend(self.index[taxon])
                includeIDs = self.sequence_index.keys.difference(set(excludeIDs))

                if not includeIDs:
                    print "done (database exhausted)\n\t\t\t",
                    sys.stdout.flush()       
                    return SearchResult(None)

                blastDBfileName = os.path.join(self.options.project, "tmpBlastDB.fasta")
                tmpFastaFile = open(blastDBfileName, 'w')

                for key in includeIDs:
                    tmpFastaFile.write(str(self.sequence_index.get_entry(key)))
                tmpFastaFile.close()
                cmd = "makeblastdb -dbtype nucl -title %s -in %s" % (blastDBfileName, blastDBfileName)
                #cmd = "xdformat -n -o %s %s" % (blastDBfileName, blastDBfileName)
                cmd = self.escape(cmd)
                systemCall(cmd, stdout='IGNORE', stderr='IGNORE')
            else:
                blastDBfileName = self.blastSequenceFileName
                #blastDBfileName = self.blastDB
            
            # Blast:
            if self.options.blastwordsize:
                wordSize = '-word_size %s' % self.options.blastwordsize
            else:
                wordSize = ''

            if self.options.nolowcomplexfilter:
                filterOption = '-dust no'
            else:
                filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn

            cmd = 'blastn -db %s -outfmt 5 %s %s -evalue %s -max_target_seqs %s -query %s' \
                       % (blastDBfileName, wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits,
                          tmpQueryFileName)
               
#            cmd = "blastn %s -e %f -p blastn -d %s.fasta -i %s -m 7" % (wordSize, self.options.minsignificance, blastDBfileName, tmpQueryFileName)
            cmd = self.escape(cmd)

            STARTUPINFO = None
            if os.name == 'nt':
                STARTUPINFO = subprocess.STARTUPINFO()
                STARTUPINFO.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            proc = subprocess.Popen(cmd, shell=True, startupinfo=STARTUPINFO,
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout_value, stderr_value = proc.communicate()
            blastContent = str(stdout_value)

#            stdin, stdout, stderr = os.popen3(cmd)

            for f in glob.glob(os.path.join(self.options.project, 'tmpBlastDB.*')):
                os.remove(f)

#             blastContent = stdout.read()
# 
#             stdout.close()
#             stdin.close()
#             stderr.close()

            # This is a hack to remove an xml tag that just confuses things with blastn:
            blastContent = re.sub(r'<Hit_id>.*?</Hit_id>', '', blastContent)

            writeFile(blastFileName, blastContent)

            #blastFile = StringIO.StringIO(blastContent)
            blastFile = open(blastFileName, 'r')

        #blastRecord = NCBIXML.read(blastFile)
        try:
            blastRecord = NCBIXML.read(blastFile)
        except:
            blastRecord = None

        blastFile.close()

        os.close(tmpQueryFileIdent)
        os.unlink(tmpQueryFileName)

        print "done.\n\t\t\t",
        sys.stdout.flush()
        
        return SearchResult(blastRecord)
Ejemplo n.º 3
0
    def _netblast_search(self, fastaRecord, excludelist=[], usecache=True):
        """
        Blast against genbank over web
        """

        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################
        if self.options.ONEMISSING:
            orig_limit_query = self.options.limitquery
            genus, species = re.search(r'_([^_]+)_([^_]+)$',
                                       fastaRecord.title).groups()
            self.options.limitquery = "barcode[keyword] NOT %s %s[ORGN]" % (
                genus, species)
            print "REFORMATTING LIMIT QUERY TO", self.options.limitquery
        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################

        # Make a query to filter the returned results:
        if excludelist:
            entrezQuery = '(' + self.options.limitquery + ') NOT (uncultured[WORD] OR ' + '[ORGN] OR '.join(
                excludelist) + '[ORGN])'
        else:
            entrezQuery = '(' + self.options.limitquery + ') NOT uncultured[WORD]'

        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################
        if self.options.ONEMISSING:
            self.options.limitquery = orig_limit_query
        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################

        fileSuffix = ''
        for name in excludelist:
            l = re.split(r'\s+', name)
            for n in l:
                fileSuffix += n[0]
        if fileSuffix:
            fileSuffix = '_' + fileSuffix

        # File name used for blast cache
        blastFileName = os.path.join(
            self.options.blastcache,
            "%s.%d_%s%s.xml" % (fastaRecord.title, self.options.maxblasthits,
                                self.options.minsignificance, fileSuffix))

        if usecache and os.path.exists(
                blastFileName) and os.path.getsize(blastFileName) > 0:
            # Use cached blast result
            if excludelist:
                print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(
                    excludelist),
            else:
                print "\n\t\tUsing cached Blast results...",
            sys.stdout.flush()
        else:
            # Make a query to filter the returned results:
            if excludelist:
                print "\n\t\tSearching database (excluding %s)..." % ', '.join(
                    excludelist),
            else:
                print "\n\t\tSearching database...",
            sys.stdout.flush()

            fastaRecordFileName = os.path.join(self.options.project,
                                               utils.randomString(8))
            fastaRecordFile = open(fastaRecordFileName, 'w')
            fastaRecordFile.write(str(fastaRecord))
            fastaRecordFile.close()
            resultHandle = None
            if self.options.nolowcomplexfilter:
                filterOption = '-dust no'
            else:
                filterOption = '-dust yes'  # FIXME: Check that this is an ok default... It is not the defalut in blastn

            if self.options.blastwordsize:
                wordSize = '-word_size %s' % self.options.blastwordsize
            else:
                wordSize = ''

            blastCmd = 'blastn -remote -outfmt 5 -db nt %s %s -evalue %s -max_target_seqs %s -entrez_query "%s" -query %s -out %s' \
                       % (wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, \
                          entrezQuery, fastaRecordFileName, blastFileName)

            for i in range(20):
                time.sleep(2 * i)
                error = utils.systemCall(blastCmd,
                                         stdout='IGNORE',
                                         stderr='IGNORE')
                try:

                    #                     retval = os.system(blastCmd)
                    #                     if retval != 0:
                    #                        print "Netblast failed with return value %d. Trying again..." % retval
                    error = utils.systemCall(blastCmd)
                    if error or not os.path.exists(
                            blastFileName) or os.path.getsize(
                                blastFileName) == 0:
                        print "Netblast failed. Trying again..."
                        continue

                    break
                except KeyboardInterrupt:
                    sys.exit()
                except:
                    print "Netblast failed. Trying again..."
                    pass
            os.remove(fastaRecordFileName)

        # Read file from cache
        blastHandle = open(blastFileName, 'r')

        # Parse the result:
        try:
            blastRecord = NCBIXML.read(blastHandle)
            print "done.\n\t\t\t",
            sys.stdout.flush()
        except:
            blastRecord = None
        blastHandle.close()

        return blastRecord
Ejemplo n.º 4
0
    def _netblast_search(self, fastaRecord, excludelist=[], usecache=True):
        """
        Blast against genbank over web
        """
        
        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################
        if self.options.ONEMISSING:
           orig_limit_query = self.options.limitquery
           genus, species = re.search(r'_([^_]+)_([^_]+)$', fastaRecord.title).groups()
           self.options.limitquery = "barcode[keyword] NOT %s %s[ORGN]" % (genus, species)
           print "REFORMATTING LIMIT QUERY TO", self.options.limitquery
        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################


        # Make a query to filter the returned results:
        if excludelist:
            entrezQuery = '(' + self.options.limitquery + ') NOT (uncultured[WORD] OR ' + '[ORGN] OR '.join(excludelist) + '[ORGN])'
        else:
            entrezQuery = '(' + self.options.limitquery + ') NOT uncultured[WORD]'

        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################
        if self.options.ONEMISSING:
           self.options.limitquery = orig_limit_query
        ### THIS IS A HACK FOR TESTING PURPOSES TO REMOVED AGAIN ##########################################
            
        fileSuffix = ''
        for name in excludelist:
            l = re.split(r'\s+', name)
            for n in l:
                fileSuffix += n[0]
        if fileSuffix:
           fileSuffix = '_' + fileSuffix

        # File name used for blast cache
        blastFileName = os.path.join(self.options.blastcache, "%s.%d_%s%s.xml" % (fastaRecord.title,
                                               self.options.maxblasthits, self.options.minsignificance, fileSuffix))
        
        if usecache and os.path.exists(blastFileName) and os.path.getsize(blastFileName)>0:
            # Use cached blast result
            if excludelist:
               print "\n\t\tUsing cached Blast results (excluding %s)..." % ', '.join(excludelist),
            else:
                print "\n\t\tUsing cached Blast results...", 
            sys.stdout.flush()
        else:
            # Make a query to filter the returned results:
            if excludelist:
                print "\n\t\tSearching database (excluding %s)..." % ', '.join(excludelist), 
            else:
                print "\n\t\tSearching database...", 
            sys.stdout.flush()

            fastaRecordFileName = os.path.join(self.options.project, utils.randomString(8))
            fastaRecordFile = open(fastaRecordFileName, 'w')
            fastaRecordFile.write(str(fastaRecord))
            fastaRecordFile.close()
            resultHandle = None
            if self.options.nolowcomplexfilter:
                filterOption = '-dust no'
            else:
                filterOption = '-dust yes' # FIXME: Check that this is an ok default... It is not the defalut in blastn

            if self.options.blastwordsize:
                wordSize = '-word_size %s' % self.options.blastwordsize
            else:
                wordSize = ''

            blastCmd = 'blastn -remote -outfmt 5 -db nt %s %s -evalue %s -max_target_seqs %s -entrez_query "%s" -query %s -out %s' \
                       % (wordSize, filterOption, self.options.minsignificance, self.options.maxblasthits, \
                          entrezQuery, fastaRecordFileName, blastFileName)

            for i in range(20):
                time.sleep(2 * i)
                error = utils.systemCall(blastCmd, stdout='IGNORE', stderr='IGNORE')
                try:

#                     retval = os.system(blastCmd)
#                     if retval != 0:
#                        print "Netblast failed with return value %d. Trying again..." % retval                   
                    error = utils.systemCall(blastCmd)
                    if error or not os.path.exists(blastFileName) or os.path.getsize(blastFileName) == 0:
                       print "Netblast failed. Trying again..."
                       continue

                    break
                except KeyboardInterrupt:
                    sys.exit()
                except:
                    print "Netblast failed. Trying again..."
                    pass
            os.remove(fastaRecordFileName)

        # Read file from cache
        blastHandle = open(blastFileName, 'r')

        # Parse the result:
        try:
           blastRecord = NCBIXML.read(blastHandle)
           print "done.\n\t\t\t",
           sys.stdout.flush()
        except:
            blastRecord = None
        blastHandle.close()

        return blastRecord