Ejemplo n.º 1
0
    def parse_result(self):
        """
        Parse the alignment written to the hmmalign result file.

        Reads C{self.f_out}, collects every row that starts with the search
        sequence ID (C{self.fastaID}) and every annotation row of the form
        '#=XX YY' followed by a run of '.' and 'x' characters, and
        concatenates the sequence part of each group into one string.

        @return: alignment and matching hmm positions with gaps
        @rtype: str, str

        @raise HmmerError: if the result file does not exist or is empty
        """
        ## check that the output file is there and seems valid
        if not os.path.exists(self.f_out):
            raise HmmerError(
                'Hmmeralign result file %s does not exist.' % self.f_out)

        if T.fileLength(self.f_out) < 1:
            raise HmmerError(
                'Hmmeralign result file %s seems incomplete.' % self.f_out)

        ## read result; the with-block closes the file even if read() fails
        with open(self.f_out, 'r') as hmm:
            out = hmm.read()

        ## extract search sequence
        ## the ID is escaped so that regex meta characters in it (e.g. the
        ## '|' or '.' that are common in fasta IDs) are matched literally
        seq_pattern = re.escape(self.fastaID) + r'[ ]+[-a-yA-Y]+'
        fastaSeq = ''.join(
            [row.split()[1] for row in re.findall(seq_pattern, out)])

        ## extract hmm sequence (third token of each annotation row)
        hmm_rows = re.findall(r'#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out)
        hmmSeq = ''.join([row.split()[2].strip() for row in hmm_rows])

        return fastaSeq, hmmSeq
Ejemplo n.º 2
0
    def parse_result( self ):
        """
        Parse the alignment written to the hmmalign result file.

        The rows of C{self.f_out} that start with the search sequence ID
        (C{self.fastaID}) are joined into the aligned sequence; the
        annotation rows ('#=XX YY' followed by '.' and 'x' characters)
        are joined into the string of hmm positions.

        @return: alignment and matching hmm positions with gaps
        @rtype: str, str

        @raise HmmerError: if the result file does not exist or is empty
        """
        ## check that the output file is there and seems valid
        if not os.path.exists( self.f_out ):
            raise HmmerError(
                'Hmmeralign result file %s does not exist.'%self.f_out )

        if T.fileLength( self.f_out ) < 1:
            raise HmmerError(
                'Hmmeralign result file %s seems incomplete.'%self.f_out )

        ## read result (file is closed even if reading fails)
        with open( self.f_out, 'r' ) as hmm:
            out = hmm.read()

        ## extract search sequence
        ## re.escape: characters like '|' or '.' in the ID must not be
        ## interpreted as regular expression syntax
        fastaSeq = re.findall( re.escape( self.fastaID ) + r'[ ]+[-a-yA-Y]+',
                               out )
        fastaSeq = ''.join([ i.split()[1] for i in fastaSeq ])

        ## extract hmm sequence
        hmmSeq = re.findall( r'#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out )
        hmmSeq = ''.join([ i.split()[2].strip() for i in hmmSeq ])

        return fastaSeq, hmmSeq
Ejemplo n.º 3
0
    def clusterFasta( self, fastaIn=None, simCut=1.75, lenCut=0.9, ncpu=1 ):
        """
        Cluster sequences. The input fasta titles must be the IDs.
        fastaClust( fastaIn [, simCut, lenCut, ncpu] )

        The clusters parsed from the blastclust output are stored in
        C{self.clusters} (one list of IDs per cluster) and the result of
        C{self.selectFasta} for each cluster in C{self.bestOfCluster}.

        @param fastaIn: name of input fasta file
                        (default: self.outFolder + self.F_FASTA_ALL)
        @type  fastaIn: str
        @param simCut: similarity threshold (score < 3 or %identity)
                       (default: 1.75)
        @type  simCut: double
        @param lenCut: length threshold (default: 0.9)
        @type  lenCut: double
        @param ncpu: number of CPUs
        @type  ncpu: int

        @raise IOError: if fastaIn is empty
        @raise BlastError: if blastclust exits with an error code or its
                           output contains no cluster listing
        """
        fastaIn = fastaIn or self.outFolder + self.F_FASTA_ALL

        if T.fileLength( fastaIn ) < 1:
            raise IOError( "File %s empty. Nothing to cluster"%fastaIn )

        if self.verbose:
            self.log.add( "\nClustering sequences:\n%s"%('-'*20) )

        ## NOTE(review): the command line is passed to a shell as one string;
        ## a path containing blanks or shell meta characters will break it
        cmd = settings.blastclust_bin + ' -i %s -S %f -L %f -a %i' %\
            (fastaIn, simCut, lenCut, ncpu)

        if self.verbose:
            self.log.add("- Command: %s"%cmd)

        ## bugfix: at all cost prevent blastclust from using shared temp folder
        tmp = os.environ.get( 'TMPDIR', None )
        if tmp:
            del os.environ['TMPDIR']

        try:
            err, o = commands.getstatusoutput( cmd )
        finally:
            ## put TMPDIR back in every case, also if the call itself fails
            if tmp:
                os.environ['TMPDIR'] = tmp

        if err:
            raise BlastError( "blastclust failed. Error code: " + str(err) )

        ## blastclust might write errors to file, if so the errors
        ## occur before the dateline (the line ending with 'queries')
        lines = [ l.split() for l in o.split('\n') ]

        ## blank lines split into empty lists and have no last word
        lastWords = [ (l[-1] if l else None) for l in lines ]
        try:
            dateline = lastWords.index('queries')
        except ValueError:
            raise BlastError( "blastclust output contains no cluster "
                              "listing (no line ending with 'queries')." )

        ## every non-empty line after the dateline is one cluster
        self.clusters = [ l for l in lines[dateline+1:] if l ]

        self.reportClustering( raw=o )

        self.bestOfCluster = [ self.selectFasta( ids )
                               for ids in self.clusters ]
Ejemplo n.º 4
0
    def clusterFasta(self, fastaIn=None, simCut=1.75, lenCut=0.9, ncpu=1):
        """
        Cluster sequences. The input fasta titles must be the IDs.
        fastaClust( fastaIn [, simCut, lenCut, ncpu] )

        Stores the clusters parsed from the blastclust output in
        C{self.clusters} and, for each of them, the result of
        C{self.selectFasta} in C{self.bestOfCluster}.

        @param fastaIn: name of input fasta file
                        (default: self.outFolder + self.F_FASTA_ALL)
        @type  fastaIn: str
        @param simCut: similarity threshold (score < 3 or %identity)
                       (default: 1.75)
        @type  simCut: double
        @param lenCut: length threshold (default: 0.9)
        @type  lenCut: double
        @param ncpu: number of CPUs
        @type  ncpu: int

        @raise IOError: if fastaIn is empty
        @raise BlastError: if blastclust exits with an error code or its
                           output contains no cluster listing
        """
        fastaIn = fastaIn or self.outFolder + self.F_FASTA_ALL

        if T.fileLength(fastaIn) < 1:
            raise IOError("File %s empty. Nothing to cluster" % fastaIn)

        if self.verbose:
            self.log.add("\nClustering sequences:\n%s" % ('-' * 20))

        ## NOTE(review): cmd is handed to a shell as a single string, so a
        ## file name with blanks or shell meta characters will break the call
        cmd = settings.blastclust_bin + ' -i %s -S %f -L %f -a %i' %\
            (fastaIn, simCut, lenCut, ncpu)

        if self.verbose:
            self.log.add("- Command: %s" % cmd)

        ## bugfix: at all cost prevent blastclust from using shared temp folder
        tmp = os.environ.get('TMPDIR', None)
        if tmp:
            del os.environ['TMPDIR']

        try:
            err, o = commands.getstatusoutput(cmd)
        finally:
            ## always restore the environment, even if the call raised
            if tmp:
                os.environ['TMPDIR'] = tmp

        if err:
            raise BlastError("blastclust failed. Error code: " + str(err))

        ## blastclust might write errors to file, if so the errors
        ## occur before the dateline (the line ending with 'queries')
        lines = [l.split() for l in o.split('\n')]

        ## an empty line gives an empty token list -- it has no last word
        last_words = [(l[-1] if l else None) for l in lines]
        try:
            dateline = last_words.index('queries')
        except ValueError:
            raise BlastError("blastclust output contains no cluster "
                             "listing (no line ending with 'queries').")

        ## each remaining non-empty line lists the IDs of one cluster
        self.clusters = [l for l in lines[dateline + 1:] if l]

        self.reportClustering(raw=o)

        self.bestOfCluster = [self.selectFasta(ids) for ids in self.clusters]
Ejemplo n.º 5
0
    def parse_result( self ):
        """
        Parse the output from hmmpfam.

        Two tables are read from C{self.f_out}, each introduced by a
        header rule of dashes and terminated by an empty line:
          - the table of matching profiles; the first column becomes the
            key and the remaining columns the value of C{matches}
          - the table of hits; every row is appended, split into its
            columns, to C{hits}
        Parsing stops after the hits table or at the end of the file.

        @return: dictionary with profile names as keys and a list of
                 lists containing information about the range where the
                 profile matches the sequence
        @rtype: dict, [list]

        @raise HmmerError: if the result file is missing, seems incomplete
                           or cannot be parsed
        """
        matches = {}
        hits = []

        ## check that the output file is there and seems valid
        if not os.path.exists( self.f_out ):
            raise HmmerError(
                'Hmmersearch result file %s does not exist.'%self.f_out )

        if T.fileLength( self.f_out ) < 10:
            raise HmmerError(
                'Hmmersearch result file %s seems incomplete.'%self.f_out )

        try:
            ## the with-block closes the file also if parsing fails
            with open( self.f_out, 'r' ) as out:
                while True:
                    l = out.readline()

                    ## readline returns '' at the end of the file; without
                    ## this check a file lacking the hits table would keep
                    ## the loop spinning forever
                    if not l:
                        break

                    ## get names and descriptions of matching profiles
                    if re.match( r'^-{8}\s{7,8}-{11}.+-{3}$', l ):
                        m = out.readline().split()
                        while m:
                            matches[m[0]] = m[1:]
                            m = out.readline().split()

                    ## get hits, scores and alignment positions
                    if re.match( r'^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l ):
                        h = out.readline().split()
                        while h:
                            hits.append( h )
                            h = out.readline().split()
                        break

        except Exception as why:
            ## Exception (not a bare except) lets KeyboardInterrupt and
            ## SystemExit pass through untouched
            raise HmmerError(
                'ERROR parsing hmmpfam search result: %s'%self.f_out +
                '\n%r'%why )

        return matches, hits
Ejemplo n.º 6
0
 def __init__( self, hmmFile, fastaFile, fastaID, **kw ):
     """
     Remember the input files and set up the Executor for 'hmmalign'.

     Additional keyword arguments are handed on to Executor.__init__.

     @param hmmFile: path to hmm file (profile)
     @type  hmmFile: str
     @param fastaFile: path to fasta search sequence
     @type  fastaFile: str
     @param fastaID: fasta id of search sequence
     @type  fastaID: str

     @raise AssertionError: if T.fileLength reports 10 or less for hmmFile
     """
     self.fastaID, self.hmmFile, self.fastaFile = fastaID, hmmFile, fastaFile

     ## refuse to start without a usable profile
     assert 10 < T.fileLength( self.hmmFile ), \
            'input HMM file missing or empty'

     ## command line arguments: profile first, then the sequence file
     cmdArgs = ' -q %s %s'%(hmmFile, fastaFile)

     Executor.__init__( self, 'hmmalign', args=cmdArgs, **kw )
Ejemplo n.º 7
0
    def parse_result(self):
        """
        Parse the output from hmmpfam.

        Two tables are read from C{self.f_out}, each introduced by a
        header rule of dashes and terminated by an empty line:
          - the table of matching profiles; the first column becomes the
            key and the remaining columns the value of C{matches}
          - the table of hits; every row is appended, split into its
            columns, to C{hits}
        A table that is cut off by the end of the file is reported as a
        parsing error.

        @return: dictionary with profile names as keys and a list of
                 lists containing information about the range where the
                 profile matches the sequence
        @rtype: dict, [list]

        @raise HmmerError: if the result file is missing, seems incomplete
                           or cannot be parsed
        """
        matches = {}
        hits = []

        ## check that the output file is there and seems valid
        if not os.path.exists(self.f_out):
            raise HmmerError(
                'Hmmersearch result file %s does not exist.' % self.f_out)

        if T.fileLength(self.f_out) < 10:
            raise HmmerError(
                'Hmmersearch result file %s seems incomplete.' % self.f_out)

        try:
            ## close the file explicitly instead of relying on garbage
            ## collection
            with open(self.f_out, 'r') as f:
                ## an iterator replaces the repeated lines.pop(0), which
                ## shifted the whole list on every call
                lines = iter(f.readlines())

            for l in lines:
                ## get names and descriptions of matching profiles
                if re.match(r'^-{8}\s{7,8}-{11}.+-{3}$', l):
                    ## next() raises StopIteration if the table is cut off
                    ## by the end of the file; it is reported below
                    m = next(lines).split()
                    while m:
                        matches[m[0]] = m[1:]
                        m = next(lines).split()

                ## get hits, scores and alignment positions
                if re.match(r'^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l):
                    h = next(lines).split()
                    while h:
                        hits.append(h)
                        h = next(lines).split()
                    break

        except Exception as why:
            raise HmmerError(
                'ERROR parsing hmmpfam search result: %s' % self.f_out +
                '\n%r' % why)

        ## the visible block ended without returning the documented result
        return matches, hits
Ejemplo n.º 8
0
    def __init__(self, hmmFile, fastaFile, fastaID, **kw):
        """
        Store the input parameters and initialise the Executor base class
        for the program 'hmmalign'. Any further keyword arguments are
        passed through to Executor.__init__.

        @param hmmFile: path to hmm file (profile)
        @type  hmmFile: str
        @param fastaFile: path to fasta search sequence
        @type  fastaFile: str
        @param fastaID: fasta id of search sequence
        @type  fastaID: str

        @raise AssertionError: if T.fileLength reports 10 or less for hmmFile
        """
        self.fastaID = fastaID
        self.fastaFile = fastaFile
        self.hmmFile = hmmFile

        ## the profile has to be there before the program is set up
        assert T.fileLength(self.hmmFile) > 10, 'input HMM file missing or empty'

        program = 'hmmalign'
        arguments = ' -q %s %s' % (hmmFile, fastaFile)

        Executor.__init__(self, program, args=arguments, **kw)
Ejemplo n.º 9
0
    def parse_result(self):
        """
        Extract some information about the profile as well as the
        match state emission scores from the result file C{self.f_out}.
        Keys of the returned dictionary::
          'AA', 'name', 'NrSeq', 'emmScore', 'accession',
          'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum'

        @return: dictionary with various information about the profile
        @rtype: dict

        @raise HmmerError: if the result file does not exist or
                           T.fileLength reports less than 10 for it
        """
        ## check that the output file is there and seems valid
        if not os.path.exists(self.f_out):
            raise HmmerError(
                'Hmmerfetch result file %s does not exist.' % self.f_out)

        if T.fileLength(self.f_out) < 10:
            raise HmmerError(
                'Hmmerfetch result file %s seems incomplete.' % self.f_out)

        profileDic = {}

        ## read result
        handle = open(self.f_out, 'r')
        out = handle.read()
        handle.close()

        def tokens(pattern):
            ## whitespace-separated fields of the first match in the file
            return re.findall(pattern, out)[0].split()

        ## collect some data about the hmm profile
        profileDic['name'] = self.hmmName
        profileDic['profLength'] = int(tokens('LENG\s+[0-9]+')[1])
        profileDic['accession'] = tokens('ACC\s+PF[0-9]+')[1]
        profileDic['NrSeq'] = int(tokens('NSEQ\s+[0-9]+')[1])
        profileDic['AA'] = tokens('HMM[ ]+' + '[A-Y][ ]+' * 20)[1:]

        ## collect null emission scores (the leading 'NULE' tag is dropped)
        nullEmm = [
            float(t) for t in tokens('NULE[ ]+' + '[-0-9]+[ ]+' * 20)[1:]
        ]

        ## get emission scores: for every profile position the position
        ## number followed by 20 score columns
        scoreColumns = "[ ]+[-0-9]+" * 20
        prob = [
            [float(t) for t in tokens("[ ]+%i" % pos + scoreColumns)]
            for pos in range(1, profileDic['profLength'] + 1)
        ]

        ## first column -> position numbers, remaining columns -> scores
        profileDic['seqNr'] = N.transpose(N.take(prob, (0, ), 1))
        profileDic['emmScore'] = N.array(prob)[:, 1:]

        ## calculate emission probabilities
        emmProb, nullProb = self.hmmEmm2Prob(nullEmm, profileDic['emmScore'])

        profileDic['ent'] = N.array(
            [N.resize(self.entropy(e, nullProb), (1, 20))[0] for e in emmProb])

        ###### TEST #####
        ## (a former experiment storing the unscaled maximum score of each
        ##  position under 'maxAll' is disabled and not computed)

        proba = N.array(prob)[:, 1:]

        # test set all to N.sum( abs( probabilities ) )
        profileDic['absSum'] = [
            N.resize(N.sum(N.absolute(row)), N.shape(row)) for row in proba
        ]

        # set all to normalized max score
        # NOTE(review): math.SD is not in the standard math module --
        # presumably 'math' is a project module here; verify the import
        scaledMax = []
        for row in proba:
            scaled = (row - N.average(row)) / math.SD(row)
            scaledMax.append(
                N.resize(scaled[N.argmax(N.array(scaled))], N.shape(row)))
        profileDic['maxAllScale'] = scaledMax

        return profileDic
Ejemplo n.º 10
0
    def parse_result( self ):
        """
        Extract some information about the profile as well as the
        match state emission scores from the result file C{self.f_out}.
        Keys of the returned dictionary::
          'AA', 'name', 'NrSeq', 'emmScore', 'accession',
          'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum'

        @return: dictionary with various information about the profile
        @rtype: dict

        @raise HmmerError: if the result file does not exist or
                           T.fileLength reports less than 10 for it
        """
        ## check that the output file is there and seems valid
        if not os.path.exists( self.f_out ):
            raise HmmerError(
                'Hmmerfetch result file %s does not exist.'%self.f_out )

        if T.fileLength( self.f_out ) < 10:
            raise HmmerError(
                'Hmmerfetch result file %s seems incomplete.'%self.f_out )

        ## read result
        hmm = open( self.f_out, 'r')
        out = hmm.read()
        hmm.close()

        ## collect some data about the hmm profile; each value is the
        ## second field of the first line matching the respective tag
        name = self.hmmName
        profLength = int( string.split(
            re.findall('LENG\s+[0-9]+', out)[0] )[1] )
        accession = string.split( re.findall('ACC\s+PF[0-9]+', out)[0] )[1]
        nrSeq = int( string.split( re.findall('NSEQ\s+[0-9]+', out)[0] )[1] )
        aa = string.split(
            re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:]

        ## collect null emission scores
        nullPattern = 'NULE[ ]+' + '[-0-9]+[ ]+'*20
        nullFields = string.split( re.findall(nullPattern, out)[0] )[1:]
        nullEmm = [ float(field) for field in nullFields ]

        ## get emission scores, one row (position number + 20 scores)
        ## for each position of the profile
        prob = []
        for position in range(1, profLength+1):
            rowPattern = "[ ]+%i"%position + "[ ]+[-0-9]+"*20
            rowFields = string.split( re.findall(rowPattern, out)[0] )
            prob.append( [ float(field) for field in rowFields ] )

        seqNr = N.transpose( N.take( prob, (0,),1 ) )
        emmScore = N.array(prob)[:,1:]

        ## calculate emission probabilities
        emmProb, nullProb = self.hmmEmm2Prob( nullEmm, emmScore )

        ent = N.array(
            [ N.resize( self.entropy(e, nullProb), (1,20) )[0]
              for e in emmProb ] )

        ###### TEST #####
        ## (a former experiment storing the unscaled maximum score of each
        ##  position under 'maxAll' is disabled and not computed)

        proba = N.array(prob)[:,1:]

        # test set all to N.sum( abs( probabilities ) )
        absSum = []
        for scores in proba:
            absSum.append(
                N.resize( N.sum( N.absolute( scores ) ), N.shape( scores ) ) )

        # set all to normalized max score
        # NOTE(review): math.SD is not in the standard math module --
        # presumably 'math' is a project module here; verify the import
        maxAllScale = []
        for scores in proba:
            p_scale = ( scores - N.average( scores ) ) / math.SD( scores )
            best = p_scale[ N.argmax( N.array( p_scale ) ) ]
            maxAllScale.append( N.resize( best, N.shape( scores ) ) )

        return { 'name' : name,
                 'profLength' : profLength,
                 'accession' : accession,
                 'NrSeq' : nrSeq,
                 'AA' : aa,
                 'seqNr' : seqNr,
                 'emmScore' : emmScore,
                 'ent' : ent,
                 'absSum' : absSum,
                 'maxAllScale' : maxAllScale }