Example #1
 def combined_sd(self, v1, v2):
     """
     Calculate the overall standard deviation of two measurements that
     are connected by addition or subtraction.
     v1 - [ float ], measurements of value 1
     v2 - [ float ], measurements of value 2
     -> float, standard dev of (v1 +/- v2)
     """
     sd1 = MU.SD(v1)
     sd2 = MU.SD(v2)
     return sqrt(sd1**2 + sd2**2)
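A minimal, self-contained sketch of the same error propagation with plain
numpy (assuming MU.SD computes the sample standard deviation; the names
below are illustrative, not Biskit's API):

    import numpy as np

    def combined_sd(v1, v2):
        """Standard deviation of (v1 + v2) or (v1 - v2), assuming
        independent errors: sqrt(sd1**2 + sd2**2)."""
        sd1 = np.std(v1, ddof=1)  # sample SD of measurement 1
        sd2 = np.std(v2, ddof=1)  # sample SD of measurement 2
        return np.sqrt(sd1 ** 2 + sd2 ** 2)

    print(combined_sd([1.0, 1.2, 0.9], [2.0, 2.1, 1.95]))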
Example #2
    def filter_z(self, cutoff=None):
        """
        Filter out templates that are further away from the target sequence
        than the average template.

        @param cutoff: z-value cutoff (default: TemplateFilter.Z_CUTOFF)
        @type  cutoff: float

        @return: a mask with 0 for every template that is more than cutoff
                 standard deviations below the average similarity to the
                 target
        @rtype: numpy.array
        """
        cutoff = cutoff or self.Z_CUTOFF

        avg = N.average(self.identities)
        sd = M.SD(self.identities) or 1e-10  ## replace 0 standard deviation
        z = (self.identities - avg) / sd

        r = N.greater(z, -1. * cutoff)

        self.filter_mask = r * self.filter_mask

        if self.verbose:
            self.log.add('%i of %i templates fall through z-value filter.' %
                         (len(N.flatnonzero(r == 0)), len(self.templates)))

        return r
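The core of this filter is a plain z-score mask. A standalone sketch with
modern numpy in place of the old Numeric-style N/M helpers (the identity
values and cutoff below are made-up inputs):

    import numpy as np

    def filter_z(identities, cutoff=1.0):
        """Return a 0/1 mask; 0 marks values more than `cutoff` sample
        standard deviations below the mean."""
        identities = np.asarray(identities, dtype=float)
        sd = np.std(identities, ddof=1) or 1e-10  # guard against zero SD
        z = (identities - identities.mean()) / sd
        return (z > -cutoff).astype(int)

    print(filter_z([0.9, 0.85, 0.88, 0.4], cutoff=1.0))  # -> [1 1 1 0]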
Example #3
    def averageRms(self):
        """
        @return: average pairwise rmsd and its standard deviation
        @rtype: (float, float)

        @raise FlexError: if there are no results yet
        """
        r = self.rmsList()
        return N0.average(r), mathUtils.SD(r)
Example #4
    def outliers(self,
                 z=1.0,
                 mask=None,
                 prof='rmsCA_last',
                 last=10,
                 step=1,
                 verbose=1):
        """
        Identify outlier trajectories. First we calculate the CA-RMS of every
        |step|th frame to the last frame. Outliers are member trajectories for
        which the slope of this rms profile is z standard deviations below the
        mean of all members.
        
        @param z: z-value threshold
        @type  z: float
        @param mask: atom mask used (default: ref.maskCA())
        @type  mask: [int]
        @param prof: name of pre-calculated profile to use
                     (default: 'rmsCA_last')
        @type  prof: str
        @param last: skip |last| last frames from linear regression
        @type  last: int
        @param step: frame offset
        @type  step: int
        
        @return: member mask of outlier trajectories
        @rtype: [0|1]
        """
        if mask is None: mask = self.ref.maskCA()

        traj = self.compressAtoms(mask)
        if step != 1:
            traj = traj.thin(step)

        if prof not in traj.profiles:
            traj.fitMembers(refIndex=-1, prof=prof, verbose=verbose)

        p_all = traj.profiles[prof]
        n = traj.n_members
        l = len(traj)

        pm = [p_all[member:l:n][:-last] for member in range(n)]

        slopes = [M.linfit(range(l / n - last), p)[0] for p in pm]

        mean, sd = N0.average(slopes), M.SD(slopes)

        return [r - mean < -z * sd for r in slopes]
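The decisive step is fitting a line to each member's rms profile and
flagging slopes well below the mean. A self-contained sketch of just that
step (np.polyfit stands in for M.linfit; the profiles are fabricated):

    import numpy as np

    def slope_outliers(profiles, z=1.0):
        """profiles: list of 1-D rms arrays, one per member trajectory.
        True for members whose slope lies more than z sample standard
        deviations below the mean slope."""
        slopes = [np.polyfit(np.arange(len(p)), p, 1)[0] for p in profiles]
        mean, sd = np.mean(slopes), np.std(slopes, ddof=1)
        return [s - mean < -z * sd for s in slopes]

    profiles = [np.linspace(3.0, 2.5, 50),
                np.linspace(3.0, 2.4, 50),
                np.linspace(5.0, 0.5, 50)]  # converges much faster
    print(slope_outliers(profiles, z=1.0))  # -> [False, False, True]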
Example #5
    def parse_result(self):
        """
        Extract some information about the profile as well as the
        match state emission scores. Keys of the returned dictionary::
          'AA', 'name', 'NrSeq', 'emmScore', 'accession',
          'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum'
          
        @return: dictionary with various information about the profile
        @rtype: dict
        """
        ## check that the output file is there and seems valid
        if not os.path.exists(self.f_out):
            raise HmmerError(
                'Hmmerfetch result file %s does not exist.' % self.f_out)

        if T.fileLength(self.f_out) < 10:
            raise HmmerError(
                'Hmmerfetch result file %s seems incomplete.' % self.f_out)

        profileDic = {}

        ## read result
        hmm = open(self.f_out, 'r')
        out = hmm.read()
        hmm.close()

        ## collect some data about the hmm profile
        profileDic['name'] = self.hmmName
        profileDic['profLength'] = \
                  int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] )
        profileDic['accession'] = \
                  string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1]
        profileDic['NrSeq'] = \
                  int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] )
        profileDic['AA'] = \
              string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:]

        ## collect null emission scores
        pattern = 'NULE[ ]+' + '[-0-9]+[ ]+' * 20
        nullEmm = [
            float(j) for j in string.split(re.findall(pattern, out)[0])[1:]
        ]

        ## get emission scores
        prob = []
        for i in range(1, profileDic['profLength'] + 1):
            pattern = "[ ]+%i" % i + "[ ]+[-0-9]+" * 20
            e = [float(j) for j in string.split(re.findall(pattern, out)[0])]
            prob += [e]

        profileDic['seqNr'] = N.transpose(N.take(prob, (0, ), 1))
        profileDic['emmScore'] = N.array(prob)[:, 1:]

        ## calculate emission probabilities
        emmProb, nullProb = self.hmmEmm2Prob(nullEmm, profileDic['emmScore'])

        ent = [
            N.resize(self.entropy(e, nullProb), (1, 20))[0] for e in emmProb
        ]
        profileDic['ent'] = N.array(ent)

        ###### TEST #####

        proba = N.array(prob)[:, 1:]

        ##         # test set all to max score
        ##         p = proba
        ##         p1 = []
        ##         for i in range( len(p) ):
        ##             p1 += [ N.resize( p[i][N.argmax( N.array( p[i] ) )] , N.shape( p[i] ) ) ]
        ##         profileDic['maxAll'] = p1

        # test set all to N.sum( abs( probabilities ) )
        p = proba
        p2 = []
        for i in range(len(p)):
            p2 += [N.resize(N.sum(N.absolute(p[i])), N.shape(p[i]))]
        profileDic['absSum'] = p2

        # set all to normalized max score
        p = proba
        p4 = []
        for i in range(len(p)):
            p_scale = (p[i] - N.average(p[i])) / math.SD(p[i])
            p4 += [
                N.resize(p_scale[N.argmax(N.array(p_scale))], N.shape(p[i]))
            ]
        profileDic['maxAllScale'] = p4

        return profileDic
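The header fields are pulled out with simple regular expressions. A sketch
of that part alone, run on a fabricated fragment of a HMMER profile header
and using str methods instead of the long-deprecated string module:

    import re

    out = """HMMER2.0
    NAME  fn3
    ACC   PF00041
    LENG  84
    NSEQ  108
    """

    prof = {}
    prof['profLength'] = int(re.findall(r'LENG\s+([0-9]+)', out)[0])
    prof['accession']  = re.findall(r'ACC\s+(PF[0-9]+)', out)[0]
    prof['NrSeq']      = int(re.findall(r'NSEQ\s+([0-9]+)', out)[0])

    print(prof)  # {'profLength': 84, 'accession': 'PF00041', 'NrSeq': 108}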
Example #6
def randomSurfaces( base_folder, label, mask ):
    """
    Calculate surfaces for all peptides and return the
    per-atom averages and standard deviations.
    """
    ## container for results and standard deviations
    MS,    AS    = {}, {}
    MS_sd, AS_sd = {}, {}

    ## loop over peptide directories
    for k in MOU.aaAtoms.keys():
        dir = base_folder + 'GLY-%s-GLY_pcr/pcr_00'%(k)
        fLst = glob.glob( dir + '/*.pdb')
        
        msLst = []
        asLst = []
        
        ## loop over pdb files for each peptide
        T.flushPrint( '\nNow collecting data in %s'%dir )
        for f in fLst:

            ## load peptide and remove waters and hydrogens
            m = PDBModel( f )
            m = m.compress( m.maskProtein() * m.maskHeavy() )
            T.flushPrint( '.')

            ## add surface data
            try:
                d = PDBDope( m )
                d.addSurfaceRacer( probe=1.4 )

                ## remove trailing GLY
                m = m.compress( m.res2atomMask(mask) )
                
                ## collect surface data for each peptide
                msLst += [ m.profile('MS') ]
                asLst += [ m.profile('AS') ]
                       
            except:
                print 'Failed calculating exposure for GLY-%s-GLY'%(k)
                print '\t and file %s'%f
                
        ## get result dictionary for peptide
        T.flushPrint('\nCollecting data ...\n')
        msDic = {}
        asDic = {}
        msDic_sd = {}
        asDic_sd = {}

        ## per-atom averages and standard deviations over all files
        msAvg, asAvg = N0.average( msLst ), N0.average( asLst )
        msSD,  asSD  = MAU.SD( msLst ),    MAU.SD( asLst )

        #atoms =  [ a['name'] for a in m.atoms ]
        for j, n in enumerate( m['name'] ):
            msDic[n]    = msAvg[j]
            asDic[n]    = asAvg[j]
            msDic_sd[n] = msSD[j]
            asDic_sd[n] = asSD[j]

        MS[ k ] = msDic
        AS[ k ] = asDic
        MS_sd[ k ] = msDic_sd
        AS_sd[ k ] = asDic_sd

    return MS, AS, MS_sd, AS_sd
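The per-atom averaging at the end is just a column-wise statistic over one
array per PDB file. A compact numpy sketch of that step (the three "files"
here are fabricated per-atom surface arrays of equal length):

    import numpy as np

    # one row per file, one column per atom
    surfaces = np.array([[10.1,  0.0, 23.4],
                         [ 9.8,  0.2, 24.0],
                         [10.4,  0.1, 23.1]])

    avg = surfaces.mean(axis=0)          # per-atom average over files
    sd  = surfaces.std(axis=0, ddof=1)   # per-atom sample SD over files

    for name, a, s in zip(['N', 'CA', 'C'], avg, sd):
        print('%-3s %6.2f +/- %.2f' % (name, a, s))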
Example #7
File: rmsFit.py  Project: tybiot/biskit
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Matches two arrays onto each other, while iteratively removing outliers.
    Superimposed array y would be C{ N0.dot(y, N0.transpose(r)) + t }.

    @param n_iterations: number of calculations::
                           1 .. no iteration 
                           0 .. until convergence
    @type  n_iterations: 1|0
    @param z: number of standard deviations for outlier definition (default: 2)
    @type  z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type  eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type  eps_stdv: float

    @return: (r,t), [ [percent_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    iter_trace = []

    rmsd_old = 0
    stdv_old = 0

    n = 0
    converged = 0

    mask = N0.ones(len(y), N0.Int32)

    while not converged:

        ## find transformation for best match
        r, t = findTransformation(N0.compress(mask, x, 0),
                                  N0.compress(mask, y, 0))

        ## transform coordinates
        xt = N0.dot(y, N0.transpose(r)) + t

        ## calculate row distances
        d = N0.sqrt(N0.sum(N0.power(x - xt, 2), 1)) * mask

        ## calculate rmsd and stdv
        rmsd = N0.sqrt(N0.average(N0.compress(mask, d)**2))
        stdv = MU.SD(N0.compress(mask, d))

        ## check conditions for convergence
        d_rmsd = abs(rmsd - rmsd_old)
        d_stdv = abs(1 - stdv_old / stdv)

        if d_rmsd < eps_rmsd and d_stdv < eps_stdv:
            converged = 1
        else:
            rmsd_old = rmsd
            stdv_old = stdv

        ## store result
        perc = round(float(N0.sum(mask)) / float(len(mask)), 2)

        ## throw out non-matching rows
        mask = N0.logical_and(mask, N0.less(d, rmsd + z * stdv))
        outliers = N0.nonzero(N0.logical_not(mask))
        iter_trace.append([perc, round(rmsd, 3), outliers])

        n += 1

        if n_iterations and n >= n_iterations:
            break

    return (r, t), iter_trace
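A hedged usage sketch: superimposing a noisy, rotated copy of a point cloud
with iterative outlier removal (this assumes Biskit is importable and that
match lives in Biskit.rmsFit, as the file name above suggests):

    import numpy as np
    from Biskit.rmsFit import match

    # reference coordinates and a rotated, noisy copy
    x = np.random.random((30, 3)) * 10.0
    theta = 0.3
    rot = np.array([[np.cos(theta), -np.sin(theta), 0.],
                    [np.sin(theta),  np.cos(theta), 0.],
                    [0.,             0.,            1.]])
    y = np.dot(x, rot.T) + np.random.normal(0., 0.1, x.shape)

    (r, t), trace = match(x, y, n_iterations=0, z=2)
    y_fit = np.dot(y, r.T) + t            # superimposed copy of y

    for perc, rmsd, outliers in trace:
        print('kept %3.0f%%  rmsd %.3f' % (perc * 100, rmsd))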
Example #8
 def standardDeviation(self):
     """@return: standard deviation of self.msm"""
     sd = MU.SD(self.msm)
     return sd
Example #9
 def entropySD(self):
     centropy = N0.sum( -N0.log(self.msm) * self.msm ) / float(self.n_cluster)
     return MU.SD(centropy)