def combined_sd(self, v1, v2):
    """
    Calculate the overall standard deviation of two measurements
    that are connected by addition or subtraction.

    v1 - [ float ], measurements of value 1
    v2 - [ float ], measurements of value 2

    -> float, standard deviation of (v1 +/- v2)
    """
    sd1 = MU.SD(v1)
    sd2 = MU.SD(v2)

    return sqrt(sd1**2 + sd2**2)
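## Illustrative sketch (not part of the original code): the same
## error-propagation rule with plain numpy in place of the MU.SD wrapper,
## assuming MU.SD returns the sample standard deviation. For independent
## errors, SD(a +/- b) = sqrt( SD(a)**2 + SD(b)**2 ).
##
##     import numpy as np
##
##     v1 = np.array([10.1, 10.3,  9.8, 10.0])
##     v2 = np.array([ 5.2,  5.0,  5.1,  4.9])
##
##     sd1 = np.std(v1, ddof=1)                 ## sample SD of each series
##     sd2 = np.std(v2, ddof=1)
##     sd_combined = np.sqrt(sd1**2 + sd2**2)   ## SD of (v1 + v2) or (v1 - v2)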
def filter_z(self, cutoff=None):
    """
    Filter out templates that are further away from the target sequence
    than the average template.

    @param cutoff: z-value cutoff (default: TemplateFilter.Z_CUTOFF)
    @type cutoff: float

    @return: a mask with 0 for every template that is more than cutoff
             standard deviations below the average similarity to the target
    @rtype: numpy.array
    """
    if cutoff is None:
        cutoff = self.Z_CUTOFF

    avg = N.average(self.identities)
    sd = M.SD(self.identities) or 1e-10  ## guard against 0 standard deviation

    z = (self.identities - avg) / sd

    r = N.greater(z, -1. * cutoff)

    self.filter_mask = r * self.filter_mask

    if self.verbose:
        self.log.add('%i of %i templates fall through z-value filter.' %
                     (len(N.flatnonzero(r == 0)), len(self.templates)))

    return r
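## Minimal sketch of the z-score filter above, with plain numpy standing in
## for the N/M wrappers (assumption: M.SD is the sample standard deviation).
## Identity values more than `cutoff` SDs below the mean are masked out.
##
##     import numpy as np
##
##     identities = np.array([0.45, 0.50, 0.48, 0.12, 0.47])
##     cutoff = 1.7
##
##     z = (identities - identities.mean()) / identities.std(ddof=1)
##     mask = z > -cutoff        ## here only the 0.12 outlier gets a 0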
def averageRms(self):
    """
    @return: average pairwise rmsd and its standard deviation
    @rtype: (float, float)

    @raise FlexError: if there are no results yet
    """
    r = self.rmsList()
    return N0.average(r), mathUtils.SD(r)
def outliers(self, z=1.0, mask=None, prof='rmsCA_last',
             last=10, step=1, verbose=1):
    """
    Identify outlier trajectories. First we calculate the CA-RMS of every
    |step|th frame to the last frame. Outliers are member trajectories for
    which the slope of this rms profile is z standard deviations below the
    mean slope of all members.

    @param z: z-value threshold
    @type z: float
    @param mask: atom mask used (default: ref.maskCA())
    @type mask: [int]
    @param prof: name of pre-calculated profile to use
                 (default: 'rmsCA_last')
    @type prof: str
    @param last: skip |last| last frames from linear regression
    @type last: int
    @param step: frame offset
    @type step: int
    @param verbose: report progress (default: 1)
    @type verbose: int

    @return: member mask of outlier trajectories
    @rtype: [0|1]
    """
    if mask is None:
        mask = self.ref.maskCA()

    traj = self.compressAtoms(mask)
    if step != 1:
        traj = traj.thin(step)

    if prof not in traj.profiles:
        traj.fitMembers(refIndex=-1, prof=prof, verbose=verbose)

    p_all = traj.profiles[prof]
    n = traj.n_members
    l = len(traj)

    ## rms profile of each member, without the last |last| frames
    pm = [p_all[member:l:n][:-last] for member in range(n)]

    ## slope of each member's rms profile from a linear fit
    slopes = [M.linfit(range(l / n - last), p)[0] for p in pm]

    mean, sd = N0.average(slopes), M.SD(slopes)

    return [r - mean < -z * sd for r in slopes]
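## Sketch of the slope criterion above with plain numpy (assumption:
## M.linfit returns (slope, intercept) of a least-squares line, comparable
## to np.polyfit(x, y, 1)). A member whose rms-to-last-frame profile decays
## much faster than average, i.e. whose slope lies z SDs below the mean
## slope, is flagged as an outlier.
##
##     import numpy as np
##
##     profiles = [np.array([3.0, 2.5, 2.0, 1.5]),   ## normal decay
##                 np.array([3.1, 2.6, 2.1, 1.6]),   ## normal decay
##                 np.array([3.0, 1.8, 0.9, 0.2])]   ## decays too fast
##
##     x = np.arange(4)
##     slopes = [np.polyfit(x, p, 1)[0] for p in profiles]
##     mean, sd = np.mean(slopes), np.std(slopes, ddof=1)
##     outliers = [s - mean < -1.0 * sd for s in slopes]   ## z = 1.0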
def parse_result(self):
    """
    Extract some information about the profile as well as the match state
    emission scores. Keys of the returned dictionary::

      'AA', 'name', 'NrSeq', 'emmScore', 'accession', 'maxAllScale',
      'seqNr', 'profLength', 'ent', 'absSum'

    @return: dictionary with various information about the profile
    @rtype: dict
    """
    ## check that the output file is there and seems valid
    if not os.path.exists(self.f_out):
        raise HmmerError,\
              'Hmmerfetch result file %s does not exist.'%self.f_out

    if T.fileLength(self.f_out) < 10:
        raise HmmerError,\
              'Hmmerfetch result file %s seems incomplete.'%self.f_out

    profileDic = {}

    ## read result
    hmm = open(self.f_out, 'r')
    out = hmm.read()
    hmm.close()

    ## collect some data about the hmm profile
    profileDic['name'] = self.hmmName
    profileDic['profLength'] = \
        int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] )
    profileDic['accession'] = \
        string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1]
    profileDic['NrSeq'] = \
        int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] )
    profileDic['AA'] = \
        string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:]

    ## collect null emission scores
    pattern = 'NULE[ ]+' + '[-0-9]+[ ]+' * 20
    nullEmm = [ float(j) for j in
                string.split(re.findall(pattern, out)[0])[1:] ]

    ## get emission scores
    prob = []
    for i in range(1, profileDic['profLength'] + 1):
        pattern = "[ ]+%i" % i + "[ ]+[-0-9]+" * 20
        e = [float(j) for j in
             string.split(re.findall(pattern, out)[0])]
        prob += [e]

    profileDic['seqNr'] = N.transpose(N.take(prob, (0, ), 1))
    profileDic['emmScore'] = N.array(prob)[:, 1:]

    ## calculate emission probabilities
    emmProb, nullProb = self.hmmEmm2Prob(nullEmm, profileDic['emmScore'])

    ent = [ N.resize(self.entropy(e, nullProb), (1, 20))[0]
            for e in emmProb ]
    profileDic['ent'] = N.array(ent)

    ###### TEST #####
    proba = N.array(prob)[:, 1:]

    ##     ## test: set all to max score
    ##     p = proba
    ##     p1 = []
    ##     for i in range( len(p) ):
    ##         p1 += [ N.resize( p[i][N.argmax( N.array( p[i] ) )],
    ##                           N.shape( p[i] ) ) ]
    ##     profileDic['maxAll'] = p1

    ## test: set all to N.sum( abs( probabilities ) )
    p = proba
    p2 = []
    for i in range(len(p)):
        p2 += [N.resize(N.sum(N.absolute(p[i])), N.shape(p[i]))]
    profileDic['absSum'] = p2

    ## set all to normalized max score
    p = proba
    p4 = []
    for i in range(len(p)):
        p_scale = (p[i] - N.average(p[i])) / math.SD(p[i])
        p4 += [ N.resize(p_scale[N.argmax(N.array(p_scale))],
                         N.shape(p[i])) ]
    profileDic['maxAllScale'] = p4

    return profileDic
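## Background sketch (assumption about the HMMER2 file format, not taken
## from this file): HMMER2 stores emission scores as integer log-odds in
## thousandths of a bit, score = 1000 * log2( p / null ), so a score s
## converts back to a probability via p = null * 2**(s / 1000.0). With
## plain numpy:
##
##     import numpy as np
##
##     null = 1.0 / 20                      ## flat null model over 20 AAs
##     scores = np.array([-1115, 212, 0])   ## example match-state scores
##     p = null * np.power(2, scores / 1000.0)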
def randomSurfaces( base_folder, label, mask ):
    """
    Calculate surfaces for all peptides and return the average and SD.
    """
    ## containers for results and standard deviations
    MS,    AS    = {}, {}
    MS_sd, AS_sd = {}, {}

    ## loop over peptide directories
    for k in MOU.aaAtoms.keys():
        dir = base_folder + 'GLY-%s-GLY_pcr/pcr_00'%(k)
        fLst = glob.glob( dir + '/*.pdb')

        msLst = []
        asLst = []

        ## loop over pdb files for each peptide
        T.flushPrint( '\nNow collecting data in %s'%dir )
        for f in fLst:

            ## load peptide and remove waters and hydrogens
            m = PDBModel( f )
            m = m.compress( m.maskProtein() * m.maskHeavy() )
            T.flushPrint( '.')

            ## add surface data
            try:
                d = PDBDope( m )
                d.addSurfaceRacer( probe=1.4 )

                ## remove trailing GLY
                m = m.compress( m.res2atomMask(mask) )

                ## collect surface data for each peptide
                msLst += [ m.profile('MS') ]
                asLst += [ m.profile('AS') ]

            except:
                print 'Failed calculating exposure for GLY-%s-GLY'%(k)
                print '\t and file %s'%f

        ## get result dictionary for peptide
        T.flushPrint('\nCollecting data ...\n')
        msDic = {}
        asDic = {}
        msDic_sd = {}
        asDic_sd = {}

        j = 0
        #atoms = [ a['name'] for a in m.atoms ]
        for n in m['name']:
            msDic[n]    = N0.average(msLst)[j]
            asDic[n]    = N0.average(asLst)[j]
            msDic_sd[n] = MAU.SD( msLst )[j]
            asDic_sd[n] = MAU.SD( asLst )[j]
            j += 1

        MS[ k ] = msDic
        AS[ k ] = asDic
        MS_sd[ k ] = msDic_sd
        AS_sd[ k ] = asDic_sd

    return MS, AS, MS_sd, AS_sd
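## Sketch of the per-atom averaging step with plain numpy (assumption:
## m.profile('MS') returns one float per atom and all snapshots of a
## peptide share the same atom order). Stacking the per-snapshot profiles
## gives an (n_snapshots x n_atoms) array whose column mean/SD are the
## values stored per atom name above.
##
##     import numpy as np
##
##     msLst = [np.array([12.1, 3.4, 7.7]),   ## surface per atom, snapshot 1
##              np.array([11.8, 3.9, 8.0]),   ## snapshot 2
##              np.array([12.4, 3.1, 7.5])]   ## snapshot 3
##
##     stack = np.vstack(msLst)
##     avg = stack.mean(axis=0)           ## per-atom average surface
##     sd  = stack.std(axis=0, ddof=1)    ## per-atom standard deviation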
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Match two arrays onto each other, while iteratively removing outliers.
    The superimposed array y is given by
    C{ N0.dot(y, N0.transpose(r)) + t }.

    @param n_iterations: number of iterations::
                           1 .. no iteration
                           0 .. until convergence
    @type n_iterations: 1|0
    @param z: number of standard deviations for outlier definition
              (default: 2)
    @type z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type eps_stdv: float

    @return: (r,t), [ [percent_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    iter_trace = []

    rmsd_old = 0
    stdv_old = 0
    n = 0
    converged = 0

    mask = N0.ones(len(y), N0.Int32)

    while not converged:

        ## find transformation for best match
        r, t = findTransformation(N0.compress(mask, x, 0),
                                  N0.compress(mask, y, 0))

        ## transform coordinates
        xt = N0.dot(y, N0.transpose(r)) + t

        ## calculate row distances
        d = N0.sqrt(N0.sum(N0.power(x - xt, 2), 1)) * mask

        ## calculate rmsd and stdv
        rmsd = N0.sqrt(N0.average(N0.compress(mask, d)**2))
        stdv = MU.SD(N0.compress(mask, d))

        ## check conditions for convergence
        d_rmsd = abs(rmsd - rmsd_old)
        d_stdv = abs(1 - stdv_old / stdv)

        if d_rmsd < eps_rmsd and d_stdv < eps_stdv:
            converged = 1
        else:
            rmsd_old = rmsd
            stdv_old = stdv

        ## store result
        perc = round(float(N0.sum(mask)) / float(len(mask)), 2)

        ## throw out non-matching rows
        mask = N0.logical_and(mask, N0.less(d, rmsd + z * stdv))
        outliers = N0.nonzero(N0.logical_not(mask))
        iter_trace.append([perc, round(rmsd, 3), outliers])

        n += 1

        if n_iterations and n >= n_iterations:
            break

    return (r, t), iter_trace
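## Hedged usage sketch: superimpose a shifted copy of a coordinate set and
## let match() discard the one row that does not fit (assumption: x and y
## are (n x 3) arrays with corresponding rows, and findTransformation
## returns the rotation/translation of a least-squares fit).
##
##     import numpy as np
##
##     x = np.random.rand(20, 3) * 10    ## reference coordinates
##     y = x.copy()
##     y[5] += 8.0                       ## plant one outlier
##
##     (r, t), trace = match(x, y, n_iterations=0, z=2)
##     y_fitted = np.dot(y, np.transpose(r)) + t
##     for perc, rmsd, outliers in trace:
##         print(perc, rmsd)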
def standardDeviation(self):
    """
    @return: standard deviation of the membership matrix
    @rtype: float
    """
    return MU.SD(self.msm)
def entropySD(self):
    """
    @return: standard deviation of the membership entropy
    @rtype: float
    """
    centropy = N0.sum(-N0.log(self.msm) * self.msm) / float(self.n_cluster)
    return MU.SD(centropy)
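## Sketch of the entropy term used above with plain numpy (assumption:
## self.msm holds fuzzy-membership weights in (0, 1]). The Shannon entropy
## of a discrete distribution p is H = -sum( p * log(p) ); here the summed
## entropy is scaled by the number of clusters before taking the SD.
##
##     import numpy as np
##
##     msm = np.array([[0.7, 0.2, 0.1],    ## memberships of data point 1
##                     [0.1, 0.8, 0.1]])   ## memberships of data point 2
##
##     H = -np.sum(msm * np.log(msm), axis=1)   ## entropy per data point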