def FixPredictedDistProb(predDistMatrix, labelWeight, labelDistribution):
    """Apply DistanceUtils.FixDistProb to the prediction of every response.

    For each response key of predDistMatrix, the predicted matrix is passed to
    DistanceUtils.FixDistProb together with the entries stored under the same
    key in labelWeight and labelDistribution.

    Returns:
        a new dict mapping each response to the value returned by FixDistProb.
    """
    return {
        response: DistanceUtils.FixDistProb(
            predicted, labelWeight[response], labelDistribution[response])
        for response, predicted in predDistMatrix.items()
    }
def DetermineProbThresholds(fixedProb, ratio_4_15A, distCutoffs):
    """Pick one probability threshold for each of four sequence-separation ranges.

    The last axis of fixedProb is summed from the label returned by
    DistanceUtils.LabelsOfOneDistance(config.InteractionLimit, distCutoffs) to the
    end. For each range (|i-j| >= 24, 12..23, 6..11, 2..5) the summed values are
    sorted in ascending order and the value at rank ratio * nRows * 2 is taken;
    when the range holds too few entries, the largest value is taken instead.

    Args:
        fixedProb: array indexed as [i, j, label].
        ratio_4_15A: one ratio per range, in the order listed above.
        distCutoffs: the cutoffs handed to DistanceUtils.LabelsOfOneDistance.

    Returns:
        a list with one threshold per range.
    """
    numRows, numCols = fixedProb.shape[0], fixedProb.shape[1]
    ones = np.ones((numRows, numCols), dtype=np.int8)

    def offDiagonalBand(minSep):
        ## 1 where |i-j| >= minSep, 0 elsewhere
        return np.triu(ones, minSep) + np.tril(ones, -minSep)

    sep24 = offDiagonalBand(24)
    sep12 = offDiagonalBand(12)
    sep6 = offDiagonalBand(6)
    sep2 = offDiagonalBand(2)

    ## subtracting the wider band leaves only the entries inside each range
    rangeMasks = [sep24, sep12 - sep24, sep6 - sep12, sep2 - sep6]

    maxNums = [np.int32(ratio * numRows * 2) for ratio in ratio_4_15A]

    firstLabel = DistanceUtils.LabelsOfOneDistance(config.InteractionLimit, distCutoffs)
    summedProb = np.sum(fixedProb[:, :, firstLabel:], axis=2)

    cutoffs = []
    for mask, maxnum in zip(rangeMasks, maxNums):
        ascending = np.sort(summedProb[mask.nonzero()])
        if ascending.shape[0] < maxnum + 1:
            threshold = ascending[-1]
        else:
            threshold = ascending[maxnum]
        cutoffs.append(threshold)

    return cutoffs
def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Calculate distance potentials against the reference stored at index 0 of each entry of a reference file.

    For every discrete atom-pair response the potential is -log(pred / ref),
    shifted so that the bin containing largestDistance has zero potential; that
    bin and all later bins are then set to 0.

    Args:
        predDistMatrix: dict, response -> predicted probability indexed as [i, j, bin].
        userRef: path of a pickled dict; refData[response][0] is used as the reference.
        largestDistance: distance whose bin serves as the zero point.
        sequence, minPotential, maxPotential: accepted but not used in this function.

    Returns:
        dict, response -> potential array; unsupported responses are skipped.
    """
    ## NOTE(security): unpickling can execute arbitrary code, so userRef must be a trusted file
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = - np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the bin axis (slice instead of index) so the subtraction broadcasts over all bins
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def main(argv):
    """Estimate distance bounds from a predicted distance probability file and save them.

    Args:
        argv: argv[0] is the input file. If any second argument is present, the
            bounds of all responses are also pickled to <targetName>.bound.pkl.

    The Cb-Cb bound, when present, is always written in text format to
    <targetName>.bound.txt. Exits with status 1 when the input file is missing.
    """
    if len(argv) < 1:
        Usage()
        exit(1)

    inputFile = argv[0]
    ## the mere presence of a second argument turns on the PKL output
    printBoundPKL = len(argv) >= 2

    if not os.path.isfile(inputFile):
        print('%s %s' % ('ERROR: The input file does not exist: ', inputFile))
        exit(1)

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    targetName, sequence, predictedDistProbMatrix, predictedContactProbMatrix = content[:4]

    ## The predicted probability is used as it is: the correction by labelWeight and labelDistribution
    ## (DistanceUtils.FixDistProb) is skipped since in version 3 an unbiased deep model is used
    bounds = EstimateDistanceBounds(predictedDistProbMatrix)

    ## output Cb-Cb bound in text format
    if 'CbCb' in bounds:
        boundFileName = targetName + '.bound.txt'
        SaveBoundInListFormat(targetName, sequence, bounds['CbCb'], boundFileName)

    if not printBoundPKL:
        return

    boundFileName = targetName + '.bound.pkl'
    with open(boundFileName, 'wb') as fh:
        cPickle.dump((bounds, targetName, sequence), fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Calculate distance potentials against a size-dependent reference read from a reference file.

    refData[response][1] is a list of (size, freq, ref) tuples. The references
    whose size is within a factor of 1.3 of the protein length (or >= 350 when the
    length is at least 400) are averaged; the potential is -log(pred / averaged ref),
    shifted so that the bin containing largestDistance has zero potential. That
    bin and all later bins are then set to 0.

    Args:
        predDistMatrix: dict, response -> predicted probability indexed as [i, j, bin].
        userRef: path of the pickled reference data.
        largestDistance: distance whose bin serves as the zero point.
        sequence, minPotential, maxPotential: accepted but not used in this function.

    Returns:
        dict, response -> potential array; unsupported responses are skipped.
        Exits with status 1 when no reference matches the protein length.
    """
    ## NOTE(security): unpickling can execute arbitrary code, so userRef must be a trusted file
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]
        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if length / 1.3 <= sz <= 1.3 * length]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('%s %s %s %s' % ('#refProbMatrix: ', len(refProbs), ' for proteins with length= ', length))
        if not refProbs:
            ## averaging an empty list would silently yield NaN potentials
            print('%s %s' % ('ERROR: no reference probability matrix is available for proteins with length= ', length))
            exit(1)

        refProb = np.average(refProbs, axis=0)
        potential = - np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the bin axis (slice instead of index) so the subtraction broadcasts over all bins
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def main(argv):
    """Command-line entry point: derive a pairwise potential from a prediction file and pickle it.

    argv holds the options followed by exactly one positional argument (the input file):
      -a/--labelNames          label names, parsed by config.ParseLabelNames
      -w/--useWeight           integer scheme: the bit of value 1 enables the weight for
                               distance, the bit of value 2 the weight for orientation
      -r/--refState            REF[+rc[+param]]; param is alpha for DFIRE, rgScale for DOPE
      -f/--refFile             file of the reference state (must exist)
      -o/--noRef4Orientation   do not use reference for orientation
      -s/--savefile            name of the result file; generated from the settings when empty
      -l/--minPotential, -u/--maxPotential   limits handed to CalcDistOriPotential

    The tuple (name, sequence, pairPotential, cutoffs, validProb) is pickled to the
    result file. Exits with status 1 on an invalid option or value.
    """
    labelNames = config.allAtomPairNames + config.allOrientationNames

    minPotential = -30.0
    maxPotential = 30.0

    UseWeight4Orientation = True
    UseWeight4Distance = True

    ## the largest dist cutoff
    rc = 18

    alpha4DFIRE = 1.61
    alpha4DFIREstr = '1.61'
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'
    UseRef4Orientation = True

    ## refFile for SimuRW
    refFile = None

    savefile = ""

    if len(argv) < 1:
        Usage()
        exit(1)

    ## NOTE(review): the long form "noRef4Orientation=" demands an argument while the short
    ## form -o takes none; left as it is so that existing command lines keep working
    try:
        opts, args = getopt.getopt(argv, "a:w:r:l:u:f:s:o",
                                   ["labelNames=", "useWeight=", "refState=", "minPotential=", "maxPotential=",
                                    "refFile=", "savefile=", "noRef4Orientation="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) != 1:
        Usage()
        exit(1)
    inputFile = args[0]

    for opt, arg in opts:
        if opt in ("-a", "--labelNames"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-w", "--useWeight"):
            scheme = np.int32(arg)
            UseWeight4Orientation = (2 & scheme) > 0
            UseWeight4Distance = (1 & scheme) > 0

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('%s %s' % ('ERROR: allowed reference types: ', allRefTypes))
                exit(1)

            if len(fields) > 1:
                ## keep an integer cutoff integral so that it appears as e.g. 18 in the file name
                if fields[1].isdigit():
                    rc = np.int32(fields[1])
                else:
                    rc = np.float32(fields[1])

            if reference == 'DFIRE':
                if len(fields) > 2:
                    alpha4DFIREstr = fields[2]
                    alpha4DFIRE = np.float32(fields[2])
            elif reference == 'DOPE':
                if len(fields) > 2:
                    rgScale4DOPE = np.float32(fields[2])
            elif reference == 'SimuRW'.upper():
                print('Using SimuRW potential')
            else:
                print('%s %s' % ('ERROR: unsupported reference format: ', arg))
                exit(1)

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('%s %s' % ('the provided file for reference state is not valid: ', refFile))
                exit(1)

        elif opt in ("-o", "--noRef4Orientation"):
            UseRef4Orientation = False

        elif opt in ("-s", "--savefile"):
            savefile = arg

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)

        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        else:
            Usage()
            exit(1)

    if not os.path.isfile(inputFile):
        print('%s %s' % ('ERROR: The input file does not exist: ', inputFile))
        exit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('ERROR: The file for user-sepcified reference state is empty')
        exit(1)

    if reference == 'DFIRE':
        if alpha4DFIRE > 10:
            ## take a random value between 1.57 and 1.63
            alpha4DFIRE = random.uniform(1.57, 1.63)
        print('%s %s' % ('alpha for DFIRE potential is ', alpha4DFIRE))
        if alpha4DFIRE < 1.55 or alpha4DFIRE > 1.75:
            print('ERROR: alpha4DFIRE shall be between 1.55 and 1.75')
            exit(1)

    if reference == 'DOPE':
        print('%s %s' % ('rgScale for DOPE potential is', rgScale4DOPE))
        if rgScale4DOPE > 1.2 or rgScale4DOPE < 0.8:
            print('ERROR: rgScale4DOPE shall be between 0.8 and 1.2')
            exit(1)

    if UseWeight4Distance:
        print('Use weight for distance potential')
    if UseWeight4Orientation:
        print('Use weight for orientation potential')
    if not UseRef4Orientation:
        print('Do not use reference for orientation')

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6
    name, sequence, predictedProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"
    predData = (predictedProb, labelWeight, labelDistribution)

    targetName = os.path.basename(inputFile).split('.')[0]
    print('%s %s %s %s' % ('Generating potential for ', targetName, 'with the following labels: ', labelNames))

    ## DFIRE and DOPE differ only in the parameter handed over and in how it shows up in the file name
    if reference == 'DFIRE':
        param4Potential, paramStr = alpha4DFIRE, alpha4DFIREstr
    elif reference == 'DOPE':
        param4Potential, paramStr = rgScale4DOPE, str(rgScale4DOPE)
    else:
        print('%s %s' % ('ERROR: unimplemented potential type: ', reference))
        exit(1)

    pairPotential, cutoffs, validProb, distPotential, oriPotential = CalcDistOriPotential(
        predData, labelNames, distPotType=reference, param4Potential=param4Potential, largestDistance=rc,
        useWeight4Dist=UseWeight4Distance, useRef4Ori=UseRef4Orientation, useWeight4Ori=UseWeight4Orientation,
        minPotential=minPotential, maxPotential=maxPotential)

    filenames = [targetName, 'pairPotential', reference, str(rc), paramStr]

    if bool(oriPotential) and UseRef4Orientation:
        filenames.append('Ref4O')

    ## tag the file name with the kinds of potential that were actually weighted
    weightedDist = bool(distPotential) and UseWeight4Distance
    weightedOri = bool(oriPotential) and UseWeight4Orientation
    if weightedDist and weightedOri:
        filenames.append('Wt4OD')
    elif weightedOri:
        filenames.append('Wt4O')
    elif weightedDist:
        filenames.append('Wt4D')

    filenames.append('pkl')
    if savefile == "":
        savefile = '.'.join(filenames)

    ## save the result
    with open(savefile, 'wb') as fh:
        cPickle.dump((name, sequence, pairPotential, cutoffs, validProb), fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcPotentialBySimuRW(predDistMatrix, refFile, largestDistance=20, sequence=None, useWeight=False, minPotential=-30., maxPotential=30.):
    """Calculate CbCb distance potentials against a reference probability read from refFile.

    For each residue pair (i, j) with j - i >= 2, the reference row is selected by
    the sequence separation (the first row corresponds to offset=1). When the
    maximum allowed distance for that separation falls before the last bin, the
    bins beyond it receive maxPotential; otherwise the potential is shifted so that
    the bin of the largest distance cutoff is zero and later bins are set to 0.

    Args:
        predDistMatrix: dict, response -> predicted probability indexed as [i, j, bin].
        refFile: path of a pickled dict, response -> array indexed as [offset-1, bin].
        largestDistance: upper limit of the distance cutoff.
        useWeight: scale the potential by 1 minus the last-bin probability when the
            subType ends with 'Plus'.
        maxPotential: value assigned to the bins beyond the maximum allowed distance.
        sequence, minPotential: accepted but not used in this function.

    Returns:
        dict, response -> potential array. Exits with status 1 for a label other
        than CbCb, a subType not ending with '34C' or a too small distance cutoff.
    """
    ## NOTE(security): unpickling can execute arbitrary code, so refFile must be a trusted file
    with open(refFile, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]

        if labelName != 'CbCb':
            print('%s %s' % ('distance label name not supported yet: ', labelName))
            exit(1)

        if not subType.endswith('34C'):
            print('%s %s' % ('distance label type not supported yet: ', subType))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## determine the last distance bin; checked once per response, as the DFIRE and DOPE potentials do
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for SimuRW is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms
        ## when their sequence separation is equal to offset
        eps = 0.00001
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        shortRangeLimits = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        for offset, limit in shortRangeLimits.items():
            ## a chain shorter than 7 residues does not have all of these offsets
            if offset < length:
                maxAllowedDist[offset] = limit

        ## the bin into which maxAllowedDist falls depends on the offset only, so look it up once per offset
        lastDistBins = [None, None] + [DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                                       for offset in range(2, length)]

        potential = np.zeros_like(predProb)
        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                lastDistBin = lastDistBins[offset]
                ## offsets beyond the reference table fall back to its last row
                refRow = refProb[min(offset, refProbLen) - 1]

                if lastDistBin < (numLabels - 1):
                    ## only the bins up to lastDistBin are feasible; the others get the maximum potential
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refRow[:lastDistBin + 1]
                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    potential[i, j] = -np.log(predProb[i, j] / refRow)
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        ## NOTE(review): a subType ending with '34C' cannot end with 'Plus', so given the check above
        ## this weighting looks unreachable - confirm the intended subType test
        if useWeight and subType.endswith('Plus'):
            potential *= (1 - predProb[:, :, -1])[:, :, np.newaxis]

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., useWeight=False, minPotential=-30., maxPotential=30.):
    """Calculate distance potentials with the DOPE reference state.

    The potential of a bin is the reference potential
    log(n(r,a)/n(rc,a)) + log(binwidth(r)/binwidth(rc)) minus the observed potential
    log(p(r)/p(rc)); bins from the one containing rc onwards stay 0.

    Args:
        predDistMatrix: dict, response -> predicted probability indexed as [i, j, bin].
        largestDistance: upper limit of rc, the largest distance cutoff.
        rgScale: scale applied to the estimated radius of gyration.
        useWeight: when the subType ends with 'Plus', multiply the potential by
            1 minus the probability of the last bin.
        minPotential, maxPotential: accepted but not used in this function.

    Returns:
        (potentials, validProbs): two dicts keyed by response, both float32. For a
        'Plus' subType the last bin is removed from the potential.
        Exits with status 1 when rc is below 10.
    """
    potentials = dict()
    validProbs = dict()

    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DOPE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## work on arrays so that the arithmetic below does not depend on the element type of cutoff
        edges = np.asarray(cutoff, dtype=np.float64)
        binwidths = np.diff(edges)
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        ## n(r,a) described in the DOPE paper. The constant factor and the denominator of the original
        ## formulation are ignored since they are same for all distance bins
        def UnnormalizedNra(r):
            return np.square(r * (r - 2 * a)) * (r + 4 * a)

        nra = UnnormalizedNra(bincenters)

        ## get a more accurate estimation of nra for the wide bins by averaging over points 0.5 apart
        for i in np.flatnonzero(binwidths >= 1):
            points = np.arange(edges[i] + 0.5 / 2, edges[i + 1], 0.5)
            nra[i] = np.average(UnnormalizedNra(points))

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        ## \delta(r) is equal to binwidths
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
            ## if useWeight, adjust potential by the prob of not being in the status of the last bin
            if useWeight:
                potential *= validProb[:, :, np.newaxis]
            ## remove the potential for the last distance bin, which corresponds to disorder status
            potential = potential[:, :, :-1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=18, useWeight=False, minPotential=-30, maxPotential=30):
    """Calculate distance potentials with the DFIRE reference state.

    The potential of a bin is the reference potential
    alpha*log(r/rc) + log(binwidth(r)/binwidth(rc)) minus the observed potential
    log(p(r)/p(rc)), where r is the bin center; bins from the one containing rc
    onwards stay 0.

    Args:
        predDistMatrix: dict, response -> predicted probability indexed as [i, j, bin].
        alpha: exponent of the DFIRE reference.
        largestDistance: upper limit of rc, the largest distance cutoff.
        useWeight: when the subType ends with 'Plus', multiply the potential by
            1 minus the probability of the last bin.
        minPotential, maxPotential: accepted but not used in this function.

    Returns:
        (potentials, validProbs): two dicts keyed by response, both float32.
        validProbs is 1 minus the last-bin probability for a 'Plus' subType and all
        ones otherwise. Exits with status 1 when rc is below 10.
    """
    potentials = {}
    validProbs = {}

    for response, predProb in predDistMatrix.items():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            print('%s %s' % ('WARNING: the distance label is not discrete: ', response))
            continue

        cutoff = config.GetCutoffs(response)

        ## the bin holding rc serves as the zero point of the potential
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DFIRE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        binEdges = list(zip(cutoff[:-1], cutoff[1:]))
        binwidths = [upper - lower for lower, upper in binEdges]
        bincenters = [(upper + lower) / 2. for lower, upper in binEdges]
        centerAtRC = bincenters[rc_index]
        widthAtRC = binwidths[rc_index]

        ## reference potential: alpha*log (r/rc) + log(\delta r/ \delta rc)
        refPot = alpha * np.log(bincenters / centerAtRC) + np.log(binwidths / widthAtRC)

        ## for a wide bin, replace the bin-center estimate by an average over points 0.5 apart
        for idx, width in enumerate(binwidths):
            if width >= 1:
                points = np.arange(cutoff[idx] + 0.5 / 2, cutoff[idx + 1], 0.5)
                avg = np.average(np.power(points / centerAtRC, alpha))
                refPot[idx] = np.log(avg) + np.log(width / widthAtRC)

        ## observed potential: log( p(r) /p(rc) ) with p(r) being the predicted distance probability
        probAtRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / probAtRC)

        ## final potential: reference potential minus observed potential, for the bins before rc
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
            if useWeight:
                potential *= validProb[:, :, np.newaxis]
            ## drop the last distance bin, which corresponds to disorder status
            potential = potential[:, :, :-1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def _LoadPickledReference(refFile):
    """Return the object pickled in refFile."""
    ## NOTE(security): unpickling can execute arbitrary code, so refFile must be a trusted file
    with open(refFile, 'rb') as f:
        return cPickle.load(f)


def _LastDistanceBin(cutoff, largestDistance, refName):
    """Return the bin holding the largest distance cutoff; exit when that cutoff is below 10."""
    rc = min(cutoff[-1], largestDistance) - 0.001
    if rc < 10.0:
        print('ERROR: the largest distance cutoff for %s is too small:  %s' % (refName, rc))
        exit(1)
    return DistanceUtils.LabelsOfOneDistance(rc, cutoff)


def _BinWidthsAndCenters(cutoff):
    """Return the widths and the centers of the bins delimited by cutoff, as arrays."""
    edges = np.asarray(cutoff, dtype=np.float64)
    return np.diff(edges), (edges[1:] + edges[:-1]) / 2.


def _ShiftByLastBin(potential, lastDistBin):
    """Shift potential so that bin lastDistBin is zero, then zero that bin and all later ones."""
    ## slicing (instead of indexing) keeps the bin axis so that the subtraction broadcasts over all bins
    shifted = potential - potential[:, :, lastDistBin:lastDistBin + 1]
    shifted[:, :, lastDistBin:] = 0
    return shifted


def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=15, minPotential=-20, maxPotential=20):
    """Calculate distance potentials with the DFIRE reference state.

    The potential of a bin is alpha*log(r/rc) + log(binwidth(r)/binwidth(rc)) minus
    log(p(r)/p(rc)), where r is the bin center and p the predicted probability
    indexed as [i, j, bin]; bins from the one containing rc onwards stay 0.
    minPotential and maxPotential are accepted but not used in this function.

    Returns a dict, response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('%s %s' % ('WARNING: unsupported response for DFIRE potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)
        rc_index = _LastDistanceBin(cutoff, largestDistance, 'DFIRE')
        binwidths, bincenters = _BinWidthsAndCenters(cutoff)

        ## calculate reference potential defined as alpha*log (r/rc) + log(\delta r/ \delta rc)
        ## \delta(r) is binwidths and r is the bincenters
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## get a more accurate estimation of reference for the wide bins by averaging over points 0.5 apart
        for i in np.flatnonzero(binwidths >= 1):
            points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
            avg = np.average(np.power(points / bincenters[rc_index], alpha))
            refPot[i] = np.log(avg) + np.log(binwidths[i] / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## the final potential is the difference between reference potential and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., minPotential=-20., maxPotential=20.):
    """Calculate distance potentials with the DOPE reference state.

    The potential of a bin is log(n(r,a)/n(rc,a)) + log(binwidth(r)/binwidth(rc))
    minus log(p(r)/p(rc)); bins from the one containing rc onwards stay 0.
    minPotential and maxPotential are accepted but not used in this function.

    Returns a dict, response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('%s %s' % ('WARNING: unsupported response for DOPE potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)
        rc_index = _LastDistanceBin(cutoff, largestDistance, 'DOPE')
        binwidths, bincenters = _BinWidthsAndCenters(cutoff)

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        ## n(r,a) described in the DOPE paper. The constant factor and the denominator of the original
        ## formulation are ignored since they are same for all distance bins
        def UnnormalizedNra(r):
            return np.square(r * (r - 2 * a)) * (r + 4 * a)

        nra = UnnormalizedNra(bincenters)

        ## get a more accurate estimation of nra for the wide bins by averaging over points 0.5 apart
        for i in np.flatnonzero(binwidths >= 1):
            points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
            nra[i] = np.average(UnnormalizedNra(points))

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## the final potential is the difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)
        potentials[response] = potential

    return potentials


def CalcPotentialBySimuRW(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Calculate CbCb distance potentials against a reference probability read from userRef.

    userRef is a pickled dict, response -> array indexed as [offset-1, bin]. For a
    residue pair whose maximum allowed distance falls before the last bin, the bins
    beyond it receive maxPotential; otherwise the potential is shifted so that the
    bin of the largest distance cutoff is zero and later bins are set to 0.
    sequence and minPotential are accepted but not used in this function.

    Returns a dict, response -> potential array. Exits with status 1 for a label
    other than CbCb, a subType not ending with '34C' or a too small distance cutoff.
    """
    refData = _LoadPickledReference(userRef)

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('%s %s' % ('WARNING: unsupported response for SimuRW potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]

        if labelName != 'CbCb':
            print('%s %s' % ('distance label name not supported yet: ', labelName))
            exit(1)

        if not subType.endswith('34C'):
            print('%s %s' % ('distance label type not supported yet: ', subType))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## checked once per response, as the DFIRE and DOPE potentials do
        rc_index = _LastDistanceBin(cutoff, largestDistance, 'SimuRW')

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms
        ## when their sequence separation is equal to offset
        eps = 0.00001
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        shortRangeLimits = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        for offset, limit in shortRangeLimits.items():
            ## a chain shorter than 7 residues does not have all of these offsets
            if offset < length:
                maxAllowedDist[offset] = limit

        ## the bin into which maxAllowedDist falls depends on the offset only, so look it up once per offset
        lastDistBins = [None, None] + [DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                                       for offset in range(2, length)]

        potential = np.zeros_like(predProb)
        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                lastDistBin = lastDistBins[offset]
                ## offsets beyond the reference table fall back to its last row
                refRow = refProb[min(offset, refProbLen) - 1]

                if lastDistBin < (numLabels - 1):
                    ## only the bins up to lastDistBin are feasible; the others get the maximum potential
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refRow[:lastDistBin + 1]
                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    potential[i, j] = -np.log(predProb[i, j] / refRow)
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Calculate distance potentials against a size-dependent reference read from userRef.

    refData[response][1] is a list of (size, freq, ref) tuples; the references whose
    size is within a factor of 1.3 of the protein length (or >= 350 when the length
    is at least 400) are averaged. The potential -log(pred / averaged ref) is shifted
    so that the bin containing largestDistance is zero; later bins are set to 0.
    sequence, minPotential and maxPotential are accepted but not used in this function.

    Returns a dict, response -> potential array. Exits with status 1 when no
    reference matches the protein length.
    """
    refData = _LoadPickledReference(userRef)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]
        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if length / 1.3 <= sz <= 1.3 * length]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('%s %s %s %s' % ('#refProbMatrix: ', len(refProbs), ' for proteins with length= ', length))
        if not refProbs:
            ## averaging an empty list would silently yield NaN potentials
            print('%s %s' % ('ERROR: no reference probability matrix is available for proteins with length= ', length))
            exit(1)

        refProb = np.average(refProbs, axis=0)
        potential = - np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)
        potential = _ShiftByLastBin(potential, lastDistBin)

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Calculate distance potentials against the reference stored at refData[response][0] in userRef.

    The potential -log(pred / ref) is shifted so that the bin containing
    largestDistance is zero; that bin and all later bins are set to 0.
    sequence, minPotential and maxPotential are accepted but not used in this function.

    Returns a dict, response -> potential array.
    """
    refData = _LoadPickledReference(userRef)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = - np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)
        potential = _ShiftByLastBin(potential, lastDistBin)

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


## reference states that need a user-provided file
allRefTypesWithFiles = [ref.upper() for ref in ['SimuRW', 'EmpSI', 'EmpSD']]
allRefTypes = ['DFIRE', 'DOPE'] + allRefTypesWithFiles


def main(argv):
    """Command-line entry point: derive a distance potential from a prediction file and save it.

    Options:
      -i/--input          input file (required)
      -a/--atomPairType   label names, parsed by config.ParseLabelNames
      -r/--refState       REF[+rc[+param]]; param is alpha for DFIRE, rgScale for DOPE
      -f/--refFile        file of the reference state (must exist)
      -l/--minPotential, -u/--maxPotential   limits handed to the potential functions
      -s/--minSeqSep      minimum sequence separation of the pairs written in text format
      -t/--textFormat     write a text file instead of a PKL file
      -n/--nonZero        accepted, but without effect in this function

    Exits with status 1 on an invalid option or value.
    """
    inputFile = None
    labelNames = ['CbCb']
    potentialFileSuffix = 'pkl'
    minPotential = -30.0
    maxPotential = 30.0
    minSeqSep = 3
    minSeqSepStr = '3'

    ## the largest dist cutoff
    rc = 18

    alpha4DFIRE = 1.61
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    ## refFile
    refFile = None

    ## NOTE(review): the long forms "textFormat=" and "nonZero=" demand an argument while the short
    ## forms -t and -n take none; left as it is so that existing command lines keep working
    try:
        opts, args = getopt.getopt(argv, "i:a:r:l:u:s:f:tn",
                                   ["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=",
                                    "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
        print('%s %s' % (opts, args))
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(opts) < 1:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            inputFile = arg

        elif opt in ("-a", "--atomPairType"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('%s %s' % ('allowed reference types: ', allRefTypes))
                exit(1)

            if len(fields) > 1:
                if reference == 'DFIRE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        alpha4DFIRE = np.float32(fields[2])
                elif reference == 'DOPE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    rc = np.float32(fields[1])
                else:
                    print('%s %s' % ('WARNING: unsupported reference format: ', arg))

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('%s %s' % ('the provided file for reference state is not valid: ', refFile))
                exit(1)

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)

        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        elif opt in ("-s", "--minSeqSep"):
            minSeqSep = np.int32(arg)
            minSeqSepStr = arg
            if minSeqSep < 1:
                print('ERROR: minSeqSep shall be at least 1')
                exit(1)

        elif opt in ("-t", "--textFormat"):
            potentialFileSuffix = '.txt'

        elif opt in ("-n", "--nonZero"):
            ## accepted for compatibility; the flag it used to set was never read
            pass

        else:
            Usage()
            exit(1)

    if inputFile is None:
        print('Please provide an input file')
        exit(1)
    if not os.path.isfile(inputFile):
        print('%s %s' % ('The input file does not exist: ', inputFile))
        exit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('The file for user-sepcified reference state is empty')
        exit(1)

    targetName = os.path.basename(inputFile).split('.')[0]

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6
    name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"

    filenames = [targetName, 'distPotential']
    if reference == 'DFIRE':
        potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc,
                                         minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
    elif reference == 'DOPE':
        potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE,
                                        minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
    elif reference == 'SimuRW'.upper():
        potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc,
                                          minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), potentialFileSuffix])
    else:
        print('%s %s' % ('ERROR: unimplemented reference state: ', reference))
        exit(1)

    potentialFileName = '.'.join(filenames)
    wantedLabels = set(labelNames)

    ## save to PKL file
    if potentialFileName.endswith('.pkl'):
        potential_new = dict()
        distCutoffs = dict()
        for response, pot in potential.items():
            labelName = config.Response2LabelName(response)
            if labelName not in wantedLabels:
                continue
            potential_new[response] = pot
            distCutoffs[response] = config.GetCutoffs(response)

        with open(potentialFileName, 'wb') as fh:
            cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    ## save to text file
    potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
    with open(potentialFileName, 'w') as fh:
        fh.write('#TARGET\t' + targetName + '\n')
        fh.write('#SEQ\t' + sequence + '\n')
        fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

        for response, pot in potential.items():
            labelName, labelType, subType = config.ParseResponse(response)
            if labelName not in wantedLabels:
                continue

            size = pot.shape
            for i in range(size[0]):
                rawPotStrs = []
                for j in range(i + minSeqSep, size[1]):
                    atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
                    y = pot[i, j]
                    rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i + 1), atom2.upper(), str(j + 1), subType]
                                         + ["{:.4f}".format(e) for e in y])
                    rawPotStrs.append(rawPotStr)

                if len(rawPotStrs) > 0:
                    fh.write('\n'.join(rawPotStrs) + '\n')


if __name__ == "__main__":
    main(sys.argv[1:])
def main(argv): inputFile = None targetName = None labelNames = ['CbCb'] potentialFileSuffix = 'pkl' minPotential = -30.0 maxPotential = 30.0 minSeqSep = 3 minSeqSepStr='3' ## the largest dist cutoff rc = 18 alpha4DFIRE = 1.61 rgScale4DOPE = 1. ## reference reference = 'DFIRE' ## refFile refFile = None try: opts, args = getopt.getopt(argv,"i:a:r:l:u:s:f:tn",["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="]) print opts, args except getopt.GetoptError: Usage() exit(1) if len(opts) < 1: Usage() exit(1) for opt, arg in opts: if opt in ("-i", "--input"): inputFile = arg elif opt in ("-a", "--atomPairType"): labelNames = config.ParseLabelNames(arg) elif opt in ("-r", "--refState"): fields = arg.split('+') reference = fields[0].upper() if reference not in allRefTypes: print 'allowed reference types: ', allRefTypes exit(1) if len(fields) > 1: if reference == 'DFIRE': rc = np.float32(fields[1]) if len(fields) > 2: alpha4DFIRE = np.float32(fields[2]) elif reference == 'DOPE': rc = np.float32(fields[1]) if len(fields) > 2: rgScale4DOPE = np.float32(fields[2]) elif reference == 'SimuRW'.upper(): rc = np.float32(fields[1]) else: print 'WARNING: unsupported reference format: ', arg elif opt in ("-f", "--refFile"): refFile = arg if not os.path.isfile(refFile): print 'the provided file for reference state is not valid: ', refFile exit(1) elif opt in ("-l", "--minPotential"): minPotential = np.float32(arg) elif opt in ("-u", "--maxPotential"): maxPotential = np.float32(arg) elif opt in ("-s", "--minSeqSep"): minSeqSep = np.int32(arg) minSeqSepStr = arg if minSeqSep < 1: print 'ERROR: minSeqSep shall be at least 1' exit(1) elif opt in ("-t", "--textFormat"): potentialFileSuffix = '.txt' elif opt in ("-n", "--nonZero"): resetFlag = False else: Usage() exit(1) if inputFile is None: print 'Please provide an input file' exit(1) if not os.path.isfile(inputFile): print 'The input file does not exist: ', 
inputFile exit(1) if reference in allRefTypesWithFiles and refFile is None: print 'The file for user-sepcified reference state is empty' exit(1) targetName = os.path.basename(inputFile).split('.')[0] content = DistanceUtils.LoadRawDistProbFile(inputFile) assert len(content) >=6 name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6] assert labelWeight is not None, "labelWeight shall not be empty" ## if needed, add code to here the predicted dist probability filenames = [ targetName, 'distPotential'] if reference == 'DFIRE': potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential) filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix]) elif reference == 'DOPE': potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential) filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix]) elif reference == 'SimuRW'.upper(): potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential) filenames.extend([reference, str(rc), potentialFileSuffix]) else: print 'ERROR: unimplemented reference state: ', reference exit(1) potentialFileName = '.'.join(filenames) ## save to PKL file if potentialFileName.endswith('.pkl'): fh = open(potentialFileName, 'wb') potential_new = dict() distCutoffs = dict() for response, pot in potential.iteritems(): labelName = config.Response2LabelName(response) if labelName not in set(labelNames): continue potential_new[response] = pot distCutoffs[response] = config.GetCutoffs(response) cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL) fh.close() return ## save to text file potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix fh = 
open(potentialFileName, 'w') fh.write('#TARGET\t' + targetName + '\n') fh.write('#SEQ\t' + sequence + '\n') fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n') for response, pot in potential.iteritems(): labelName, labelType, subType = config.ParseResponse(response) if labelName not in set(labelNames): continue size = pot.shape for i in xrange(size[0]): rawPotStrs = [] for j in xrange(i+ minSeqSep, size[1]): atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName) y = pot[i, j] rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i+1), atom2.upper(), str(j+1), subType] + [ "{:.4f}".format(e) for e in y ] ) rawPotStrs.append(rawPotStr) if len(rawPotStrs) >0: fh.write('\n'.join(rawPotStrs) + '\n') fh.close()
def Score(pairwiseMatrix, potential, labelNames, outputDetails=False, minSeqSep=6, maxCstDist=None): totalScore = 0.0 scores = dict() for response, pot in potential.iteritems(): labelName, labelType, subType = config.ParseResponse(response) if labelName not in set(labelNames): continue if not pairwiseMatrix.has_key(labelName): print 'WARNING: the atomDistMatrix does not have distance information for atom pair:', labelName continue if not labelType.startswith('Discrete'): print 'ERROR: unsupported labelType: ', labelType exit(1) pm = pairwiseMatrix[labelName] assert pm.shape == ( pot.shape[0], pot.shape[1] ), "the size of the pairwise potential not compatible with the matrix" if labelName in config.allAtomPairNames: ## discretize the distance matrix, an invalid entry -1 will have the largest label number labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix( pm, config.distCutoffs[subType], invalidDistanceSeparated=False) elif labelName in config.allOrientationNames: labelMatrix, _, _ = OrientationUtils.DiscretizeOrientationMatrix( pm, config.distCutoffs[subType], distMatrix=pairwiseMatrix['CbCb'], invalidEntrySeparated=False) size = pot.shape m = np.mgrid[0:size[0], 0:size[1]] scoreMatrix = pot[m[0], m[1], labelMatrix] if labelName in config.allAtomPairNames and maxCstDist is not None: label4maxDist = DistanceUtils.LabelsOfOneDistance( maxCstDist, config.distCutoffs[subType]) np.putmask(scoreMatrix, labelMatrix > label4maxDist, 0) scores[response] = np.sum(np.triu(scoreMatrix, minSeqSep)) totalScore += scores[response] if outputDetails: ## note that if the potential matrix is not symmetric, we have to do something more here indices = np.triu_indices(size[0], k=minSeqSep, m=size[1]) scores = scoreMatrix[indices] labels = labelMatrix[indices] for i, j, s, label in zip(indices[0], indices[1], scores, labels): outinfo = [ str(i + 1), str(j + 1), apt, str(label), "{:.4f}".format(s) ] + ["{:.3f}".format(v) for v in pot[i, j]] outstr = ' '.join(outinfo) print outstr 
return totalScore, scores
def GenerateSplinePotential4Distance(potData, labelNames=['CbCb'], topRatio=None, minSeqSep=1, potThreshold=np.finfo(np.float32).max, barrier=1.0): target, sequence, potential, distCutoffs = potData[:4] if len(potData) > 4: validProb = potData[4] else: print 'WARNING: it is better to provide validProb for distance potential' validProb = None allConstraints = [] for response, pot in potential.iteritems(): #print 'response=', response description = response labelName, labelType, subType = ParseResponse(response) if labelName not in config.allAtomPairNames: continue if labelName not in labelNames: continue #x = distCutoffs[response][1:] x = distCutoffs[response] binWidths = [b - a for a, b in zip(x[1:-1], x[2:])] binWidth = np.average(binWidths) #assert all([ (binWidth-b)<0.0001 for b in binWidths]) ## here we add repulsion to reduce steric clashes ## for CaCa and CbCb, the minimum distance is 3.6A. The penalty is 3 in [2, 3.6] and 10 in [0, 2] ## for NO, the minimum distance is 2, and the penalty is 1 for [2, 2.4] and 5 for [0, 2] firstMinDist = 2 secondMinDist = 3.6 yPenalty = [10, 4, 0.5] if labelName == 'NO': secondMinDist = 2.4 yPenalty = [8, 3, 0.2] xPrefix = [0, firstMinDist, secondMinDist] ## find the index of the 2nd min distance in x, i.e., x[secondLabel] <=secondMinDist < x[secondLabel+1] secondLabel = DistanceUtils.LabelsOfOneDistance( secondMinDist + 0.0001, x) #print 'secondLabel=', secondLabel assert secondLabel >= 1 assert secondLabel < len(distCutoffs[response]) xk = [(a + b) / 2 for a, b in zip(x[secondLabel:-1], x[secondLabel + 1:])] xk.append(x[-1] + binWidth / 2.) 
xk = xPrefix + xk #print 'xk=', xk #LabelOfAdjacentCAs = DistanceUtils.LabelsOfOneDistance(4.50001, x) """ ## the first interval of distCutoffs, i.e., [0, x[0]), usually has width > binWidth ## Here we split the first interval into several bins with the same binwidth ## Assume that x[0] = binWidth times an integer bstep = barrier * binWidth xPrefix = np.linspace(-binWidth, x[0]-binWidth, np.rint(x[0]/binWidth).astype(np.int32)+1 ) xPrefix = xPrefix.tolist() xk = xPrefix + x.tolist() xk2 = [ (a+b)/2 for a, b in zip(xk[:-1], xk[1:]) ] xk2.append(x[-1]+binWidth/2.) xk = xk2 yPrefix = np.arange( len(xPrefix)-1, -1, -1) * bstep yPrefix = yPrefix.tolist() """ size = pot.shape residuePairs = [] for i in xrange(size[0]): jstart = i + minSeqSep if not config.IsSymmetricLabel(labelName): jstart = 0 for j in xrange(jstart, size[1]): offset = abs(i - j) if offset < minSeqSep: continue residuePairs.append((i, j)) ## always use repulsion potential for two sequentially adjacent Ca atoms if minSeqSep > 1 and labelName == 'CaCa': residuePairs.extend([(i, i + 1) for i in xrange(size[0] - 1)]) for i, j in residuePairs: y = pot[i, j] """ ## y[0] is the potential for the first interval [0, x[0]). We increase potential for distance < x[0] for every binWidth Angstrom yPrefix2 = [ y[0] + ye for ye in yPrefix ] yk = yPrefix2 + y[1:].tolist() """ yPrefix = [max(y[secondLabel], 0) + ye for ye in yPenalty] y2 = y.tolist() yk = yPrefix + y2[secondLabel:] assert len(xk) == len( yk ), 'xk and yk length does not match for ' + labelName + ' and residues ' + str( i) + ' ' + str(j) ## when one atom pair is not symmetric (e.g., NO), it appears twice in the constraint set, so we divide its potential by 2 if not config.IsSymmetricLabel(labelName): yk = [ye / 2. 
for ye in yk] atom1, atom2 = SelectAtomPair(sequence, i, j, labelName) constraint = dict() constraint['x'] = xk constraint['y'] = yk constraint['response'] = response constraint['binWidth'] = binWidth constraint['type'] = 'AtomPair' constraint['atoms'] = [atom1, atom2] constraint['atomNums'] = [i + 1, j + 1] allConstraints.append(constraint) return allConstraints
def EstimateDistanceBound(fixedProb0, response): labelName, distLabelType, subType = config.ParseResponse(response) if 'Plus' in distLabelType: ## merge the last interval (which represents invalid distance) to the last second one size = fixedProb0.shape fixedProb = np.zeros((size[0], size[1], size[2] - 1), dtype=fixedProb0.dtype) fixedProb[:, :, :-1] = fixedProb0[:, :, :-2] fixedProb[:, :, -1] = np.sum(fixedProb0[:, :, -2:], axis=2) else: fixedProb = fixedProb0 if config.GetResponseProbDims(response) < 12: print 'ERROR: it is not meaningful to estimate inter-resdiue distance when the number of labels is < 12' exit(1) subType = distLabelType[len('Discrete'):] distCutoffs_original = config.distCutoffs[subType] distCutoffs = distCutoffs_original[1:] ## probability thresholds for 15 Angstrom, from long-range, to medium-range, short-range and near-range ## if the predicted probability for 15A is larger than the threshold, we think the distance of one residue pair shall be larger than 15A thresholds_4_15A = [0.75, 0.65, 0.5, 1.] """ if distCutoffs[-1] > 17.0: thresholds_4_15A = [0.85, 0.75, 0.6, 1. ] """ ## ratio threshods for 15A. we take at most ratio*L long-, medium- and short-range distance restraints for a protein of L residues ratios_4_15A = [9.5, 2.2, 2.2, 4.] ## determine the real probability thresholds for 15A by ratio cutoffs = DetermineProbThresholds(fixedProb, ratios_4_15A, distCutoffs_original) #print 'prob cutoffs determined by ratio are: ', cutoffs prob_thresholds_4_15A = [ min(x, y) for x, y in zip(thresholds_4_15A, cutoffs) ] labelOf15 = DistanceUtils.LabelsOfOneDistance(config.InteractionLimit, distCutoffs_original) #print 'the final prob cutoffs are: ', thresholds_4_15A halfBinWidth = np.average([ distCutoffs[i] - distCutoffs[i - 1] for i in range(1, len(distCutoffs)) ]) / 2. 
mid_distance = np.array(distCutoffs - halfBinWidth).astype(np.float32) upper_boundary_distance = np.array(distCutoffs).astype(np.float32) numDistBins = mid_distance.shape[0] mid_distance_sq = np.square(mid_distance) size = fixedProb.shape #print size #print numDistBins assert size[2] == numDistBins + 1 ## estimates[:, :, 0] is the expected distance if it is less than 15A ## estimates[:, :, 1] is the variance ## estimates[:, :, 2] is the lower bound ## estimates[:, :, 3] is the upper bound ## not sure why initilize to -1 estimates = np.full((size[0], size[1], 10), -1, dtype=np.float32) for i in range(size[0]): for j in range(size[1]): offset = abs(i - j) if offset < 2: continue elif offset < 6: rangeIndex = 3 elif offset < 12: rangeIndex = 2 elif offset < 24: rangeIndex = 1 else: rangeIndex = 0 ## if the prob of this residue pair suggest that the estimated distance is likely to be >15A, then do nothing if np.sum( fixedProb[i, j, labelOf15:]) > prob_thresholds_4_15A[rangeIndex]: continue ## renormalize the distance prob distribution by setting the prob of the largest distance bin to 0 newProb = fixedProb[i, j, :numDistBins] / np.sum( fixedProb[i, j, :numDistBins]) dist_mean = np.average(mid_distance, weights=newProb) dist_sq_mean = np.average(mid_distance_sq, weights=newProb) dist_std = np.sqrt(dist_sq_mean - np.square(dist_mean) + np.square(halfBinWidth) * 1. 
/ 3) ## find the bin into which dist_mean falls into binIndex = 0 while dist_mean > upper_boundary_distance[binIndex]: binIndex = binIndex + 1 ## now dist_mean <= upper_boundary_distance[binIndex] and dist_mean > upper_boundary_distance[binIndex - 1] upperProb = np.zeros(numDistBins - binIndex, dtype=np.float32) upperProb[0] = (upper_boundary_distance[binIndex] - dist_mean) / (2 * halfBinWidth) * newProb[binIndex] upperProb[1:] = newProb[binIndex + 1:] lowerProb = np.zeros(binIndex + 1, dtype=np.float32) lowerProb[0:binIndex] = newProb[0:binIndex] lowerProb[binIndex] = newProb[binIndex] - upperProb[0] ## calculate the upper distance std dist_var_upper = np.dot( np.square(mid_distance[binIndex + 1:] - dist_mean) + np.square(halfBinWidth) * 1. / 3, upperProb[1:]) + upperProb[0] * np.square( upper_boundary_distance[binIndex] - dist_mean) * 1. / 3 ## the unnormalized distance deviation dist_std_upper = np.sqrt(dist_var_upper) ## the normalized distance deviation dist_std_upper2 = np.sqrt(dist_var_upper / np.sum(upperProb)) dist_std_upper3 = (np.dot(mid_distance[binIndex + 1:] - dist_mean, upperProb[1:]) + upperProb[0] * (upper_boundary_distance[binIndex] - dist_mean) / 2) / np.sum(upperProb) ## the expected distance deviation dist_std_upper4 = np.sum(upperProb) * dist_std_upper2 ## calculate the lower distance std if binIndex == 0: left_boundary = distCutoffs[0] - 2 * halfBinWidth else: left_boundary = upper_boundary_distance[binIndex - 1] dist_var_lower = np.dot( np.square(mid_distance[:binIndex] - dist_mean) + np.square(halfBinWidth) * 1. / 3, lowerProb[:binIndex]) + lowerProb[binIndex] * np.square( dist_mean - left_boundary) * 1. 
/ 3 ## the unnormalized distance deviation dist_std_lower = np.sqrt(dist_var_lower) ## the normalized distance deviation dist_std_lower2 = np.sqrt(dist_var_lower / np.sum(lowerProb)) dist_std_lower3 = ( np.dot(dist_mean - mid_distance[:binIndex], lowerProb[:binIndex]) + lowerProb[binIndex] * (dist_mean - left_boundary) / 2) / np.sum(lowerProb) ## the expected distance deviation dist_std_lower4 = np.sum(lowerProb) * dist_std_lower2 """ ##only keep those residue pairs with estimated distance < 15 Angstrom if dist_mean >= 15.0: estimates[i, j] = np.array( [ -1. ] * 10 ).astype(np.float32) else: estimates[i, j] = np.array([dist_mean, dist_std, dist_std_lower, dist_std_upper, dist_std_lower2, dist_std_upper2, dist_std_lower3, dist_std_upper3, dist_std_lower4, dist_std_upper4]).astype(np.float32) """ estimates[i, j] = np.array([ dist_mean, dist_std, dist_std_lower, dist_std_upper, dist_std_lower2, dist_std_upper2, dist_std_lower3, dist_std_upper3, dist_std_lower4, dist_std_upper4 ]).astype(np.float32) return estimates