def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials from an empirical reference stored in a file.

    For every discrete atom-pair response in predDistMatrix the potential is
    -log(pred / ref), where ref is element [0] of the entry stored for that
    response in the pickled reference file. The potential is then shifted so
    that the bin holding largestDistance is zero, and that bin and all bins
    after it are reset to zero.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin].
    userRef: path of the pickled reference file.
    largestDistance: distance at which the potential is anchored to zero.
    sequence, minPotential, maxPotential: accepted for interface
        compatibility; not used by this function.

    Returns a dict mapping response -> potential array.
    """
    ## NOTE(review): unpickling can execute arbitrary code; only load trusted reference files
    with open(userRef, 'rb') as fh:
        refData = cPickle.load(fh)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        rc = largestDistance
        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## keep the bin axis (slice, not integer index) so that the subtraction
        ## broadcasts over the bins of each (i, j) pair
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcDistOriPotential(predData, labelNames=['CaCa', 'CbCb', 'NO'] + ['Ca1Cb1Cb2Ca2','N1Ca1Cb1Cb2','Ca1Cb1Cb2'], distPotType='DFIRE', param4Potential=1.61, largestDistance=18, useWeight4Dist=True, useRef4Ori=True, useWeight4Ori=True, minPotential=-30, maxPotential=30):
    """Compute distance and orientation potentials for the requested labels.

    predData: a 3-tuple (predProbMatrix, labelWeight, labelDistribution); each
        element is indexed by response below.
    labelNames: label names to keep; responses with other label names are
        ignored. The default list is never modified by this function.
    distPotType: 'DFIRE' or 'DOPE' (case-insensitive).
    param4Potential: passed as alpha to CalcPotentialByDFIRE or as rgScale to
        CalcPotentialByDOPE.
    largestDistance, useWeight4Dist: forwarded to the distance potential.
    useRef4Ori, useWeight4Ori: forwarded to CalcOrientationPotential.
    minPotential, maxPotential: forwarded to all three potential functions.

    Returns (pairPotential, cutoffs, validProb, distPotential, oriPotential),
    where pairPotential and validProb merge the distance and orientation
    results and cutoffs maps each response in pairPotential to
    config.GetCutoffs(response).
    """
    potType = distPotType.upper()
    assert potType in ['DFIRE', 'DOPE']

    predProbMatrix, labelWeight, labelDistribution = predData

    validDistribution = dict()
    validLabelWeight = dict()
    validLabelDistribution = dict()
    existingLabelNames = []
    for response, pred in predProbMatrix.items():
        labelName, _, _ = config.ParseResponse(response)
        if labelName not in labelNames:
            continue
        existingLabelNames.append(labelName)
        validDistribution[response] = pred
        validLabelWeight[response] = labelWeight[response]
        validLabelDistribution[response] = labelDistribution[response]

    missingLabelNames = list(set(labelNames) - set(existingLabelNames))
    if missingLabelNames:
        print('WARNING: the predicted probability file does not have information for the following label names:  %s' % (missingLabelNames,))

    pairPotential = dict()
    validProb = dict()

    ## compare the upper-cased type so that e.g. 'dope', which the assert
    ## accepts, is not silently treated as DFIRE
    if potType == 'DOPE':
        distPotential, distValidProb = CalcPotentialByDOPE(validDistribution, largestDistance=largestDistance, rgScale=param4Potential, useWeight=useWeight4Dist, minPotential=minPotential, maxPotential=maxPotential)
    else:
        distPotential, distValidProb = CalcPotentialByDFIRE(validDistribution, alpha=param4Potential, largestDistance=largestDistance, useWeight=useWeight4Dist, minPotential=minPotential, maxPotential=maxPotential)

    pairPotential.update(distPotential)
    validProb.update(distValidProb)

    oriPotential, oriValidProb = CalcOrientationPotential(validDistribution, useRef=useRef4Ori, useWeight=useWeight4Ori, labelWeight=validLabelWeight, labelDistribution=validLabelDistribution, minPotential=minPotential, maxPotential=maxPotential)
    pairPotential.update(oriPotential)
    validProb.update(oriValidProb)

    cutoffs = dict()
    for response in pairPotential:
        cutoffs[response] = config.GetCutoffs(response)

    return pairPotential, cutoffs, validProb, distPotential, oriPotential
def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials from a size-dependent empirical reference.

    Element [1] of the reference entry for a response is a list of
    (sz, freq, ref) tuples. The reference probability is the average of the
    ref entries whose sz is within a factor of 1.3 of the protein length
    (or sz >= 350 when the length is at least 400). The potential is
    -log(pred / ref), shifted so that the bin holding largestDistance is zero;
    that bin and all bins after it are reset to zero.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin]; shape[0] is used as the protein length.
    userRef: path of the pickled reference file.
    largestDistance: distance at which the potential is anchored to zero.
    sequence, minPotential, maxPotential: accepted for interface
        compatibility; not used by this function.

    Returns a dict mapping response -> potential array.
    """
    ## NOTE(review): unpickling can execute arbitrary code; only load trusted reference files
    with open(userRef, 'rb') as fh:
        refData = cPickle.load(fh)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]
        length = predProb.shape[0]

        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if sz <= 1.3 * length and sz >= length / 1.3]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('#refProbMatrix:  %s  for proteins with length=  %s' % (len(refProbs), length))

        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        rc = largestDistance
        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## keep the bin axis (slice, not integer index) so that the subtraction
        ## broadcasts over the bins of each (i, j) pair
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcPotentialBySimuRW(predDistMatrix, refFile, largestDistance=20, sequence=None, useWeight=False, minPotential=-30., maxPotential=30.):
    """Derive CbCb distance potentials from a reference read from refFile.

    For each residue pair (i, j) with j - i >= 2 the potential is
    -log(pred / ref), where ref is the reference row for the sequence
    separation j - i (row 0 corresponds to offset 1). When the maximum
    feasible distance for that separation falls before the last bin, the bins
    beyond it are set to maxPotential; otherwise the potential is shifted to
    be zero at the bin holding min(cutoff[-1], largestDistance) and zeroed
    after it. The result is mirrored to (j, i).

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin].
    refFile: path of the pickled reference file; each entry is used as a 2-D
        array [offset - 1, bin].
    largestDistance: upper bound used when anchoring the potential.
    useWeight: if True and the subType ends with 'Plus', scale the potential
        by 1 - predProb[:, :, -1].
    maxPotential: value assigned to infeasible bins.
    sequence, minPotential: accepted for interface compatibility; not used.

    Exits the process (SystemExit) for label names other than 'CbCb', for
    subTypes not ending in '34C', or if the anchoring distance is below 10.

    Returns a dict mapping response -> potential array.
    """
    ## NOTE(review): unpickling can execute arbitrary code; only load trusted reference files
    with open(refFile, 'rb') as fh:
        refData = cPickle.load(fh)

    ## tighter upper bounds for short sequence separations
    eps = 0.00001
    shortRangeLimits = {2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            #print 'WARNING: unsupported response for SimuRW potential: ', response
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]

        if labelName != 'CbCb':
            print('distance label name not supported yet:  %s' % (labelName,))
            raise SystemExit(1)

        if not subType.endswith('34C'):
            print('distance label type not supported yet:  %s' % (subType,))
            raise SystemExit(1)

        cutoff = config.GetCutoffs(response)
        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms
        ## when their sequence separation is equal to offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        if length > 0:
            maxAllowedDist[0] = 0
        ## only override the offsets that exist, so short proteins do not raise IndexError
        for offset, limit in shortRangeLimits.items():
            if offset < length:
                maxAllowedDist[offset] = limit

        ## the anchoring bin does not depend on the residue pair; look it up at most once
        rc = min(cutoff[-1], largestDistance) - 0.001
        rc_index = None

        potential = np.zeros_like(predProb)

        ## the distance bin of maxAllowedDist depends only on the offset, so iterate by offset
        for offset in range(2, length):
            ## find the distance bin into which the maxAllowedDist falls
            lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
            truncated = lastDistBin < (numLabels - 1)

            if truncated:
                ref = refProb[offset - 1][:lastDistBin + 1]
            else:
                if rc < 10.0:
                    print('ERROR: the largest distance cutoff for SimuRW is too small:  %s' % (rc,))
                    raise SystemExit(1)
                if rc_index is None:
                    rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)
                ref = refProb[min(offset, refProbLen) - 1]

            for i in range(length - offset):
                j = i + offset
                if truncated:
                    potential[i, j, :lastDistBin + 1] = -np.log(predProb[i, j, :lastDistBin + 1] / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    potential[i, j] = -np.log(predProb[i, j] / ref)
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        ## NOTE(review): a subType that passed the '34C' check above cannot end with 'Plus',
        ## so this weighting is currently never applied
        if useWeight and subType.endswith('Plus'):
            potential *= (1 - predProb[:, :, -1])[:, :, np.newaxis]

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., useWeight=False, minPotential=-30., maxPotential=30.):
    """Derive distance potentials using a DOPE-style reference state.

    The reference potential of a bin is log(nra(r)/nra(rc)) + log(dr/drc),
    where nra is n(r, a) without its constant factor and denominator, a is
    the radius of the reference sphere and rc is min(cutoff[-1],
    largestDistance). The final potential is the reference potential minus
    log(p(r)/p(rc)) for bins before the rc bin and zero elsewhere.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin]; shape[0] is used as the protein length.
    largestDistance: upper bound for rc.
    rgScale: multiplier applied to the estimated radius of gyration.
    useWeight: if True and the subType ends with 'Plus', scale the potential
        by the probability 1 - predProb[:, :, -1].
    minPotential, maxPotential: accepted for interface compatibility; not
        used by this function.

    Exits the process (SystemExit) if rc is below 10.

    Returns (potentials, validProbs), two dicts keyed by response holding
    float32 arrays. For 'Plus' subTypes the last bin is removed from the
    potential.
    """
    potentials = dict()
    validProbs = dict()

    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            #print 'WARNING: unsupported response for DOPE potential: ', response
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DOPE is too small:  %s' % (rc,))
            raise SystemExit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## use arrays so that the bin arithmetic does not depend on the element type of cutoff
        edges = np.asarray(cutoff)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length**(3./5) + 7.257
        a = np.sqrt(5./3) * rg * rgScale

        ## The original formulation of n(r,a) defined in the DOPE paper, with rc being the
        ## upper bound of distance between two atoms (rc = bincenters[-1]):
        ## if rc <= 2*a:
        ##     nra = 6. * np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a) / np.power(rc,3) /(np.power(rc, 3) - 18 * np.square(a)*rc + 32 * np.power(a, 3))
        ## else:
        ##     nra = 3* np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a) / 16. / np.power(a, 6)

        ## calculate n(r,a) described in the DOPE paper. Ignore the constant factor and
        ## the denominator since they are same for all distance bins
        nra = np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a)

        def CalcApproxRefPot(idx=0):
            ## average n(r,a) over points sampled every 0.5 inside bin idx
            points = np.arange(cutoff[idx] + 0.5/2, cutoff[idx+1], 0.5)
            values = np.square(points * (points - 2*a)) * (points + 4*a)
            return np.average(values)

        ## get a more accurate estimation of nra for the bins whose width is at least 1
        for idx in np.flatnonzero(binwidths >= 1):
            nra[idx] = CalcApproxRefPot(idx)

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        ## \delta(r) is equal to binwidths
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index : rc_index+1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        hasDisorderBin = subType.endswith('Plus')
        if hasDisorderBin:
            validProb = 1 - predProb[:, :, -1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        ## if useWeight and the prob of disorder exists, adjust potential by prob of not being in disorder status
        if useWeight and hasDisorderBin:
            potential *= validProb[:, :, np.newaxis]

        ## remove the potential for the last distance bin, which corresponds to disorder status
        if hasDisorderBin:
            potential = potential[:, :, :-1]

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=18, useWeight=False, minPotential=-30, maxPotential=30):
    """Derive distance potentials using a DFIRE-style reference state.

    The reference potential of a bin is alpha*log(r/rc) + log(dr/drc), where r
    is the bin center, dr the bin width and rc is min(cutoff[-1],
    largestDistance). The final potential is the reference potential minus
    log(p(r)/p(rc)) for bins before the rc bin and zero elsewhere.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin].
    alpha: exponent of the reference state.
    largestDistance: upper bound for rc.
    useWeight: if True and the subType ends with 'Plus', scale the potential
        by the probability 1 - predProb[:, :, -1].
    minPotential, maxPotential: accepted for interface compatibility; not
        used by this function.

    Exits the process (SystemExit) if rc is below 10.

    Returns (potentials, validProbs), two dicts keyed by response holding
    float32 arrays. For 'Plus' subTypes the last bin is removed from the
    potential.
    """
    potentials = dict()

    ## validProbs saves the prob of one atom/residue pair likely have valid coordinates
    validProbs = dict()

    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            #print 'WARNING: unsupported response for DFIRE potential: ', response
            continue
        if not config.IsDiscreteLabel(labelType):
            print('WARNING: the distance label is not discrete:  %s' % (response,))
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DFIRE is too small:  %s' % (rc,))
            raise SystemExit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## use arrays: dividing a plain list by a Python float raises TypeError
        edges = np.asarray(cutoff)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## calculate reference potential defined as alpha*log (r/rc) + log(\delta r/ \delta rc)
        ## \delta(r) is binwidths and r is the bincenters
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## idx is the index for a bin
        def CalcApproxRefPot(idx=0):
            ## average (r/rc)^alpha over points sampled every 0.5 inside bin idx
            points = np.arange(cutoff[idx] + 0.5/2, cutoff[idx+1], 0.5)
            values = np.power(points / bincenters[rc_index], alpha)
            avg = np.average(values)
            return np.log(avg) + np.log(binwidths[idx] / binwidths[rc_index])

        ## get a more accurate estimation of reference for the bin with a large width
        for idx in np.flatnonzero(binwidths >= 1):
            refPot[idx] = CalcApproxRefPot(idx)

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index : rc_index+1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference potential and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        hasDisorderBin = subType.endswith('Plus')
        if hasDisorderBin:
            validProb = 1 - predProb[:, :, -1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        ## if useWeight=True and the prob of being disorder exists, adjust potential by the prob of not being in disorder status
        if useWeight and hasDisorderBin:
            potential *= validProb[:, :, np.newaxis]

        ## remove the potential for the last distance bin, which corresponds to disorder status
        if hasDisorderBin:
            potential = potential[:, :, :-1]

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def _LoadReferenceFile(path):
    """Return the object stored in the pickled reference file at path."""
    ## NOTE(review): unpickling can execute arbitrary code; only load trusted reference files
    with open(path, 'rb') as fh:
        return cPickle.load(fh)


def _BinGeometry(cutoff):
    """Return (binwidths, bincenters) as arrays for the bin boundaries in cutoff."""
    edges = np.asarray(cutoff)
    return edges[1:] - edges[:-1], (edges[1:] + edges[:-1]) / 2.


def _AnchorAndTruncate(potential, binIndex):
    """Shift potential so that bin binIndex is zero and zero every bin from binIndex on."""
    ## slice (not integer index) keeps the bin axis so the subtraction broadcasts per pair
    potential = potential - potential[:, :, binIndex:binIndex + 1]
    potential[:, :, binIndex:] = 0
    return potential


def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=15, minPotential=-20, maxPotential=20):
    """Derive distance potentials using a DFIRE-style reference state.

    The reference potential of a bin is alpha*log(r/rc) + log(dr/drc), where r
    is the bin center, dr the bin width and rc is min(cutoff[-1],
    largestDistance). The final potential is the reference potential minus
    log(p(r)/p(rc)) for bins before the rc bin and zero elsewhere.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin].
    minPotential, maxPotential: accepted for interface compatibility; not used.

    Exits the process (SystemExit) if rc is below 10.
    Returns a dict mapping response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for DFIRE potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DFIRE is too small:  %s' % (rc,))
            raise SystemExit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        binwidths, bincenters = _BinGeometry(cutoff)

        ## calculate reference potential defined as alpha*log (r/rc) + log(\delta r/ \delta rc)
        ## \delta(r) is binwidths and r is the bincenters
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## idx is the index for binwidth
        def CalcApproxRefPot(idx=0):
            ## average (r/rc)^alpha over points sampled every 0.5 inside bin idx
            points = np.arange(cutoff[idx] + 0.5/2, cutoff[idx+1], 0.5)
            values = np.power(points / bincenters[rc_index], alpha)
            avg = np.average(values)
            return np.log(avg) + np.log(binwidths[idx] / binwidths[rc_index])

        ## get a more accurate estimation of reference for the bins whose width is at least 1
        for idx in np.flatnonzero(binwidths >= 1):
            refPot[idx] = CalcApproxRefPot(idx)

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index : rc_index+1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference potential and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., minPotential=-20., maxPotential=20.):
    """Derive distance potentials using a DOPE-style reference state.

    The reference potential of a bin is log(nra(r)/nra(rc)) + log(dr/drc),
    where nra is n(r, a) without its constant factor and denominator and a is
    the radius of the reference sphere. The final potential is the reference
    potential minus log(p(r)/p(rc)) for bins before the rc bin and zero
    elsewhere.

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin]; shape[0] is used as the protein length.
    rgScale: multiplier applied to the estimated radius of gyration.
    minPotential, maxPotential: accepted for interface compatibility; not used.

    Exits the process (SystemExit) if rc is below 10.
    Returns a dict mapping response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for DOPE potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DOPE is too small:  %s' % (rc,))
            raise SystemExit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        binwidths, bincenters = _BinGeometry(cutoff)

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length**(3./5) + 7.257
        a = np.sqrt(5./3) * rg * rgScale

        ## The original formulation of n(r,a) defined in the DOPE paper, with rc being the
        ## upper bound of distance between two atoms (rc = bincenters[-1]):
        ## if rc <= 2*a:
        ##     nra = 6. * np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a) / np.power(rc,3) /(np.power(rc, 3) - 18 * np.square(a)*rc + 32 * np.power(a, 3))
        ## else:
        ##     nra = 3* np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a) / 16. / np.power(a, 6)

        ## calculate n(r,a) described in the DOPE paper. Ignore the constant factor and
        ## the denominator since they are same for all distance bins
        nra = np.square(bincenters * (bincenters - 2*a)) * (bincenters + 4*a)

        def CalcApproxRefPot(idx=0):
            ## average n(r,a) over points sampled every 0.5 inside bin idx
            points = np.arange(cutoff[idx] + 0.5/2, cutoff[idx+1], 0.5)
            values = np.square(points * (points - 2*a)) * (points + 4*a)
            return np.average(values)

        ## get a more accurate estimation of nra for the bins whose width is at least 1
        for idx in np.flatnonzero(binwidths >= 1):
            nra[idx] = CalcApproxRefPot(idx)

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        ## \delta(r) is equal to binwidths
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index : rc_index+1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)
        potentials[response] = potential

    return potentials


def CalcPotentialBySimuRW(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive CbCb distance potentials from a reference read from userRef.

    For each residue pair (i, j) with j - i >= 2 the potential is
    -log(pred / ref), where ref is the reference row for the sequence
    separation j - i (row 0 corresponds to offset 1). When the maximum
    feasible distance for that separation falls before the last bin, the bins
    beyond it are set to maxPotential; otherwise the potential is shifted to
    be zero at the bin holding min(cutoff[-1], largestDistance) and zeroed
    after it. The result is mirrored to (j, i).

    predDistMatrix: dict mapping response -> predicted probability array,
        indexed below as [i, j, bin].
    userRef: path of the pickled reference file; each entry is used as a 2-D
        array [offset - 1, bin].
    sequence, minPotential: accepted for interface compatibility; not used.

    Exits the process (SystemExit) for label names other than 'CbCb', for
    subTypes not ending in '34C', or if the anchoring distance is below 10.
    Returns a dict mapping response -> potential array.
    """
    refData = _LoadReferenceFile(userRef)

    ## tighter upper bounds for short sequence separations
    eps = 0.00001
    shortRangeLimits = {2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for SimuRW potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]

        if labelName != 'CbCb':
            print('distance label name not supported yet:  %s' % (labelName,))
            raise SystemExit(1)

        if not subType.endswith('34C'):
            print('distance label type not supported yet:  %s' % (subType,))
            raise SystemExit(1)

        cutoff = config.GetCutoffs(response)
        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms
        ## when their sequence separation is equal to offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        if length > 0:
            maxAllowedDist[0] = 0
        ## only override the offsets that exist, so short proteins do not raise IndexError
        for offset, limit in shortRangeLimits.items():
            if offset < length:
                maxAllowedDist[offset] = limit

        ## the anchoring bin does not depend on the residue pair; look it up at most once
        rc = min(cutoff[-1], largestDistance) - 0.001
        rc_index = None

        potential = np.zeros_like(predProb)

        ## the distance bin of maxAllowedDist depends only on the offset, so iterate by offset
        for offset in range(2, length):
            ## find the distance bin into which the maxAllowedDist falls
            lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
            truncated = lastDistBin < (numLabels - 1)

            if truncated:
                ref = refProb[offset - 1][:lastDistBin + 1]
            else:
                if rc < 10.0:
                    print('ERROR: the largest distance cutoff for SimuRW is too small:  %s' % (rc,))
                    raise SystemExit(1)
                if rc_index is None:
                    rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)
                ref = refProb[min(offset, refProbLen) - 1]

            for i in range(length - offset):
                j = i + offset
                if truncated:
                    potential[i, j, :lastDistBin + 1] = -np.log(predProb[i, j, :lastDistBin + 1] / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    potential[i, j] = -np.log(predProb[i, j] / ref)
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials from a size-dependent empirical reference.

    Element [1] of the reference entry for a response is a list of
    (sz, freq, ref) tuples. The reference probability is the average of the
    ref entries whose sz is within a factor of 1.3 of the protein length
    (or sz >= 350 when the length is at least 400). The potential is
    -log(pred / ref), shifted so that the bin holding largestDistance is zero;
    that bin and all bins after it are reset to zero.

    sequence, minPotential, maxPotential: accepted for interface
        compatibility; not used.
    Returns a dict mapping response -> potential array.
    """
    refData = _LoadReferenceFile(userRef)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]
        length = predProb.shape[0]

        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if sz <= 1.3 * length and sz >= length / 1.3]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('#refProbMatrix:  %s  for proteins with length=  %s' % (len(refProbs), length))

        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        rc = largestDistance
        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(rc, cutoff)
        potential = _AnchorAndTruncate(potential, lastDistBin)

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials from an empirical reference stored in a file.

    The potential is -log(pred / ref), where ref is element [0] of the
    reference entry for the response, shifted so that the bin holding
    largestDistance is zero; that bin and all bins after it are reset to zero.

    sequence, minPotential, maxPotential: accepted for interface
        compatibility; not used.
    Returns a dict mapping response -> potential array.
    """
    refData = _LoadReferenceFile(userRef)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        rc = largestDistance
        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(rc, cutoff)
        potential = _AnchorAndTruncate(potential, lastDistBin)

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials


## reference states that need a user-provided reference file
allRefTypesWithFiles = [ ref.upper() for ref in ['SimuRW', 'EmpSI', 'EmpSD'] ]
allRefTypes = [ 'DFIRE', 'DOPE' ] + allRefTypesWithFiles


def main(argv):
    """Command-line entry point: turn predicted distance probabilities into potentials.

    argv: the command-line arguments without the program name. Options are
    -i/--input, -a/--atomPairType, -r/--refState (NAME[+rc[+param]]),
    -l/--minPotential, -u/--maxPotential, -s/--minSeqSep, -f/--refFile,
    -t/--textFormat and -n/--nonZero.

    Writes either a pickle file holding (name, sequence, potentials, cutoffs)
    or, with -t, a text file with one 'AtomPair' line per residue pair whose
    sequence separation is at least minSeqSep. Exits the process
    (SystemExit) on invalid arguments.
    """
    inputFile = None
    labelNames = ['CbCb']
    potentialFileSuffix = 'pkl'
    minPotential = -30.0
    maxPotential = 30.0
    minSeqSep = 3
    minSeqSepStr = '3'

    ## the largest dist cutoff
    rc = 18
    alpha4DFIRE = 1.61
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    ## refFile
    refFile = None

    ## NOTE(review): the long options "textFormat=" and "nonZero=" require a value while
    ## the short forms -t and -n are plain flags; kept as is for compatibility
    try:
        opts, args = getopt.getopt(argv, "i:a:r:l:u:s:f:tn", ["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
        print('%s %s' % (opts, args))
    except getopt.GetoptError:
        Usage()
        raise SystemExit(1)

    if len(opts) < 1:
        Usage()
        raise SystemExit(1)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            inputFile = arg

        elif opt in ("-a", "--atomPairType"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('allowed reference types:  %s' % (allRefTypes,))
                raise SystemExit(1)

            if len(fields) > 1:
                if reference == 'DFIRE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        alpha4DFIRE = np.float32(fields[2])
                elif reference == 'DOPE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    rc = np.float32(fields[1])
                else:
                    print('WARNING: unsupported reference format:  %s' % (arg,))

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('the provided file for reference state is not valid:  %s' % (refFile,))
                raise SystemExit(1)

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)

        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        elif opt in ("-s", "--minSeqSep"):
            minSeqSep = np.int32(arg)
            minSeqSepStr = arg
            if minSeqSep < 1:
                print('ERROR: minSeqSep shall be at least 1')
                raise SystemExit(1)

        elif opt in ("-t", "--textFormat"):
            potentialFileSuffix = '.txt'

        elif opt in ("-n", "--nonZero"):
            ## accepted for compatibility; the flag has no effect in this function
            pass

        else:
            Usage()
            raise SystemExit(1)

    if inputFile is None:
        print('Please provide an input file')
        raise SystemExit(1)
    if not os.path.isfile(inputFile):
        print('The input file does not exist:  %s' % (inputFile,))
        raise SystemExit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('The file for user-sepcified reference state is empty')
        raise SystemExit(1)

    targetName = os.path.basename(inputFile).split('.')[0]

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6
    name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"

    ## if needed, add code to here the predicted dist probability

    filenames = [targetName, 'distPotential']
    if reference == 'DFIRE':
        potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
    elif reference == 'DOPE':
        potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
    elif reference == 'SimuRW'.upper():
        potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), potentialFileSuffix])
    else:
        print('ERROR: unimplemented reference state:  %s' % (reference,))
        raise SystemExit(1)

    ## tolerate Calc* variants that return (potentials, validProbs)
    if isinstance(potential, tuple):
        potential = potential[0]

    potentialFileName = '.'.join(filenames)
    wantedLabels = set(labelNames)

    ## save to PKL file
    if potentialFileName.endswith('.pkl'):
        potential_new = dict()
        distCutoffs = dict()
        for response, pot in potential.items():
            labelName = config.Response2LabelName(response)
            if labelName not in wantedLabels:
                continue
            potential_new[response] = pot
            distCutoffs[response] = config.GetCutoffs(response)

        ## open the file only once the payload is ready, and close it even on error
        with open(potentialFileName, 'wb') as fh:
            cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    ## save to text file
    potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
    with open(potentialFileName, 'w') as fh:
        fh.write('#TARGET\t' + targetName + '\n')
        fh.write('#SEQ\t' + sequence + '\n')
        fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

        for response, pot in potential.items():
            labelName, labelType, subType = config.ParseResponse(response)
            if labelName not in wantedLabels:
                continue

            size = pot.shape
            for i in range(size[0]):
                rawPotStrs = []
                for j in range(i + minSeqSep, size[1]):
                    atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
                    y = pot[i, j]
                    rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i+1), atom2.upper(), str(j+1), subType] + ["{:.4f}".format(e) for e in y])
                    rawPotStrs.append(rawPotStr)

                if rawPotStrs:
                    fh.write('\n'.join(rawPotStrs) + '\n')


if __name__ == "__main__":
    main(sys.argv[1:])
def main(argv):
    """Command-line entry point: turn predicted distance probabilities into potentials.

    argv: the command-line arguments without the program name. Options are
    -i/--input, -a/--atomPairType, -r/--refState (NAME[+rc[+param]]),
    -l/--minPotential, -u/--maxPotential, -s/--minSeqSep, -f/--refFile,
    -t/--textFormat and -n/--nonZero.

    Writes either a pickle file holding (name, sequence, potentials, cutoffs)
    or, with -t, a text file with one 'AtomPair' line per residue pair whose
    sequence separation is at least minSeqSep. Exits the process
    (SystemExit) on invalid arguments.
    """
    inputFile = None
    labelNames = ['CbCb']
    potentialFileSuffix = 'pkl'
    minPotential = -30.0
    maxPotential = 30.0
    minSeqSep = 3
    minSeqSepStr = '3'

    ## the largest dist cutoff
    rc = 18
    alpha4DFIRE = 1.61
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    ## refFile
    refFile = None

    ## NOTE(review): the long options "textFormat=" and "nonZero=" require a value while
    ## the short forms -t and -n are plain flags; kept as is for compatibility
    try:
        opts, args = getopt.getopt(argv, "i:a:r:l:u:s:f:tn", ["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
        print('%s %s' % (opts, args))
    except getopt.GetoptError:
        Usage()
        raise SystemExit(1)

    if len(opts) < 1:
        Usage()
        raise SystemExit(1)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            inputFile = arg

        elif opt in ("-a", "--atomPairType"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('allowed reference types:  %s' % (allRefTypes,))
                raise SystemExit(1)

            if len(fields) > 1:
                if reference == 'DFIRE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        alpha4DFIRE = np.float32(fields[2])
                elif reference == 'DOPE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    rc = np.float32(fields[1])
                else:
                    print('WARNING: unsupported reference format:  %s' % (arg,))

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('the provided file for reference state is not valid:  %s' % (refFile,))
                raise SystemExit(1)

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)

        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        elif opt in ("-s", "--minSeqSep"):
            minSeqSep = np.int32(arg)
            minSeqSepStr = arg
            if minSeqSep < 1:
                print('ERROR: minSeqSep shall be at least 1')
                raise SystemExit(1)

        elif opt in ("-t", "--textFormat"):
            potentialFileSuffix = '.txt'

        elif opt in ("-n", "--nonZero"):
            ## accepted for compatibility; the flag has no effect in this function
            pass

        else:
            Usage()
            raise SystemExit(1)

    if inputFile is None:
        print('Please provide an input file')
        raise SystemExit(1)
    if not os.path.isfile(inputFile):
        print('The input file does not exist:  %s' % (inputFile,))
        raise SystemExit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('The file for user-sepcified reference state is empty')
        raise SystemExit(1)

    targetName = os.path.basename(inputFile).split('.')[0]

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6
    name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"

    ## if needed, add code to here the predicted dist probability

    filenames = [targetName, 'distPotential']
    if reference == 'DFIRE':
        potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
    elif reference == 'DOPE':
        potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
    elif reference == 'SimuRW'.upper():
        potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), potentialFileSuffix])
    else:
        print('ERROR: unimplemented reference state:  %s' % (reference,))
        raise SystemExit(1)

    ## some variants of the Calc* functions return (potentials, validProbs);
    ## only the potentials are written out
    if isinstance(potential, tuple):
        potential = potential[0]

    potentialFileName = '.'.join(filenames)
    wantedLabels = set(labelNames)

    ## save to PKL file
    if potentialFileName.endswith('.pkl'):
        potential_new = dict()
        distCutoffs = dict()
        for response, pot in potential.items():
            labelName = config.Response2LabelName(response)
            if labelName not in wantedLabels:
                continue
            potential_new[response] = pot
            distCutoffs[response] = config.GetCutoffs(response)

        ## open the file only once the payload is ready, and close it even on error
        with open(potentialFileName, 'wb') as fh:
            cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    ## save to text file
    potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
    with open(potentialFileName, 'w') as fh:
        fh.write('#TARGET\t' + targetName + '\n')
        fh.write('#SEQ\t' + sequence + '\n')
        fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

        for response, pot in potential.items():
            labelName, labelType, subType = config.ParseResponse(response)
            if labelName not in wantedLabels:
                continue

            size = pot.shape
            for i in range(size[0]):
                rawPotStrs = []
                for j in range(i + minSeqSep, size[1]):
                    atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
                    y = pot[i, j]
                    rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i+1), atom2.upper(), str(j+1), subType] + ["{:.4f}".format(e) for e in y])
                    rawPotStrs.append(rawPotStr)

                if rawPotStrs:
                    fh.write('\n'.join(rawPotStrs) + '\n')