def FixPredictedDistProb(predDistMatrix, labelWeight, labelDistribution):
    """Run DistanceUtils.FixDistProb on every response of a prediction dict.

    Args:
        predDistMatrix: dict mapping a response name to its predicted
            probability matrix.
        labelWeight: dict indexed by the same response names; the entry for
            a response is forwarded to FixDistProb unchanged.
        labelDistribution: dict indexed by the same response names; the
            entry for a response is forwarded to FixDistProb unchanged.

    Returns:
        A new dict with the keys of predDistMatrix and, for each key, the
        value returned by DistanceUtils.FixDistProb.

    Raises:
        KeyError: if a response of predDistMatrix is missing from
            labelWeight or labelDistribution.
    """
    newPredDistMatrix = dict()
    for response, predProb in predDistMatrix.items():
        newPredDistMatrix[response] = DistanceUtils.FixDistProb(
            predProb, labelWeight[response], labelDistribution[response])

    return newPredDistMatrix
# Example no. 2
# 0
def DetermineProbThresholds(fixedProb, ratio_4_15A, distCutoffs):
    """Pick one probability threshold per sequence-separation range.

    The last axis of fixedProb is summed from the label returned by
    DistanceUtils.LabelsOfOneDistance(config.InteractionLimit, distCutoffs)
    to the end. For each separation band (|i-j| >= 24, 12..23, 6..11 and
    2..5, in that order) the summed values inside the band are sorted in
    ascending order and the entry at position ratio * L * 2 is taken as the
    threshold, L being fixedProb.shape[0]. When the band holds too few
    entries, its largest value is used instead.

    Args:
        fixedProb: 3-D array indexed [i, j, label].
        ratio_4_15A: iterable of ratios, paired in order with the four bands.
        distCutoffs: distance cutoffs handed to LabelsOfOneDistance.

    Returns:
        list with one threshold per (band, ratio) pair.

    Raises:
        IndexError: if a band selects no entry at all (the fallback then
            indexes an empty array).
    """
    numRows, numCols = fixedProb.shape[0], fixedProb.shape[1]
    ones = np.ones((numRows, numCols), dtype=np.int8)

    def band(minSep):
        ## 1 where |i - j| >= minSep, 0 elsewhere
        return np.triu(ones, minSep) + np.tril(ones, -minSep)

    mask_LR = band(24)
    mask_MLR = band(12)
    mask_SMLR = band(6)
    mask_all = band(2)

    ## nested bands are turned into disjoint ones by subtraction
    masks = [mask_LR,
             mask_MLR - mask_LR,
             mask_SMLR - mask_MLR,
             mask_all - mask_SMLR]

    maxNums = [np.int32(ratio * numRows * 2) for ratio in ratio_4_15A]

    labelOf15 = DistanceUtils.LabelsOfOneDistance(config.InteractionLimit,
                                                  distCutoffs)
    tailProb = np.sum(fixedProb[:, :, labelOf15:], axis=2)

    cutoffs = []
    for mask, maxnum in zip(masks, maxNums):
        ranked = np.sort(tailProb[mask.nonzero()])
        if ranked.shape[0] >= maxnum + 1:
            cutoffs.append(ranked[maxnum])
        else:
            cutoffs.append(ranked[-1])

    return cutoffs
# Example no. 3
# 0
def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Build distance potentials against the first reference stored in userRef.

    For each discrete atom-pair response the potential is
    -log(predProb / refProb) with refProb = refData[response][0]. It is then
    shifted so that the bin containing largestDistance is zero, and that bin
    and all later ones are set to zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        userRef: path of a pickle file holding the reference data, indexed
            by response name.
        largestDistance: distance whose bin serves as the zero level.
        sequence, minPotential, maxPotential: accepted for interface
            compatibility; not used here.

    Returns:
        dict mapping each processed response to its potential array.
        Responses that are not atom pairs or not discrete are skipped.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the label axis (length 1) so the subtraction broadcasts over
        ## all bins instead of mis-aligning an (L, L) array with (L, L, K)
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
# Example no. 4
# 0
def main(argv):
    """Estimate distance bounds from a raw distance-probability file.

    Args:
        argv: command-line arguments without the program name. argv[0] is
            the input file; the presence of any second argument also
            writes the bounds as a pickle file.

    Side effects:
        Writes <targetName>.bound.txt when a 'CbCb' bound exists and,
        if requested, <targetName>.bound.pkl holding the tuple
        (bounds, targetName, sequence). Calls exit(1) on a usage error or
        a missing input file.
    """
    if len(argv) < 1:
        Usage()
        exit(1)
    inputFile = argv[0]

    ## any second argument switches on the pickle output
    printBoundPKL = len(argv) >= 2

    if inputFile is None:
        print('ERROR: Please provide an input file')
        exit(1)
    if not os.path.isfile(inputFile):
        ## '%s %s' reproduces the separator of the Python 2 print statement
        print('%s %s' % ('ERROR: The input file does not exist: ', inputFile))
        exit(1)

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    targetName, sequence, predictedDistProbMatrix, predictedContactProbMatrix = content[:4]

    ## The label-weight correction through DistanceUtils.FixDistProb is
    ## skipped here: since version 3 an unbiased deep model is used.

    bounds = EstimateDistanceBounds(predictedDistProbMatrix)

    ## output Cb-Cb bound in text format
    if 'CbCb' in bounds:
        SaveBoundInListFormat(targetName, sequence, bounds['CbCb'],
                              targetName + '.bound.txt')

    if not printBoundPKL:
        return

    with open(targetName + '.bound.pkl', 'wb') as fh:
        cPickle.dump((bounds, targetName, sequence),
                     fh,
                     protocol=cPickle.HIGHEST_PROTOCOL)
# Example no. 5
# 0
def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Build distance potentials against a size-matched empirical reference.

    refData[response][1] is read as a list of (size, freq, ref) triples.
    For a protein of length L < 400 the references with
    L/1.3 <= size <= 1.3*L are averaged; otherwise those with size >= 350.
    The potential is -log(predProb / averaged reference), shifted so that the
    bin containing largestDistance is zero; that bin and all later ones are
    set to zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        userRef: path of a pickle file holding the reference data, indexed
            by response name.
        largestDistance: distance whose bin serves as the zero level.
        sequence, minPotential, maxPotential: accepted for interface
            compatibility; not used here.

    Returns:
        dict mapping each processed response to its potential array.
        Responses that are not atom pairs or not discrete are skipped.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]

        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList
                        if sz <= 1.3 * length and sz >= length / 1.3]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        ## '%s %s ...' reproduces the separators of the Python 2 print statement
        print('%s %s %s %s' % ('#refProbMatrix: ', len(refProbs), ' for proteins with length= ', length))

        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the label axis (length 1) so the subtraction broadcasts over
        ## all bins instead of mis-aligning an (L, L) array with (L, L, K)
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def main(argv):
    """Command-line driver: turn predicted probabilities into a pair potential.

    Parses the options, loads the raw prediction file given as the single
    positional argument, calls CalcDistOriPotential with the chosen reference
    state (only DFIRE and DOPE are implemented here) and pickles
    (name, sequence, pairPotential, cutoffs, validProb) to the save file.

    Options:
        -a/--labelNames      label names, parsed by config.ParseLabelNames
        -w/--useWeight       integer bit mask: bit 1 = weight the distance
                             potential, bit 2 = weight the orientation potential
        -r/--refState        REF[+rc[+param]]; param is alpha for DFIRE and
                             rgScale for DOPE
        -f/--refFile         existing file for file-based reference states
        -o                   do not use a reference for orientation
        -s/--savefile        output file name (default: derived from the
                             target name and settings)
        -l/--minPotential, -u/--maxPotential   potential bounds

    Args:
        argv: command-line arguments without the program name.

    Calls exit(1) after printing a message (or Usage()) on any invalid input.
    """
    def Report(*parts):
        ## same output as the Python 2 print statement: str() of each part,
        ## separated by single spaces
        print(' '.join(str(part) for part in parts))

    labelNames = config.allAtomPairNames + config.allOrientationNames
    minPotential = -30.0
    maxPotential = 30.0

    UseWeight4Orientation = True
    UseWeight4Distance = True

    ## the largest dist cutoff
    rc = 18

    alpha4DFIRE = 1.61
    alpha4DFIREstr = '1.61'

    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    UseRef4Orientation = True

    ## refFile for SimuRW
    refFile = None

    savefile = ""

    if len(argv) < 1:
        Usage()
        exit(1)

    ## NOTE(review): the short option -o takes no argument while the long
    ## form is declared as "noRef4Orientation=" (argument required); left
    ## unchanged for backward compatibility -- confirm which one is intended.
    try:
        opts, args = getopt.getopt(argv, "a:w:r:l:u:f:s:o", ["labelNames=", "useWeight=", "refState=", "minPotential=", "maxPotential=", "refFile=", "savefile=", "noRef4Orientation="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) != 1:
        Usage()
        exit(1)

    inputFile = args[0]

    for opt, arg in opts:
        if opt in ("-a", "--labelNames"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-w", "--useWeight"):
            scheme = np.int32(arg)
            UseWeight4Orientation = (2 & scheme) > 0
            UseWeight4Distance = (1 & scheme) > 0

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                Report('ERROR: allowed reference types: ', allRefTypes)
                exit(1)

            if len(fields) > 1:
                ## keep an integral cutoff as an integer so that str(rc) in
                ## the output file name carries no decimal point
                if fields[1].isdigit():
                    rc = np.int32(fields[1])
                else:
                    rc = np.float32(fields[1])

                if reference == 'DFIRE':
                    if len(fields) > 2:
                        alpha4DFIREstr = fields[2]
                        alpha4DFIRE = np.float32(fields[2])

                elif reference == 'DOPE':
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    Report('Using SimuRW potential')
                else:
                    Report('ERROR: unsupported reference format: ', arg)
                    exit(1)

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                Report('the provided file for reference state is not valid: ', refFile)
                exit(1)

        elif opt in ("-o", "--noRef4Orientation"):
            UseRef4Orientation = False

        elif opt in ("-s", "--savefile"):
            savefile = arg

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)
        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        else:
            Usage()
            exit(1)

    if not os.path.isfile(inputFile):
        Report('ERROR: The input file does not exist: ', inputFile)
        exit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        Report('ERROR: The file for user-sepcified reference state is empty')
        exit(1)

    if reference == 'DFIRE':
        if alpha4DFIRE > 10:
            ## take a random value between 1.57 and 1.63
            alpha4DFIRE = random.uniform(1.57, 1.63)

        Report('alpha for DFIRE potential is ', alpha4DFIRE)
        if alpha4DFIRE < 1.55 or alpha4DFIRE > 1.75:
            Report('ERROR: alpha4DFIRE shall be between 1.55 and 1.75')
            exit(1)

    if reference == 'DOPE':
        Report('rgScale for DOPE potential is', rgScale4DOPE)
        if rgScale4DOPE > 1.2 or rgScale4DOPE < 0.8:
            Report('ERROR: rgScale4DOPE shall be between 0.8 and 1.2')
            exit(1)

    if UseWeight4Distance:
        Report('Use weight for distance potential')
    if UseWeight4Orientation:
        Report('Use weight for orientation potential')
    if not UseRef4Orientation:
        Report('Do not use reference for orientation')

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6

    name, sequence, predictedProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"
    predData = (predictedProb, labelWeight, labelDistribution)

    targetName = os.path.basename(inputFile).split('.')[0]
    Report('Generating potential for ', targetName, 'with the following labels: ', labelNames)

    filenames = [targetName, 'pairPotential']

    ## the two implemented reference states differ only in their parameter
    if reference == 'DFIRE':
        param4Potential, paramStr = alpha4DFIRE, alpha4DFIREstr
    elif reference == 'DOPE':
        param4Potential, paramStr = rgScale4DOPE, str(rgScale4DOPE)
    else:
        Report('ERROR: unimplemented potential type: ', reference)
        exit(1)

    pairPotential, cutoffs, validProb, distPotential, oriPotential = CalcDistOriPotential(
        predData, labelNames, distPotType=reference, param4Potential=param4Potential,
        largestDistance=rc, useWeight4Dist=UseWeight4Distance,
        useRef4Ori=UseRef4Orientation, useWeight4Ori=UseWeight4Orientation,
        minPotential=minPotential, maxPotential=maxPotential)
    filenames.extend([reference, str(rc), paramStr])

    if bool(oriPotential) and UseRef4Orientation:
        filenames.append('Ref4O')

    ## tag the file name with the kinds of potential that were weighted
    weightedDist = bool(distPotential) and UseWeight4Distance
    weightedOri = bool(oriPotential) and UseWeight4Orientation
    wStr = None
    if weightedDist and weightedOri:
        wStr = 'Wt4OD'
    elif weightedOri:
        wStr = 'Wt4O'
    elif weightedDist:
        wStr = 'Wt4D'

    if wStr is not None:
        filenames.append(wStr)

    filenames.append('pkl')
    if savefile == "":
        savefile = '.'.join(filenames)

    ## save the result
    with open(savefile, 'wb') as fh:
        cPickle.dump((name, sequence, pairPotential, cutoffs, validProb), fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcPotentialBySimuRW(predDistMatrix, refFile, largestDistance=20, sequence=None, useWeight=False, minPotential=-30., maxPotential=30.):
    """Build distance potentials against a per-offset reference read from refFile.

    Only the 'CbCb' label with a subtype ending in '34C' is supported; any
    other discrete atom-pair response makes the function print a message
    and call exit(1). refData[response] is indexed [offset-1][label], i.e.
    its first row corresponds to sequence separation 1.

    For each residue pair (i, j) with j >= i+2:
      * if the largest feasible distance for this separation falls before
        the last label, the potential is -log(pred/ref) up to that bin and
        maxPotential beyond it;
      * otherwise the potential is -log(pred/ref) shifted to be zero at the
        bin of min(cutoff[-1], largestDistance), and zero after that bin.
    The result is mirrored to (j, i).

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        refFile: path of a pickle file holding the reference probabilities.
        largestDistance: upper limit for the zero-level distance.
        sequence, minPotential: accepted for interface compatibility; not
            used here.
        useWeight: if True and the subtype ends with 'Plus', scale the
            potential by 1 - (probability of the last label).
        maxPotential: value assigned to infeasible distance bins.

    Returns:
        dict mapping each processed response to its potential array.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(refFile, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]
        if labelName != 'CbCb':
            print('%s %s' % ('distance label name not supported yet: ', labelName))
            exit(1)

        if not subType.endswith('34C'):
            print('%s %s' % ('distance label type not supported yet: ', subType))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## maxAllowedDist[offset] is the maximum physically feasible distance
        ## between two Cb atoms when their sequence separation equals offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        eps = 0.00001
        fixedDist = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        for offset, dist in fixedDist.items():
            ## guard against proteins shorter than 7 residues
            if offset < length:
                maxAllowedDist[offset] = dist

        potential = np.zeros_like(predProb)

        ## the zero-level bin does not depend on (i, j); resolve it lazily so
        ## the "too small" check still fires only when that branch is reached
        rc_index = None

        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                ## find the distance bin into which the maxAllowedDist falls
                lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                if lastDistBin < (numLabels - 1):
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refProb[offset - 1][:lastDistBin + 1]

                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    if rc_index is None:
                        rc = min(cutoff[-1], largestDistance) - 0.001
                        if rc < 10.0:
                            print('%s %s' % ('ERROR: the largest distance cutoff for SimuRW is too small: ', rc))
                            exit(1)
                        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

                    ## offsets beyond the reference table reuse its last row
                    potential[i, j] = -np.log(predProb[i, j] / refProb[min(offset, refProbLen) - 1])
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        ## with the '34C' requirement above this branch cannot currently be
        ## taken; it is kept for when 'Plus' subtypes become supported
        if useWeight and subType.endswith('Plus'):
            potential *= (1 - predProb[:, :, -1])[:, :, np.newaxis]

        CheckPotentialValues(potential)

        potentials[response] = potential

    return potentials
def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., useWeight=False, minPotential=-30., maxPotential=30.):
    """Derive DOPE-style distance potentials from predicted probabilities.

    For every discrete atom-pair response and every bin before the cutoff
    bin rc_index, the potential is reference - observed with
        reference = log( n(r)/n(rc) * delta_r/delta_rc )
        observed  = log( p(r)/p(rc) )
    where n(r) = (r*(r-2a))^2 * (r+4a), a = sqrt(5/3) * rg * rgScale and
    rg = 0.395 * L^(3/5) + 7.257 for a protein of length L. Bins from
    rc_index on are zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        largestDistance: upper limit for the cutoff distance rc.
        rgScale: scale applied to the estimated radius of gyration.
        useWeight: if True and the subtype ends with 'Plus', multiply the
            potential by 1 - (probability of the last label).
        minPotential, maxPotential: accepted for interface compatibility;
            not used here.

    Returns:
        (potentials, validProbs): two dicts of float32 arrays keyed by
        response. For 'Plus' subtypes the last label is removed from the
        potential and validProb is 1 - its probability; otherwise validProb
        is all ones.

    Calls exit(1) when the cutoff distance is below 10.
    """
    potentials = dict()
    validProbs = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DOPE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## arrays (not lists) so the bin arithmetic below is element-wise
        ## whatever the element type of cutoff is
        edges = np.asarray(cutoff, dtype=np.float64)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        def ShellCount(r, radius=a):
            ## n(r,a) of the DOPE paper without its constant factor and
            ## denominator, which are the same for all distance bins
            return np.square(r * (r - 2 * radius)) * (r + 4 * radius)

        nra = ShellCount(bincenters)

        ## for bins at least 1 wide, average n(r,a) over points spaced 0.5
        ## apart instead of using the bin centre only
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(edges[i] + 0.5 / 2, edges[i + 1], 0.5)
                nra[i] = np.average(ShellCount(points))

        ## reference potential: log (nra(r)/nra(rc)) + log(delta r / delta rc)
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## observed potential: log( p(r) / p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index:rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## final potential: difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
            ## adjust potential by the prob of not being in disorder status
            if useWeight:
                potential *= validProb[:, :, np.newaxis]
            ## remove the last bin, which corresponds to disorder status
            potential = potential[:, :, :-1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=18, useWeight=False, minPotential=-30, maxPotential=30):
    """Derive DFIRE-style distance potentials from predicted probabilities.

    For every discrete atom-pair response and every bin before the cutoff
    bin rc_index, the potential is reference - observed with
        reference = alpha*log(r/rc) + log(delta_r/delta_rc)
        observed  = log( p(r)/p(rc) )
    where r is the bin centre and delta_r the bin width. Bins from rc_index
    on are zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        alpha: exponent of the DFIRE reference state.
        largestDistance: upper limit for the cutoff distance rc.
        useWeight: if True and the subtype ends with 'Plus', multiply the
            potential by 1 - (probability of the last label).
        minPotential, maxPotential: accepted for interface compatibility;
            not used here.

    Returns:
        (potentials, validProbs): two dicts of float32 arrays keyed by
        response. For 'Plus' subtypes the last label is removed from the
        potential and validProb is 1 - its probability; otherwise validProb
        is all ones.

    Calls exit(1) when the cutoff distance is below 10.
    """
    potentials = dict()

    ## validProbs saves the prob of one atom/residue pair likely have valid coordinates
    validProbs = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            ## '%s %s' reproduces the separator of the Python 2 print statement
            print('%s %s' % ('WARNING: the distance label is not discrete: ', response))
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DFIRE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## arrays (not lists): dividing a plain list by a float raises
        ## TypeError, so the bin arithmetic must not depend on cutoff's type
        edges = np.asarray(cutoff, dtype=np.float64)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## reference potential: alpha*log (r/rc) + log(delta r / delta rc)
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## for bins at least 1 wide, average (r/rc)^alpha over points spaced
        ## 0.5 apart instead of using the bin centre only
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(edges[i] + 0.5 / 2, edges[i + 1], 0.5)
                avg = np.average(np.power(points / bincenters[rc_index], alpha))
                refPot[i] = np.log(avg) + np.log(binwidths[i] / binwidths[rc_index])

        ## observed potential: log( p(r) / p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index:rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## final potential: difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
            ## adjust potential by the prob of not being in disorder status
            if useWeight:
                potential *= validProb[:, :, np.newaxis]
            ## remove the last bin, which corresponds to disorder status
            potential = potential[:, :, :-1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
# Example no. 10
# 0
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=15, minPotential=-20, maxPotential=20):
    """Derive DFIRE-style distance potentials from predicted probabilities.

    For every discrete atom-pair response and every bin before the cutoff
    bin rc_index, the potential is reference - observed with
        reference = alpha*log(r/rc) + log(delta_r/delta_rc)
        observed  = log( p(r)/p(rc) )
    where r is the bin centre and delta_r the bin width. Bins from rc_index
    on are zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        alpha: exponent of the DFIRE reference state.
        largestDistance: upper limit for the cutoff distance rc.
        minPotential, maxPotential: accepted for interface compatibility;
            not used here.

    Returns:
        dict mapping each processed response to its potential array.

    Calls exit(1) when the cutoff distance is below 10.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            ## '%s %s' reproduces the separator of the Python 2 print statement
            print('%s %s' % ('WARNING: unsupported response for DFIRE potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DFIRE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## arrays (not lists): dividing a plain list by a float raises
        ## TypeError, so the bin arithmetic must not depend on cutoff's type
        edges = np.asarray(cutoff, dtype=np.float64)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## reference potential: alpha*log (r/rc) + log(delta r / delta rc)
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## for every bin at least 1 wide, average (r/rc)^alpha over points
        ## spaced 0.5 apart instead of using the bin centre only
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(edges[i] + 0.5 / 2, edges[i + 1], 0.5)
                avg = np.average(np.power(points / bincenters[rc_index], alpha))
                refPot[i] = np.log(avg) + np.log(binwidths[i] / binwidths[rc_index])

        ## observed potential: log( p(r) / p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index:rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## final potential: difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)

        potentials[response] = potential

    return potentials

def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., minPotential=-20., maxPotential=20.):
    """Derive DOPE-style distance potentials from predicted probabilities.

    For every discrete atom-pair response and every bin before the cutoff
    bin rc_index, the potential is reference - observed with
        reference = log( n(r)/n(rc) * delta_r/delta_rc )
        observed  = log( p(r)/p(rc) )
    where n(r) = (r*(r-2a))^2 * (r+4a), a = sqrt(5/3) * rg * rgScale and
    rg = 0.395 * L^(3/5) + 7.257 for a protein of length L. Bins from
    rc_index on are zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        largestDistance: upper limit for the cutoff distance rc.
        rgScale: scale applied to the estimated radius of gyration.
        minPotential, maxPotential: accepted for interface compatibility;
            not used here.

    Returns:
        dict mapping each processed response to its potential array.

    Calls exit(1) when the cutoff distance is below 10.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            ## '%s %s' reproduces the separator of the Python 2 print statement
            print('%s %s' % ('WARNING: unsupported response for DOPE potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('%s %s' % ('ERROR: the largest distance cutoff for DOPE is too small: ', rc))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        ## arrays (not lists) so the bin arithmetic below is element-wise
        ## whatever the element type of cutoff is
        edges = np.asarray(cutoff, dtype=np.float64)
        binwidths = edges[1:] - edges[:-1]
        bincenters = (edges[1:] + edges[:-1]) / 2.

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        def ShellCount(r, radius=a):
            ## n(r,a) of the DOPE paper without its constant factor and
            ## denominator, which are the same for all distance bins
            return np.square(r * (r - 2 * radius)) * (r + 4 * radius)

        nra = ShellCount(bincenters)

        ## for bins at least 1 wide, average n(r,a) over points spaced 0.5
        ## apart instead of using the bin centre only
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(edges[i] + 0.5 / 2, edges[i + 1], 0.5)
                nra[i] = np.average(ShellCount(points))

        ## reference potential: log (nra(r)/nra(rc)) + log(delta r / delta rc)
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## observed potential: log( p(r) / p(rc) )
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index:rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## final potential: difference between reference and observed potential
        potential = np.zeros_like(predProb)
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)

        potentials[response] = potential

    return potentials
		

def CalcPotentialBySimuRW(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Build distance potentials against a per-offset reference read from userRef.

    Only the 'CbCb' label with a subtype ending in '34C' is supported; any
    other discrete atom-pair response makes the function print a message
    and call exit(1). refData[response] is indexed [offset-1][label], i.e.
    its first row corresponds to sequence separation 1.

    For each residue pair (i, j) with j >= i+2:
      * if the largest feasible distance for this separation falls before
        the last label, the potential is -log(pred/ref) up to that bin and
        maxPotential beyond it;
      * otherwise the potential is -log(pred/ref) shifted to be zero at the
        bin of min(cutoff[-1], largestDistance), and zero after that bin.
    The result is mirrored to (j, i).

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        userRef: path of a pickle file holding the reference probabilities.
        largestDistance: upper limit for the zero-level distance.
        sequence, minPotential: accepted for interface compatibility; not
            used here.
        maxPotential: value assigned to infeasible distance bins.

    Returns:
        dict mapping each processed response to its potential array.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            ## '%s %s' reproduces the separator of the Python 2 print statement
            print('%s %s' % ('WARNING: unsupported response for SimuRW potential: ', response))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]
        if labelName != 'CbCb':
            print('%s %s' % ('distance label name not supported yet: ', labelName))
            exit(1)

        if not subType.endswith('34C'):
            print('%s %s' % ('distance label type not supported yet: ', subType))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]
        refProbLen = refProb.shape[0]

        ## maxAllowedDist[offset] is the maximum physically feasible distance
        ## between two Cb atoms when their sequence separation equals offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        eps = 0.00001
        fixedDist = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        for offset, dist in fixedDist.items():
            ## guard against proteins shorter than 7 residues
            if offset < length:
                maxAllowedDist[offset] = dist

        potential = np.zeros_like(predProb)

        ## the zero-level bin does not depend on (i, j); resolve it lazily so
        ## the "too small" check still fires only when that branch is reached
        rc_index = None

        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                ## find the distance bin into which the maxAllowedDist falls
                lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                if lastDistBin < (numLabels - 1):
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refProb[offset - 1][:lastDistBin + 1]

                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    if rc_index is None:
                        rc = min(cutoff[-1], largestDistance) - 0.001
                        if rc < 10.0:
                            print('%s %s' % ('ERROR: the largest distance cutoff for SimuRW is too small: ', rc))
                            exit(1)
                        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

                    ## offsets beyond the reference table reuse its last row
                    potential[i, j] = -np.log(predProb[i, j] / refProb[min(offset, refProbLen) - 1])
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        CheckPotentialValues(potential)

        potentials[response] = potential

    return potentials

def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Build distance potentials against a size-matched empirical reference.

    refData[response][1] is read as a list of (size, freq, ref) triples.
    For a protein of length L < 400 the references with
    L/1.3 <= size <= 1.3*L are averaged; otherwise those with size >= 350.
    The potential is -log(predProb / averaged reference), shifted so that the
    bin containing largestDistance is zero; that bin and all later ones are
    set to zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        userRef: path of a pickle file holding the reference data, indexed
            by response name.
        largestDistance: distance whose bin serves as the zero level.
        sequence, minPotential, maxPotential: accepted for interface
            compatibility; not used here.

    Returns:
        dict mapping each processed response to its potential array.
        Responses that are not atom pairs or not discrete are skipped.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]

        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList
                        if sz <= 1.3 * length and sz >= length / 1.3]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        ## '%s %s ...' reproduces the separators of the Python 2 print statement
        print('%s %s %s %s' % ('#refProbMatrix: ', len(refProbs), ' for proteins with length= ', length))

        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the label axis (length 1) so the subtraction broadcasts over
        ## all bins instead of mis-aligning an (L, L) array with (L, L, K)
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials

def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Build distance potentials against the first reference stored in userRef.

    For each discrete atom-pair response the potential is
    -log(predProb / refProb) with refProb = refData[response][0]. It is then
    shifted so that the bin containing largestDistance is zero, and that bin
    and all later ones are set to zero.

    Args:
        predDistMatrix: dict mapping a response name to a 3-D probability
            array indexed [i, j, label].
        userRef: path of a pickle file holding the reference data, indexed
            by response name.
        largestDistance: distance whose bin serves as the zero level.
        sequence, minPotential, maxPotential: accepted for interface
            compatibility; not used here.

    Returns:
        dict mapping each processed response to its potential array.
        Responses that are not atom pairs or not discrete are skipped.
    """
    ## NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        ## keep the label axis (length 1) so the subtraction broadcasts over
        ## all bins instead of mis-aligning an (L, L) array with (L, L, K)
        lastCol = potential[:, :, lastDistBin:lastDistBin + 1]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials

	

## reference states for which main() insists on a reference file (upper-cased names)
allRefTypesWithFiles = list(map(str.upper, ['SimuRW', 'EmpSI', 'EmpSD']))
## every reference state accepted by the -r/--refState option
allRefTypes = ['DFIRE', 'DOPE']
allRefTypes.extend(allRefTypesWithFiles)

def main(argv):

    	inputFile = None
    	targetName = None
	labelNames = ['CbCb']
	potentialFileSuffix = 'pkl'
	minPotential = -30.0
	maxPotential = 30.0
	minSeqSep = 3
	minSeqSepStr='3'

	## the largest dist cutoff
	rc = 18

	alpha4DFIRE = 1.61
	rgScale4DOPE = 1.

	## reference 
	reference = 'DFIRE'

	## refFile
	refFile = None

    	try:
        	opts, args = getopt.getopt(argv,"i:a:r:l:u:s:f:tn",["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
        	print opts, args
    	except getopt.GetoptError:
        	Usage()
        	exit(1)


    	if len(opts) < 1:
        	Usage()
        	exit(1)

    	for opt, arg in opts:
		if opt in ("-i", "--input"):
	    		inputFile = arg

		elif opt in ("-a", "--atomPairType"):
			labelNames = config.ParseLabelNames(arg)

		elif opt in ("-r", "--refState"):
			fields = arg.split('+')
			reference = fields[0].upper()
			if reference not in allRefTypes:
				print 'allowed reference types: ', allRefTypes
				exit(1)

			if len(fields) > 1:
				if reference  == 'DFIRE':
					rc = np.float32(fields[1])
					if len(fields) > 2:
						alpha4DFIRE = np.float32(fields[2])

				elif reference == 'DOPE':
					rc = np.float32(fields[1])
					if len(fields) > 2:
						rgScale4DOPE = np.float32(fields[2])
				elif reference == 'SimuRW'.upper():
					rc = np.float32(fields[1])
				else:
					print 'WARNING: unsupported reference format: ', arg
				

		elif opt in ("-f", "--refFile"):
			refFile = arg
			if not os.path.isfile(refFile):
				print 'the provided file for reference state is not valid: ', refFile
				exit(1)

		elif opt in ("-l", "--minPotential"):
			minPotential = np.float32(arg)
		elif opt in ("-u", "--maxPotential"):
			maxPotential = np.float32(arg)

		elif opt in ("-s", "--minSeqSep"):
			minSeqSep = np.int32(arg)
			minSeqSepStr = arg
			if minSeqSep < 1:
				print 'ERROR: minSeqSep shall be at least 1'
				exit(1)

		elif opt in ("-t", "--textFormat"):
	    		potentialFileSuffix = '.txt'

		elif opt in ("-n", "--nonZero"):
			resetFlag = False	

		else:
	    		Usage()
	    		exit(1)

    	if inputFile is None:
		print 'Please provide an input file'
		exit(1)
    	if not os.path.isfile(inputFile):
		print 'The input file does not exist: ', inputFile
		exit(1)

	if reference in allRefTypesWithFiles and refFile is None:
		print 'The file for user-sepcified reference state is empty'
		exit(1)

        targetName = os.path.basename(inputFile).split('.')[0]

    	content = DistanceUtils.LoadRawDistProbFile(inputFile)
	assert len(content) >=6

    	name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
	assert labelWeight is not None, "labelWeight shall not be empty"

	## if needed, add code to here the predicted dist probability

	filenames = [ targetName, 'distPotential']
	if reference == 'DFIRE':
		potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
	elif reference == 'DOPE':
		potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
	elif reference == 'SimuRW'.upper():
		potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), potentialFileSuffix])
	else:
		print 'ERROR: unimplemented reference state: ', reference
		exit(1)

	potentialFileName = '.'.join(filenames)

	## save to PKL file
	if potentialFileName.endswith('.pkl'):
        	fh = open(potentialFileName, 'wb')
		potential_new = dict()
		distCutoffs = dict()
		for response, pot in potential.iteritems():
			labelName = config.Response2LabelName(response)
			if labelName not in set(labelNames):
				continue

			potential_new[response] = pot
			distCutoffs[response] = config.GetCutoffs(response)

		cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
		fh.close()
		return

	## save to text file
	potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
	fh = open(potentialFileName, 'w')
	fh.write('#TARGET\t' + targetName + '\n')
	fh.write('#SEQ\t' + sequence + '\n')
	fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

	for response, pot in potential.iteritems():
		labelName, labelType, subType = config.ParseResponse(response)
		if labelName not in set(labelNames):
			continue

		size = pot.shape
		for i in xrange(size[0]):
			rawPotStrs = []

			for j in xrange(i+ minSeqSep, size[1]):
				atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
				y = pot[i, j]

				rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i+1), atom2.upper(), str(j+1), subType] + [ "{:.4f}".format(e) for e in y ] )
				rawPotStrs.append(rawPotStr)

			if len(rawPotStrs) >0:
				fh.write('\n'.join(rawPotStrs) + '\n')

	fh.close()


## script entry point: forward the command-line arguments without the program name
if __name__ == '__main__':
    main(sys.argv[1:])
# Esempio n. 11
# 0
def main(argv):

    	inputFile = None
    	targetName = None
	labelNames = ['CbCb']
	potentialFileSuffix = 'pkl'
	minPotential = -30.0
	maxPotential = 30.0
	minSeqSep = 3
	minSeqSepStr='3'

	## the largest dist cutoff
	rc = 18

	alpha4DFIRE = 1.61
	rgScale4DOPE = 1.

	## reference 
	reference = 'DFIRE'

	## refFile
	refFile = None

    	try:
        	opts, args = getopt.getopt(argv,"i:a:r:l:u:s:f:tn",["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
        	print opts, args
    	except getopt.GetoptError:
        	Usage()
        	exit(1)


    	if len(opts) < 1:
        	Usage()
        	exit(1)

    	for opt, arg in opts:
		if opt in ("-i", "--input"):
	    		inputFile = arg

		elif opt in ("-a", "--atomPairType"):
			labelNames = config.ParseLabelNames(arg)

		elif opt in ("-r", "--refState"):
			fields = arg.split('+')
			reference = fields[0].upper()
			if reference not in allRefTypes:
				print 'allowed reference types: ', allRefTypes
				exit(1)

			if len(fields) > 1:
				if reference  == 'DFIRE':
					rc = np.float32(fields[1])
					if len(fields) > 2:
						alpha4DFIRE = np.float32(fields[2])

				elif reference == 'DOPE':
					rc = np.float32(fields[1])
					if len(fields) > 2:
						rgScale4DOPE = np.float32(fields[2])
				elif reference == 'SimuRW'.upper():
					rc = np.float32(fields[1])
				else:
					print 'WARNING: unsupported reference format: ', arg
				

		elif opt in ("-f", "--refFile"):
			refFile = arg
			if not os.path.isfile(refFile):
				print 'the provided file for reference state is not valid: ', refFile
				exit(1)

		elif opt in ("-l", "--minPotential"):
			minPotential = np.float32(arg)
		elif opt in ("-u", "--maxPotential"):
			maxPotential = np.float32(arg)

		elif opt in ("-s", "--minSeqSep"):
			minSeqSep = np.int32(arg)
			minSeqSepStr = arg
			if minSeqSep < 1:
				print 'ERROR: minSeqSep shall be at least 1'
				exit(1)

		elif opt in ("-t", "--textFormat"):
	    		potentialFileSuffix = '.txt'

		elif opt in ("-n", "--nonZero"):
			resetFlag = False	

		else:
	    		Usage()
	    		exit(1)

    	if inputFile is None:
		print 'Please provide an input file'
		exit(1)
    	if not os.path.isfile(inputFile):
		print 'The input file does not exist: ', inputFile
		exit(1)

	if reference in allRefTypesWithFiles and refFile is None:
		print 'The file for user-sepcified reference state is empty'
		exit(1)

        targetName = os.path.basename(inputFile).split('.')[0]

    	content = DistanceUtils.LoadRawDistProbFile(inputFile)
	assert len(content) >=6

    	name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
	assert labelWeight is not None, "labelWeight shall not be empty"

	## if needed, add code to here the predicted dist probability

	filenames = [ targetName, 'distPotential']
	if reference == 'DFIRE':
		potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
	elif reference == 'DOPE':
		potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
	elif reference == 'SimuRW'.upper():
		potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
		filenames.extend([reference, str(rc), potentialFileSuffix])
	else:
		print 'ERROR: unimplemented reference state: ', reference
		exit(1)

	potentialFileName = '.'.join(filenames)

	## save to PKL file
	if potentialFileName.endswith('.pkl'):
        	fh = open(potentialFileName, 'wb')
		potential_new = dict()
		distCutoffs = dict()
		for response, pot in potential.iteritems():
			labelName = config.Response2LabelName(response)
			if labelName not in set(labelNames):
				continue

			potential_new[response] = pot
			distCutoffs[response] = config.GetCutoffs(response)

		cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
		fh.close()
		return

	## save to text file
	potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
	fh = open(potentialFileName, 'w')
	fh.write('#TARGET\t' + targetName + '\n')
	fh.write('#SEQ\t' + sequence + '\n')
	fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

	for response, pot in potential.iteritems():
		labelName, labelType, subType = config.ParseResponse(response)
		if labelName not in set(labelNames):
			continue

		size = pot.shape
		for i in xrange(size[0]):
			rawPotStrs = []

			for j in xrange(i+ minSeqSep, size[1]):
				atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
				y = pot[i, j]

				rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i+1), atom2.upper(), str(j+1), subType] + [ "{:.4f}".format(e) for e in y ] )
				rawPotStrs.append(rawPotStr)

			if len(rawPotStrs) >0:
				fh.write('\n'.join(rawPotStrs) + '\n')

	fh.close()
# Esempio n. 12
# 0
def Score(pairwiseMatrix,
          potential,
          labelNames,
          outputDetails=False,
          minSeqSep=6,
          maxCstDist=None):

    totalScore = 0.0
    scores = dict()

    for response, pot in potential.iteritems():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in set(labelNames):
            continue
        if not pairwiseMatrix.has_key(labelName):
            print 'WARNING: the atomDistMatrix does not have distance information for atom pair:', labelName
            continue
        if not labelType.startswith('Discrete'):
            print 'ERROR: unsupported labelType: ', labelType
            exit(1)

        pm = pairwiseMatrix[labelName]
        assert pm.shape == (
            pot.shape[0], pot.shape[1]
        ), "the size of the pairwise potential not compatible with the matrix"

        if labelName in config.allAtomPairNames:
            ## discretize the distance matrix, an invalid entry -1 will have the largest label number
            labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                pm,
                config.distCutoffs[subType],
                invalidDistanceSeparated=False)
        elif labelName in config.allOrientationNames:
            labelMatrix, _, _ = OrientationUtils.DiscretizeOrientationMatrix(
                pm,
                config.distCutoffs[subType],
                distMatrix=pairwiseMatrix['CbCb'],
                invalidEntrySeparated=False)

        size = pot.shape
        m = np.mgrid[0:size[0], 0:size[1]]
        scoreMatrix = pot[m[0], m[1], labelMatrix]

        if labelName in config.allAtomPairNames and maxCstDist is not None:
            label4maxDist = DistanceUtils.LabelsOfOneDistance(
                maxCstDist, config.distCutoffs[subType])
            np.putmask(scoreMatrix, labelMatrix > label4maxDist, 0)

        scores[response] = np.sum(np.triu(scoreMatrix, minSeqSep))
        totalScore += scores[response]

        if outputDetails:
            ## note that if the potential matrix is not symmetric, we have to do something more here
            indices = np.triu_indices(size[0], k=minSeqSep, m=size[1])
            scores = scoreMatrix[indices]
            labels = labelMatrix[indices]
            for i, j, s, label in zip(indices[0], indices[1], scores, labels):
                outinfo = [
                    str(i + 1),
                    str(j + 1), apt,
                    str(label), "{:.4f}".format(s)
                ] + ["{:.3f}".format(v) for v in pot[i, j]]
                outstr = ' '.join(outinfo)
                print outstr

    return totalScore, scores
def GenerateSplinePotential4Distance(potData,
                                     labelNames=['CbCb'],
                                     topRatio=None,
                                     minSeqSep=1,
                                     potThreshold=np.finfo(np.float32).max,
                                     barrier=1.0):

    target, sequence, potential, distCutoffs = potData[:4]
    if len(potData) > 4:
        validProb = potData[4]
    else:
        print 'WARNING: it is better to provide validProb for distance potential'
        validProb = None

    allConstraints = []
    for response, pot in potential.iteritems():
        #print 'response=', response

        description = response
        labelName, labelType, subType = ParseResponse(response)

        if labelName not in config.allAtomPairNames:
            continue
        if labelName not in labelNames:
            continue

#x = distCutoffs[response][1:]
        x = distCutoffs[response]
        binWidths = [b - a for a, b in zip(x[1:-1], x[2:])]
        binWidth = np.average(binWidths)
        #assert all([ (binWidth-b)<0.0001 for b in binWidths])

        ## here we add repulsion to reduce steric clashes
        ## for CaCa and CbCb, the minimum distance is 3.6A. The penalty is 3 in [2, 3.6] and 10 in [0, 2]

        ## for NO, the minimum distance is 2, and the penalty is 1 for [2, 2.4] and 5 for [0, 2]
        firstMinDist = 2
        secondMinDist = 3.6
        yPenalty = [10, 4, 0.5]

        if labelName == 'NO':
            secondMinDist = 2.4
            yPenalty = [8, 3, 0.2]
        xPrefix = [0, firstMinDist, secondMinDist]

        ## find the index of the 2nd min distance in x, i.e., x[secondLabel] <=secondMinDist < x[secondLabel+1]
        secondLabel = DistanceUtils.LabelsOfOneDistance(
            secondMinDist + 0.0001, x)
        #print 'secondLabel=', secondLabel
        assert secondLabel >= 1
        assert secondLabel < len(distCutoffs[response])

        xk = [(a + b) / 2
              for a, b in zip(x[secondLabel:-1], x[secondLabel + 1:])]
        xk.append(x[-1] + binWidth / 2.)
        xk = xPrefix + xk
        #print 'xk=', xk

        #LabelOfAdjacentCAs = DistanceUtils.LabelsOfOneDistance(4.50001, x)
        """
		## the first interval of distCutoffs, i.e., [0, x[0]), usually has width > binWidth
		## Here we split the first interval into several bins with the same binwidth
		## Assume that x[0] = binWidth times an integer
		bstep = barrier * binWidth
		xPrefix = np.linspace(-binWidth, x[0]-binWidth, np.rint(x[0]/binWidth).astype(np.int32)+1 )
		xPrefix = xPrefix.tolist()

                xk = xPrefix + x.tolist()
		xk2 = [ (a+b)/2 for a, b in zip(xk[:-1], xk[1:]) ]
		xk2.append(x[-1]+binWidth/2.)
		xk = xk2

		yPrefix = np.arange( len(xPrefix)-1, -1, -1) * bstep
		yPrefix = yPrefix.tolist()
		"""

        size = pot.shape
        residuePairs = []

        for i in xrange(size[0]):
            jstart = i + minSeqSep
            if not config.IsSymmetricLabel(labelName):
                jstart = 0

            for j in xrange(jstart, size[1]):
                offset = abs(i - j)
                if offset < minSeqSep:
                    continue
                residuePairs.append((i, j))

        ## always use repulsion potential for two sequentially adjacent Ca atoms
        if minSeqSep > 1 and labelName == 'CaCa':
            residuePairs.extend([(i, i + 1) for i in xrange(size[0] - 1)])

        for i, j in residuePairs:
            y = pot[i, j]
            """
			## y[0] is the potential for the first interval [0, x[0]). We increase potential for distance < x[0] for every binWidth Angstrom
			yPrefix2 = [ y[0] + ye for ye in yPrefix ]
			yk = yPrefix2 + y[1:].tolist()
			"""
            yPrefix = [max(y[secondLabel], 0) + ye for ye in yPenalty]
            y2 = y.tolist()
            yk = yPrefix + y2[secondLabel:]

            assert len(xk) == len(
                yk
            ), 'xk and yk length does not match for ' + labelName + ' and residues ' + str(
                i) + ' ' + str(j)

            ## when one atom pair is not symmetric (e.g., NO), it appears twice in the constraint set, so we divide its potential by 2
            if not config.IsSymmetricLabel(labelName):
                yk = [ye / 2. for ye in yk]

            atom1, atom2 = SelectAtomPair(sequence, i, j, labelName)

            constraint = dict()
            constraint['x'] = xk
            constraint['y'] = yk
            constraint['response'] = response
            constraint['binWidth'] = binWidth

            constraint['type'] = 'AtomPair'
            constraint['atoms'] = [atom1, atom2]
            constraint['atomNums'] = [i + 1, j + 1]

            allConstraints.append(constraint)

    return allConstraints
# Esempio n. 14
# 0
def EstimateDistanceBound(fixedProb0, response):
    """Estimate, per residue pair, the expected distance and several deviation measures.

    Args:
        fixedProb0: probability array indexed [i, j, distance label].
        response: response name, parsed with config.ParseResponse.

    Returns:
        float32 array of shape (size[0], size[1], 10) initialised to -1. For
        every pair that is processed the ten values are, in order: dist_mean,
        dist_std, dist_std_lower, dist_std_upper, dist_std_lower2,
        dist_std_upper2, dist_std_lower3, dist_std_upper3, dist_std_lower4,
        dist_std_upper4. Pairs with |i - j| < 2, and pairs whose summed
        probability from the InteractionLimit label onwards exceeds the
        threshold of their sequence-separation range, keep -1.

    The process exits with status 1 when config.GetResponseProbDims(response) < 12.
    """

    labelName, distLabelType, subType = config.ParseResponse(response)
    if 'Plus' in distLabelType:
        ## merge the last interval (which represents invalid distance) to the last second one
        size = fixedProb0.shape
        fixedProb = np.zeros((size[0], size[1], size[2] - 1),
                             dtype=fixedProb0.dtype)
        fixedProb[:, :, :-1] = fixedProb0[:, :, :-2]
        fixedProb[:, :, -1] = np.sum(fixedProb0[:, :, -2:], axis=2)
    else:
        fixedProb = fixedProb0

    if config.GetResponseProbDims(response) < 12:
        print 'ERROR: it is not meaningful to estimate inter-resdiue distance when the number of labels is < 12'
        exit(1)

    ## subType is re-derived from the label type here, replacing the value returned by ParseResponse
    subType = distLabelType[len('Discrete'):]
    distCutoffs_original = config.distCutoffs[subType]
    ## drop the first cutoff; the remaining values are used below as the upper boundaries of the bins
    distCutoffs = distCutoffs_original[1:]

    ## probability thresholds for 15 Angstrom, from long-range, to medium-range, short-range and near-range
    ## if the predicted probability for 15A is larger than the threshold, we think the distance of one residue pair shall be larger than 15A
    thresholds_4_15A = [0.75, 0.65, 0.5, 1.]
    """
	if distCutoffs[-1] > 17.0:
    		thresholds_4_15A = [0.85, 0.75, 0.6, 1. ]
	"""

    ## ratio threshods for 15A. we take at most ratio*L long-, medium- and short-range distance restraints for a protein of L residues
    ratios_4_15A = [9.5, 2.2, 2.2, 4.]

    ## determine the real probability thresholds for 15A by ratio
    cutoffs = DetermineProbThresholds(fixedProb, ratios_4_15A,
                                      distCutoffs_original)

    #print 'prob cutoffs determined by ratio are: ', cutoffs

    ## per range, use the smaller of the fixed threshold and the ratio-derived one
    prob_thresholds_4_15A = [
        min(x, y) for x, y in zip(thresholds_4_15A, cutoffs)
    ]

    ## label of the bin that contains config.InteractionLimit
    labelOf15 = DistanceUtils.LabelsOfOneDistance(config.InteractionLimit,
                                                  distCutoffs_original)

    #print 'the final prob cutoffs are: ', thresholds_4_15A

    ## half of the average spacing between consecutive cutoffs
    halfBinWidth = np.average([
        distCutoffs[i] - distCutoffs[i - 1]
        for i in range(1, len(distCutoffs))
    ]) / 2.

    ## NOTE(review): the subtraction below presumably relies on distCutoffs being a numpy array, not a list; confirm in config
    mid_distance = np.array(distCutoffs - halfBinWidth).astype(np.float32)
    upper_boundary_distance = np.array(distCutoffs).astype(np.float32)

    numDistBins = mid_distance.shape[0]
    mid_distance_sq = np.square(mid_distance)

    size = fixedProb.shape
    #print size
    #print numDistBins
    ## the probability array must have exactly one more label than the bins used for estimation
    assert size[2] == numDistBins + 1

    ## estimates[:, :, 0] is the expected distance (dist_mean)
    ## estimates[:, :, 1] is the standard deviation (dist_std)
    ## estimates[:, :, 2] and [:, :, 3] are the unnormalized lower and upper deviations
    ## estimates[:, :, 4:10] hold the lower/upper pairs of the variants 2, 3 and 4 computed below
    ## not sure why initilize to -1
    estimates = np.full((size[0], size[1], 10), -1, dtype=np.float32)

    for i in range(size[0]):
        for j in range(size[1]):
            ## choose the threshold index by sequence separation: 3 near, 2 short, 1 medium, 0 long range
            offset = abs(i - j)
            if offset < 2:
                continue
            elif offset < 6:
                rangeIndex = 3
            elif offset < 12:
                rangeIndex = 2
            elif offset < 24:
                rangeIndex = 1
            else:
                rangeIndex = 0

            ## if the prob of this residue pair suggest that the estimated distance is likely to be >15A, then do nothing
            if np.sum(
                    fixedProb[i, j,
                              labelOf15:]) > prob_thresholds_4_15A[rangeIndex]:
                continue

            ## renormalize the distance prob distribution by setting the prob of the largest distance bin to 0
            newProb = fixedProb[i, j, :numDistBins] / np.sum(
                fixedProb[i, j, :numDistBins])

            ## mean and std of the distance; the halfBinWidth^2 / 3 term is added to the variance of the bin mid-points
            dist_mean = np.average(mid_distance, weights=newProb)
            dist_sq_mean = np.average(mid_distance_sq, weights=newProb)
            dist_std = np.sqrt(dist_sq_mean - np.square(dist_mean) +
                               np.square(halfBinWidth) * 1. / 3)

            ## find the bin into which dist_mean falls into
            binIndex = 0
            while dist_mean > upper_boundary_distance[binIndex]:
                binIndex = binIndex + 1

            ## now dist_mean <= upper_boundary_distance[binIndex] and dist_mean > upper_boundary_distance[binIndex - 1]
            ## split the probability of bin binIndex at dist_mean: upperProb[0] gets the share above the mean,
            ## lowerProb[binIndex] gets the rest
            upperProb = np.zeros(numDistBins - binIndex, dtype=np.float32)
            upperProb[0] = (upper_boundary_distance[binIndex] -
                            dist_mean) / (2 * halfBinWidth) * newProb[binIndex]
            upperProb[1:] = newProb[binIndex + 1:]

            lowerProb = np.zeros(binIndex + 1, dtype=np.float32)
            lowerProb[0:binIndex] = newProb[0:binIndex]
            lowerProb[binIndex] = newProb[binIndex] - upperProb[0]

            ## calculate the upper distance std
            dist_var_upper = np.dot(
                np.square(mid_distance[binIndex + 1:] - dist_mean) +
                np.square(halfBinWidth) * 1. / 3,
                upperProb[1:]) + upperProb[0] * np.square(
                    upper_boundary_distance[binIndex] - dist_mean) * 1. / 3

            ## the unnormalized distance deviation
            dist_std_upper = np.sqrt(dist_var_upper)

            ## the normalized distance deviation
            dist_std_upper2 = np.sqrt(dist_var_upper / np.sum(upperProb))
            ## probability-weighted mean absolute deviation above the mean
            dist_std_upper3 = (np.dot(mid_distance[binIndex + 1:] - dist_mean,
                                      upperProb[1:]) + upperProb[0] *
                               (upper_boundary_distance[binIndex] - dist_mean)
                               / 2) / np.sum(upperProb)

            ## the expected distance deviation
            dist_std_upper4 = np.sum(upperProb) * dist_std_upper2

            ## calculate the lower distance std
            ## left edge of the bin holding dist_mean; for the first bin it is one bin width below the first boundary
            if binIndex == 0:
                left_boundary = distCutoffs[0] - 2 * halfBinWidth
            else:
                left_boundary = upper_boundary_distance[binIndex - 1]
            dist_var_lower = np.dot(
                np.square(mid_distance[:binIndex] - dist_mean) +
                np.square(halfBinWidth) * 1. / 3,
                lowerProb[:binIndex]) + lowerProb[binIndex] * np.square(
                    dist_mean - left_boundary) * 1. / 3

            ## the unnormalized distance deviation
            dist_std_lower = np.sqrt(dist_var_lower)

            ## the normalized distance deviation
            dist_std_lower2 = np.sqrt(dist_var_lower / np.sum(lowerProb))
            ## probability-weighted mean absolute deviation below the mean
            dist_std_lower3 = (
                np.dot(dist_mean - mid_distance[:binIndex],
                       lowerProb[:binIndex]) + lowerProb[binIndex] *
                (dist_mean - left_boundary) / 2) / np.sum(lowerProb)

            ## the expected distance deviation
            dist_std_lower4 = np.sum(lowerProb) * dist_std_lower2
            """
			##only keep those residue pairs with estimated distance < 15 Angstrom
	    		if dist_mean >= 15.0:
				estimates[i, j] = np.array( [ -1. ] * 10 ).astype(np.float32)
	    		else:
	        		estimates[i, j] = np.array([dist_mean, dist_std, dist_std_lower, dist_std_upper, dist_std_lower2, dist_std_upper2, dist_std_lower3, dist_std_upper3, dist_std_lower4, dist_std_upper4]).astype(np.float32)
			"""
            ## store all ten values for this residue pair
            estimates[i, j] = np.array([
                dist_mean, dist_std, dist_std_lower, dist_std_upper,
                dist_std_lower2, dist_std_upper2, dist_std_lower3,
                dist_std_upper3, dist_std_lower4, dist_std_upper4
            ]).astype(np.float32)

    return estimates