Python Response2LabelName Examples

Programming Language: Python

Namespace/Package Name: config

Examples at hotexamples.com: 12

Python Response2LabelName - 12 examples found. These are the top-rated real-world Python examples of config.Response2LabelName extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Response2LabelName(12)

Frequently Used Methods

Response2LabelName (12)

Example #1

Show file

File: Model4DistancePrediction.py Project: tommyhuangthu/RaptorX-Contact

        def EvaluateAccuracy(pred_prob, truth, pad_len):
            pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                           dtype=theano.config.floatX)
            truth_in_correct_shape = truth[pad_len:, pad_len:]

            labelType = Response2LabelType(currentResponse)
            atomType = Response2LabelName(currentResponse)
            symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

            if labelType.startswith('LogNormal'):
                return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                            truth=truth_in_correct_shape,
                                            symmetric=symmetric)

            elif labelType.startswith('Normal'):
                return TopAccuracyNormal(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)

            elif labelType.startswith('Discrete'):
                subType = labelType[len('Discrete'):]
                if subType.startswith('2C'):
                    return TopAccuracy2C(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)
                else:
                    return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                             truth=truth_in_correct_shape,
                                             subType=subType,
                                             symmetric=symmetric)
            else:
                print 'unsupported label type in EvaluateAccuracy: ', labelType
                exit(-1)

Example #2

Show file

File: CalcEmpiricalRefState.py Project: zhujianwei31415/RaptorX-3DModeling

def CalcRefState4OneBatch(batch, modelSpecs, minSeqSep=3):
    """Estimate the reference label distribution of every response from one batch.

    For each response in modelSpecs['responses']:
      * label types starting with 'Normal' or 'LogNormal' get the constant
        float32 array [1.];
      * every other label type gets the distribution returned by
        CalcLabelProb over the d['atomLabelMatrix'][response] matrices of the
        batch. When modelSpecs['UseBoundingBox4RefProbs'] is True, each matrix
        is first cropped to a box drawn by SampleBoundingBox.

    Args:
        batch: iterable of dicts, each holding an 'atomLabelMatrix' dict keyed
            by response.
        modelSpecs: dict with at least 'responses' (and 'maxbatchSize' when
            bounding boxes are used).
        minSeqSep: forwarded unchanged to CalcLabelProb.

    Returns:
        (allRefProbs, avgLen). allRefProbs maps response -> reference
        probabilities. avgLen is the second value returned by CalcLabelProb
        for the last non-continuous response, or None when there is no such
        response (the original raised UnboundLocalError in that case).
    """
    allRefProbs = dict()
    avgLen = None
    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        ## continuous labels have no discrete distribution to estimate
        if labelType.startswith(('LogNormal', 'Normal')):
            allRefProbs[response] = np.array([1.]).astype(np.float32)
            continue

        labelMatrices = [d['atomLabelMatrix'][response] for d in batch]

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            croppedMatrices = []
            for lMatrix in labelMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                croppedMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
            labelMatrices = croppedMatrices
        else:
            labelMatrices = [m.astype(np.int32) for m in labelMatrices]

        allRefProbs[response], avgLen = CalcLabelProb(
            labelMatrices=labelMatrices,
            numLabels=config.responseProbDims[labelType],
            minSeqSep=minSeqSep)

    return allRefProbs, avgLen

Example #3

Show file

File: PropertyUtils.py Project: zhujianwei31415/RaptorX-3DModeling

def EvaluateSinglePropertyPrediction(prediction, nativeLabelFile):

    from DataProcessor import LoadNativeLabelsFromFile

    errors = dict()
    nativeLabels = LoadNativeLabelsFromFile(nativeLabelFile)

    for response, pred in prediction.iteritems():
        native = nativeLabels[Response2LabelName(response)]
        missing = nativeLabels['Missing']

        if response.startswith('DISO'):
            numResidues = len(pred)
            totalError = sum([p != t for p, t in zip(pred, native)])
            tmpError = np.array([numResidues, totalError])

        elif response.startswith('SS'):
            numResidues = sum([m == 0 for m in missing])
            totalError = sum(
                [p != t for p, t, m in zip(pred, native, missing) if m == 0])
            tmpError = np.array([numResidues, totalError])

        elif 'Phi' in response or 'Psi' in response:
            invalidResidues = [0] * len(missing)
            for i in xrange(len(missing)):
                if missing[i] == 1:
                    invalidResidues[i] = 1
                    if i > 0:
                        invalidResidues[i - 1] = 1
                    if i < len(missing) - 1:
                        invalidResidues[i + 1] = 1

            invalidResidues[0] = 1
            invalidResidues[len(missing) - 1] = 1

            numResidues = sum([m == 0 for m in invalidResidues])
            err1 = abs(pred - native)
            err2 = np.float32(2 * np.pi) - err1
            err = np.minimum(err1, err2)
            totalError = np.sum(
                [e for e, m in zip(err, invalidResidues) if m == 0], axis=0)
            tmpError = np.array([numResidues] + list(totalError))
        else:
            print 'The Evaluate function not implemented for response: ', response
            exit(1)

        if errors.has_key(response):
            errors[response].append(tmpError)
        else:
            errors[response] = [tmpError]

    ## calculate average error
    avgerrors = dict()
    for response, err in errors.iteritems():
        avgerrors[response] = np.average(err)

    return avgerrors

Example #4

Show file

    def EvaluateAccuracy(pred_prob, truth, pad_len):
        """Evaluate one prediction against its ground truth.

        Drops the first pad_len rows and columns of both matrices, then hands
        them to the TopAccuracy* routine selected by the label type of
        currentResponse. An unsupported label type is printed and the process
        exits with status -1.
        """
        ## strip the padded rows/columns; the prediction is also cast to floatX
        pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                       dtype=theano.config.floatX)
        truth_in_correct_shape = truth[pad_len:, pad_len:]

        labelType = Response2LabelType(currentResponse)
        atomType = Response2LabelName(currentResponse)
        ## these four label names are evaluated as symmetric matrices
        symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

        if labelType.startswith('LogNormal'):
            return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                        truth=truth_in_correct_shape,
                                        symmetric=symmetric)
        elif labelType.startswith('Normal'):
            return TopAccuracyNormal(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
        elif labelType.startswith('Discrete'):
            ## the text after the 'Discrete' prefix picks the 2C or multi-class routine
            subType = labelType[len('Discrete'):]
            if subType.startswith('2C'):
                return TopAccuracy2C(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
            else:
                return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         subType=subType,
                                         symmetric=symmetric)
        else:
            print('unsupported label type in EvaluateAccuracy: ', labelType)
            exit(-1)

        ## NOTE(review): as indented in this excerpt, everything below sits inside
        ## EvaluateAccuracy after an if/else chain whose every branch returns or exits.
        ## It uses self and zList, which are not parameters of this function, so it
        ## presumably belongs to an enclosing method -- confirm against the original file.

        ## one accuracy entry is collected per response
        accuracyList = []
        for res, out_prob, z, ratio in zip(self.responses,
                                           self.output_probList, zList,
                                           self.modelSpecs['topRatios']):
            ## currently TopAccuracy only works when the dimension of each z is 3
            assert z.ndim == 3
            if self.mask_1d is not None:
                ## padding length = mask width minus the sum of the mask along axis 1
                paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d,
                                                            axis=1)
            else:
                ## without a mask, every padding length is zero
                paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)
            ## EvaluateAccuracy reads currentResponse by name instead of taking it as an argument
            currentResponse = res
            topRatio = ratio
            ##here we use scan to calculate accuracy for each protein
            result, updates = theano.scan(fn=EvaluateAccuracy,
                                          outputs_info=None,
                                          sequences=[out_prob, z, paddingLens])
            ## average the scan results over the first axis
            accuracy = T.mean(result, axis=0)
            accuracyList.append(accuracy)

        return T.stacklists(accuracyList)

Example #5

Show file

File: RunDistancePredictor2.py Project: tommyhuangthu/RaptorX-Contact

def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
    	## load all the models from the files. Each file contains specification for one model.
	models = []
	for mFile in modelFiles:
    		fh = open(mFile, 'rb')
    		model = cPickle.load(fh)
    		fh.close()
		models.append(model)

	## check consistency among models. All the models shall have the same labelType for the same atom pair type
	labelTypes = dict()
	for model in models:
		for response in model['responses']:
			labelName = Response2LabelName(response)
			labelType = Response2LabelType(response)
			if not labelTypes.has_key(labelName):
				labelTypes[labelName] = labelType
			elif labelTypes[labelName] != labelType:
				print 'WARNING: at least two models have different label types for the same atom pair type.'
				exit(-1)
					

	allsequences = dict()

	##allresults shall be a nested dictionary, e.g, allresults[proteinName][response] = list of predicted_prob_matrices
	##We predict one prob_matrix from each model for each protein and each response
	## two different models may share some overlapping responses.

	allresults = dict()
	numModels = dict()
	for model, mfile in zip(models, modelFiles):
		if not model['network'] in config.allNetworks:

			print 'unsupported network architecture: ', model['network']
			exit(-1)

		distancePredictor, x, y, xmask, ymask, xem, labelList, weightList = Model4DistancePrediction.BuildModel(model, forTrain=False)

		inputVariables = [ x, y, xmask, ymask]
		if xem is not None:
			inputVariables.append(xem)

	  	pred_prob = distancePredictor.output_prob
        	predict = theano.function(inputVariables, pred_prob, on_unused_input='warn' )

		## set model parameter values
		if not Compatible(distancePredictor.params, model['paramValues']):
			print 'FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file: ', mfile
			exit(-1)

		[ p.set_value(v) for p, v in zip(distancePredictor.params, model['paramValues']) ]

		## We shall load these files for each model separately since each model may have different requirement of the data
		predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs = model, forTrainValidation=False)

		##make sure the input has the same number of features as the model. We do random check here to speed up
		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

		if predData[0].has_key('embedFeatures'):
			rindex = np.random.randint(0, high=len(predData) )
			assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

		## check if all the proteins of the same name have exactly the same sequence
		for d in predData:
			if not allsequences.has_key(d['name']):
				allsequences[d['name']] = d['sequence']
			elif allsequences[d['name']] != d['sequence']:
				print 'Error: inconsistent primary sequence for the same protein in the protein feature files'
				exit(-1)
			
		## predSeqData and names are in the exactly the same order, so we know which data is for which protein	
		predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
		print '#predData: ', len(predData), '#batches: ', len(predSeqData)

		for onebatch, names4onebatch in zip(predSeqData, names):
			input = onebatch[ : len(inputVariables) ]
			result = predict(*input)

			x1d, x2d, x1dmask, x2dmask = input[0:4]
			seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
			maxSeqLen = x1d.shape[1]

			##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
			assert result.shape[3] == sum( [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ] )

			## calculate the start and end positions of each response in the last dimension of result
			dims = [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ]
                        endPositions = np.cumsum(dims)
                        startPositions =  endPositions - dims

			for name in names4onebatch:
				if not allresults.has_key(name):
					allresults[name]=dict() 
					numModels[name] =dict()

			## batchres is a batch of result, its ndim=4
			for response, start, end in zip(model['responses'], startPositions, endPositions):

				## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are contact/distance matrix sizes and the 4th is for the predicted probability parameters
				batchres = result[:, :, :, start:end ]


				## remove masked positions
				revised_batchres = [ probMatrix[ maxSeqLen-seqLen:, maxSeqLen-seqLen:, : ] for probMatrix, seqLen in zip(batchres, seqLens) ]

				for res4one, name in zip(revised_batchres, names4onebatch):
                                        if not allresults[name].has_key(response):
                                                allresults[name][response] = res4one
                                                numModels[name][response] = np.int32(1)
                                        else:
                                                ## here we save only sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                                                allresults[name][response] +=  res4one
                                                numModels[name][response] += np.int32(1)


		del predict
		del predData
		del predSeqData
		gc.collect()


	## calculate the final result, which is the average of all the predictd prob matrices for the same protein and response
	finalresults = dict()
	for name, results in allresults.iteritems():
		if not finalresults.has_key(name):
			finalresults[name] = dict()

		## finalresults has 3 dimensions. 
		for response in results.keys():
			#finalresults[name][response] = np.average(allresults[name][response], axis=0)
			finalresults[name][response] = allresults[name][response]/numModels[name][response]

			##make the predicted distance prob matrices symmetric for some reponses. This also slightly improves accuracy.
			apt = Response2LabelName(response)
			if config.IsSymmetricAPT( apt ):
				finalresults[name][response] = (finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2) ) )/2.

	## collect the average label distributions and weight matrix. We collect all the matrices and then calculate their average.
	labelDistributions = dict()
	labelWeights = dict()
	for model in models:
		for response in model['responses']:
			apt = response
			if not labelDistributions.has_key(apt):
				labelDistributions[apt] = []
			if not labelWeights.has_key(apt):
				labelWeights[apt] = []

			labelDistributions[apt].append(model['labelRefProbs'][response])
			labelWeights[apt].append(model['weight4labels'][response])

	finalLabelDistributions = dict()
	finalLabelWeights = dict()

	for apt in labelDistributions.keys():
		finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
	for apt in labelWeights.keys():
		finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)

	## convert the predicted distance probability matrix into a predicted contact matrix. 
	## Each predicted prob matrix has 3 dimensions while Each predicted contact matrix has 2 dimensions
	predictedContactMatrices = dict()
	from scipy.stats import norm
	for name, results in finalresults.iteritems():
		predictedContactMatrices[name] = dict()
		for response in results.keys():
			apt = Response2LabelName(response)
			labelType = Response2LabelType(response)

			if apt in config.allAtomPairTypes:
				if labelType.startswith('Discrete'):
					subType = labelType[len('Discrete'): ]
					labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
					predictedContactMatrices[name][apt] =  np.sum( finalresults[name][response][:, :, :labelOf8], axis=2)
				elif labelType.startswith('Normal'):
					assert labelType.startswith('Normal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(config.ContactDefinition)
				elif labelType.startswith('LogNormal'):
					assert labelType.startswith('LogNormal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(np.log(config.ContactDefinition) )
				else:
					print 'unsupported label type in response: ', response
					exit(-1)

			elif apt in ['HB', 'Beta']:
				predictedContactMatrices[name][apt] =  finalresults[name][response][:, :, 0]
			else:
				print 'unsupported atom type in response: ', response
				exit(-1)


	##write all the results here
	## for each protein, we have a output file, which deposits a tuple like (predicted distance probability, labelWeight, RefProbs, predicted contact matrix, distLabelType, sequence)
        ## we store distLabelType for future use
	for name, results in finalresults.iteritems():

		savefilename = name + '.predictedDistMatrix.pkl'
		if savefolder is not None:
			savefilename = os.path.join(savefolder, savefilename)

		fh = open(savefilename, 'wb')
		cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
		fh.close()

	return finalresults, predictedContactMatrices, allsequences

Example #6

Show file

File: PropertyUtils.py Project: zhujianwei31415/RaptorX-3DModeling

def EvaluatePropertyPrediction(predictions, nativefolder):

    from DataProcessor import LoadNativeLabels

    errors = dict()
    names = []
    for name, preds in predictions.iteritems():
        #print 'name=', name
        nativeLabels = LoadNativeLabels(name, nativefolder, preds.keys())
        if nativeLabels is None:
            continue

        names.append(name)

        for response, pred in preds.iteritems():
            native = nativeLabels[Response2LabelName(response)]
            missing = nativeLabels['Missing']

            if response.startswith('DISO'):
                numResidues = len(pred)
                totalError = sum([p != t for p, t in zip(pred, native)])
                tmpError = np.array([numResidues, totalError])

            elif response.startswith('SS'):
                numResidues = sum([m == 0 for m in missing])
                totalError = sum([
                    p != t for p, t, m in zip(pred, native, missing) if m == 0
                ])
                tmpError = np.array([numResidues, totalError])

            elif 'Phi' in response or 'Psi' in response:
                invalidResidues = [0] * len(missing)
                for i in xrange(len(missing)):
                    if missing[i] == 1:
                        invalidResidues[i] = 1
                        if i > 0:
                            invalidResidues[i - 1] = 1
                        if i < len(missing) - 1:
                            invalidResidues[i + 1] = 1

                invalidResidues[0] = 1
                invalidResidues[len(missing) - 1] = 1

                numResidues = sum([m == 0 for m in invalidResidues])
                err1 = abs(pred - native)
                err2 = np.float32(2 * np.pi) - err1
                err = np.minimum(err1, err2)
                totalError = np.sum(
                    [e for e, m in zip(err, invalidResidues) if m == 0],
                    axis=0)
                tmpError = np.array([numResidues] + list(totalError))
            else:
                print 'The Evaluate function not implemented for response: ', response
                exit(1)

            if errors.has_key(response):
                errors[response].append(tmpError)
            else:
                errors[response] = [tmpError]

    ## calculate average error
    avgErrPerTarget = dict()
    avgErrPerResidue = dict()
    allerrors = dict()
    for response, e in errors.iteritems():

        err = np.array(e)
        err_avg = np.average(err, axis=0)
        err2 = err_avg[1:] * 1. / err_avg[0]

        ind_err = np.divide(err[:, 1:] * 1.0, err[:, 0:1])
        err1 = np.average(ind_err, axis=0)

        avgErrPerTarget[response] = err1
        avgErrPerResidue[response] = err2
        """
		print '*********************Error for response ', response, '************************'
		print 'avg by target: ', err1, ' avg by residue: ', err2
		print '                            '
		print '*********************Individual Error for response ', response, '************************'
		"""
        allerrors[response] = dict()
        for name, e0 in zip(names, ind_err):
            ##print name, e0
            allerrors[response][name] = e0

    return avgErrPerTarget, avgErrPerResidue, allerrors

Example #7

Show file

File: TPLMergePredictedDistMatrix.py Project: poliu2s/RaptorX-3DModeling

def MergeOneProtein(inputFiles, method):

        if inputFiles is None or len(inputFiles) < 2:
                print 'Please provide at least two predicted matrices for merge'
                exit(-1)

        seqName = None
        sequence = None

        distProbs = dict()
        contactProbs = dict()
        labelDistributions = dict()
        labelWeights = dict()
        labelWeightFlags = []

	tempNames = []
        for inputFile in inputFiles:
                content = DistanceUtils.LoadRawDistProbFile(inputFile)

                name0, sequence0, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content

                ##add code here to check all the input files have the same protein name
		seqName0 = '-'.join(name0.split('-')[0:-1])
		tempName = name0.split('-')[-1]
		tempNames.append(tempName)

		labelWeightFlags.append( labelWeight is not None )

		if seqName is None:
			seqName = seqName0
		else:
			assert seqName == seqName0

                if sequence is None:
                        sequence = sequence0
                else:
                        assert sequence == sequence0


                for apt in predictedDistProb.keys():
                        if not distProbs.has_key(apt):
                                distProbs[apt] =[]
                        distProbs[apt].append( predictedDistProb[apt] )

                for apt in predictedContactProb.keys():
                        if not contactProbs.has_key(apt):
                                contactProbs[apt] = []
                        contactProbs[apt].append( predictedContactProb[apt] )

                if labelWeight is not None:
                        for apt in labelWeight.keys():
                                if not labelWeights.has_key(apt):
                                        labelWeights[apt] = []
                                labelWeights[apt].append( labelWeight[apt] )

                for apt in labelDistribution.keys():
                        if not labelDistributions.has_key(apt):
                                labelDistributions[apt] = []
                        labelDistributions[apt].append( labelDistribution[apt] )

        ## check consistency among labelWeightFlags
        consistent  = all( flag==labelWeightFlags[0] for flag in labelWeightFlags)
        if not consistent:
                print 'ERROR: the input matrix files have inconsistent format. Some have a labelWeight while others do not.'
                exit(-1)

        ### Ms is a dictionary, each value in Ms is a list of matrices
        ### this function calculates the geometric mean of all the matrices in the same list and the renormalize the last dim of the resultant mean

        def CalcGeometricMean( Ms ):
                result = dict()
                for apt, v in Ms.iteritems():
                        result[apt] = scipy.stats.mstats.gmean(v, axis=0)
                        tmp_sum = np.sum(result[apt], axis=-1, keepdims=True)
                        result[apt] = result[apt]/tmp_sum

                return result

        ## calculate arithmetic mean
        def CalcArithmeticMean( Ms ):
                result = dict()
                for apt, v in Ms.iteritems():
                        result[apt] = np.mean(v, axis=0)

                return result

        if method == 'amean':
                distMatrixProb = CalcArithmeticMean(distProbs)
                labelDistribution = CalcArithmeticMean(labelDistributions)
        else:
                distMatrixProb = CalcGeometricMean(distProbs)
                labelDistribution = CalcGeometricMean(labelDistributions)

	contactMatrixProb = dict()
	for k in distMatrixProb.keys():
		apt = Response2LabelName(k)
		labelType = Response2LabelType(k)

		if not labelType.startswith('Discrete'):
			print 'ERROR: this labelType currently not supported in TPLMergePredicteDistMatrix.py : ', labelType
			exit(-1)

		subType = labelType[ len('Discrete'): ]
		labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
		contactMatrixProb[apt] = ContactUtils.Distance2Contact(distMatrixProb[k], labelOf8)

        if labelWeightFlags[0] is True:
                labelWeight = CalcArithmeticMean(labelWeights)

	targetName = '-'.join( [ seqName ] + tempNames )
        if labelWeightFlags[0] is True:
                content4save = (targetName, sequence, distMatrixProb, contactMatrixProb, labelWeight, labelDistribution)
        else:
                content4save = (targetName, sequence, distMatrixProb, contactMatrixProb, None, labelDistribution)

        return contactMatrixProb, content4save

Example #8

Show file

def CalcLabelDistributionAndWeight(data=None, modelSpecs=None):
    """Compute the reference label distributions and the label weights.

    The results are stored in modelSpecs['labelRefProbs'] and
    modelSpecs['weight4labels'] (both keyed by response) and also returned.
    Along the way modelSpecs gains 'weight4range' (when absent),
    'weight4Discrete3C', 'weight4HB_Discrete2C', 'weight4Beta_Discrete2C' and
    'weight4continuous'.

    Args:
        data: iterable of dicts, each holding an 'atomLabelMatrix' dict keyed
            by response.
        modelSpecs: dict with at least 'responses'. Optional keys read here:
            'weight4range' (4 values), 'LRbias' (key into config.weight43C,
            'mid' when absent), 'UseBoundingBox4RefProbs' and 'maxbatchSize'.

    Returns:
        (modelSpecs['labelRefProbs'], modelSpecs['weight4labels']).

    A response whose label type is not Normal*, LogNormal* or Discrete* is
    reported and the process exits with status -1.
    """
    ## weight for different ranges (long, medium, short, and near-ranges)
    if 'weight4range' not in modelSpecs:
        modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape(
            (4, 1)).astype(np.float32)
    else:
        ## reshape/astype return new arrays, so the result has to be stored back;
        ## previously it was discarded and a caller-supplied weight kept its shape and dtype
        modelSpecs['weight4range'] = np.asarray(
            modelSpecs['weight4range']).reshape((4, 1)).astype(np.float32)
    print('weight for range: ', modelSpecs['weight4range'])

    ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15
    ## 'mid' is the bias used when modelSpecs has no 'LRbias'; reading the key
    ## directly in the print below used to raise KeyError in exactly that case
    lrBias = modelSpecs.get('LRbias', 'mid')
    modelSpecs['weight4Discrete3C'] = np.multiply(config.weight43C[lrBias],
                                                  modelSpecs['weight4range'])
    print('LRbias= ', lrBias, 'weight43C= ', modelSpecs['weight4Discrete3C'])

    ## weight for 2C
    modelSpecs['weight4HB_Discrete2C'] = np.multiply(
        config.weight4HB2C, modelSpecs['weight4range'])
    modelSpecs['weight4Beta_Discrete2C'] = np.multiply(
        config.weight4Beta2C, modelSpecs['weight4range'])

    ## weight for real value
    modelSpecs['weight4continuous'] = np.multiply(
        np.ones((4, 1), dtype=np.float32), modelSpecs['weight4range'])

    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        ## continuous labels only get a constant placeholder
        if labelType.startswith(('LogNormal', 'Normal')):
            allRefProbs[response] = np.ones((4, 1), dtype=np.float32)
            continue

        labelMatrices = [d['atomLabelMatrix'][response] for d in data]

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            croppedMatrices = []
            for lMatrix in labelMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                croppedMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
            labelMatrices = croppedMatrices
        else:
            labelMatrices = [m.astype(np.int32) for m in labelMatrices]

        allRefProbs[response] = DistanceUtils.CalcLabelProb(
            data=labelMatrices,
            numLabels=config.responseProbDims[labelType])

    modelSpecs['labelRefProbs'] = allRefProbs

    ##for discrete labels, we calculate their weights by inferring from the weight initialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        if labelType.startswith(('LogNormal', 'Normal')):
            ## just need to assign range weight
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if not labelType.startswith('Discrete'):
            print('unsupported response in CalcLabelDistributionAndWeight: ',
                  response)
            exit(-1)

        subType = labelType[len('Discrete'):]

        if subType.startswith('2C'):
            ## if the response is for HB and BetaPairing: use the weight prepared above under 'weight4' + response
            modelSpecs['weight4labels'][response] = modelSpecs['weight4' +
                                                               response]
        elif subType.startswith('3C'):
            ## if the response is 3C for normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4Discrete3C']
        else:
            ## calculate label weight for 12C, 25C, and 52C for the normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            modelSpecs['weight4labels'][
                response] = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])

    return modelSpecs['labelRefProbs'], modelSpecs['weight4labels']

Example #9

Show file

def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Build per-protein input features and label matrices from pickled records.

    Args:
        files: paths of the pickled feature files. Each file is expected to
            unpickle to a list of per-protein dicts.
        modelSpecs: dict of model settings. 'UsePSICOV' must be present; the
            other 'Use*' switches are optional. 'responses' is read whenever a
            record carries 'atomDistMatrix'.
        forTrainValidation: when True, every record must carry
            'atomDistMatrix' so that labels can be built.

    Returns:
        A list with one dict per protein holding 'name', 'sequence', 'seqLen',
        'seqFeatures' and 'matrixFeatures', plus 'embedFeatures' when embedding
        is enabled and 'atomLabelMatrix' (keyed by response) when the record
        has distance matrices.

    The process is terminated through exit(-1), after printing a message, when
    the file list is empty or a required feature/label is missing.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    ## NOTE(review): unpickling can execute arbitrary code, so only feature files from a trusted source shall be loaded here
    data = []
    for featureFile in files:
        ## the with-block guarantees the handle is closed even when unpickling fails
        with open(featureFile, 'rb') as fh:
            data.extend(cPickle.load(fh, encoding='latin1'))

    ## each protein has sequential and pairwise features as input and distance matrix as label
    proteinFeatures = []

    for counter, d in enumerate(data, 1):
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features, why?
        if modelSpecs.get('UseSS') is True:
            seqMatrices.append(d['SS3'])

        if modelSpecs.get('UseACC') is True:
            seqMatrices.append(d['ACC'])

        if modelSpecs.get('UsePSSM') is True:
            seqMatrices.append(d['PSSM'])

        if modelSpecs.get('UseDisorder') is True:
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        if modelSpecs.get('UseMPSpecificFeatures') is True:
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)
            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        useTemplate = bool(modelSpecs.get('UseTemplate'))
        if useTemplate:
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)
            seqMatrices.append(d['tplSimScore'])
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []
        ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 )
        pairfeatures.append(LocationFeature(d))
        pairfeatures.append(CubeRootFeature(d))

        if modelSpecs.get('UseCCM') is True:
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        ## NOTE(review): unlike the other switches, UsePSICOV is read without a presence check, so a modelSpecs lacking it raises KeyError
        if modelSpecs['UsePSICOV'] is True:
            pairfeatures.append(d['psicovZ'])

        if modelSpecs.get('UseOtherPairs') is True:
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification later
        ## somewhere we shall also write code to add template-related sequential features such as secondary structure?
        if useTemplate:
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))

            memorySaveMode = config.InTPLMemorySaveMode(modelSpecs)
            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                ## clamp short distances to 3.5, replace invalid (negative) entries by 50, then invert so that close pairs get the largest value
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if memorySaveMode:
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed order. This can avoid errors introduced by different ordering of keys in a python dict() structure
            for apt in ('CbCb', 'CgCg', 'CaCg', 'CaCa', 'NO'):
                pairfeatures.extend(tmpPairFeatures[apt])

        matrixFeature = np.dstack(tuple(pairfeatures))
        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = matrixFeature.astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            labelMatrices = dict()
            oneprotein['atomLabelMatrix'] = labelMatrices

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they are binary matrices
                    if responseName.startswith(('HB', 'Beta')):
                        labelMatrices[response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        labelMatrices[response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrices[response] = DistanceUtils.LogDistMatrix(
                        distm)

                elif labelType.startswith('Normal'):
                    labelMatrices[response] = distm
                else:
                    ## report the offending response (the loop variable), then stop
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures

Example #10

Show file

File: DataProcessor.py Project: zhujianwei31415/RaptorX-3DModeling

def LoadPropertyFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Build per-protein sequential features and property labels from pickles.

    Args:
        files: paths of the pickled feature files; the records unpickled from
            every file are concatenated.
        modelSpecs: dict of model settings. 'responses' is always read; the
            'UseSequenceEmbedding', 'UsePSFM', 'UseOneHotEncoding' and
            'UseTemplate' switches are optional.
        forTrainValidation: when True, each record must provide a label for
            every response as well as the 'Missing' entry.

    Returns:
        A list with one dict per protein holding 'name', 'sequence', 'seqLen',
        'seqFeatures', one entry per available label name and, when the record
        has it, 'missing'.

    The process is terminated through exit(1), after printing an error
    message, when the file list is empty or required data is absent.
    """
    if files is None or len(files) == 0:
        print('ERROR: the feature files is empty')
        exit(1)

    ## NOTE(review): unpickling can execute arbitrary code, so only feature files from a trusted source shall be loaded here
    data = []
    for infile in files:
        with open(infile, 'rb') as fh:
            data.extend(cPickle.load(fh))

    EmbeddingModel = None
    if 'UseSequenceEmbedding' in modelSpecs and modelSpecs[
            'UseSequenceEmbedding']:
        EmbeddingModelFile = os.path.join(
            os.environ['DL4PropertyPredHome'], 'data',
            'Mofrad-PLoSOne-2015Nov.3GramEmbeddingParams.pkl')
        EmbeddingModel = SequenceEmbedding.LoadEmbeddingParamsInPKL(
            EmbeddingModelFile)

    ## each protein has sequential features as input
    proteinFeatures = []

    for counter, d in enumerate(data, 1):
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ##collecting sequential features...
        seqMatrices = [d['PSSM']]

        ##Load sequence embedding features here
        if EmbeddingModel is not None:
            seqMatrices.append(
                SequenceEmbedding.EmbedOneSequence(d['sequence'],
                                                   EmbeddingModel))

        if 'UsePSFM' in modelSpecs and modelSpecs['UsePSFM']:
            seqMatrices.append(d['PSFM'])

        if 'UseOneHotEncoding' in modelSpecs and modelSpecs[
                'UseOneHotEncoding']:
            seqMatrices.append(config.SeqOneHotEncoding(d['sequence']))

        ## add template similarity score here
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            if 'tplSimScore' not in d:
                print(
                    'ERROR: no tplSimScore for target %s which is needed since you specify to use template information'
                    % (d['name'], ))
                exit(1)
            if d['tplSimScore'].shape[1] != 10:
                print(
                    'ERROR: the number of query-template similarity features is not 10 in data for %s'
                    % (d['name'], ))
                exit(1)

            if 'tplProperties' not in d:
                print(
                    'ERROR: no tplProperties for target %s which is needed since you specify to use template information'
                    % (d['name'], ))
                exit(1)

            if d['tplProperties'].shape[1] < 15:
                print(
                    'ERROR: #template local structure properties shall be at least 15 for target %s'
                    % (d['name'], ))
                exit(1)

            ## the query-template similarity score shall be arranged in the order of: AA identity (binary), blosum80, blosum62, blosum45, spScore, spScore_ST, ppScore, pmScore, cc, hdsm
            seqMatrices.append(d['tplSimScore'])

            ##we do not use omg information from the template, the first 8 features shall be the 8-state secondary structure, then followed by pACC, CNa, CNb, Phi, Psi, Theta and Tau
            tplProperties = d['tplProperties']
            seqMatrices.append(tplProperties[:, :8])
            ## one extra slice of template properties is appended per response, in the order of modelSpecs['responses']
            for r in modelSpecs['responses']:
                if r.startswith('ACC'):
                    seqMatrices.append(tplProperties[:, 8:9])
                elif r.startswith(('Phi', 'Psi', 'CLE')):
                    seqMatrices.append(tplProperties[:, 11:13])
                elif r.startswith(('Theta', 'Tau')):
                    seqMatrices.append(tplProperties[:, 13:15])
                elif r.startswith(('CNa', 'CNb')):
                    seqMatrices.append(tplProperties[:, 9:11])
                else:
                    print('ERROR: unsupported response %s' % (r, ))
                    exit(1)

        if 'otherSeqFeatures' in d:
            seqMatrices.append(d['otherSeqFeatures'])

        ## all the features shall have shape (seqLen, nFeatures) where nFeatures is variable, but seqLen is the sequence length of one protein
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature

        ## fall back to the 'Missing' entry as the disorder label when 'DISO' is absent (this writes into the loaded record)
        if 'DISO' not in d and 'Missing' in d:
            d['DISO'] = d['Missing']

        ##collecting labels...
        for r in modelSpecs['responses']:
            labelName = Response2LabelName(r)
            labelType = Response2LabelType(r)

            if labelName not in d:
                if forTrainValidation:
                    print(
                        'ERROR: missing label information for protein  %s  and response  %s'
                        % (d['name'], r))
                    exit(1)
                ## labels are optional outside training/validation
                continue

            labels = d[labelName]

            ## need some special handling of discrete labels
            if labelType.startswith('Discrete'):
                if r.startswith('SS3'):
                    labels = np.array([
                        PropertyUtils.SS3Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith('SS8'):
                    labels = np.array([
                        PropertyUtils.SS8Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith(('ACC', 'DISO')):
                    labels = labels.reshape((-1, 1))
                elif r.startswith('CLE'):
                    labels = np.array([
                        PropertyUtils.CLELetter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                else:
                    print(
                        'ERROR: please specify how to convert your discrete labels to numbers for response  %s'
                        % (r, ))
                    exit(1)

            oneprotein[labelName] = labels

        ##at this point, finish collecting features and labels for one protein
        if 'Missing' in d:
            oneprotein['missing'] = d['Missing']
        elif forTrainValidation:
            print(
                'ERROR: for training data, we need information to specify which residues have no 3D coordinates'
            )
            exit(1)

        proteinFeatures.append(oneprotein)

        if (counter % 500 == 1):
            print('assembled features and labels for  %s  proteins.' %
                  (counter, ))

    return proteinFeatures

Example #11

Show file

File: DataProcessor.py Project: zhujianwei31415/RaptorX-3DModeling

def AssembleOneBatch(data, modelSpecs, forTrainValidation=True):
    """Pack a list of per-protein records into padded batch arrays.

    Args:
        data: list of per-protein dicts providing 'seqLen' and 'seqFeatures'
            and, for training/validation, the label entries and 'missing'.
        modelSpecs: dict of model settings; 'responses' is always read and
            'w4diso' is read for responses starting with 'DISO'.
        forTrainValidation: when True, label arrays and weight arrays are
            appended to the batch.

    Returns:
        None when data is empty; otherwise the list [X1d, M1d] followed by one
        label array per response and then one weight array per response (the
        label/weight arrays are present only when forTrainValidation is True).

    The process is terminated through exit(1) when, for training/validation,
    the first record lacks a label or the 'missing' entry.
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]),
                   dtype=theano.config.floatX)
    ## the mask only spans the first maxSeqLen - minSeqLen positions, i.e., those that are padding for at least one sequence
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)

    ## Y shall be a list of labels, each for one type
    ##we always need a weight vector to deal with residues without 3D coordinates in training and validation
    Y = []
    W = []
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        labelName = Response2LabelName(res)

        dataType = (np.int32 if labelType.startswith('Discrete') else
                    theano.config.floatX)
        if forTrainValidation:
            ## only the first record is inspected; data is a list, so the protein name is read from that record
            if labelName not in data[0]:
                print(
                    'ERROR: label information is needed for training protein  %s  and response  %s'
                    % (data[0]['name'], res))
                exit(1)
            Y.append(
                np.zeros(shape=(numSeqs, maxSeqLen,
                                config.responseValueDims[labelType]),
                         dtype=dataType))

            if 'missing' not in data[0]:
                print(
                    'ERROR: missing information is needed for training protein  %s'
                    % (data[0]['name'], ))
                exit(1)

            W.append(
                np.zeros(shape=(numSeqs, maxSeqLen, 1),
                         dtype=theano.config.floatX))

    for j, record in enumerate(data):
        ## sequences are right-aligned: padding sits at the front of each row
        offset = maxSeqLen - record['seqLen']
        X1d[j, offset:, :] = record['seqFeatures']
        M1d[j, offset:].fill(1)

        for y, w, res in zip(Y, W, modelSpecs['responses']):
            y[j, offset:] = record[Response2LabelName(res)]

            missing = np.reshape(record['missing'], (-1, 1))
            if res.startswith('DISO'):
                ## for disorder prediction, all the residues shall be considered since those residues without 3D coordinates are positive examples
                ## we may assign a larger weight to positive examples since they are only 6% of the whole data set
                w[j, offset:] = missing * (modelSpecs['w4diso'] - 1.) + 1.
            else:
                ## assign weight 0 to those residues without coordinates, otherwise 1
                w[j, offset:] = 1.0 - missing

    onebatch = [X1d, M1d]
    onebatch.extend(Y)
    onebatch.extend(W)

    return onebatch

Example #12

Show file

def PredictMatrixLabels(models,
                        predictors,
                        names,
                        inputFolders,
                        aliFolders=None,
                        tplFolder=None,
                        aliFile=None,
                        tplFile=None,
                        saveFolder=None):
    """Predict matrix labels with several models and average them per protein.

    Args:
        models: model specifications; each provides 'responses'.
        predictors: one (predict function, input variables) pair per model.
        names: a list/tuple of protein names, or a single target name.
        inputFolders, aliFolders, tplFolder, aliFile, tplFile: forwarded to the
            data-loading helpers. When names is a single target and both
            aliFile and tplFile are given, one alignment is loaded.
        saveFolder: folder receiving the result files; when None the files are
            written relative to the current directory.

    Returns:
        The tuple (predictedContactMatrices, allsequences).

    Side effects:
        For every protein a file '<name>.predictedDistMatrix.pkl' is written
        holding (original name, sequence, averaged predictions, contact
        matrices, label weights, label distributions). The process is
        terminated through exit(1) when the same protein shows up with
        different sequences.
    """
    ## a scalar names argument means we work on one single target
    targetName = None if isinstance(names, (list, tuple)) else names

    ##allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices
    ##We predict one prob_matrix by each model for each protein and each response and then average them per protein and response to get the final results
    ##two different models may share common responses
    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict()  ## count the number of models that may predict each response

    for model, predictor in zip(models, predictors):
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders,
                                               aliFolders, tplFolder)
        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName,
                                                inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName,
                                                  inputFolders, aliFolders,
                                                  tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(
            rawData,
            modelSpecs=model,
            forTrainValidation=False,
            returnMode='list')

        ##make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if name not in allresults:
                allresults[name] = dict()
                numModels[name] = dict()

            if name not in allsequences:
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print(
                    'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                )
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData,
                                                      numDataPoints=624,
                                                      modelSpecs=model)
        print('#predData:  %s #batches:  %s' %
              (len(predData), len(predSeqData)))

        ## the start and end positions of each response in the last dimension of the prediction depend only on the model
        dims = [
            GetResponseProbDims(response) for response in model['responses']
        ]
        endPositions = np.cumsum(dims)
        startPositions = endPositions - dims

        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
                minibatch, model)
            batchInputs = onebatch[:len(inputVariables)]
            result = predict(*batchInputs)
            ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
            assert result.shape[3] == sum(dims)

            ## the first and third inputs are the sequential features and their mask
            x1d, x1dmask = batchInputs[0], batchInputs[2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'], startPositions,
                                            endPositions):

                ## batchres is a batch of result, its ndim=4
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end]
                ## remove masked positions
                revised_batchres = [
                    probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                    for probMatrix, seqLen in zip(batchres, seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if response not in allresults[name]:
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result, which is the average of predicted prob matrices by all models for the same protein and the same response
    finalresults = dict()
    for name, results in allresults.items():
        averagedResults = dict()
        finalresults[name] = averagedResults

        for response in results:
            averaged = (results[response] /
                        numModels[name][response]).astype(np.float32)

            ##make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                averaged = (averaged + np.transpose(averaged, (1, 0, 2))) / 2.

            averagedResults[response] = averaged

    ## convert predicted distance probability matrix into contact matrix
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(
        models)

    ##write all the results here
    ## for each protein, we have a output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.items():

        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            ## pick the first requested name that prefixes the result name; fall back to the result name itself so that a value left over from the previous protein is never reused
            originalName = next((n for n in names if name.startswith(n)),
                                name)

        with open(savefilename, 'wb') as fh:
            cPickle.dump((originalName, allsequences[name], results,
                          predictedContactMatrices[name], finalLabelWeights,
                          finalLabelDistributions),
                         fh,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)
    """