def CalcRefState4OneBatch(batch, modelSpecs, minSeqSep=3):
    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        name = Response2LabelName(response)
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in batch
        ]

    ## calculate the discrete label distribution
    allRefProbs = dict()
    avgLen = None  ## set by CalcLabelProb below; stays None if all responses are continuous
    for response in modelSpecs['responses']:
        name = Response2LabelName(response)
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            allRefProbs[response] = np.array([1.]).astype(np.float32)
            continue

        if modelSpecs.has_key('UseBoundingBox4RefProbs') and (
                modelSpecs['UseBoundingBox4RefProbs'] is True):
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            newLabelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                new_lMatrix = lMatrix[bounds[0]:bounds[2],
                                      bounds[1]:bounds[3]].astype(np.int32)
                newLabelMatrices.append(new_lMatrix)
            allRefProbs[response], avgLen = CalcLabelProb(
                labelMatrices=newLabelMatrices,
                numLabels=config.responseProbDims[labelType],
                minSeqSep=minSeqSep)
        else:
            allRefProbs[response], avgLen = CalcLabelProb(
                labelMatrices=[
                    m.astype(np.int32) for m in allLabelMatrices[response]
                ],
                numLabels=config.responseProbDims[labelType],
                minSeqSep=minSeqSep)

    return allRefProbs, avgLen
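## For reference: CalcLabelProb itself is not shown in this snippet. Below is a
## minimal, self-contained sketch (an assumption, not the original code) of what
## such a routine computes: the frequency of each discrete label among residue
## pairs with sequence separation >= minSeqSep, plus the average sequence
## length. Names ending in "Sketch" are hypothetical.
import numpy as np

def CalcLabelProbSketch(labelMatrices, numLabels, minSeqSep=3):
    counts = np.zeros(numLabels, dtype=np.float64)
    totalLen = 0
    for m in labelMatrices:
        L = m.shape[0]
        totalLen += L
        ## keep only entries with sequence separation >= minSeqSep
        sep = np.abs(np.arange(L)[:, None] - np.arange(L)[None, :])
        valid = m[sep >= minSeqSep]
        ## ignore negative labels (e.g., placeholders for invalid entries)
        valid = valid[valid >= 0]
        counts += np.bincount(valid, minlength=numLabels)[:numLabels]
    probs = counts / max(counts.sum(), 1.)
    avgLen = totalLen * 1. / max(len(labelMatrices), 1)
    return probs.astype(np.float32), avgLen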
def EvaluateSinglePropertyPrediction(prediction, nativeLabelFile):

    from DataProcessor import LoadNativeLabelsFromFile

    errors = dict()
    nativeLabels = LoadNativeLabelsFromFile(nativeLabelFile)

    for response, pred in prediction.iteritems():
        native = nativeLabels[Response2LabelName(response)]
        missing = nativeLabels['Missing']

        if response.startswith('DISO'):
            numResidues = len(pred)
            totalError = sum([p != t for p, t in zip(pred, native)])
            tmpError = np.array([numResidues, totalError])

        elif response.startswith('SS'):
            numResidues = sum([m == 0 for m in missing])
            totalError = sum(
                [p != t for p, t, m in zip(pred, native, missing) if m == 0])
            tmpError = np.array([numResidues, totalError])

        elif 'Phi' in response or 'Psi' in response:
            ## a residue is invalid for dihedral-angle evaluation if it or one
            ## of its sequence neighbors has no 3D coordinates, since Phi/Psi
            ## are defined by atoms of adjacent residues
            invalidResidues = [0] * len(missing)
            for i in xrange(len(missing)):
                if missing[i] == 1:
                    invalidResidues[i] = 1
                    if i > 0:
                        invalidResidues[i - 1] = 1
                    if i < len(missing) - 1:
                        invalidResidues[i + 1] = 1

            ## the first residue has no Phi and the last residue has no Psi
            invalidResidues[0] = 1
            invalidResidues[len(missing) - 1] = 1

            numResidues = sum([m == 0 for m in invalidResidues])
            ## angular error on a circle: take the shorter of the two arcs
            err1 = abs(pred - native)
            err2 = np.float32(2 * np.pi) - err1
            err = np.minimum(err1, err2)
            totalError = np.sum(
                [e for e, m in zip(err, invalidResidues) if m == 0], axis=0)
            tmpError = np.array([numResidues] + list(totalError))
        else:
            print 'The Evaluate function not implemented for response: ', response
            exit(1)

        if errors.has_key(response):
            errors[response].append(tmpError)
        else:
            errors[response] = [tmpError]

    ## calculate the average error per valid residue; each entry of err is
    ## np.array([numResidues, totalError...]), so divide the error columns by
    ## the pooled residue count
    avgerrors = dict()
    for response, err in errors.iteritems():
        err_sum = np.sum(np.array(err), axis=0)
        avgerrors[response] = err_sum[1:] * 1. / err_sum[0]

    return avgerrors
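## A quick sanity check of the circular angle error above: the raw difference
## |pred - native| can exceed pi when two angles straddle the -pi/pi boundary,
## in which case the complementary arc 2*pi - |pred - native| is the true error.
import numpy as np

pred = np.array([3.0, -3.0], dtype=np.float32)
native = np.array([-3.0, 3.0], dtype=np.float32)
err1 = abs(pred - native)               ## 6.0 for both entries
err2 = np.float32(2 * np.pi) - err1     ## ~0.283, the short way around the circle
print(np.minimum(err1, err2))           ## ~[0.283, 0.283]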
## enclosing method (name inferred; the snippet uses self and zList from an outer scope)
def TopAccuracyByRange(self, zList):
    def EvaluateAccuracy(pred_prob, truth, pad_len):
        pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                       dtype=theano.config.floatX)
        truth_in_correct_shape = truth[pad_len:, pad_len:]

        labelType = Response2LabelType(currentResponse)
        atomType = Response2LabelName(currentResponse)
        symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

        if labelType.startswith('LogNormal'):
            return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                        truth=truth_in_correct_shape,
                                        symmetric=symmetric)
        elif labelType.startswith('Normal'):
            return TopAccuracyNormal(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
        elif labelType.startswith('Discrete'):
            subType = labelType[len('Discrete'):]
            if subType.startswith('2C'):
                return TopAccuracy2C(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
            else:
                return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         subType=subType,
                                         symmetric=symmetric)
        else:
            print('unsupported label type in EvaluateAccuracy: ', labelType)
            exit(-1)

    ## the code below belongs to the enclosing method, not to EvaluateAccuracy
    accuracyList = []
    for res, out_prob, z, ratio in zip(self.responses,
                                       self.output_probList, zList,
                                       self.modelSpecs['topRatios']):
        ## currently TopAccuracy only works when the dimension of each z is 3
        assert z.ndim == 3
        if self.mask_1d is not None:
            paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d, axis=1)
        else:
            paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)
        ## EvaluateAccuracy reads currentResponse through its closure when the scan graph is built
        currentResponse = res
        topRatio = ratio
        ## here we use scan to calculate accuracy for each protein
        result, updates = theano.scan(fn=EvaluateAccuracy,
                                      outputs_info=None,
                                      sequences=[out_prob, z, paddingLens])
        accuracy = T.mean(result, axis=0)
        accuracyList.append(accuracy)

    return T.stacklists(accuracyList)
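## theano.scan above applies EvaluateAccuracy once per protein: with
## sequences=[out_prob, z, paddingLens], scan iterates over the first (batch)
## dimension of each tensor in lockstep and stacks the per-step results.
## A toy analogue (assumes Theano is installed):
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
## square each row of x; scan walks over the first dimension of x
rows_squared, _ = theano.scan(fn=lambda row: row ** 2, sequences=[x])
f = theano.function([x], rows_squared)
print(f(np.arange(6, dtype=theano.config.floatX).reshape(3, 2)))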
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
	## load all the models from the files. Each file contains the specification of one model.
	models = []
	for mFile in modelFiles:
		fh = open(mFile, 'rb')
		model = cPickle.load(fh)
		fh.close()
		models.append(model)

	## check consistency among models. All the models shall have the same labelType for the same atom pair type
	labelTypes = dict()
	for model in models:
		for response in model['responses']:
			labelName = Response2LabelName(response)
			labelType = Response2LabelType(response)
			if not labelTypes.has_key(labelName):
				labelTypes[labelName] = labelType
			elif labelTypes[labelName] != labelType:
				print 'WARNING: at least two models have different label types for the same atom pair type.'
				exit(-1)
					

	allsequences = dict()

	##allresults shall be a nested dictionary, e.g., allresults[proteinName][response] = list of predicted_prob_matrices
	##We predict one prob_matrix from each model for each protein and each response
	## two different models may share some overlapping responses.

	allresults = dict()
	numModels = dict()
	for model, mfile in zip(models, modelFiles):
		if model['network'] not in config.allNetworks:
			print 'unsupported network architecture: ', model['network']
			exit(-1)

		distancePredictor, x, y, xmask, ymask, xem, labelList, weightList = Model4DistancePrediction.BuildModel(model, forTrain=False)

		inputVariables = [ x, y, xmask, ymask]
		if xem is not None:
			inputVariables.append(xem)

		pred_prob = distancePredictor.output_prob
		predict = theano.function(inputVariables, pred_prob, on_unused_input='warn')

		## set model parameter values
		if not Compatible(distancePredictor.params, model['paramValues']):
			print 'FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file: ', mfile
			exit(-1)

		[ p.set_value(v) for p, v in zip(distancePredictor.params, model['paramValues']) ]

		## We shall load these files for each model separately since each model may have different requirement of the data
		predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs = model, forTrainValidation=False)

		##make sure the input has the same number of features as the model. We do a random check here to speed things up
		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

		if predData[0].has_key('embedFeatures'):
			rindex = np.random.randint(0, high=len(predData) )
			assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

		## check if all the proteins of the same name have exactly the same sequence
		for d in predData:
			if not allsequences.has_key(d['name']):
				allsequences[d['name']] = d['sequence']
			elif allsequences[d['name']] != d['sequence']:
				print 'Error: inconsistent primary sequence for the same protein in the protein feature files'
				exit(-1)
			
		## predSeqData and names are in exactly the same order, so we know which data is for which protein
		predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
		print '#predData: ', len(predData), '#batches: ', len(predSeqData)

		for onebatch, names4onebatch in zip(predSeqData, names):
			input = onebatch[ : len(inputVariables) ]
			result = predict(*input)

			x1d, x2d, x1dmask, x2dmask = input[0:4]
			seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
			maxSeqLen = x1d.shape[1]

			##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
			assert result.shape[3] == sum( [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ] )

			## calculate the start and end positions of each response in the last dimension of result
			dims = [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ]
			endPositions = np.cumsum(dims)
			startPositions = endPositions - dims

			for name in names4onebatch:
				if not allresults.has_key(name):
					allresults[name]=dict() 
					numModels[name] =dict()

			## batchres is a batch of result, its ndim=4
			for response, start, end in zip(model['responses'], startPositions, endPositions):

				## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are contact/distance matrix sizes and the 4th is for the predicted probability parameters
				batchres = result[:, :, :, start:end ]


				## remove masked positions
				revised_batchres = [ probMatrix[ maxSeqLen-seqLen:, maxSeqLen-seqLen:, : ] for probMatrix, seqLen in zip(batchres, seqLens) ]

				for res4one, name in zip(revised_batchres, names4onebatch):
					if not allresults[name].has_key(response):
						allresults[name][response] = res4one
						numModels[name][response] = np.int32(1)
					else:
						## we save only the sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
						allresults[name][response] += res4one
						numModels[name][response] += np.int32(1)


		del predict
		del predData
		del predSeqData
		gc.collect()


	## calculate the final result, which is the average of all the predicted prob matrices for the same protein and response
	finalresults = dict()
	for name, results in allresults.iteritems():
		if not finalresults.has_key(name):
			finalresults[name] = dict()

		## each finalresults[name][response] has 3 dimensions: (seqLen, seqLen, numProbParams)
		for response in results.keys():
			#finalresults[name][response] = np.average(allresults[name][response], axis=0)
			finalresults[name][response] = allresults[name][response]/numModels[name][response]

			##make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
			apt = Response2LabelName(response)
			if config.IsSymmetricAPT( apt ):
				finalresults[name][response] = (finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2) ) )/2.

	## collect the average label distributions and weight matrix. We collect all the matrices and then calculate their average.
	labelDistributions = dict()
	labelWeights = dict()
	for model in models:
		for response in model['responses']:
			apt = response
			if not labelDistributions.has_key(apt):
				labelDistributions[apt] = []
			if not labelWeights.has_key(apt):
				labelWeights[apt] = []

			labelDistributions[apt].append(model['labelRefProbs'][response])
			labelWeights[apt].append(model['weight4labels'][response])

	finalLabelDistributions = dict()
	finalLabelWeights = dict()

	for apt in labelDistributions.keys():
		finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
	for apt in labelWeights.keys():
		finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)

	## convert the predicted distance probability matrix into a predicted contact matrix.
	## each predicted prob matrix has 3 dimensions while each predicted contact matrix has 2 dimensions
	predictedContactMatrices = dict()
	from scipy.stats import norm
	for name, results in finalresults.iteritems():
		predictedContactMatrices[name] = dict()
		for response in results.keys():
			apt = Response2LabelName(response)
			labelType = Response2LabelType(response)

			if apt in config.allAtomPairTypes:
				if labelType.startswith('Discrete'):
					subType = labelType[len('Discrete'): ]
					labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
					predictedContactMatrices[name][apt] =  np.sum( finalresults[name][response][:, :, :labelOf8], axis=2)
				elif labelType.startswith('Normal'):
					assert labelType.startswith('Normal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(config.ContactDefinition)
				elif labelType.startswith('LogNormal'):
					assert labelType.startswith('LogNormal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(np.log(config.ContactDefinition) )
				else:
					print 'unsupported label type in response: ', response
					exit(-1)

			elif apt in ['HB', 'Beta']:
				predictedContactMatrices[name][apt] =  finalresults[name][response][:, :, 0]
			else:
				print 'unsupported atom type in response: ', response
				exit(-1)


	##write all the results here
	## for each protein, we write one output file, which deposits a tuple
	## (name, sequence, predicted distance probability, predicted contact matrix, labelWeight, labelDistribution)
	for name, results in finalresults.iteritems():

		savefilename = name + '.predictedDistMatrix.pkl'
		if savefolder is not None:
			savefilename = os.path.join(savefolder, savefilename)

		fh = open(savefilename, 'wb')
		cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
		fh.close()

	return finalresults, predictedContactMatrices, allsequences
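## Sketch of the distance-to-contact conversion used above for the Normal /
## LogNormal responses: the predicted contact probability is the CDF of the
## per-pair distance distribution evaluated at the contact cutoff
## (config.ContactDefinition; a value of 8 Angstrom is assumed here).
import numpy as np
from scipy.stats import norm

mean = np.array([[6.0, 12.0], [12.0, 6.0]])      ## toy predicted means
sigma = np.array([[1.5, 2.0], [2.0, 1.5]])       ## toy predicted deviations
contactCutoff = 8.0                              ## assumed config.ContactDefinition
contactProb = norm(loc=mean, scale=sigma).cdf(contactCutoff)
print(contactProb)   ## close to 1 where mean << cutoff, close to 0 where mean >> cutoff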
def EvaluatePropertyPrediction(predictions, nativefolder):

    from DataProcessor import LoadNativeLabels

    errors = dict()
    names = []
    for name, preds in predictions.iteritems():
        #print 'name=', name
        nativeLabels = LoadNativeLabels(name, nativefolder, preds.keys())
        if nativeLabels is None:
            continue

        names.append(name)

        for response, pred in preds.iteritems():
            native = nativeLabels[Response2LabelName(response)]
            missing = nativeLabels['Missing']

            if response.startswith('DISO'):
                numResidues = len(pred)
                totalError = sum([p != t for p, t in zip(pred, native)])
                tmpError = np.array([numResidues, totalError])

            elif response.startswith('SS'):
                numResidues = sum([m == 0 for m in missing])
                totalError = sum([
                    p != t for p, t, m in zip(pred, native, missing) if m == 0
                ])
                tmpError = np.array([numResidues, totalError])

            elif 'Phi' in response or 'Psi' in response:
                ## mark residues whose dihedral angles are undefined: residues
                ## adjacent to a missing residue, plus the two chain termini
                invalidResidues = [0] * len(missing)
                for i in xrange(len(missing)):
                    if missing[i] == 1:
                        invalidResidues[i] = 1
                        if i > 0:
                            invalidResidues[i - 1] = 1
                        if i < len(missing) - 1:
                            invalidResidues[i + 1] = 1

                invalidResidues[0] = 1
                invalidResidues[len(missing) - 1] = 1

                numResidues = sum([m == 0 for m in invalidResidues])
                err1 = abs(pred - native)
                err2 = np.float32(2 * np.pi) - err1
                err = np.minimum(err1, err2)
                totalError = np.sum(
                    [e for e, m in zip(err, invalidResidues) if m == 0],
                    axis=0)
                tmpError = np.array([numResidues] + list(totalError))
            else:
                print 'The Evaluate function not implemented for response: ', response
                exit(1)

            if errors.has_key(response):
                errors[response].append(tmpError)
            else:
                errors[response] = [tmpError]

    ## calculate the average error in two ways: per target (each target
    ## normalized by its own residue count) and per residue (counts pooled
    ## over all targets)
    avgErrPerTarget = dict()
    avgErrPerResidue = dict()
    allerrors = dict()
    for response, e in errors.iteritems():

        err = np.array(e)
        err_avg = np.average(err, axis=0)
        err2 = err_avg[1:] * 1. / err_avg[0]

        ind_err = np.divide(err[:, 1:] * 1.0, err[:, 0:1])
        err1 = np.average(ind_err, axis=0)

        avgErrPerTarget[response] = err1
        avgErrPerResidue[response] = err2
        """
		print '*********************Error for response ', response, '************************'
		print 'avg by target: ', err1, ' avg by residue: ', err2
		print '                            '
		print '*********************Individual Error for response ', response, '************************'
		"""
        allerrors[response] = dict()
        for name, e0 in zip(names, ind_err):
            ##print name, e0
            allerrors[response][name] = e0

    return avgErrPerTarget, avgErrPerResidue, allerrors
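## The two averages above differ whenever targets have different lengths:
## avgErrPerTarget normalizes each target by its own residue count before
## averaging, while avgErrPerResidue pools residues across targets.
import numpy as np

## each row is [numResidues, totalError] for one target
err = np.array([[10., 2.], [100., 10.]])
perTarget = np.average(np.divide(err[:, 1:], err[:, 0:1]), axis=0)  ## (0.2 + 0.1) / 2 = 0.15
err_avg = np.average(err, axis=0)                                   ## [55., 6.]
perResidue = err_avg[1:] / err_avg[0]                               ## 6 / 55 ~ 0.109
print(perTarget)
print(perResidue)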
def MergeOneProtein(inputFiles, method):

        if inputFiles is None or len(inputFiles) < 2:
                print 'Please provide at least two predicted matrices for merge'
                exit(-1)

        seqName = None
        sequence = None

        distProbs = dict()
        contactProbs = dict()
        labelDistributions = dict()
        labelWeights = dict()
        labelWeightFlags = []

        tempNames = []
        for inputFile in inputFiles:
                content = DistanceUtils.LoadRawDistProbFile(inputFile)

                name0, sequence0, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content

                ## check that all the input files refer to the same protein
                seqName0 = '-'.join(name0.split('-')[0:-1])
                tempName = name0.split('-')[-1]
                tempNames.append(tempName)

                labelWeightFlags.append(labelWeight is not None)

                if seqName is None:
                        seqName = seqName0
                else:
                        assert seqName == seqName0

                if sequence is None:
                        sequence = sequence0
                else:
                        assert sequence == sequence0


                for apt in predictedDistProb.keys():
                        if not distProbs.has_key(apt):
                                distProbs[apt] =[]
                        distProbs[apt].append( predictedDistProb[apt] )

                for apt in predictedContactProb.keys():
                        if not contactProbs.has_key(apt):
                                contactProbs[apt] = []
                        contactProbs[apt].append( predictedContactProb[apt] )

                if labelWeight is not None:
                        for apt in labelWeight.keys():
                                if not labelWeights.has_key(apt):
                                        labelWeights[apt] = []
                                labelWeights[apt].append( labelWeight[apt] )

                for apt in labelDistribution.keys():
                        if not labelDistributions.has_key(apt):
                                labelDistributions[apt] = []
                        labelDistributions[apt].append( labelDistribution[apt] )

        ## check consistency among labelWeightFlags
        consistent  = all( flag==labelWeightFlags[0] for flag in labelWeightFlags)
        if not consistent:
                print 'ERROR: the input matrix files have inconsistent format. Some have a labelWeight while others do not.'
                exit(-1)

        ### Ms is a dictionary; each value in Ms is a list of matrices.
        ### This function calculates the geometric mean of all the matrices in the same list and then renormalizes the last dim of the resultant mean.

        def CalcGeometricMean( Ms ):
                result = dict()
                for apt, v in Ms.iteritems():
                        result[apt] = scipy.stats.mstats.gmean(v, axis=0)
                        tmp_sum = np.sum(result[apt], axis=-1, keepdims=True)
                        result[apt] = result[apt]/tmp_sum

                return result

        ## calculate arithmetic mean
        def CalcArithmeticMean( Ms ):
                result = dict()
                for apt, v in Ms.iteritems():
                        result[apt] = np.mean(v, axis=0)

                return result

        if method == 'amean':
                distMatrixProb = CalcArithmeticMean(distProbs)
                labelDistribution = CalcArithmeticMean(labelDistributions)
        else:
                distMatrixProb = CalcGeometricMean(distProbs)
                labelDistribution = CalcGeometricMean(labelDistributions)

        contactMatrixProb = dict()
        for k in distMatrixProb.keys():
                apt = Response2LabelName(k)
                labelType = Response2LabelType(k)

                if not labelType.startswith('Discrete'):
                        print 'ERROR: this labelType currently not supported in TPLMergePredicteDistMatrix.py : ', labelType
                        exit(-1)

                subType = labelType[len('Discrete'):]
                labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
                contactMatrixProb[apt] = ContactUtils.Distance2Contact(distMatrixProb[k], labelOf8)

        if labelWeightFlags[0] is True:
                labelWeight = CalcArithmeticMean(labelWeights)

        targetName = '-'.join([seqName] + tempNames)
        if labelWeightFlags[0] is True:
                content4save = (targetName, sequence, distMatrixProb, contactMatrixProb, labelWeight, labelDistribution)
        else:
                content4save = (targetName, sequence, distMatrixProb, contactMatrixProb, None, labelDistribution)

        return contactMatrixProb, content4save
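## The geometric-mean merge above, in isolation: average two probability
## matrices multiplicatively, then renormalize the last (label) dimension.
import numpy as np
from scipy.stats import mstats

p1 = np.array([[[0.7, 0.3]]])    ## toy (1, 1, 2) probability matrices
p2 = np.array([[[0.5, 0.5]]])
g = mstats.gmean([p1, p2], axis=0)
g = g / np.sum(g, axis=-1, keepdims=True)
print(g)   ## ~[[[0.604, 0.396]]]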
def CalcLabelDistributionAndWeight(data=None, modelSpecs=None):
    ## weight for different ranges (long, medium, short, and near-ranges)
    if 'weight4range' not in modelSpecs:
        modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape(
            (4, 1)).astype(np.float32)
    else:
        modelSpecs['weight4range'] = modelSpecs['weight4range'].reshape(
            (4, 1)).astype(np.float32)
    print('weight for range: ', modelSpecs['weight4range'])

    ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15
    if 'LRbias' in modelSpecs:
        modelSpecs['weight4Discrete3C'] = np.multiply(
            config.weight43C[modelSpecs['LRbias']], modelSpecs['weight4range'])
    else:
        modelSpecs['weight4Discrete3C'] = np.multiply(
            config.weight43C['mid'], modelSpecs['weight4range'])
    print('LRbias= ', modelSpecs.get('LRbias', 'mid'), 'weight43C= ',
          modelSpecs['weight4Discrete3C'])

    ## weight for 2C
    modelSpecs['weight4HB_Discrete2C'] = np.multiply(
        config.weight4HB2C, modelSpecs['weight4range'])
    modelSpecs['weight4Beta_Discrete2C'] = np.multiply(
        config.weight4Beta2C, modelSpecs['weight4range'])

    ## weight for real value
    modelSpecs['weight4continuous'] = np.multiply(
        np.array([1.] * 4).reshape((4, 1)).astype(np.float32),
        modelSpecs['weight4range'])

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        name = Response2LabelName(response)
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in data
        ]

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        name = Response2LabelName(response)
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            allRefProbs[response] = np.array([1.] * 4).reshape(
                (4, 1)).astype(np.float32)
            continue

        if 'UseBoundingBox4RefProbs' in modelSpecs and (
                modelSpecs['UseBoundingBox4RefProbs'] is True):
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            newLabelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                new_lMatrix = lMatrix[bounds[0]:bounds[2],
                                      bounds[1]:bounds[3]].astype(np.int32)
                newLabelMatrices.append(new_lMatrix)
            allRefProbs[response] = DistanceUtils.CalcLabelProb(
                data=newLabelMatrices,
                numLabels=config.responseProbDims[labelType])
        else:
            allRefProbs[response] = DistanceUtils.CalcLabelProb(
                data=[m.astype(np.int32) for m in allLabelMatrices[response]],
                numLabels=config.responseProbDims[labelType])

    modelSpecs['labelRefProbs'] = allRefProbs

    ##for discrete labels, we calculate their weights by inference from the weights initialized for 3 bins: 0-8, 8-15, and >15 (or -1), which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        name = Response2LabelName(response)
        labelType = Response2LabelType(response)

        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            ## just need to assign range weight
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if labelType.startswith('Discrete'):
            subType = labelType[len('Discrete'):]

            ## if the response is for HB and BetaPairing
            if subType.startswith('2C'):
                modelSpecs['weight4labels'][response] = modelSpecs['weight4' +
                                                                   response]
                continue

            ## if the response is 3C for normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('3C'):
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4Discrete3C']
                continue

            ## calculate label weight for 12C, 25C, and 52C for the normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            modelSpecs['weight4labels'][
                response] = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])
            continue

        print('unsupported response in CalcLabelDistributionAndWeight: ',
              response)
        exit(-1)

    return modelSpecs['labelRefProbs'], modelSpecs['weight4labels']
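## The range weights above have shape (4, 1), one row per contact range
## (long, medium, short, near); np.multiply broadcasts them against per-bin
## weights. Illustration with made-up 3-bin weights:
import numpy as np

weight4range = np.array([3., 2.5, 1., 0.5], dtype=np.float32).reshape((4, 1))
weight43C = np.array([2., 1., 0.5], dtype=np.float32)   ## assumed per-bin weights, shape (3,)
print(np.multiply(weight43C, weight4range))             ## broadcasts to shape (4, 3)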
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    if files is None or len(files) == 0:
        print('the list of feature files is empty')
        exit(-1)

    fhs = [open(f, 'rb') for f in files]
    data = sum([cPickle.load(fh, encoding='latin1') for fh in fhs], [])
    [fh.close() for fh in fhs]

    ## each protein has sequential and  pairwise features as input and distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features, why?
        if 'UseSS' in modelSpecs and (modelSpecs['UseSS'] is True):
            seqMatrices.append(d['SS3'])

        if 'UseACC' in modelSpecs and (modelSpecs['UseACC'] is True):
            seqMatrices.append(d['ACC'])

        if 'UsePSSM' in modelSpecs and (modelSpecs['UsePSSM'] is True):
            seqMatrices.append(d['PSSM'])

        if 'UseDisorder' in modelSpecs and modelSpecs['UseDisorder'] is True:
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        useMPSpecificFeatures = 'UseMPSpecificFeatures' in modelSpecs and (
            modelSpecs['UseMPSpecificFeatures'] is True)
        if useMPSpecificFeatures:
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)
            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            #print 'Using template similarity score...'
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)
            seqMatrices.append(d['tplSimScore'])
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []
        ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 )
        posFeature = LocationFeature(d)
        pairfeatures.append(posFeature)

        cbrtFeature = CubeRootFeature(d)
        pairfeatures.append(cbrtFeature)

        if 'UseCCM' in modelSpecs and (modelSpecs['UseCCM'] is True):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        if 'UsePSICOV' in modelSpecs and (modelSpecs['UsePSICOV'] is True):
            pairfeatures.append(d['psicovZ'])

        if 'UseOtherPairs' in modelSpecs and (modelSpecs['UseOtherPairs'] is
                                              True):
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification later
        ## somewhere we shall also write code to add template-related sequential features such as secondary structure?
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            #print 'Using template distance matrix...'
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))
            ##assert ( set(apts) == set(config.allAtomPairTypes) or set(apts)==set(config.allLabelNames) )

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                strengthMatrix = np.copy(tplDistMatrix)
                ## clamp very small distances to 3.5 and set invalid (negative) entries to 50
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                ## convert distance into a similarity strength in (0, 1]
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## add the tmpPairFeatures to pairfeatures in a fixed order, to avoid
            ## errors introduced by the non-deterministic ordering of keys in a
            ## python dict, which may differ across python versions
            pairfeatures.extend(tmpPairFeatures['CbCb'])
            pairfeatures.extend(tmpPairFeatures['CgCg'])
            pairfeatures.extend(tmpPairFeatures['CaCg'])
            pairfeatures.extend(tmpPairFeatures['CaCa'])
            pairfeatures.extend(tmpPairFeatures['NO'])

        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = np.dstack(tuple(pairfeatures)).astype(np.float32)
        else:
            matrixFeature = np.dstack(tuple(pairfeatures))
            #print 'matrixFeature.shape: ', matrixFeature.shape

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            oneprotein['atomLabelMatrix'] = dict()

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they are binary matrices
                    if responseName.startswith(
                            'HB') or responseName.startswith('Beta'):
                        oneprotein['atomLabelMatrix'][response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrix = DistanceUtils.LogDistMatrix(distm)
                    oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('Normal'):
                    oneprotein['atomLabelMatrix'][response] = distm
                else:
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
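## Sketch of the row-wise outer product assumed in RowWiseOuterProduct above:
## for each residue, flatten the outer product of its one-hot AA vector and
## its SS3 vector, so (L, m) x (L, n) -> (L, m*n). The "Sketch" suffix marks
## this as a hypothetical stand-in, not the original implementation.
import numpy as np

def RowWiseOuterProductSketch(a, b):
    ## a: (L, m), b: (L, n) -> (L, m * n)
    return np.einsum('im,in->imn', a, b).reshape((a.shape[0], -1))

a = np.eye(3, dtype=np.float32)                                 ## toy one-hot, L = 3
b = np.array([[1., 0.], [0., 1.], [.5, .5]], dtype=np.float32)  ## toy SS3 rows
print(RowWiseOuterProductSketch(a, b).shape)                    ## (3, 6)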
def LoadPropertyFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    if files is None or len(files) == 0:
        print 'ERROR: the list of feature files is empty'
        exit(1)

    data = []
    for infile in files:
        with open(infile, 'rb') as fh:
            data.extend(cPickle.load(fh))

    EmbeddingModel = None
    if modelSpecs.has_key(
            'UseSequenceEmbedding') and modelSpecs['UseSequenceEmbedding']:
        EmbeddingModelFile = os.path.join(
            os.environ['DL4PropertyPredHome'], 'data',
            'Mofrad-PLoSOne-2015Nov.3GramEmbeddingParams.pkl')
        EmbeddingModel = SequenceEmbedding.LoadEmbeddingParamsInPKL(
            EmbeddingModelFile)

    ## each protein has sequential features as input
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ##collecting sequential features...
        seqMatrices = []

        seqMatrices.append(d['PSSM'])
        ##seqMatrices.append( d['PSFM'] )

        ##Load sequence embedding features here
        if EmbeddingModel is not None:
            seqMatrices.append(
                SequenceEmbedding.EmbedOneSequence(d['sequence'],
                                                   EmbeddingModel))

        if modelSpecs.has_key('UsePSFM') and modelSpecs['UsePSFM']:
            seqMatrices.append(d['PSFM'])

        if modelSpecs.has_key(
                'UseOneHotEncoding') and modelSpecs['UseOneHotEncoding']:
            seqMatrices.append(config.SeqOneHotEncoding(d['sequence']))

        ## add template similarity score here
        if modelSpecs.has_key('UseTemplate') and modelSpecs['UseTemplate']:
            #print 'Using template similarity score...'
            if not d.has_key('tplSimScore'):
                print 'ERROR: no tplSimScore for target', d[
                    'name'], 'which is needed since you specify to use template information'
                exit(1)
            if d['tplSimScore'].shape[1] != 10:
                print 'ERROR: the number of query-template similarity features is not 10 in data for', d[
                    'name']
                exit(1)

            if not d.has_key('tplProperties'):
                print 'ERROR: no tplProperties for target', d[
                    'name'], 'which is needed since you specify to use template information'
                exit(1)

            if d['tplProperties'].shape[1] < 15:
                print 'ERROR: #template local structure properties shall be at least 15 for target', d[
                    'name']
                exit(1)

            ## the query-template similarity score shall be arranged in the order of: AA identity (binary), blosum80, blosum62, blosum45, spScore, spScore_ST, ppScore, pmScore, cc, hdsm
            seqMatrices.append(d['tplSimScore'])

            ##we do not use omg information from the template, the first 8 features shall be the 8-state secondary structure, then followed by pACC, CNa, CNb, Phi, Psi, Theta and Tau
            #seqMatrices.append( d['tplProperties'][:,:15] )
            seqMatrices.append(d['tplProperties'][:, :8])
            for r in modelSpecs['responses']:
                if r.startswith('ACC'):
                    seqMatrices.append(d['tplProperties'][:, 8:9])
                elif r.startswith('Phi') or r.startswith(
                        'Psi') or r.startswith('CLE'):
                    seqMatrices.append(d['tplProperties'][:, 11:13])
                elif r.startswith('Theta') or r.startswith('Tau'):
                    seqMatrices.append(d['tplProperties'][:, 13:15])
                elif r.startswith('CNa') or r.startswith('CNb'):
                    seqMatrices.append(d['tplProperties'][:, 9:11])
                else:
                    print 'ERROR: unsupported response', r
                    exit(1)

        if d.has_key('otherSeqFeatures'):
            seqMatrices.append(d['otherSeqFeatures'])

        ## all the features shall have shape (seqLen, nFeatures) where nFeatures is variable, but seqLen is the sequence length of one protein
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature

        if not d.has_key('DISO') and d.has_key('Missing'):
            d['DISO'] = d['Missing']

        ##collecting labels...
        for r in modelSpecs['responses']:
            labelName = Response2LabelName(r)
            labelType = Response2LabelType(r)

            if not d.has_key(labelName) and forTrainValidation:
                print 'ERROR: missing label information for protein ', d[
                    'name'], ' and response ', r
                exit(1)
            elif not d.has_key(labelName):
                continue

            labels = d[labelName]

            ## need some special handling of discrete labels
            if labelType.startswith('Discrete'):
                if r.startswith('SS3'):
                    labels = np.array([
                        PropertyUtils.SS3Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith('SS8'):
                    labels = np.array([
                        PropertyUtils.SS8Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith('ACC') or r.startswith('DISO'):
                    labels = labels.reshape((-1, 1))
                elif r.startswith('CLE'):
                    labels = np.array([
                        PropertyUtils.CLELetter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                else:
                    print 'ERROR: please specify how to convert your discrete labels to numbers for response ', r
                    exit(1)

            oneprotein[labelName] = labels

        ##at this point, finish collecting features and labels for one protein
        if d.has_key('Missing'):
            oneprotein['missing'] = d['Missing']
        elif forTrainValidation:
            print 'ERROR: for training data, we need information to specify which residues have no 3D coordinates'
            exit(1)

        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print 'assembled features and labels for ', counter, ' proteins.'
    """
    	tmpfile = open(files[0] + '.contactInput.pkl', 'wb')
    	cPickle.dump(proteinFeatures, tmpfile, protocol = cPickle.HIGHEST_PROTOCOL)
    	tmpfile.close()
    	"""

    return proteinFeatures
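## The feature-assembly pattern used in both loaders above: per-residue
## feature blocks of shape (seqLen, k_i) are concatenated along axis=1 into a
## single (seqLen, sum of k_i) matrix. Toy illustration:
import numpy as np

seqLen = 7
pssm = np.random.rand(seqLen, 20).astype(np.float32)   ## toy PSSM block
onehot = np.eye(seqLen, 21, dtype=np.float32)          ## toy one-hot block
seqFeature = np.concatenate([pssm, onehot], axis=1)
print(seqFeature.shape)                                ## (7, 41)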
def AssembleOneBatch(data, modelSpecs, forTrainValidation=True):
    if not data:
        print 'WARNING: the list of data is empty'
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)
    #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]),
                   dtype=theano.config.floatX)
    ## mask for padded positions: sequences are right-aligned, so padding can
    ## only occur in the first (maxSeqLen - minSeqLen) columns
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)

    ## Y shall be a list of labels, one for each response type
    ##we always need a weight vector to deal with residues without 3D coordinates in training and validation
    Y = []
    W = []
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        labelName = Response2LabelName(res)

        dataType = (np.int32 if labelType.startswith('Discrete') else
                    theano.config.floatX)
        if forTrainValidation:
            if not data[0].has_key(labelName):
                print 'ERROR: label information is needed for training protein ', data[0]['name'], ' and response ', res
                exit(1)
            Y.append(
                np.zeros(shape=(numSeqs, maxSeqLen,
                                config.responseValueDims[labelType]),
                         dtype=dataType))

            if not data[0].has_key('missing'):
                print 'ERROR: missing information is needed for training protein ', data[0]['name']
                exit(1)

            W.append(
                np.zeros(shape=(numSeqs, maxSeqLen, 1),
                         dtype=theano.config.floatX))

    for j in range(len(data)):
        seqLen = data[j]['seqLen']
        X1d[j, maxSeqLen - seqLen:, :] = data[j]['seqFeatures']
        M1d[j, maxSeqLen - seqLen:].fill(1)

        for y, w, res in zip(Y, W, modelSpecs['responses']):
            y[j, maxSeqLen - seqLen:, ] = data[j][Response2LabelName(res)]

            if res.startswith('DISO'):
                ## for disorder prediction, all the residues shall be considered since those residues without 3D coordinates are positive examples
                ## we may assign a larger weight to positive examples since they are only 6% of the whole data set
                w[j, maxSeqLen - seqLen:, ] = np.reshape(
                    data[j]['missing'],
                    (-1, 1)) * (modelSpecs['w4diso'] - 1.) + 1.
            else:
                ## assign weight 0 to those residues without coordinates, otherwise 1
                w[j, maxSeqLen -
                  seqLen:, ] = 1.0 - np.reshape(data[j]['missing'], (-1, 1))

    onebatch = [X1d, M1d]
    onebatch.extend(Y)
    onebatch.extend(W)

    return onebatch
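## Why M1d only needs width (maxSeqLen - minSeqLen): sequences are
## right-aligned, so the last minSeqLen columns always hold real residues and
## padding can occur only in the leading columns. Fill pattern for
## seqLens = [5, 3]:
import numpy as np

seqLens = [5, 3]
maxSeqLen, minSeqLen = max(seqLens), min(seqLens)
M1d = np.zeros((len(seqLens), maxSeqLen - minSeqLen), dtype=np.int8)
for j, seqLen in enumerate(seqLens):
    M1d[j, maxSeqLen - seqLen:].fill(1)
print(M1d)   ## [[1 1] [0 0]]; row j holds (seqLen - minSeqLen) ones
## padding length per sequence = M1d.shape[1] - M1d.sum(axis=1) = [0, 2]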
def PredictMatrixLabels(models,
                        predictors,
                        names,
                        inputFolders,
                        aliFolders=None,
                        tplFolder=None,
                        aliFile=None,
                        tplFile=None,
                        saveFolder=None):

    if not isinstance(names, (list, tuple)):
        targetName = names
    else:
        targetName = None

    ##allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices
    ##We predict one prob_matrix by each model for each protein and each response and then average them per protein and response to get the final results
    ##two different models may share common responses

    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict()  ## count the number of models that may predict each response

    for model, predictor in zip(models, predictors):
        #predict, inputVariables = BuildPredictor(model)
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders,
                                               aliFolders, tplFolder)

        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName,
                                                inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName,
                                                  inputFolders, aliFolders,
                                                  tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(
            rawData,
            modelSpecs=model,
            forTrainValidation=False,
            returnMode='list')

        ##make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if not allresults.has_key(name):
                allresults[name] = dict()
                numModels[name] = dict()

            if not allsequences.has_key(name):
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData,
                                                      numDataPoints=624,
                                                      modelSpecs=model)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        ##for onebatch, names4onebatch in zip(predSeqData, names):
        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
                minibatch, model)
            input = onebatch[:len(inputVariables)]
            result = predict(*input)
            ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
            assert result.shape[3] == sum([
                GetResponseProbDims(response)
                for response in model['responses']
            ])

            ## calculate the start and end positions of each response in the last dimension of result
            dims = [
                GetResponseProbDims(response)
                for response in model['responses']
            ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            x1d, x2d, x1dmask, x2dmask = input[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'], startPositions,
                                            endPositions):

                ## batchres is a batch of result, its ndim=4
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end]
                ## remove masked positions
                revised_batchres = [
                    probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                    for probMatrix, seqLen in zip(batchres, seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if not allresults[name].has_key(response):
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result, which is the average of the predicted prob matrices by all models for the same protein and the same response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()

        ## each finalresults[name][response] has 3 dimensions: (seqLen, seqLen, numProbParams)
        for response in results.keys():
            finalresults[name][response] = (allresults[name][response] /
                                            numModels[name][response]).astype(
                                                np.float32)

            ##make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                finalresults[name][response] = (
                    finalresults[name][response] +
                    np.transpose(finalresults[name][response], (1, 0, 2))) / 2.

    ## convert predicted distance probability matrix into contact matrix
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(
        models)

    ##write all the results here
    ## for each protein, we have an output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.iteritems():

        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            originalName = name  ## fall back to the full name if no prefix matches
            for n in names:
                if name.startswith(n):
                    originalName = n
                    break

        with open(savefilename, 'wb') as fh:
            #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump((originalName, allsequences[name], results,
                          predictedContactMatrices[name], finalLabelWeights,
                          finalLabelDistributions),
                         fh,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)
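## The start/end bookkeeping above in isolation: per-response probability
## blocks are concatenated along the last axis, so cumulative sums of their
## dims yield the slice boundaries.
import numpy as np

dims = np.array([3, 12, 2])               ## toy per-response prob dims
endPositions = np.cumsum(dims)            ## [ 3 15 17]
startPositions = endPositions - dims      ## [ 0  3 15]
print(list(zip(startPositions, endPositions)))   ## (0, 3), (3, 15), (15, 17)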
    """