def errors(self, zList, weightList=None):
    """Build one error tensor covering every response of the model.

    Args:
        zList: one tensor per response (presumably the ground-truth labels --
            confirm against callers). Each tensor must have ndim 3 or 4.
        weightList: optional list of weight tensors aligned with zList. When it
            is None or empty, unweighted errors are computed.

    Returns:
        T.concatenate() applied to the list of per-response error tensors.

    For a 'Discrete' label type with more than 3 labels the error is computed
    by self.errors4one() on the flattened network output; every other label
    type is delegated to the errors() method of the matching predictor.
    If z has an unsupported ndim, a message is printed and exit(-1) is called.
    """
    useWeight = weightList is not None and len(weightList) > 0
    ## without weights, pad with None so that a single loop serves both cases;
    ## zip() still stops at the shortest of the other sequences
    weights = weightList if useWeight else [None] * len(zList)

    errs = []
    for res, predictor, z, w, o in zip(self.responses, self.predictors, zList, weights, self.outputList):
        labelType = Response2LabelType(res)
        numLabels = config.responseProbDims[labelType]

        ## if the label type is Discrete25C, Discrete52C, Discrete12C
        if labelType.startswith('Discrete') and numLabels > 3:
            assert (z.ndim == 3 and config.responseValueDims[labelType] == 1)
            o2 = o.flatten(3)
            subType = labelType[len('Discrete'):]
            ## here we convert 12C, 25C, and 52C to 3C for error calculation,
            ## which makes the result easier to interpret.
            ## The flattened output o2 is used in both cases (the unweighted
            ## path used to pass the unflattened o by mistake).
            if useWeight:
                errs.append(self.errors4one(z, o2, weight=w, distLabelType=subType))
            else:
                errs.append(self.errors4one(z, o2, distLabelType=subType))
            continue

        ## otherwise call the error function of the predictor on a 2-d view of z
        if z.ndim == 3:
            zflat = z.flatten().dimshuffle(0, 'x')
        elif z.ndim == 4:
            zflat = z.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        else:
            print('unsupported ndim for z in errors():', z.ndim)
            exit(-1)

        if useWeight:
            assert (w.ndim == 3)
            wflat = w.flatten().dimshuffle(0, 'x')
            e = predictor.errors(zflat, sampleWeight=wflat)
        else:
            e = predictor.errors(zflat)

        ## e is a tensor with ndim=1
        errs.append(e)

    return T.concatenate(errs)
Esempio n. 2
0
def BuildModel(modelSpecs, forTrain=True):
        """Create the property predictor and the symbolic variables it needs.

        Args:
            modelSpecs: model specification; modelSpecs['responses'] is read
                when forTrain is true.
            forTrain: when true, label and weight variables are created too.

        Returns:
            (propertyPredictor, x, xmask, labelList, weightList) when at least
            one label variable was created, otherwise (propertyPredictor, x, xmask).
        """
        rng = np.random.RandomState()

        ## x is for sequential features
        x = T.tensor3('x')

        ## mask for x
        xmask = T.bmatrix('xmask')
        propertyPredictor = ResNet4Properties(rng, seqInput=x, mask_seq=xmask, modelSpecs=modelSpecs)

        ## labelList is a list of label matrices, each with shape (batchSize, seqLen, numLabels)
        labelList = []
        if forTrain:
                ## when this model is used for training, we need to define the label variables:
                ## integer tensors for discrete labels, floating-point tensors otherwise
                for res in modelSpecs['responses']:
                        labelType = Response2LabelType(res)
                        if labelType.startswith('Discrete'):
                                labelList.append(T.itensor3('label4' + res))
                        else:
                                labelList.append(T.tensor3('label4' + res))

        if len(labelList) == 0:
                ## no label variable (e.g. prediction mode): nothing else to create
                return propertyPredictor, x, xmask

        ## weightList is a list of label weight matrices, each with shape (batchSize, seqLen, 1)
        ## we always use weight to deal with residues without 3D coordinates
        weightList = [T.tensor3('weight4' + res) for res in modelSpecs['responses']]

        return propertyPredictor, x, xmask, labelList, weightList
        def EvaluateAccuracy(pred_prob, truth, pad_len):
            """Dispatch to the top-accuracy routine matching the current response.

            The first pad_len rows and columns of pred_prob and truth are
            dropped before evaluation. The response is read from the free
            variable `currentResponse`, which must be bound in an enclosing
            scope (not visible in this fragment).

            Returns the value of TopAccuracyLogNormal, TopAccuracyNormal,
            TopAccuracy2C or TopAccuracyMultiC depending on the label type.
            For an unsupported label type a message is printed and exit(-1)
            is called.
            """
            pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                           dtype=theano.config.floatX)
            truth_in_correct_shape = truth[pad_len:, pad_len:]

            labelType = Response2LabelType(currentResponse)
            atomType = Response2LabelName(currentResponse)
            symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

            if labelType.startswith('LogNormal'):
                return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                            truth=truth_in_correct_shape,
                                            symmetric=symmetric)

            if labelType.startswith('Normal'):
                return TopAccuracyNormal(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)

            if labelType.startswith('Discrete'):
                subType = labelType[len('Discrete'):]
                if subType.startswith('2C'):
                    return TopAccuracy2C(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)
                return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         subType=subType,
                                         symmetric=symmetric)

            ## print() call form, as used by the other copy of this helper in the file
            print('unsupported label type in EvaluateAccuracy: ', labelType)
            exit(-1)
def CalcLabelDistribution(data, modelSpecs):
    """Compute the reference label distribution of every response.

    Args:
        data: iterable of records; record['atomLabelMatrix'][response] is read
            for every response that is not Normal/LogNormal.
        modelSpecs: model specification. 'responses' is required;
            'UseBoundingBox4RefProbs' and 'maxbatchSize' are read when present
            and enabled.

    Returns:
        dict mapping each response to its reference probabilities. The same
        dict is also stored in modelSpecs['labelDistributions'].
    """
    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in data
        ]

    useBoundingBox = ('UseBoundingBox4RefProbs' in modelSpecs) and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        ## numRanges was previously an undefined name in the real-valued branch;
        ## use the same source as the CalcLabelProb calls below
        numRanges = RangeNWeight.GetNumRanges(modelSpecs)

        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            ## real-valued labels get a constant reference, one entry per range
            allRefProbs[response] = np.array([1.] * numRanges).reshape(
                (-1, 1)).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            labelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                labelMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
        else:
            labelMatrices = [
                m.astype(np.int32) for m in allLabelMatrices[response]
            ]

        ## orientation responses and distance responses use different helpers
        if labelName in config.allOrientationNames:
            calcLabelProb = OrientationUtils.CalcLabelProb
        else:
            calcLabelProb = DistanceUtils.CalcLabelProb

        allRefProbs[response] = calcLabelProb(
            data=labelMatrices,
            numLabels=GetResponseProbDims(response),
            numRanges=numRanges)

    modelSpecs['labelDistributions'] = allRefProbs
    return allRefProbs
def Coding2String(coding, response):
    """Translate an array of integer codes into a string of letters.

    Args:
        coding: array of codes; when it is 2-d only its first column is used.
        response: response name; must start with 'SS', 'ACC' or 'CLE'.

    Returns:
        The string obtained by mapping every code through the lookup table
        that matches the response (SS3/SS8/ACC/CLE Code2Letter).

    For an unsupported response, or an unsupported SS label type, an error
    message is printed and exit(1) is called.
    """
    code = coding[:, 0] if coding.ndim == 2 else coding

    labelType = Response2LabelType(response)

    if response.startswith('SS'):
        if labelType.endswith('3C'):
            return ''.join([SS3Code2Letter[c] for c in code])
        if labelType.endswith('8C'):
            return ''.join([SS8Code2Letter[c] for c in code])
        print('ERROR: unsupported response and labelType: ', response)
        exit(1)

    if response.startswith('ACC'):
        assert (labelType.endswith('3C'))
        return ''.join([ACCCode2Letter[c] for c in code])

    if response.startswith('CLE'):
        assert (labelType.endswith('18C'))
        return ''.join([CLECode2Letter[c] for c in code])

    print('ERROR: unsupported response: ', response)
    exit(1)
def String2Coding(str, response):
    """Translate a string of letters into a column vector of integer codes.

    Args:
        str: the letter string to encode (the parameter name shadows the
            builtin; it is kept unchanged for callers that pass it by keyword).
        response: response name; must start with 'SS', 'ACC' or 'CLE'.

    Returns:
        An int32 numpy array of shape (len(str), 1).

    For an unsupported response, or an unsupported SS label type, an error
    message is printed and exit(1) is called.
    """
    labelType = Response2LabelType(response)

    ## pick the lookup table first; the encoding itself is identical for all of them
    if response.startswith('SS'):
        if labelType.endswith('3C'):
            letter2code = SS3Letter2Code
        elif labelType.endswith('8C'):
            letter2code = SS8Letter2Code
        else:
            print('ERROR: unsupported response and labelType: ', response)
            exit(1)
    elif response.startswith('ACC'):
        assert (labelType.endswith('3C'))
        letter2code = ACCLetter2Code
    elif response.startswith('CLE'):
        assert (labelType.endswith('18C'))
        letter2code = CLELetter2Code
    else:
        print('ERROR: unsupported response: ', response)
        exit(1)

    code = [letter2code[c] for c in str]
    return np.array(code).astype(np.int32).reshape((len(str), 1))
def CalcRefState4OneBatch(batch, modelSpecs, minSeqSep=3):
    """Compute the reference label probabilities for one batch.

    Args:
        batch: iterable of records; record['atomLabelMatrix'][response] is
            read for every response that is not Normal/LogNormal.
        modelSpecs: model specification. 'responses' is required;
            'UseBoundingBox4RefProbs' and 'maxbatchSize' are read when present
            and enabled.
        minSeqSep: forwarded unchanged to CalcLabelProb.

    Returns:
        (allRefProbs, avgLen). allRefProbs maps each response to its reference
        probabilities. avgLen is the second value returned by CalcLabelProb
        for the last discrete response processed, or None when no discrete
        response was processed (this case used to raise UnboundLocalError).
    """
    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in batch
        ]

    useBoundingBox = ('UseBoundingBox4RefProbs' in modelSpecs) and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    ## calculate the discrete label distribution
    allRefProbs = dict()
    avgLen = None
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            allRefProbs[response] = np.array([1.]).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            labelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                labelMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
        else:
            labelMatrices = [
                m.astype(np.int32) for m in allLabelMatrices[response]
            ]

        allRefProbs[response], avgLen = CalcLabelProb(
            labelMatrices=labelMatrices,
            numLabels=config.responseProbDims[labelType],
            minSeqSep=minSeqSep)

    return allRefProbs, avgLen
Esempio n. 8
0
    def EvaluateAccuracy(pred_prob, truth, pad_len):
        """Dispatch to the top-accuracy routine matching the current response.

        The first pad_len rows and columns of pred_prob and truth are dropped
        before evaluation. The response is read from the free variable
        `currentResponse`, which is not a parameter of this function.
        """
        pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                       dtype=theano.config.floatX)
        truth_in_correct_shape = truth[pad_len:, pad_len:]

        labelType = Response2LabelType(currentResponse)
        atomType = Response2LabelName(currentResponse)
        # the flag is forwarded as `symmetric` to every accuracy routine below
        symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

        if labelType.startswith('LogNormal'):
            return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                        truth=truth_in_correct_shape,
                                        symmetric=symmetric)
        elif labelType.startswith('Normal'):
            return TopAccuracyNormal(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
        elif labelType.startswith('Discrete'):
            # the part of the label type after 'Discrete' selects 2C vs multi-class handling
            subType = labelType[len('Discrete'):]
            if subType.startswith('2C'):
                return TopAccuracy2C(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
            else:
                return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         subType=subType,
                                         symmetric=symmetric)
        else:
            print('unsupported label type in EvaluateAccuracy: ', labelType)
            exit(-1)

        # NOTE(review): every branch of the chain above returns or calls exit(-1),
        # so as indented here the statements below can never run. They use `self`
        # and `zList`, which are not parameters of EvaluateAccuracy -- presumably
        # they are the body of an enclosing method whose header is not visible in
        # this fragment and should sit one level out. Confirm against the original file.
        accuracyList = []
        for res, out_prob, z, ratio in zip(self.responses,
                                           self.output_probList, zList,
                                           self.modelSpecs['topRatios']):
            ## currently TopAccuracy only works when the dimension of each z is 3
            assert z.ndim == 3
            if self.mask_1d is not None:
                # per-row padding length: mask width minus the sum of the mask entries
                paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d,
                                                            axis=1)
            else:
                paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)
            # EvaluateAccuracy reads currentResponse as a free variable
            currentResponse = res
            # NOTE(review): topRatio is assigned but not read anywhere in the visible code
            topRatio = ratio
            ##here we use scan to calculate accuracy for each protein
            result, updates = theano.scan(fn=EvaluateAccuracy,
                                          outputs_info=None,
                                          sequences=[out_prob, z, paddingLens])
            accuracy = T.mean(result, axis=0)
            accuracyList.append(accuracy)

        return T.stacklists(accuracyList)
Esempio n. 9
0
def CalcLabelWeightMatrix(LabelMatrix=None, modelSpecs=None):
    """Build a per-entry weight matrix for every response.

    Args:
        LabelMatrix: dict mapping response -> square label matrix, or None.
        modelSpecs: model specification; 'responses' and 'weight4labels' are
            read. modelSpecs['weight4labels'][response] must have 4 rows, which
            are used for the long-, medium-, short- and near-range entries in
            that order.

    Returns:
        None when LabelMatrix is None, otherwise a dict mapping each response
        to a weight matrix of dtype theano.config.floatX.

    If a response has no entry in modelSpecs['weight4labels'], a message is
    printed and exit(-1) is called.
    """
    if LabelMatrix is None:
        return None

    ## dict views cannot be indexed on Python 3, hence the list() around values()
    M1s = np.ones_like(list(LabelMatrix.values())[0], dtype=np.int16)
    np.fill_diagonal(M1s, 0)

    ## masks by sequence separation |i-j|: long >= 24, medium 12-23, short 6-11, near 1-5
    LRmask = np.triu(M1s, 24) + np.tril(M1s, -24)
    MLRmask = np.triu(M1s, 12) + np.tril(M1s, -12)
    SMLRmask = np.triu(M1s, 6) + np.tril(M1s, -6)
    SRmask = SMLRmask - MLRmask
    MRmask = MLRmask - LRmask
    NRmask = M1s - SMLRmask

    for response in modelSpecs['responses']:
        if response not in modelSpecs['weight4labels']:
            print('Cannot find the weight factor tensor for response ',
                  response)
            exit(-1)

    ##the below procedure is not very effective. We shall improve it later.
    labelWeightMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        labelWeightMatrices[response] = np.zeros_like(
            LabelMatrix[response], dtype=theano.config.floatX)

        ## wMatrix is a matrix with dimension 4 * numLabels
        wMatrix = modelSpecs['weight4labels'][response]
        wMatrixShape = wMatrix.shape
        assert (wMatrixShape[0] == 4)

        if labelType.startswith('Normal') or labelType.startswith('LogNormal'):
            ## if the label is real value, then for each range, there is only a single weight for all the possible values.
            ## That single weight lives at index 0, so broadcast it with an all-zero index matrix;
            ## indexing with M1s asked for index 1, which does not exist in the (4, 1) weights
            ## that CalcLabelDistributionAndWeight builds for continuous labels.
            zeroIndex = np.zeros_like(M1s)
            tmpWeightMatrices = []
            for i in range(4):
                tmp = wMatrix[i][zeroIndex]
                ## set the weight of the entries without valid distance to 0. An invalid entry in the label matrix is indicated by a negative value,e.g., -1
                np.putmask(tmp, LabelMatrix[response] < 0, 0)
                tmpWeightMatrices.append(tmp)
        else:
            ## discrete labels: look the weight up by label value
            tmpWeightMatrices = [
                wMatrix[i][LabelMatrix[response]] for i in range(4)
            ]

        LRw, MRw, SRw, NRw = tmpWeightMatrices
        labelWeightMatrices[response] += (LRmask * LRw + MRmask * MRw +
                                          SRmask * SRw + NRmask * NRw)

    return labelWeightMatrices
def BuildModel(modelSpecs, forTrain=True):
    """Create the distance predictor and the symbolic variables it needs.

    Args:
        modelSpecs: model specification; 'responses' is read when forTrain is
            true and 'UseSampleWeight' is read when label variables exist.
        forTrain: when true, one label variable per response is created.

    Returns:
        (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList).
        xem is None unless config.EmbeddingUsed(modelSpecs) is true. For
        prediction, both labelList and weightList are empty.
    """
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## mask for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    predictorArgs = dict(seqInput=x, matrixInput=y, mask_seq=xmask,
                         mask_matrix=ymask, modelSpecs=modelSpecs)

    ## the embedding input is only created, and only handed over, when it is used
    xem = None
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        predictorArgs['embedInput'] = xem
    distancePredictor = ResNet4DistMatrix(rng, **predictorArgs)

    ## labelList is a list of label tensors, each having shape
    ## (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response])
    labelList = []
    if forTrain:
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            ## a vector-valued response needs a 4-d tensor, a scalar one a 3-d tensor
            isVector = config.responseValueDims[labelType] > 1

            if labelType.startswith('Discrete'):
                ## wtensor is for 16bit integer
                makeLabel = T.wtensor4 if isVector else T.wtensor3
            else:
                makeLabel = T.tensor4 if isVector else T.tensor3
            labelList.append(makeLabel('Tlabel4' + response))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if labelList and modelSpecs['UseSampleWeight']:
        for response in modelSpecs['responses']:
            weightList.append(T.tensor3('Tweight4' + response))

    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
def QuickValidateAllData(SeqDataset, validate, modelSpecs):
    """Run `validate` on every batch and average the losses and errors.

    Args:
        SeqDataset: iterable of batches; each batch is unpacked into the
            positional arguments of `validate`.
        validate: callable returning a (loss, error) pair for one batch.
        modelSpecs: model specification; 'UseSampleWeight' is always read and
            'responses' is read when sample weights are enabled.

    Returns:
        (average loss, average error), both averaged over batches along axis 0.
        With sample weights the averages are weighted by the per-response sum
        of the weights, which are taken from the tail of each batch.
    """
    weighted = modelSpecs['UseSampleWeight']

    batchLosses = []
    batchErrors = []
    lossWeights = [] if weighted else None
    errorWeights = [] if weighted else None

    for onebatch in SeqDataset:
        batchLoss, batchError = validate(*onebatch)
        batchLosses.append(batchLoss)
        batchErrors.append(batchError)

        if not weighted:
            continue

        ##two different batches may have different number of residues and different distribution of labels
        ##so we shall normalize the loss and errors by the weight of different batches
        responses = modelSpecs['responses']
        ## the last len(responses) entries of a batch are its weight tensors
        tailWeights = onebatch[len(onebatch) - len(responses):]

        weightSums = []
        repeatedSums = []
        for res, w in zip(responses, tailWeights):
            total = np.sum(w)
            weightSums.append(total)
            ## one copy of the sum per value dimension of the response
            numValues = config.responseValueDims[Response2LabelType(res)]
            repeatedSums.extend([total] * numValues)

        lossWeights.append(weightSums)
        errorWeights.append(repeatedSums)

    avgLoss = np.average(np.array(batchLosses), axis=0, weights=lossWeights)
    avgError = np.average(np.array(batchErrors), axis=0, weights=errorWeights)
    return avgLoss, avgError
Esempio n. 12
0
def CalcLabelDistributionAndWeight(data=None, modelSpecs=None):
    """Compute reference label distributions and label weights for all responses.

    Args:
        data: iterable of records; record['atomLabelMatrix'][response] is read
            for every discrete response.
        modelSpecs: model specification, updated in place. 'responses' is
            required; 'weight4range', 'LRbias', 'UseBoundingBox4RefProbs' and
            'maxbatchSize' are read when applicable.

    Returns:
        (modelSpecs['labelRefProbs'], modelSpecs['weight4labels']).

    Keys written into modelSpecs: 'weight4range', 'weight4Discrete3C',
    'weight4HB_Discrete2C', 'weight4Beta_Discrete2C', 'weight4continuous',
    'labelRefProbs' and 'weight4labels'.

    For a response whose label type is neither Normal/LogNormal nor Discrete,
    a message is printed and exit(-1) is called.
    """
    ## weight for different ranges (long, medium, short, and near-ranges)
    if 'weight4range' not in modelSpecs:
        modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape(
            (4, 1)).astype(np.float32)
    else:
        ## reshape()/astype() return new arrays, so the result has to be stored back
        modelSpecs['weight4range'] = modelSpecs['weight4range'].reshape(
            (4, 1)).astype(np.float32)
    print('weight for range: ', modelSpecs['weight4range'])

    ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15
    ## 'mid' is the bias used when the caller did not specify one
    LRbias = modelSpecs['LRbias'] if 'LRbias' in modelSpecs else 'mid'
    modelSpecs['weight4Discrete3C'] = np.multiply(
        config.weight43C[LRbias], modelSpecs['weight4range'])
    ## report the bias actually used; reading modelSpecs['LRbias'] here would
    ## raise KeyError whenever the default was taken
    print('LRbias= ', LRbias, 'weight43C= ',
          modelSpecs['weight4Discrete3C'])

    ## weight for 2C
    modelSpecs['weight4HB_Discrete2C'] = np.multiply(
        config.weight4HB2C, modelSpecs['weight4range'])
    modelSpecs['weight4Beta_Discrete2C'] = np.multiply(
        config.weight4Beta2C, modelSpecs['weight4range'])

    ## weight for real value
    modelSpecs['weight4continuous'] = np.multiply(
        np.array([1.] * 4).reshape((4, 1)).astype(np.float32),
        modelSpecs['weight4range'])

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in data
        ]

    useBoundingBox = ('UseBoundingBox4RefProbs' in modelSpecs) and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            allRefProbs[response] = np.array([1.] * 4).reshape(
                (4, 1)).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            labelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                labelMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
        else:
            labelMatrices = [
                m.astype(np.int32) for m in allLabelMatrices[response]
            ]

        allRefProbs[response] = DistanceUtils.CalcLabelProb(
            data=labelMatrices,
            numLabels=config.responseProbDims[labelType])

    modelSpecs['labelRefProbs'] = allRefProbs

    ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            ## just need to assign range weight
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if labelType.startswith('Discrete'):
            subType = labelType[len('Discrete'):]

            ## if the response is for HB and BetaPairing
            if subType.startswith('2C'):
                modelSpecs['weight4labels'][response] = modelSpecs['weight4' +
                                                                   response]
                continue

            ## if the response is 3C for normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('3C'):
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4Discrete3C']
                continue

            ## calculate label weight for 12C, 25C, and 52C for the normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            modelSpecs['weight4labels'][
                response] = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])
            continue

        print('unsupported response in CalcLabelDistributionAndWeight: ',
              response)
        exit(-1)

    return modelSpecs['labelRefProbs'], modelSpecs['weight4labels']
def MergeOneProtein(inputFiles, method):
    """Merge several predicted distance-probability files of one protein.

    Args:
        inputFiles: list of at least two files readable by
            DistanceUtils.LoadRawDistProbFile.
        method: 'amean' selects the arithmetic mean; any other value selects
            the geometric mean (renormalised along the last axis).

    Returns:
        (contactMatrixProb, content4save) where content4save is the tuple
        (targetName, sequence, distMatrixProb, contactMatrixProb,
        labelWeight or None, labelDistribution).

    All input files must share the same sequence and the same name prefix
    (everything before the last '-'); this is enforced with assertions.
    When fewer than two files are given, when only some files carry a
    labelWeight, or when a merged response is not of a Discrete label type,
    a message is printed and exit(-1) is called.
    """
    if inputFiles is None or len(inputFiles) < 2:
        print('Please provide at least two predicted matrices for merge')
        exit(-1)

    seqName = None
    sequence = None

    distProbs = dict()
    contactProbs = dict()
    labelDistributions = dict()
    labelWeights = dict()
    labelWeightFlags = []

    tempNames = []
    for inputFile in inputFiles:
        content = DistanceUtils.LoadRawDistProbFile(inputFile)

        name0, sequence0, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content

        ## the name has the form <seqName>-<tempName>; every file must agree on <seqName>
        seqName0 = '-'.join(name0.split('-')[0:-1])
        tempName = name0.split('-')[-1]
        tempNames.append(tempName)

        labelWeightFlags.append(labelWeight is not None)

        if seqName is None:
            seqName = seqName0
        else:
            assert seqName == seqName0

        if sequence is None:
            sequence = sequence0
        else:
            assert sequence == sequence0

        ## group the matrices of all files by response
        for apt, prob in predictedDistProb.items():
            distProbs.setdefault(apt, []).append(prob)

        ## collected for completeness; the merged contact matrix below is derived from the merged distances
        for apt, prob in predictedContactProb.items():
            contactProbs.setdefault(apt, []).append(prob)

        if labelWeight is not None:
            for apt, weight in labelWeight.items():
                labelWeights.setdefault(apt, []).append(weight)

        for apt, dist in labelDistribution.items():
            labelDistributions.setdefault(apt, []).append(dist)

    ## check consistency among labelWeightFlags
    consistent = all(flag == labelWeightFlags[0] for flag in labelWeightFlags)
    if not consistent:
        print('ERROR: the input matrix files have inconsistent format. Some have a labelWeight while others do not.')
        exit(-1)

    ### Ms is a dictionary, each value in Ms is a list of matrices
    ### this function calculates the geometric mean of all the matrices in the same list and then renormalizes the last dim of the resultant mean
    def CalcGeometricMean(Ms):
        result = dict()
        for apt, v in Ms.items():
            mean = scipy.stats.mstats.gmean(v, axis=0)
            result[apt] = mean / np.sum(mean, axis=-1, keepdims=True)
        return result

    ## calculate arithmetic mean
    def CalcArithmeticMean(Ms):
        return dict((apt, np.mean(v, axis=0)) for apt, v in Ms.items())

    if method == 'amean':
        distMatrixProb = CalcArithmeticMean(distProbs)
        labelDistribution = CalcArithmeticMean(labelDistributions)
    else:
        distMatrixProb = CalcGeometricMean(distProbs)
        labelDistribution = CalcGeometricMean(labelDistributions)

    ## derive the contact probability from the merged distance probability
    contactMatrixProb = dict()
    for k in distMatrixProb.keys():
        apt = Response2LabelName(k)
        labelType = Response2LabelType(k)

        if not labelType.startswith('Discrete'):
            print('ERROR: this labelType currently not supported in TPLMergePredicteDistMatrix.py : ', labelType)
            exit(-1)

        subType = labelType[len('Discrete'):]
        labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
        contactMatrixProb[apt] = ContactUtils.Distance2Contact(distMatrixProb[k], labelOf8)

    ## label weights are always merged with the arithmetic mean
    mergedLabelWeight = None
    if labelWeightFlags[0] is True:
        mergedLabelWeight = CalcArithmeticMean(labelWeights)

    targetName = '-'.join([seqName] + tempNames)
    content4save = (targetName, sequence, distMatrixProb, contactMatrixProb, mergedLabelWeight, labelDistribution)

    return contactMatrixProb, content4save
def PredictProperty(models, predictors, predFiles):
    """Predict properties with every model and average the outputs per protein.

    models and predictors are walked in lockstep. Each entry of predictors is
    unpacked into (predict, inputVariables), where predict is called with the
    first len(inputVariables) arrays of every batch and must return the pair
    (probability output, value output).

    The feature files in predFiles are reloaded for each model through
    DataProcessor.LoadPropertyFeatures, since models may use different features.

    Returns a tuple (finalresults4prob, finalresults, allsequences):
      * finalresults4prob[name][response] is the average (axis 0) of all the
        probability outputs collected for that protein and response.
      * finalresults[name][response] is the average of the value outputs for
        non-Discrete label types; for Discrete label types it is
        PropertyUtils.Coding2String applied to the argmax (axis 1) of the
        averaged probabilities.
      * allsequences[name] is the primary sequence of each protein.

    The process exits with status 1 when two feature records with the same
    protein name carry different sequences.
    """

    def _collect(batchOutput, dimTable, store, responses, batchNames, seqLens,
                 maxSeqLen):
        ## The last axis of batchOutput is the concatenation of one slab per
        ## response; the slab widths come from dimTable. For every response,
        ## keep only the trailing seqLen rows of each protein (the leading
        ## rows are padding) and append them to store[name][response].
        widths = [dimTable[Response2LabelType(res)] for res in responses]
        stops = np.cumsum(widths)
        starts = stops - widths

        for res, lo, hi in zip(responses, starts, stops):
            for name in batchNames:
                if res not in store[name]:
                    store[name][res] = []

            perProtein = zip(batchOutput[:, :, lo:hi], seqLens, batchNames)
            for rows, seqLen, name in perProtein:
                store[name][res].append(rows[maxSeqLen - seqLen:, :])

    allsequences = {}

    ## both are nested dictionaries: [proteinName][response] -> list of per-model predictions
    allresults4prob = {}
    allresults = {}

    for model, (predict, inputVariables) in zip(models, predictors):

        ## each model may use its own set of features, so load the files once per model
        predData = DataProcessor.LoadPropertyFeatures(
            predFiles, modelSpecs=model, forTrainValidation=False)

        ## spot-check one randomly chosen protein: its feature count must match the model input size
        probe = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[probe]['seqFeatures'].shape[1]

        ## remember the sequence of every protein and insist that it is consistent
        for protein in predData:
            proteinName = protein['name']
            if proteinName not in allsequences:
                allsequences[proteinName] = protein['sequence']
            elif allsequences[proteinName] != protein['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData,
            numDataPoints=30,
            modelSpecs=model,
            forTrainValidation=False)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        responses = model['responses']

        for onebatch, names4onebatch in zip(predSeqData, names):
            batchInputs = onebatch[:len(inputVariables)]
            result4prob, result = predict(*batchInputs)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = batchInputs[0:2]
            maxSeqLen = x1d.shape[1]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)

            ## the last axis of result4prob must hold one probability slab per response
            expectedProbWidth = sum(
                config.responseProbDims[Response2LabelType(res)]
                for res in responses)
            assert result4prob.shape[2] == expectedProbWidth

            ## the last axis of result must hold one value slab per response
            expectedValueWidth = sum(
                config.responseValueDims[Response2LabelType(res)]
                for res in responses)
            assert result.shape[2] == expectedValueWidth

            for name in names4onebatch:
                if name not in allresults:
                    allresults[name] = dict()
                    allresults4prob[name] = dict()

            _collect(result4prob, config.responseProbDims, allresults4prob,
                     responses, names4onebatch, seqLens, maxSeqLen)
            _collect(result, config.responseValueDims, allresults, responses,
                     names4onebatch, seqLens, maxSeqLen)

    ## average the predicted values of the same protein and response over all collected predictions
    finalresults = {}
    for name, perResponse in allresults.iteritems():
        averagedValues = finalresults.setdefault(name, dict())
        for response, collected in perResponse.iteritems():
            meanValue = np.average(collected, axis=0)

            ## discrete labels are filled in below from the averaged probabilities
            if Response2LabelType(response).startswith('Discrete'):
                continue
            averagedValues[response] = meanValue

    finalresults4prob = {}
    for name, perResponse in allresults4prob.iteritems():
        averagedProbs = finalresults4prob.setdefault(name, dict())
        for response, collected in perResponse.iteritems():
            averagedProbs[response] = np.average(collected, axis=0)

            ## convert the coding of discrete labels to a more meaningful representation
            if Response2LabelType(response).startswith('Discrete'):
                codes = np.argmax(averagedProbs[response], axis=1)
                finalresults[name][response] = PropertyUtils.Coding2String(
                    codes, response)

    ## NOTE: the original carried a disabled (string-quoted) block that averaged
    ## the per-model label distributions and label weights; it never executed and
    ## nothing of that kind is returned here.

    return finalresults4prob, finalresults, allsequences
Esempio n. 15
0
    def __init__(self,
                 rng,
                 seqInput,
                 matrixInput,
                 mask_seq=None,
                 mask_matrix=None,
                 embedInput=None,
                 modelSpecs=None):
        """
        Build the whole network: a 1D convolution stack on the sequential
        input, an optional conversion of its output to pairwise features, a 2D
        convolution stack on the concatenated pairwise features and one
        predictor per response in modelSpecs['responses'].

        Shapes as stated by the original author:
        seqInput has shape (batchSize, seqLen, n_in_seq)
        matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix)
        mask_seq has shape (batchSize, #cols_to_be_masked)
        mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen)
        embedInput has shape (batchSize, seqLen, n_in2)

        rng is handed to every layer constructor except MetaEmbeddingLayer.
        modelSpecs is mandatory (asserted) although it defaults to None.

        Attributes set here: modelSpecs, responses, mask_1d, mask_2d, layers,
        predictors, outputList, output_probList, output, output_prob,
        params4var, paramL14var, paramL24var, params, paramL1, paramL2.

        The process exits with status -1 when an embedding input is given
        without a 'Seq+SS' or 'SeqOnly' entry in modelSpecs['seq2matrixMode'],
        when modelSpecs['network'] starts with neither 'ResNet' nor
        'DilatedResNet', or when a response has an unsupported label type.
        """

        assert (modelSpecs is not None)

        self.modelSpecs = modelSpecs
        self.responses = modelSpecs['responses']

        ## set the number of hidden neurons and number of layers
        n_in_seq = modelSpecs['n_in_seq']
        n_in_matrix = modelSpecs['n_in_matrix']
        n_hiddens_seq = modelSpecs['conv1d_hiddens']
        n_hiddens_matrix = modelSpecs['conv2d_hiddens']
        n_hiddens_logreg = modelSpecs['logreg_hiddens']
        seq_repeats = modelSpecs['conv1d_repeats']
        matrix_repeats = modelSpecs['conv2d_repeats']

        ## half win size for convolutional operation
        ## dilation_seq and dilation_matrix are bound only on the DilatedResNet path;
        ## both of their later uses sit behind the same startswith('DilatedResNet') test
        if modelSpecs['network'].startswith('DilatedResNet'):
            hwsz_matrix = modelSpecs['conv2d_hwszs']
            hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
            dilation_seq = [1] * len(n_hiddens_seq)
            dilation_matrix = modelSpecs['conv2d_dilations']
        else:
            hwsz_matrix = modelSpecs['halfWinSize_matrix']
            hwsz_seq = modelSpecs['halfWinSize_seq']

        ## masks to reduce impact of padding zeros
        self.mask_1d = mask_seq
        self.mask_2d = mask_matrix

        ## every layer is appended in construction order; params, paramL1 and paramL2
        ## are accumulated from this list at the end of __init__
        self.layers = []

        # sequence convolution
        ## any network name not starting with 'DilatedResNet' gets a plain ResNet here;
        ## a name starting with neither prefix is only rejected later, at the 2D stage
        if modelSpecs['network'].startswith('DilatedResNet'):
            seqConv = DilatedResNet(rng,
                                    input=seqInput,
                                    n_in=n_in_seq,
                                    n_hiddens=n_hiddens_seq,
                                    n_repeats=seq_repeats,
                                    halfWinSize=hwsz_seq,
                                    dilation=dilation_seq,
                                    mask=mask_seq,
                                    activation=modelSpecs['activation'],
                                    batchNorm=modelSpecs['batchNorm'],
                                    version=modelSpecs['network'])
        else:
            seqConv = ResNet(rng,
                             input=seqInput,
                             n_in=n_in_seq,
                             n_hiddens=n_hiddens_seq,
                             n_repeats=seq_repeats,
                             halfWinSize=hwsz_seq,
                             mask=mask_seq,
                             activation=modelSpecs['activation'],
                             batchNorm=modelSpecs['batchNorm'],
                             version=modelSpecs['network'])
        self.layers.append(seqConv)

        ## transform 1d sequence to 2d matrix
        ## seq2matrixMode is used as a mapping: membership tests select the modes and
        ## the mapped values are passed on as layer sizes
        seq2matrixMode = modelSpecs['seq2matrixMode']
        seq2matrixLayers = []
        embedLayers = []

        ## determine if we shall use the sequential features or not. The sequential features include sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility
        ## useSequentialFeatures is True by default
        ## NOTE(review): the expression below evaluates to False when the key
        ## 'UseSequentialFeatures' is absent, which contradicts the "True by default"
        ## statement above -- confirm which of the two is intended
        useSequentialFeatures = ('UseSequentialFeatures' in modelSpecs \
               and (modelSpecs['UseSequentialFeatures'] is True))

        ## use OuterConcatenation operation to convert sequence features into pairwise features
        ## NOTE(review): mask_matrix defaults to None but is dereferenced unconditionally
        ## in this branch (mask_matrix.shape, mask_matrix.dimshuffle)
        if 'OuterCat' in seq2matrixMode and useSequentialFeatures:
            ##midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
            midpointfeature, n_midpoint_out = MidpointFeature(
                seqConv.output, seqConv.n_out)

            ##remove noise in midpointfeature
            ## mask_matrix is used to reduce noise introduced by padding positions
            ## first pass: multiply the leading mask_matrix.shape[1] rows (axis 1) by the mask
            mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
            midpointfeature = T.set_subtensor(
                mid_subtensor,
                T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))
            ## second pass: the same for the leading columns (axis 2), using the mask with
            ## its last two axes swapped
            mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
            midpointfeature = T.set_subtensor(
                mid_subtensor2,
                T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

            ## here we use convolution with halfWinSize=0 to reduce model complexity
            compressLayer = Conv2D4DistMatrix(
                rng,
                input=midpointfeature,
                n_in=n_midpoint_out,
                n_hiddens=seq2matrixMode['OuterCat'],
                halfWinSize=0,
                mask=mask_matrix)
            seq2matrixLayers.append(compressLayer)

        ## embedding primary sequence and/or predicted secondary structure
        if embedInput is not None:
            from EmbeddingLayer import MetaEmbeddingLayer

            ## 'Seq+SS' takes precedence over 'SeqOnly' when both are present
            if 'Seq+SS' in seq2matrixMode:
                n_out_embed = seq2matrixMode['Seq+SS']
            elif 'SeqOnly' in seq2matrixMode:
                n_out_embed = seq2matrixMode['SeqOnly']
            else:
                print(
                    'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.'
                )
                sys.exit(-1)

            embedLayer = MetaEmbeddingLayer(embedInput,
                                            modelSpecs['n_in_embed'],
                                            n_out_embed)
            seq2matrixLayers.append(embedLayer)
            ## embedLayers is filled but not read again within this method
            embedLayers.append(embedLayer)

        self.layers += seq2matrixLayers

        ## the 2D stage sees the raw pairwise input followed by the output of every
        ## seq2matrix layer, joined along the feature axis (axis 3)
        input_2d = T.concatenate([matrixInput] +
                                 [layer.output for layer in seq2matrixLayers],
                                 axis=3)
        n_input2d = n_in_matrix + sum(
            [layer.n_out for layer in seq2matrixLayers])

        if modelSpecs['network'].startswith('ResNet'):
            matrixConv = ResNet(rng,
                                input=input_2d,
                                n_in=n_input2d,
                                n_hiddens=n_hiddens_matrix,
                                n_repeats=matrix_repeats,
                                halfWinSize=hwsz_matrix,
                                mask=mask_matrix,
                                activation=modelSpecs['activation'],
                                batchNorm=modelSpecs['batchNorm'],
                                version=modelSpecs['network'])
        elif modelSpecs['network'].startswith('DilatedResNet'):
            matrixConv = DilatedResNet(rng,
                                       input=input_2d,
                                       n_in=n_input2d,
                                       n_hiddens=n_hiddens_matrix,
                                       n_repeats=matrix_repeats,
                                       halfWinSize=hwsz_matrix,
                                       dilation=dilation_matrix,
                                       mask=mask_matrix,
                                       activation=modelSpecs['activation'],
                                       batchNorm=modelSpecs['batchNorm'],
                                       version=modelSpecs['network'])
        else:
            ## NOTE(review): with the Python 2 print statement this parenthesised pair
            ## is printed as a tuple -- confirm whether print_function is in effect
            print('Unimplemented deep network type: ', modelSpecs['network'])
            exit(-1)

        self.layers.append(matrixConv)

        conv_out = matrixConv.output

        ## flatten the 4D conv output to 2D for the predictors: the feature axis is moved
        ## to the front, the other three axes are collapsed into one, and the result is
        ## transposed so that the feature axis ends up last
        selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        n_in4logreg = matrixConv.n_out

        self.outputList = []
        self.output_probList = []
        self.predictors = []

        ## accumulators for the variance-specific parameters reported by NN4Normal predictors
        self.params4var = []
        self.paramL14var = 0
        self.paramL24var = 0

        ## one predictor per response, all fed with the same flattened conv output
        for res in modelSpecs['responses']:
            labelType = Response2LabelType(res)
            predictor = None

            if labelType.startswith('Discrete'):
                assert (config.responseValueDims[labelType] == 1)
                predictor = NN4LogReg(rng=rng,
                                      input=selected,
                                      n_in=n_in4logreg,
                                      n_out=config.responseProbDims[labelType],
                                      n_hiddens=n_hiddens_logreg)
            elif labelType.startswith('LogNormal') or labelType.startswith(
                    'Normal'):
                predictor = NN4Normal(
                    rng=rng,
                    input=selected,
                    n_in=n_in4logreg,
                    n_variables=config.responseValueDims[labelType],
                    n_out=config.responseProbDims[labelType],
                    n_hiddens=n_hiddens_logreg)
                ## recording parameters specific for variance prediction
                self.params4var += predictor.params4var
                self.paramL14var += predictor.paramL14var
                self.paramL24var += predictor.paramL24var
            else:
                print('incorrect response name or label type: ', res)
                exit(-1)

            self.layers.append(predictor)
            self.predictors.append(predictor)

            ## output in 2d matrix
            ## reshape the flat predictions back to the first three axes of conv_out, with
            ## responseValueDims / responseProbDims entries on the last axis
            output_2d = predictor.y_pred.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 config.responseValueDims[labelType]))
            output_2d_prob = predictor.output.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 config.responseProbDims[labelType]))

            self.outputList.append(output_2d)
            self.output_probList.append(output_2d_prob)

        ## the per-response outputs are joined along the last axis, in the order of modelSpecs['responses']
        self.output = T.concatenate(self.outputList, axis=3)
        self.output_prob = T.concatenate(self.output_probList, axis=3)

        ## collect all the model parameters and their norms
        self.params = []
        self.paramL2 = 0
        self.paramL1 = 0

        for layer in self.layers:
            self.params += layer.params
            self.paramL2 += layer.paramL2
            self.paramL1 += layer.paramL1
def AssembleOneBatch(data, modelSpecs, forTrainValidation=True):
    """Pack a list of per-protein feature dicts into one right-aligned batch.

    data is a list of dicts, each providing 'seqLen' and 'seqFeatures'; for
    training/validation each dict also provides one entry per label name and a
    'missing' entry. modelSpecs supplies 'responses' and, for responses whose
    name starts with 'DISO', the weight 'w4diso'.

    Returns None (after a warning) when data is empty. Otherwise returns the
    list [X1d, M1d] + Y + W where
      * X1d has shape (numSeqs, maxSeqLen, numFeatures); protein j occupies
        the trailing seqLen rows, the leading rows stay zero.
      * M1d has shape (numSeqs, maxSeqLen - minSeqLen); row j is set to 1 from
        column maxSeqLen - seqLen onwards.
      * Y and W hold one label array and one weight array per response and
        are empty when forTrainValidation is False.

    The process exits with status 1 when, for training/validation, the first
    protein lacks a label or the 'missing' entry (only data[0] is inspected).
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]),
                   dtype=theano.config.floatX)
    ## for mask
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)

    ## Y shall be a list of labels, each for one type
    ## we always need a weight vector to deal with residues without 3D coordinates in training and validation
    Y = []
    W = []
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        labelName = Response2LabelName(res)

        dataType = (np.int32 if labelType.startswith('Discrete') else
                    theano.config.floatX)
        if forTrainValidation:
            ## data is a list, so the protein name has to be read from data[0];
            ## the previous data['name'] raised a TypeError instead of reporting the problem
            if labelName not in data[0]:
                print('ERROR: label information is needed for training protein %s and response %s'
                      % (data[0]['name'], res))
                exit(1)
            Y.append(
                np.zeros(shape=(numSeqs, maxSeqLen,
                                config.responseValueDims[labelType]),
                         dtype=dataType))

            if 'missing' not in data[0]:
                print('ERROR: missing information is needed for training protein %s'
                      % data[0]['name'])
                exit(1)

            W.append(
                np.zeros(shape=(numSeqs, maxSeqLen, 1),
                         dtype=theano.config.floatX))

    for j, protein in enumerate(data):
        ## every protein is right-aligned: its rows start at this offset
        offset = maxSeqLen - protein['seqLen']
        X1d[j, offset:, :] = protein['seqFeatures']
        M1d[j, offset:].fill(1)

        for y, w, res in zip(Y, W, modelSpecs['responses']):
            y[j, offset:, ] = protein[Response2LabelName(res)]

            if res.startswith('DISO'):
                ## for disorder prediction, all the residues shall be considered since those residues without 3D coordinates are positive examples
                ## we may assign a larger weight to positive examples since they are only 6% of the whole data set
                w[j, offset:, ] = np.reshape(
                    protein['missing'],
                    (-1, 1)) * (modelSpecs['w4diso'] - 1.) + 1.
            else:
                ## assign weight 0 to those residues without coordinates, otherwise 1
                w[j, offset:, ] = 1.0 - np.reshape(protein['missing'], (-1, 1))

    onebatch = [X1d, M1d]
    onebatch.extend(Y)
    onebatch.extend(W)

    return onebatch
def LoadPropertyFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Load pickled protein records and assemble per-protein features and labels.

    files is a list of pickle files, each holding a sequence of protein dicts.
    modelSpecs selects the optional feature groups (keys 'UseSequenceEmbedding',
    'UsePSFM', 'UseOneHotEncoding', 'UseTemplate') and lists the 'responses'.

    Returns a list with one dict per protein holding 'name', 'sequence',
    'seqLen', 'seqFeatures' (all feature groups concatenated along axis 1 and
    cast to float32), one entry per available label name and, when the record
    has a 'Missing' field, 'missing'.

    When forTrainValidation is True a missing label or a missing 'Missing'
    field is fatal; otherwise such labels are skipped. Any fatal condition
    prints an error and exits with status 1.

    Side effect: a record without 'DISO' but with 'Missing' gets
    d['DISO'] = d['Missing'] written into it.
    """
    if files is None or len(files) == 0:
        print 'ERROR: the feature files is empty'
        exit(1)

    ## NOTE: unpickling can execute arbitrary code, so only trusted feature files shall be passed in
    rawProteins = []
    for path in files:
        with open(path, 'rb') as handle:
            rawProteins.extend(cPickle.load(handle))

    embeddingParams = None
    if modelSpecs.get('UseSequenceEmbedding'):
        paramFile = os.path.join(
            os.environ['DL4PropertyPredHome'], 'data',
            'Mofrad-PLoSOne-2015Nov.3GramEmbeddingParams.pkl')
        embeddingParams = SequenceEmbedding.LoadEmbeddingParamsInPKL(paramFile)

    ## which columns of tplProperties go with which response prefix; after the 8
    ## secondary structure columns come pACC, CNa, CNb, Phi, Psi, Theta and Tau
    templateColumns = (
        (('ACC', ), slice(8, 9)),
        (('Phi', 'Psi', 'CLE'), slice(11, 13)),
        (('Theta', 'Tau'), slice(13, 15)),
        (('CNa', 'CNb'), slice(9, 11)),
    )

    def letters2codes(codebook, letters):
        ## translate a sequence of label letters into a column vector of integer codes
        return np.array([codebook[c] for c in letters]).reshape((-1, 1))

    ## each protein has sequential features as input
    proteinFeatures = []

    for counter, d in enumerate(rawProteins, 1):
        oneprotein = {'name': d['name']}

        ## collecting sequential features; the order of the groups defines the feature layout
        groups = [d['PSSM']]

        ## sequence embedding features
        if embeddingParams is not None:
            groups.append(
                SequenceEmbedding.EmbedOneSequence(d['sequence'],
                                                   embeddingParams))

        if modelSpecs.get('UsePSFM'):
            groups.append(d['PSFM'])

        if modelSpecs.get('UseOneHotEncoding'):
            groups.append(config.SeqOneHotEncoding(d['sequence']))

        ## template similarity score and template structure properties
        if modelSpecs.get('UseTemplate'):
            if 'tplSimScore' not in d:
                print 'ERROR: no tplSimScore for target', d[
                    'name'], 'which is needed since you specify to use template information'
                exit(1)
            if d['tplSimScore'].shape[1] != 10:
                print 'ERROR: the number of query-template similarity features is not 10 in data for', d[
                    'name']
                exit(1)

            if 'tplProperties' not in d:
                print 'ERROR: no tplProperties for target', d[
                    'name'], 'which is needed since you specify to use template information'
                exit(1)

            if d['tplProperties'].shape[1] < 15:
                print 'ERROR: #template local structure properties shall be at least 15 for target', d[
                    'name']
                exit(1)

            ## the query-template similarity score shall be arranged in the order of: AA identity (binary), blosum80, blosum62, blosum45, spScore, spScore_ST, ppScore, pmScore, cc, hdsm
            groups.append(d['tplSimScore'])

            ## the omg information of the template is not used; the 8-state secondary structure always goes in,
            ## followed by the columns matching each response
            groups.append(d['tplProperties'][:, :8])
            for r in modelSpecs['responses']:
                for prefixes, columns in templateColumns:
                    if r.startswith(prefixes):
                        groups.append(d['tplProperties'][:, columns])
                        break
                else:
                    print 'ERROR: unsupported response', r
                    exit(1)

        if 'otherSeqFeatures' in d:
            groups.append(d['otherSeqFeatures'])

        ## every group has shape (seqLen, nFeatures) with the same seqLen and a variable nFeatures
        seqFeature = np.concatenate(groups, axis=1).astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature

        ## fall back to the Missing field as the disorder label
        if 'DISO' not in d and 'Missing' in d:
            d['DISO'] = d['Missing']

        ## collecting labels
        for r in modelSpecs['responses']:
            labelName = Response2LabelName(r)
            labelType = Response2LabelType(r)

            if labelName not in d:
                if forTrainValidation:
                    print 'ERROR: missing label information for protein ', d[
                        'name'], ' and response ', r
                    exit(1)
                else:
                    continue

            labels = d[labelName]

            ## discrete labels need to be turned into column vectors of numbers
            if labelType.startswith('Discrete'):
                if r.startswith('SS3'):
                    labels = letters2codes(PropertyUtils.SS3Letter2Code, labels)
                elif r.startswith('SS8'):
                    labels = letters2codes(PropertyUtils.SS8Letter2Code, labels)
                elif r.startswith(('ACC', 'DISO')):
                    labels = labels.reshape((-1, 1))
                elif r.startswith('CLE'):
                    labels = letters2codes(PropertyUtils.CLELetter2Code, labels)
                else:
                    print 'ERROR: please specify how to convert your discrete labels to numbers for response ', r
                    exit(1)

            oneprotein[labelName] = labels

        ## features and labels of this protein are complete; record which residues lack 3D coordinates
        if 'Missing' in d:
            oneprotein['missing'] = d['Missing']
        elif forTrainValidation:
            print 'ERROR: for training data, we need information to specify which residues have no 3D coordinates'
            exit(1)

        proteinFeatures.append(oneprotein)

        ## progress report after the 1st, 501st, 1001st ... protein
        if counter % 500 == 1:
            print 'assembled features and labels for ', counter, ' proteins.'

    ## NOTE: the original carried a disabled (string-quoted) snippet that dumped
    ## proteinFeatures to files[0] + '.contactInput.pkl'; it never executed.

    return proteinFeatures
Esempio n. 18
0
def AssembleOneBatch(data, modelSpecs):
    """Pack a list of per-protein feature dicts into one zero-padded batch.

    Each protein is right-aligned inside the padded arrays: a protein of
    length ``seqLen`` is written into the trailing ``seqLen`` positions of
    every padded axis, so the zero padding sits at the front.

    Args:
        data: list of dicts, each providing 'seqLen', 'seqFeatures' and
            'matrixFeatures', and optionally 'embedFeatures' and
            'atomLabelMatrix' (a mapping indexed by response).
        modelSpecs: mapping; 'responses' is read when labels are present and
            'UseSampleWeight' decides whether weight matrices are built.

    Returns:
        None when ``data`` is empty. Otherwise a list starting with
        ``[X1d, X2d, M1d, M2d]``, followed by ``X1dem`` when the first
        protein has 'embedFeatures', then one label array per response
        (when the first protein has 'atomLabelMatrix') and one weight array
        per response (when labels exist and 'UseSampleWeight' is truthy).
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)
    floatX = theano.config.floatX
    first = data[0]

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, first['seqFeatures'].shape[1]),
                   dtype=floatX)
    X2d = np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen,
                          first['matrixFeatures'].shape[2]),
                   dtype=floatX)

    X1dem = None
    if 'embedFeatures' in first:
        X1dem = np.zeros(shape=(numSeqs, maxSeqLen,
                                first['embedFeatures'].shape[1]),
                         dtype=floatX)

    ## Y is a list of label arrays, one per response; discrete labels are
    ## stored as int16 and all other label types as floatX
    Y = []
    if 'atomLabelMatrix' in first:
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            dataType = np.int16 if labelType.startswith('Discrete') else floatX
            rValDims = config.responseValueDims[labelType]
            if rValDims == 1:
                labelShape = (numSeqs, maxSeqLen, maxSeqLen)
            else:
                ## vector-valued responses get a trailing value axis
                labelShape = (numSeqs, maxSeqLen, maxSeqLen, rValDims)
            Y.append(np.zeros(shape=labelShape, dtype=dataType))

    ## when Y is empty, weight is useless, so weights are built only with labels.
    ## Each response needs its OWN array: `[array] * n` would alias one array
    ## n times and the last response's weights would overwrite all the others.
    weightMatrix = []
    if Y and modelSpecs['UseSampleWeight']:
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen), dtype=floatX)
            for _ in modelSpecs['responses']
        ]

    ## masks only span the first (maxSeqLen - minSeqLen) positions, i.e. the
    ## only region where any protein in this batch can contain padding
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)
    M2d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen, maxSeqLen),
                   dtype=np.int8)

    for j, protein in enumerate(data):
        ## index of the first real (non-padding) residue of this protein
        start = maxSeqLen - protein['seqLen']

        X1d[j, start:, :] = protein['seqFeatures']
        X2d[j, start:, start:, :] = protein['matrixFeatures']
        M1d[j, start:].fill(1)
        M2d[j, start:, start:].fill(1)

        if X1dem is not None:
            X1dem[j, start:, :] = protein['embedFeatures']

        if Y:
            for y, response in zip(Y, modelSpecs['responses']):
                ## the same slice works for 3-d and 4-d label arrays since
                ## the trailing value axis is taken whole
                y[j, start:, start:] = protein['atomLabelMatrix'][response]

        if weightMatrix:
            ## we calculate the labelWeightMatrix here
            labelWeightMatrix = CalcLabelWeightMatrix(
                protein['atomLabelMatrix'], modelSpecs)
            for w, response in zip(weightMatrix, modelSpecs['responses']):
                w[j, start:, start:] = labelWeightMatrix[response]

    onebatch = [X1d, X2d, M1d, M2d]

    if X1dem is not None:
        onebatch.append(X1dem)

    onebatch.extend(Y)
    onebatch.extend(weightMatrix)

    return onebatch
    def __init__(self,
                 rng,
                 seqInput,
                 matrixInput,
                 mask_seq=None,
                 mask_matrix=None,
                 embedInput=None,
                 boundingbox=None,
                 modelSpecs=None):
        """Build the symbolic network: 1-d convolution over sequence features,
        conversion to pairwise features, 2-d convolution, and one predictor
        per response.

        Args:
            rng: random state handed to every layer that is constructed.
            seqInput: has shape (batchSize, seqLen, n_in_seq).
            matrixInput: has shape (batchSize, seqLen, seqLen, n_in_matrix).
            mask_seq: has shape (batchSize, #cols_to_be_masked).
            mask_matrix: has shape (batchSize, #rows_to_be_masked, seqLen).
            embedInput: has shape (batchSize, seqLen, n_in2); when it is not
                None an embedding layer is added.
            boundingbox: a vector of 4 integer elements: top, left, bottom
                and right. It shall only be applied to the matrix converted
                from sequential features.
            modelSpecs: mapping with the model configuration; required.

        Raises:
            AssertionError: when modelSpecs is None.

        Calls exit(1) after printing a message when the embedding mode, the
        network type or a response label type is not supported.
        """

        assert (modelSpecs is not None)

        self.modelSpecs = modelSpecs
        self.responses = modelSpecs['responses']

        ## set the number of hidden neurons and number of layers
        n_in_seq = modelSpecs['n_in_seq']
        n_in_matrix = modelSpecs['n_in_matrix']
        n_hiddens_seq = modelSpecs['conv1d_hiddens']
        n_hiddens_matrix = modelSpecs['conv2d_hiddens']
        n_hiddens_logreg = modelSpecs['logreg_hiddens']
        seq_repeats = modelSpecs['conv1d_repeats']
        matrix_repeats = modelSpecs['conv2d_repeats']

        useDilated = modelSpecs['network'].startswith('DilatedResNet')

        ## half win size for convolutional operation
        if useDilated:
            hwsz_matrix = modelSpecs['conv2d_hwszs']
            hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
            dilation_seq = [1] * len(n_hiddens_seq)
            dilation_matrix = modelSpecs['conv2d_dilations']
        else:
            hwsz_matrix = modelSpecs['halfWinSize_matrix']
            hwsz_seq = modelSpecs['halfWinSize_seq']

        ## masks to reduce impact of padding zeros
        self.mask_1d = mask_seq
        self.mask_2d = mask_matrix

        self.layers = []

        act = T.nnet.relu
        if modelSpecs['activation'] == 'TANH':
            act = T.tanh

        ## sequence convolution. Any network name that is not a
        ## DilatedResNet falls back to the plain ResNet here; unsupported
        ## names are only rejected later, at the matrix convolution.
        if useDilated:
            seqConv = DilatedResNet(rng,
                                    input=seqInput,
                                    n_in=n_in_seq,
                                    n_hiddens=n_hiddens_seq,
                                    n_repeats=seq_repeats,
                                    halfWinSize=hwsz_seq,
                                    dilation=dilation_seq,
                                    mask=mask_seq,
                                    activation=act,
                                    modelSpecs=modelSpecs)
        else:
            seqConv = ResNet(rng,
                             input=seqInput,
                             n_in=n_in_seq,
                             n_hiddens=n_hiddens_seq,
                             n_repeats=seq_repeats,
                             halfWinSize=hwsz_seq,
                             mask=mask_seq,
                             activation=act,
                             batchNorm=modelSpecs['batchNorm'],
                             version=modelSpecs['network'])
        self.layers.append(seqConv)

        ## transform 1d sequence to 2d matrix
        seq2matrixMode = modelSpecs['seq2matrixMode']
        seq2matrixLayers = []
        embedLayers = []

        ## use OuterConcatenation operation to convert sequence features into pairwise features
        if 'OuterCat' in seq2matrixMode and config.UseSequentialFeatures:

            ## midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
            midpointfeature, n_midpoint_out = MidpointFeature(seqConv.output,
                                                              seqConv.n_out,
                                                              box=boundingbox)

            ## remove noise in midpointfeature:
            ## mask_matrix is used to reduce noise introduced by padding positions,
            ## first along the rows and then (transposed) along the columns
            mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
            midpointfeature = T.set_subtensor(
                mid_subtensor,
                T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))
            mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
            midpointfeature = T.set_subtensor(
                mid_subtensor2,
                T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

            ## here we use convolution with halfWinSize=0 to reduce model complexity
            compressLayer = Conv2D4DistMatrix(
                rng,
                input=midpointfeature,
                n_in=n_midpoint_out,
                n_hiddens=seq2matrixMode['OuterCat'],
                halfWinSize=0,
                mask=mask_matrix)
            seq2matrixLayers.append(compressLayer)

        ## embedding primary sequence and/or predicted secondary structure
        if embedInput is not None:
            from EmbeddingLayer import EmbeddingLayer4AllRange

            if 'Seq+SS' in seq2matrixMode:
                n_out_embed = seq2matrixMode['Seq+SS']
            elif 'SeqOnly' in seq2matrixMode:
                n_out_embed = seq2matrixMode['SeqOnly']
            else:
                print('At least one of two embedding modes Seq+SS or SeqOnly shall be specified.')
                exit(1)

            embedLayer = EmbeddingLayer4AllRange(embedInput,
                                                 modelSpecs['n_in_embed'],
                                                 n_out_embed,
                                                 box=boundingbox)
            seq2matrixLayers.append(embedLayer)
            embedLayers.append(embedLayer)

        ## the sequence-profile embedding is not used any more

        self.layers += seq2matrixLayers

        bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
            modelSpecs)
        if (bUseCCMraw or bUseFullMI
                or bUseFullCov) and config.CompressMatrixInput(modelSpecs):
            ## here we add a compress layer to reduce the #channels of the original matrix input.
            ## NOTE(review): this layer is never appended to self.layers, so its
            ## parameters are not collected into self.params below -- confirm
            ## whether that is intended before changing it, since the order
            ## and number of parameters must match saved models.
            compressLayer4MatrixInput = Conv2D4DistMatrix(
                rng,
                input=matrixInput,
                n_in=n_in_matrix,
                n_hiddens=modelSpecs['hiddens4MatrixCompress'],
                halfWinSize=0,
                mask=mask_matrix)
            pairInput = compressLayer4MatrixInput.output
            n_pairInput = compressLayer4MatrixInput.n_out
        else:
            ## merge the original matrix input and the sequential input directly
            pairInput = matrixInput
            n_pairInput = n_in_matrix

        input_2d = T.concatenate(
            [pairInput] + [layer.output for layer in seq2matrixLayers],
            axis=3)
        n_input2d = n_pairInput + sum(
            [layer.n_out for layer in seq2matrixLayers])

        if modelSpecs['network'].startswith('ResNet'):
            matrixConv = ResNet(rng,
                                input=input_2d,
                                n_in=n_input2d,
                                n_hiddens=n_hiddens_matrix,
                                n_repeats=matrix_repeats,
                                halfWinSize=hwsz_matrix,
                                mask=mask_matrix,
                                activation=act,
                                batchNorm=modelSpecs['batchNorm'],
                                version=modelSpecs['network'])

        elif useDilated:
            matrixConv = DilatedResNet(rng,
                                       input=input_2d,
                                       n_in=n_input2d,
                                       n_hiddens=n_hiddens_matrix,
                                       n_repeats=matrix_repeats,
                                       halfWinSize=hwsz_matrix,
                                       dilation=dilation_matrix,
                                       mask=mask_matrix,
                                       activation=act,
                                       modelSpecs=modelSpecs)
        else:
            ## the message is one string so that it prints the same text
            ## under both the print statement and the print function
            print('ERROR: Unimplemented deep network type:  ' +
                  str(modelSpecs['network']))
            exit(1)

        self.layers.append(matrixConv)

        conv_out = matrixConv.output

        ## flatten the first three axes so that every (batch, row, col)
        ## entry becomes one row with n_out features for the predictors
        selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        n_in4logreg = matrixConv.n_out

        self.outputList = []
        self.output_probList = []
        self.predictors = []

        self.params4var = []
        self.paramL14var = 0
        self.paramL24var = 0

        for res in modelSpecs['responses']:

            labelType = Response2LabelType(res)
            predictor = None

            if labelType.startswith('Discrete'):
                assert GetResponseValueDims(res) == 1
                predictor = NN4LogReg(rng=rng,
                                      input=selected,
                                      n_in=n_in4logreg,
                                      n_out=GetResponseProbDims(res),
                                      n_hiddens=n_hiddens_logreg)

            elif labelType.startswith('LogNormal') or labelType.startswith(
                    'Normal'):
                predictor = NN4Normal(rng=rng,
                                      input=selected,
                                      n_in=n_in4logreg,
                                      n_variables=GetResponseValueDims(res),
                                      n_out=GetResponseProbDims(res),
                                      n_hiddens=n_hiddens_logreg)

                ## recording parameters specific for variance prediction
                self.params4var += predictor.params4var
                self.paramL14var += predictor.paramL14var
                self.paramL24var += predictor.paramL24var

            else:
                print('incorrect response name or label type:  ' + str(res))
                exit(1)

            self.layers.append(predictor)
            self.predictors.append(predictor)

            ## output in 2d matrix: restore the (batch, row, col) axes
            output_2d = predictor.y_pred.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 GetResponseValueDims(res)))
            output_2d_prob = predictor.output.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 GetResponseProbDims(res)))

            self.outputList.append(output_2d)
            self.output_probList.append(output_2d_prob)

        self.output = T.concatenate(self.outputList, axis=3)
        self.output_prob = T.concatenate(self.output_probList, axis=3)

        ## collect all the model parameters and their norms
        self.params = []
        self.paramL2 = 0
        self.paramL1 = 0

        for layer in self.layers:
            self.params += layer.params
            self.paramL2 += layer.paramL2
            self.paramL1 += layer.paramL1
        """
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic input variables and the distance predictor.

    Args:
        modelSpecs: model configuration passed on to ResNet4DistMatrix and to
            the config helper functions.
        forTrain: when True, label, weight, bounding-box and reference-loss
            variables are created and returned as well.

    Returns:
        For training: (distancePredictor, x, y, xmask, ymask, xem, labelList,
        weightList, box, trainByRefLoss). For prediction only the first six
        entries. xem is None when embedding is not used; trainByRefLoss is
        None unless config.TrainByRefLoss(modelSpecs) holds.
    """
    randomState = np.random.RandomState()

    ## sequential features and matrix (or pairwise) features
    seqVar = T.tensor3('x')
    pairVar = T.tensor4('y')

    ## masks for the sequential and the pairwise features, respectively
    seqMask = T.bmatrix('xmask')
    pairMask = T.btensor3('ymask')

    embedVar = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for crop of a big protein distance matrix. This box allows crop at any position.
    cropBox = T.ivector('boundingbox') if forTrain else None

    ## trainByRefLoss can be either 1 or -1. When this variable exists, we train the model using both reference loss and the loss of real data
    refLossFlag = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        refLossFlag = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(randomState,
                                          seqInput=seqVar,
                                          matrixInput=pairVar,
                                          mask_seq=seqMask,
                                          mask_matrix=pairMask,
                                          embedInput=embedVar,
                                          boundingbox=cropBox,
                                          modelSpecs=modelSpecs)

    ## for prediction, neither labels nor weights are needed
    if not forTrain:
        return distancePredictor, seqVar, pairVar, seqMask, pairMask, embedVar

    ## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
    labelList = []
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        rValDims = GetResponseValueDims(response)
        isDiscrete = labelType.startswith('Discrete')

        ## a vector-valued response needs a 4-d tensor; the w* tensors hold
        ## 16bit integers and are used for discrete labels
        if rValDims > 1:
            makeLabel = T.wtensor4 if isDiscrete else T.tensor4
        else:
            makeLabel = T.wtensor3 if isDiscrete else T.tensor3
        labelList.append(makeLabel('Tlabel4' + response))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        for response in modelSpecs['responses']:
            weightList.append(T.tensor3('Tweight4' + response))

    return (distancePredictor, seqVar, pairVar, seqMask, pairMask, embedVar,
            labelList, weightList, cropBox, refLossFlag)
Esempio n. 21
0
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Load pickled protein records and assemble per-protein features/labels.

    Args:
        files: list of paths to pickled files; each file is expected to
            unpickle to a sequence of per-protein dicts.
        modelSpecs: mapping of feature switches ('UseSS', 'UseACC',
            'UsePSSM', 'UseDisorder', 'UseMPSpecificFeatures', 'UseTemplate',
            'UseCCM', 'UsePSICOV', 'UseOtherPairs'), plus 'responses' and
            'seq2matrixMode'. A missing switch is treated as switched off.
        forTrainValidation: when True, every protein must provide
            'atomDistMatrix'.

    Returns:
        A list of dicts, one per protein, with keys 'name', 'sequence',
        'seqLen', 'seqFeatures', 'matrixFeatures' and, depending on the
        configuration and the data, 'embedFeatures' and 'atomLabelMatrix'.

    Prints a message and calls exit(-1) when the file list is empty or when
    a required feature or label is missing from the data.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    def flagOn(name):
        ## a switch counts only when it is present and literally True
        return name in modelSpecs and modelSpecs[name] is True

    def useTemplate():
        return bool('UseTemplate' in modelSpecs and modelSpecs['UseTemplate'])

    ## NOTE(review): unpickling can execute arbitrary code, so only trusted
    ## feature files shall be loaded here.
    ## Each file is closed as soon as it is read, even when loading fails.
    data = []
    for path in files:
        with open(path, 'rb') as fh:
            data.extend(cPickle.load(fh, encoding='latin1'))

    ## each protein has sequential and  pairwise features as input and distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        ## the order of the appended features fixes the column layout
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features, why?
        if flagOn('UseSS'):
            seqMatrices.append(d['SS3'])

        if flagOn('UseACC'):
            seqMatrices.append(d['ACC'])

        if flagOn('UsePSSM'):
            seqMatrices.append(d['PSSM'])

        if flagOn('UseDisorder'):
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        if flagOn('UseMPSpecificFeatures'):
            for mpFeature in ('MemAcc', 'MemTopo'):
                if mpFeature not in d:
                    print('The data does not have a feature called ' +
                          mpFeature)
                    exit(-1)
                seqMatrices.append(d[mpFeature])

        ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        if useTemplate():
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)
            seqMatrices.append(d['tplSimScore'])
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []
        ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 )
        pairfeatures.append(LocationFeature(d))
        pairfeatures.append(CubeRootFeature(d))

        if flagOn('UseCCM'):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        ## like every other switch, a missing 'UsePSICOV' means "not used"
        ## instead of raising KeyError
        if flagOn('UsePSICOV'):
            pairfeatures.append(d['psicovZ'])

        if flagOn('UseOtherPairs'):
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification later
        ## somewhere we shall also write code to add template-related sequential features such as secondary structure?
        if useTemplate():
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                ## distances below 3.5 are clamped to 3.5 (strength 1);
                ## negative (invalid) entries are then set to 50 so that
                ## their strength becomes small
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed order. This can avoid errors introduced by different ordering of keys in a python dict() structure
            for apt in ('CbCb', 'CgCg', 'CaCg', 'CaCa', 'NO'):
                pairfeatures.extend(tmpPairFeatures[apt])

        matrixFeature = np.dstack(tuple(pairfeatures))
        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = matrixFeature.astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            labelMatrices = dict()
            oneprotein['atomLabelMatrix'] = labelMatrices

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they are binary matrices
                    if responseName.startswith(
                            'HB') or responseName.startswith('Beta'):
                        labelMatrices[response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        labelMatrices[response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrices[response] = DistanceUtils.LogDistMatrix(
                        distm)

                elif labelType.startswith('Normal'):
                    labelMatrices[response] = distm
                else:
                    ## report the offending response (the name `res` used
                    ## here before does not exist in this function)
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
    """Predict distance probability matrices with one or more models and average them.

    Every file in modelFiles is unpickled into one model specification (a dict).
    For each model a Theano prediction function is compiled, the proteins in
    predFiles are loaded and predicted batch by batch, and the per-protein,
    per-response outputs are summed over models. The sums are divided by the
    number of contributing models, symmetrized where config.IsSymmetricAPT says
    so, converted into contact matrices, and written to one pickle file per protein.

    Args:
        modelFiles: paths of the pickled model specifications, one model per file.
        predFiles: passed unchanged to DataProcessor.LoadDistanceFeatures.
        savefolder: folder receiving '<name>.predictedDistMatrix.pkl'; when None
            the files are written relative to the current working directory.

    Returns:
        A tuple (finalresults, predictedContactMatrices, allsequences) where
        finalresults[name][response] is the averaged 3-d prediction,
        predictedContactMatrices[name][apt] is the derived 2-d contact matrix and
        allsequences[name] is the 'sequence' entry of that protein.

    Side effects:
        Writes one pickle file per predicted protein and calls exit(-1) when the
        models or the input data are inconsistent.
    """
    models = _LoadModels4Prediction(modelFiles)
    _CheckLabelTypeConsistency(models)

    allsequences = dict()

    ## allresults is a nested dictionary: allresults[proteinName][response] = sum of the predicted prob matrices.
    ## numModels[proteinName][response] counts how many models contributed to that sum.
    ## Two different models may share some overlapping responses.
    allresults = dict()
    numModels = dict()
    for model, mfile in zip(models, modelFiles):
        _AccumulatePredictions4OneModel(model, mfile, predFiles, allsequences, allresults, numModels)
        ## the compiled function and the loaded data of this model are unreachable now
        gc.collect()

    finalresults = _AveragePredictions(allresults, numModels)
    finalLabelDistributions, finalLabelWeights = _AverageLabelStatistics(models)
    predictedContactMatrices = _Prob2ContactMatrices(finalresults)

    _SavePredictions(finalresults, predictedContactMatrices, allsequences,
                     finalLabelWeights, finalLabelDistributions, savefolder)

    return finalresults, predictedContactMatrices, allsequences


def _LoadModels4Prediction(modelFiles):
    """Unpickle and return the model specification stored in each file of modelFiles."""
    models = []
    for mFile in modelFiles:
        ## NOTE(security): unpickling can execute arbitrary code, so only load model files from trusted sources
        with open(mFile, 'rb') as fh:
            models.append(cPickle.load(fh))
    return models


def _CheckLabelTypeConsistency(models):
    """Exit unless all the models use the same label type for the same label name."""
    labelTypes = dict()
    for model in models:
        for response in model['responses']:
            labelName = Response2LabelName(response)
            labelType = Response2LabelType(response)
            if labelName not in labelTypes:
                labelTypes[labelName] = labelType
            elif labelTypes[labelName] != labelType:
                print('WARNING: at least two models have different label types for the same atom pair type.')
                exit(-1)


def _AccumulatePredictions4OneModel(model, mfile, predFiles, allsequences, allresults, numModels):
    """Run one model on all the proteins in predFiles and add its output to allresults.

    allsequences, allresults and numModels are updated in place. mfile is only
    used in the error message printed when the parameter values do not fit the model.
    """
    if model['network'] not in config.allNetworks:
        print('unsupported network architecture: ', model['network'])
        exit(-1)

    distancePredictor, x, y, xmask, ymask, xem, _, _ = Model4DistancePrediction.BuildModel(model, forTrain=False)

    inputVariables = [x, y, xmask, ymask]
    if xem is not None:
        inputVariables.append(xem)

    predict = theano.function(inputVariables, distancePredictor.output_prob, on_unused_input='warn')

    ## set model parameter values
    if not Compatible(distancePredictor.params, model['paramValues']):
        print('FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file: ', mfile)
        exit(-1)

    for param, value in zip(distancePredictor.params, model['paramValues']):
        param.set_value(value)

    ## We shall load these files for each model separately since each model may have different requirement of the data
    predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs=model, forTrainValidation=False)

    ## make sure the input has the same number of features as the model. We do random check here to speed up.
    ## NOTE: np.random.randint raises ValueError when predData is empty
    rindex = np.random.randint(0, high=len(predData))
    assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

    rindex = np.random.randint(0, high=len(predData))
    assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

    if 'embedFeatures' in predData[0]:
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

    ## check if all the proteins of the same name have exactly the same sequence
    for d in predData:
        name = d['name']
        if name not in allsequences:
            allsequences[name] = d['sequence']
        elif allsequences[name] != d['sequence']:
            print('Error: inconsistent primary sequence for the same protein in the protein feature files')
            exit(-1)

    ## predSeqData and names are in exactly the same order, so we know which data is for which protein
    predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
    print('#predData: ', len(predData), '#batches: ', len(predSeqData))

    ## The last dimension of a prediction is the concatenation of the predicted prob parameters of all
    ## the responses in this model. The start and end position of each response depend only on the
    ## model, so they are calculated once instead of once per batch.
    dims = [config.responseProbDims[Response2LabelType(res)] for res in model['responses']]
    endPositions = np.cumsum(dims)
    startPositions = endPositions - dims

    for onebatch, names4onebatch in zip(predSeqData, names):
        batchInput = onebatch[: len(inputVariables)]
        result = predict(*batchInput)

        x1d, x1dmask = batchInput[0], batchInput[2]
        maxSeqLen = x1d.shape[1]
        seqLens = maxSeqLen - x1dmask.shape[1] + np.sum(x1dmask, axis=1)

        ## result is a 4-d tensor
        assert result.shape[3] == sum(dims)

        for name in names4onebatch:
            if name not in allresults:
                allresults[name] = dict()
                numModels[name] = dict()

        for response, start, end in zip(model['responses'], startPositions, endPositions):
            ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are contact/distance matrix
            ## sizes and the 4th is for the predicted probability parameters
            batchres = result[:, :, :, start:end]

            for probMatrix, seqLen, name in zip(batchres, seqLens, names4onebatch):
                ## remove masked positions: only the trailing seqLen rows and columns are kept
                res4one = probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]

                if response not in allresults[name]:
                    allresults[name][response] = res4one
                    numModels[name][response] = np.int32(1)
                else:
                    ## here we save only sum to reduce memory consumption, which could be huge when
                    ## many deep models are used to predict a large set of proteins
                    allresults[name][response] += res4one
                    numModels[name][response] += np.int32(1)


def _AveragePredictions(allresults, numModels):
    """Divide each summed prediction by its model count and symmetrize it when applicable."""
    finalresults = dict()
    for name, results in allresults.items():
        averaged = dict()
        for response, summed in results.items():
            avg = summed / numModels[name][response]

            ## make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            if config.IsSymmetricAPT(Response2LabelName(response)):
                avg = (avg + np.transpose(avg, (1, 0, 2))) / 2.

            averaged[response] = avg
        finalresults[name] = averaged
    return finalresults


def _AverageLabelStatistics(models):
    """Return (label distributions, label weights), each averaged per response over all the models."""
    labelDistributions = dict()
    labelWeights = dict()
    for model in models:
        for response in model['responses']:
            labelDistributions.setdefault(response, []).append(model['labelRefProbs'][response])
            labelWeights.setdefault(response, []).append(model['weight4labels'][response])

    finalLabelDistributions = dict((apt, np.average(v, axis=0)) for apt, v in labelDistributions.items())
    finalLabelWeights = dict((apt, np.average(v, axis=0)) for apt, v in labelWeights.items())
    return finalLabelDistributions, finalLabelWeights


def _Prob2ContactMatrices(finalresults):
    """Convert each 3-d predicted prob matrix into a 2-d predicted contact matrix keyed by label name."""
    from scipy.stats import norm

    predictedContactMatrices = dict()
    for name, results in finalresults.items():
        contacts = dict()
        predictedContactMatrices[name] = contacts

        for response, prob in results.items():
            apt = Response2LabelName(response)
            labelType = Response2LabelType(response)

            if apt in config.allAtomPairTypes:
                if labelType.startswith('Discrete'):
                    ## sum the probability of all the labels before the one returned for config.ContactDefinition
                    subType = labelType[len('Discrete'):]
                    labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
                    contacts[apt] = np.sum(prob[:, :, :labelOf8], axis=2)
                elif labelType.startswith('Normal'):
                    assert labelType.startswith('Normal1d2')
                    normDistribution = norm(loc=prob[:, :, 0], scale=prob[:, :, 1])
                    contacts[apt] = normDistribution.cdf(config.ContactDefinition)
                elif labelType.startswith('LogNormal'):
                    assert labelType.startswith('LogNormal1d2')
                    normDistribution = norm(loc=prob[:, :, 0], scale=prob[:, :, 1])
                    contacts[apt] = normDistribution.cdf(np.log(config.ContactDefinition))
                else:
                    print('unsupported label type in response: ', response)
                    exit(-1)

            elif apt in ['HB', 'Beta']:
                contacts[apt] = prob[:, :, 0]
            else:
                print('unsupported atom type in response: ', response)
                exit(-1)

    return predictedContactMatrices


def _SavePredictions(finalresults, predictedContactMatrices, allsequences, finalLabelWeights, finalLabelDistributions, savefolder):
    """Write one '<name>.predictedDistMatrix.pkl' file per protein.

    Each file holds the tuple (name, sequence, predicted prob matrices,
    predicted contact matrices, label weights, label distributions).
    """
    for name, results in finalresults.items():
        savefilename = name + '.predictedDistMatrix.pkl'
        if savefolder is not None:
            savefilename = os.path.join(savefolder, savefilename)

        record = (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions)
        with open(savefilename, 'wb') as fh:
            cPickle.dump(record, fh, protocol=cPickle.HIGHEST_PROTOCOL)
Esempio n. 23
0
	def __init__(self, rng, seqInput, mask_seq=None, modelSpecs = None):
		"""
		seqInput has shape (batchSize, seqLen, n_in_seq)
		mask_seq has shape (batchSize, #cols_to_be_masked)
       		"""

		self.modelSpecs = modelSpecs	
		n_in_seq = modelSpecs['n_in_seq']
		n_hiddens_seq = modelSpecs['conv1d_hiddens']
		seq_repeats = modelSpecs['conv1d_repeats']
		n_hiddens_logreg = modelSpecs['logreg_hiddens']

		hwsz_seq=modelSpecs['halfWinSize_seq']
	
		self.mask_1d = mask_seq
		self.layers = []
        
		# sequence convolution 

		if modelSpecs['network'].startswith('ResNet'):
        		seqConv = ResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, mask=mask_seq, activation=modelSpecs['activation'], batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
		else:
			print 'Unimplemented deep network type: ', modelSpecs['network']
			exit(-1)

		self.layers.append(seqConv)

		## conv_out has shape (batchSize, seqLen, seqConv.n_out)
        	conv_out = seqConv.output

		##flatten all
        	selected = conv_out.dimshuffle(2, 0, 1).flatten(2).dimshuffle(1, 0) 
		n_in4logreg = seqConv.n_out 

		self.outputList = []
		self.output_probList = []
		self.predictors = []

		self.params4var = []
		self.paramL14var = 0
		self.paramL24var = 0

		for res in modelSpecs['responses']:
			labelType = Response2LabelType(res)
			predictor = None

			if labelType.startswith('vonMise'):
				assert (config.responseValueDims[labelType] == 2)
            			predictor = NN4PhiPsi(rng=rng, input=selected, n_in=n_in4logreg, n_variables=config.responseValueDims[labelType], n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
				self.params4var += predictor.params4var
				self.paramL14var += predictor.paramL14var
				self.paramL24var += predictor.paramL24var

			elif labelType.startswith('Gauss'):
            			predictor = NN4Normal(rng=rng, input=selected, n_in=n_in4logreg, n_variables=config.responseValueDims[labelType], n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
				self.params4var += predictor.params4var
				self.paramL14var += predictor.paramL14var
				self.paramL24var += predictor.paramL24var

			elif labelType.startswith('Discrete'):
				assert (config.responseValueDims[labelType] == 1)
            			predictor = NN4LogReg(rng=rng, input=selected, n_in=n_in4logreg, n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)

			else:
				print 'incorrect response name or label type: ', res
				exit(-1)
				
	    		self.layers.append(predictor)
			self.predictors.append(predictor)

            		## output 
	    		y_pred = predictor.y_pred.reshape( (conv_out.shape[0], conv_out.shape[1], config.responseValueDims[labelType]) )
	    		output_prob = predictor.output.reshape( (conv_out.shape[0], conv_out.shape[1], config.responseProbDims[labelType]) )

	    		self.outputList.append( y_pred )
	    		self.output_probList.append( output_prob )

		## y_pred is the predicted target value
		## output_prob contains information for probability distribution of a target value
		self.y_pred = T.concatenate( self.outputList, axis=2 )
		self.output4prob = T.concatenate( self.output_probList, axis=2 ) 

        	self.params = []
		self.paramL1 = 0
		self.paramL2 = 0

		for layer in self.layers:
	    		self.params  += layer.params
	    		self.paramL1 += layer.paramL1
	    		self.paramL2 += layer.paramL2