import os
import math
import numpy as np
import cPickle

## the exact import paths of the project modules below are assumptions; config,
## DataProcessor, FeatureUtils and helpers such as SampleBoundingBox,
## LoadProteinData4OneModel, GetResponseProbDims, Response2LabelName,
## DeriveContactMatrix and CollectLabelWeightNDistribution are expected to be
## provided by the surrounding package
import config
import DataProcessor
import FeatureUtils


def TrainByOneBatch(batch, train, modelSpecs, forRefState=False):

    ## batch is a list of protein locations, so we need to load the real data here
    minibatch = DataProcessor.LoadRealData(batch, modelSpecs)

    ## make sure the data has the same input dimension as the model specification
    FeatureUtils.CheckModelNDataConsistency(modelSpecs, minibatch)

    onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
        minibatch, modelSpecs, forRefState=forRefState)
    x1d, x2d, x1dmask, x2dmask = onebatch[0:4]

    ## crop a large protein to deal with limited GPU memory; for sequential and
    ## embedding features, the theano model itself crops based upon the bounding box
    bounds = SampleBoundingBox((x2d.shape[1], x2d.shape[2]),
                               modelSpecs['maxbatchSize'])
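    ## bounds has the form [top, left, bottom, right] (cf. the box construction
    ## in PrepareInput4Validate below), and the window holds at most
    ## modelSpecs['maxbatchSize'] residue pairs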

    #x1d_new = x1d[:, bounds[1]:bounds[3], :]
    x1d_new = x1d
    x2d_new = x2d[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :]
    #x1dmask_new = x1dmask[:, bounds[1]:x1dmask.shape[1] ]
    x1dmask_new = x1dmask
    x2dmask_new = x2dmask[:, bounds[0]:x2dmask.shape[1], bounds[1]:bounds[3]]
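    ## note: x2dmask spans only the first maxSeqLen - minSeqLen rows of each
    ## matrix (see the seqLens calculation in PredictMatrixLabels below), so its
    ## row slice ends at x2dmask.shape[1] rather than at bounds[2]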

    input = [x1d_new, x2d_new, x1dmask_new, x2dmask_new]

    ## if embedding is used
    ##if any( k in modelSpecs['seq2matrixMode'] for k in ('SeqOnly', 'Seq+SS') ):
    if config.EmbeddingUsed(modelSpecs):
        embed = onebatch[4]
        #embed_new = embed[:, bounds[1]:bounds[3], : ]
        embed_new = embed
        input.append(embed_new)

        remainings = onebatch[5:]
    else:
        remainings = onebatch[4:]

    ## crop the ground truth and weight matrices
    for x2d0 in remainings:
        if len(x2d0.shape) == 3:
            input.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3]])
        else:
            input.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :])

    ## add bounding box to the input list
    input.append(bounds)

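    ## when training by reference loss, append a flag telling the model which
    ## state this minibatch is for: -1 for the reference state, +1 for real data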
    if config.TrainByRefLoss(modelSpecs):
        if forRefState:
            input.append(np.int32(-1))
        else:
            input.append(np.int32(1))

    train_loss, train_errors, param_L2 = train(*input)

    return train_loss, train_errors, param_L2
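

## A minimal sketch of how TrainByOneBatch might be driven over one epoch.
## `trainBatches` (a list of batches of protein locations) and `trainFunc`
## (a compiled theano train function) are illustrative assumptions, not
## names defined in this module.
def _SketchTrainOneEpoch(trainBatches, trainFunc, modelSpecs):
    losses = []
    for batch in trainBatches:
        loss, errors, paramL2 = TrainByOneBatch(batch, trainFunc, modelSpecs)
        losses.append(loss)
    return np.average(losses, axis=0)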
def PrepareInput4Prediction(data, modelSpecs, floatType=np.float32, UseSharedMemory=False, forRefState=False):
	if not data:
		print 'ERROR: the input data for PrepareInput4Prediction is empty'
		exit(1)

	onebatch, _= DataProcessor.AssembleOneBatch(data, modelSpecs, forRefState=forRefState, floatType=floatType, bUseSharedMemory=UseSharedMemory)
	maxSeqLen = max([ d['seqLen'] for d in data ])
	box = np.array([0, 0, maxSeqLen, maxSeqLen]).astype(np.int32)
	onebatch.append( box )

	return onebatch
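
## A minimal usage sketch; `predict` is assumed to be a compiled theano
## predictor whose input list matches the assembled batch (the last entry
## being the bounding box):
def _SketchPredict(data, modelSpecs, predict):
	onebatch = PrepareInput4Prediction(data, modelSpecs)
	return predict(*onebatch)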
def PrepareInput4Validate(data, modelSpecs, floatType=np.float32, forRefState=False, UseSharedMemory=False):
	if not data:
		print 'ERROR: the input data for PrepareInput4Validate is empty'
		exit(1)

	if UseSharedMemory:
		## when shared memory is used, there is no explicit limit on the size of an ndarray
		maxAllowedLen = np.iinfo(np.int32).max
	else:
		## when the real content of a large matrix is passed through a Queue, its size must be < 2GB
		maxAllowedLen = 800

	maxSeqLen = max([ d['seqLen'] for d in data ])

	if maxSeqLen > maxAllowedLen and len(data) > 1:
		print 'ERROR: a validation protein with length > ', maxAllowedLen, ' shall form a minibatch by itself'
		exit(1)

	## determine the bounding box for each protein
	if maxSeqLen <= maxAllowedLen:
		bounds = None
	else:
		bounds = []
		for d in data:
			seqLen = d['seqLen']
			if seqLen > maxAllowedLen:
				## cut out a submatrix along the diagonal so that the top accuracy function in our deep model works correctly
				top = 0
				bottom = maxAllowedLen
				#left = seqLen - maxAllowedLen
				left = 0
				#right = seqLen
				right = maxAllowedLen
				box = [top, left, bottom, right]
				bounds.append(box)
			else:
				bounds.append(None)

	onebatch, _= DataProcessor.AssembleOneBatch(data, modelSpecs, forRefState=forRefState, bounds=bounds, floatType=floatType, bUseSharedMemory=UseSharedMemory)
	if maxSeqLen <= maxAllowedLen:
		box = np.array([0, 0, maxSeqLen, maxSeqLen]).astype(np.int32)
	else:
		## in this case, len(bounds)==1 and len(data) == 1
		assert len(bounds)==1
		assert bounds[0] is not None
		box = np.array(bounds[0]).astype(np.int32)

	onebatch.append( box )

	return onebatch
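
## note: a validation protein longer than maxAllowedLen must form a minibatch
## by itself; passing UseSharedMemory=True lifts the 800-residue cap since the
## 2GB Queue limit no longer applies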
def PrepareInput4Train(data, modelSpecs, floatType=np.float32, forRefState=False, UseSharedMemory=False):
	if not data:
		print 'ERROR: the input data for PrepareInput4Train is empty'
		exit(1)

	allowedLen = int(math.floor(math.sqrt(modelSpecs['maxbatchSize'])))
	bounds = []
	for d in data:
		if d['seqLen'] <= allowedLen:
			bounds.append(None)
			continue
		box = SampleBoundingBox((d['seqLen'], d['seqLen']), modelSpecs['maxbatchSize'])
		bounds.append(box)
	
	onebatch, _= DataProcessor.AssembleOneBatch(data, modelSpecs, forRefState=forRefState, bounds=bounds, floatType=floatType, bUseSharedMemory=UseSharedMemory)

	## determine the bounding box to be appended to the minibatch
	maxSeqLen = max([ d['seqLen'] for d in data ])

	if maxSeqLen > allowedLen and len(data) > 1:
		print 'ERROR: one minibatch has more than one large protein: ', [ d['name'] for d in data ]
		exit(1)
		
	if maxSeqLen <= allowedLen:
		box = np.array([0, 0, maxSeqLen, maxSeqLen]).astype(np.int32)
	else:
		## in this case, len(data) == 1 and len(bounds) == 1
		assert bounds[0] is not None
		box = np.array(bounds[0]).astype(np.int32)

	onebatch.append(box)

	if config.TrainByRefLoss(modelSpecs):
		if forRefState:
			onebatch.append(np.int32(-1))
		else:
			onebatch.append(np.int32(1))

	return onebatch
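
## A minimal sketch of one training step built from PrepareInput4Train;
## `trainFunc` (a compiled theano train function) and `minibatch` are
## illustrative assumptions:
def _SketchTrainStep(minibatch, trainFunc, modelSpecs, forRefState=False):
	input = PrepareInput4Train(minibatch, modelSpecs, forRefState=forRefState)
	return trainFunc(*input)
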
def PredictMatrixLabels(models,
                        predictors,
                        names,
                        inputFolders,
                        aliFolders=None,
                        tplFolder=None,
                        aliFile=None,
                        tplFile=None,
                        saveFolder=None):

    if not isinstance(names, (list, tuple)):
        targetName = names
    else:
        targetName = None

    ## allresults is a nested dictionary: allresults[proteinName][response] = sum of predicted prob matrices
    ## we predict one prob matrix with each model for each protein and response, then average them per protein and response to get the final result
    ## two different models may share common responses

    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict()  ## count the number of models that may predict each response

    for model, predictor in zip(models, predictors):
        #predict, inputVariables = BuildPredictor(model)
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders,
                                               aliFolders, tplFolder)

        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName,
                                                inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName,
                                                  inputFolders, aliFolders,
                                                  tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(
            rawData,
            modelSpecs=model,
            forTrainValidation=False,
            returnMode='list')

        ##make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if name not in allresults:
                allresults[name] = dict()
                numModels[name] = dict()

            if name not in allsequences:
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData,
                                                      numDataPoints=624,
                                                      modelSpecs=model)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        ##for onebatch, names4onebatch in zip(predSeqData, names):
        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
                minibatch, model)
            input = onebatch[:len(inputVariables)]
            result = predict(*input)
            ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
            assert result.shape[3] == sum([
                GetResponseProbDims(response)
                for response in model['responses']
            ])

            ## calculate the start and end positions of each response in the last dimension of result
            dims = [
                GetResponseProbDims(response)
                for response in model['responses']
            ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            x1d, x2d, x1dmask, x2dmask = input[0:4]
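            ## recover the true sequence lengths from the 1D mask: the mask spans
            ## the first (maxSeqLen - minSeqLen) positions and marks real residues
            ## with 1, so seqLen = maxSeqLen - maskWidth + sum(mask)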
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'], startPositions,
                                            endPositions):

                ## batchres is a batch of result, its ndim=4
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end]
                ## remove padded positions; padding sits at the start of each sequence, so keep only the last seqLen rows and columns
                revised_batchres = [
                    probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                    for probMatrix, seqLen in zip(batchres, seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if response not in allresults[name]:
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result: the average of the prob matrices predicted by all models for the same protein and response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if name not in finalresults:
            finalresults[name] = dict()

        ## each finalresults[name][response] is a 3D matrix: seqLen x seqLen x numProbParams
        for response in results.keys():
            finalresults[name][response] = (allresults[name][response] /
                                            numModels[name][response]).astype(
                                                np.float32)

            ## make the predicted distance prob matrices symmetric for some responses; this also slightly improves accuracy
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                finalresults[name][response] = (
                    finalresults[name][response] +
                    np.transpose(finalresults[name][response], (1, 0, 2))) / 2.

    ## convert predicted distance probability matrix into contact matrix
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(
        models)

    ## write all the results
    ## for each protein, we write one output file saving the tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.iteritems():

        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            ## fall back to the full name when no element of names is its prefix
            originalName = name
            for n in names:
                if name.startswith(n):
                    originalName = n
                    break

        with open(savefilename, 'wb') as fh:
            #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump((originalName, allsequences[name], results,
                          predictedContactMatrices[name], finalLabelWeights,
                          finalLabelDistributions),
                         fh,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)
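

## A minimal usage sketch (all names here are illustrative assumptions):
## models and predictors are loaded/compiled elsewhere, then one target is
## predicted and its results are written to saveFolder:
##	contactMatrices, sequences = PredictMatrixLabels(models, predictors,
##		'T0999', inputFolders, saveFolder='predictions/')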
    """