コード例 #1
0
def ValidateAllData(validData, validate, modelSpecs, forRefState=False):
    """Run `validate` on every minibatch and return weighted-average metrics.

    Args:
        validData: iterable of minibatches. Each one is converted with
            ToFloatX(ToNonSharedArray(batch)) and passed, unpacked, to
            `validate`.
        validate: callable returning a sequence (loss, err[, acc]).
        modelSpecs: model specification; `modelSpecs['responses']` is read
            when sample weights are enabled.
        forRefState: accepted for interface compatibility; not used here.

    Returns:
        (avgLoss, avgErr, avgAcc) when `validate` reported accuracies,
        otherwise (avgLoss, avgErr).
    """
    ## evaluate the (loop-invariant) configuration flag only once
    useSampleWeight = config.UseSampleWeight(modelSpecs)

    accs = []
    losses = []
    errs = []
    numSamples = []

    ## None makes np.average fall back to an unweighted mean
    w4losses = [] if useSampleWeight else None
    w4errors = [] if useSampleWeight else None

    for batch in validData:
        ## named batchInput to avoid shadowing the builtin input()
        batchInput = ToFloatX(ToNonSharedArray(batch))
        ## everything except the last element of the converted batch
        onebatch = batchInput[:-1]

        onebatch_res = validate(*batchInput)
        los = onebatch_res[0]
        err = onebatch_res[1]
        losses.append(los)
        errs.append(err)

        if len(onebatch_res) > 2:
            accs.append(onebatch_res[2])
            ## numSamples is the number of proteins in one batch
            numSamples.append(onebatch[0].shape[0])

        if useSampleWeight:
            responses = modelSpecs['responses']
            ## the trailing len(responses) entries are taken as the weight matrices
            weights = onebatch[-len(responses):]
            w4loss = []
            w4error = []
            for res, w in zip(responses, weights):
                wSum = np.sum(w)
                w4loss.append(wSum)
                ## one weight entry per value dimension of the response
                w4error.extend([wSum] * GetResponseValueDims(res))
            w4losses.append(w4loss)
            w4errors.append(w4error)

    ## The loss and err is normalized by the weight of each minibatch. This is equivalent to minimize loss and err per residue pair
    ## The top accuracy is not normalized by the weight of a minibatch, i.e.,  we want to maximize per-protein accuracy.
    avgLoss = np.average(losses, axis=0, weights=w4losses)
    avgErr = np.average(errs, axis=0, weights=w4errors)

    if accs and numSamples:
        return avgLoss, avgErr, np.average(accs, axis=0, weights=numSamples)
    return avgLoss, avgErr
コード例 #2
0
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	"""Endlessly sample training proteins, load them and put batches on sharedQ.

	This loop has no exit condition; it runs until the process is terminated
	or a call raises.

	Args:
		sharedQ: queue-like object; each prepared batch is passed to sharedQ.put().
		trainMetaData: passed to DataProcessor.SampleProteinInfo().
		modelSpecs: model specification; 'minibatchSize' is read here.
		assembleData: when true, each batch goes through PrepareInput4Train()
			(with floatType=np.float16) before being queued; otherwise the
			list of loaded records is queued as is.
		UseSharedMemory: forwarded to PrepareInput4Train().
	"""
	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
	## but it can only have one set of label matrices, so it is worth to save all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	## loop-invariant configuration flag
	useSampleWeight = config.UseSampleWeight(modelSpecs)

	while True:
		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)

		for batch in trainSeqData:
			data = []
			for protein in batch:
				name = protein['name']
				if name in labelPool:
					## label is already in the pool, so skip loading it again
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert 'atomLabelMatrix' in d
					labelPool[name] = d['atomLabelMatrix']

				if useSampleWeight:
					## the weight matrix is computed once per protein name and then reused
					if name not in labelWeightPool:
						labelWeightPool[name] = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
					d['labelWeightMatrix'] = labelWeightPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
			sharedQ.put(data)
コード例 #3
0
def AddLabel2OneBatch(names,
                      batch,
                      modelSpecs,
                      sharedLabelPool,
                      sharedLabelWeightPool,
                      floatType=theano.config.floatX):
    """Insert label tensors (and label weights) into an assembled batch.

    Args:
        names: protein names in the batch; each must be a key of both pools.
        batch: list whose last element is the bounding box, or, when
            config.TrainByRefLoss(modelSpecs) is true, whose last two elements
            are (bounding box, extra trailing element).
        modelSpecs: model specification; 'responses' is read here.
        sharedLabelPool: mapping name -> per-response label matrices.
        sharedLabelWeightPool: mapping name -> per-response weight matrices;
            `.shape[0]` of each entry is used as the sequence length.
        floatType: dtype for non-discrete labels and for weights.

    Returns:
        A new list: the batch without its tail, followed by one label tensor
        per response, then one weight tensor per response (only when sample
        weights are enabled), then the original tail.

    The process exits with status 1 when a name is missing from either pool,
    or when a batch whose box is smaller than its largest protein holds more
    than one protein.
    """
    numSeqs = len(names)
    for name in names:
        if name not in sharedLabelPool or name not in sharedLabelWeightPool:
            print('%s %s' % ('the label or label weight matrix does not exist for protein ', name))
            exit(1)

    seqLens = [sharedLabelWeightPool[name].shape[0] for name in names]

    ## the input batch ends with the bounding box, plus one more element when training by reference loss
    tail = 2 if config.TrainByRefLoss(modelSpecs) else 1

    ## get the boundingbox for this batch; it is the first element of the tail
    box = batch[-tail]

    top, left, bottom, right = box
    assert bottom - top == right - left
    boxsize = bottom - top

    if boxsize < max(seqLens) and numSeqs > 1:
        ## make sure that there is only one protein in this batch
        print('ERROR: when one batch has a large protein, it can only have one protein')
        exit(1)

    ## we crop pairwise labels at this step to save memory and computational time
    maxMatrixSize = min(boxsize, max(seqLens))

    responses = modelSpecs['responses']

    ## Y shall be a list of 3D or 4D tensors, each for one response
    Y = []
    for response in responses:
        _, labelType, _ = ParseResponse(response)
        dataType = np.int16 if config.IsDiscreteLabel(labelType) else floatType
        rValDims = GetResponseValueDims(response)
        shape = (numSeqs, maxMatrixSize, maxMatrixSize)
        if rValDims != 1:
            ## vector-valued responses get a trailing value axis
            shape = shape + (rValDims, )
        Y.append(np.zeros(shape=shape, dtype=dataType))

    ## when Y is empty, weight is useless. So When Y is empty, weight shall also be empty
    weightMatrix = []
    if Y and config.UseSampleWeight(modelSpecs):
        ## allocate one array per response: `[arr] * n` would repeat a single
        ## array object, so every response would share (and overwrite) the same weights
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                     dtype=floatType) for _ in responses
        ]

    for j, (name, seqLen) in enumerate(zip(names, seqLens)):

        ## we align all matrices in the bottom/right corner
        ## posInX and posInY are the starting position of one protein in the final output tensor
        ## here X and Y refer to x-axis and y-axis
        posInX = -min(boxsize, seqLen)
        posInY = posInX

        ## proteins larger than the box are cropped to the box
        needCrop = boxsize < seqLen

        labelMatrix = sharedLabelPool[name]
        for y, response in zip(Y, responses):
            tmp = labelMatrix[response]
            if needCrop:
                tmp = tmp[top:bottom, left:right]
            ## the same indexing covers both 3D and 4D label tensors
            y[j, posInX:, posInY:] = tmp

        labelWeightMatrix = sharedLabelWeightPool[name]
        for w, response in zip(weightMatrix, responses):
            tmp = labelWeightMatrix[response]
            if needCrop:
                tmp = tmp[top:bottom, left:right]
            w[j, posInX:, posInY:] = tmp

    newbatch = batch[:-tail]
    newbatch.extend(Y)
    newbatch.extend(weightMatrix)
    newbatch.extend(batch[-tail:])

    return newbatch
コード例 #4
0
def AssembleOneBatch(data,
                     modelSpecs,
                     forRefState=False,
                     bounds=None,
                     floatType=theano.config.floatX,
                     bUseSharedMemory=False):
    """Pack a list of per-protein records into padded batch tensors.

    Args:
        data: list of dict records. Keys read here: 'seqLen', 'name',
            'seqFeatures', 'matrixFeatures', 'matrixFeatures_nomean', and
            optionally 'embedFeatures', 'atomLabelMatrix', 'labelWeightMatrix'.
        modelSpecs: model specification; 'responses' is read, and the
            '*_expected' entries are read when forRefState is true.
        forRefState: when true, fill the features with the expected values
            stored in modelSpecs instead of the real features.
        bounds: optional list of (top, left, bottom, right) boxes, one per
            record, used to crop pairwise matrices; None means no cropping.
        floatType: dtype for the feature tensors, non-discrete labels and weights.
        bUseSharedMemory: when true, the pairwise feature tensor is backed by a
            SharedNDArray and that wrapper (not its array) is put in the batch.

    Returns:
        None when `data` is empty; otherwise (onebatch, names), where onebatch
        is [X1d, X2d, M1d, M2d], then X1dem if embedding features exist, then
        one label tensor per response (if labels exist), then one weight tensor
        per response (if sample weights are enabled).

    Every protein is aligned to the end of each padded axis (negative start offsets).
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    names = [d['name'] for d in data]
    responses = modelSpecs['responses']

    ## use maxSeqLen and minSeqLen for sequential features
    ## we do not crop sequential features at this step since the theano deep model will do so after 1D convolution operation
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)

    numSeqFeatures = FeatureUtils.DetermineNumSeqFeatures(
        data[0]['seqFeatures'])
    X1d = np.zeros(shape=(numSeqs, maxSeqLen, numSeqFeatures), dtype=floatType)

    numMatrixFeatures = FeatureUtils.DetermineNumMatrixFeatures(
        data[0]['matrixFeatures']) + FeatureUtils.DetermineNumMatrixFeatures(
            data[0]['matrixFeatures_nomean'])
    ## we use maxMatrixSize and minMatrixSize for pairwise features
    ## we crop pairwise features at this step to save memory and computational time
    minMatrixSize, maxMatrixSize = CalcMinMaxMatrixSize(bounds, seqLens)

    if bUseSharedMemory:
        shmX2d = SharedNDArray(
            (numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures),
            dtype=floatType,
            name='/RaptorX-' + str(os.getppid()) + '-X2d-' + randomString(6))
        X2d = shmX2d.array
        X2d[:] = 0
    else:
        X2d = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize,
                              numMatrixFeatures),
                       dtype=floatType)

    X1dem = None
    if 'embedFeatures' in data[0]:
        numEmbedFeatures = data[0]['embedFeatures'].shape[1]
        X1dem = np.zeros(shape=(numSeqs, maxSeqLen, numEmbedFeatures),
                         dtype=floatType)

    ## Y shall be a list of 3D or 4D tensors, each for one response
    Y = []
    if 'atomLabelMatrix' in data[0]:
        for response in responses:
            _, labelType, _ = ParseResponse(response)
            dataType = np.int16
            if not config.IsDiscreteLabel(labelType):
                dataType = floatType
            rValDims = GetResponseValueDims(response)
            shape = (numSeqs, maxMatrixSize, maxMatrixSize)
            if rValDims != 1:
                ## vector-valued responses get a trailing value axis
                shape = shape + (rValDims, )
            Y.append(np.zeros(shape=shape, dtype=dataType))

    ## when Y is empty, weight is useless. So when Y is empty, weight shall also be empty
    weightMatrix = []
    if Y and config.UseSampleWeight(modelSpecs):
        ## allocate one array per response: `[arr] * n` would repeat a single
        ## array object, so every response would share (and overwrite) the same weights
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                     dtype=floatType) for _ in responses
        ]

    ## for mask. we do not used shared ndarray for them since they are small
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)
    M2d = np.zeros(shape=(numSeqs, maxMatrixSize - minMatrixSize,
                          maxMatrixSize),
                   dtype=np.int8)

    if bounds is not None:
        boxes = bounds
    else:
        boxes = [None] * len(data)

    for j, (d, box) in enumerate(zip(data, boxes)):
        seqLen = d['seqLen']

        ## posInSeq, posInX and posInY are the starting position of one protein in the final output tensor
        posInSeq = -seqLen

        ## here X and Y refer to x-axis and y-axis
        if box is not None:
            top, left, bottom, right = box
            posInX = -(bottom - top)
            posInY = -(right - left)
        else:
            posInX = -seqLen
            posInY = -seqLen

        if forRefState:
            ## this code needs reexamination, it may not be correct when d['seqFeatures']/d['matrixFeatures'] is represented as a list of arrays instead of a single array
            X1d[j, posInSeq:, :] = np.array(
                [modelSpecs['seqFeatures_expected']] * seqLen).reshape(
                    (seqLen, -1))

            tmp = [modelSpecs['matrixFeatures_expected']] * (seqLen * seqLen)
            tmp2 = np.array(tmp).reshape((seqLen, seqLen, -1))
            tmp3 = np.concatenate((tmp2, d['matrixFeatures_nomean']), axis=2)
            if box is not None:
                X2d[j, posInX:, posInY:, :] = tmp3[top:bottom, left:right, ]
            else:
                X2d[j, posInX:, posInY:, :] = tmp3
        else:
            if isinstance(d['seqFeatures'], np.ndarray):
                X1d[j, posInSeq:, :] = d['seqFeatures']
            else:
                ## a list of 1D/2D arrays is laid out side by side along the feature axis
                startPos = 0
                for f in d['seqFeatures']:
                    if len(f.shape) == 1:
                        X1d[j, posInSeq:,
                            startPos:startPos + 1] = f[:, np.newaxis]
                        startPos += 1
                    elif len(f.shape) == 2:
                        X1d[j, posInSeq:, startPos:startPos + f.shape[1]] = f
                        startPos = startPos + f.shape[1]
                    else:
                        print('%s %s' % ('wrong shape in sequential feature: ', f.shape))
                        exit(1)

            ## matrixFeatures fill the leading channels, matrixFeatures_nomean the following ones
            holder = X2d[j, posInX:, posInY:, :]
            end = _Add2DFeatures(d['matrixFeatures'], holder, 0, box)
            _Add2DFeatures(d['matrixFeatures_nomean'], holder, end, box)

        M1d[j, posInSeq:].fill(1)
        M2d[j, posInX:, posInY:].fill(1)

        if X1dem is not None:
            ## embed feature is always represented as a single array, so the code shall be correct
            if forRefState:
                X1dem[j, posInSeq:, :] = np.array(
                    [modelSpecs['embedFeatures_expected']] * seqLen).reshape(
                        (seqLen, -1))
            else:
                X1dem[j, posInSeq:, :] = d['embedFeatures']

        for y, response in zip(Y, responses):
            tmp = d['atomLabelMatrix'][response]
            if box is not None:
                tmp = tmp[top:bottom, left:right]
            ## the same indexing covers both 3D and 4D label tensors
            y[j, posInX:, posInY:] = tmp

        if weightMatrix:
            ## reuse a precomputed weight matrix when the record carries one
            if 'labelWeightMatrix' in d:
                labelWeightMatrix = d['labelWeightMatrix']
            else:
                labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(
                    d['atomLabelMatrix'], modelSpecs, floatType=floatType)

            for w, response in zip(weightMatrix, responses):
                tmp = labelWeightMatrix[response]
                if box is not None:
                    tmp = tmp[top:bottom, left:right]
                w[j, posInX:, posInY:] = tmp

    if bUseSharedMemory:
        onebatch = [X1d, shmX2d, M1d, M2d]
    else:
        onebatch = [X1d, X2d, M1d, M2d]

    if X1dem is not None:
        onebatch.append(X1dem)

    onebatch.extend(Y)
    onebatch.extend(weightMatrix)

    return onebatch, names


def _Add2DFeatures(matrixFeatures, holder, start, box=None):
    """Copy pairwise features into `holder` along its last axis.

    matrixFeatures is a single ndarray or an iterable of 2D/3D arrays; holder
    is a 3D array; start is the first position to fill in the 3rd dimension.
    When box is given as (top, left, bottom, right), each feature is cropped
    to it first. Returns the position just past the last one written, which is
    `start` when there are no features. Exits with status 1 on an array that
    is neither 2D nor 3D.
    """
    if isinstance(matrixFeatures, np.ndarray):
        features = [matrixFeatures]
    else:
        features = matrixFeatures

    if box is None:
        rows = cols = slice(None)
    else:
        top, left, bottom, right = box
        rows = slice(top, bottom)
        cols = slice(left, right)

    endPos = start
    for f in features:
        startPos = endPos
        if len(f.shape) == 2:
            endPos = startPos + 1
            holder[:, :, startPos:endPos] = f[rows, cols, np.newaxis]
        elif len(f.shape) == 3:
            endPos = startPos + f.shape[2]
            holder[:, :, startPos:endPos] = f[rows, cols, :]
        else:
            print('%s %s' % ('wrong shape in matrixFeatures: ', f.shape))
            exit(1)

    return endPos
コード例 #5
0
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic input variables and the ResNet4DistMatrix predictor.

    Returns, for training,
        (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList,
         box, trainByRefLoss)
    and, for prediction,
        (distancePredictor, x, y, xmask, ymask, xem).
    xem is None unless config.EmbeddingUsed(modelSpecs); trainByRefLoss is None
    unless config.TrainByRefLoss(modelSpecs); weightList is empty unless
    config.UseSampleWeight(modelSpecs).
    """
    rng = np.random.RandomState()

    ## x holds the sequential features, y the matrix (pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for cropping a big protein distance matrix; only defined for training
    box = T.ivector('boundingbox') if forTrain else None

    ## when this variable exists, the model is trained with both the reference loss and the loss of real data
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')
    else:
        trainByRefLoss = None

    distancePredictor = ResNet4DistMatrix(rng,
                                          seqInput=x,
                                          matrixInput=y,
                                          mask_seq=xmask,
                                          mask_matrix=ymask,
                                          embedInput=xem,
                                          boundingbox=box,
                                          modelSpecs=modelSpecs)

    ## for prediction there are neither label nor weight variables
    if not forTrain:
        return distancePredictor, x, y, xmask, ymask, xem

    ## one label tensor per response: 4-d when the response is vector-valued, 3-d otherwise
    labelList = []
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        isVector = GetResponseValueDims(response) > 1

        if labelType.startswith('Discrete'):
            ## wtensor is for 16bit integer
            makeTensor = T.wtensor4 if isVector else T.wtensor3
        else:
            makeTensor = T.tensor4 if isVector else T.tensor3
        labelList.append(makeTensor('Tlabel4' + response))

    ## one 3-d weight tensor per response, only when sample weights are in use
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        for response in modelSpecs['responses']:
            weightList.append(T.tensor3('Tweight4' + response))

    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss
コード例 #6
0
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	"""Load training batches onto sharedQ until asked to stop.

	The loop ends when stopTrainDataLoader is set or when os.getppid() == 1
	(presumably meaning the parent process has died -- confirm for the target
	platform). The condition is checked before every epoch and before every
	batch. On exit, a message is printed and sharedQ.close() is called.

	Args:
		sharedQ: queue-like object providing put() and close().
		stopTrainDataLoader: event-like object providing is_set().
		trainMetaData: passed to DataProcessor.SampleProteinInfo().
		modelSpecs: model specification; 'minibatchSize' is read here.
		assembleData: when true, each batch goes through PrepareInput4Train()
			before being queued; otherwise the list of loaded records is queued.
		UseSharedMemory: forwarded to PrepareInput4Train().
	"""
	def shouldStop():
		return stopTrainDataLoader.is_set() or os.getppid() == 1

	_, _, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
	if bUseCCMraw or bUseFullMI or bUseFullCov:
		## when full coevolution matrices are used, we shall use float16 to save memory
		floatType = np.float16
	else:
		floatType = theano.config.floatX

	## loop-invariant configuration flag
	useSampleWeight = config.UseSampleWeight(modelSpecs)

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
	## but it can only have one set of label matrices, so it is worth to save all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	while not shouldStop():
		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)

		for batch in trainSeqData:
			## leave the epoch early; the outer loop re-checks the condition and exits
			if shouldStop():
				break

			data = []
			for protein in batch:
				name = protein['name']
				if name in labelPool:
					## label is already in the pool, so skip loading it again
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert 'atomLabelMatrix' in d
					labelPool[name] = d['atomLabelMatrix']

				if useSampleWeight:
					## the weight matrix is computed once per protein name and then reused
					if name not in labelWeightPool:
						labelWeightPool[name] = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
					d['labelWeightMatrix'] = labelWeightPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
			sharedQ.put(data)

	print('TrainDataLoader has finished loading data')
	sharedQ.close()