def TrainDataLoader3(sharedQ, sharedLabelPool, sharedLabelWeightPool, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	#print 'trainDataLoader has event: ', stopTrainDataLoader

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
	## but it can only have one set of label matrices, so it is worth saving all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	## load the labels of all training proteins
	trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
	for loc in trainDataLocation:
		d = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False, returnMode='list')
		name = d['name']
		labelPool[name] = d['atomLabelMatrix']
		labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
		labelWeightPool[name] = labelWeightMatrix

	print 'TrainDataLoader with #PID ', os.getpid(), ' has loaded ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices'
	## copy labelPool and labelWeightPool into the shared dicts
	sharedLabelPool.update(labelPool)
	sharedLabelWeightPool.update(labelWeightPool)
	print 'TrainDataLoader with #PID ', os.getpid(), ' has updated the shared labelPool and labelWeightPool'

	while True:
		if stopTrainDataLoader.is_set() or os.getppid()==1:
			print 'trainDataLoader receives the stop signal'
			break

		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		"""
		maxLen = 900
		trainDataLocation, numExcluded = DataProcessor.FilterByLength(trainDataLocation, maxLen)
		print 'Exclude ', numExcluded, ' train proteins longer than ', maxLen, ' AAs'
		"""
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)
		for batch in trainSeqData:
			if stopTrainDataLoader.is_set() or os.getppid()==1:
				print 'trainDataLoader receives the stop signal'
				break

			names = [ p['name'] for p in batch ]
			data = []
			for protein in batch:
				d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put( (data, names) )

	print 'TrainDataLoader has finished loading data'
	sharedQ.close()
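
## A minimal consumer sketch, not part of the original code: it shows how a training
## process might read the (data, names) tuples that TrainDataLoader3 puts on sharedQ
## and look up the cached label matrices in the shared pools. The helper name
## FetchOneTrainBatch is hypothetical; only Queue.get(), the shared dicts and the
## tuple layout used above come from the code itself.
def FetchOneTrainBatch(sharedQ, sharedLabelPool, sharedLabelWeightPool):
	## block until one of the data loaders has produced a minibatch
	data, names = sharedQ.get()
	labels = [ sharedLabelPool[name] for name in names ]
	labelWeights = [ sharedLabelWeightPool[name] for name in names ]
	return data, labels, labelWeights
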
def CalcFeatureExpectBySampling(metaData, modelSpecs):
    seqfeatures = []
    seqweights = []

    matrixfeatures = []
    matrixweights = []

    embedfeatures = []
    embedweights = []

    dataLocation = DataProcessor.SampleProteinInfo(metaData)
    for loc in dataLocation:
        d = DataProcessor.LoadRealData(loc, modelSpecs, loadLabel=False)
        res = CalcFeatureExpect4OneProtein(d)
        seqfeature, seqweight, matrixfeature, matrixweight = res[:4]
        seqfeatures.append(seqfeature)
        matrixfeatures.append(matrixfeature)
        seqweights.append(seqweight)
        matrixweights.append(matrixweight)

        if len(res) == 6:
            embedfeature, embedweight = res[4:]
            embedfeatures.append(embedfeature)
            embedweights.append(embedweight)

    modelSpecs['seqFeatures_expected'] = np.average(seqfeatures,
                                                    axis=0,
                                                    weights=seqweights)
    modelSpecs['matrixFeatures_expected'] = np.average(matrixfeatures,
                                                       axis=0,
                                                       weights=matrixweights)
    ## some proteins may not have embedding features; guard against an empty list
    if len(embedfeatures) > 0:
        modelSpecs['embedFeatures_expected'] = np.average(embedfeatures,
                                                          axis=0,
                                                          weights=embedweights)
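
## A small usage sketch (assumed, not from the original code): after
## CalcFeatureExpectBySampling has run, the expected feature values live in modelSpecs
## and can be inspected, e.g. when debugging feature normalization. Only the three
## modelSpecs keys set above are taken from the code; this helper is illustrative.
def PrintFeatureExpectations(modelSpecs):
    for key in ['seqFeatures_expected', 'matrixFeatures_expected', 'embedFeatures_expected']:
        if modelSpecs.has_key(key):
            print key, 'has shape', np.asarray(modelSpecs[key]).shape
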
def DetermineFeatureDimensionBySampling(metaData, modelSpecs):
	protein = DataProcessor.SampleProteinInfo(metaData, numSamples=1)[0]
	d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')

	## obtain the dimension of each type of input feature
	modelSpecs['n_in_seq'] = DetermineNumSeqFeatures(d['seqFeatures'])
	modelSpecs['n_in_matrix'] = DetermineNumMatrixFeatures(d['matrixFeatures']) + DetermineNumMatrixFeatures(d['matrixFeatures_nomean'])

	if d.has_key('embedFeatures'):
		modelSpecs['n_in_embed'] = d['embedFeatures'].shape[1]
def CalcLabelDistributionNWeightBySampling(trainMetaData, modelSpecs):
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData,
                                                        numSamples=10000)

    ## load only the ground truth (not the input features) to save memory and speed up loading
    labelData = []
    for loc in trainDataLocation:
        p = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False)
        labelData.append(p)

    CalcLabelDistributionAndWeight(labelData, modelSpecs)
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
	## but it can only have one set of label matrices, so it is worth saving all label matrices in RAM.
	labelPool = dict()
	labelMatrixPool = dict()

	while True:
		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)
		for batch in trainSeqData:
			data = []
			for protein in batch:
				name = protein['name']
				if labelPool.has_key(name):
					## label is already in the pool
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert d.has_key('atomLabelMatrix')
					labelPool[name] = d['atomLabelMatrix']

				if config.UseSampleWeight(modelSpecs):
					if not labelMatrixPool.has_key(name): 
						labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
						labelMatrixPool[name] = labelWeightMatrix
						d['labelWeightMatrix'] = labelWeightMatrix
					else:
						d['labelWeightMatrix'] = labelMatrixPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put(data)
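
## A minimal launch sketch, assuming multiprocessing and os are already imported by this
## module (both are used elsewhere in the code). It mirrors the commented-out
## TrainUtils.TrainDataLoader call in main() below; the queue size of 3 and the helper
## name StartTrainDataLoader are illustrative choices, not part of the original code.
def StartTrainDataLoader(trainMetaData, modelSpecs):
	sharedQ = multiprocessing.Queue(3)
	loader = multiprocessing.Process(name='TrainDataLoader for ' + str(os.getpid()),
		target=TrainDataLoader, args=(sharedQ, trainMetaData, modelSpecs, True, False))
	loader.daemon = True
	loader.start()
	return sharedQ, loader
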
def main(argv):

    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    trainMetaData = DataProcessor.LoadMetaData(modelSpecs['trainFile'])
    FeatureUtils.DetermineFeatureDimensionBySampling(trainMetaData, modelSpecs)
    ## calculate label distribution and weight at the very beginning
    print 'Calculating label distribution...'
    LabelUtils.CalcLabelDistributionNWeightBySampling(trainMetaData,
                                                      modelSpecs)

    if config.TrainByRefLoss(modelSpecs) or config.UseRefState(modelSpecs):
        print 'Calculating feature expectation by sampling...'
        FeatureUtils.CalcFeatureExpectBySampling(trainMetaData, modelSpecs)

## trainMetaData is a list of groups. Each group contains a set of related proteins (seq-template alignments) and files for their features
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
    trainSeqData = DataProcessor.SplitData2Batches(
        trainDataLocation,
        numDataPoints=modelSpecs['minibatchSize'],
        modelSpecs=modelSpecs)
    print 'approximate #batches for train data: ', len(trainSeqData)

    #global trainSharedQ, stopTrainDataLoader, trainDataLoaders, trainSharedLabelPool, trainSharedLabelWeightPool
    global trainSharedQ, stopTrainDataLoader, trainDataLoaders
    trainSharedQ = multiprocessing.Queue(config.QSize(modelSpecs))
    stopTrainDataLoader = multiprocessing.Event()
    #trainSharedLabelPool = multiprocessing.Manager().dict()
    #trainSharedLabelWeightPool = multiprocessing.Manager().dict()
    #print stopTrainDataLoader

    numTrainDataLoaders = config.NumTrainDataLoaders(modelSpecs)
    metaDatas = DataProcessor.SplitMetaData(trainMetaData, numTrainDataLoaders)

    trainDataLoaders = []
    for i, metaData in zip(xrange(numTrainDataLoaders), metaDatas):
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader, args=(trainSharedQ, metaData, modelSpecs, True, True))
        trainDataLoader = multiprocessing.Process(
            name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()),
            target=TrainUtils.TrainDataLoader2,
            args=(trainSharedQ, stopTrainDataLoader, metaData, modelSpecs,
                  True, True))
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader3, args=(trainSharedQ, trainSharedLabelPool, trainSharedLabelWeightPool, stopTrainDataLoader, metaData, modelSpecs, True, True))
        trainDataLoader.daemon = True
        trainDataLoaders.append(trainDataLoader)

    print 'start the train data loaders...'
    for trainDataLoader in trainDataLoaders:
        trainDataLoader.start()

    validMetaData = DataProcessor.LoadMetaData(modelSpecs['validFile'])
    validDataLocation = DataProcessor.SampleProteinInfo(validMetaData)

    ## split data into batches, but do not load the real data from disk
    #validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
    validSeqData = DataProcessor.SplitData2Batches(validDataLocation,
                                                   numDataPoints=500 * 500,
                                                   modelSpecs=modelSpecs)
    print '#batches for validation data: ', len(validSeqData)

    global validSharedQ, validDataLoader, stopValidDataLoader
    validSharedQ = multiprocessing.Queue(len(validSeqData))
    stopValidDataLoader = multiprocessing.Event()
    #print stopValidDataLoader
    ## shared memory is a limited resource, so avoid using it as much as possible
    ## here we do not use shared array for validation data since we only need to load it once
    #validDataLoader = multiprocessing.Process(name='ValidDataLoader for '+str(os.getpid()), target=TrainUtils.ValidDataLoader, args=(validSharedQ, validSeqData, modelSpecs, True, False))
    validDataLoader = multiprocessing.Process(
        name='ValidDataLoader for ' + str(os.getpid()),
        target=TrainUtils.ValidDataLoader2,
        args=(validSharedQ, stopValidDataLoader, validSeqData, modelSpecs,
              True, False))
    print 'start the validation data loader...'
    validDataLoader.start()
    """
	if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
		##calculate the average weight per minibatch
		maxDeviation = DataProcessor.CalcAvgWeightPerBatch(trainSeqDataset, modelSpecs)
		print 'maxWeightDeviation=', maxDeviation
	"""

    beforeTrainTime = datetime.datetime.now()
    print 'time spent before training :', beforeTrainTime - startTime

    result = TrainModel(modelSpecs=modelSpecs,
                        trainValidData=(trainSeqData, validSeqData))

    ##merge ModelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = TrainUtils.GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    with open(modelFile, 'wb') as fh:
        cPickle.dump(resultModel, fh, cPickle.HIGHEST_PROTOCOL)

    afterTrainTime = datetime.datetime.now()
    print 'time spent on training:', afterTrainTime - beforeTrainTime

    ## clean up again
    print 'Cleaning up again...'
    Cleanup()
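
## A small sketch (an assumption, not original code) showing how the model file written
## by main() could be loaded back for later use; it relies only on the cPickle.dump call above.
def LoadTrainedModel(modelFile):
    with open(modelFile, 'rb') as fh:
        resultModel = cPickle.load(fh)
    return resultModel
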
def TrainModel(modelSpecs, trainValidData=None, predDataFile=None):
    if (not trainValidData):
        print 'Please provide train and validation data for model training'
        exit(1)

    if modelSpecs is None:
        print 'Please provide a model specification for training'
        exit(1)

    distancePredictor, variable4train, variable4validate, params, params4mean, params4var, paramL2, regularizer, topAcc, errors, labelList, weightList, trainByRefLoss = PrepareModel(
        modelSpecs)

    chkpoint, restart = InitializeChkpoint(params, modelSpecs)

    assert (len(modelSpecs['numEpochs']) > 0)
    numEpochs4stages = np.cumsum(modelSpecs['numEpochs'])
    ## train parameters not related to variance and correlation
    epoch = chkpoint['epoch']

    if epoch < numEpochs4stages[-1]:

        if weightList is not None and len(weightList) > 0:
            loss4train = distancePredictor.loss(labelList,
                                                useMeanOnly=True,
                                                weightList=weightList,
                                                trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList,
                                                   useMeanOnly=True,
                                                   weightList=weightList)
        else:
            loss4train = distancePredictor.loss(labelList,
                                                useMeanOnly=True,
                                                trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList, useMeanOnly=True)
        """
		## weightedLoss is only used for cost, i.e., gradient calculation
		if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
			weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
		else:
			weightedLoss = loss
		"""
        if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
            cost = T.sum(T.mul(loss4train,
                               modelSpecs['w4responses'])) / np.sum(
                                   modelSpecs['w4responses'])
        else:
            cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])
                         ) / np.sum(modelSpecs['w4responses']) + regularizer

        params4var_set = set(params4var)
        pgrads = [
            T.grad(cost,
                   p,
                   consider_constant=weightList,
                   disconnected_inputs='warn')
            if p not in params4var_set else T.zeros_like(p) for p in params
        ]
        pdecay = [
            p if p not in params4var_set else T.zeros_like(p) for p in params
        ]

    for stage, lr, epoch_end in zip(xrange(len(numEpochs4stages)),
                                    modelSpecs['lrs'], numEpochs4stages):
        if epoch >= epoch_end:
            continue

        print 'training for mean using a learning rate ', lr, ' ...'
        startFromBest = (stage > 0 and epoch == numEpochs4stages[stage - 1])
        epoch_start = epoch
        epoch = RunOneStage(epoch_start,
                            epoch_end,
                            trainValidData,
                            chkpoint,
                            loss4train,
                            loss4validate,
                            pgrads,
                            pdecay,
                            modelSpecs,
                            lr=lr,
                            startFromBest=(startFromBest, startFromBest))

## train parameters only specific to variance and correlation
    numEpochs4var = modelSpecs['numEpochs4var']
    lrs = modelSpecs['lrs4var']

    if len(params4var) > 0:
        assert (len(numEpochs4var) > 0)
        assert (len(lrs) > 0)

        previousEpochs4Stages = numEpochs4stages
        numEpochs4stages = np.cumsum(numEpochs4var) + numEpochs4stages[-1]

        if epoch < numEpochs4stages[-1]:
            print 'Training the parameters specific to correlation and variance ...'

            if weightList is not None and len(weightList) > 0:
                loss4train = distancePredictor.loss(
                    labelList,
                    weightList=weightList,
                    trainByRefLoss=trainByRefLoss)
                loss4validate = distancePredictor.loss(labelList,
                                                       weightList=weightList)
            else:
                loss4train = distancePredictor.loss(labelList)
                loss4validate = distancePredictor.loss(labelList)
            """
			## weightedLoss is only used for cost, i.e., gradient calculation
			if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
				weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
			else:
				weightedLoss = loss
			"""

            if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
                cost = T.sum(T.mul(loss4train,
                                   modelSpecs['w4responses'])) / np.sum(
                                       modelSpecs['w4responses'])
            else:
                cost = T.sum(T.mul(loss4train,
                                   modelSpecs['w4responses'])) / np.sum(
                                       modelSpecs['w4responses']) + regularizer

            params4var_set = set(params4var)
            pgrads = [
                T.grad(cost,
                       p,
                       consider_constant=weightList,
                       disconnected_inputs='raise')
                if p in params4var_set else T.zeros_like(p) for p in params
            ]
            pdecay = [
                p if p in params4var_set else T.zeros_like(p) for p in params
            ]

        for stage, lr, epoch_end in zip(xrange(len(lrs)), lrs,
                                        numEpochs4stages):
            if epoch >= epoch_end:
                continue

            print 'training for variance using a learning rate ', lr, ' ...'
            startFromBest = (
                (stage == 0 and epoch == previousEpochs4Stages[-1])
                or (stage > 0 and epoch == numEpochs4stages[stage - 1]))
            epoch_start = epoch
            epoch = RunOneStage(epoch_start,
                                epoch_end,
                                trainValidData,
                                chkpoint,
                                loss4train,
                                loss4validate,
                                pgrads,
                                pdecay,
                                modelSpecs,
                                lr=lr,
                                startFromBest=(startFromBest, startFromBest
                                               and (stage > 0)))

    resultModel = {}
    resultModel['dateTrained'] = datetime.datetime.now()
    #resultModel['validLoss'] = validLoss
    resultModel['validLoss'] = chkpoint['best_validation_loss']
    #resultModel['validErr'] = validErr
    if chkpoint.has_key('best_validation_err'):
        resultModel['validErr'] = chkpoint['best_validation_err']

    resultModel['trainLoss'] = chkpoint['train_loss4best_validation_loss']
    #resultModel['validAcc']= validAcc
    if chkpoint.has_key('best_validation_acc'):
        resultModel['validAcc'] = chkpoint['best_validation_acc']

    resultModel['paramValues'] = chkpoint['bestParamValues']

    bestParamL2norm = np.sum([(v**2).sum()
                              for v in chkpoint['bestParamValues']])
    resultModel['bestParamL2norm'] = bestParamL2norm

    bestParamL1norm = np.sum(
        [abs(v).sum() for v in chkpoint['bestParamValues']])
    resultModel['bestParamL1norm'] = bestParamL1norm

    print 'best param L1 norm: ', bestParamL1norm, 'L2 norm: ', bestParamL2norm

    Cleanup()

    ## test on prediction data if it is given. The prediction data should be small (to save memory) and must contain ground truth.
    if modelSpecs['predFile'] is not None:
        predMetaData = DataProcessor.LoadMetaData(modelSpecs['predFile'])
        predDataLocation = DataProcessor.SampleProteinInfo(predMetaData)
        predBatches = DataProcessor.SplitData2Batches(predDataLocation,
                                                      numDataPoints=624,
                                                      modelSpecs=modelSpecs)
        print '\nLoading prediction data...'
        print "#predData minibatches:", len(predBatches)

        predData = []
        for batch in predBatches:
            data = DataProcessor.LoadRealData(batch,
                                              modelSpecs,
                                              returnMode='list')
            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            #input = TrainUtils.PrepareInput4Prediction(data, modelSpecs, floatType=np.float16)
            input = TrainUtils.PrepareInput4Prediction(
                data, modelSpecs, floatType=theano.config.floatX)
            predData.append(input)

        if weightList is not None and len(weightList) > 0:
            loss4validate = distancePredictor.loss(labelList,
                                                   weightList=weightList)
        else:
            loss4validate = distancePredictor.loss(labelList)

        fullValidate = theano.function(variable4validate,
                                       [loss4validate, errors, topAcc],
                                       on_unused_input='warn')
        if config.UseRefState(modelSpecs):
            quickValidate = theano.function(variable4validate,
                                            [loss4validate, errors],
                                            on_unused_input='warn')

    ## set model parameters for validation and possibly prediction
        for param, value in zip(params, chkpoint['bestParamValues']):
            param.set_value(value)

        predLoss, predErr, predAcc = ValidateAllData(predData, fullValidate,
                                                     modelSpecs)
        if config.UseRefState(modelSpecs):
            refLoss, refErr = ValidateAllData(predData,
                                              quickValidate,
                                              modelSpecs,
                                              forRefState=True)
            print 'pred loss: ', predLoss, 'pred err: ', predErr, 'ref loss: ', refLoss, 'ref err: ', refErr
        else:
            print 'pred loss: ', predLoss, 'pred err: ', predErr
        resultModel['predLoss'] = predLoss
        resultModel['predErr'] = predErr

        print "predAcc: ", [str_display(pAcc[:, 0]) for pAcc in predAcc
                            ], 'for top ', modelSpecs['topRatios']
        resultModel['predAcc'] = predAcc

        del predData[:]

## training is done, remove the checkpoint file since it has been copied at the end of each stage
    if modelSpecs.has_key('checkpointFile') and (modelSpecs['checkpointFile']
                                                 is not None):
        try:
            os.remove(modelSpecs['checkpointFile'])
        except OSError:
            print 'WARNING: error in deleting the checkpoint file: ', modelSpecs[
                'checkpointFile']

## remove theano variables from modelSpecs
    keys4removal = [
        'variable4train', 'variable4validate', 'params', 'params4mean',
        'params4var', 'paramL2', 'regularizer', 'topAcc', 'errors',
        'labelList', 'weightList', 'trainByRefLoss'
    ]
    for k in keys4removal:
        if modelSpecs.has_key(k):
            del modelSpecs[k]

    return resultModel
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	#print 'trainDataLoader has event: ', stopTrainDataLoader

	bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
	if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
		## when full coevolution matrices are used, we shall use float16 to save memory
		floatType = np.float16
	else:
		floatType = theano.config.floatX

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
	## but it can only have one set of label matrices, so it is worth saving all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	while True:
		if stopTrainDataLoader.is_set() or os.getppid()==1:
			#print 'trainDataLoader receives the stop signal'
			break

		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)

		#i = 0
		for batch in trainSeqData:
			if stopTrainDataLoader.is_set() or os.getppid()==1:
				#print 'trainDataLoader receives the stop signal'
				break

			data = []
			for protein in batch:
				name = protein['name']
				if labelPool.has_key(name):
					## label is already in the pool
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert d.has_key('atomLabelMatrix')
					labelPool[name] = d['atomLabelMatrix']

				if config.UseSampleWeight(modelSpecs):
					if not labelWeightPool.has_key(name): 
						labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
						labelWeightPool[name] = labelWeightMatrix
						d['labelWeightMatrix'] = labelWeightMatrix
					else:
						d['labelWeightMatrix'] = labelWeightPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put(data)

			"""
			i += 1
			if i%100 == 0:
				print '#batches of train data loaded: ', i
			"""

		#print 'TrainDataLoader with #PID ', os.getpid(), ' currently has ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices'
	print 'TrainDataLoader has finished loading data'
	sharedQ.close()
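
## A minimal shutdown sketch (assumed, not part of the original code) for the loaders
## defined above: the parent sets the stop event, drains the queue so that no loader
## stays blocked on a full queue, and then joins the loader processes. Only the stop
## event, the shared queue and the loader list from main() are taken from the code itself.
def StopTrainDataLoaders(stopEvent, sharedQ, loaders):
	import Queue
	stopEvent.set()
	## discard any minibatches that were already queued
	while True:
		try:
			sharedQ.get_nowait()
		except Queue.Empty:
			break
	for loader in loaders:
		loader.join()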