def TrainDataLoader3(sharedQ, sharedLabelPool, sharedLabelWeightPool, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False): #print 'trainDataLoader has event: ', stopTrainDataLoader ## here we use labelPool to cache the labels of all the training proteins ## one protein may have multiple sets of input features due to MSA sampling or sequnence-template alignment ## but it can only have one set of label matrices, so it is worth to save all label matrices in RAM. labelPool = dict() labelWeightPool = dict() ## load the labels of all training proteins trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData) for loc in trainDataLocation: d = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False, returnMode='list') name = d['name'] labelPool[name] = d['atomLabelMatrix'] labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16) labelWeightPool[name] = labelWeightMatrix print 'TrainDataLoader with #PID ', os.getpid(), ' has loaded ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices' ## update labelPool and labelWeightPool to the shared dict() sharedLabelPool.update(labelPool) sharedLabelWeightPool.update(labelWeightPool) print 'TrainDataLoader with #PID ', os.getpid(), ' has update the shared labelPool and labelWeightPool' while True: if stopTrainDataLoader.is_set() or os.getppid()==1: print 'trainDataLoader receives the stop signal' break trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData) numOriginals = len(trainDataLocation) """ maxLen = 900 trainDataLocation, numExcluded = DataProcessor.FilterByLength(trainDataLocation, maxLen) print 'Exclude ', numExcluded, ' train proteins longer than ', maxLen, ' AAs' """ trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs) random.shuffle(trainSeqData) for batch in trainSeqData: 
if stopTrainDataLoader.is_set() or os.getppid()==1: print 'trainDataLoader receives the stop signal' break names = [ p['name'] for p in batch ] data = [] for protein in batch: d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list') data.append(d) FeatureUtils.CheckModelNDataConsistency(modelSpecs, data) if assembleData: data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory) #print 'putting data to trainDataLoader queue...' sharedQ.put( (data, names) ) print 'TrainDataLoader has finished loading data' sharedQ.close()
def CalcFeatureExpectBySampling(metaData, modelSpecs):
    """Store weighted feature averages over sampled proteins in modelSpecs.

    Each protein location returned by DataProcessor.SampleProteinInfo(metaData)
    is loaded without labels and passed to CalcFeatureExpect4OneProtein. The
    first four items of its result are taken as (seqfeature, seqweight,
    matrixfeature, matrixweight); when the result has exactly six items, the
    last two are taken as (embedfeature, embedweight).

    Writes into modelSpecs:
        'seqFeatures_expected', 'matrixFeatures_expected': always.
        'embedFeatures_expected': only when at least one protein supplied
            embedding features.

    Args:
        metaData: meta data handed to DataProcessor.SampleProteinInfo.
        modelSpecs: model specification; used for loading and updated in place.
    """
    seqfeatures, seqweights = [], []
    matrixfeatures, matrixweights = [], []
    embedfeatures, embedweights = [], []

    for loc in DataProcessor.SampleProteinInfo(metaData):
        d = DataProcessor.LoadRealData(loc, modelSpecs, loadLabel=False)
        res = CalcFeatureExpect4OneProtein(d)

        seqfeature, seqweight, matrixfeature, matrixweight = res[:4]
        seqfeatures.append(seqfeature)
        matrixfeatures.append(matrixfeature)
        seqweights.append(seqweight)
        matrixweights.append(matrixweight)

        if len(res) == 6:
            ## the embedding pair sits at positions 4 and 5; slicing from 5
            ## yields a single item and cannot be unpacked into two names
            embedfeature, embedweight = res[4:6]
            embedfeatures.append(embedfeature)
            embedweights.append(embedweight)

    modelSpecs['seqFeatures_expected'] = np.average(seqfeatures, axis=0, weights=seqweights)
    modelSpecs['matrixFeatures_expected'] = np.average(matrixfeatures, axis=0, weights=matrixweights)

    ## np.average raises ZeroDivisionError on empty weights, so skip the
    ## embedding expectation when no protein provided embedding features
    if embedfeatures:
        modelSpecs['embedFeatures_expected'] = np.average(embedfeatures, axis=0, weights=embedweights)
def DetermineFeatureDimensionBySampling(metaData, modelSpecs):
    """Fill in the input feature dimensions of modelSpecs from one sampled protein.

    A single protein is drawn with DataProcessor.SampleProteinInfo(numSamples=1)
    and loaded without labels. The following keys are then written:
        'n_in_seq': DetermineNumSeqFeatures of 'seqFeatures'.
        'n_in_matrix': DetermineNumMatrixFeatures of 'matrixFeatures' plus that
            of 'matrixFeatures_nomean'.
        'n_in_embed': second dimension of 'embedFeatures', only when the loaded
            data contains that key.

    Args:
        metaData: meta data handed to DataProcessor.SampleProteinInfo.
        modelSpecs: model specification; used for loading and updated in place.
    """
    sampled = DataProcessor.SampleProteinInfo(metaData, numSamples=1)
    features = DataProcessor.LoadRealData(sampled[0], modelSpecs, loadLabel=False, returnMode='list')

    ## obtain the dimension of each type of input feature
    modelSpecs['n_in_seq'] = DetermineNumSeqFeatures(features['seqFeatures'])

    numMatrix = DetermineNumMatrixFeatures(features['matrixFeatures'])
    numMatrixNoMean = DetermineNumMatrixFeatures(features['matrixFeatures_nomean'])
    modelSpecs['n_in_matrix'] = numMatrix + numMatrixNoMean

    if 'embedFeatures' in features:
        modelSpecs['n_in_embed'] = features['embedFeatures'].shape[1]
def CalcLabelDistributionNWeightBySampling(trainMetaData, modelSpecs):
    """Sample training proteins and pass their labels to CalcLabelDistributionAndWeight.

    DataProcessor.SampleProteinInfo is asked for numSamples=10000 entries. Each
    one is loaded with loadFeature=False, so only the ground truth is read and
    not the input features, which saves memory and time.

    Args:
        trainMetaData: meta data handed to DataProcessor.SampleProteinInfo.
        modelSpecs: model specification forwarded to the loader and to
            CalcLabelDistributionAndWeight.
    """
    sampledLocations = DataProcessor.SampleProteinInfo(trainMetaData, numSamples=10000)
    labelsOnly = [DataProcessor.LoadRealData(location, modelSpecs, loadFeature=False) for location in sampledLocations]
    CalcLabelDistributionAndWeight(labelsOnly, modelSpecs)
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    """Endlessly produce training minibatches and put them onto sharedQ.

    On every pass the meta data is re-sampled, split into minibatches and the
    batch order is shuffled. Label matrices are cached per protein name: the
    first load of a protein reads its labels, later loads skip them
    (loadLabel=False) and reuse the cached matrix. When
    config.UseSampleWeight(modelSpecs) is true, label weight matrices are
    cached in the same way.

    This function has no stop condition and never returns.

    Args:
        sharedQ: queue receiving one item per minibatch.
        trainMetaData: meta data handed to DataProcessor.SampleProteinInfo.
        modelSpecs: model specification; 'minibatchSize' is read here.
        assembleData: when true, each batch goes through PrepareInput4Train first.
        UseSharedMemory: forwarded to PrepareInput4Train.
    """
    ## one protein may have several sets of input features (MSA sampling or
    ## sequence-template alignment) but only one set of label matrices, so it is
    ## worth keeping the label matrices in RAM
    cachedLabels = {}
    cachedLabelWeights = {}

    while True:
        sampledLocations = DataProcessor.SampleProteinInfo(trainMetaData)
        batches = DataProcessor.SplitData2Batches(sampledLocations, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
        random.shuffle(batches)

        for batch in batches:
            loaded = []
            for protein in batch:
                proteinName = protein['name']

                if proteinName not in cachedLabels:
                    ## first encounter: load features and labels, then cache the labels
                    record = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert 'atomLabelMatrix' in record
                    cachedLabels[proteinName] = record['atomLabelMatrix']
                else:
                    ## label is already in the pool
                    record = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    record['atomLabelMatrix'] = cachedLabels[proteinName]

                if config.UseSampleWeight(modelSpecs):
                    if proteinName in cachedLabelWeights:
                        record['labelWeightMatrix'] = cachedLabelWeights[proteinName]
                    else:
                        weightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=record['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
                        cachedLabelWeights[proteinName] = weightMatrix
                        record['labelWeightMatrix'] = weightMatrix

                loaded.append(record)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, loaded)

            if assembleData:
                payload = PrepareInput4Train(loaded, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
            else:
                payload = loaded
            sharedQ.put(payload)
def main(argv): modelSpecs = InitializeModelSpecs() modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs) startTime = datetime.datetime.now() trainMetaData = DataProcessor.LoadMetaData(modelSpecs['trainFile']) FeatureUtils.DetermineFeatureDimensionBySampling(trainMetaData, modelSpecs) ## calculate label distribution and weight at the very beginning print 'Calculating label distribution...' LabelUtils.CalcLabelDistributionNWeightBySampling(trainMetaData, modelSpecs) if config.TrainByRefLoss(modelSpecs) or config.UseRefState(modelSpecs): print 'Calculating feature expection by sampling...' FeatureUtils.CalcFeatureExpectBySampling(trainMetaData, modelSpecs) ## trainMetaData is a list of groups. Each group contains a set of related proteins (seq-template alignments) and files for their features trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData) trainSeqData = DataProcessor.SplitData2Batches( trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs) print 'approximate #batches for train data: ', len(trainSeqData) #global trainSharedQ, stopTrainDataLoader, trainDataLoaders, trainSharedLabelPool, trainSharedLabelWeightPool global trainSharedQ, stopTrainDataLoader, trainDataLoaders trainSharedQ = multiprocessing.Queue(config.QSize(modelSpecs)) stopTrainDataLoader = multiprocessing.Event() #trainSharedLabelPool = multiprocessing.Manager().dict() #trainSharedLabelWeightPool = multiprocessing.Manager().dict() #print stopTrainDataLoader numTrainDataLoaders = config.NumTrainDataLoaders(modelSpecs) metaDatas = DataProcessor.SplitMetaData(trainMetaData, numTrainDataLoaders) trainDataLoaders = [] for i, metaData in zip(xrange(numTrainDataLoaders), metaDatas): #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader, args=(trainSharedQ, metaData, modelSpecs, True, True)) trainDataLoader = multiprocessing.Process( name='TrainDataLoader ' + str(i) + ' 
for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader2, args=(trainSharedQ, stopTrainDataLoader, metaData, modelSpecs, True, True)) #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader3, args=(trainSharedQ, trainSharedLabelPool, trainSharedLabelWeightPool, stopTrainDataLoader, metaData, modelSpecs, True, True)) trainDataLoader.daemon = True trainDataLoaders.append(trainDataLoader) print 'start the train data loaders...' for trainDataLoader in trainDataLoaders: trainDataLoader.start() validMetaData = DataProcessor.LoadMetaData(modelSpecs['validFile']) validDataLocation = DataProcessor.SampleProteinInfo(validMetaData) ## split data into batches, but do not load the real data from disk #validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs) validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=500 * 500, modelSpecs=modelSpecs) print '#batches for validation data: ', len(validSeqData) global validSharedQ, validDataLoader, stopValidDataLoader validSharedQ = multiprocessing.Queue(len(validSeqData)) stopValidDataLoader = multiprocessing.Event() #print stopValidDataLoader ## shared memory is a limited resource, so avoid using it as much as possible ## here we do not use shared array for validation data since we only need to load it once #validDataLoader = multiprocessing.Process(name='ValidDataLoader for '+str(os.getpid()), target=TrainUtils.ValidDataLoader, args=(validSharedQ, validSeqData, modelSpecs, True, False)) validDataLoader = multiprocessing.Process( name='ValidDataLoader for ' + str(os.getpid()), target=TrainUtils.ValidDataLoader2, args=(validSharedQ, stopValidDataLoader, validSeqData, modelSpecs, True, False)) print 'start the validation data loader...' 
validDataLoader.start() """ if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True): ##calculate the average weight per minibatch maxDeviation = DataProcessor.CalcAvgWeightPerBatch(trainSeqDataset, modelSpecs) print 'maxWeightDeviation=', maxDeviation """ beforeTrainTime = datetime.datetime.now() print 'time spent before training :', beforeTrainTime - startTime result = TrainModel(modelSpecs=modelSpecs, trainValidData=(trainSeqData, validSeqData)) ##merge ModelSpecs and result resultModel = modelSpecs.copy() resultModel.update(result) modelFile = TrainUtils.GenerateModelFileName(resultModel) print 'Writing the resultant model to ', modelFile cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL) afterTrainTime = datetime.datetime.now() print 'time spent on training:', afterTrainTime - beforeTrainTime ## clean up again print 'Cleaning up again...' Cleanup()
def TrainModel(modelSpecs, trainValidData=None, predDataFile=None): if (not trainValidData): print 'Please provide train and validation data for model training' exit(1) if modelSpecs is None: print 'Please provide a model specification for training' exit(1) distancePredictor, variable4train, variable4validate, params, params4mean, params4var, paramL2, regularizer, topAcc, errors, labelList, weightList, trainByRefLoss = PrepareModel( modelSpecs) chkpoint, restart = InitializeChkpoint(params, modelSpecs) assert (len(modelSpecs['numEpochs']) > 0) numEpochs4stages = np.cumsum(modelSpecs['numEpochs']) ## train parameters not related to variance and correlation epoch = chkpoint['epoch'] if epoch < numEpochs4stages[-1]: if weightList is not None and len(weightList) > 0: loss4train = distancePredictor.loss(labelList, useMeanOnly=True, weightList=weightList, trainByRefLoss=trainByRefLoss) loss4validate = distancePredictor.loss(labelList, useMeanOnly=True, weightList=weightList) else: loss4train = distancePredictor.loss(labelList, useMeanOnly=True, trainByRefLoss=trainByRefLoss) loss4validate = distancePredictor.loss(labelList, useMeanOnly=True) """ ## weightedLoss is only used for cost, i.e., gradient calculation if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True): weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs) else: weightedLoss = loss """ if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']): cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum( modelSpecs['w4responses']) else: cost = T.sum(T.mul(loss4train, modelSpecs['w4responses']) ) / np.sum(modelSpecs['w4responses']) + regularizer params4var_set = set(params4var) pgrads = [ T.grad(cost, p, consider_constant=weightList, disconnected_inputs='warn') if p not in params4var_set else T.zeros_like(p) for p in params ] pdecay = [ p if p not in params4var_set else T.zeros_like(p) for p in params ] for stage, lr, epoch_end in 
zip(xrange(len(numEpochs4stages)), modelSpecs['lrs'], numEpochs4stages): if epoch >= epoch_end: continue print 'training for mean using a learning rate ', lr, ' ...' startFromBest = (stage > 0 and epoch == numEpochs4stages[stage - 1]) epoch_start = epoch epoch = RunOneStage(epoch_start, epoch_end, trainValidData, chkpoint, loss4train, loss4validate, pgrads, pdecay, modelSpecs, lr=lr, startFromBest=(startFromBest, startFromBest)) ## train parameters only specific to variance and correlation numEpochs4var = modelSpecs['numEpochs4var'] lrs = modelSpecs['lrs4var'] if len(params4var) > 0: assert (len(numEpochs4var) > 0) assert (len(lrs) > 0) previousEpochs4Stages = numEpochs4stages numEpochs4stages = np.cumsum(numEpochs4var) + numEpochs4stages[-1] if epoch < numEpochs4stages[-1]: print 'Training the parameters specific to correlation and variance ...' if weightList is not None and len(weightList) > 0: loss4train = distancePredictor.loss( labelList, weightList=weightList, trainByRefLoss=trainByRefLoss) loss4validate = distancePredictor.loss(labelList, weightList=weightList) else: loss4train = distancePredictor.loss(labelList) loss4validate = distancePredictor.loss(labelList) """ ## weightedLoss is only used for cost, i.e., gradient calculation if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True): weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs) else: weightedLoss = loss """ if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']): cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum( modelSpecs['w4responses']) else: cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum( modelSpecs['w4responses']) + regularizer params4var_set = set(params4var) pgrads = [ T.grad(cost, p, consider_constant=weightList, disconnected_inputs='raise') if p in params4var_set else T.zeros_like(p) for p in params ] pdecay = [ p if p in params4var_set else T.zeros_like(p) for p in params ] for stage, lr, epoch_end in 
zip(xrange(len(lrs)), lrs, numEpochs4stages): if epoch >= epoch_end: continue print 'training for variance using a learning rate ', lr, ' ...' startFromBest = ( (stage == 0 and epoch == previousEpochs4Stages[-1]) or (stage > 0 and epoch == numEpochs4stages[stage - 1])) epoch_start = epoch epoch = RunOneStage(epoch_start, epoch_end, trainValidData, chkpoint, loss4train, loss4validate, pgrads, pdecay, modelSpecs, lr=lr, startFromBest=(startFromBest, startFromBest and (stage > 0))) resultModel = {} resultModel['dateTrained'] = datetime.datetime.now() #resultModel['validLoss'] = validLoss resultModel['validLoss'] = chkpoint['best_validation_loss'] #resultModel['validErr'] = validErr if chkpoint.has_key('best_validation_err'): resultModel['validErr'] = chkpoint['best_validation_err'] resultModel['trainLoss'] = chkpoint['train_loss4best_validation_loss'] #resultModel['validAcc']= validAcc if chkpoint.has_key('best_validation_acc'): resultModel['validAcc'] = chkpoint['best_validation_acc'] resultModel['paramValues'] = chkpoint['bestParamValues'] bestParamL2norm = np.sum([(v**2).sum() for v in chkpoint['bestParamValues']]) resultModel['bestParamL2norm'] = bestParamL2norm bestParamL1norm = np.sum( [abs(v).sum() for v in chkpoint['bestParamValues']]) resultModel['bestParamL1norm'] = bestParamL1norm print 'best param L1 norm: ', bestParamL1norm, 'L2 norm: ', bestParamL2norm Cleanup() #test on prediction data if it is given. Here the prediction data shall be small to save memory and contain ground truth. if modelSpecs['predFile'] is not None: predMetaData = DataProcessor.LoadMetaData(modelSpecs['predFile']) predDataLocation = DataProcessor.SampleProteinInfo(predMetaData) predBatches = DataProcessor.SplitData2Batches(predDataLocation, numDataPoints=624, modelSpecs=modelSpecs) print '\nLoading prediction data...' 
print "#predData minibatches:", len(predBatches) predData = [] for batch in predBatches: data = DataProcessor.LoadRealData(batch, modelSpecs, returnMode='list') FeatureUtils.CheckModelNDataConsistency(modelSpecs, data) #input = TrainUtils.PrepareInput4Prediction(data, modelSpecs, floatType=np.float16) input = TrainUtils.PrepareInput4Prediction( data, modelSpecs, floatType=theano.config.floatX) predData.append(input) if weightList is not None and len(weightList) > 0: loss4validate = distancePredictor.loss(labelList, weightList=weightList) else: loss4validate = distancePredictor.loss(labelList) fullValidate = theano.function(variable4validate, [loss4validate, errors, topAcc], on_unused_input='warn') if config.UseRefState(modelSpecs): quickValidate = theano.function(variable4validate, [loss4validate, errors], on_unused_input='warn') ## set model parameters for valiation and possibly prediction for param, value in zip(params, chkpoint['bestParamValues']): param.set_value(value) predLoss, predErr, predAcc = ValidateAllData(predData, fullValidate, modelSpecs) if config.UseRefState(modelSpecs): refLoss, refErr = ValidateAllData(predData, quickValidate, modelSpecs, forRefState=True) print 'pred loss: ', predLoss, 'pred err: ', predErr, 'ref loss: ', refLoss, 'ref err: ', refErr else: print 'pred loss: ', predLoss, 'pred err: ', predErr resultModel['predLoss'] = predLoss resultModel['predErr'] = predErr print "predAcc: ", [str_display(pAcc[:, 0]) for pAcc in predAcc ], 'for top ', modelSpecs['topRatios'] resultModel['predAcc'] = predAcc del predData[:] ## training is done, remove the checkpoint file since it has been copied at the end of each stage if modelSpecs.has_key('checkpointFile') and (modelSpecs['checkpointFile'] is not None): try: os.remove(modelSpecs['checkpointFile']) except IOError: print 'WARNING: error in deleting the check point file: ', modelSpecs[ 'checkpointFile'] ## remove theano variables from modelSpecs keys4removal = [ 'variable4train', 
'variable4validate', 'params', 'params4mean', 'params4var', 'paramL2', 'regularizer', 'topAcc', 'errors', 'labelList', 'weightList', 'trainByRefLoss' ] for k in keys4removal: if modelSpecs.has_key(k): del modelSpecs[k] return resultModel
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False): #print 'trainDataLoader has event: ', stopTrainDataLoader bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs) if any([bUseCCMraw, bUseFullMI, bUseFullCov]): ## when full coevolution matrices are used, we shall use float16 to save memory floatType = np.float16 else: floatType = theano.config.floatX ## here we use labelPool to cache the labels of all the training proteins ## one protein may have multiple sets of input features due to MSA sampling or sequnence-template alignment ## but it can only have one set of label matrices, so it is worth to save all label matrices in RAM. labelPool = dict() labelWeightPool = dict() while True: if stopTrainDataLoader.is_set() or os.getppid()==1: #print 'trainDataLoader receives the stop signal' break trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData) numOriginals = len(trainDataLocation) trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs) random.shuffle(trainSeqData) #i = 0 for batch in trainSeqData: if stopTrainDataLoader.is_set() or os.getppid()==1: #print 'trainDataLoader receives the stop signal' break data = [] for protein in batch: name = protein['name'] if labelPool.has_key(name): ## label is already in the pool d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list') d['atomLabelMatrix'] = labelPool[name] else: d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list') assert d.has_key('atomLabelMatrix') labelPool[name] = d['atomLabelMatrix'] if config.UseSampleWeight(modelSpecs): if not labelWeightPool.has_key(name): labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16) labelWeightPool[name] = labelWeightMatrix d['labelWeightMatrix'] = 
labelWeightMatrix else: d['labelWeightMatrix'] = labelWeightPool[name] data.append(d) FeatureUtils.CheckModelNDataConsistency(modelSpecs, data) if assembleData: data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory) #print 'putting data to trainDataLoader queue...' sharedQ.put(data) """ i += 1 if i%100 == 0: print '#batches of train data loaded: ', i """ #print 'TrainDataLoader with #PID ', os.getpid(), ' currently has ', len(labelPool), ' label matrices and ', len(labelMatrixPool), ' label weight matrices' print 'TrainDataLoader has finished loading data' sharedQ.close()