def main(argv):
    modelSpecs = config.InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    ## load the dataset. Data is a list of proteins and each protein is represented as a dict()
    Data = DataProcessor.LoadDistanceLabelMatrices(modelSpecs['dataset'], modelSpecs=modelSpecs)
    print '#proteins loaded from the dataset: ', len(Data)
    allProteins = [ d['name'] for d in Data ]

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    batches = DataProcessor.SplitData2Batches(data=Data, numDataPoints=groupSize, modelSpecs=modelSpecs)
    print "#batches:", len(batches)

    ## calculate the empirical reference state.
    ## RefState is a dict: RefState[response] = (length-independent ref, length-dependent ref).
    ## the length-independent ref is a 1d array; the length-dependent ref is a list of tuples (length, 1d array).
    RefState = CalcRefState(batches=batches, modelSpecs=modelSpecs)
    RefState['dataset'] = modelSpecs['dataset']
    RefState['proteins'] = allProteins

    ## save RefState
    responseStr = '-'.join(modelSpecs['responses'])
    file4save = 'EmpRefState-' + responseStr + '-' + str(os.getpid()) + '.pkl'
    fh = open(file4save, 'wb')
    cPickle.dump(RefState, fh, protocol=cPickle.HIGHEST_PROTOCOL)
    fh.close()

    ## print the length-independent reference state
    for response in modelSpecs['responses']:
        print RefState[response][0]
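## A minimal sketch (not part of the pipeline) showing how the saved EmpRefState pickle
## could be inspected afterwards. The file layout follows the code above; the function
## name InspectRefState and the printed fields are illustrative assumptions.
def InspectRefState(pklFile):
    import cPickle
    fh = open(pklFile, 'rb')
    RefState = cPickle.load(fh)
    fh.close()
    print 'dataset: ', RefState['dataset']
    print '#proteins: ', len(RefState['proteins'])
    for response, ref in RefState.iteritems():
        if response in ('dataset', 'proteins'):
            continue
        lenIndependentRef, lenDependentRef = ref
        print response, ' length-independent ref shape: ', lenIndependentRef.shape
        print response, ' #length-dependent entries: ', len(lenDependentRef)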
def TrainDataLoader3(sharedQ, sharedLabelPool, sharedLabelWeightPool, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    #print 'trainDataLoader has event: ', stopTrainDataLoader

    ## here we use labelPool to cache the labels of all the training proteins.
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
    ## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
    labelPool = dict()
    labelWeightPool = dict()

    ## load the labels of all training proteins
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
    for loc in trainDataLocation:
        d = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False, returnMode='list')
        name = d['name']
        labelPool[name] = d['atomLabelMatrix']
        labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
        labelWeightPool[name] = labelWeightMatrix

    print 'TrainDataLoader with PID ', os.getpid(), ' has loaded ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices'

    ## copy labelPool and labelWeightPool into the shared dict()
    sharedLabelPool.update(labelPool)
    sharedLabelWeightPool.update(labelWeightPool)
    print 'TrainDataLoader with PID ', os.getpid(), ' has updated the shared labelPool and labelWeightPool'

    while True:
        if stopTrainDataLoader.is_set() or os.getppid() == 1:
            print 'trainDataLoader receives the stop signal'
            break

        trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
        numOriginals = len(trainDataLocation)
        """
        maxLen = 900
        trainDataLocation, numExcluded = DataProcessor.FilterByLength(trainDataLocation, maxLen)
        print 'Exclude ', numExcluded, ' train proteins longer than ', maxLen, ' AAs'
        """

        trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
        random.shuffle(trainSeqData)

        for batch in trainSeqData:
            if stopTrainDataLoader.is_set() or os.getppid() == 1:
                print 'trainDataLoader receives the stop signal'
                break

            names = [ p['name'] for p in batch ]
            data = []
            for protein in batch:
                d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
            #print 'putting data to trainDataLoader queue...'
            sharedQ.put( (data, names) )

    print 'TrainDataLoader has finished loading data'
    sharedQ.close()
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    ## here we use labelPool to cache the labels of all the training proteins.
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
    ## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
    labelPool = dict()
    labelMatrixPool = dict()

    while True:
        trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
        numOriginals = len(trainDataLocation)
        trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
        random.shuffle(trainSeqData)

        for batch in trainSeqData:
            data = []
            for protein in batch:
                name = protein['name']
                if labelPool.has_key(name):
                    ## the label is already in the pool, so load features only
                    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    d['atomLabelMatrix'] = labelPool[name]
                else:
                    d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert d.has_key('atomLabelMatrix')
                    labelPool[name] = d['atomLabelMatrix']

                if config.UseSampleWeight(modelSpecs):
                    if not labelMatrixPool.has_key(name):
                        labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
                        labelMatrixPool[name] = labelWeightMatrix
                        d['labelWeightMatrix'] = labelWeightMatrix
                    else:
                        d['labelWeightMatrix'] = labelMatrixPool[name]

                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
            #print 'putting data to trainDataLoader queue...'
            sharedQ.put(data)
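## A minimal sketch of the consumer side of the producer/consumer design above, assuming
## the training process drains sharedQ with a blocking get(). The function name
## ConsumeTrainBatches and the numBatches argument are illustrative only.
def ConsumeTrainBatches(sharedQ, numBatches):
    for _ in xrange(numBatches):
        ## blocks until one of the loader processes has put an assembled minibatch on the queue
        batch = sharedQ.get()
        yield batch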
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
    ## load all the models from files. Each file contains the specification of one model.
    models = []
    for mFile in modelFiles:
        fh = open(mFile, 'rb')
        model = cPickle.load(fh)
        fh.close()
        models.append(model)

    ## check consistency among models. All the models shall have the same labelType for the same atom pair type.
    labelTypes = dict()
    for model in models:
        for response in model['responses']:
            labelName = Response2LabelName(response)
            labelType = Response2LabelType(response)
            if not labelTypes.has_key(labelName):
                labelTypes[labelName] = labelType
            elif labelTypes[labelName] != labelType:
                print 'WARNING: at least two models have different label types for the same atom pair type.'
                exit(-1)

    allsequences = dict()

    ## allresults is a nested dictionary, e.g., allresults[proteinName][response] = sum of predicted prob matrices.
    ## we predict one prob matrix from each model for each protein and each response;
    ## two different models may share some overlapping responses.
    allresults = dict()
    numModels = dict()
    for model, mfile in zip(models, modelFiles):
        if not model['network'] in config.allNetworks:
            print 'unsupported network architecture: ', model['network']
            exit(-1)

        distancePredictor, x, y, xmask, ymask, xem, labelList, weightList = Model4DistancePrediction.BuildModel(model, forTrain=False)

        inputVariables = [ x, y, xmask, ymask ]
        if xem is not None:
            inputVariables.append(xem)

        pred_prob = distancePredictor.output_prob
        predict = theano.function(inputVariables, pred_prob, on_unused_input='warn')

        ## set model parameter values
        if not Compatible(distancePredictor.params, model['paramValues']):
            print 'FATAL ERROR: the model type or network architecture is not compatible with the parameter values loaded from the model file: ', mfile
            exit(-1)
        [ p.set_value(v) for p, v in zip(distancePredictor.params, model['paramValues']) ]

        ## load the feature files for each model separately since each model may have a different requirement on the data
        predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs=model, forTrainValidation=False)

        ## make sure the input has the same number of features as the model. Randomly check a few entries to speed up.
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

        if predData[0].has_key('embedFeatures'):
            rindex = np.random.randint(0, high=len(predData))
            assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

        ## check that all the proteins of the same name have exactly the same sequence
        for d in predData:
            if not allsequences.has_key(d['name']):
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(-1)

        ## predSeqData and names are in exactly the same order, so we know which data is for which protein
        predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        for onebatch, names4onebatch in zip(predSeqData, names):
            input = onebatch[ : len(inputVariables) ]
            result = predict(*input)

            x1d, x2d, x1dmask, x2dmask = input[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ## result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model.
            assert result.shape[3] == sum([ config.responseProbDims[Response2LabelType(res)] for res in model['responses'] ])

            ## calculate the start and end positions of each response in the last dimension of result
            dims = [ config.responseProbDims[Response2LabelType(res)] for res in model['responses'] ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for name in names4onebatch:
                if not allresults.has_key(name):
                    allresults[name] = dict()
                    numModels[name] = dict()

            ## batchres is a batch of results with ndim=4
            for response, start, end in zip(model['responses'], startPositions, endPositions):
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are contact/distance matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end ]
                ## remove masked positions
                revised_batchres = [ probMatrix[ maxSeqLen-seqLen:, maxSeqLen-seqLen:, : ] for probMatrix, seqLen in zip(batchres, seqLens) ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if not allresults[name].has_key(response):
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save only the sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

        del predict
        del predData
        del predSeqData
        gc.collect()

    ## calculate the final result, which is the average of all the predicted prob matrices for the same protein and response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()

        ## finalresults[name][response] has 3 dimensions
        for response in results.keys():
            #finalresults[name][response] = np.average(allresults[name][response], axis=0)
            finalresults[name][response] = allresults[name][response] / numModels[name][response]

            ## make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            apt = Response2LabelName(response)
            if config.IsSymmetricAPT(apt):
                finalresults[name][response] = (finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2))) / 2.

    ## collect the average label distributions and weight matrices. We collect all the matrices and then calculate their average.
    labelDistributions = dict()
    labelWeights = dict()
    for model in models:
        for response in model['responses']:
            apt = response
            if not labelDistributions.has_key(apt):
                labelDistributions[apt] = []
            if not labelWeights.has_key(apt):
                labelWeights[apt] = []

            labelDistributions[apt].append(model['labelRefProbs'][response])
            labelWeights[apt].append(model['weight4labels'][response])

    finalLabelDistributions = dict()
    finalLabelWeights = dict()
    for apt in labelDistributions.keys():
        finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
    for apt in labelWeights.keys():
        finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)

    ## convert each predicted distance probability matrix into a predicted contact matrix.
    ## each predicted prob matrix has 3 dimensions while each predicted contact matrix has 2 dimensions.
    predictedContactMatrices = dict()
    from scipy.stats import norm
    for name, results in finalresults.iteritems():
        predictedContactMatrices[name] = dict()
        for response in results.keys():
            apt = Response2LabelName(response)
            labelType = Response2LabelType(response)

            if apt in config.allAtomPairTypes:
                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'): ]
                    labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
                    predictedContactMatrices[name][apt] = np.sum(finalresults[name][response][:, :, :labelOf8], axis=2)
                elif labelType.startswith('Normal'):
                    assert labelType.startswith('Normal1d2')
                    normDistribution = norm(loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:, :, 1])
                    predictedContactMatrices[name][apt] = normDistribution.cdf(config.ContactDefinition)
                elif labelType.startswith('LogNormal'):
                    assert labelType.startswith('LogNormal1d2')
                    normDistribution = norm(loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:, :, 1])
                    predictedContactMatrices[name][apt] = normDistribution.cdf(np.log(config.ContactDefinition))
                else:
                    print 'unsupported label type in response: ', response
                    exit(-1)
            elif apt in ['HB', 'Beta']:
                predictedContactMatrices[name][apt] = finalresults[name][response][:, :, 0]
            else:
                print 'unsupported atom type in response: ', response
                exit(-1)

    ## write all the results here.
    ## for each protein, we have one output file, which deposits a tuple
    ## (name, sequence, predicted distance prob matrices, predicted contact matrices, labelWeights, labelDistributions);
    ## the label weights and distributions are stored for future use.
    for name, results in finalresults.iteritems():
        savefilename = name + '.predictedDistMatrix.pkl'
        if savefolder is not None:
            savefilename = os.path.join(savefolder, savefilename)

        fh = open(savefilename, 'wb')
        cPickle.dump((name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        fh.close()

    return finalresults, predictedContactMatrices, allsequences
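## A minimal numpy sketch of the discrete-label branch above, assuming the last axis
## holds the probabilities of discrete distance bins and that bins [0, labelOf8) cover
## distances below the 8 Angstrom contact threshold. The toy shapes are illustrative.
def ToyContactFromDiscreteProbs():
    import numpy as np
    L, numBins, labelOf8 = 5, 12, 4
    ## (L, L, numBins); each (i, j) entry is a distribution over distance bins
    probs = np.random.dirichlet(np.ones(numBins), size=(L, L))
    ## contact probability = total probability mass in the bins below the cutoff
    contactMap = np.sum(probs[:, :, :labelOf8], axis=2)
    return contactMap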
def PredictProperty(models, predictors, predFiles):
    allsequences = dict()

    ## allresults is a nested dictionary, e.g., allresults[proteinName][response] = predicted_property_list
    allresults4prob = dict()
    allresults = dict()

    for model, predictor in zip(models, predictors):
        predict, inputVariables = predictor

        ## load the feature files for each model separately since each model may use a different set of features
        predData = DataProcessor.LoadPropertyFeatures(predFiles, modelSpecs=model, forTrainValidation=False)

        ## make sure the input has the same number of features as the model
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        ## collect sequences
        for d in predData:
            if not allsequences.has_key(d['name']):
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=30, modelSpecs=model, forTrainValidation=False)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        for onebatch, names4onebatch in zip(predSeqData, names):
            input = onebatch[:len(inputVariables)]
            result4prob, result = predict(*input)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = input[0:2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ## result4prob has shape (batchSize, maxSeqLen, sum(responseProbDims[res] for res in model['responses']))
            assert result4prob.shape[2] == sum([ config.responseProbDims[Response2LabelType(res)] for res in model['responses'] ])

            ## result has shape (batchSize, maxSeqLen, sum(responseValueDims[res] for res in model['responses']))
            assert result.shape[2] == sum([ config.responseValueDims[Response2LabelType(res)] for res in model['responses'] ])

            nameGenerator = (name for name in names4onebatch if not allresults.has_key(name))
            for name in nameGenerator:
                allresults[name] = dict()
                allresults4prob[name] = dict()

            dims = [ config.responseProbDims[Response2LabelType(res)] for res in model['responses'] ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for res, start, end in zip(model['responses'], startPositions, endPositions):
                nameGenerator = (name for name in names4onebatch if not allresults4prob[name].has_key(res))
                for name in nameGenerator:
                    allresults4prob[name][res] = []

                ## remove masked positions
                revised_batchres = [ tmp[maxSeqLen - seqLen:, :] for tmp, seqLen in zip(result4prob[:, :, start:end], seqLens) ]
                [ allresults4prob[name][res].append(res4one) for res4one, name in zip(revised_batchres, names4onebatch) ]

            dims = [ config.responseValueDims[Response2LabelType(res)] for res in model['responses'] ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for res, start, end in zip(model['responses'], startPositions, endPositions):
                nameGenerator = (name for name in names4onebatch if not allresults[name].has_key(res))
                for name in nameGenerator:
                    allresults[name][res] = []

                ## remove masked positions
                revised_batchres = [ tmp[maxSeqLen - seqLen:, :] for tmp, seqLen in zip(result[:, :, start:end], seqLens) ]
                [ allresults[name][res].append(res4one) for res4one, name in zip(revised_batchres, names4onebatch) ]

    ## calculate the final result, which is the average of all the predicted properties for the same protein and response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()
        for response in results.keys():
            tmpresult = np.average(allresults[name][response], axis=0)

            ## convert the coding of discrete labels into a more meaningful representation
            labelType = Response2LabelType(response)
            if not labelType.startswith('Discrete'):
                finalresults[name][response] = tmpresult

    finalresults4prob = dict()
    for name, results in allresults4prob.iteritems():
        if not finalresults4prob.has_key(name):
            finalresults4prob[name] = dict()
        for response in results.keys():
            finalresults4prob[name][response] = np.average(allresults4prob[name][response], axis=0)

            labelType = Response2LabelType(response)
            if labelType.startswith('Discrete'):
                tmpresult = np.argmax(finalresults4prob[name][response], axis=1)
                finalresults[name][response] = PropertyUtils.Coding2String(tmpresult, response)

    """
    ## collect the average label distributions and weight matrix. We collect all the matrices and then calculate their average.
    labelDistributions = dict()
    labelWeights = dict()
    for model in models:
        for apt in model['responseNames']:
            if not labelDistributions.has_key(apt):
                labelDistributions[apt] = []
            if not labelWeights.has_key(apt):
                labelWeights[apt] = []

            labelDistributions[apt].append(model['labelRefProbs'][apt])
            labelWeights[apt].append(model['weight4' + model['labelType'] ][apt])

    finalLabelDistributions = dict()
    finalLabelWeights = dict()
    for apt in labelDistributions.keys():
        finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
    for apt in labelWeights.keys():
        finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)
    """

    return finalresults4prob, finalresults, allsequences
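## A minimal sketch of the discrete-property decoding above: per-position class
## probabilities are averaged over models, then argmax picks the label index, which
## Coding2String maps back to, e.g., secondary-structure letters. The 3-class setup
## and two-model ensemble below are illustrative assumptions only.
def ToyDecodeDiscreteProperty():
    import numpy as np
    seqLen, numClasses = 6, 3
    ## two models, each predicting (seqLen, numClasses) per-residue class probabilities
    probsFromTwoModels = [ np.random.dirichlet(np.ones(numClasses), size=seqLen) for _ in range(2) ]
    avgProbs = np.average(probsFromTwoModels, axis=0)
    labelIndices = np.argmax(avgProbs, axis=1)  ## one class index per residue
    return labelIndices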
def main(argv):
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    trainMetaData = DataProcessor.LoadMetaData(modelSpecs['trainFile'])
    FeatureUtils.DetermineFeatureDimensionBySampling(trainMetaData, modelSpecs)

    ## calculate label distribution and weight at the very beginning
    print 'Calculating label distribution...'
    LabelUtils.CalcLabelDistributionNWeightBySampling(trainMetaData, modelSpecs)

    if config.TrainByRefLoss(modelSpecs) or config.UseRefState(modelSpecs):
        print 'Calculating feature expectation by sampling...'
        FeatureUtils.CalcFeatureExpectBySampling(trainMetaData, modelSpecs)

    ## trainMetaData is a list of groups. Each group contains a set of related proteins (seq-template alignments) and the files for their features.
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
    trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
    print 'approximate #batches for train data: ', len(trainSeqData)

    #global trainSharedQ, stopTrainDataLoader, trainDataLoaders, trainSharedLabelPool, trainSharedLabelWeightPool
    global trainSharedQ, stopTrainDataLoader, trainDataLoaders
    trainSharedQ = multiprocessing.Queue(config.QSize(modelSpecs))
    stopTrainDataLoader = multiprocessing.Event()
    #trainSharedLabelPool = multiprocessing.Manager().dict()
    #trainSharedLabelWeightPool = multiprocessing.Manager().dict()
    #print stopTrainDataLoader

    numTrainDataLoaders = config.NumTrainDataLoaders(modelSpecs)
    metaDatas = DataProcessor.SplitMetaData(trainMetaData, numTrainDataLoaders)

    trainDataLoaders = []
    for i, metaData in zip(xrange(numTrainDataLoaders), metaDatas):
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader, args=(trainSharedQ, metaData, modelSpecs, True, True))
        trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader2, args=(trainSharedQ, stopTrainDataLoader, metaData, modelSpecs, True, True))
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader3, args=(trainSharedQ, trainSharedLabelPool, trainSharedLabelWeightPool, stopTrainDataLoader, metaData, modelSpecs, True, True))
        trainDataLoader.daemon = True
        trainDataLoaders.append(trainDataLoader)

    print 'start the train data loaders...'
    for trainDataLoader in trainDataLoaders:
        trainDataLoader.start()

    validMetaData = DataProcessor.LoadMetaData(modelSpecs['validFile'])
    validDataLocation = DataProcessor.SampleProteinInfo(validMetaData)

    ## split data into batches, but do not load the real data from disk
    #validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
    validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=500 * 500, modelSpecs=modelSpecs)
    print '#batches for validation data: ', len(validSeqData)

    global validSharedQ, validDataLoader, stopValidDataLoader
    validSharedQ = multiprocessing.Queue(len(validSeqData))
    stopValidDataLoader = multiprocessing.Event()
    #print stopValidDataLoader

    ## shared memory is a limited resource, so avoid using it as much as possible.
    ## here we do not use a shared array for validation data since we only need to load it once.
    #validDataLoader = multiprocessing.Process(name='ValidDataLoader for '+str(os.getpid()), target=TrainUtils.ValidDataLoader, args=(validSharedQ, validSeqData, modelSpecs, True, False))
    validDataLoader = multiprocessing.Process(name='ValidDataLoader for ' + str(os.getpid()), target=TrainUtils.ValidDataLoader2, args=(validSharedQ, stopValidDataLoader, validSeqData, modelSpecs, True, False))
    print 'start the validation data loader...'
    validDataLoader.start()

    """
    if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
        ## calculate the average weight per minibatch
        maxDeviation = DataProcessor.CalcAvgWeightPerBatch(trainSeqDataset, modelSpecs)
        print 'maxWeightDeviation=', maxDeviation
    """

    beforeTrainTime = datetime.datetime.now()
    print 'time spent before training:', beforeTrainTime - startTime

    result = TrainModel(modelSpecs=modelSpecs, trainValidData=(trainSeqData, validSeqData))

    ## merge modelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = TrainUtils.GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)

    afterTrainTime = datetime.datetime.now()
    print 'time spent on training:', afterTrainTime - beforeTrainTime

    ## clean up again
    print 'Cleaning up again...'
    Cleanup()
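## A minimal sketch, assuming Cleanup() is responsible for shutting down the loader
## processes started above: signal the shared Event so every loader exits its while
## loop, drain the queue so a loader blocked in put() can return, then join. This
## mirrors the stop checks inside TrainDataLoader2/ValidDataLoader2; the real body
## of Cleanup() is not shown in this file, so this function is illustrative only.
def SketchCleanupLoaders(stopEvent, loaders, sharedQ):
    import Queue as StdQueue
    stopEvent.set()
    try:
        while True:
            sharedQ.get_nowait()
    except StdQueue.Empty:
        pass
    for loader in loaders:
        loader.join(timeout=5)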
def TrainModel(modelSpecs, trainValidData=None, predDataFile=None):
    if (not trainValidData):
        print 'Please provide train and validation data for model training'
        exit(1)
    if modelSpecs is None:
        print 'Please provide a model specification for training'
        exit(1)

    distancePredictor, variable4train, variable4validate, params, params4mean, params4var, paramL2, regularizer, topAcc, errors, labelList, weightList, trainByRefLoss = PrepareModel(modelSpecs)

    chkpoint, restart = InitializeChkpoint(params, modelSpecs)

    assert (len(modelSpecs['numEpochs']) > 0)
    numEpochs4stages = np.cumsum(modelSpecs['numEpochs'])

    ## train the parameters not related to variance and correlation
    epoch = chkpoint['epoch']
    if epoch < numEpochs4stages[-1]:
        if weightList is not None and len(weightList) > 0:
            loss4train = distancePredictor.loss(labelList, useMeanOnly=True, weightList=weightList, trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList, useMeanOnly=True, weightList=weightList)
        else:
            loss4train = distancePredictor.loss(labelList, useMeanOnly=True, trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList, useMeanOnly=True)

        """
        ## weightedLoss is only used for cost, i.e., gradient calculation
        if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
            weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
        else:
            weightedLoss = loss
        """

        ## AdamW and AdamWAMS apply weight decay directly to the parameters, so the L2 regularizer is not added to the cost
        if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
            cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum(modelSpecs['w4responses'])
        else:
            cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum(modelSpecs['w4responses']) + regularizer

        params4var_set = set(params4var)
        pgrads = [ T.grad(cost, p, consider_constant=weightList, disconnected_inputs='warn') if p not in params4var_set else T.zeros_like(p) for p in params ]
        pdecay = [ p if p not in params4var_set else T.zeros_like(p) for p in params ]

        for stage, lr, epoch_end in zip(xrange(len(numEpochs4stages)), modelSpecs['lrs'], numEpochs4stages):
            if epoch >= epoch_end:
                continue
            print 'training for the mean using learning rate ', lr, ' ...'
            startFromBest = (stage > 0 and epoch == numEpochs4stages[stage - 1])
            epoch_start = epoch
            epoch = RunOneStage(epoch_start, epoch_end, trainValidData, chkpoint, loss4train, loss4validate, pgrads, pdecay, modelSpecs, lr=lr, startFromBest=(startFromBest, startFromBest))

    ## train the parameters specific to variance and correlation
    numEpochs4var = modelSpecs['numEpochs4var']
    lrs = modelSpecs['lrs4var']

    if len(params4var) > 0:
        assert (len(numEpochs4var) > 0)
        assert (len(lrs) > 0)

        previousEpochs4Stages = numEpochs4stages
        numEpochs4stages = np.cumsum(numEpochs4var) + numEpochs4stages[-1]

        if epoch < numEpochs4stages[-1]:
            print 'Training the parameters specific to correlation and variance ...'

            if weightList is not None and len(weightList) > 0:
                loss4train = distancePredictor.loss(labelList, weightList=weightList, trainByRefLoss=trainByRefLoss)
                loss4validate = distancePredictor.loss(labelList, weightList=weightList)
            else:
                loss4train = distancePredictor.loss(labelList)
                loss4validate = distancePredictor.loss(labelList)

            """
            ## weightedLoss is only used for cost, i.e., gradient calculation
            if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
                weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
            else:
                weightedLoss = loss
            """

            if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
                cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum(modelSpecs['w4responses'])
            else:
                cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])) / np.sum(modelSpecs['w4responses']) + regularizer

            params4var_set = set(params4var)
            pgrads = [ T.grad(cost, p, consider_constant=weightList, disconnected_inputs='raise') if p in params4var_set else T.zeros_like(p) for p in params ]
            pdecay = [ p if p in params4var_set else T.zeros_like(p) for p in params ]

            for stage, lr, epoch_end in zip(xrange(len(lrs)), lrs, numEpochs4stages):
                if epoch >= epoch_end:
                    continue
                print 'training for the variance using learning rate ', lr, ' ...'
                startFromBest = ((stage == 0 and epoch == previousEpochs4Stages[-1]) or (stage > 0 and epoch == numEpochs4stages[stage - 1]))
                epoch_start = epoch
                epoch = RunOneStage(epoch_start, epoch_end, trainValidData, chkpoint, loss4train, loss4validate, pgrads, pdecay, modelSpecs, lr=lr, startFromBest=(startFromBest, startFromBest and (stage > 0)))

    resultModel = {}
    resultModel['dateTrained'] = datetime.datetime.now()
    #resultModel['validLoss'] = validLoss
    resultModel['validLoss'] = chkpoint['best_validation_loss']

    #resultModel['validErr'] = validErr
    if chkpoint.has_key('best_validation_err'):
        resultModel['validErr'] = chkpoint['best_validation_err']

    resultModel['trainLoss'] = chkpoint['train_loss4best_validation_loss']

    #resultModel['validAcc'] = validAcc
    if chkpoint.has_key('best_validation_acc'):
        resultModel['validAcc'] = chkpoint['best_validation_acc']

    resultModel['paramValues'] = chkpoint['bestParamValues']

    bestParamL2norm = np.sum([ (v**2).sum() for v in chkpoint['bestParamValues'] ])
    resultModel['bestParamL2norm'] = bestParamL2norm

    bestParamL1norm = np.sum([ abs(v).sum() for v in chkpoint['bestParamValues'] ])
    resultModel['bestParamL1norm'] = bestParamL1norm

    print 'best param L1 norm: ', bestParamL1norm, 'L2 norm: ', bestParamL2norm

    Cleanup()

    ## test on the prediction data if it is given. Here the prediction data shall be small (to save memory) and contain ground truth.
    if modelSpecs['predFile'] is not None:
        predMetaData = DataProcessor.LoadMetaData(modelSpecs['predFile'])
        predDataLocation = DataProcessor.SampleProteinInfo(predMetaData)
        predBatches = DataProcessor.SplitData2Batches(predDataLocation, numDataPoints=624, modelSpecs=modelSpecs)
        print '\nLoading prediction data...'
        print '#predData minibatches:', len(predBatches)

        predData = []
        for batch in predBatches:
            data = DataProcessor.LoadRealData(batch, modelSpecs, returnMode='list')
            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            #input = TrainUtils.PrepareInput4Prediction(data, modelSpecs, floatType=np.float16)
            input = TrainUtils.PrepareInput4Prediction(data, modelSpecs, floatType=theano.config.floatX)
            predData.append(input)

        if weightList is not None and len(weightList) > 0:
            loss4validate = distancePredictor.loss(labelList, weightList=weightList)
        else:
            loss4validate = distancePredictor.loss(labelList)

        fullValidate = theano.function(variable4validate, [loss4validate, errors, topAcc], on_unused_input='warn')
        if config.UseRefState(modelSpecs):
            quickValidate = theano.function(variable4validate, [loss4validate, errors], on_unused_input='warn')

        ## set model parameters for validation and possibly prediction
        for param, value in zip(params, chkpoint['bestParamValues']):
            param.set_value(value)

        predLoss, predErr, predAcc = ValidateAllData(predData, fullValidate, modelSpecs)
        if config.UseRefState(modelSpecs):
            refLoss, refErr = ValidateAllData(predData, quickValidate, modelSpecs, forRefState=True)
            print 'pred loss: ', predLoss, 'pred err: ', predErr, 'ref loss: ', refLoss, 'ref err: ', refErr
        else:
            print 'pred loss: ', predLoss, 'pred err: ', predErr
        resultModel['predLoss'] = predLoss
        resultModel['predErr'] = predErr

        print 'predAcc: ', [ str_display(pAcc[:, 0]) for pAcc in predAcc ], 'for top ', modelSpecs['topRatios']
        resultModel['predAcc'] = predAcc

        del predData[:]

    ## training is done; remove the checkpoint file since it has been copied at the end of each stage
    if modelSpecs.has_key('checkpointFile') and (modelSpecs['checkpointFile'] is not None):
        try:
            os.remove(modelSpecs['checkpointFile'])
        except IOError:
            print 'WARNING: error in deleting the checkpoint file: ', modelSpecs['checkpointFile']

    ## remove theano variables from modelSpecs
    keys4removal = [ 'variable4train', 'variable4validate', 'params', 'params4mean', 'params4var', 'paramL2', 'regularizer', 'topAcc', 'errors', 'labelList', 'weightList', 'trainByRefLoss' ]
    for k in keys4removal:
        if modelSpecs.has_key(k):
            del modelSpecs[k]

    return resultModel
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    #print 'trainDataLoader has event: ', stopTrainDataLoader

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
    if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
        ## when full coevolution matrices are used, we use float16 to save memory
        floatType = np.float16
    else:
        floatType = theano.config.floatX

    ## here we use labelPool to cache the labels of all the training proteins.
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
    ## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
    labelPool = dict()
    labelWeightPool = dict()

    while True:
        if stopTrainDataLoader.is_set() or os.getppid() == 1:
            #print 'trainDataLoader receives the stop signal'
            break

        trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
        numOriginals = len(trainDataLocation)
        trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
        random.shuffle(trainSeqData)

        #i = 0
        for batch in trainSeqData:
            if stopTrainDataLoader.is_set() or os.getppid() == 1:
                #print 'trainDataLoader receives the stop signal'
                break

            data = []
            for protein in batch:
                name = protein['name']
                if labelPool.has_key(name):
                    ## the label is already in the pool, so load features only
                    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    d['atomLabelMatrix'] = labelPool[name]
                else:
                    d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert d.has_key('atomLabelMatrix')
                    labelPool[name] = d['atomLabelMatrix']

                if config.UseSampleWeight(modelSpecs):
                    if not labelWeightPool.has_key(name):
                        labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
                        labelWeightPool[name] = labelWeightMatrix
                        d['labelWeightMatrix'] = labelWeightMatrix
                    else:
                        d['labelWeightMatrix'] = labelWeightPool[name]

                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
            #print 'putting data to trainDataLoader queue...'
            sharedQ.put(data)
            """
            i += 1
            if i % 100 == 0:
                print '#batches of train data loaded: ', i
            """

    #print 'TrainDataLoader with PID ', os.getpid(), ' currently has ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices'
    print 'TrainDataLoader has finished loading data'
    sharedQ.close()
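## A minimal numpy illustration of why float16 is chosen above when raw coevolution
## matrices are used: a full L x L x 441 raw CCM feature tensor at float32 is twice
## the size of float16. The 441 channels (21 x 21 amino-acid pairs) are an assumption
## about the raw CCM layout, used here only to make the arithmetic concrete.
def SketchCCMMemoryFootprint(L=500):
    import numpy as np
    raw32 = np.zeros((L, L, 441), dtype=np.float32)
    raw16 = np.zeros((L, L, 441), dtype=np.float16)
    print 'float32: %.2f GB' % (raw32.nbytes / 1024.**3)
    print 'float16: %.2f GB' % (raw16.nbytes / 1024.**3)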
def main(argv):
    #modelSpecs = config.InitializeModelSpecs()
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    ## trainData and validData are lists. Each element corresponds to one protein, which is represented as a dict().
    trainData = DataProcessor.LoadPropertyFeatures(modelSpecs['trainFile'], modelSpecs=modelSpecs)
    validData = DataProcessor.LoadPropertyFeatures(modelSpecs['validFile'], modelSpecs=modelSpecs)
    print '#trainData: ', len(trainData), '#validData: ', len(validData)

    ## TODO: where to add code to assign a weight to each residue? We need to deal with residues lacking 3D coordinates for angle and SS prediction.
    ##a, b = DataProcessor.CalcLabelDistributionAndWeight(trainData, modelSpecs)

    modelSpecs['numOfTrainProteins'] = len(trainData)

    beforeBatchTime = datetime.datetime.now()
    print 'time spent on data loading: ', beforeBatchTime - startTime

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    trainSeqDataset, _ = DataProcessor.SplitData2Batches(data=trainData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    validSeqDataset, _ = DataProcessor.SplitData2Batches(data=validData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    #validSeqDataset = DataProcessor.SplitData2Batches(data=validData, numDataPoints=20000, modelSpecs=modelSpecs)
    print "#trainData minibatches:", len(trainSeqDataset), "#validData minibatches:", len(validSeqDataset)

    predSeqDataset = None
    if modelSpecs['predFile'] is not None:
        predData = DataProcessor.LoadPropertyFeatures(modelSpecs['predFile'], modelSpecs=modelSpecs, forTrainValidation=False)
        print '#predData: ', len(predData)
        predSeqDataset, _ = DataProcessor.SplitData2Batches(data=predData, numDataPoints=40, modelSpecs=modelSpecs)
        print "#predData minibatches:", len(predSeqDataset)

    ## each protein in trainData contains three or four components, including seqFeatures and label
    modelSpecs['n_in_seq'] = trainData[0]['seqFeatures'].shape[1]

    beforeTrainTime = datetime.datetime.now()
    print 'time spent on generating batch data:', beforeTrainTime - beforeBatchTime

    result = TrainModel(modelSpecs=modelSpecs, trainSeqData=trainSeqDataset, validSeqData=validSeqDataset, predSeqData=predSeqDataset)

    ## merge modelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)
def PredictMatrixLabels(models, predictors, names, inputFolders, aliFolders=None, tplFolder=None, aliFile=None, tplFile=None, saveFolder=None):
    if not isinstance(names, (list, tuple)):
        targetName = names
    else:
        targetName = None

    ## allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted prob matrices.
    ## we predict one prob matrix by each model for each protein and each response, and then average them per protein and response to get the final results.
    ## two different models may share common responses.
    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict()   ## the number of models that predict each response

    for model, predictor in zip(models, predictors):
        #predict, inputVariables = BuildPredictor(model)
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders, aliFolders, tplFolder)
        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName, inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName, inputFolders, aliFolders, tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(rawData, modelSpecs=model, forTrainValidation=False, returnMode='list')

        ## make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if not allresults.has_key(name):
                allresults[name] = dict()
                numModels[name] = dict()

            if not allsequences.has_key(name):
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        ##for onebatch, names4onebatch in zip(predSeqData, names):
        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(minibatch, model)
            input = onebatch[:len(inputVariables)]
            result = predict(*input)

            ## result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model.
            assert result.shape[3] == sum([ GetResponseProbDims(response) for response in model['responses'] ])

            ## calculate the start and end positions of each response in the last dimension of result
            dims = [ GetResponseProbDims(response) for response in model['responses'] ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            x1d, x2d, x1dmask, x2dmask = input[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'], startPositions, endPositions):
                ## batchres is a batch of results with ndim=4.
                ## its 1st dimension is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end]

                ## remove masked positions
                revised_batchres = [ probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :] for probMatrix, seqLen in zip(batchres, seqLens) ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if not allresults[name].has_key(response):
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save the sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result, which is the average of the prob matrices predicted by all models for the same protein and the same response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()

        ## finalresults[name][response] has 3 dimensions
        for response in results.keys():
            finalresults[name][response] = (allresults[name][response] / numModels[name][response]).astype(np.float32)

            ## make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                finalresults[name][response] = (finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2))) / 2.

    ## convert the predicted distance probability matrices into contact matrices
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(models)

    ## write all the results here.
    ## for each protein, we have one output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.iteritems():
        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            for n in names:
                if name.startswith(n):
                    originalName = n
                    break

        with open(savefilename, 'wb') as fh:
            #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump((originalName, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)
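## A minimal numpy sketch of the two post-processing steps used above on a single
## predicted matrix: cropping the masked (padded) region back to the true sequence
## length, then symmetrizing over the first two axes. The shapes are illustrative.
def ToyCropAndSymmetrize():
    import numpy as np
    maxSeqLen, seqLen, numBins = 8, 5, 3
    padded = np.random.rand(maxSeqLen, maxSeqLen, numBins)
    ## sequences are padded at the beginning, so drop the first maxSeqLen - seqLen rows/cols
    cropped = padded[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
    symmetric = (cropped + np.transpose(cropped, (1, 0, 2))) / 2.
    return symmetric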