def _AppendBatchByResponse(store, batchResult, responses, dims, seqLens,
                           maxSeqLen, names):
    """Split one batch of concatenated predictions by response, per protein.

    Args:
        store: nested dict, store[proteinName][response] -> list of arrays
            (one array per model that predicted this protein/response).
            Updated in place.
        batchResult: array indexed as (protein, position, column); the columns
            of all responses are concatenated along the last axis.
        responses: response names, in the order their columns are laid out.
        dims: number of columns used by each response, aligned with responses.
        seqLens: integer sequence length of each protein in the batch.
        maxSeqLen: padded sequence length of the batch.
        names: protein names, aligned with the first axis of batchResult.
    """
    # Make sure every (protein, response) slot exists before appending.
    for name in names:
        perProtein = store.setdefault(name, {})
        for res in responses:
            perProtein.setdefault(res, [])

    endPositions = np.cumsum(dims)
    startPositions = endPositions - dims
    for res, start, end in zip(responses, startPositions, endPositions):
        columns = batchResult[:, :, start:end]
        for name, seqLen, padded in zip(names, seqLens, columns):
            # Remove masked positions: only the last seqLen rows are kept.
            store[name][res].append(padded[maxSeqLen - seqLen:, :])


def PredictProperty(models, predictors, predFiles):
    """Predict properties with several models and average their outputs.

    For every (model, predictor) pair the feature files are loaded again,
    since each model may use a different set of features. Predictions are
    collected per protein and per response, then averaged over all models.

    Args:
        models: sequence of model specification dicts; the keys 'n_in_seq'
            and 'responses' are read here.
        predictors: sequence aligned with models; each item is a pair
            (predict, inputVariables). predict is called with the first
            len(inputVariables) entries of a batch and must return
            (result4prob, result).
        predFiles: passed unchanged to DataProcessor.LoadPropertyFeatures.

    Returns:
        A tuple (finalresults4prob, finalresults, allsequences):
        finalresults4prob[name][response] is the average over models of the
        probability outputs; finalresults[name][response] is the average of
        the value outputs for non-discrete label types, or the output of
        PropertyUtils.Coding2String applied to the argmax of the averaged
        probabilities for label types starting with 'Discrete';
        allsequences[name] is the primary sequence of each protein.

    Raises:
        ValueError: if no protein is loaded for a model, or a protein has a
            different number of sequence features than the model expects.
        SystemExit: if the same protein name appears with two different
            primary sequences.
    """
    allsequences = {}
    # allresults[proteinName][response] = list of predictions, one per model
    allresults4prob = {}
    allresults = {}

    for model, predictor in zip(models, predictors):
        predict, inputVariables = predictor
        responses = model['responses']
        probDims = [config.responseProbDims[Response2LabelType(res)]
                    for res in responses]
        valueDims = [config.responseValueDims[Response2LabelType(res)]
                     for res in responses]

        # Load the files for each model separately since each model may use
        # a different set of features.
        predData = DataProcessor.LoadPropertyFeatures(
            predFiles, modelSpecs=model, forTrainValidation=False)
        if len(predData) == 0:
            raise ValueError(
                'no protein was loaded from the feature files for prediction')

        for d in predData:
            # Every protein must have the number of features the model expects.
            numFeatures = d['seqFeatures'].shape[1]
            if numFeatures != model['n_in_seq']:
                raise ValueError(
                    'protein %s has %d sequence features but the model '
                    'expects %d' % (d['name'], numFeatures, model['n_in_seq']))

            # Collect sequences and check they agree across feature files.
            name = d['name']
            if name not in allsequences:
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print('ERROR: inconsistent primary sequence for the same '
                      'protein in the protein feature files')
                raise SystemExit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=30, modelSpecs=model,
            forTrainValidation=False)
        print('#predData:  %d #batches:  %d'
              % (len(predData), len(predSeqData)))

        for onebatch, names4onebatch in zip(predSeqData, names):
            batchInput = onebatch[:len(inputVariables)]
            result4prob, result = predict(*batchInput)

            # x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask
            # has shape (batchSize, #cols_to_be_masked).
            x1d, x1dmask = batchInput[0:2]
            maxSeqLen = x1d.shape[1]
            # Cast to int so the lengths can be used as slice bounds whatever
            # the dtype of the mask is.
            seqLens = (maxSeqLen - x1dmask.shape[1]
                       + np.sum(x1dmask, axis=1)).astype(int)

            # The last axis of each output concatenates all responses.
            assert result4prob.shape[2] == sum(probDims)
            assert result.shape[2] == sum(valueDims)

            _AppendBatchByResponse(allresults4prob, result4prob, responses,
                                   probDims, seqLens, maxSeqLen,
                                   names4onebatch)
            _AppendBatchByResponse(allresults, result, responses,
                                   valueDims, seqLens, maxSeqLen,
                                   names4onebatch)

    # The final result is the average of all the predicted properties for the
    # same protein and response name.
    finalresults = {}
    for name, results in allresults.items():
        finalresults[name] = {}
        for response, predictions in results.items():
            # Discrete responses are filled in below from the probabilities.
            if not Response2LabelType(response).startswith('Discrete'):
                finalresults[name][response] = np.average(predictions, axis=0)

    finalresults4prob = {}
    for name, results in allresults4prob.items():
        finalresults4prob[name] = {}
        for response, probs in results.items():
            avgProb = np.average(probs, axis=0)
            finalresults4prob[name][response] = avgProb

            # Convert the coding of discrete labels to a more meaningful
            # representation.
            if Response2LabelType(response).startswith('Discrete'):
                labels = np.argmax(avgProb, axis=1)
                finalresults.setdefault(name, {})[response] = \
                    PropertyUtils.Coding2String(labels, response)

    return finalresults4prob, finalresults, allsequences
def main(argv):
    """Train a model from command-line arguments and pickle the result.

    The model specification is initialised, updated from argv, and used to
    load the training and validation data (plus optional prediction data).
    The data are split into minibatches, TrainModel is run, and the model
    specification merged with the training result is written to the file
    named by GenerateModelFileName.

    Args:
        argv: command-line arguments, passed to ParseCommandLine.ParseArguments.

    Raises:
        ValueError: if no training protein is loaded.
    """
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    # trainData and validData are lists; each element corresponds to one
    # protein, which is a dict().
    trainData = DataProcessor.LoadPropertyFeatures(
        modelSpecs['trainFile'], modelSpecs=modelSpecs)
    validData = DataProcessor.LoadPropertyFeatures(
        modelSpecs['validFile'], modelSpecs=modelSpecs)
    print('#trainData:  %d #validData:  %d'
          % (len(trainData), len(validData)))

    # Fail early: the feature dimension below is read from the first protein.
    if len(trainData) == 0:
        raise ValueError('no training protein was loaded from the train file')

    # TODO: where to add code to assign weight to each residue? We need to
    # deal with the residues without 3D coordinates for angle and SS
    # prediction.
    modelSpecs['numOfTrainProteins'] = len(trainData)

    beforeBatchTime = datetime.datetime.now()
    print('time spent on data loading:  %s' % (beforeBatchTime - startTime,))

    print('Preparing batch data for training...')
    groupSize = modelSpecs['minibatchSize']
    trainSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=trainData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    validSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=validData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    print('#trainData minibatches: %d #validData minibatches: %d'
          % (len(trainSeqDataset), len(validSeqDataset)))

    predSeqDataset = None
    if modelSpecs['predFile'] is not None:
        predData = DataProcessor.LoadPropertyFeatures(
            modelSpecs['predFile'], modelSpecs=modelSpecs,
            forTrainValidation=False)
        print('#predData:  %d' % len(predData))
        predSeqDataset, _ = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=40, modelSpecs=modelSpecs)
        print('#predData minibatches: %d' % len(predSeqDataset))

    # The input feature dimension is taken from the first training protein.
    modelSpecs['n_in_seq'] = trainData[0]['seqFeatures'].shape[1]

    beforeTrainTime = datetime.datetime.now()
    print('time spent on generating batch data: %s'
          % (beforeTrainTime - beforeBatchTime,))

    result = TrainModel(modelSpecs=modelSpecs,
                        trainSeqData=trainSeqDataset,
                        validSeqData=validSeqDataset,
                        predSeqData=predSeqDataset)

    # Merge modelSpecs and the training result; result entries win on
    # duplicate keys.
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = GenerateModelFileName(resultModel)
    print('Writing the resultant model to  %s' % (modelFile,))
    # Use a context manager so the file is flushed and closed even if
    # pickling fails.
    with open(modelFile, 'wb') as fh:
        cPickle.dump(resultModel, fh, cPickle.HIGHEST_PROTOCOL)