def CalcLabelDistribution(data, modelSpecs):
    """Calculate the reference label distribution of every response.

    For each response in modelSpecs['responses']:
      * a continuous response (label type starting with 'LogNormal' or
        'Normal') gets a placeholder column of ones with shape (numRanges, 1)
        and dtype float32;
      * any other response gets the value returned by
        OrientationUtils.CalcLabelProb (when its label name is in
        config.allOrientationNames) or DistanceUtils.CalcLabelProb, computed
        from the matrices d['atomLabelMatrix'][response] for d in data,
        cast to int32.

    When modelSpecs['UseBoundingBox4RefProbs'] is True, each label matrix is
    first cropped with a box drawn by SampleBoundingBox, to account for the
    real training scenario.

    Args:
        data: a re-iterable collection of dicts; each dict must provide
            d['atomLabelMatrix'][response] for every non-continuous response.
        modelSpecs: the model specification; 'responses' is always read and
            'maxbatchSize' is read when bounding boxes are sampled.

    Returns:
        A dict mapping each response to its reference distribution. The same
        dict is also stored in modelSpecs['labelDistributions'].
    """
    ## numRanges used to be read in the continuous branch without being defined in this function,
    ## so define it once here and reuse it for every response
    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    useBoundingBox = ('UseBoundingBox4RefProbs' in modelSpecs) and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        ## continuous labels have no discrete distribution, so a vector of ones is used
        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            allRefProbs[response] = np.ones((numRanges, 1), dtype=np.float32)
            continue

        labelMatrices = [d['atomLabelMatrix'][response] for d in data]

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            croppedMatrices = []
            for lMatrix in labelMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                croppedMatrices.append(
                    lMatrix[bounds[0]:bounds[2], bounds[1]:bounds[3]])
            labelMatrices = croppedMatrices

        labelMatrices = [m.astype(np.int32) for m in labelMatrices]

        if labelName in config.allOrientationNames:
            calcLabelProb = OrientationUtils.CalcLabelProb
        else:
            calcLabelProb = DistanceUtils.CalcLabelProb

        allRefProbs[response] = calcLabelProb(
            data=labelMatrices,
            numLabels=GetResponseProbDims(response),
            numRanges=numRanges)

    modelSpecs['labelDistributions'] = allRefProbs
    return allRefProbs
def EvaluateAccuracy(pred_prob, truth, pad_len):
    """Evaluate top accuracy with the routine that matches the current response.

    The first pad_len rows and columns are sliced off both pred_prob and truth,
    and the sliced prediction is cast to theano.config.floatX. The response is
    read from the free variable currentResponse, which is not a parameter and
    must be bound in an enclosing or global scope.

    Dispatch:
      * label name in config.allOrientationNames -> TopAccuracyOrientation
        (the label type must be discrete, otherwise the process exits);
      * label type starting with 'LogNormal' -> TopAccuracyLogNormal;
      * label type starting with 'Normal' -> TopAccuracyNormal;
      * label type starting with 'Discrete' -> TopAccuracy2C when the subtype
        starts with '2C', TopAccuracyMultiC otherwise;
      * anything else prints an error and exits with status 1.

    Returns:
        Whatever the selected TopAccuracy* routine returns.
    """
    pred = T.cast(pred_prob[pad_len:, pad_len:], dtype=theano.config.floatX)
    target = truth[pad_len:, pad_len:]

    labelName, labelType, subType = ParseResponse(currentResponse)
    symmetric = config.IsSymmetricLabel(labelName)

    ## orientation responses are handled separately from distance-like responses
    if labelName in config.allOrientationNames:
        if not config.IsDiscreteLabel(labelType):
            print(' '.join([
                'ERROR: unsupported label type for orientation matrix prediction: ',
                str(currentResponse)
            ]))
            exit(1)

        numLabels = GetResponseProbDims(currentResponse)
        ## Plus/Minus subtypes use numLabels - 2 as the largest valid label, all others numLabels - 1
        offset = 2 if subType.endswith(('Plus', 'Minus')) else 1
        return TopAccuracyOrientation(pred=pred,
                                      truth=target,
                                      largestValidLabel=numLabels - offset,
                                      symmetric=symmetric)

    if labelType.startswith('LogNormal'):
        return TopAccuracyLogNormal(pred=pred,
                                    truth=target,
                                    symmetric=symmetric)

    if labelType.startswith('Normal'):
        return TopAccuracyNormal(pred=pred, truth=target, symmetric=symmetric)

    if labelType.startswith('Discrete'):
        if subType.startswith('2C'):
            return TopAccuracy2C(pred=pred, truth=target, symmetric=symmetric)
        return TopAccuracyMultiC(pred=pred,
                                 truth=target,
                                 subType=subType,
                                 symmetric=symmetric)

    print(' '.join([
        'ERROR: unsupported label type in EvaluateAccuracy: ',
        str(labelType)
    ]))
    exit(1)
def errors(self, zList, weightList=None):
    """Build the prediction error of every response and concatenate them.

    The responses, their predictors, the entries of zList, the entries of
    weightList (when given and non-empty) and self.outputList are walked in
    lockstep.

    * For a response that is in config.allAtomPairNames, has a label type
      starting with 'Discrete' and has more than 3 labels, the error comes
      from self.errors4one, called with the output flattened to 3 dimensions
      and the label subtype (the text after 'Discrete'). The original comment
      says this converts 12C, 25C and 52C to 3C so the result is easier to
      interpret.
    * For every other response the error comes from the predictor's own
      errors() method, applied to the flattened truth tensor.

    Args:
        zList: one truth tensor per response, with ndim 3 or 4.
        weightList: optional list with one weight tensor (ndim 3) per
            response. None or an empty list means unweighted errors.

    Returns:
        T.concatenate of all the per-response error tensors.
    """

    def _flattenTruth(z):
        ## reshape the truth tensor into the 2-d layout expected by predictor.errors()
        if z.ndim == 3:
            return z.flatten().dimshuffle(0, 'x')
        if z.ndim == 4:
            return z.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        print(' '.join(['unsupported ndim for z in errors():', str(z.ndim)]))
        exit(1)

    useWeight = weightList is not None and len(weightList) > 0
    if useWeight:
        weights = weightList
    else:
        ## placeholders so that one loop serves both cases; they are never passed on to a callee
        weights = [None] * len(self.responses)

    errs = []
    for res, predictor, z, w, o in zip(self.responses, self.predictors, zList,
                                       weights, self.outputList):
        labelType = Response2LabelType(res)
        numLabels = GetResponseProbDims(res)

        ## NOTE(review): this tests the whole response string against config.allAtomPairNames,
        ## whereas other code tests the parsed label name -- confirm this is intended
        ## if the label type is Discrete25C, Discrete52C, Discrete12C
        if res in config.allAtomPairNames and labelType.startswith(
                'Discrete') and numLabels > 3:
            assert (z.ndim == 3 and GetResponseValueDims(res) == 1)
            o2 = o.flatten(3)
            distLabelType = labelType[len('Discrete'):]
            ## the unweighted path used to pass the unflattened o although o2 had been computed;
            ## both paths now pass o2, matching the ndim == 3 asserted for z
            if useWeight:
                e = self.errors4one(z,
                                    o2,
                                    weight=w,
                                    distLabelType=distLabelType)
            else:
                e = self.errors4one(z, o2, distLabelType=distLabelType)
        else:
            ## call the error function of each predictor
            zflat = _flattenTruth(z)
            if useWeight:
                assert (w.ndim == 3)
                wflat = w.flatten().dimshuffle(0, 'x')
                e = predictor.errors(zflat, sampleWeight=wflat)
            else:
                e = predictor.errors(zflat)

        ## e is a tensor with ndim=1
        errs.append(e)

    return T.concatenate(errs)
def __init__(self,
             rng,
             seqInput,
             matrixInput,
             mask_seq=None,
             mask_matrix=None,
             embedInput=None,
             boundingbox=None,
             modelSpecs=None):
    """Build the network: 1d sequence convolution, 1d-to-2d conversion, 2d convolution and one predictor per response.

    seqInput has shape (batchSize, seqLen, n_in_seq)
    matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix)
    mask_seq has shape (batchSize, #cols_to_be_masked)
    mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen)
    embedInput has shape (batchSize, seqLen, n_in2)
    boundingbox is a vector of 4 integer elements: top, left, bottom and right.
    boundingbox shall only be applied to the matrix converted from sequential features.
    modelSpecs is the model specification and is mandatory (asserted not None).

    Attributes set here: modelSpecs, responses, mask_1d, mask_2d, layers,
    outputList, output_probList, predictors, params4var, paramL14var,
    paramL24var, output, output_prob, params, paramL2 and paramL1.

    The process exits with status 1 when an embedding input is given without
    a 'Seq+SS' or 'SeqOnly' mode, when modelSpecs['network'] is neither a
    ResNet nor a DilatedResNet, or when a response has an unsupported label type.
    """
    assert (modelSpecs is not None)

    self.modelSpecs = modelSpecs
    self.responses = modelSpecs['responses']

    ## set the number of hidden neurons and number of layers
    n_in_seq = modelSpecs['n_in_seq']
    n_in_matrix = modelSpecs['n_in_matrix']
    n_hiddens_seq = modelSpecs['conv1d_hiddens']
    n_hiddens_matrix = modelSpecs['conv2d_hiddens']
    n_hiddens_logreg = modelSpecs['logreg_hiddens']
    seq_repeats = modelSpecs['conv1d_repeats']
    matrix_repeats = modelSpecs['conv2d_repeats']

    ## half win size for convolutional operation
    if modelSpecs['network'].startswith('DilatedResNet'):
        hwsz_matrix = modelSpecs['conv2d_hwszs']
        hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
        dilation_seq = [1] * len(n_hiddens_seq)
        dilation_matrix = modelSpecs['conv2d_dilations']
    else:
        hwsz_matrix = modelSpecs['halfWinSize_matrix']
        hwsz_seq = modelSpecs['halfWinSize_seq']

    ## masks to reduce impact of padding zeros
    self.mask_1d = mask_seq
    self.mask_2d = mask_matrix

    self.layers = []

    act = T.nnet.relu
    if modelSpecs['activation'] == 'TANH':
        act = T.tanh

    ## sequence convolution
    if modelSpecs['network'].startswith('DilatedResNet'):
        seqConv = DilatedResNet(rng,
                                input=seqInput,
                                n_in=n_in_seq,
                                n_hiddens=n_hiddens_seq,
                                n_repeats=seq_repeats,
                                halfWinSize=hwsz_seq,
                                dilation=dilation_seq,
                                mask=mask_seq,
                                activation=act,
                                modelSpecs=modelSpecs)
    else:
        seqConv = ResNet(rng,
                         input=seqInput,
                         n_in=n_in_seq,
                         n_hiddens=n_hiddens_seq,
                         n_repeats=seq_repeats,
                         halfWinSize=hwsz_seq,
                         mask=mask_seq,
                         activation=act,
                         batchNorm=modelSpecs['batchNorm'],
                         version=modelSpecs['network'])
    self.layers.append(seqConv)

    ## transform 1d sequence to 2d matrix
    seq2matrixMode = modelSpecs['seq2matrixMode']
    seq2matrixLayers = []

    ## determine if we shall use the sequential features or not. The sequential features include
    ## sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility

    ## use OuterConcatenation operation to convert sequence features into pairwise features
    if ('OuterCat' in seq2matrixMode) and config.UseSequentialFeatures:
        ## midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
        midpointfeature, n_midpoint_out = MidpointFeature(seqConv.output,
                                                          seqConv.n_out,
                                                          box=boundingbox)

        ## mask_matrix is used to reduce noise introduced by padding positions.
        ## mask_matrix defaults to None, in which case there is nothing to mask; without this
        ## guard the .shape access below fails on None
        if mask_matrix is not None:
            mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
            midpointfeature = T.set_subtensor(
                mid_subtensor,
                T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))

            mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
            midpointfeature = T.set_subtensor(
                mid_subtensor2,
                T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

        ## here we use convolution with halfWinSize=0 to reduce model complexity
        compressLayer = Conv2D4DistMatrix(rng,
                                          input=midpointfeature,
                                          n_in=n_midpoint_out,
                                          n_hiddens=seq2matrixMode['OuterCat'],
                                          halfWinSize=0,
                                          mask=mask_matrix)
        seq2matrixLayers.append(compressLayer)

    ## embedding primary sequence and/or predicted secondary structure
    if embedInput is not None:
        from EmbeddingLayer import EmbeddingLayer4AllRange

        if 'Seq+SS' in seq2matrixMode:
            n_out_embed = seq2matrixMode['Seq+SS']
        elif 'SeqOnly' in seq2matrixMode:
            n_out_embed = seq2matrixMode['SeqOnly']
        else:
            print(
                'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.'
            )
            exit(1)

        embedLayer = EmbeddingLayer4AllRange(embedInput,
                                             modelSpecs['n_in_embed'],
                                             n_out_embed,
                                             box=boundingbox)
        seq2matrixLayers.append(embedLayer)

    ## the sequence-profile embedding mode (seq2matrixMode['Profile']) is not used any more

    self.layers += seq2matrixLayers

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
        modelSpecs)

    if (bUseCCMraw or bUseFullMI
            or bUseFullCov) and config.CompressMatrixInput(modelSpecs):
        ## here we add a compress layer to reduce the #channels of the original matrix input.
        ## NOTE(review): this layer is never added to self.layers, so its params and norms are
        ## not collected into self.params/paramL1/paramL2 below -- confirm whether that is
        ## intended before changing it, since it alters the parameter layout
        n_hiddens4MatrixCompress = modelSpecs['hiddens4MatrixCompress']
        compressLayer4MatrixInput = Conv2D4DistMatrix(
            rng,
            input=matrixInput,
            n_in=n_in_matrix,
            n_hiddens=n_hiddens4MatrixCompress,
            halfWinSize=0,
            mask=mask_matrix)
        compressedMatrixInput = compressLayer4MatrixInput.output
        n_compressedMatrix = compressLayer4MatrixInput.n_out

        input_2d = T.concatenate(
            [compressedMatrixInput] +
            [layer.output for layer in seq2matrixLayers],
            axis=3)
        n_input2d = n_compressedMatrix + sum(
            [layer.n_out for layer in seq2matrixLayers])
    else:
        ## old code for merging original matrix input and sequential input
        input_2d = T.concatenate(
            [matrixInput] + [layer.output for layer in seq2matrixLayers],
            axis=3)
        n_input2d = n_in_matrix + sum(
            [layer.n_out for layer in seq2matrixLayers])

    ## matrix convolution
    if modelSpecs['network'].startswith('ResNet'):
        matrixConv = ResNet(rng,
                            input=input_2d,
                            n_in=n_input2d,
                            n_hiddens=n_hiddens_matrix,
                            n_repeats=matrix_repeats,
                            halfWinSize=hwsz_matrix,
                            mask=mask_matrix,
                            activation=act,
                            batchNorm=modelSpecs['batchNorm'],
                            version=modelSpecs['network'])
    elif modelSpecs['network'].startswith('DilatedResNet'):
        matrixConv = DilatedResNet(rng,
                                   input=input_2d,
                                   n_in=n_input2d,
                                   n_hiddens=n_hiddens_matrix,
                                   n_repeats=matrix_repeats,
                                   halfWinSize=hwsz_matrix,
                                   dilation=dilation_matrix,
                                   mask=mask_matrix,
                                   activation=act,
                                   modelSpecs=modelSpecs)
    else:
        print(' '.join([
            'ERROR: Unimplemented deep network type: ',
            str(modelSpecs['network'])
        ]))
        exit(1)

    self.layers.append(matrixConv)

    conv_out = matrixConv.output

    ## move the channel axis first, flatten the other three axes, then transpose, so that the
    ## predictors receive a 2-d input with matrixConv.n_out columns
    selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
    n_in4logreg = matrixConv.n_out

    self.outputList = []
    self.output_probList = []
    self.predictors = []

    self.params4var = []
    self.paramL14var = 0
    self.paramL24var = 0

    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        predictor = None

        if labelType.startswith('Discrete'):
            assert GetResponseValueDims(res) == 1
            predictor = NN4LogReg(rng=rng,
                                  input=selected,
                                  n_in=n_in4logreg,
                                  n_out=GetResponseProbDims(res),
                                  n_hiddens=n_hiddens_logreg)
        elif labelType.startswith('LogNormal') or labelType.startswith(
                'Normal'):
            predictor = NN4Normal(rng=rng,
                                  input=selected,
                                  n_in=n_in4logreg,
                                  n_variables=GetResponseValueDims(res),
                                  n_out=GetResponseProbDims(res),
                                  n_hiddens=n_hiddens_logreg)

            ## recording parameters specific for variance prediction
            self.params4var += predictor.params4var
            self.paramL14var += predictor.paramL14var
            self.paramL24var += predictor.paramL24var
        else:
            print(' '.join(
                ['incorrect response name or label type: ',
                 str(res)]))
            exit(1)

        self.layers.append(predictor)
        self.predictors.append(predictor)

        ## output in 2d matrix
        output_2d = predictor.y_pred.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
             GetResponseValueDims(res)))
        output_2d_prob = predictor.output.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
             GetResponseProbDims(res)))

        self.outputList.append(output_2d)
        self.output_probList.append(output_2d_prob)

    self.output = T.concatenate(self.outputList, axis=3)
    self.output_prob = T.concatenate(self.output_probList, axis=3)

    ## collect all the model parameters and their norms
    self.params = []
    self.paramL2 = 0
    self.paramL1 = 0

    for layer in self.layers:
        self.params += layer.params
        self.paramL2 += layer.paramL2
        self.paramL1 += layer.paramL1
def CalcLabelWeight(modelSpecs):
    """Calculate the label weight of every response and store it in modelSpecs['weight4labels'].

    RangeNWeight.SetWeight4Range and RangeNWeight.SetWeight43C2C are called
    first to initialize the range and 3C/2C weights in modelSpecs. Then, for
    each response in modelSpecs['responses']:
      * continuous label: modelSpecs['weight4continuous'];
      * orientation label, or config.NoWeight4Label(modelSpecs) is true:
        a (numRanges, numLabels) float32 matrix of ones multiplied by
        modelSpecs['weight4range'];
      * 'HB' or 'Beta' with a 2C subtype: modelSpecs['weight4' + response];
      * atom-pair label with a 3C subtype: modelSpecs['weight4Discrete3C'];
      * any other atom-pair subtype: DistanceUtils.CalcLabelWeight applied to
        the 3C weight, the reference distribution of the response taken from
        modelSpecs['labelDistributions'] and config.distCutoffs[subType].
    For a subtype ending with 'Minus' the weight of the last label (the
    invalid entry) is set to 0.

    Unsupported responses print an error and exit with status 1.

    Args:
        modelSpecs: the model specification; 'responses' and
            'labelDistributions' must already be present.

    Returns:
        modelSpecs['weight4labels'], a dict mapping each response to its weight.
    """
    print('Calculating label weight ...')

    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    RangeNWeight.SetWeight4Range(modelSpecs)
    RangeNWeight.SetWeight43C2C(modelSpecs)

    allRefProbs = modelSpecs['labelDistributions']

    ## for discrete labels, we calculate their weights by inferring from the weight initialized
    ## to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        if config.IsContinuousLabel(labelType):
            ## just need to assign range weight for continuous response
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if not config.IsDiscreteLabel(labelType):
            print(' '.join([
                'ERROR: unsupported response in CalcLabelWeight: ',
                str(response)
            ]))
            exit(1)

        if labelName in config.allOrientationNames or config.NoWeight4Label(
                modelSpecs):
            numLabels = GetResponseProbDims(response)
            weight = np.multiply(
                np.ones((numRanges, numLabels), dtype=np.float32),
                modelSpecs['weight4range'])

        elif labelName in ['HB', 'Beta']:
            ## if the response is for HB and Beta-Pairing
            if subType.startswith('2C'):
                weight = modelSpecs['weight4' + response]
            else:
                print(' '.join([
                    'ERROR: unsupported label subtype in CalcLabelWeight: ',
                    str(response)
                ]))
                exit(1)

        elif labelName in config.allAtomPairNames:
            ## calculate label weight for atom pairs Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('2C'):
                print(
                    'ERROR: 2C is not supported for contact/distance prediction any more'
                )
                exit(1)
            elif subType.startswith('3C'):
                ## if 3C is used for the response
                weight = modelSpecs['weight4Discrete3C']
            else:
                weight = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])
        else:
            print(' '.join([
                'ERROR: unsupported label name in CalcLabelWeight: ',
                str(response)
            ]))
            exit(1)

        ## set the weight of the label for the invalid entry (distance or orientation) to 0
        if subType.endswith('Minus'):
            ## weight may be the very object stored elsewhere in modelSpecs (e.g. weight4Discrete3C),
            ## so zero the column on a private copy instead of corrupting the shared array
            weight = np.array(weight)
            weight[:, -1] = 0

        modelSpecs['weight4labels'][response] = weight

    return modelSpecs['weight4labels']
def PredictMatrixLabels(models,
                        predictors,
                        names,
                        inputFolders,
                        aliFolders=None,
                        tplFolder=None,
                        aliFile=None,
                        tplFile=None,
                        saveFolder=None):
    """Predict probability matrices with several models, average them and save one file per protein.

    Each model predicts one probability matrix per protein and response. The
    matrices predicted by different models for the same protein and response
    are summed and finally divided by the number of contributing models (two
    different models may share common responses). Responses whose label is
    symmetric according to config.IsSymmetricLabel are symmetrized by
    averaging with their transpose.

    For every protein a file '<name>.predictedDistMatrix.pkl' is written (into
    saveFolder when given) holding the tuple (originalName, sequence,
    predicted matrices, predicted contact matrices, label weights, label
    distributions).

    Args:
        models: list of model specifications; model['responses'] is read.
        predictors: list of (predict, inputVariables) pairs, parallel to models.
        names: a list/tuple of protein names, or a single target name.
        inputFolders, aliFolders, tplFolder: passed through to the data loaders.
        aliFile, tplFile: when both are given together with a single target
            name, the data is loaded with LoadOneAlignment4OneModel.
        saveFolder: optional folder for the result files.

    Returns:
        The tuple (predictedContactMatrices, allsequences).

    The process exits with status 1 when the same protein appears with
    different primary sequences.
    """
    ## a single name (i.e. not a list/tuple) denotes one target
    if isinstance(names, (list, tuple)):
        targetName = None
    else:
        targetName = names

    ## allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices
    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict()  ## count the number of models that may predict each response

    for model, predictor in zip(models, predictors):
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders,
                                               aliFolders, tplFolder)
        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName,
                                                inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName,
                                                  inputFolders, aliFolders,
                                                  tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(
            rawData,
            modelSpecs=model,
            forTrainValidation=False,
            returnMode='list')

        ## make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if name not in allresults:
                allresults[name] = dict()
                numModels[name] = dict()

            if name not in allsequences:
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print(
                    'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                )
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData,
                                                      numDataPoints=624,
                                                      modelSpecs=model)
        print(' '.join([
            '#predData: ',
            str(len(predData)), '#batches: ',
            str(len(predSeqData))
        ]))

        ## the start and end positions of each response in the last dimension of a result
        ## depend only on the model, so they are calculated once per model
        dims = [
            GetResponseProbDims(response) for response in model['responses']
        ]
        endPositions = np.cumsum(dims)
        startPositions = endPositions - dims

        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
                minibatch, model)
            batchInput = onebatch[:len(inputVariables)]
            result = predict(*batchInput)

            ## result is a 4-d tensor. The last dimension is the concatenation of the predicted
            ## prob parameters for all responses in this model
            assert result.shape[3] == sum(dims)

            x1d, _, x1dmask, _ = batchInput[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'],
                                            startPositions, endPositions):
                ## batchres is a batch of result, its ndim=4. The 1st dimension is batchSize,
                ## the 2nd and 3rd are matrix sizes and the 4th is for the predicted prob parameters
                batchres = result[:, :, :, start:end]

                for probMatrix, seqLen, name in zip(batchres, seqLens,
                                                    names4onebatch):
                    ## remove masked positions, which sit at the beginning of both matrix axes
                    res4one = probMatrix[maxSeqLen - seqLen:,
                                         maxSeqLen - seqLen:, :]

                    if response not in allresults[name]:
                        ## store a private copy: a plain slice would keep the whole padded batch
                        ## tensor alive and the in-place sum below would write into it
                        allresults[name][response] = np.array(res4one)
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save sum to reduce memory consumption, which could be huge when
                        ## many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result, which is the average of predicted prob matrices by all models
    ## for the same protein and the same response
    finalresults = dict()
    for name, results in allresults.items():
        finalresults[name] = dict()

        for response, summed in results.items():
            averaged = (summed / numModels[name][response]).astype(np.float32)

            ## make the predicted prob matrices symmetric for some responses.
            ## This also slightly improves accuracy.
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                averaged = (averaged + np.transpose(averaged, (1, 0, 2))) / 2.

            finalresults[name][response] = averaged

    ## convert predicted distance probability matrix into contact matrix
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(
        models)

    ## write all the results here
    ## for each protein, we have an output file saving a tuple (name, sequence, predicted distance
    ## matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.items():
        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            ## default to the predicted name itself; previously originalName was left unbound
            ## (or kept the value of the previous protein) when no requested name was a prefix
            originalName = name
            for n in names:
                if name.startswith(n):
                    originalName = n
                    break

        with open(savefilename, 'wb') as fh:
            cPickle.dump(
                (originalName, allsequences[name], results,
                 predictedContactMatrices[name], finalLabelWeights,
                 finalLabelDistributions),
                fh,
                protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)