def errors(self, zList, weightList=None):
    """Build one error tensor per response and concatenate them.

    zList holds one label tensor per response (ndim 3 or 4). weightList,
    when given and non-empty, holds one 3-d weight tensor per response.
    self.responses, self.predictors, zList, weightList and self.outputList
    are walked in lockstep.

    Discrete responses with more than 3 labels are scored by
    self.errors4one(); every other response is scored by its own
    predictor.errors().

    Returns T.concatenate() over the per-response error tensors.
    Raises SystemExit(-1) when a label tensor has an unsupported ndim.
    """

    def flattenLabels(z):
        ## reshape the label tensor into the 2-d layout handed to predictor.errors()
        if z.ndim == 3:
            return z.flatten().dimshuffle(0, 'x')
        if z.ndim == 4:
            return z.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        print('unsupported ndim for z in errors():', z.ndim)
        raise SystemExit(-1)

    useWeight = weightList is not None and len(weightList) > 0
    if useWeight:
        items = zip(self.responses, self.predictors, zList, weightList,
                    self.outputList)
    else:
        ## pad with a None weight so that both cases share a single loop
        items = ((res, predictor, z, None, o) for res, predictor, z, o in
                 zip(self.responses, self.predictors, zList, self.outputList))

    errs = []
    for res, predictor, z, w, o in items:
        labelType = Response2LabelType(res)
        numLabels = config.responseProbDims[labelType]

        ## if the label type is Discrete25C, Discrete52C, Discrete12C
        if labelType.startswith('Discrete') and numLabels > 3:
            assert (z.ndim == 3 and config.responseValueDims[labelType] == 1)
            o2 = o.flatten(3)
            subType = labelType[len('Discrete'):]
            ## here we convert 12C, 25C, and 52C to 3C for error calculation,
            ## which makes the result easier to interpret.
            ## Both cases pass the flattened output o2; the unweighted case
            ## used to pass the unflattened o by mistake.
            if useWeight:
                errs.append(self.errors4one(z, o2, weight=w,
                                            distLabelType=subType))
            else:
                errs.append(self.errors4one(z, o2, distLabelType=subType))
            continue

        ## call the error function of each predictor
        zflat = flattenLabels(z)
        if useWeight:
            assert (w.ndim == 3)
            wflat = w.flatten().dimshuffle(0, 'x')
            e = predictor.errors(zflat, sampleWeight=wflat)
        else:
            e = predictor.errors(zflat)

        ## e is a tensor with ndim=1
        errs.append(e)

    return T.concatenate(errs)
def BuildModel(modelSpecs, forTrain=True):
    """Create the ResNet4Properties predictor and its symbolic variables.

    Returns (propertyPredictor, x, xmask, labelList, weightList) when label
    variables were created, i.e. forTrain is set and modelSpecs['responses']
    is not empty. Otherwise returns (propertyPredictor, x, xmask).
    """
    rng = np.random.RandomState()

    ## x is for sequential features and xmask is its mask
    x = T.tensor3('x')
    xmask = T.bmatrix('xmask')

    propertyPredictor = ResNet4Properties(rng, seqInput=x, mask_seq=xmask,
                                          modelSpecs=modelSpecs)

    ## label variables are only needed when the model is used for training
    responses = modelSpecs['responses'] if forTrain else []

    ## labelList is a list of label matrices, each with shape (batchSize, seqLen, numLabels)
    labelList = []
    for res in responses:
        ## discrete labels use an integer tensor, all others the default float tensor
        if Response2LabelType(res).startswith('Discrete'):
            labelList.append(T.itensor3('label4' + res))
        else:
            labelList.append(T.tensor3('label4' + res))

    if not labelList:
        return propertyPredictor, x, xmask

    ## weightList is a list of label weight matrices, each with shape (batchSize, seqLen, 1)
    ## we always use weight to deal with residues without 3D coordinates
    weightList = [T.tensor3('weight4' + res)
                  for res in modelSpecs['responses']]

    return propertyPredictor, x, xmask, labelList, weightList
def EvaluateAccuracy(pred_prob, truth, pad_len):
    """Dispatch to the top-accuracy function matching the current response.

    The first pad_len rows and columns of pred_prob and truth are dropped
    before the accuracy is computed. The prediction is cast to
    theano.config.floatX.

    NOTE: the response is not a parameter. It is read from the name
    `currentResponse`, which must be bound in an enclosing or global scope
    before this function is called.

    Raises SystemExit(-1) for a label type that is neither LogNormal,
    Normal nor Discrete.
    """
    pred = T.cast(pred_prob[pad_len:, pad_len:], dtype=theano.config.floatX)
    target = truth[pad_len:, pad_len:]

    labelType = Response2LabelType(currentResponse)
    atomType = Response2LabelName(currentResponse)
    symmetric = atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta']

    if labelType.startswith('LogNormal'):
        return TopAccuracyLogNormal(pred=pred, truth=target,
                                    symmetric=symmetric)

    if labelType.startswith('Normal'):
        return TopAccuracyNormal(pred=pred, truth=target, symmetric=symmetric)

    if labelType.startswith('Discrete'):
        subType = labelType[len('Discrete'):]
        if subType.startswith('2C'):
            return TopAccuracy2C(pred=pred, truth=target, symmetric=symmetric)
        return TopAccuracyMultiC(pred=pred, truth=target, subType=subType,
                                 symmetric=symmetric)

    ## print() call instead of the Python 2 print statement, as in the rest of the file
    print('unsupported label type in EvaluateAccuracy: ', labelType)
    raise SystemExit(-1)
def CalcLabelDistribution(data, modelSpecs):
    """Calculate the reference label distribution of every response.

    For LogNormal/Normal responses the distribution is a column of ones
    with one row per range. For the other responses it is computed by
    OrientationUtils.CalcLabelProb (label names in
    config.allOrientationNames) or DistanceUtils.CalcLabelProb (all other
    names) from the label matrices in data. When
    modelSpecs['UseBoundingBox4RefProbs'] is True, a sampled sub-matrix of
    each label matrix is used instead of the full one.

    The result, a dict keyed by response, is stored in
    modelSpecs['labelDistributions'] and also returned.
    """
    continuousTypes = ('LogNormal', 'Normal')

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(continuousTypes):
            continue
        allLabelMatrices[response] = [d['atomLabelMatrix'][response]
                                      for d in data]

    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        ## numRanges was referenced below without ever being defined; it is
        ## taken from the same source as the CalcLabelProb calls
        numRanges = RangeNWeight.GetNumRanges(modelSpecs)

        if labelType.startswith(continuousTypes):
            allRefProbs[response] = np.array([1.] * numRanges).reshape(
                (-1, 1)).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            matrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                subMatrix = lMatrix[bounds[0]:bounds[2], bounds[1]:bounds[3]]
                matrices.append(subMatrix.astype(np.int32))
        else:
            matrices = [m.astype(np.int32)
                        for m in allLabelMatrices[response]]

        if labelName in config.allOrientationNames:
            calcLabelProb = OrientationUtils.CalcLabelProb
        else:
            calcLabelProb = DistanceUtils.CalcLabelProb

        allRefProbs[response] = calcLabelProb(
            data=matrices,
            numLabels=GetResponseProbDims(response),
            numRanges=numRanges)

    modelSpecs['labelDistributions'] = allRefProbs
    return allRefProbs
def Coding2String(coding, response):
    """Convert an array of integer label codes into a string of letters.

    coding may be 1-d, or 2-d in which case only its first column is used.
    The lookup table is chosen from the response prefix (SS, ACC or CLE)
    and, for SS, from the label type suffix (3C or 8C).

    Raises SystemExit(1) for an unsupported response or SS label type and
    AssertionError when an ACC/CLE response has an unexpected label type.
    """
    code = coding[:, 0] if coding.ndim == 2 else coding
    labelType = Response2LabelType(response)

    if response.startswith('SS'):
        if labelType.endswith('3C'):
            code2Letter = SS3Code2Letter
        elif labelType.endswith('8C'):
            code2Letter = SS8Code2Letter
        else:
            print('ERROR: unsupported response and labelType: ', response)
            raise SystemExit(1)
    elif response.startswith('ACC'):
        assert (labelType.endswith('3C'))
        code2Letter = ACCCode2Letter
    elif response.startswith('CLE'):
        assert (labelType.endswith('18C'))
        code2Letter = CLECode2Letter
    else:
        print('ERROR: unsupported response: ', response)
        raise SystemExit(1)

    ## the result is no longer bound to a local called str, which hid the builtin
    return ''.join([code2Letter[c] for c in code])
def String2Coding(str, response):
    """Convert a string of label letters into an int32 array of codes.

    The lookup table is chosen from the response prefix (SS, ACC or CLE)
    and, for SS, from the label type suffix (3C or 8C). The returned array
    has shape (len(str), 1).

    Raises SystemExit(1) for an unsupported response or SS label type and
    AssertionError when an ACC/CLE response has an unexpected label type.
    """
    ## the parameter keeps its name str, although it hides the builtin,
    ## because callers may pass it by keyword
    labelType = Response2LabelType(response)

    if response.startswith('SS'):
        if labelType.endswith('3C'):
            letter2Code = SS3Letter2Code
        elif labelType.endswith('8C'):
            letter2Code = SS8Letter2Code
        else:
            print('ERROR: unsupported response and labelType: ', response)
            raise SystemExit(1)
    elif response.startswith('ACC'):
        assert (labelType.endswith('3C'))
        letter2Code = ACCLetter2Code
    elif response.startswith('CLE'):
        assert (labelType.endswith('18C'))
        letter2Code = CLELetter2Code
    else:
        print('ERROR: unsupported response: ', response)
        raise SystemExit(1)

    code = [letter2Code[c] for c in str]
    return np.array(code).astype(np.int32).reshape((len(str), 1))
def CalcRefState4OneBatch(batch, modelSpecs, minSeqSep=3):
    """Calculate the reference label probability of each response for one batch.

    For LogNormal/Normal responses the reference probability is the
    constant array [1.]. For the other responses it is computed by
    CalcLabelProb() from the label matrices in batch, or from sampled
    sub-matrices when modelSpecs['UseBoundingBox4RefProbs'] is True.

    Returns (allRefProbs, avgLen). allRefProbs is a dict keyed by response.
    avgLen is the second value returned by the last CalcLabelProb() call,
    or None when no response required such a call.
    """
    continuousTypes = ('LogNormal', 'Normal')

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(continuousTypes):
            continue
        allLabelMatrices[response] = [d['atomLabelMatrix'][response]
                                      for d in batch]

    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    ## calculate the discrete label distribution
    allRefProbs = dict()

    ## avgLen used to be unbound at the return statement when every
    ## response was continuous or there was no response at all
    avgLen = None

    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(continuousTypes):
            allRefProbs[response] = np.array([1.]).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            matrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                subMatrix = lMatrix[bounds[0]:bounds[2], bounds[1]:bounds[3]]
                matrices.append(subMatrix.astype(np.int32))
        else:
            matrices = [m.astype(np.int32)
                        for m in allLabelMatrices[response]]

        allRefProbs[response], avgLen = CalcLabelProb(
            labelMatrices=matrices,
            numLabels=config.responseProbDims[labelType],
            minSeqSep=minSeqSep)

    return allRefProbs, avgLen
## NOTE(review): this span is a fragment. The statements after EvaluateAccuracy
## use self and zList and end with a return, so they presumably form the body of
## a method whose def line is not visible here, with EvaluateAccuracy nested
## inside that method -- confirm against the full file.

## Scan step: drops the first pad_len rows and columns of the prediction and of
## the truth, then dispatches to the top-accuracy function matching the label
## type of currentResponse. currentResponse is not a parameter; it is read from
## the enclosing scope, where the loop below assigns it before calling scan.
def EvaluateAccuracy(pred_prob, truth, pad_len):
    pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:], dtype=theano.config.floatX)
    truth_in_correct_shape = truth[pad_len:, pad_len:]
    labelType = Response2LabelType(currentResponse)
    atomType = Response2LabelName(currentResponse)
    symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])
    if labelType.startswith('LogNormal'):
        return TopAccuracyLogNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
    elif labelType.startswith('Normal'):
        return TopAccuracyNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
    elif labelType.startswith('Discrete'):
        ## the sub type is the label type without its 'Discrete' prefix
        subType = labelType[len('Discrete'):]
        if subType.startswith('2C'):
            return TopAccuracy2C(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
        else:
            return TopAccuracyMultiC(pred=pred_in_correct_shape, truth=truth_in_correct_shape, subType=subType, symmetric=symmetric)
    else:
        ## any other label type terminates the program
        print('unsupported label type in EvaluateAccuracy: ', labelType)
        exit(-1)

## one accuracy tensor is collected per response
accuracyList = []
for res, out_prob, z, ratio in zip(self.responses, self.output_probList, zList, self.modelSpecs['topRatios']):
    ## currently TopAccuracy only works when the dimension of each z is 3
    assert z.ndim == 3
    ## number of padded positions per example: mask width minus the sum of the
    ## mask entries, or zero for every example when there is no mask
    if self.mask_1d is not None:
        paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d, axis=1)
    else:
        paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)
    ## currentResponse is the name read by EvaluateAccuracy
    currentResponse = res
    ## NOTE(review): topRatio is assigned but not read anywhere in the visible code
    topRatio = ratio
    ##here we use scan to calculate accuracy for each protein
    result, updates = theano.scan(fn=EvaluateAccuracy, outputs_info=None, sequences=[out_prob, z, paddingLens])
    ## average the per-protein results over the batch
    accuracy = T.mean(result, axis=0)
    accuracyList.append(accuracy)
return T.stacklists(accuracyList)
def CalcLabelWeightMatrix(LabelMatrix=None, modelSpecs=None):
    """Build one weight matrix per response from its label matrix.

    LabelMatrix is a dict keyed by response. modelSpecs['weight4labels']
    holds, for each response, a weight matrix with 4 rows, one per
    sequence-separation range (long, medium, short, near) in that order.
    Each entry of the result is the weight selected by the range the entry
    falls into and by its label. The diagonal gets a weight of 0.

    Returns None when LabelMatrix is None, otherwise a dict keyed by
    response. Raises SystemExit(-1) when a response has no entry in
    modelSpecs['weight4labels'].
    """
    if LabelMatrix is None:
        return None

    ## list() keeps this working on Python 3, where dict views cannot be indexed
    M1s = np.ones_like(list(LabelMatrix.values())[0], dtype=np.int16)
    np.fill_diagonal(M1s, 0)

    ## 0/1 masks selecting the entries of each sequence-separation range
    LRmask = np.triu(M1s, 24) + np.tril(M1s, -24)
    MLRmask = np.triu(M1s, 12) + np.tril(M1s, -12)
    SMLRmask = np.triu(M1s, 6) + np.tril(M1s, -6)
    SRmask = SMLRmask - MLRmask
    MRmask = MLRmask - LRmask
    NRmask = M1s - SMLRmask

    for response in modelSpecs['responses']:
        if response not in modelSpecs['weight4labels']:
            print('Cannot find the weight factor tensor for response ',
                  response)
            raise SystemExit(-1)

    ##the below procedure is not very effective. We shall improve it later.
    labelWeightMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        labelWeightMatrices[response] = np.zeros_like(
            LabelMatrix[response], dtype=theano.config.floatX)

        ## wMatrix is a matrix with dimension 4 * numLabels
        wMatrix = modelSpecs['weight4labels'][response]
        wMatrixShape = wMatrix.shape
        assert (wMatrixShape[0] == 4)

        if labelType.startswith('Normal') or labelType.startswith('LogNormal'):
            ## if the label is real value, then for each range, there is only a single weight for all the possible values
            ## Indexing a one-column row with M1s would read position 1,
            ## which is out of bounds, so the index is clipped to the last
            ## column. With more than one column it is identical to M1s.
            weightIndex = np.minimum(M1s, wMatrixShape[1] - 1)
            tmpWeightMatrices = []
            for i in range(4):
                tmp = wMatrix[i][weightIndex]
                ## set the weight of the entries without valid distance to 0. An invalid entry in the label matrix is indicated by a negative value,e.g., -1
                np.putmask(tmp, LabelMatrix[response] < 0, 0)
                tmpWeightMatrices.append(tmp)
        else:
            tmpWeightMatrices = [wMatrix[i][LabelMatrix[response]]
                                 for i in range(4)]

        LRw, MRw, SRw, NRw = tmpWeightMatrices
        labelWeightMatrices[response] += (LRmask * LRw + MRmask * MRw +
                                          SRmask * SRw + NRmask * NRw)

    return labelWeightMatrices
def BuildModel(modelSpecs, forTrain=True):
    """Create the ResNet4DistMatrix predictor and its symbolic variables.

    Returns (distancePredictor, x, y, xmask, ymask, xem, labelList,
    weightList). xem is None unless config.EmbeddingUsed(modelSpecs) holds.
    labelList is empty unless forTrain is set. weightList is empty unless
    labelList is non-empty and modelSpecs['UseSampleWeight'] is set.
    """
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## mask for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## embedInput is only passed to the network when embedding is used
    netArgs = dict(seqInput=x, matrixInput=y, mask_seq=xmask,
                   mask_matrix=ymask, modelSpecs=modelSpecs)
    xem = None
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        netArgs['embedInput'] = xem
    distancePredictor = ResNet4DistMatrix(rng, **netArgs)

    ## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
    labelList = []
    if forTrain:
        ## when this model is used for training. We need to define the label variable
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            ## if one response is a vector, then we use a 4-d tensor
            isVector = config.responseValueDims[labelType] > 1
            tensorName = 'Tlabel4' + response
            if labelType.startswith('Discrete'):
                ## wtensor is for 16bit integer
                if isVector:
                    labelList.append(T.wtensor4(tensorName))
                else:
                    labelList.append(T.wtensor3(tensorName))
            elif isVector:
                labelList.append(T.tensor4(tensorName))
            else:
                labelList.append(T.tensor3(tensorName))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if labelList and modelSpecs['UseSampleWeight']:
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
def QuickValidateAllData(SeqDataset, validate, modelSpecs):
    """Run validate() on every batch and average the losses and errors.

    validate(*batch) must return a (loss, error) pair. When
    modelSpecs['UseSampleWeight'] is set, the last len(responses) items of
    each batch are taken as the per-response weights, and their sums are
    used as the weights of the average. Otherwise a plain average is taken.

    Returns (average loss, average error), both averaged over the batches.
    """
    weighted = modelSpecs['UseSampleWeight']

    batchLosses = []
    batchErrors = []
    lossWeights = [] if weighted else None
    errorWeights = [] if weighted else None

    for batch in SeqDataset:
        batchLoss, batchError = validate(*batch)
        batchLosses.append(batchLoss)
        batchErrors.append(batchError)

        if not weighted:
            continue

        ##two different batches may have different number of residues and different distribution of labels
        ##so we shall normalize the loss and errors by the weight of different batches
        responses = modelSpecs['responses']
        batchWeights = batch[len(batch) - len(responses):]

        weight4loss = []
        weight4error = []
        for res, w in zip(responses, batchWeights):
            total = np.sum(w)
            weight4loss.append(total)
            ## an error vector has one entry per value dimension of the response
            numValues = config.responseValueDims[Response2LabelType(res)]
            weight4error.extend([total] * numValues)

        lossWeights.append(weight4loss)
        errorWeights.append(weight4error)

    avgLoss = np.average(np.array(batchLosses), axis=0, weights=lossWeights)
    avgError = np.average(np.array(batchErrors), axis=0,
                          weights=errorWeights)
    return avgLoss, avgError
def CalcLabelDistributionAndWeight(data=None, modelSpecs=None):
    """Calculate the label distribution and the label weight of every response.

    The following entries of modelSpecs are written: 'weight4range',
    'weight4Discrete3C', 'weight4HB_Discrete2C', 'weight4Beta_Discrete2C',
    'weight4continuous', 'labelRefProbs' and 'weight4labels'.

    modelSpecs['weight4range'] defaults to [3., 2.5, 1., 0.5]. A
    user-supplied value is reshaped to (4, 1) and cast to float32. The 3C
    weight is looked up in config.weight43C by modelSpecs['LRbias'], with
    'mid' used when that key is absent.

    Returns (modelSpecs['labelRefProbs'], modelSpecs['weight4labels']),
    both dicts keyed by response. Raises SystemExit(-1) for a response
    whose label type is neither LogNormal, Normal nor Discrete.
    """
    continuousTypes = ('LogNormal', 'Normal')

    ## weight for different ranges (long, medium, short, and near-ranges)
    if 'weight4range' not in modelSpecs:
        modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape(
            (4, 1)).astype(np.float32)
    else:
        ## the reshaped copy used to be thrown away; store it so that the
        ## multiplications below always see a (4, 1) float32 array
        modelSpecs['weight4range'] = np.asarray(
            modelSpecs['weight4range']).reshape((4, 1)).astype(np.float32)
    print('weight for range: ', modelSpecs['weight4range'])

    ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15
    ## LRbias is read with a default; printing modelSpecs['LRbias'] directly
    ## raised a KeyError whenever the key was absent
    lrBias = modelSpecs.get('LRbias', 'mid')
    modelSpecs['weight4Discrete3C'] = np.multiply(config.weight43C[lrBias],
                                                  modelSpecs['weight4range'])
    print('LRbias= ', lrBias, 'weight43C= ', modelSpecs['weight4Discrete3C'])

    ## weight for 2C
    modelSpecs['weight4HB_Discrete2C'] = np.multiply(
        config.weight4HB2C, modelSpecs['weight4range'])
    modelSpecs['weight4Beta_Discrete2C'] = np.multiply(
        config.weight4Beta2C, modelSpecs['weight4range'])

    ## weight for real value
    modelSpecs['weight4continuous'] = np.multiply(
        np.array([1.] * 4).reshape((4, 1)).astype(np.float32),
        modelSpecs['weight4range'])

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(continuousTypes):
            continue
        allLabelMatrices[response] = [d['atomLabelMatrix'][response]
                                      for d in data]

    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(continuousTypes):
            allRefProbs[response] = np.array([1.] * 4).reshape(
                (4, 1)).astype(np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            matrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                subMatrix = lMatrix[bounds[0]:bounds[2], bounds[1]:bounds[3]]
                matrices.append(subMatrix.astype(np.int32))
        else:
            matrices = [m.astype(np.int32)
                        for m in allLabelMatrices[response]]

        allRefProbs[response] = DistanceUtils.CalcLabelProb(
            data=matrices, numLabels=config.responseProbDims[labelType])

    modelSpecs['labelRefProbs'] = allRefProbs

    ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        if labelType.startswith(continuousTypes):
            ## just need to assign range weight
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if labelType.startswith('Discrete'):
            subType = labelType[len('Discrete'):]
            if subType.startswith('2C'):
                ## if the response is for HB and BetaPairing
                weight = modelSpecs['weight4' + response]
            elif subType.startswith('3C'):
                ## if the response is 3C for normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
                weight = modelSpecs['weight4Discrete3C']
            else:
                ## calculate label weight for 12C, 25C, and 52C for the normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
                weight = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])
            modelSpecs['weight4labels'][response] = weight
            continue

        print('unsupported response in CalcLabelDistributionAndWeight: ',
              response)
        raise SystemExit(-1)

    return modelSpecs['labelRefProbs'], modelSpecs['weight4labels']
def MergeOneProtein(inputFiles, method):
    """Merge the predicted distance matrices of one protein from several files.

    Every file is loaded with DistanceUtils.LoadRawDistProbFile(). All
    files must describe the same protein name and sequence (checked with
    assert), and either all or none of them may carry a label weight.

    method 'amean' merges with the arithmetic mean. Any other value merges
    with the geometric mean, renormalized along the last dimension. Label
    weights are always merged with the arithmetic mean.

    Returns (contactMatrixProb, content4save), where content4save is the
    tuple (targetName, sequence, distMatrixProb, contactMatrixProb,
    labelWeight or None, labelDistribution).

    Raises SystemExit(-1) when fewer than two files are given, when the
    files are inconsistent about label weights, or when a merged response
    is not of a Discrete label type.
    """
    if inputFiles is None or len(inputFiles) < 2:
        print('Please provide at least two predicted matrices for merge')
        raise SystemExit(-1)

    seqName = None
    sequence = None

    distProbs = dict()
    labelDistributions = dict()
    labelWeights = dict()
    labelWeightFlags = []
    tempNames = []

    for inputFile in inputFiles:
        content = DistanceUtils.LoadRawDistProbFile(inputFile)
        (name0, sequence0, predictedDistProb, predictedContactProb,
         labelWeight, labelDistribution) = content

        ## the last '-'-separated field of the name goes into tempNames, the rest is the protein name
        nameFields = name0.split('-')
        seqName0 = '-'.join(nameFields[0:-1])
        tempNames.append(nameFields[-1])

        labelWeightFlags.append(labelWeight is not None)

        ## all the input files shall have the same protein name and sequence
        if seqName is None:
            seqName = seqName0
        else:
            assert seqName == seqName0

        if sequence is None:
            sequence = sequence0
        else:
            assert sequence == sequence0

        ## the per-file contact probabilities are not collected: contacts
        ## are derived from the merged distance probabilities below
        for apt, prob in predictedDistProb.items():
            distProbs.setdefault(apt, []).append(prob)

        if labelWeight is not None:
            for apt, weight in labelWeight.items():
                labelWeights.setdefault(apt, []).append(weight)

        for apt, distribution in labelDistribution.items():
            labelDistributions.setdefault(apt, []).append(distribution)

    ## check consistency among labelWeightFlags
    hasLabelWeight = labelWeightFlags[0]
    if not all(flag == hasLabelWeight for flag in labelWeightFlags):
        print('ERROR: the input matrix files have inconsistent format. Some have a labelWeight while others do not.')
        raise SystemExit(-1)

    ### Ms is a dictionary, each value in Ms is a list of matrices
    ### this function calculates the geometric mean of all the matrices in the same list and the renormalize the last dim of the resultant mean
    def CalcGeometricMean(Ms):
        result = dict()
        for apt, v in Ms.items():
            mean = scipy.stats.mstats.gmean(v, axis=0)
            result[apt] = mean / np.sum(mean, axis=-1, keepdims=True)
        return result

    ## calculate arithmetic mean
    def CalcArithmeticMean(Ms):
        return dict((apt, np.mean(v, axis=0)) for apt, v in Ms.items())

    if method == 'amean':
        distMatrixProb = CalcArithmeticMean(distProbs)
        mergedLabelDistribution = CalcArithmeticMean(labelDistributions)
    else:
        distMatrixProb = CalcGeometricMean(distProbs)
        mergedLabelDistribution = CalcGeometricMean(labelDistributions)

    ## derive the contact probability from the merged distance probability
    contactMatrixProb = dict()
    for k in distMatrixProb.keys():
        apt = Response2LabelName(k)
        labelType = Response2LabelType(k)

        if not labelType.startswith('Discrete'):
            print('ERROR: this labelType currently not supported in TPLMergePredicteDistMatrix.py : ', labelType)
            raise SystemExit(-1)

        subType = labelType[len('Discrete'):]
        labelOf8 = DistanceUtils.LabelsOfOneDistance(
            config.ContactDefinition, config.distCutoffs[subType])
        contactMatrixProb[apt] = ContactUtils.Distance2Contact(
            distMatrixProb[k], labelOf8)

    mergedLabelWeight = None
    if hasLabelWeight:
        mergedLabelWeight = CalcArithmeticMean(labelWeights)

    targetName = '-'.join([seqName] + tempNames)
    content4save = (targetName, sequence, distMatrixProb, contactMatrixProb,
                    mergedLabelWeight, mergedLabelDistribution)

    return contactMatrixProb, content4save
def PredictProperty(models, predictors, predFiles):
    """Predict properties with every model and average the predictions.

    models and predictors are walked in lockstep. Each predictor is a
    (predict, inputVariables) pair. The features of predFiles are loaded
    separately for every model by DataProcessor.LoadPropertyFeatures().

    Returns (finalresults4prob, finalresults, allsequences):
      finalresults4prob[name][response] is the probability averaged over models.
      finalresults[name][response] is the value averaged over models, or,
        for Discrete responses, the string that
        PropertyUtils.Coding2String() builds from the argmax of the
        averaged probability.
      allsequences[name] is the primary sequence.

    Raises SystemExit(1) when the same protein name is seen with two
    different sequences.
    """
    allsequences = dict()

    ##allresults shall be a nested dictionary, e.g, allresults[proteinName][response] = predicted_property_list
    allresults4prob = dict()
    allresults = dict()

    def collectBatch(store, batchResult, dimTable, responses, batchNames,
                     seqLens, maxSeqLen):
        ## split the last axis of batchResult into one slice per response,
        ## strip the padding and append each protein's slice to store[name][response]
        dims = [dimTable[Response2LabelType(res)] for res in responses]
        endPositions = np.cumsum(dims)
        startPositions = endPositions - dims

        for res, start, end in zip(responses, startPositions, endPositions):
            for name in batchNames:
                store[name].setdefault(res, [])

            ## remove masked positions
            unpadded = [tmp[maxSeqLen - seqLen:, :] for tmp, seqLen in
                        zip(batchResult[:, :, start:end], seqLens)]
            for res4one, name in zip(unpadded, batchNames):
                store[name][res].append(res4one)

    for model, predictor in zip(models, predictors):
        predict, inputVariables = predictor

        ## We shall load these files for each model separately since each model may use a different set of features
        predData = DataProcessor.LoadPropertyFeatures(
            predFiles, modelSpecs=model, forTrainValidation=False)

        ##make sure the input has the same number of features as the model
        ## (only one randomly chosen protein is checked)
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        ## collecting sequences
        for d in predData:
            knownSequence = allsequences.setdefault(d['name'], d['sequence'])
            if knownSequence != d['sequence']:
                print('ERROR: inconsistent primary sequence for the same protein in the protein feature files')
                raise SystemExit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=30, modelSpecs=model,
            forTrainValidation=False)
        print('#predData: ', len(predData), '#batches: ', len(predSeqData))

        for onebatch, names4onebatch in zip(predSeqData, names):
            batchInput = onebatch[:len(inputVariables)]
            result4prob, result = predict(*batchInput)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = batchInput[0:2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ##result4prob has shape (batchSize, maxSeqLen, sum( responseProbDims{res] for res in modelSpecs['responses']) )
            assert result4prob.shape[2] == sum(
                [config.responseProbDims[Response2LabelType(res)]
                 for res in model['responses']])

            ##result has shape (batchSize, maxSeqLen, sum( responseValueDims{res] for res in modelSpecs['responses']) )
            assert result.shape[2] == sum(
                [config.responseValueDims[Response2LabelType(res)]
                 for res in model['responses']])

            for name in names4onebatch:
                if name not in allresults:
                    allresults[name] = dict()
                    allresults4prob[name] = dict()

            collectBatch(allresults4prob, result4prob, config.responseProbDims,
                         model['responses'], names4onebatch, seqLens,
                         maxSeqLen)
            collectBatch(allresults, result, config.responseValueDims,
                         model['responses'], names4onebatch, seqLens,
                         maxSeqLen)

    ## calculate the final result, which is the average of all the predictd properties for the same protein and response name
    finalresults = dict()
    for name, results in allresults.items():
        averaged = finalresults.setdefault(name, dict())
        for response, predictions in results.items():
            ## discrete responses are filled in below from the averaged probability
            if not Response2LabelType(response).startswith('Discrete'):
                averaged[response] = np.average(predictions, axis=0)

    finalresults4prob = dict()
    for name, results in allresults4prob.items():
        averaged4prob = finalresults4prob.setdefault(name, dict())
        for response, predictions in results.items():
            averaged4prob[response] = np.average(predictions, axis=0)

            ##convert coding of discrete labels to more meaningful representation
            if Response2LabelType(response).startswith('Discrete'):
                labels = np.argmax(averaged4prob[response], axis=1)
                finalresults.setdefault(name, dict())[response] = \
                    PropertyUtils.Coding2String(labels, response)

    return finalresults4prob, finalresults, allsequences
def __init__(self, rng, seqInput, matrixInput, mask_seq=None, mask_matrix=None, embedInput=None, modelSpecs=None):
    """
    Build the network graph: a 1D (sequence) convolution module, optional
    sequence-to-matrix layers, a 2D (matrix) convolution module and one
    predictor per response listed in modelSpecs['responses'].

    seqInput has shape (batchSize, seqLen, n_in_seq)
    matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix)
    mask_seq has shape (batchSize, #cols_to_be_masked)
    mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen)
    embedInput has shape (batchSize, seqLen, n_in2)
    modelSpecs is the model specification (indexed by string keys); it must not be None.

    Attributes set here: modelSpecs, responses, mask_1d, mask_2d, layers,
    predictors, outputList, output_probList, output, output_prob,
    params, paramL1, paramL2, params4var, paramL14var, paramL24var.

    The process is terminated (sys.exit / exit with -1) when an embedding input
    is given without a 'Seq+SS' or 'SeqOnly' mode, when modelSpecs['network']
    is neither a ResNet nor a DilatedResNet variant, or when a response has an
    unsupported label type.
    """
    assert (modelSpecs is not None)
    self.modelSpecs = modelSpecs
    self.responses = modelSpecs['responses']

    ## set the number of hidden neurons and number of layers
    n_in_seq = modelSpecs['n_in_seq']
    n_in_matrix = modelSpecs['n_in_matrix']
    n_hiddens_seq = modelSpecs['conv1d_hiddens']
    n_hiddens_matrix = modelSpecs['conv2d_hiddens']
    n_hiddens_logreg = modelSpecs['logreg_hiddens']
    seq_repeats = modelSpecs['conv1d_repeats']
    matrix_repeats = modelSpecs['conv2d_repeats']

    ## half win size for convolutional operation
    ## dilation_seq and dilation_matrix are only defined (and only used) for DilatedResNet networks
    if modelSpecs['network'].startswith('DilatedResNet'):
        hwsz_matrix = modelSpecs['conv2d_hwszs']
        hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
        dilation_seq = [1] * len(n_hiddens_seq)
        dilation_matrix = modelSpecs['conv2d_dilations']
    else:
        hwsz_matrix = modelSpecs['halfWinSize_matrix']
        hwsz_seq = modelSpecs['halfWinSize_seq']

    ## masks to reduce impact of padding zeros
    self.mask_1d = mask_seq
    self.mask_2d = mask_matrix

    ## every layer created below is appended to self.layers; the parameters and norms are collected from it at the end
    self.layers = []

    # sequence convolution
    if modelSpecs['network'].startswith('DilatedResNet'):
        seqConv = DilatedResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats,
                                halfWinSize=hwsz_seq, dilation=dilation_seq, mask=mask_seq,
                                activation=modelSpecs['activation'], batchNorm=modelSpecs['batchNorm'],
                                version=modelSpecs['network'])
    else:
        seqConv = ResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats,
                         halfWinSize=hwsz_seq, mask=mask_seq, activation=modelSpecs['activation'],
                         batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
    self.layers.append(seqConv)

    ## transform 1d sequence to 2d matrix
    seq2matrixMode = modelSpecs['seq2matrixMode']
    seq2matrixLayers = []
    ## embedLayers is filled below but not read again inside this method
    embedLayers = []

    ## determine if we shall use the sequential features or not. The sequential features include sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility
    ## useSequentialFeatures is True only when modelSpecs contains the key 'UseSequentialFeatures' and its value is exactly True; a missing key yields False
    useSequentialFeatures = ('UseSequentialFeatures' in modelSpecs \
        and (modelSpecs['UseSequentialFeatures'] is True))

    ## use OuterConcatenation operation to convert sequence features into pairwise features
    if 'OuterCat' in seq2matrixMode and useSequentialFeatures:
        ##midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
        midpointfeature, n_midpoint_out = MidpointFeature(
            seqConv.output, seqConv.n_out)

        ##remove noise in midpointfeature
        ## mask_matrix is used to reduce noise introduced by padding positions
        ## NOTE(review): mask_matrix defaults to None but is dereferenced here -- presumably callers always pass it in this mode; confirm
        ## first multiply the leading mask_matrix.shape[1] rows by the mask ...
        mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
        midpointfeature = T.set_subtensor(
            mid_subtensor,
            T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))
        ## ... then the leading mask_matrix.shape[1] columns by the transposed mask
        mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
        midpointfeature = T.set_subtensor(
            mid_subtensor2,
            T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

        ## here we use convolution with halfWinSize=0 to reduce model complexity
        compressLayer = Conv2D4DistMatrix(
            rng,
            input=midpointfeature,
            n_in=n_midpoint_out,
            n_hiddens=seq2matrixMode['OuterCat'],
            halfWinSize=0,
            mask=mask_matrix)
        seq2matrixLayers.append(compressLayer)

    ## embedding primary sequence and/or predicted secondary structure
    if embedInput is not None:
        from EmbeddingLayer import MetaEmbeddingLayer

        ## 'Seq+SS' takes precedence over 'SeqOnly' when both keys are present
        if 'Seq+SS' in seq2matrixMode:
            n_out_embed = seq2matrixMode['Seq+SS']
        elif 'SeqOnly' in seq2matrixMode:
            n_out_embed = seq2matrixMode['SeqOnly']
        else:
            print(
                'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.'
            )
            sys.exit(-1)

        embedLayer = MetaEmbeddingLayer(embedInput, modelSpecs['n_in_embed'], n_out_embed)
        seq2matrixLayers.append(embedLayer)
        embedLayers.append(embedLayer)

    self.layers += seq2matrixLayers

    ## concatenate the original matrix input and the outputs of all seq2matrix layers along the last (channel) axis
    input_2d = T.concatenate([matrixInput] + [layer.output for layer in seq2matrixLayers], axis=3)
    n_input2d = n_in_matrix + sum(
        [layer.n_out for layer in seq2matrixLayers])

    ## matrix (2D) convolution
    if modelSpecs['network'].startswith('ResNet'):
        matrixConv = ResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix,
                            n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, mask=mask_matrix,
                            activation=modelSpecs['activation'], batchNorm=modelSpecs['batchNorm'],
                            version=modelSpecs['network'])
    elif modelSpecs['network'].startswith('DilatedResNet'):
        matrixConv = DilatedResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix,
                                   n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, dilation=dilation_matrix,
                                   mask=mask_matrix, activation=modelSpecs['activation'],
                                   batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
    else:
        print('Unimplemented deep network type: ', modelSpecs['network'])
        exit(-1)

    self.layers.append(matrixConv)
    conv_out = matrixConv.output

    ## move the last axis of conv_out to the front, flatten the remaining three axes and transpose,
    ## so that every row of selected holds the features of one (batch, row, col) entry
    selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
    n_in4logreg = matrixConv.n_out

    self.outputList = []
    self.output_probList = []
    self.predictors = []

    ## parameters (and their L1/L2 terms) reported by the NN4Normal predictors
    self.params4var = []
    self.paramL14var = 0
    self.paramL24var = 0

    ## one predictor per response, all sharing the same flattened convolution output
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        predictor = None

        if labelType.startswith('Discrete'):
            assert (config.responseValueDims[labelType] == 1)
            predictor = NN4LogReg(rng=rng, input=selected, n_in=n_in4logreg,
                                  n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
        elif labelType.startswith('LogNormal') or labelType.startswith(
                'Normal'):
            predictor = NN4Normal(
                rng=rng,
                input=selected,
                n_in=n_in4logreg,
                n_variables=config.responseValueDims[labelType],
                n_out=config.responseProbDims[labelType],
                n_hiddens=n_hiddens_logreg)

            ## recording parameters specific for variance prediction
            self.params4var += predictor.params4var
            self.paramL14var += predictor.paramL14var
            self.paramL24var += predictor.paramL24var
        else:
            print('incorrect response name or label type: ', res)
            exit(-1)

        self.layers.append(predictor)
        self.predictors.append(predictor)

        ## output in 2d matrix: reshape the flat predictions back to (batch, row, col, dims)
        output_2d = predictor.y_pred.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
             config.responseValueDims[labelType]))
        output_2d_prob = predictor.output.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
             config.responseProbDims[labelType]))

        self.outputList.append(output_2d)
        self.output_probList.append(output_2d_prob)

    ## the per-response outputs are concatenated along the last axis in the order of modelSpecs['responses']
    self.output = T.concatenate(self.outputList, axis=3)
    self.output_prob = T.concatenate(self.output_probList, axis=3)

    ## collect all the model parameters and their norms
    self.params = []
    self.paramL2 = 0
    self.paramL1 = 0

    for layer in self.layers:
        self.params += layer.params
        self.paramL2 += layer.paramL2
        self.paramL1 += layer.paramL1
def AssembleOneBatch(data, modelSpecs, forTrainValidation=True):
    """
    Pack the per-protein feature dicts of one minibatch into padded numpy arrays.

    data: a list of dicts, each providing 'seqLen' and 'seqFeatures' and, for
        training/validation, one label entry per response plus 'missing'.
    modelSpecs: the model specification; 'responses' is read, and 'w4diso' is
        read for responses whose name starts with 'DISO'.
    forTrainValidation: when True, label arrays (Y) and weight arrays (W) are
        built as well; label/'missing' presence is checked on data[0] only.

    Returns None (after printing a warning) when data is empty, otherwise the
    list [X1d, M1d] + Y + W where
        X1d has shape (numSeqs, maxSeqLen, nFeatures),
        M1d has shape (numSeqs, maxSeqLen - minSeqLen) and dtype int8,
        each Y has shape (numSeqs, maxSeqLen, responseValueDims) and
        each W has shape (numSeqs, maxSeqLen, 1).
    Sequences are right-aligned, i.e., padding zeros are placed at the front.

    The process exits with status 1 when required label or 'missing'
    information is absent.
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]),
                   dtype=theano.config.floatX)

    ## for mask
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)

    ## Y shall be a list of labels, each for one type
    ## we always need a weight vector to deal with residues without 3D coordinates in training and validation
    Y = []
    W = []
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        labelName = Response2LabelName(res)
        dataType = (np.int32 if labelType.startswith('Discrete') else theano.config.floatX)

        if forTrainValidation:
            ## data is a list, so the protein name has to be read from its first element
            if labelName not in data[0]:
                print('ERROR: label information is needed for training protein ',
                      data[0]['name'], ' and response ', res)
                exit(1)
            Y.append(
                np.zeros(shape=(numSeqs, maxSeqLen, config.responseValueDims[labelType]),
                         dtype=dataType))

            if 'missing' not in data[0]:
                print('ERROR: missing information is needed for training protein ',
                      data[0]['name'])
                exit(1)
            W.append(
                np.zeros(shape=(numSeqs, maxSeqLen, 1), dtype=theano.config.floatX))

    for j, d in enumerate(data):
        seqLen = d['seqLen']
        start = maxSeqLen - seqLen  ## number of padding positions in front of this sequence

        X1d[j, start:, :] = d['seqFeatures']
        M1d[j, start:].fill(1)

        ## Y and W are empty unless forTrainValidation is True
        for y, w, res in zip(Y, W, modelSpecs['responses']):
            y[j, start:, ] = d[Response2LabelName(res)]

            missing = np.reshape(d['missing'], (-1, 1))
            if res.startswith('DISO'):
                ## for disorder prediction, all the residues shall be considered since those residues without 3D coordinates are positive examples
                ## we may assign a larger weight to positive examples since they are only 6% of the whole data set
                w[j, start:, ] = missing * (modelSpecs['w4diso'] - 1.) + 1.
            else:
                ## assign weight 0 to those residues without coordinates, otherwise 1
                w[j, start:, ] = 1.0 - missing

    onebatch = [X1d, M1d]
    onebatch.extend(Y)
    onebatch.extend(W)

    return onebatch
def LoadPropertyFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """
    Load pickled protein records and assemble per-protein features and labels.

    files: a list of pickle files, each holding a list of per-protein dicts.
    modelSpecs: the model specification; 'responses' and the optional switches
        'UseSequenceEmbedding', 'UsePSFM', 'UseOneHotEncoding' and
        'UseTemplate' are read.
    forTrainValidation: when True, every protein must provide a label for each
        response as well as 'Missing'.

    Returns a list of dicts with keys 'name', 'sequence', 'seqLen',
    'seqFeatures' (float32, shape (seqLen, nFeatures)), one entry per
    available label name and, when the record has 'Missing', 'missing'.

    The process exits with status 1 when files is empty or when required
    information is absent from a record.
    """
    if files is None or len(files) == 0:
        print('ERROR: the feature files is empty')
        exit(1)

    ## NOTE(review): unpickling executes code embedded in the file; only load feature files from trusted sources
    data = []
    for infile in files:
        with open(infile, 'rb') as fh:
            data.extend(cPickle.load(fh))

    EmbeddingModel = None
    if 'UseSequenceEmbedding' in modelSpecs and modelSpecs['UseSequenceEmbedding']:
        EmbeddingModelFile = os.path.join(
            os.environ['DL4PropertyPredHome'], 'data',
            'Mofrad-PLoSOne-2015Nov.3GramEmbeddingParams.pkl')
        EmbeddingModel = SequenceEmbedding.LoadEmbeddingParamsInPKL(EmbeddingModelFile)

    ## the optional switches do not change from protein to protein
    usePSFM = 'UsePSFM' in modelSpecs and modelSpecs['UsePSFM']
    useOneHot = 'UseOneHotEncoding' in modelSpecs and modelSpecs['UseOneHotEncoding']
    useTemplate = 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']

    ## each protein has sequential features as input
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ##collecting sequential features...
        seqMatrices = [d['PSSM']]

        ##Load sequence embedding features here
        if EmbeddingModel is not None:
            seqMatrices.append(
                SequenceEmbedding.EmbedOneSequence(d['sequence'], EmbeddingModel))

        if usePSFM:
            seqMatrices.append(d['PSFM'])

        if useOneHot:
            seqMatrices.append(config.SeqOneHotEncoding(d['sequence']))

        ## add template similarity score here
        if useTemplate:
            if 'tplSimScore' not in d:
                print('ERROR: no tplSimScore for target', d['name'],
                      'which is needed since you specify to use template information')
                exit(1)
            if d['tplSimScore'].shape[1] != 10:
                print('ERROR: the number of query-template similarity features is not 10 in data for',
                      d['name'])
                exit(1)

            if 'tplProperties' not in d:
                print('ERROR: no tplProperties for target', d['name'],
                      'which is needed since you specify to use template information')
                exit(1)
            if d['tplProperties'].shape[1] < 15:
                print('ERROR: #template local structure properties shall be at least 15 for target',
                      d['name'])
                exit(1)

            ## the query-template similarity score shall be arranged in the order of: AA identity (binary), blosum80, blosum62, blosum45, spScore, spScore_ST, ppScore, pmScore, cc, hdsm
            seqMatrices.append(d['tplSimScore'])

            ##we do not use omg information from the template, the first 8 features shall be the 8-state secondary structure, then followed by pACC, CNa, CNb, Phi, Psi, Theta and Tau
            seqMatrices.append(d['tplProperties'][:, :8])

            ## NOTE(review): responses other than the ones listed below (e.g., SS3/SS8/DISO) terminate the program here -- confirm this is intended
            for r in modelSpecs['responses']:
                if r.startswith('ACC'):
                    seqMatrices.append(d['tplProperties'][:, 8:9])
                elif r.startswith(('Phi', 'Psi', 'CLE')):
                    seqMatrices.append(d['tplProperties'][:, 11:13])
                elif r.startswith(('Theta', 'Tau')):
                    seqMatrices.append(d['tplProperties'][:, 13:15])
                elif r.startswith(('CNa', 'CNb')):
                    seqMatrices.append(d['tplProperties'][:, 9:11])
                else:
                    print('ERROR: unsupported response', r)
                    exit(1)

        if 'otherSeqFeatures' in d:
            seqMatrices.append(d['otherSeqFeatures'])

        ## all the features shall have shape (seqLen, nFeatures) where nFeatures is variable, but seqLen is the sequence length of one protein
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature

        ## when no explicit disorder label is given, 'Missing' is used as the disorder label
        if 'DISO' not in d and 'Missing' in d:
            d['DISO'] = d['Missing']

        ##collecting labels...
        for r in modelSpecs['responses']:
            labelName = Response2LabelName(r)
            labelType = Response2LabelType(r)

            if labelName not in d:
                if forTrainValidation:
                    print('ERROR: missing label information for protein ', d['name'],
                          ' and response ', r)
                    exit(1)
                continue

            labels = d[labelName]

            ## need some special handling of discrete labels
            if labelType.startswith('Discrete'):
                if r.startswith('SS3'):
                    labels = np.array([
                        PropertyUtils.SS3Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith('SS8'):
                    labels = np.array([
                        PropertyUtils.SS8Letter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                elif r.startswith(('ACC', 'DISO')):
                    labels = labels.reshape((-1, 1))
                elif r.startswith('CLE'):
                    labels = np.array([
                        PropertyUtils.CLELetter2Code[c] for c in labels
                    ]).reshape((-1, 1))
                else:
                    print('ERROR: please specify how to convert your discrete labels to numbers for response ', r)
                    exit(1)

            oneprotein[labelName] = labels

        ##at this point, finish collecting features and labels for one protein
        if 'Missing' in d:
            oneprotein['missing'] = d['Missing']
        elif forTrainValidation:
            print('ERROR: for training data, we need information to specify which residues have no 3D coordinates')
            exit(1)

        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def AssembleOneBatch(data, modelSpecs):
    """
    Pack the per-protein feature dicts of one minibatch into padded numpy arrays.

    data: a list of dicts, each providing 'seqLen', 'seqFeatures' and
        'matrixFeatures', and optionally 'embedFeatures' and 'atomLabelMatrix'.
        The presence of the optional entries is decided from data[0].
    modelSpecs: the model specification; 'responses' is read, and
        'UseSampleWeight' is read when labels are available.

    Returns None (after printing a warning) when data is empty, otherwise the
    list [X1d, X2d, M1d, M2d] (+ [X1dem]) + Y + weightMatrix where
        X1d has shape (numSeqs, maxSeqLen, nSeqFeatures),
        X2d has shape (numSeqs, maxSeqLen, maxSeqLen, nMatrixFeatures),
        M1d has shape (numSeqs, maxSeqLen - minSeqLen),
        M2d has shape (numSeqs, maxSeqLen - minSeqLen, maxSeqLen),
        each Y has shape (numSeqs, maxSeqLen, maxSeqLen[, valueDims]) and
        each weight matrix has shape (numSeqs, maxSeqLen, maxSeqLen).
    Proteins are aligned to the bottom-right corner, i.e., padding zeros are
    placed at the top and at the left.
    """
    if not data:
        print('WARNING: the list of data is empty')
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)
    floatX = theano.config.floatX

    X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]), dtype=floatX)
    X2d = np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen, data[0]['matrixFeatures'].shape[2]),
                   dtype=floatX)

    X1dem = None
    if 'embedFeatures' in data[0]:
        X1dem = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['embedFeatures'].shape[1]),
                         dtype=floatX)

    ## Y shall be a list of label tensors, one for each response
    Y = []
    if 'atomLabelMatrix' in data[0]:
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            dataType = np.int16 if labelType.startswith('Discrete') else floatX

            rValDims = config.responseValueDims[labelType]
            if rValDims == 1:
                labelShape = (numSeqs, maxSeqLen, maxSeqLen)
            else:
                ## a vector-valued response needs one extra axis of size rValDims
                labelShape = (numSeqs, maxSeqLen, maxSeqLen, rValDims)
            Y.append(np.zeros(shape=labelShape, dtype=dataType))

    ## when Y is empty, weight is useless. So when Y is empty, weightMatrix shall also be empty
    ## each response gets its own array; repeating one array with "*" would make all the responses share (and overwrite) the same weights
    weightMatrix = []
    if Y and modelSpecs['UseSampleWeight']:
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen), dtype=floatX)
            for _ in modelSpecs['responses']
        ]

    ## for mask
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)
    M2d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen, maxSeqLen), dtype=np.int8)

    for j, d in enumerate(data):
        start = maxSeqLen - d['seqLen']  ## number of padding positions in front of this protein

        X1d[j, start:, :] = d['seqFeatures']
        X2d[j, start:, start:, :] = d['matrixFeatures']

        M1d[j, start:].fill(1)
        M2d[j, start:, start:].fill(1)

        if X1dem is not None:
            X1dem[j, start:, :] = d['embedFeatures']

        ## the same assignment works for both 3D and 4D label tensors
        for y, response in zip(Y, modelSpecs['responses']):
            y[j, start:, start:] = d['atomLabelMatrix'][response]

        if weightMatrix:
            ## we calculate the labelWeightMatrix here
            labelWeightMatrix = CalcLabelWeightMatrix(d['atomLabelMatrix'], modelSpecs)
            for w, response in zip(weightMatrix, modelSpecs['responses']):
                w[j, start:, start:] = labelWeightMatrix[response]

    onebatch = [X1d, X2d, M1d, M2d]
    if X1dem is not None:
        onebatch.append(X1dem)
    onebatch.extend(Y)
    onebatch.extend(weightMatrix)

    return onebatch
def __init__(self, rng, seqInput, matrixInput, mask_seq=None, mask_matrix=None, embedInput=None, boundingbox=None, modelSpecs=None): """ seqInput has shape (batchSize, seqLen, n_in_seq) matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix) mask_seq has shape (batchSize, #cols_to_be_masked) mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen) embedInput has shape (batchSize, seqLen, n_in2) boundingbox is a vector of 4 integer elements: top, left, bottom and right. boundingbox shall only be applied to the matrix converted from sequential features. """ assert (modelSpecs is not None) self.modelSpecs = modelSpecs self.responses = modelSpecs['responses'] ## set the number of hidden neurons and number of layers n_in_seq = modelSpecs['n_in_seq'] n_in_matrix = modelSpecs['n_in_matrix'] n_hiddens_seq = modelSpecs['conv1d_hiddens'] n_hiddens_matrix = modelSpecs['conv2d_hiddens'] n_hiddens_logreg = modelSpecs['logreg_hiddens'] seq_repeats = modelSpecs['conv1d_repeats'] matrix_repeats = modelSpecs['conv2d_repeats'] ## half win size for convolutional operation if modelSpecs['network'].startswith('DilatedResNet'): hwsz_matrix = modelSpecs['conv2d_hwszs'] hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq) dilation_seq = [1] * len(n_hiddens_seq) dilation_matrix = modelSpecs['conv2d_dilations'] else: hwsz_matrix = modelSpecs['halfWinSize_matrix'] hwsz_seq = modelSpecs['halfWinSize_seq'] ## masks to reduce impact of padding zeros self.mask_1d = mask_seq self.mask_2d = mask_matrix self.layers = [] act = T.nnet.relu if modelSpecs['activation'] == 'TANH': act = T.tanh # sequence convolution if modelSpecs['network'].startswith('DilatedResNet'): #seqConv = DilatedResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, dilation=dilation_seq, mask=mask_seq, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network']) seqConv = DilatedResNet(rng, input=seqInput, n_in=n_in_seq, 
n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, dilation=dilation_seq, mask=mask_seq, activation=act, modelSpecs=modelSpecs) else: seqConv = ResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, mask=mask_seq, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network']) self.layers.append(seqConv) ## transform 1d sequence to 2d matrix seq2matrixMode = modelSpecs['seq2matrixMode'] seq2matrixLayers = [] embedLayers = [] ## determine if we shall use the sequential features or not. The sequential features include sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility ## useSequentialFeatures is True by default ##useSequentialFeatures = ( modelSpecs.has_key('UseSequentialFeatures') and (modelSpecs['UseSequentialFeatures'] is True) ) ## use OuterConcatenation operation to convert sequence features into pairwise features if seq2matrixMode.has_key('OuterCat') and config.UseSequentialFeatures: ##midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out) midpointfeature, n_midpoint_out = MidpointFeature(seqConv.output, seqConv.n_out, box=boundingbox) ##remove noise in midpointfeature ## mask_matrix is used to reduce noise introduced by padding positions mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :] midpointfeature = T.set_subtensor( mid_subtensor, T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor)) mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :] midpointfeature = T.set_subtensor( mid_subtensor2, T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2)) ## here we use convolution with halfWinSize=0 to reduce model complexity compressLayer = Conv2D4DistMatrix( rng, input=midpointfeature, n_in=n_midpoint_out, n_hiddens=seq2matrixMode['OuterCat'], halfWinSize=0, mask=mask_matrix) #compressLayer = Conv2D4DistMatrix(rng, input=midpointfeature, n_in=n_midpoint_out, 
n_hiddens=seq2matrixMode['OuterCat'], halfWinSize=0, mask=None ) seq2matrixLayers.append(compressLayer) ## embedding primary sequence and/or predicted secondary structure if embedInput is not None: from EmbeddingLayer import EmbeddingLayer4AllRange if seq2matrixMode.has_key('Seq+SS'): n_out_embed = seq2matrixMode['Seq+SS'] elif seq2matrixMode.has_key('SeqOnly'): n_out_embed = seq2matrixMode['SeqOnly'] else: print 'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.' exit(1) embedLayer = EmbeddingLayer4AllRange(embedInput, modelSpecs['n_in_embed'], n_out_embed, box=boundingbox) seq2matrixLayers.append(embedLayer) embedLayers.append(embedLayer) """ we do not use this profile embedding any more ## embedding the sequence profile if seq2matrixMode.has_key('Profile') and useSequentialFeatures: from EmbeddingLayer import ProfileEmbeddingLayer pEmbedLayer = ProfileEmbeddingLayer(seqConv.output, seqConv.n_out, seq2matrixMode['Profile']) seq2matrixLayers.append(pEmbedLayer) embedLayers.append(pEmbedLayer) """ self.layers += seq2matrixLayers bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode( modelSpecs) if (bUseCCMraw or bUseFullMI or bUseFullCov) and config.CompressMatrixInput(modelSpecs): ## here we add a compress layer to reduce the #channels of the original matrix input. 
n_hiddens4MatrixCompress = modelSpecs['hiddens4MatrixCompress'] compressLayer4MatrixInput = Conv2D4DistMatrix( rng, input=matrixInput, n_in=n_in_matrix, n_hiddens=n_hiddens4MatrixCompress, halfWinSize=0, mask=mask_matrix) compressedMatrixInput = compressLayer4MatrixInput.output n_compressedMatrix = compressLayer4MatrixInput.n_out input_2d = T.concatenate( [compressedMatrixInput] + [layer.output for layer in seq2matrixLayers], axis=3) n_input2d = n_compressedMatrix + sum( [layer.n_out for layer in seq2matrixLayers]) else: ##old code for merging original matrix input and sequential input input_2d = T.concatenate( [matrixInput] + [layer.output for layer in seq2matrixLayers], axis=3) n_input2d = n_in_matrix + sum( [layer.n_out for layer in seq2matrixLayers]) #print 'n_input2d=', n_input2d if modelSpecs['network'].startswith('ResNet'): matrixConv = ResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix, n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, mask=mask_matrix, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network']) elif modelSpecs['network'].startswith('DilatedResNet'): #matrixConv=DilatedResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix, n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, dilation=dilation_matrix, mask=mask_matrix, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network']) matrixConv = DilatedResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix, n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, dilation=dilation_matrix, mask=mask_matrix, activation=act, modelSpecs=modelSpecs) else: print 'ERROR: Unimplemented deep network type: ', modelSpecs[ 'network'] exit(1) self.layers.append(matrixConv) conv_out = matrixConv.output selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0) n_in4logreg = matrixConv.n_out self.outputList = [] self.output_probList = [] self.predictors = [] self.params4var = [] self.paramL14var = 0 
self.paramL24var = 0 for res in modelSpecs['responses']: labelType = Response2LabelType(res) predictor = None if labelType.startswith('Discrete'): assert GetResponseValueDims(res) == 1 predictor = NN4LogReg(rng=rng, input=selected, n_in=n_in4logreg, n_out=GetResponseProbDims(res), n_hiddens=n_hiddens_logreg) elif labelType.startswith('LogNormal') or labelType.startswith( 'Normal'): predictor = NN4Normal(rng=rng, input=selected, n_in=n_in4logreg, n_variables=GetResponseValueDims(res), n_out=GetResponseProbDims(res), n_hiddens=n_hiddens_logreg) ## recording parameters specific for variance prediction self.params4var += predictor.params4var self.paramL14var += predictor.paramL14var self.paramL24var += predictor.paramL24var else: print 'incorrect response name or label type: ', res exit(1) self.layers.append(predictor) self.predictors.append(predictor) ## output in 2d matrix output_2d = predictor.y_pred.reshape( (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2], GetResponseValueDims(res))) output_2d_prob = predictor.output.reshape( (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2], GetResponseProbDims(res))) self.outputList.append(output_2d) self.output_probList.append(output_2d_prob) self.output = T.concatenate(self.outputList, axis=3) self.output_prob = T.concatenate(self.output_probList, axis=3) ## collect all the model parameters and their norms self.params = [] self.paramL2 = 0 self.paramL1 = 0 for layer in self.layers: self.params += layer.params self.paramL2 += layer.paramL2 self.paramL1 += layer.paramL1 """
def BuildModel(modelSpecs, forTrain=True):
    """
    Create the symbolic input variables and the distance prediction network.

    For training (forTrain=True) the return value is the 10-tuple
        (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss),
    for prediction it is the 6-tuple
        (distancePredictor, x, y, xmask, ymask, xem).
    xem is None unless config.EmbeddingUsed(modelSpecs) holds; trainByRefLoss is
    None unless config.TrainByRefLoss(modelSpecs) holds; weightList is empty
    unless config.UseSampleWeight(modelSpecs) holds.
    """
    rng = np.random.RandomState()

    ## symbolic inputs: x holds the sequential features and y the matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## input for embedding, created only when the model uses embedding
    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for crop of a big protein distance matrix. This box allows crop at any position.
    box = T.ivector('boundingbox') if forTrain else None

    ## trainByRefLoss can be either 1 or -1. When this variable exists, we train the model using both reference loss and the loss of real data
    trainByRefLoss = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(
        rng,
        seqInput=x,
        matrixInput=y,
        mask_seq=xmask,
        mask_matrix=ymask,
        embedInput=xem,
        boundingbox=box,
        modelSpecs=modelSpecs,
    )

    ## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
    ## label variables are only needed when the model is used for training
    labelList = []
    if forTrain:
        for response in modelSpecs['responses']:
            isDiscrete = Response2LabelType(response).startswith('Discrete')
            isVector = GetResponseValueDims(response) > 1

            ## a vector-valued response needs a 4-d tensor; discrete labels use 16-bit integer (w) tensors
            if isDiscrete:
                makeLabel = T.wtensor4 if isVector else T.wtensor3
            else:
                makeLabel = T.tensor4 if isVector else T.tensor3

            labelList.append(makeLabel('Tlabel4' + response))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        for response in modelSpecs['responses']:
            weightList.append(T.tensor3('Tweight4' + response))

    ## for prediction, neither labels nor weights are returned
    if not forTrain:
        return distancePredictor, x, y, xmask, ymask, xem

    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """
    Load pickled protein records and assemble sequential features, pairwise
    features and (when available) label matrices for each protein.

    files: a list of pickle files, each holding a list of per-protein dicts.
    modelSpecs: the model specification; 'responses', 'seq2matrixMode' and the
        optional switches 'UseSS', 'UseACC', 'UsePSSM', 'UseDisorder',
        'UseMPSpecificFeatures', 'UseTemplate', 'UseCCM', 'UsePSICOV' and
        'UseOtherPairs' are read. A switch that is absent is treated as off.
    forTrainValidation: when True, every protein must provide 'atomDistMatrix'.

    Returns a list of dicts with keys 'name', 'sequence', 'seqLen',
    'seqFeatures', 'matrixFeatures' and, depending on the data and the model,
    'embedFeatures' and 'atomLabelMatrix' (a dict indexed by response).

    The process exits with status -1 when files is empty or when required
    information is absent from a record.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    ## load the files one by one so that every file handle is closed even when unpickling fails
    ## NOTE(review): unpickling executes code embedded in the file; only load feature files from trusted sources
    data = []
    for infile in files:
        with open(infile, 'rb') as fh:
            data.extend(cPickle.load(fh, encoding='latin1'))

    def _enabled(key):
        ## an option is on only when it is present and exactly True
        return key in modelSpecs and (modelSpecs[key] is True)

    ## each protein has sequential and pairwise features as input and distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features
        if _enabled('UseSS'):
            seqMatrices.append(d['SS3'])
        if _enabled('UseACC'):
            seqMatrices.append(d['ACC'])
        if _enabled('UsePSSM'):
            seqMatrices.append(d['PSSM'])
        if _enabled('UseDisorder'):
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        if _enabled('UseMPSpecificFeatures'):
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)

            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        useTemplate = 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']

        ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        if useTemplate:
            if 'tplSimScore' not in d:
                print('the data has no key tplSimScore, which is needed since you specify to use template information')
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print('The number of features for query-template similarity shall be equal to 11')
                exit(-1)
            seqMatrices.append(d['tplSimScore'])

        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []

        ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 )
        pairfeatures.append(LocationFeature(d))
        pairfeatures.append(CubeRootFeature(d))

        if _enabled('UseCCM'):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ', d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        ## guarded like the other switches, so a model without this key simply skips the feature
        if _enabled('UsePSICOV'):
            pairfeatures.append(d['psicovZ'])

        if _enabled('UseOtherPairs'):
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification later
        if useTemplate:
            if 'tplDistMatrix' not in d:
                print('the data for ', d['name'],
                      ' has no tplDistMatrix, which is needed since you specify to use template information')
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                ## distances below 3.5 are raised to 3.5 and invalid (negative) entries are set to 50 before taking 3.5/distance
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [strengthMatrix, np.square(strengthMatrix)]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed order. This can avoid errors introduced by different ordering of keys in a python dict() structure
            for apt in ('CbCb', 'CgCg', 'CaCg', 'CaCa', 'NO'):
                pairfeatures.extend(tmpPairFeatures[apt])

        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = np.dstack(tuple(pairfeatures)).astype(np.float32)
        else:
            matrixFeature = np.dstack(tuple(pairfeatures))

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            oneprotein['atomLabelMatrix'] = dict()

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'], ' does not have matrix for ',
                          responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they are binary matrices
                    if responseName.startswith('HB') or responseName.startswith('Beta'):
                        oneprotein['atomLabelMatrix'][response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType], subType.endswith('Plus'))
                        oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrix = DistanceUtils.LogDistMatrix(distm)
                    oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('Normal'):
                    oneprotein['atomLabelMatrix'][response] = distm
                else:
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print('atomic distance matrix is needed for the training and validation data')
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
    """Predict distance/contact probability matrices with an ensemble of models.

    Every model file is unpickled, its network is rebuilt and run on the
    proteins loaded from predFiles. For each protein and response the
    predicted probability matrices of all models covering that response are
    summed and then averaged. The averaged matrices are additionally
    converted into contact probability matrices, and one pickle file named
    NAME.predictedDistMatrix.pkl is written per protein (inside savefolder
    when it is given).

    Args:
        modelFiles: paths of the pickled model specification files.
        predFiles: forwarded unchanged to DataProcessor.LoadDistanceFeatures.
        savefolder: optional folder receiving the per-protein result files.

    Returns:
        A tuple (finalresults, predictedContactMatrices, allsequences) where
        finalresults[name][response] is the averaged probability array,
        predictedContactMatrices[name][labelName] is the derived 2D contact
        matrix and allsequences[name] is the primary sequence of the protein.

    The process is terminated through exit(-1) when the models disagree on
    label types, use an unsupported network, do not match their stored
    parameter values, or when the input data is inconsistent.
    """
    models = _LoadModelSpecs(modelFiles)
    _CheckLabelTypeConsistency(models)

    allsequences = dict()

    ## allresults is a nested dictionary: allresults[proteinName][response] = sum of predicted prob matrices
    ## numModels[proteinName][response] = number of matrices in that sum
    ## two different models may share some overlapping responses
    allresults = dict()
    numModels = dict()

    for model, mfile in zip(models, modelFiles):
        _AccumulatePredictions4OneModel(model, mfile, predFiles, allsequences, allresults, numModels)
        ## everything the helper created for this model is out of scope now
        gc.collect()

    ## the final result is the average of all the predicted prob matrices for the same protein and response
    finalresults = dict()
    for name, results in allresults.items():
        finalresults[name] = dict()
        for response in results:
            averaged = results[response] / numModels[name][response]

            ## make the predicted prob matrices symmetric for some responses
            apt = Response2LabelName(response)
            if config.IsSymmetricAPT(apt):
                averaged = (averaged + np.transpose(averaged, (1, 0, 2))) / 2.

            finalresults[name][response] = averaged

    finalLabelWeights, finalLabelDistributions = _AverageLabelStatistics(models)
    predictedContactMatrices = _Prob2ContactMatrices(finalresults)

    ## for each protein write one file holding the tuple
    ## (name, sequence, predicted prob matrices, predicted contact matrices, label weights, label distributions)
    for name, results in finalresults.items():
        savefilename = name + '.predictedDistMatrix.pkl'
        if savefolder is not None:
            savefilename = os.path.join(savefolder, savefilename)

        record = (name, allsequences[name], results, predictedContactMatrices[name],
                  finalLabelWeights, finalLabelDistributions)
        with open(savefilename, 'wb') as fh:
            cPickle.dump(record, fh, protocol=cPickle.HIGHEST_PROTOCOL)

    return finalresults, predictedContactMatrices, allsequences


def _LoadModelSpecs(modelFiles):
    """Unpickle one model specification from each file in modelFiles."""
    models = []
    for mFile in modelFiles:
        ## NOTE(security): unpickling can execute arbitrary code, so model files must come from a trusted source
        with open(mFile, 'rb') as fh:
            models.append(cPickle.load(fh))
    return models


def _CheckLabelTypeConsistency(models):
    """Exit unless all models use the same label type for the same label name."""
    labelTypes = dict()
    for model in models:
        for response in model['responses']:
            labelName = Response2LabelName(response)
            labelType = Response2LabelType(response)
            if labelName not in labelTypes:
                labelTypes[labelName] = labelType
            elif labelTypes[labelName] != labelType:
                print('WARNING: at least two models have different label types for the same atom pair type.')
                exit(-1)


def _AccumulatePredictions4OneModel(model, mfile, predFiles, allsequences, allresults, numModels):
    """Run one model on the proteins in predFiles and add its predictions to the running sums.

    allsequences, allresults and numModels are updated in place. Only sums
    and counts are kept to reduce memory consumption, which could be huge
    when many deep models are used to predict a large set of proteins.
    """
    if model['network'] not in config.allNetworks:
        print('unsupported network architecture:  %s' % (model['network'],))
        exit(-1)

    distancePredictor, x, y, xmask, ymask, xem, _labelList, _weightList = \
        Model4DistancePrediction.BuildModel(model, forTrain=False)

    inputVariables = [x, y, xmask, ymask]
    if xem is not None:
        inputVariables.append(xem)

    pred_prob = distancePredictor.output_prob
    predict = theano.function(inputVariables, pred_prob, on_unused_input='warn')

    ## set model parameter values
    if not Compatible(distancePredictor.params, model['paramValues']):
        print('FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file:  %s' % (mfile,))
        exit(-1)

    for param, value in zip(distancePredictor.params, model['paramValues']):
        param.set_value(value)

    ## the data is loaded for each model separately since each model may have different requirements
    predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs=model, forTrainValidation=False)

    ## make sure the input has the same number of features as the model; only randomly chosen proteins are checked to speed up
    rindex = np.random.randint(0, high=len(predData))
    assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

    rindex = np.random.randint(0, high=len(predData))
    assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

    if 'embedFeatures' in predData[0]:
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

    ## all the proteins of the same name shall have exactly the same sequence
    for d in predData:
        if d['name'] not in allsequences:
            allsequences[d['name']] = d['sequence']
        elif allsequences[d['name']] != d['sequence']:
            print('Error: inconsistent primary sequence for the same protein in the protein feature files')
            exit(-1)

    ## predSeqData and names are in exactly the same order, so we know which data is for which protein
    predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
    print('#predData:  %s #batches:  %s' % (len(predData), len(predSeqData)))

    ## start and end positions of each response in the last dimension of a prediction;
    ## they depend on the model only, so they are computed once instead of once per batch
    dims = [config.responseProbDims[Response2LabelType(res)] for res in model['responses']]
    endPositions = np.cumsum(dims)
    startPositions = endPositions - dims

    for onebatch, names4onebatch in zip(predSeqData, names):
        batchInput = onebatch[:len(inputVariables)]
        result = predict(*batchInput)

        x1d, _x2d, x1dmask, _x2dmask = batchInput[0:4]
        seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
        maxSeqLen = x1d.shape[1]

        ## result is a 4-d tensor whose last dimension concatenates the predicted prob parameters of all responses
        assert result.shape[3] == sum(dims)

        for name in names4onebatch:
            if name not in allresults:
                allresults[name] = dict()
                numModels[name] = dict()

        for response, start, end in zip(model['responses'], startPositions, endPositions):
            ## 1st dimension: batch, 2nd and 3rd: matrix sizes, 4th: predicted probability parameters
            batchres = result[:, :, :, start:end]

            ## remove masked positions, i.e., keep the trailing seqLen rows and columns
            revised_batchres = [probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                                for probMatrix, seqLen in zip(batchres, seqLens)]

            for res4one, name in zip(revised_batchres, names4onebatch):
                if response not in allresults[name]:
                    allresults[name][response] = res4one
                    numModels[name][response] = np.int32(1)
                else:
                    allresults[name][response] += res4one
                    numModels[name][response] += np.int32(1)

    del predict
    del predData
    del predSeqData


def _AverageLabelStatistics(models):
    """Average, per response, the label weights and reference label distributions stored in the models.

    Returns the tuple (finalLabelWeights, finalLabelDistributions), both keyed by response.
    """
    labelDistributions = dict()
    labelWeights = dict()
    for model in models:
        for response in model['responses']:
            if response not in labelDistributions:
                labelDistributions[response] = []
            if response not in labelWeights:
                labelWeights[response] = []

            labelDistributions[response].append(model['labelRefProbs'][response])
            labelWeights[response].append(model['weight4labels'][response])

    finalLabelDistributions = dict()
    for apt, collected in labelDistributions.items():
        finalLabelDistributions[apt] = np.average(collected, axis=0)

    finalLabelWeights = dict()
    for apt, collected in labelWeights.items():
        finalLabelWeights[apt] = np.average(collected, axis=0)

    return finalLabelWeights, finalLabelDistributions


def _Prob2ContactMatrices(finalresults):
    """Convert each averaged 3D probability matrix into a 2D contact probability matrix.

    Returns a nested dictionary indexed by protein name and then by label name.
    """
    from scipy.stats import norm

    predictedContactMatrices = dict()
    for name, results in finalresults.items():
        predictedContactMatrices[name] = dict()
        for response in results:
            apt = Response2LabelName(response)
            labelType = Response2LabelType(response)
            probs = results[response]

            if apt in config.allAtomPairTypes:
                if labelType.startswith('Discrete'):
                    ## sum the probability of all the labels below the contact definition
                    subType = labelType[len('Discrete'):]
                    labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
                    contactMatrix = np.sum(probs[:, :, :labelOf8], axis=2)
                elif labelType.startswith('Normal'):
                    assert labelType.startswith('Normal1d2')
                    normDistribution = norm(loc=probs[:, :, 0], scale=probs[:, :, 1])
                    contactMatrix = normDistribution.cdf(config.ContactDefinition)
                elif labelType.startswith('LogNormal'):
                    assert labelType.startswith('LogNormal1d2')
                    normDistribution = norm(loc=probs[:, :, 0], scale=probs[:, :, 1])
                    contactMatrix = normDistribution.cdf(np.log(config.ContactDefinition))
                else:
                    print('unsupported label type in response:  %s' % (response,))
                    exit(-1)
            elif apt in ['HB', 'Beta']:
                contactMatrix = probs[:, :, 0]
            else:
                print('unsupported atom type in response:  %s' % (response,))
                exit(-1)

            predictedContactMatrices[name][apt] = contactMatrix

    return predictedContactMatrices
def __init__(self, rng, seqInput, mask_seq=None, modelSpecs = None):
    """
    Assemble the network: a 1D ResNet over the sequence input followed by
    one predictor per response listed in modelSpecs['responses'].

    seqInput has shape (batchSize, seqLen, n_in_seq)
    mask_seq has shape (batchSize, #cols_to_be_masked)
    modelSpecs is the model specification; although it defaults to None it
    is indexed right away, so a real specification must always be passed.

    The process is terminated through exit(-1) when the network type or the
    label type of a response is not supported.
    """
    self.modelSpecs = modelSpecs

    ## errors() walks self.responses in lockstep with self.predictors and self.outputList,
    ## both of which are filled below in the order of modelSpecs['responses']
    self.responses = modelSpecs['responses']

    n_in_seq = modelSpecs['n_in_seq']
    n_hiddens_seq = modelSpecs['conv1d_hiddens']
    seq_repeats = modelSpecs['conv1d_repeats']
    n_hiddens_logreg = modelSpecs['logreg_hiddens']
    hwsz_seq = modelSpecs['halfWinSize_seq']

    self.mask_1d = mask_seq
    self.layers = []

    ## sequence convolution
    if not modelSpecs['network'].startswith('ResNet'):
        print('Unimplemented deep network type:  %s' % (modelSpecs['network'],))
        exit(-1)

    seqConv = ResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats,
                     halfWinSize=hwsz_seq, mask=mask_seq, activation=modelSpecs['activation'],
                     batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
    self.layers.append(seqConv)

    ## conv_out has shape (batchSize, seqLen, seqConv.n_out)
    conv_out = seqConv.output

    ## flatten the batch and sequence dimensions, giving one row of seqConv.n_out features per position
    selected = conv_out.dimshuffle(2, 0, 1).flatten(2).dimshuffle(1, 0)
    n_in4logreg = seqConv.n_out

    self.outputList = []
    self.output_probList = []
    self.predictors = []

    ## parameters and regularizers contributed only by the vonMise and Gauss predictors
    self.params4var = []
    self.paramL14var = 0
    self.paramL24var = 0

    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)

        ## only the vonMise and Gauss predictors expose the *4var attributes
        hasVarParams = True

        if labelType.startswith('vonMise'):
            assert (config.responseValueDims[labelType] == 2)
            predictor = NN4PhiPsi(rng=rng, input=selected, n_in=n_in4logreg,
                                  n_variables=config.responseValueDims[labelType],
                                  n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
        elif labelType.startswith('Gauss'):
            predictor = NN4Normal(rng=rng, input=selected, n_in=n_in4logreg,
                                  n_variables=config.responseValueDims[labelType],
                                  n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
        elif labelType.startswith('Discrete'):
            assert (config.responseValueDims[labelType] == 1)
            predictor = NN4LogReg(rng=rng, input=selected, n_in=n_in4logreg,
                                  n_out=config.responseProbDims[labelType], n_hiddens=n_hiddens_logreg)
            hasVarParams = False
        else:
            print('incorrect response name or label type:  %s' % (res,))
            exit(-1)

        if hasVarParams:
            self.params4var += predictor.params4var
            self.paramL14var += predictor.paramL14var
            self.paramL24var += predictor.paramL24var

        self.layers.append(predictor)
        self.predictors.append(predictor)

        ## restore the (batchSize, seqLen, ...) layout of the per-position outputs
        y_pred = predictor.y_pred.reshape(
            (conv_out.shape[0], conv_out.shape[1], config.responseValueDims[labelType]))
        output_prob = predictor.output.reshape(
            (conv_out.shape[0], conv_out.shape[1], config.responseProbDims[labelType]))

        self.outputList.append(y_pred)
        self.output_probList.append(output_prob)

    ## y_pred is the predicted target value
    ## output4prob contains information for probability distribution of a target value
    self.y_pred = T.concatenate(self.outputList, axis=2)
    self.output4prob = T.concatenate(self.output_probList, axis=2)

    ## collect the parameters and regularizers of all the layers
    self.params = []
    self.paramL1 = 0
    self.paramL2 = 0
    for layer in self.layers:
        self.params += layer.params
        self.paramL1 += layer.paramL1
        self.paramL2 += layer.paramL2