def EvaluateAccuracy(pred_prob, truth, pad_len):
    """Dispatch to the top-accuracy routine that matches the current response.

    The response is not a parameter: it is read from the free variable
    ``currentResponse``, which must be bound in an enclosing/global scope
    before this function is called.

    Args:
        pred_prob: prediction for one sample; rows and columns before
            ``pad_len`` are dropped and the rest is cast to
            ``theano.config.floatX``.
        truth: ground truth for the same sample, sliced the same way.
        pad_len: index of the first row/column to keep (presumably the
            padding length -- confirm against the caller).

    Returns:
        The value returned by the selected ``TopAccuracy*`` helper.

    Exits the process with status -1 when the label type is not one of
    ``LogNormal*``, ``Normal*`` or ``Discrete*``.
    """
    pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                   dtype=theano.config.floatX)
    truth_in_correct_shape = truth[pad_len:, pad_len:]

    labelType = Response2LabelType(currentResponse)
    atomType = Response2LabelName(currentResponse)
    symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

    if labelType.startswith('LogNormal'):
        return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                    truth=truth_in_correct_shape,
                                    symmetric=symmetric)

    if labelType.startswith('Normal'):
        return TopAccuracyNormal(pred=pred_in_correct_shape,
                                 truth=truth_in_correct_shape,
                                 symmetric=symmetric)

    if labelType.startswith('Discrete'):
        subType = labelType[len('Discrete'):]
        if subType.startswith('2C'):
            return TopAccuracy2C(pred=pred_in_correct_shape,
                                 truth=truth_in_correct_shape,
                                 symmetric=symmetric)
        return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                 truth=truth_in_correct_shape,
                                 subType=subType,
                                 symmetric=symmetric)

    # print() call form: the Python 2 print statement does not parse under
    # Python 3, which other parts of this file already target.
    print('unsupported label type in EvaluateAccuracy: ', labelType)
    exit(-1)
def CalcRefState4OneBatch(batch, modelSpecs, minSeqSep=3):
    """Compute the reference label distribution of every response for a batch.

    Args:
        batch: iterable of per-protein dicts; for each discrete response
            ``d['atomLabelMatrix'][response]`` is read.
        modelSpecs: model specification; ``modelSpecs['responses']`` is
            iterated, and ``'UseBoundingBox4RefProbs'`` / ``'maxbatchSize'``
            are consulted for discrete responses.
        minSeqSep: forwarded unchanged to ``CalcLabelProb``.

    Returns:
        ``(allRefProbs, avgLen)``. ``allRefProbs`` maps each response to its
        reference probabilities (a float32 ``[1.]`` array for LogNormal /
        Normal responses). ``avgLen`` is the second value returned by
        ``CalcLabelProb`` for the last discrete response processed, or
        ``None`` when no discrete response exists (this case used to raise
        UnboundLocalError at the return statement).
    """
    allRefProbs = dict()
    avgLen = None

    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        # Continuous responses get a constant placeholder distribution.
        if labelType.startswith(('LogNormal', 'Normal')):
            allRefProbs[response] = np.array([1.]).astype(np.float32)
            continue

        rawMatrices = [d['atomLabelMatrix'][response] for d in batch]

        if 'UseBoundingBox4RefProbs' in modelSpecs and (
                modelSpecs['UseBoundingBox4RefProbs'] is True):
            ## here we sample a sub label matrix using BoundingBox to account
            ## for the real training scenario
            labelMatrices = []
            for lMatrix in rawMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                labelMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
        else:
            labelMatrices = [m.astype(np.int32) for m in rawMatrices]

        allRefProbs[response], avgLen = CalcLabelProb(
            labelMatrices=labelMatrices,
            numLabels=config.responseProbDims[labelType],
            minSeqSep=minSeqSep)

    return allRefProbs, avgLen
def EvaluateSinglePropertyPrediction(prediction, nativeLabelFile):
    """Score the predictions in ``prediction`` against one native label file.

    Args:
        prediction: dict mapping a response name to its predicted values.
        nativeLabelFile: argument handed to ``LoadNativeLabelsFromFile``; the
            loaded object is indexed by ``Response2LabelName(response)`` and
            by ``'Missing'``.

    Returns:
        dict mapping each response to ``np.average`` taken over its
        ``[numResidues, totalError...]`` record.

    Exits the process with status 1 for a response that neither starts with
    ``DISO`` / ``SS`` nor contains ``Phi`` / ``Psi``.
    """
    from DataProcessor import LoadNativeLabelsFromFile

    errors = dict()
    nativeLabels = LoadNativeLabelsFromFile(nativeLabelFile)

    for response, pred in prediction.items():
        native = nativeLabels[Response2LabelName(response)]
        missing = nativeLabels['Missing']

        if response.startswith('DISO'):
            # every residue counts, including the ones flagged as missing
            numResidues = len(pred)
            totalError = sum([p != t for p, t in zip(pred, native)])
            tmpError = np.array([numResidues, totalError])

        elif response.startswith('SS'):
            # only residues whose missing flag is 0 are scored
            numResidues = sum([m == 0 for m in missing])
            totalError = sum(
                [p != t for p, t, m in zip(pred, native, missing) if m == 0])
            tmpError = np.array([numResidues, totalError])

        elif 'Phi' in response or 'Psi' in response:
            # A residue is excluded when it or a direct neighbour is missing;
            # the first and last residues are always excluded.
            invalidResidues = [0] * len(missing)
            lastIndex = len(missing) - 1
            for i, flag in enumerate(missing):
                if flag == 1:
                    invalidResidues[i] = 1
                    if i > 0:
                        invalidResidues[i - 1] = 1
                    if i < lastIndex:
                        invalidResidues[i + 1] = 1
            invalidResidues[0] = 1
            invalidResidues[lastIndex] = 1

            numResidues = sum([m == 0 for m in invalidResidues])

            # take the smaller of |d| and 2*pi - |d| as the error
            err1 = abs(pred - native)
            err2 = np.float32(2 * np.pi) - err1
            err = np.minimum(err1, err2)

            totalError = np.sum(
                [e for e, m in zip(err, invalidResidues) if m == 0], axis=0)
            # NOTE(review): list(totalError) requires per-residue errors to be
            # vectors (pred presumably 2-d) and at least one valid residue.
            tmpError = np.array([numResidues] + list(totalError))

        else:
            print('The Evaluate function not implemented for response: ',
                  response)
            exit(1)

        errors.setdefault(response, []).append(tmpError)

    ## calculate average error
    # NOTE(review): np.average without an axis averages the residue count
    # together with the error sums; EvaluatePropertyPrediction divides the
    # error sums by the count instead. Left unchanged because callers may
    # depend on the current value -- confirm the intended metric.
    avgerrors = dict()
    for response, err in errors.items():
        avgerrors[response] = np.average(err)

    return avgerrors
def EvaluateAccuracy(pred_prob, truth, pad_len):
    """Dispatch to the top-accuracy routine that matches ``currentResponse``.

    ``currentResponse`` is a free variable; the loop further down assigns it
    before handing this function to ``theano.scan``.

    Rows and columns before ``pad_len`` are dropped from both ``pred_prob``
    and ``truth``; the prediction is additionally cast to
    ``theano.config.floatX``. The process exits with status -1 for a label
    type other than LogNormal*, Normal* or Discrete*.
    """
    pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:], dtype=theano.config.floatX)
    truth_in_correct_shape = truth[pad_len:, pad_len:]

    labelType = Response2LabelType(currentResponse)
    atomType = Response2LabelName(currentResponse)
    # these label names are evaluated with symmetric=True
    symmetric = (atomType in ['CaCa', 'CbCb', 'CgCg', 'Beta'])

    if labelType.startswith('LogNormal'):
        return TopAccuracyLogNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
    elif labelType.startswith('Normal'):
        return TopAccuracyNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
    elif labelType.startswith('Discrete'):
        # the remainder after 'Discrete' selects the 2-class or multi-class routine
        subType = labelType[len('Discrete'):]
        if subType.startswith('2C'):
            return TopAccuracy2C(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric)
        else:
            return TopAccuracyMultiC(pred=pred_in_correct_shape, truth=truth_in_correct_shape, subType=subType, symmetric=symmetric)
    else:
        print('unsupported label type in EvaluateAccuracy: ', labelType)
        exit(-1)

# NOTE(review): the statements below reference `self` and `zList` and end in
# `return`, so they appear to be the tail of an enclosing method whose header
# is not part of this excerpt; EvaluateAccuracy above would then be a nested
# helper of that method -- confirm against the full file.
accuracyList = []
for res, out_prob, z, ratio in zip(self.responses, self.output_probList, zList, self.modelSpecs['topRatios']):
    ## currently TopAccuracy only works when the dimension of each z is 3
    assert z.ndim == 3

    # per-sample padding length: mask width minus the sum of the mask entries;
    # zero for every sample when there is no 1d mask
    if self.mask_1d is not None:
        paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d, axis=1)
    else:
        paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)

    # read by EvaluateAccuracy as a free variable
    currentResponse = res
    # not used in the visible code; presumably read by the TopAccuracy*
    # helpers as a free variable -- TODO confirm
    topRatio = ratio

    ## here we use scan to calculate accuracy for each protein
    result, updates = theano.scan(fn=EvaluateAccuracy, outputs_info=None, sequences=[out_prob, z, paddingLens])
    # average the per-sample results over the scanned axis
    accuracy = T.mean(result, axis=0)
    accuracyList.append(accuracy)

return T.stacklists(accuracyList)
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
    """Predict distance/contact matrices with an ensemble of saved models.

    For every model file the network is rebuilt, its parameters are restored,
    the features in ``predFiles`` are loaded with that model's specification
    and predictions are made batch by batch. Predictions for the same protein
    and response are summed over models and then averaged.

    Args:
        modelFiles: paths of pickled model specifications (one model each).
        predFiles: feature files forwarded to
            ``DataProcessor.LoadDistanceFeatures``.
        savefolder: directory for the per-protein output files; the current
            working directory is used when ``None``.

    Returns:
        ``(finalresults, predictedContactMatrices, allsequences)`` where
        ``finalresults[name][response]`` is the averaged prediction,
        ``predictedContactMatrices[name][labelName]`` the derived contact
        probability matrix and ``allsequences[name]`` the primary sequence.

    Side effects:
        Writes ``<name>.predictedDistMatrix.pkl`` for every protein. Prints a
        message and exits the process with status -1 on inconsistent models,
        unsupported networks/labels or inconsistent sequences.
    """
    ## load all the models from the files. Each file contains specification for one model.
    # NOTE(review): unpickling executes code embedded in the file; only load
    # model files that come from a trusted source.
    models = []
    for mFile in modelFiles:
        with open(mFile, 'rb') as fh:
            models.append(cPickle.load(fh))

    ## check consistency among models. All the models shall have the same
    ## labelType for the same atom pair type
    labelTypes = dict()
    for model in models:
        for response in model['responses']:
            labelName = Response2LabelName(response)
            labelType = Response2LabelType(response)
            if labelName not in labelTypes:
                labelTypes[labelName] = labelType
            elif labelTypes[labelName] != labelType:
                print('WARNING: at least two models have different label types for the same atom pair type.')
                exit(-1)

    allsequences = dict()

    ## allresults is a nested dictionary: allresults[proteinName][response] is
    ## the running SUM of the predicted prob matrices over all models seen so
    ## far; numModels[proteinName][response] counts the contributions.
    ## Two different models may share some overlapping responses.
    allresults = dict()
    numModels = dict()

    for model, mfile in zip(models, modelFiles):
        if model['network'] not in config.allNetworks:
            print('unsupported network architecture: ', model['network'])
            exit(-1)

        distancePredictor, x, y, xmask, ymask, xem, labelList, weightList = \
            Model4DistancePrediction.BuildModel(model, forTrain=False)

        inputVariables = [x, y, xmask, ymask]
        if xem is not None:
            inputVariables.append(xem)

        pred_prob = distancePredictor.output_prob
        predict = theano.function(inputVariables, pred_prob,
                                  on_unused_input='warn')

        ## set model parameter values
        if not Compatible(distancePredictor.params, model['paramValues']):
            print('FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file: ', mfile)
            exit(-1)

        for param, value in zip(distancePredictor.params, model['paramValues']):
            param.set_value(value)

        ## We shall load these files for each model separately since each
        ## model may have different requirement of the data
        predData = DataProcessor.LoadDistanceFeatures(
            predFiles, modelSpecs=model, forTrainValidation=False)

        ## make sure the input has the same number of features as the model.
        ## We do random check here to speed up
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

        if 'embedFeatures' in predData[0]:
            rindex = np.random.randint(0, high=len(predData))
            assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

        ## check if all the proteins of the same name have exactly the same sequence
        for d in predData:
            if d['name'] not in allsequences:
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print('Error: inconsistent primary sequence for the same protein in the protein feature files')
                exit(-1)

        ## predSeqData and names are in the exactly the same order, so we know
        ## which data is for which protein
        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=624, modelSpecs=model)
        print('#predData: ', len(predData), '#batches: ', len(predSeqData))

        for onebatch, names4onebatch in zip(predSeqData, names):
            batchInput = onebatch[:len(inputVariables)]
            result = predict(*batchInput)

            x1d, x2d, x1dmask, x2dmask = batchInput[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ## result is a 4-d tensor. The last dimension is the concatenation
            ## of the predicted prob parameters for all responses in this model
            dims = [config.responseProbDims[Response2LabelType(res)]
                    for res in model['responses']]
            assert result.shape[3] == sum(dims)

            ## start and end positions of each response in the last dimension
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for name in names4onebatch:
                if name not in allresults:
                    allresults[name] = dict()
                    numModels[name] = dict()

            for response, start, end in zip(model['responses'],
                                            startPositions, endPositions):
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd
                ## dimensions are contact/distance matrix sizes and the 4th is
                ## for the predicted probability parameters
                batchres = result[:, :, :, start:end]

                ## remove masked positions: keep the last seqLen rows/columns
                revised_batchres = [
                    probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                    for probMatrix, seqLen in zip(batchres, seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if response not in allresults[name]:
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save only sum to reduce memory consumption,
                        ## which could be huge when many deep models are used
                        ## to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

        del predict
        del predData
        del predSeqData
        gc.collect()

    ## calculate the final result, which is the average of all the predicted
    ## prob matrices for the same protein and response
    finalresults = dict()
    for name, results in allresults.items():
        if name not in finalresults:
            finalresults[name] = dict()

        for response in results.keys():
            averaged = allresults[name][response] / numModels[name][response]

            ## make the predicted distance prob matrices symmetric for some
            ## responses. This also slightly improves accuracy.
            apt = Response2LabelName(response)
            if config.IsSymmetricAPT(apt):
                averaged = (averaged + np.transpose(averaged, (1, 0, 2))) / 2.

            finalresults[name][response] = averaged

    ## collect the average label distributions and weight matrix. We collect
    ## all the matrices and then calculate their average.
    labelDistributions = dict()
    labelWeights = dict()
    for model in models:
        for response in model['responses']:
            labelDistributions.setdefault(response, []).append(
                model['labelRefProbs'][response])
            labelWeights.setdefault(response, []).append(
                model['weight4labels'][response])

    finalLabelDistributions = dict()
    finalLabelWeights = dict()
    for apt in labelDistributions.keys():
        finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
    for apt in labelWeights.keys():
        finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)

    ## convert the predicted distance probability matrix into a predicted
    ## contact matrix. Each predicted prob matrix has 3 dimensions while each
    ## predicted contact matrix has 2 dimensions
    predictedContactMatrices = dict()
    from scipy.stats import norm
    for name, results in finalresults.items():
        predictedContactMatrices[name] = dict()
        for response in results.keys():
            apt = Response2LabelName(response)
            labelType = Response2LabelType(response)
            probs = finalresults[name][response]

            if apt in config.allAtomPairTypes:
                if labelType.startswith('Discrete'):
                    # sum the probability of all labels below labelOf8
                    subType = labelType[len('Discrete'):]
                    labelOf8 = DistanceUtils.LabelsOfOneDistance(
                        config.ContactDefinition, config.distCutoffs[subType])
                    predictedContactMatrices[name][apt] = np.sum(
                        probs[:, :, :labelOf8], axis=2)

                elif labelType.startswith('Normal'):
                    assert labelType.startswith('Normal1d2')
                    # channel 0 is used as the mean, channel 1 as the scale
                    normDistribution = norm(loc=probs[:, :, 0],
                                            scale=probs[:, :, 1])
                    predictedContactMatrices[name][apt] = \
                        normDistribution.cdf(config.ContactDefinition)

                elif labelType.startswith('LogNormal'):
                    assert labelType.startswith('LogNormal1d2')
                    normDistribution = norm(loc=probs[:, :, 0],
                                            scale=probs[:, :, 1])
                    predictedContactMatrices[name][apt] = \
                        normDistribution.cdf(np.log(config.ContactDefinition))

                else:
                    print('unsupported label type in response: ', response)
                    exit(-1)

            elif apt in ['HB', 'Beta']:
                predictedContactMatrices[name][apt] = probs[:, :, 0]

            else:
                print('unsupported atom type in response: ', response)
                exit(-1)

    ## write all the results here
    ## for each protein, we have an output file, which deposits the tuple
    ## (name, sequence, predicted distance probabilities,
    ##  predicted contact matrices, label weights, label distributions)
    for name, results in finalresults.items():
        savefilename = name + '.predictedDistMatrix.pkl'
        if savefolder is not None:
            savefilename = os.path.join(savefolder, savefilename)

        with open(savefilename, 'wb') as fh:
            cPickle.dump((name, allsequences[name], results,
                          predictedContactMatrices[name], finalLabelWeights,
                          finalLabelDistributions),
                         fh, protocol=cPickle.HIGHEST_PROTOCOL)

    return finalresults, predictedContactMatrices, allsequences
def EvaluatePropertyPrediction(predictions, nativefolder):
    """Score property predictions of many proteins against native labels.

    Args:
        predictions: dict mapping a protein name to a dict
            ``{response: predicted values}``.
        nativefolder: forwarded to ``LoadNativeLabels`` together with the
            protein name and its list of responses. Proteins for which that
            loader returns ``None`` are skipped.

    Returns:
        ``(avgErrPerTarget, avgErrPerResidue, allerrors)``:
        ``avgErrPerTarget[response]`` is the mean over proteins of each
        protein's error divided by its residue count,
        ``avgErrPerResidue[response]`` is the mean error sum divided by the
        mean residue count, and ``allerrors[response][name]`` is the
        per-protein ratio.

    Exits the process with status 1 for a response that neither starts with
    ``DISO`` / ``SS`` nor contains ``Phi`` / ``Psi``.
    """
    from DataProcessor import LoadNativeLabels

    errors = dict()
    # Protein names are tracked per response so that every error row stays
    # paired with the protein that produced it, even when some protein lacks
    # a response that others have.
    namesPerResponse = dict()

    for name, preds in predictions.items():
        nativeLabels = LoadNativeLabels(name, nativefolder, list(preds))
        if nativeLabels is None:
            continue

        for response, pred in preds.items():
            native = nativeLabels[Response2LabelName(response)]
            missing = nativeLabels['Missing']

            if response.startswith('DISO'):
                # every residue counts, including the ones flagged as missing
                numResidues = len(pred)
                totalError = sum([p != t for p, t in zip(pred, native)])
                tmpError = np.array([numResidues, totalError])

            elif response.startswith('SS'):
                # only residues whose missing flag is 0 are scored
                numResidues = sum([m == 0 for m in missing])
                totalError = sum([
                    p != t for p, t, m in zip(pred, native, missing) if m == 0
                ])
                tmpError = np.array([numResidues, totalError])

            elif 'Phi' in response or 'Psi' in response:
                # A residue is excluded when it or a direct neighbour is
                # missing; the first and last residues are always excluded.
                invalidResidues = [0] * len(missing)
                lastIndex = len(missing) - 1
                for i, flag in enumerate(missing):
                    if flag == 1:
                        invalidResidues[i] = 1
                        if i > 0:
                            invalidResidues[i - 1] = 1
                        if i < lastIndex:
                            invalidResidues[i + 1] = 1
                invalidResidues[0] = 1
                invalidResidues[lastIndex] = 1

                numResidues = sum([m == 0 for m in invalidResidues])

                # take the smaller of |d| and 2*pi - |d| as the error
                err1 = abs(pred - native)
                err2 = np.float32(2 * np.pi) - err1
                err = np.minimum(err1, err2)

                totalError = np.sum(
                    [e for e, m in zip(err, invalidResidues) if m == 0],
                    axis=0)
                tmpError = np.array([numResidues] + list(totalError))

            else:
                print('The Evaluate function not implemented for response: ',
                      response)
                exit(1)

            errors.setdefault(response, []).append(tmpError)
            namesPerResponse.setdefault(response, []).append(name)

    ## calculate average error
    avgErrPerTarget = dict()
    avgErrPerResidue = dict()
    allerrors = dict()

    for response, e in errors.items():
        # one row per protein: column 0 is the residue count, the remaining
        # columns are error sums
        err = np.array(e)

        err_avg = np.average(err, axis=0)
        avgErrPerResidue[response] = err_avg[1:] * 1. / err_avg[0]

        # NOTE(review): a protein with zero scored residues yields nan/inf here
        ind_err = np.divide(err[:, 1:] * 1.0, err[:, 0:1])
        avgErrPerTarget[response] = np.average(ind_err, axis=0)

        allerrors[response] = dict(zip(namesPerResponse[response], ind_err))

    return avgErrPerTarget, avgErrPerResidue, allerrors
def MergeOneProtein(inputFiles, method):
    """Merge several predicted distance-probability files of one protein.

    Args:
        inputFiles: at least two files readable by
            ``DistanceUtils.LoadRawDistProbFile``. Each yields
            ``(name, sequence, distProb, contactProb, labelWeight,
            labelDistribution)``. The part of ``name`` before its last ``-``
            and the sequence must agree across files (checked with
            ``assert``).
        method: ``'amean'`` selects the arithmetic mean; any other value
            selects the geometric mean renormalised over the last axis.

    Returns:
        ``(contactMatrixProb, content4save)`` where ``content4save`` is
        ``(targetName, sequence, distMatrixProb, contactMatrixProb,
        labelWeight or None, labelDistribution)`` and ``targetName`` is the
        shared name prefix joined by ``-`` with the last name component of
        every input.

    Exits the process with status -1 when fewer than two files are given,
    when only some inputs carry a labelWeight, or when a merged response is
    not of a ``Discrete*`` label type.
    """
    if inputFiles is None or len(inputFiles) < 2:
        print('Please provide at least two predicted matrices for merge')
        exit(-1)

    seqName = None
    sequence = None

    distProbs = dict()
    contactProbs = dict()
    labelDistributions = dict()
    labelWeights = dict()
    labelWeightFlags = []
    tempNames = []

    for inputFile in inputFiles:
        content = DistanceUtils.LoadRawDistProbFile(inputFile)
        name0, sequence0, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content

        # name0 is '<sequence name>-<last component>'; the sequence name may
        # itself contain '-'
        nameParts = name0.split('-')
        seqName0 = '-'.join(nameParts[0:-1])
        tempNames.append(nameParts[-1])

        labelWeightFlags.append(labelWeight is not None)

        if seqName is None:
            seqName = seqName0
        else:
            assert seqName == seqName0

        if sequence is None:
            sequence = sequence0
        else:
            assert sequence == sequence0

        for apt in predictedDistProb.keys():
            distProbs.setdefault(apt, []).append(predictedDistProb[apt])

        for apt in predictedContactProb.keys():
            contactProbs.setdefault(apt, []).append(predictedContactProb[apt])

        if labelWeight is not None:
            for apt in labelWeight.keys():
                labelWeights.setdefault(apt, []).append(labelWeight[apt])

        for apt in labelDistribution.keys():
            labelDistributions.setdefault(apt, []).append(labelDistribution[apt])

    ## check consistency among labelWeightFlags
    consistent = all(flag == labelWeightFlags[0] for flag in labelWeightFlags)
    if not consistent:
        print('ERROR: the input matrix files have inconsistent format. Some have a labelWeight while others do not.')
        exit(-1)

    ### Ms is a dictionary, each value in Ms is a list of matrices
    ### this function calculates the geometric mean of all the matrices in the
    ### same list and then renormalizes the last dim of the resultant mean
    def CalcGeometricMean(Ms):
        result = dict()
        for apt, v in Ms.items():
            gmean = scipy.stats.mstats.gmean(v, axis=0)
            result[apt] = gmean / np.sum(gmean, axis=-1, keepdims=True)
        return result

    ## calculate arithmetic mean
    def CalcArithmeticMean(Ms):
        return dict((apt, np.mean(v, axis=0)) for apt, v in Ms.items())

    if method == 'amean':
        distMatrixProb = CalcArithmeticMean(distProbs)
        labelDistribution = CalcArithmeticMean(labelDistributions)
    else:
        distMatrixProb = CalcGeometricMean(distProbs)
        labelDistribution = CalcGeometricMean(labelDistributions)

    # the contact matrices are re-derived from the merged distance
    # probabilities; the per-file contact matrices collected above are not
    # used for this
    contactMatrixProb = dict()
    for k in distMatrixProb.keys():
        apt = Response2LabelName(k)
        labelType = Response2LabelType(k)

        if not labelType.startswith('Discrete'):
            print('ERROR: this labelType currently not supported in TPLMergePredicteDistMatrix.py : ', labelType)
            exit(-1)

        subType = labelType[len('Discrete'):]
        labelOf8 = DistanceUtils.LabelsOfOneDistance(
            config.ContactDefinition, config.distCutoffs[subType])
        contactMatrixProb[apt] = ContactUtils.Distance2Contact(
            distMatrixProb[k], labelOf8)

    # label weights are always merged arithmetically, regardless of `method`
    if labelWeightFlags[0]:
        mergedLabelWeight = CalcArithmeticMean(labelWeights)
    else:
        mergedLabelWeight = None

    targetName = '-'.join([seqName] + tempNames)
    content4save = (targetName, sequence, distMatrixProb, contactMatrixProb,
                    mergedLabelWeight, labelDistribution)

    return contactMatrixProb, content4save
def CalcLabelDistributionAndWeight(data=None, modelSpecs=None):
    """Derive reference label distributions and label weights for a model.

    ``modelSpecs`` is updated in place with the keys ``weight4range``,
    ``weight4Discrete3C``, ``weight4HB_Discrete2C``,
    ``weight4Beta_Discrete2C``, ``weight4continuous``, ``labelRefProbs`` and
    ``weight4labels``.

    Args:
        data: iterable of per-protein dicts; for each discrete response
            ``d['atomLabelMatrix'][response]`` is read.
        modelSpecs: model specification (required despite the ``None``
            default). ``'responses'`` must be present; ``'weight4range'``,
            ``'LRbias'`` and ``'UseBoundingBox4RefProbs'`` are optional.

    Returns:
        ``(modelSpecs['labelRefProbs'], modelSpecs['weight4labels'])``, both
        dicts keyed by response.

    Exits the process with status -1 for a response whose label type is not
    LogNormal*, Normal* or Discrete*.
    """
    ## weight for different ranges (long, medium, short, and near-ranges)
    if 'weight4range' not in modelSpecs:
        modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape(
            (4, 1)).astype(np.float32)
    else:
        # reshape()/astype() return new arrays; the result has to be stored
        # back (it used to be discarded, leaving the user value untouched)
        modelSpecs['weight4range'] = np.asarray(
            modelSpecs['weight4range']).reshape((4, 1)).astype(np.float32)
    print('weight for range: ', modelSpecs['weight4range'])

    ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15
    # 'mid' is the default bias; resolving it up front also keeps the print
    # below from raising KeyError when 'LRbias' is absent
    lrBias = modelSpecs['LRbias'] if 'LRbias' in modelSpecs else 'mid'
    modelSpecs['weight4Discrete3C'] = np.multiply(
        config.weight43C[lrBias], modelSpecs['weight4range'])
    print('LRbias= ', lrBias, 'weight43C= ', modelSpecs['weight4Discrete3C'])

    ## weight for 2C
    modelSpecs['weight4HB_Discrete2C'] = np.multiply(
        config.weight4HB2C, modelSpecs['weight4range'])
    modelSpecs['weight4Beta_Discrete2C'] = np.multiply(
        config.weight4Beta2C, modelSpecs['weight4range'])

    ## weight for real value
    modelSpecs['weight4continuous'] = np.multiply(
        np.array([1.] * 4).reshape((4, 1)).astype(np.float32),
        modelSpecs['weight4range'])

    useBoundingBox = 'UseBoundingBox4RefProbs' in modelSpecs and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        # continuous responses get a constant (4, 1) placeholder
        if labelType.startswith(('LogNormal', 'Normal')):
            allRefProbs[response] = np.array([1.] * 4).reshape(
                (4, 1)).astype(np.float32)
            continue

        rawMatrices = [d['atomLabelMatrix'][response] for d in data]

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account
            ## for the real training scenario
            labelMatrices = []
            for lMatrix in rawMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                labelMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
        else:
            labelMatrices = [m.astype(np.int32) for m in rawMatrices]

        allRefProbs[response] = DistanceUtils.CalcLabelProb(
            data=labelMatrices,
            numLabels=config.responseProbDims[labelType])

    modelSpecs['labelRefProbs'] = allRefProbs

    ## for discrete labels, we calculate their weights by inferring from the
    ## weight initialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes
    ## inference easier
    modelSpecs['weight4labels'] = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)

        if labelType.startswith(('LogNormal', 'Normal')):
            ## just need to assign range weight
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if labelType.startswith('Discrete'):
            subType = labelType[len('Discrete'):]

            ## if the response is for HB and BetaPairing
            if subType.startswith('2C'):
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4' + response]
                continue

            ## if the response is 3C for normal atom pairs such as Cb-Cb,
            ## Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('3C'):
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4Discrete3C']
                continue

            ## calculate label weight for 12C, 25C, and 52C for the normal
            ## atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            modelSpecs['weight4labels'][response] = \
                DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'],
                    allRefProbs[response],
                    config.distCutoffs[subType])
            continue

        print('unsupported response in CalcLabelDistributionAndWeight: ',
              response)
        exit(-1)

    return modelSpecs['labelRefProbs'], modelSpecs['weight4labels']
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Load pickled protein records and assemble model inputs and labels.

    Args:
        files: paths of pickle files; each must unpickle to an iterable of
            per-protein dicts.
        modelSpecs: model specification; its ``Use*`` switches select the
            features that are assembled and ``modelSpecs['responses']``
            selects the label matrices.
        forTrainValidation: when true, a record without ``atomDistMatrix``
            is a fatal error.

    Returns:
        list with one dict per protein holding ``name``, ``sequence``,
        ``seqLen``, ``seqFeatures``, ``matrixFeatures``, optionally
        ``embedFeatures`` and, when the record carries ``atomDistMatrix``,
        ``atomLabelMatrix`` keyed by response.

    Exits the process with status -1 when ``files`` is empty or a required
    feature/label is absent from a record.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    # NOTE(review): unpickling executes code embedded in the file; only load
    # feature files that come from a trusted source.
    data = []
    for path in files:
        with open(path, 'rb') as fh:
            data.extend(cPickle.load(fh, encoding='latin1'))

    ## each protein has sequential and pairwise features as input and
    ## distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of
        ## residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the
        ## other features, why?
        if 'UseSS' in modelSpecs and (modelSpecs['UseSS'] is True):
            seqMatrices.append(d['SS3'])

        if 'UseACC' in modelSpecs and (modelSpecs['UseACC'] is True):
            seqMatrices.append(d['ACC'])

        if 'UsePSSM' in modelSpecs and (modelSpecs['UsePSSM'] is True):
            seqMatrices.append(d['PSSM'])

        if 'UseDisorder' in modelSpecs and modelSpecs['UseDisorder'] is True:
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        useMPSpecificFeatures = 'UseMPSpecificFeatures' in modelSpecs and (
            modelSpecs['UseMPSpecificFeatures'] is True)
        if useMPSpecificFeatures:
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)

            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        useTemplate = 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']

        ## Add sequence-template similarity score here. This is used to
        ## predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        if useTemplate:
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)

            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)

            seqMatrices.append(d['tplSimScore'])

        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []

        ##add one specific location feature here, i.e.,
        ##posFeature[i, j]=min(1, abs(i-j)/30.0 )
        posFeature = LocationFeature(d)
        pairfeatures.append(posFeature)

        cbrtFeature = CubeRootFeature(d)
        pairfeatures.append(cbrtFeature)

        if 'UseCCM' in modelSpecs and (modelSpecs['UseCCM'] is True):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        # guarded like every other optional switch; an absent key used to
        # raise KeyError here
        if 'UsePSICOV' in modelSpecs and (modelSpecs['UsePSICOV'] is True):
            pairfeatures.append(d['psicovZ'])

        if 'UseOtherPairs' in modelSpecs and (modelSpecs['UseOtherPairs'] is
                                              True):
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification
        ##later. Somewhere we shall also write code to add template-related
        ##sequential features such as secondary structure?
        if useTemplate:
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of
            ## inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid
                ##(due to gaps or disorder) since they shall be same
                ##regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                # strength = 3.5 / distance, with distances below 3.5 raised
                # to 3.5 and entries below -0.01 replaced by 50
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed
            ## order, so the channel layout does not depend on dict ordering
            for apt in ('CbCb', 'CgCg', 'CaCg', 'CaCa', 'NO'):
                pairfeatures.extend(tmpPairFeatures[apt])

        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = np.dstack(tuple(pairfeatures)).astype(np.float32)
        else:
            matrixFeature = np.dstack(tuple(pairfeatures))

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            oneprotein['atomLabelMatrix'] = dict()

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have
                ## information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since
                    ## they are binary matrices
                    if responseName.startswith(('HB', 'Beta')):
                        oneprotein['atomLabelMatrix'][response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrix = DistanceUtils.LogDistMatrix(distm)
                    oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('Normal'):
                    oneprotein['atomLabelMatrix'][response] = distm

                else:
                    # `response` is the loop variable; the undefined name
                    # `res` used here before raised NameError instead of
                    # printing the message
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def LoadPropertyFeatures(files=None, modelSpecs=None, forTrainValidation=True): if files is None or len(files) == 0: print 'ERROR: the feature files is empty' exit(1) data = [] for infile in files: with open(infile, 'rb') as fh: data.extend(cPickle.load(fh)) EmbeddingModel = None if modelSpecs.has_key( 'UseSequenceEmbedding') and modelSpecs['UseSequenceEmbedding']: EmbeddingModelFile = os.path.join( os.environ['DL4PropertyPredHome'], 'data', 'Mofrad-PLoSOne-2015Nov.3GramEmbeddingParams.pkl') EmbeddingModel = SequenceEmbedding.LoadEmbeddingParamsInPKL( EmbeddingModelFile) ## each protein has sequential features as input proteinFeatures = [] counter = 0 for d in data: oneprotein = dict() oneprotein['name'] = d['name'] ##collecting sequential features... seqMatrices = [] seqMatrices.append(d['PSSM']) ##seqMatrices.append( d['PSFM'] ) ##Load sequence embedding features here if EmbeddingModel is not None: seqMatrices.append( SequenceEmbedding.EmbedOneSequence(d['sequence'], EmbeddingModel)) if modelSpecs.has_key('UsePSFM') and modelSpecs['UsePSFM']: seqMatrices.append(d['PSFM']) if modelSpecs.has_key( 'UseOneHotEncoding') and modelSpecs['UseOneHotEncoding']: seqMatrices.append(config.SeqOneHotEncoding(d['sequence'])) ## add template similarity score here if modelSpecs.has_key('UseTemplate') and modelSpecs['UseTemplate']: #print 'Using template similarity score...' 
if not d.has_key('tplSimScore'): print 'ERROR: no tplSimScore for target', d[ 'name'], 'which is needed since you specify to use template information' exit(1) if d['tplSimScore'].shape[1] != 10: print 'ERROR: the number of query-template similarity features is not 10 in data for', d[ 'name'] exit(1) if not d.has_key('tplProperties'): print 'ERROR: no tplProperties for target', d[ 'name'], 'which is needed since you specify to use template information' exit(1) if d['tplProperties'].shape[1] < 15: print 'ERROR: #template local structure properties shall be at least 15 for target', d[ 'name'] exit(1) ## the query-template similarity score shall be arranged in the order of: AA identity (binary), blosum80, blosum62, blosum45, spScore, spScore_ST, ppScore, pmScore, cc, hdsm seqMatrices.append(d['tplSimScore']) ##we do not use omg information from the template, the first 8 features shall be the 8-state secondary structure, then followed by pACC, CNa, CNb, Phi, Psi, Theta and Tau #seqMatrices.append( d['tplProperties'][:,:15] ) seqMatrices.append(d['tplProperties'][:, :8]) for r in modelSpecs['responses']: if r.startswith('ACC'): seqMatrices.append(d['tplProperties'][:, 8:9]) elif r.startswith('Phi') or r.startswith( 'Psi') or r.startswith('CLE'): seqMatrices.append(d['tplProperties'][:, 11:13]) elif r.startswith('Theta') or r.startswith('Tau'): seqMatrices.append(d['tplProperties'][:, 13:15]) elif r.startswith('CNa') or r.startswith('CNb'): seqMatrices.append(d['tplProperties'][:, 9:11]) else: print 'ERROR: unsupported response', r exit(1) if d.has_key('otherSeqFeatures'): seqMatrices.append(d['otherSeqFeatures']) ## all the features shall have shape (seqLen, nFeatures) where nFeatures is variable, but seqLen is the sequence length of one protein seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32) oneprotein['sequence'] = d['sequence'] oneprotein['seqLen'] = seqFeature.shape[0] oneprotein['seqFeatures'] = seqFeature if not d.has_key('DISO') and 
d.has_key('Missing'): d['DISO'] = d['Missing'] ##collecting labels... for r in modelSpecs['responses']: labelName = Response2LabelName(r) labelType = Response2LabelType(r) if not d.has_key(labelName) and forTrainValidation: print 'ERROR: missing label information for protein ', d[ 'name'], ' and response ', r exit(1) elif not d.has_key(labelName): continue labels = d[labelName] ## need some special handling of discrete labels if labelType.startswith('Discrete'): if r.startswith('SS3'): labels = np.array([ PropertyUtils.SS3Letter2Code[c] for c in labels ]).reshape((-1, 1)) elif r.startswith('SS8'): labels = np.array([ PropertyUtils.SS8Letter2Code[c] for c in labels ]).reshape((-1, 1)) elif r.startswith('ACC') or r.startswith('DISO'): labels = labels.reshape((-1, 1)) elif r.startswith('CLE'): labels = np.array([ PropertyUtils.CLELetter2Code[c] for c in labels ]).reshape((-1, 1)) else: print 'ERROR: please specify how to convert your discrete labels to numbers for response ', r exit(1) oneprotein[labelName] = labels ##at this point, finish collecting features and labels for one protein if d.has_key('Missing'): oneprotein['missing'] = d['Missing'] elif forTrainValidation: print 'ERROR: for training data, we need information to specify which residues have no 3D coordinates' exit(1) proteinFeatures.append(oneprotein) counter += 1 if (counter % 500 == 1): print 'assembled features and labels for ', counter, ' proteins.' """ tmpfile = open(files[0] + '.contactInput.pkl', 'wb') cPickle.dump(proteinFeatures, tmpfile, protocol = cPickle.HIGHEST_PROTOCOL) tmpfile.close() """ return proteinFeatures
def AssembleOneBatch(data, modelSpecs, forTrainValidation=True): if not data: print 'WARNING: the list of data is empty' return None numSeqs = len(data) seqLens = [d['seqLen'] for d in data] maxSeqLen = max(seqLens) minSeqLen = min(seqLens) #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1]), dtype=theano.config.floatX) ## for mask M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8) ## Y shall be a list of labels, each for one type ##we always need a weight vector to deal with residues without 3D coordinates in training and validation, if modelSpecs['UseSampleWeight']: Y = [] W = [] for res in modelSpecs['responses']: labelType = Response2LabelType(res) labelName = Response2LabelName(res) dataType = (np.int32 if labelType.startswith('Discrete') else theano.config.floatX) if forTrainValidation: if not data[0].has_key(labelName): print 'ERROR: label information is needed for training protein ', data[ 'name'], ' and response ', res exit(1) Y.append( np.zeros(shape=(numSeqs, maxSeqLen, config.responseValueDims[labelType]), dtype=dataType)) if not data[0].has_key('missing'): print 'ERROR: missing information is needed for training protein ', data[ 'name'] exit(1) W.append( np.zeros(shape=(numSeqs, maxSeqLen, 1), dtype=theano.config.floatX)) for j in range(len(data)): seqLen = data[j]['seqLen'] X1d[j, maxSeqLen - seqLen:, :] = data[j]['seqFeatures'] M1d[j, maxSeqLen - seqLen:].fill(1) for y, w, res in zip(Y, W, modelSpecs['responses']): y[j, maxSeqLen - seqLen:, ] = data[j][Response2LabelName(res)] if res.startswith('DISO'): ## for disorder prediction, all the residues shall be considered since those residues without 3D coordinates are positive examples ## we may assign a larger weight to positive examples since they are only 6% of the whole data set w[j, maxSeqLen - seqLen:, ] = np.reshape( data[j]['missing'], (-1, 1)) * (modelSpecs['w4diso'] - 1.) + 1. 
else: ## assign weight 0 to those residues without coordinates, otherwise 1 w[j, maxSeqLen - seqLen:, ] = 1.0 - np.reshape(data[j]['missing'], (-1, 1)) onebatch = [X1d, M1d] onebatch.extend(Y) onebatch.extend(W) return onebatch
def PredictMatrixLabels(models, predictors, names, inputFolders, aliFolders=None, tplFolder=None, aliFile=None, tplFile=None, saveFolder=None): if not isinstance(names, (list, tuple)): targetName = names else: targetName = None ##allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices ##We predict one prob_matrix by each model for each protein and each response and then average them per protein and response to get the final results ##two different models may share common responses allsequences = dict() allresults = dict() ## the results predicted from the real input numModels = dict( ) ## count the number of models that may predict each response for model, predictor in zip(models, predictors): #predict, inputVariables = BuildPredictor(model) predict, inputVariables = predictor ## load data for each model separately since each model may have a different specification if targetName is None: rawData = LoadProteinData4OneModel(model, names, inputFolders, aliFolders, tplFolder) elif aliFile is not None and tplFile is not None: rawData = LoadOneAlignment4OneModel(model, targetName, inputFolders, aliFile, tplFile) else: rawData = LoadOneProteinData4OneModel(model, targetName, inputFolders, aliFolders, tplFolder) predData = DataProcessor.ExtractFeaturesNLabels( rawData, modelSpecs=model, forTrainValidation=False, returnMode='list') ##make sure the input has the same number of features as the model FeatureUtils.CheckModelNDataConsistency(model, predData) ## check sequence consistency for d in predData: name = d['name'] if not allresults.has_key(name): allresults[name] = dict() numModels[name] = dict() if not allsequences.has_key(name): allsequences[name] = d['sequence'] elif allsequences[name] != d['sequence']: print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files' exit(1) predSeqData = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model) print 
'#predData: ', len(predData), '#batches: ', len(predSeqData) ##for onebatch, names4onebatch in zip(predSeqData, names): for minibatch in predSeqData: onebatch, names4onebatch = DataProcessor.AssembleOneBatch( minibatch, model) input = onebatch[:len(inputVariables)] result = predict(*input) ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model assert result.shape[3] == sum([ GetResponseProbDims(response) for response in model['responses'] ]) ## calculate the start and end positions of each response in the last dimension of result dims = [ GetResponseProbDims(response) for response in model['responses'] ] endPositions = np.cumsum(dims) startPositions = endPositions - dims x1d, x2d, x1dmask, x2dmask = input[0:4] seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1) maxSeqLen = x1d.shape[1] for response, start, end in zip(model['responses'], startPositions, endPositions): ## batchres is a batch of result, its ndim=4 ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters batchres = result[:, :, :, start:end] ## remove masked positions revised_batchres = [ probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :] for probMatrix, seqLen in zip(batchres, seqLens) ] for res4one, name in zip(revised_batchres, names4onebatch): if not allresults[name].has_key(response): allresults[name][response] = res4one numModels[name][response] = np.int32(1) else: ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins allresults[name][response] += res4one numModels[name][response] += np.int32(1) ## calculate the final result, which is the average of predictd prob matrices by all models for the same protein and the same response finalresults = dict() for name, results in allresults.iteritems(): if not 
finalresults.has_key(name): finalresults[name] = dict() ## finalresults has 3 dimensions. for response in results.keys(): finalresults[name][response] = (allresults[name][response] / numModels[name][response]).astype( np.float32) ##make the predicted distance prob matrices symmetric for some reponses. This also slightly improves accuracy. labelName = Response2LabelName(response) if config.IsSymmetricLabel(labelName): finalresults[name][response] = ( finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2))) / 2. ## convert predicted distance probability matrix into contact matrix predictedContactMatrices = DeriveContactMatrix(finalresults) ## collect the average label distributions and weight matrix finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution( models) ##write all the results here ## for each protein, we have a output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution) for name, results in finalresults.iteritems(): savefilename = name + '.predictedDistMatrix.pkl' if saveFolder is not None: savefilename = os.path.join(saveFolder, savefilename) if targetName is not None: originalName = targetName else: for n in names: if name.startswith(n): originalName = n break with open(savefilename, 'wb') as fh: #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL) cPickle.dump((originalName, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL) return (predictedContactMatrices, allsequences) """