def ValidDataLoader2(sharedQ, stopValidDataLoader, validSeqData, modelSpecs, assembleData=True, UseSharedMemory=False):
    """Push the validation batches onto sharedQ, one queue item per batch.

    Each batch of validSeqData is loaded with DataProcessor.LoadRealData, checked
    against modelSpecs by FeatureUtils.CheckModelNDataConsistency and, if
    assembleData is true, converted by PrepareInput4Validate before it is queued.

    The loop stops early once stopValidDataLoader is set or os.getppid() returns 1
    (presumably the parent process is gone and this one was re-parented to init
    -- TODO confirm). A message is printed and sharedQ is closed before returning.
    """
    _, _, useRawCCM, useFullMI, useFullCov = config.ParseExtraCCMmode(modelSpecs)

    ## when full coevolution matrices are used, we shall use float16 to save memory
    floatType = np.float16 if (useRawCCM or useFullMI or useFullCov) else theano.config.floatX

    for batch in validSeqData:
        stopRequested = stopValidDataLoader.is_set() or os.getppid() == 1
        if stopRequested:
            break

        ## load real data for one batch and make sure that it has the same
        ## input dimension as the model specification
        batchData = DataProcessor.LoadRealData(batch, modelSpecs, returnMode='list')
        FeatureUtils.CheckModelNDataConsistency(modelSpecs, batchData)

        if not assembleData:
            sharedQ.put(batchData)
            continue

        sharedQ.put(
            PrepareInput4Validate(batchData,
                                  modelSpecs,
                                  floatType=floatType,
                                  UseSharedMemory=UseSharedMemory))

    print('validDataLoader has finished loading data')
    sharedQ.close()
def LoadFeaturePKL(name, location='Feature4Train_2017_E001_PKL/', modelSpecs=None):
    """Load the input features of one protein from the pickle files under location.

    name: protein name; files are looked up as <location>/<name>.<suffix>.
    location: folder holding <name>.inputFeatures.pkl and, when requested by
        modelSpecs, <name>.extraCCM.pkl and <name>.a2m.
    modelSpecs: passed to config.ParseExtraCCMmode to decide which extra
        coevolution features to attach.

    Returns the dict unpickled from <name>.inputFeatures.pkl, extended with
    'CCMFnorm'/'CCMFnormZ', 'sumCCM', 'rawCCM', 'fullCov' and/or 'fullMI' when the
    corresponding mode is enabled.

    When a required file or a required entry of the extra CCM file is missing, an
    error message is printed and the process is terminated through exit(1).
    """
    ## load up the basic input features
    filename = os.path.join(location, name + '.inputFeatures.pkl')
    feature = _LoadPickleFile(filename, 'ERROR: the input feature file does not exist: ')

    ## check to see if loading up extra features
    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)

    if bUseCCMFnorm or bUseCCMsum or bUseCCMraw:
        extrafile = os.path.join(location, name + '.extraCCM.pkl')
        extra = _LoadPickleFile(extrafile, 'ERROR: the file for extra CCM information does not exist: ')

        if bUseCCMFnorm:
            feature['CCMFnorm'] = extra['Fnorm']
            feature['CCMFnormZ'] = extra['FnormZ']

        seqLen = len(feature['sequence'])

        if bUseCCMsum:
            if 'sumCCM' not in extra:
                _ReportErrorAndExit('ERROR: CCM summary is requested, but the file does not have it: ', extrafile)
            feature['sumCCM'] = CCMpredUtils.ExpandMatrix(extra['sumCCM'], seqLen)

        if bUseCCMraw:
            if 'rawCCM' not in extra:
                _ReportErrorAndExit('ERROR: CCM raw matrix is requested, but the file does not have it: ', extrafile)
            feature['rawCCM'] = CCMpredUtils.ExpandMatrix(extra['rawCCM'], seqLen)

    if bUseFullCov or bUseFullMI:
        ## both full matrices are calculated from the same alignment file
        alnfile = os.path.join(location, name + '.a2m')
        if not os.path.isfile(alnfile):
            _ReportErrorAndExit('ERROR: the a2m file does not exist: ', alnfile)

        if bUseFullCov:
            feature['fullCov'] = MSAUtils.CalcPairMatrixFromFile(alnfile)
        if bUseFullMI:
            feature['fullMI'] = MSAUtils.CalcPairMatrixFromFile(alnfile, matrixType='mi')

    return feature


def _ReportErrorAndExit(*parts):
    """Print parts separated by single spaces (the layout a Python 2 print statement
    with several arguments produces) and terminate through exit(1)."""
    print(' '.join(['%s' % (part, ) for part in parts]))
    exit(1)


def _LoadPickleFile(path, missingMessage):
    """Return the object unpickled from path; if path is not a regular file, print
    missingMessage followed by path and exit."""
    if not os.path.isfile(path):
        _ReportErrorAndExit(missingMessage, path)

    ## pickle data is binary, so the file is opened in binary mode.
    ## NOTE: unpickling can execute arbitrary code; only load trusted feature files.
    with open(path, 'rb') as fh:
        return cPickle.load(fh)
def CollectMatrixFeatures(d, modelSpecs, returnMode='array'):
    """Collect the pairwise (matrix) input features of one protein.

    d: dict holding the features of one protein; d['name'] is used in error messages.
    modelSpecs: model specification consulted through the config.* switches to
        decide which features are included.
    returnMode: when it equals 'array' (case-insensitive) the collected features
        are stacked with np.dstack; any other value returns the plain lists.

    Returns a pair (features, features_nomean). The second item holds the
    location-based and template-based features, i.e. those for which no expected
    value is calculated. In 'array' mode both items are arrays; when no no-mean
    feature is collected, the second one is a (seqLen, seqLen, 0) array of dtype
    config.MyFloat.

    If a requested feature is absent from d, an error message is printed and the
    process is terminated through exit(1).
    """
    ##collecting pairwise features...
    pairfeatures_nomean = []  # a set of pairwise features for which we do not calculate their expected value
    pairfeatures = []

    if not config.NoOldLocationFeatures(modelSpecs):
        pairfeatures_nomean.append(FeatureUtils.LocationFeature(d))
        pairfeatures_nomean.append(FeatureUtils.CubeRootFeature(d))

    if config.UseNewLocationFeatures(modelSpecs):
        pairfeatures_nomean.extend(FeatureUtils.NewLocationFeature(d))

    if config.UseCCMZ(modelSpecs):
        _RequirePairFeatures(d, ('ccmpredZ', ), 'ERROR: CCMpredZ is requested, but the data for protein ',
                             ' does not have it!')
        pairfeatures.append(d['ccmpredZ'])

    if config.UseRawCCM(modelSpecs):
        _RequirePairFeatures(d, ('ccmpred', ), 'ERROR: Raw CCMpred is requested, but the data for protein ',
                             ' does not have it!')
        pairfeatures.append(d['ccmpred'])

    if config.UsePSICOV(modelSpecs):
        _RequirePairFeatures(d, ('psicovZ', ), 'ERROR: psicovZ is requested, but the data for protein ')
        pairfeatures.append(d['psicovZ'])

    if config.UseContactPotential(modelSpecs):
        _RequirePairFeatures(d, ('OtherPairs', ),
                             'ERROR: pairwise contact potential is requested, but the data for protein ')
        ## channel 0 of OtherPairs is used as the contact potential
        pairfeatures.append(d['OtherPairs'][:, :, 0])

    if config.UseMI(modelSpecs):
        _RequirePairFeatures(d, ('OtherPairs', ),
                             'ERROR: mutual information is requested, but the data for protein ')
        ## channels 1 and 2 of OtherPairs are used as mutual information
        pairfeatures.append(d['OtherPairs'][:, :, 1:3])

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)

    if bUseCCMFnorm:
        _RequirePairFeatures(d, ('CCMFnorm', 'CCMFnormZ'),
                             'ERROR: CCM Fnorm and/or FnormZ are requested, but the data for protein ')
        pairfeatures.append(d['CCMFnorm'])
        pairfeatures.append(d['CCMFnormZ'])

    if bUseCCMsum:
        _RequirePairFeatures(d, ('sumCCM', ), 'ERROR: CCM summary are requested, but the data for protein ')
        pairfeatures.append(d['sumCCM'])

    if bUseCCMraw:
        _RequirePairFeatures(d, ('rawCCM', ), 'ERROR: CCM raw matrix is requested, but the data for protein ')
        pairfeatures.append(d['rawCCM'])

    if bUseFullMI:
        _RequirePairFeatures(d, ('fullMI', ), 'ERROR: full MI matrix is requested, but the data for protein ')
        pairfeatures.append(d['fullMI'])

    if bUseFullCov:
        _RequirePairFeatures(d, ('fullCov', ),
                             'ERROR: full covariance matrix is requested, but the data for protein ')
        pairfeatures.append(d['fullCov'])

    ##add template-based distance and orientation matrices
    if config.UseTemplate(modelSpecs):
        pairfeatures_nomean.extend(CollectTemplateMatrixFeatures(d, modelSpecs))

    if returnMode.lower() != 'array':
        return pairfeatures, pairfeatures_nomean

    ## NOTE(review): np.dstack raises when pairfeatures is empty, i.e. this mode
    ## expects at least one of the features above to be enabled -- confirm.
    matrixFeature = np.dstack(tuple(pairfeatures))
    if pairfeatures_nomean:
        matrixFeature_nomean = np.dstack(tuple(pairfeatures_nomean))
    else:
        ## keep the return shape uniform: an array with zero channels
        seqLen = matrixFeature.shape[0]
        matrixFeature_nomean = np.zeros((seqLen, seqLen, 0), dtype=config.MyFloat)

    return matrixFeature, matrixFeature_nomean


def _RequirePairFeatures(d, keys, prefix, suffix=' does not have it'):
    """Return quietly when every key in keys is present in d; otherwise print
    prefix, d['name'] and suffix separated by single spaces (the layout a Python 2
    print statement with several arguments produces) and terminate through exit(1)."""
    if all(key in d for key in keys):
        return
    print(' '.join([prefix, '%s' % (d['name'], ), suffix]))
    exit(1)
def LoadFeaturePKL(name, location='Feature4Train_2017_E001_PKL/', modelSpecs=None):
    """Load the input features of one protein from the pickle files under location.

    name: protein name; files are looked up as <location>/<name>.<suffix>.
    location: folder holding <name>.inputFeatures.pkl and, depending on
        modelSpecs, <name>.extraCCM.pkl, <name>.a2m, <name>.cov.pkl and
        <name>.esm2.pkl.
    modelSpecs: passed to config.ParseExtraCCMmode and config.ParseESMmode to
        decide which extra features to attach.

    Returns the dict unpickled from <name>.inputFeatures.pkl, extended with
    'CCMFnorm'/'CCMFnormZ', 'sumCCM', 'rawCCM', 'fullMI', 'fullCov' and/or 'ESM'
    when the corresponding mode is enabled.

    When a required file or entry is missing, an error message is printed and the
    process is terminated through exit(1).
    """
    ## load up the basic input features
    filename = os.path.join(location, name + '.inputFeatures.pkl')
    feature = _LoadPickleFile(filename, 'ERROR: the input feature file does not exist: ')

    ## check to see if loading up extra features
    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)

    if bUseCCMFnorm or bUseCCMsum or bUseCCMraw:
        extrafile = os.path.join(location, name + '.extraCCM.pkl')
        extra = _LoadPickleFile(extrafile, 'ERROR: the file for extra CCM information does not exist: ')

        if bUseCCMFnorm:
            feature['CCMFnorm'] = extra['Fnorm']
            feature['CCMFnormZ'] = extra['FnormZ']

        seqLen = len(feature['sequence'])

        if bUseCCMsum:
            if 'sumCCM' not in extra:
                _ReportErrorAndExit('ERROR: CCM summary is requested, but the file does not have it: ', extrafile)
            feature['sumCCM'] = CCMpredUtils.ExpandMatrix(extra['sumCCM'], seqLen)

        if bUseCCMraw:
            if 'rawCCM' not in extra:
                _ReportErrorAndExit('ERROR: CCM raw matrix is requested, but the file does not have it: ', extrafile)
            feature['rawCCM'] = CCMpredUtils.ExpandMatrix(extra['rawCCM'], seqLen)

    if bUseFullMI:
        alnfile = os.path.join(location, name + '.a2m')
        if not os.path.isfile(alnfile):
            _ReportErrorAndExit('ERROR: the a2m file does not exist: ', alnfile)
        feature['fullMI'] = MSAUtils.CalcPairMatrixFromFile(alnfile, matrixType='mi')

    if bUseFullCov:
        ## prefer the pickled covariance matrix; fall back to calculating it from
        ## the alignment file when the pickle is not there
        covfile = os.path.join(location, name + '.cov.pkl')
        if os.path.isfile(covfile):
            with open(covfile, 'rb') as fh:
                feature['fullCov'] = cPickle.load(fh)
        else:
            alnfile = os.path.join(location, name + '.a2m')
            if not os.path.isfile(alnfile):
                _ReportErrorAndExit('ERROR: the a2m file does not exist:', alnfile)
            feature['fullCov'] = MSAUtils.CalcPairMatrixFromFile(alnfile)

    ## check to see if we shall load up ESM information
    layers = config.ParseESMmode(modelSpecs)
    if layers is not None:
        esmfile = os.path.join(location, name + '.esm2.pkl')
        esm = _LoadPickleFile(esmfile, 'ERROR: the file for ESM information does not exist: ')

        esmfeature = []
        for layer in layers:
            ## the modulo wraps a negative layer number around to a non-negative key
            ## (presumably negative numbers count from the last layer -- TODO confirm)
            layer4key = layer % (esm['numModelLayers'] + 1)
            if layer4key not in esm:
                _ReportErrorAndExit('ERROR: attention weight for layer ', layer,
                                    ' requested but not available in ', esmfile)
            esmfeature.append(esm[layer4key])

        ## merge the requested layers along the third axis
        feature['ESM'] = np.concatenate(esmfeature, axis=2)

    return feature


def _ReportErrorAndExit(*parts):
    """Print parts separated by single spaces (the layout a Python 2 print statement
    with several arguments produces) and terminate through exit(1)."""
    print(' '.join(['%s' % (part, ) for part in parts]))
    exit(1)


def _LoadPickleFile(path, missingMessage):
    """Return the object unpickled from path; if path is not a regular file, print
    missingMessage followed by path and exit."""
    if not os.path.isfile(path):
        _ReportErrorAndExit(missingMessage, path)

    ## pickle data is binary, so the file is opened in binary mode.
    ## NOTE: unpickling can execute arbitrary code; only load trusted feature files.
    with open(path, 'rb') as fh:
        return cPickle.load(fh)
def __init__(self,
             rng,
             seqInput,
             matrixInput,
             mask_seq=None,
             mask_matrix=None,
             embedInput=None,
             boundingbox=None,
             modelSpecs=None):
    """
    Build the symbolic network: a 1d convolution over the sequential input, its
    conversion to pairwise features, a 2d convolution over the merged pairwise
    input and one predictor per response in modelSpecs['responses'].

    rng is passed to every layer constructed here.
    seqInput has shape (batchSize, seqLen, n_in_seq)
    matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix)
    mask_seq has shape (batchSize, #cols_to_be_masked)
    mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen)
    embedInput has shape (batchSize, seqLen, n_in2)
    boundingbox is a vector of 4 integer elements: top, left, bottom and right. boundingbox shall only be applied to the matrix converted from sequential features.
    modelSpecs is the model specification (indexed by string keys); it must not be None.

    Attributes set here: modelSpecs, responses, mask_1d, mask_2d, layers,
    predictors, outputList, output_probList, output, output_prob,
    params4var, paramL14var, paramL24var, params, paramL1 and paramL2.

    The process is terminated through exit(1) when the network type, the
    embedding mode or a response label type is not supported.
    """
    assert (modelSpecs is not None)
    self.modelSpecs = modelSpecs
    self.responses = modelSpecs['responses']

    ## set the number of hidden neurons and number of layers
    n_in_seq = modelSpecs['n_in_seq']
    n_in_matrix = modelSpecs['n_in_matrix']
    n_hiddens_seq = modelSpecs['conv1d_hiddens']
    n_hiddens_matrix = modelSpecs['conv2d_hiddens']
    n_hiddens_logreg = modelSpecs['logreg_hiddens']
    seq_repeats = modelSpecs['conv1d_repeats']
    matrix_repeats = modelSpecs['conv2d_repeats']

    ## half win size for convolutional operation
    ## dilation_seq and dilation_matrix are defined only for DilatedResNet; they are
    ## also read only inside the DilatedResNet branches below
    if modelSpecs['network'].startswith('DilatedResNet'):
        hwsz_matrix = modelSpecs['conv2d_hwszs']
        hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
        dilation_seq = [1] * len(n_hiddens_seq)
        dilation_matrix = modelSpecs['conv2d_dilations']
    else:
        hwsz_matrix = modelSpecs['halfWinSize_matrix']
        hwsz_seq = modelSpecs['halfWinSize_seq']

    ## masks to reduce impact of padding zeros
    self.mask_1d = mask_seq
    self.mask_2d = mask_matrix

    ## every layer appended to self.layers contributes its params and norms at the end
    self.layers = []

    ## relu unless modelSpecs['activation'] is 'TANH'
    act = T.nnet.relu
    if modelSpecs['activation'] == 'TANH':
        act = T.tanh

    # sequence convolution
    if modelSpecs['network'].startswith('DilatedResNet'):
        #seqConv = DilatedResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, dilation=dilation_seq, mask=mask_seq, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
        seqConv = DilatedResNet(rng,
                                input=seqInput,
                                n_in=n_in_seq,
                                n_hiddens=n_hiddens_seq,
                                n_repeats=seq_repeats,
                                halfWinSize=hwsz_seq,
                                dilation=dilation_seq,
                                mask=mask_seq,
                                activation=act,
                                modelSpecs=modelSpecs)
    else:
        seqConv = ResNet(rng,
                         input=seqInput,
                         n_in=n_in_seq,
                         n_hiddens=n_hiddens_seq,
                         n_repeats=seq_repeats,
                         halfWinSize=hwsz_seq,
                         mask=mask_seq,
                         activation=act,
                         batchNorm=modelSpecs['batchNorm'],
                         version=modelSpecs['network'])
    self.layers.append(seqConv)

    ## transform 1d sequence to 2d matrix
    seq2matrixMode = modelSpecs['seq2matrixMode']
    seq2matrixLayers = []
    ## NOTE(review): embedLayers is filled below but not read in the visible part of this method
    embedLayers = []

    ## determine if we shall use the sequential features or not. The sequential features include sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility
    ## useSequentialFeatures is True by default
    ##useSequentialFeatures = ( modelSpecs.has_key('UseSequentialFeatures') and (modelSpecs['UseSequentialFeatures'] is True) )

    ## use OuterConcatenation operation to convert sequence features into pairwise features
    ## NOTE(review): config.UseSequentialFeatures is referenced without being called; this is
    ## only meaningful if it is a module-level flag rather than a function -- confirm.
    ## NOTE(review): mask_matrix defaults to None but is dereferenced in this branch -- callers
    ## presumably always supply it when 'OuterCat' is used; confirm.
    if seq2matrixMode.has_key('OuterCat') and config.UseSequentialFeatures:
        ##midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
        midpointfeature, n_midpoint_out = MidpointFeature(seqConv.output, seqConv.n_out, box=boundingbox)

        ##remove noise in midpointfeature
        ## mask_matrix is used to reduce noise introduced by padding positions
        ## first the leading mask_matrix.shape[1] rows are multiplied by the mask ...
        mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
        midpointfeature = T.set_subtensor(mid_subtensor, T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))
        ## ... then the leading mask_matrix.shape[1] columns by the transposed mask
        mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
        midpointfeature = T.set_subtensor(mid_subtensor2, T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

        ## here we use convolution with halfWinSize=0 to reduce model complexity
        compressLayer = Conv2D4DistMatrix(rng,
                                          input=midpointfeature,
                                          n_in=n_midpoint_out,
                                          n_hiddens=seq2matrixMode['OuterCat'],
                                          halfWinSize=0,
                                          mask=mask_matrix)
        #compressLayer = Conv2D4DistMatrix(rng, input=midpointfeature, n_in=n_midpoint_out, n_hiddens=seq2matrixMode['OuterCat'], halfWinSize=0, mask=None )
        seq2matrixLayers.append(compressLayer)

    ## embedding primary sequence and/or predicted secondary structure
    if embedInput is not None:
        from EmbeddingLayer import EmbeddingLayer4AllRange
        ## 'Seq+SS' takes precedence over 'SeqOnly' when both are specified
        if seq2matrixMode.has_key('Seq+SS'):
            n_out_embed = seq2matrixMode['Seq+SS']
        elif seq2matrixMode.has_key('SeqOnly'):
            n_out_embed = seq2matrixMode['SeqOnly']
        else:
            print 'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.'
            exit(1)

        embedLayer = EmbeddingLayer4AllRange(embedInput, modelSpecs['n_in_embed'], n_out_embed, box=boundingbox)
        seq2matrixLayers.append(embedLayer)
        embedLayers.append(embedLayer)

    ## the string below is disabled code kept by the original author; it has no effect
    """
    we do not use this profile embedding any more
    ## embedding the sequence profile
    if seq2matrixMode.has_key('Profile') and useSequentialFeatures:
        from EmbeddingLayer import ProfileEmbeddingLayer
        pEmbedLayer = ProfileEmbeddingLayer(seqConv.output, seqConv.n_out, seq2matrixMode['Profile'])
        seq2matrixLayers.append(pEmbedLayer)
        embedLayers.append(pEmbedLayer)
    """

    self.layers += seq2matrixLayers

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
    if (bUseCCMraw or bUseFullMI or bUseFullCov) and config.CompressMatrixInput(modelSpecs):
        ## here we add a compress layer to reduce the #channels of the original matrix input.
        ## NOTE(review): compressLayer4MatrixInput is never added to self.layers, so its params
        ## and norms are not collected into self.params/paramL1/paramL2 by the loop at the end
        ## of this method -- confirm whether this is intended before changing it, since adding
        ## the layer would change the parameter list of existing models.
        n_hiddens4MatrixCompress = modelSpecs['hiddens4MatrixCompress']
        compressLayer4MatrixInput = Conv2D4DistMatrix(rng,
                                                      input=matrixInput,
                                                      n_in=n_in_matrix,
                                                      n_hiddens=n_hiddens4MatrixCompress,
                                                      halfWinSize=0,
                                                      mask=mask_matrix)
        compressedMatrixInput = compressLayer4MatrixInput.output
        n_compressedMatrix = compressLayer4MatrixInput.n_out
        ## merge along the last (channel) axis
        input_2d = T.concatenate([compressedMatrixInput] + [layer.output for layer in seq2matrixLayers], axis=3)
        n_input2d = n_compressedMatrix + sum([layer.n_out for layer in seq2matrixLayers])
    else:
        ##old code for merging original matrix input and sequential input
        input_2d = T.concatenate([matrixInput] + [layer.output for layer in seq2matrixLayers], axis=3)
        n_input2d = n_in_matrix + sum([layer.n_out for layer in seq2matrixLayers])

    #print 'n_input2d=', n_input2d

    ## matrix convolution; unlike the sequence convolution above, an unknown network type
    ## is rejected here instead of falling back to ResNet
    if modelSpecs['network'].startswith('ResNet'):
        matrixConv = ResNet(rng,
                            input=input_2d,
                            n_in=n_input2d,
                            n_hiddens=n_hiddens_matrix,
                            n_repeats=matrix_repeats,
                            halfWinSize=hwsz_matrix,
                            mask=mask_matrix,
                            activation=act,
                            batchNorm=modelSpecs['batchNorm'],
                            version=modelSpecs['network'])
    elif modelSpecs['network'].startswith('DilatedResNet'):
        #matrixConv=DilatedResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix, n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, dilation=dilation_matrix, mask=mask_matrix, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
        matrixConv = DilatedResNet(rng,
                                   input=input_2d,
                                   n_in=n_input2d,
                                   n_hiddens=n_hiddens_matrix,
                                   n_repeats=matrix_repeats,
                                   halfWinSize=hwsz_matrix,
                                   dilation=dilation_matrix,
                                   mask=mask_matrix,
                                   activation=act,
                                   modelSpecs=modelSpecs)
    else:
        print 'ERROR: Unimplemented deep network type: ', modelSpecs['network']
        exit(1)

    self.layers.append(matrixConv)

    conv_out = matrixConv.output

    ## move the last axis to the front, flatten the remaining axes and transpose, so that the
    ## predictors receive a 2d matrix with one row per element of the first three axes of
    ## conv_out (presumably batch, row, column -- TODO confirm) and one column per channel
    selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
    n_in4logreg = matrixConv.n_out

    self.outputList = []
    self.output_probList = []
    self.predictors = []

    self.params4var = []
    self.paramL14var = 0
    self.paramL24var = 0

    ## one predictor per response; all of them read the same flattened convolution output
    for res in modelSpecs['responses']:
        labelType = Response2LabelType(res)
        predictor = None

        if labelType.startswith('Discrete'):
            assert GetResponseValueDims(res) == 1
            predictor = NN4LogReg(rng=rng,
                                  input=selected,
                                  n_in=n_in4logreg,
                                  n_out=GetResponseProbDims(res),
                                  n_hiddens=n_hiddens_logreg)

        elif labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            predictor = NN4Normal(rng=rng,
                                  input=selected,
                                  n_in=n_in4logreg,
                                  n_variables=GetResponseValueDims(res),
                                  n_out=GetResponseProbDims(res),
                                  n_hiddens=n_hiddens_logreg)

            ## recording parameters specific for variance prediction
            self.params4var += predictor.params4var
            self.paramL14var += predictor.paramL14var
            self.paramL24var += predictor.paramL24var

        else:
            print 'incorrect response name or label type: ', res
            exit(1)

        self.layers.append(predictor)
        self.predictors.append(predictor)

        ## output in 2d matrix
        ## reshape the flat predictions back to the first three axes of conv_out
        output_2d = predictor.y_pred.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2], GetResponseValueDims(res)))
        output_2d_prob = predictor.output.reshape(
            (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2], GetResponseProbDims(res)))

        self.outputList.append(output_2d)
        self.output_probList.append(output_2d_prob)

    ## the outputs of all responses are merged along the last axis, in the order of modelSpecs['responses']
    self.output = T.concatenate(self.outputList, axis=3)
    self.output_prob = T.concatenate(self.output_probList, axis=3)

    ## collect all the model parameters and their norms
    self.params = []
    self.paramL2 = 0
    self.paramL1 = 0

    for layer in self.layers:
        self.params += layer.params
        self.paramL2 += layer.paramL2
        self.paramL1 += layer.paramL1

    ## NOTE(review): the triple-quoted string opened below is not closed within the visible part of this file
    """
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    """Keep producing training minibatches and putting them on sharedQ until asked to stop.

    In every round the proteins are sampled with DataProcessor.SampleProteinInfo,
    split into minibatches of modelSpecs['minibatchSize'] data points and shuffled.
    Each protein of a minibatch is loaded with DataProcessor.LoadRealData; the
    minibatch is checked with FeatureUtils.CheckModelNDataConsistency and, if
    assembleData is true, converted by PrepareInput4Train before it is queued.

    Label matrices (and, when config.UseSampleWeight(modelSpecs) is true, label
    weight matrices) are cached by protein name for the lifetime of this function,
    so they are loaded/calculated once per protein only.

    The function returns once stopTrainDataLoader is set or os.getppid() returns 1
    (presumably the parent process is gone and this one was re-parented to init
    -- TODO confirm). A message is printed and sharedQ is closed before returning.
    """
    _, _, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
    if bUseCCMraw or bUseFullMI or bUseFullCov:
        ## when full coevolution matrices are used, we shall use float16 to save memory
        floatType = np.float16
    else:
        floatType = theano.config.floatX

    ## here we use labelPool to cache the labels of all the training proteins
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
    ## but it can only have one set of label matrices, so it is worth saving all label matrices in RAM.
    labelPool = dict()
    labelWeightPool = dict()

    def _StopRequested():
        return stopTrainDataLoader.is_set() or os.getppid() == 1

    while not _StopRequested():
        trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
        trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation,
                                                       numDataPoints=modelSpecs['minibatchSize'],
                                                       modelSpecs=modelSpecs)
        random.shuffle(trainSeqData)

        for batch in trainSeqData:
            ## leaving this loop returns control to the while condition, which ends the outer loop as well
            if _StopRequested():
                break

            data = []
            for protein in batch:
                name = protein['name']
                if name in labelPool:
                    ## label is already in the pool
                    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    d['atomLabelMatrix'] = labelPool[name]
                else:
                    d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert 'atomLabelMatrix' in d
                    labelPool[name] = d['atomLabelMatrix']

                if config.UseSampleWeight(modelSpecs):
                    ## the weight matrix is calculated once per protein, in float16
                    if name not in labelWeightPool:
                        labelWeightPool[name] = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'],
                                                                                 modelSpecs=modelSpecs,
                                                                                 floatType=np.float16)
                    d['labelWeightMatrix'] = labelWeightPool[name]

                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)

            sharedQ.put(data)

    print('TrainDataLoader has finished loading data')
    sharedQ.close()