def CollectSequentialFeatures(d, modelSpecs, oneHotEncoding, returnMode='array'):
    """Assemble the sequential input features selected by modelSpecs.

    Features are appended in this fixed order, each only when enabled:
    oneHotEncoding, d['SS3'], d['ACC'], d['PSSM'], d['DISO'], Rg(d, ...),
    d['MemAcc'], d['MemTopo'] and d['tplSimScore'].

    Args:
        d: dict holding the features of one protein.
        modelSpecs: dict of model settings; the 'Use*' flags read here enable
            a feature only when they are present and are literally True.
        oneHotEncoding: matrix used when config.UseOneHotEncoding(modelSpecs)
            holds.
        returnMode: 'array' (case-insensitive) returns the matrices
            concatenated along axis 1; any other value returns the list of
            matrices.

    The process exits with status 1 when a requested membrane-protein or
    template feature is missing from d.
    """
    collected = []

    if config.UseOneHotEncoding(modelSpecs):
        collected.append(oneHotEncoding)

    ## plain features: (flag in modelSpecs, key in d)
    flaggedFeatures = (
        ('UseSS', 'SS3'),
        ('UseACC', 'ACC'),
        ('UsePSSM', 'PSSM'),
        ('UseDisorder', 'DISO'),
    )
    for flag, key in flaggedFeatures:
        if modelSpecs.get(flag) is True:
            collected.append(d[key])

    if config.UseRgAsSequentialFeature(modelSpecs):
        collected.append(Rg(d, AsSequentialFeature=True))

    ## membrane protein specific features
    if modelSpecs.get('UseMPSpecificFeatures') is True:
        membraneFeatures = (
            ('MemAcc', 'ERROR: The data does not have a feature called MemAcc'),
            ('MemTopo', 'ERROR: The data does not have a feature called MemTopo'),
        )
        for key, message in membraneFeatures:
            if key not in d:
                print(message)
                exit(1)
            collected.append(d[key])

    ## Sequence-template similarity score; it is used to predict a distance
    ## matrix from a sequence-template alignment (mainly homology modeling).
    if config.UseTemplate(modelSpecs):
        if 'tplSimScore' not in d:
            print('ERROR: the data has no key tplSimScore, which is needed since you specify to use template information')
            exit(1)
        tplSimScore = d['tplSimScore']
        if not (10 <= tplSimScore.shape[1] <= 15):
            print('WARNING: The number of features for query-template similarity may be incorrect. Please double check it!')
        collected.append(tplSimScore)

    if returnMode.lower() != 'array':
        return collected
    return np.concatenate(collected, axis=1)
def GenerateModelFileName(resultModel):
    """Compose the .pkl file path under LocalModels/ for a trained model.

    The file name joins, with '-': a prefix (template/sequence tag, version,
    network name, response string and optional feature/weight tags), the 2D
    architecture tag, a string derived from the training file name, a model ID
    (today's date, resultModel['ModelID'] and the current process ID) and the
    training algorithm.

    Side effect: creates the folder LocalModels/ when it does not exist.

    Args:
        resultModel: dict read for the keys 'Version', 'network',
            'responseStr', 'conv2d_repeats', 'conv2d_hiddens', 'ModelID',
            'trainFile', 'algorithm' and, optionally, 'NoWeight4Range' and
            'NoWeight4Label'.

    Returns:
        The file path as a string.
    """
    versionTag = 'Model' + str(int(resultModel['Version'] * 10)) + '-'
    if config.UseTemplate(resultModel):
        prefix = 'TPL' + versionTag
    else:
        prefix = 'Seq' + versionTag

    prefix += resultModel['network'] + '4'
    prefix += resultModel['responseStr'].replace(';', '-').replace(':', '_')

    if config.UseRawCCM(resultModel):
        prefix += '-CCM'
    if config.UseCCMZ(resultModel):
        prefix += '-CCMZ'

    if resultModel.get('NoWeight4Range'):
        prefix += '-NoWR'
    if resultModel.get('NoWeight4Label'):
        prefix += '-NoWL'

    ## only the 2D convolution part is encoded in the architecture tag
    num2DUnits = sum(resultModel['conv2d_repeats']) + len(resultModel['conv2d_hiddens'])
    arch = 'L2D' + str(num2DUnits * 2 - 1)

    ## date as YYYYMMDD: keep the date part of "YYYY-MM-DD HH:MM:SS..." and drop the dashes
    dateTag = str(datetime.datetime.today()).split()[0].replace('-', '')
    MID = dateTag + '-' + resultModel['ModelID'] + str(os.getpid())

    ## base name of the training file without its last two dot-separated fields
    nameFields = os.path.basename(resultModel['trainFile']).split('.')
    datastr = ''.join(nameFields[0:-2])

    components = [prefix, arch, datastr, MID, resultModel['algorithm']]
    filename = os.path.join('LocalModels/', '-'.join(components) + '.pkl')

    if not os.path.exists('LocalModels/'):
        os.makedirs('LocalModels/')

    return filename
def LoadOneAlignment4OneModel(model, name, inputFolders, aliFile, tplFile):
    """Load the input data of one protein for one query-template alignment.

    One record is produced per folder in inputFolders. Each record holds the
    features loaded by FeatureUtils.LoadFeaturePKL plus the alignment features
    generated by AlignmentUtils.GenerateAlignmentFeatures3, and is finally
    named after the alignment file (its base name without '.fasta').

    Args:
        model: model specification; it must be one for which
            config.UseTemplate(model) holds.
        name: the protein (query) name.
        inputFolders: iterable of folders holding the input features.
        aliFile: the query-template alignment file.
        tplFile: the template file.

    Returns:
        A list of dicts, one per input folder.

    The process exits with status 1 when the model does not use templates or
    when any of the files/folders is invalid.
    """
    if not config.UseTemplate(model):
        print('%s %s' % ('ERROR: the deeep model is not trained to handle alignment and template information provided for protein', name))
        exit(1)

    fileChecks = (
        (aliFile, 'ERROR: invalid query-template alignment file'),
        (tplFile, 'ERROR: invalid template file'),
    )
    for path, message in fileChecks:
        if not os.path.isfile(path):
            print('%s %s' % (message, path))
            exit(1)

    data = []
    for inputFolder in inputFolders:
        if not os.path.isdir(inputFolder):
            print('%s %s' % ('ERROR: invalid input feature folder: ', inputFolder))
            exit(1)

        rawData = {'featureDir': inputFolder}
        rawData.update(FeatureUtils.LoadFeaturePKL(name, location=inputFolder, modelSpecs=model))

        ## the query name is used while the alignment features are generated
        rawData['name'] = name
        rawData['length'] = len(rawData['sequence'])

        alignmentFeature = AlignmentUtils.GenerateAlignmentFeatures3(
            queryData=rawData, aliFile=aliFile, tplFile=tplFile, queryName=name, modelSpecs=model)
        rawData.update(alignmentFeature)

        ## afterwards the record is renamed after the alignment file
        rawData['name'] = rchop(os.path.basename(aliFile), '.fasta')
        data.append(rawData)

    return data
def LoadRealData(proteinInfo, modelSpecs, loadFeature=True, loadLabel=True, returnMode='array'):
    """Load the features and/or labels of one or several proteins.

    Args:
        proteinInfo: one protein record, or a tuple/list/ndarray of records.
            A record is indexed by 'name' and 'sequence' and, depending on what
            is loaded, by 'featureLocation', 'templates', 'aliLocation',
            'tplLocation' and 'nativeLocation'.
        modelSpecs: model specification; modelSpecs['responses'] is parsed by
            ParseResponse to find the requested label names.
        loadFeature: load input features (and template features when
            config.UseTemplate(modelSpecs) holds).
        loadLabel: load the ground truth and the labels.
        returnMode: forwarded to ExtractFeaturesNLabels.

    Returns:
        A list with one result per protein, or the single result itself when
        proteinInfo is not a tuple/list/ndarray.

    The process exits with status 1 when both loadFeature and loadLabel are
    false.
    """
    singleProtein = not isinstance(proteinInfo, (tuple, list, np.ndarray))
    locations = [proteinInfo] if singleProtein else proteinInfo

    labelNames = []
    for response in modelSpecs['responses']:
        labelName, _, _ = ParseResponse(response)
        labelNames.append(labelName)

    hasDistLabel = bool(set(labelNames).intersection(set(config.allDistLabelNames)))

    ## NOTE(review): oriNeeded is derived from the distance label names, exactly
    ## like distNeeded, which makes the "or oriNeeded" below redundant. It was
    ## presumably meant to test the orientation label names -- confirm against config.
    oriNeeded = hasDistLabel and loadLabel

    ## Cb-Cb distMatrix is needed to discretize orientation matrix
    distNeeded = hasDistLabel and loadLabel or oriNeeded

    proteins = []
    for protein in locations:
        rawData = {'name': protein['name'], 'sequence': protein['sequence']}
        rawData['length'] = len(rawData['sequence'])

        if loadFeature:
            rawData.update(FeatureUtils.LoadFeaturePKL(
                protein['name'], location=protein['featureLocation'], modelSpecs=modelSpecs))

            if config.UseTemplate(modelSpecs):
                templates = protein['templates']
                if len(templates) >= 1:
                    ## randomly select one template
                    template = random.choice(templates)
                    tplFeature = AlignmentUtils.GenerateAlignmentFeatures2(
                        (protein['name'], template),
                        queryData=rawData,
                        aliDir=protein['aliLocation'],
                        tplDir=protein['tplLocation'],
                        modelSpecs=modelSpecs)
                else:
                    ## no template available: fall back to null alignment features
                    tplFeature = AlignmentUtils.GenNullAlignmentFeatures(len(protein['sequence']))

                ## tplFeature is a dict with keys tplSimScore, tplDistMatrix and
                ## tplOriMatrix, whose values have types array, dict and dict
                rawData.update(tplFeature)

        if distNeeded or oriNeeded:
            truth = NativeUtils.LoadGroundTruth(protein['name'], location=protein['nativeLocation'])
            NativeUtils.AddGroundTruth(rawData, truth)

        ## extract information needed for this specific deep model
        if loadFeature:
            result = ExtractFeaturesNLabels(
                [rawData], modelSpecs, forTrainValidation=loadLabel, returnMode=returnMode)[0]
        elif loadLabel:
            result = {
                'name': protein['name'],
                'atomLabelMatrix': LabelUtils.CollectLabels(rawData, modelSpecs),
            }
        else:
            print('ERROR: in LoadRealData(), you shall load input features and/or labels')
            exit(1)

        proteins.append(result)

    if singleProtein:
        return proteins[0]
    return proteins
def CollectMatrixFeatures(d, modelSpecs, returnMode='array'):
    """Collect the pairwise (matrix) input features selected by modelSpecs.

    Two groups are built: the regular pairwise features, and the pairwise
    features for which no expected value is calculated (location features and
    template-based matrices).

    Args:
        d: dict holding the features of one protein; d['name'] is used in
            error messages.
        modelSpecs: model specification inspected through the config helpers.
        returnMode: 'array' (case-insensitive) stacks each group with np.dstack
            and returns two arrays; any other value returns the two lists.

    Returns:
        (pairwise features, pairwise features without mean). In array mode an
        empty second group becomes an array of shape (seqLen, seqLen, 0), where
        seqLen is the first dimension of the first array.

    The process exits with status 1 when a requested feature is missing from d.
    """

    def _require(keys, prefix, suffix):
        ## abort when any of the keys is missing; the message reads
        ## "<prefix> <protein name> <suffix>"
        for key in keys:
            if key not in d:
                print('%s %s %s' % (prefix, d['name'], suffix))
                exit(1)

    noMeanFeatures = []  # pairwise features for which we do not calculate their expected value
    meanFeatures = []

    if not config.NoOldLocationFeatures(modelSpecs):
        noMeanFeatures.append(FeatureUtils.LocationFeature(d))
        noMeanFeatures.append(FeatureUtils.CubeRootFeature(d))

    if config.UseNewLocationFeatures(modelSpecs):
        noMeanFeatures.extend(FeatureUtils.NewLocationFeature(d))

    if config.UseCCMZ(modelSpecs):
        _require(('ccmpredZ',), 'ERROR: CCMpredZ is requested, but the data for protein ', ' does not have it!')
        meanFeatures.append(d['ccmpredZ'])

    if config.UseRawCCM(modelSpecs):
        _require(('ccmpred',), 'ERROR: Raw CCMpred is requested, but the data for protein ', ' does not have it!')
        meanFeatures.append(d['ccmpred'])

    if config.UsePSICOV(modelSpecs):
        _require(('psicovZ',), 'ERROR: psicovZ is requested, but the data for protein ', ' does not have it')
        meanFeatures.append(d['psicovZ'])

    if config.UseContactPotential(modelSpecs):
        _require(('OtherPairs',), 'ERROR: pairwise contact potential is requested, but the data for protein ', ' does not have it')
        meanFeatures.append(d['OtherPairs'][:, :, 0])

    if config.UseMI(modelSpecs):
        _require(('OtherPairs',), 'ERROR: mutual information is requested, but the data for protein ', ' does not have it')
        meanFeatures.append(d['OtherPairs'][:, :, 1:3])

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)

    ## (enabled, keys appended in this order, first part of the error message)
    extraModes = (
        (bUseCCMFnorm, ('CCMFnorm', 'CCMFnormZ'), 'ERROR: CCM Fnorm and/or FnormZ are requested, but the data for protein '),
        (bUseCCMsum, ('sumCCM',), 'ERROR: CCM summary are requested, but the data for protein '),
        (bUseCCMraw, ('rawCCM',), 'ERROR: CCM raw matrix is requested, but the data for protein '),
        (bUseFullMI, ('fullMI',), 'ERROR: full MI matrix is requested, but the data for protein '),
        (bUseFullCov, ('fullCov',), 'ERROR: full covariance matrix is requested, but the data for protein '),
    )
    for enabled, keys, prefix in extraModes:
        if not enabled:
            continue
        _require(keys, prefix, ' does not have it')
        for key in keys:
            meanFeatures.append(d[key])

    ## add template-based distance and orientation matrices
    if config.UseTemplate(modelSpecs):
        noMeanFeatures.extend(CollectTemplateMatrixFeatures(d, modelSpecs))

    if returnMode.lower() != 'array':
        return meanFeatures, noMeanFeatures

    matrixFeature = np.dstack(tuple(meanFeatures))
    if noMeanFeatures:
        matrixFeature_nomean = np.dstack(tuple(noMeanFeatures))
    else:
        ## keep the return type uniform: an array with zero feature channels
        seqLen = matrixFeature.shape[0]
        matrixFeature_nomean = np.zeros((seqLen, seqLen, 0), dtype=config.MyFloat)

    return matrixFeature, matrixFeature_nomean
def main(argv):
    """Command-line driver: load deep models and predict matrix labels.

    Options (each takes a value):
        -m, --model         deep model files separated by ';'
        -p, --name          a protein name, a '.list' file of protein names,
                            or an '.inputFeatures.pkl' file
        -i, --input         input feature folders separated by ';' (required
                            when -p is a protein name or a list file)
        -a, --alignment     a '.fasta' alignment file, or alignment folders
                            separated by ';'
        -t, --template      a '.tpl.pkl' template file, or a template folder
        -d, --savefolder    an existing folder, passed on as saveFolder
        -g, --nativefolder  an existing folder of ground truth; when given,
                            contact accuracy is evaluated and printed

    -a and -t shall be given together or not at all, and shall be given exactly
    when the deep models are trained to use templates.

    Args:
        argv: the command-line arguments without the program name.

    The process exits with status 1 on any invalid or inconsistent argument.
    """

    def _fail(*items):
        ## print the items separated by single spaces, then abort
        print(' '.join(str(item) for item in items))
        exit(1)

    if len(argv) < 4:
        Usage()
        exit(1)

    modelFiles = None
    inputFolders = None
    aliFolders = None
    aliFile = None
    tplFolder = None
    tplFile = None
    nativefolder = None
    savefolder = None
    name = None
    nameFile = None
    inputFeature = None

    nameStr = None
    inputStr = None
    aliStr = None
    tplStr = None

    try:
        opts, args = getopt.getopt(argv, "m:p:i:a:t:g:d:", [
            "model=", "name=", "input=", "alignment=", "template=",
            "nativefolder=", "savefolder="
        ])
    except getopt.GetoptError as err:
        print(err)
        Usage()
        exit(1)

    if len(opts) < 2:
        Usage()
        exit(1)

    ## The long names tested here must be the ones declared to getopt above,
    ## otherwise an accepted long option would fall through to Usage().
    for opt, arg in opts:
        if opt in ("-m", "--model"):
            modelFiles = arg.split(';')
            for m in modelFiles:
                if not os.path.isfile(m):
                    _fail("ERROR: invalid deep model file:", m)
        elif opt in ("-p", "--name"):
            nameStr = arg
        elif opt in ("-i", "--input"):
            inputStr = arg
        elif opt in ("-a", "--alignment"):
            aliStr = arg
        elif opt in ("-t", "--template"):
            tplStr = arg
        elif opt in ("-d", "--savefolder"):
            savefolder = arg
            if not os.path.isdir(savefolder):
                _fail('ERROR: the specified folder for results does not exist: ', savefolder)
        elif opt in ("-g", "--nativefolder"):
            nativefolder = arg
            if not os.path.isdir(nativefolder):
                _fail('ERROR: the specified folder for ground truth does not exist: ', nativefolder)
        else:
            Usage()
            exit(1)

    ## -m and -p are mandatory; report it instead of failing later on None
    if modelFiles is None or nameStr is None:
        print('ERROR: both the deep model files (-m) and a protein name, list file or input feature file (-p) shall be provided')
        Usage()
        exit(1)

    ## -p is an input feature file, a protein list file, or a protein name
    if nameStr.endswith('.inputFeatures.pkl'):
        if not os.path.isfile(nameStr):
            _fail('ERROR: input feature file does not exist: ', nameStr)
        inputFeature = nameStr
    elif nameStr.endswith('.list'):
        if not os.path.isfile(nameStr):
            _fail('ERROR: list file for protein names does not exist: ', nameStr)
        nameFile = nameStr
    else:
        name = nameStr

    if name is not None or nameFile is not None:
        if inputStr is None:
            _fail('ERROR: input feature folders (-i) shall be provided for a protein name or a protein list file')
        inputFolders = inputStr.split(';')
        for f in inputFolders:
            if not os.path.isdir(f):
                _fail("ERROR: one input feature folder does not exist: ", f)

    if (aliStr is None) != (tplStr is None):
        _fail("ERROR: aliStr and tplStr shall be simultaneously None or non-None")

    ## -t is a template file or a template folder
    if tplStr is not None and tplStr.endswith('.tpl.pkl'):
        tplFile = tplStr
        if not os.path.isfile(tplFile):
            _fail("ERROR: the template file does not exist: ", tplFile)
        ## dirname() is '' for a file in the current folder; use '.' so that
        ## later folder checks do not reject it
        tplFolder = os.path.dirname(tplFile) or '.'
    elif tplStr is not None:
        tplFolder = tplStr
        if not os.path.isdir(tplFolder):
            _fail("ERROR: the template folder does not exist: ", tplFolder)

    ## -a is an alignment file or a list of alignment folders
    if aliStr is not None and aliStr.endswith('.fasta'):
        aliFile = aliStr
        if not os.path.isfile(aliFile):
            _fail("ERROR: the alignment file does not exist: ", aliFile)
        if tplFile is None:
            _fail("ERROR: a template file shall be provided to build 3D models from the alignment file", aliFile)
    elif aliStr is not None:
        aliFolders = aliStr.split(';')
        for f in aliFolders:
            if not os.path.isdir(f):
                _fail("ERROR: one alignment folder does not exist: ", f)
        if tplFolder is None:
            _fail('ERROR: the template folder is None although aliFolders is not None')

    ## when a protein list file is provided, aliStr shall be one or multiple folders and tplStr shall be a folder
    if nameFile is not None and aliStr is not None and tplStr is not None:
        if aliFolders is None or tplFolder is None:
            _fail("ERROR: a protein list file is provided, but aliFolders or tplFolder is empty")

    print('%s %s' % ('modelFiles=', modelFiles))
    print('%s %s' % ('protein nameFile=', nameFile))
    print('%s %s' % ('inputFolders=', inputFolders))
    print('%s %s' % ('savefolder=', savefolder))

    ## check consistency between deep models and input
    models = LoadModels(modelFiles)
    for model, mfile in zip(models, modelFiles):
        usesTemplate = config.UseTemplate(model)
        if aliStr is not None and not usesTemplate:
            _fail('ERROR: alignment information is provided, but deep model not trained to handle alignments is used:', mfile)
        if aliStr is None and usesTemplate:
            _fail('ERROR: no alignment information is provided, but deep model trained to handle alignments is used:', mfile)

    predictors = BuildPredictors(models)

    if inputFeature is not None:
        ## the feature file itself tells the input folder and the protein name
        inputFolders = [os.path.dirname(inputFeature) or '.']
        name = os.path.basename(inputFeature)[:-len('.inputFeatures.pkl')]

    if name is not None:
        if aliFile is not None:
            contPredictions = PredictMatrixLabels(
                models, predictors, name, inputFolders,
                aliFile=aliFile, tplFile=tplFile, saveFolder=savefolder)[0]
        else:
            contPredictions = PredictMatrixLabels(
                models, predictors, [name], inputFolders,
                aliFolders=aliFolders, tplFolder=tplFolder, saveFolder=savefolder)[0]

        if nativefolder is not None:
            avgacc, allacc = ContactUtils.EvaluateContactPredictions(contPredictions, nativefolder)
            ContactUtils.PrintAllContactAccuracy(avgacc, allacc)

    elif nameFile is not None:
        with open(nameFile, 'r') as fh:
            ## blank lines are not protein names
            names = [line.strip() for line in fh if line.strip()]

        allaccuracy = dict()
        avgaccuracy = dict()

        ## for a batch of proteins, we predict 100 proteins every time to save
        ## CPU memory consumption (10 when templates are used)
        groupSize = 100
        if tplFolder is not None:
            groupSize = 10

        for i in range(0, len(names), groupSize):
            group = names[i:i + groupSize]
            contPredictions = PredictMatrixLabels(
                models, predictors, group, inputFolders,
                aliFolders=aliFolders, tplFolder=tplFolder, saveFolder=savefolder)[0]

            if nativefolder is not None:
                avgacc, allacc = ContactUtils.EvaluateContactPredictions(contPredictions, nativefolder)
                allaccuracy.update(allacc)
                ## accumulate group averages weighted by the group size
                for k, v in avgacc.items():
                    if k in avgaccuracy:
                        avgaccuracy[k] += v * len(group)
                    else:
                        avgaccuracy[k] = v * len(group)

        if nativefolder is not None:
            ## turn the weighted sums into averages over all the proteins and
            ## report them with the accuracy collected from every group
            for k in list(avgaccuracy):
                avgaccuracy[k] = avgaccuracy[k] / len(names)
            ContactUtils.PrintAllContactAccuracy(avgaccuracy, allaccuracy)

    else:
        print('ERROR: at least one of name and nameFile shall not be None')
        exit(1)
def LoadProteinData4OneModel(model, names, inputFolders, aliFolders=None, tplFolder=None):
    """Load the input data of a list of proteins for one deep model.

    Without templates, one record is produced per (protein, input folder).
    With templates (config.UseTemplate(model)), one record is produced per
    (protein, input folder, alignment file) for which alignment features could
    be generated; such a record is named after the alignment file (its base
    name without '.fasta'), and proteins without any alignment file are
    skipped.

    Args:
        model: model specification.
        names: iterable of protein names.
        inputFolders: folders holding the input features.
        aliFolders: folders of query-template alignments; required with
            templates.
        tplFolder: folder of templates; required with templates.

    Returns:
        A non-empty list of dicts.

    The process exits with status 1 when a folder is invalid or when no record
    at all could be loaded.
    """
    for inputFolder in inputFolders:
        if not os.path.isdir(inputFolder):
            print('%s %s' % ('ERROR: folder for protein features does not exist: ', inputFolder))
            exit(1)

    useTemplate = config.UseTemplate(model)

    if useTemplate:
        from copy import deepcopy
        assert tplFolder is not None
        assert aliFolders is not None

        if not os.path.isdir(tplFolder):
            print('%s %s' % ('ERROR: invalid folde for templates: ', tplFolder))
            exit(1)

        for aliFolder in aliFolders:
            if not os.path.isdir(aliFolder):
                print('%s %s' % ('ERROR: invalid folder for query-template alignments: ', aliFolder))
                exit(1)

    data = []
    for name in names:
        if useTemplate:
            aliFiles = FindAllAliFiles(query=name, aliFolders=aliFolders)
            print('%s %s %s %s' % ('In total find', len(aliFiles), 'alignment files for', name))
            if len(aliFiles) < 1:
                continue

        for inputFolder in inputFolders:
            rawData = dict()
            rawData.update(FeatureUtils.LoadFeaturePKL(name, location=inputFolder, modelSpecs=model))
            rawData['length'] = len(rawData['sequence'])
            rawData['name'] = name
            rawData['featureDir'] = inputFolder

            if useTemplate:
                ## one record per alignment, each starting from a copy of the query data
                for aliFile in aliFiles:
                    record = deepcopy(rawData)
                    alignmentFeature = AlignmentUtils.GenerateAlignmentFeatures4(
                        queryData=rawData, aliFile=aliFile, tplFolder=tplFolder, modelSpecs=model)
                    if alignmentFeature is not None:
                        record.update(alignmentFeature)
                        record['name'] = rchop(os.path.basename(aliFile), '.fasta')
                        data.append(record)
            else:
                data.append(rawData)

    if len(data) < 1:
        print('ERROR: cannot find any input data for distance/orientation prediction')
        exit(1)

    return data