def CollectSequentialFeatures(d,
                              modelSpecs,
                              oneHotEncoding,
                              returnMode='array'):
    """Assemble the sequential input features selected by modelSpecs.

    Args:
        d: dict with the features of one protein. Keys read here are 'SS3',
            'ACC', 'PSSM', 'DISO', 'MemAcc', 'MemTopo' and 'tplSimScore',
            each only when the matching option is switched on.
        modelSpecs: dict of model options. 'UseSS', 'UseACC', 'UsePSSM',
            'UseDisorder' and 'UseMPSpecificFeatures' count only when they
            are literally True; the other switches are answered by helpers
            in config.
        oneHotEncoding: feature placed first when
            config.UseOneHotEncoding(modelSpecs) holds.
        returnMode: 'array' (case-insensitive) returns all features joined
            by np.concatenate along axis 1; any other value returns the
            list of individual features.

    Returns:
        One concatenated array, or the list of collected features.

    The process prints an error and exits with status 1 when a requested
    membrane-protein or template feature is missing from d.
    """
    collected = []

    if config.UseOneHotEncoding(modelSpecs):
        collected.append(oneHotEncoding)

    ## (option in modelSpecs, key in d); an option counts only when it is exactly True
    flaggedFeatures = (
        ('UseSS', 'SS3'),
        ('UseACC', 'ACC'),
        ('UsePSSM', 'PSSM'),
        ('UseDisorder', 'DISO'),
    )
    for flag, key in flaggedFeatures:
        if modelSpecs.get(flag) is True:
            collected.append(d[key])

    if config.UseRgAsSequentialFeature(modelSpecs):
        collected.append(Rg(d, AsSequentialFeature=True))

    ## membrane protein specific features
    if modelSpecs.get('UseMPSpecificFeatures') is True:
        if 'MemAcc' not in d:
            print 'ERROR: The data does not have a feature called MemAcc'
            exit(1)
        collected.append(d['MemAcc'])

        if 'MemTopo' not in d:
            print 'ERROR: The data does not have a feature called MemTopo'
            exit(1)
        collected.append(d['MemTopo'])

    ## sequence-template similarity score, used to predict a distance matrix
    ## from a sequence-template alignment (mainly for homology modeling)
    if config.UseTemplate(modelSpecs):
        if 'tplSimScore' not in d:
            print 'ERROR: the data has no key tplSimScore, which is needed since you specify to use template information'
            exit(1)

        simScore = d['tplSimScore']
        ## only a sanity check on the second dimension; the feature is used regardless
        if not (10 <= simScore.shape[1] <= 15):
            print 'WARNING: The number of features for query-template similarity may be incorrect. Please double check it!'

        collected.append(simScore)

    if returnMode.lower() != 'array':
        return collected

    return np.concatenate(collected, axis=1)
def GenerateModelFileName(resultModel):
    """Compose the file name under LocalModels/ for saving a model.

    The name is
        LocalModels/<prefix>-<arch>-<data>-<MID>-<algorithm>.pkl
    where
        prefix: 'TPL' or 'Seq' (by config.UseTemplate), 'Model' plus
            int(Version * 10), '-', the network name, '4', the response
            string with ';' -> '-' and ':' -> '_', then the optional tags
            '-CCM', '-CCMZ', '-NoWR' and '-NoWL'
        arch: 'L2D' plus (sum(conv2d_repeats) + len(conv2d_hiddens)) * 2 - 1
        data: the base name of trainFile without its last two
            dot-separated pieces, the remaining pieces glued together
        MID: today's date as YYYYMMDD, '-', ModelID and the process id

    Side effect: creates the folder LocalModels/ when it does not exist.

    Args:
        resultModel: dict with keys 'Version', 'network', 'responseStr',
            'conv2d_repeats', 'conv2d_hiddens', 'ModelID', 'trainFile' and
            'algorithm'; 'NoWeight4Range' and 'NoWeight4Label' are optional.

    Returns:
        The file name as a string.
    """
    versionTag = 'Model' + str(int(resultModel['Version'] * 10)) + '-'
    if config.UseTemplate(resultModel):
        kind = 'TPL'
    else:
        kind = 'Seq'

    prefixParts = [
        kind + versionTag,
        resultModel['network'] + '4',
        resultModel['responseStr'].replace(';', '-').replace(':', '_'),
    ]
    if config.UseRawCCM(resultModel):
        prefixParts.append('-CCM')
    if config.UseCCMZ(resultModel):
        prefixParts.append('-CCMZ')
    if resultModel.get('NoWeight4Range'):
        prefixParts.append('-NoWR')
    if resultModel.get('NoWeight4Label'):
        prefixParts.append('-NoWL')
    prefix = ''.join(prefixParts)

    ## only the 2D convolution part is encoded in the architecture tag
    depthTerm = sum(resultModel['conv2d_repeats']) + len(
        resultModel['conv2d_hiddens'])
    arch = 'L2D' + str(depthTerm * 2 - 1)

    ## date stamp plus model ID plus pid
    dateStamp = datetime.datetime.today().strftime('%Y%m%d')
    MID = dateStamp + '-' + resultModel['ModelID'] + str(os.getpid())

    ## drop the last two dot-separated pieces of the training file name
    trainBase = os.path.basename(resultModel['trainFile'])
    datastr = ''.join(trainBase.split('.')[:-2])

    stem = '-'.join([prefix, arch, datastr, MID, resultModel['algorithm']])
    filename = os.path.join('LocalModels/', stem + '.pkl')

    if not os.path.exists('LocalModels/'):
        os.makedirs('LocalModels/')

    return filename
# Example no. 3
# 0
def LoadOneAlignment4OneModel(model, name, inputFolders, aliFile, tplFile):
    """Load, for one model, the data of one query-template alignment.

    For every folder in inputFolders the features of protein `name` are read
    with FeatureUtils.LoadFeaturePKL and then extended with the alignment
    features produced by AlignmentUtils.GenerateAlignmentFeatures3.

    Args:
        model: model specification; it must satisfy config.UseTemplate.
        name: name of the query protein.
        inputFolders: folders holding the input features of the query.
        aliFile: query-template alignment file.
        tplFile: template file.

    Returns:
        A list with one dict per input folder. In each dict 'name' is the
        base name of aliFile with a trailing '.fasta' removed by rchop.

    The process prints an error and exits with status 1 when the model does
    not use templates or when a file or folder does not exist.
    """
    if not config.UseTemplate(model):
        print 'ERROR: the deeep model is not trained to handle alignment and template information provided for protein', name
        exit(1)

    if not os.path.isfile(aliFile):
        print 'ERROR: invalid query-template alignment file', aliFile
        exit(1)

    if not os.path.isfile(tplFile):
        print 'ERROR: invalid template file', tplFile
        exit(1)

    results = []
    for folder in inputFolders:
        if not os.path.isdir(folder):
            print 'ERROR: invalid input feature folder: ', folder
            exit(1)

        record = {'featureDir': folder}
        record.update(
            FeatureUtils.LoadFeaturePKL(name,
                                        location=folder,
                                        modelSpecs=model))
        ## the query name is in place while the alignment features are generated
        record['name'] = name
        record['length'] = len(record['sequence'])

        record.update(
            AlignmentUtils.GenerateAlignmentFeatures3(queryData=record,
                                                      aliFile=aliFile,
                                                      tplFile=tplFile,
                                                      queryName=name,
                                                      modelSpecs=model))
        ## from here on the record is named after the alignment file
        record['name'] = rchop(os.path.basename(aliFile), '.fasta')

        results.append(record)

    return results
def LoadRealData(proteinInfo,
                 modelSpecs,
                 loadFeature=True,
                 loadLabel=True,
                 returnMode='array'):
    """Load input features and/or labels for one protein or several.

    Args:
        proteinInfo: one protein description, or a tuple/list/ndarray of
            them. Each description is indexed by 'name' and 'sequence' and,
            depending on what is loaded, by 'featureLocation', 'templates',
            'aliLocation', 'tplLocation' and 'nativeLocation'.
        modelSpecs: model specification; modelSpecs['responses'] is parsed
            with ParseResponse to obtain the label names.
        loadFeature: load input features and run ExtractFeaturesNLabels.
        loadLabel: load labels; passed to ExtractFeaturesNLabels as
            forTrainValidation.
        returnMode: forwarded to ExtractFeaturesNLabels.

    Returns:
        A list with one result per protein when proteinInfo is a
        tuple/list/ndarray, otherwise the single result.

    The process prints an error and exits with status 1 when both
    loadFeature and loadLabel are false.
    """
    isBatch = isinstance(proteinInfo, (tuple, list, np.ndarray))
    if isBatch:
        locations = proteinInfo
    else:
        locations = [proteinInfo]

    labelNames = [
        labelName
        for labelName, _, _ in (ParseResponse(r)
                                for r in modelSpecs['responses'])
    ]
    requested = set(labelNames)

    ## NOTE(review): oriNeeded is derived from config.allDistLabelNames, the very
    ## same list used for distNeeded below; presumably a list of orientation
    ## label names was intended -- confirm against config before changing it
    oriNeeded = bool(
        requested.intersection(set(config.allDistLabelNames))) and loadLabel

    ## Cb-Cb distMatrix is needed to discretize orientation matrix
    distNeeded = bool(
        requested.intersection(set(
            config.allDistLabelNames))) and loadLabel or oriNeeded

    proteins = []
    for protein in locations:
        rawData = {
            'name': protein['name'],
            'sequence': protein['sequence'],
        }
        rawData['length'] = len(rawData['sequence'])

        if loadFeature:
            rawData.update(
                FeatureUtils.LoadFeaturePKL(
                    protein['name'],
                    location=protein['featureLocation'],
                    modelSpecs=modelSpecs))

            if config.UseTemplate(modelSpecs):
                templates = protein['templates']
                if len(templates) < 1:
                    ## no template available: fall back to null alignment features
                    tplFeature = AlignmentUtils.GenNullAlignmentFeatures(
                        len(protein['sequence']))
                else:
                    ## randomly select one template
                    chosen = random.choice(templates)
                    tplFeature = AlignmentUtils.GenerateAlignmentFeatures2(
                        (protein['name'], chosen),
                        queryData=rawData,
                        aliDir=protein['aliLocation'],
                        tplDir=protein['tplLocation'],
                        modelSpecs=modelSpecs)

                ## tplFeature is a Dict() with keys: tplSimScore, tplDistMatrix and tplOriMatrix.
                ## Their values have types array, dict, and dict
                rawData.update(tplFeature)

        if distNeeded or oriNeeded:
            truth = NativeUtils.LoadGroundTruth(
                protein['name'], location=protein['nativeLocation'])
            NativeUtils.AddGroundTruth(rawData, truth)

        ## extract information needed for this specific deep model
        if loadFeature:
            result = ExtractFeaturesNLabels([rawData],
                                            modelSpecs,
                                            forTrainValidation=loadLabel,
                                            returnMode=returnMode)[0]
        elif loadLabel:
            result = {
                'name': protein['name'],
                'atomLabelMatrix': LabelUtils.CollectLabels(rawData,
                                                            modelSpecs),
            }
        else:
            print 'ERROR: in LoadRealData(), you shall load input features and/or labels'
            exit(1)

        proteins.append(result)

    if isBatch:
        return proteins

    return proteins[0]
def CollectMatrixFeatures(d, modelSpecs, returnMode='array'):
    ##collecting pairwise features...
    pairfeatures_nomean = [
    ]  # a set of pairwise features for which we do not calculate their expected value
    pairfeatures = []

    if not config.NoOldLocationFeatures(modelSpecs):
        posFeature = FeatureUtils.LocationFeature(d)
        pairfeatures_nomean.append(posFeature)

        cbrtFeature = FeatureUtils.CubeRootFeature(d)
        pairfeatures_nomean.append(cbrtFeature)

    if config.UseNewLocationFeatures(modelSpecs):
        posFeatures = FeatureUtils.NewLocationFeature(d)
        pairfeatures_nomean.extend(posFeatures)

    if config.UseCCMZ(modelSpecs):
        if not d.has_key('ccmpredZ'):
            print 'ERROR: CCMpredZ is requested, but the data for protein ', d[
                'name'], ' does not have it!'
            exit(1)
        else:
            pairfeatures.append(d['ccmpredZ'])

    if config.UseRawCCM(modelSpecs):
        if not d.has_key('ccmpred'):
            print 'ERROR: Raw CCMpred is requested, but the data for protein ', d[
                'name'], ' does not have it!'
            exit(1)
        pairfeatures.append(d['ccmpred'])

    if config.UsePSICOV(modelSpecs):
        if not d.has_key('psicovZ'):
            print 'ERROR: psicovZ is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['psicovZ'])

    if config.UseContactPotential(modelSpecs):
        if not d.has_key('OtherPairs'):
            print 'ERROR: pairwise contact potential is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['OtherPairs'][:, :, 0])

    if config.UseMI(modelSpecs):
        if not d.has_key('OtherPairs'):
            print 'ERROR: mutual information is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['OtherPairs'][:, :, 1:3])

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
        modelSpecs)
    if bUseCCMFnorm:
        if not d.has_key('CCMFnorm') or not d.has_key('CCMFnormZ'):
            print 'ERROR: CCM Fnorm and/or FnormZ are requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['CCMFnorm'])
        pairfeatures.append(d['CCMFnormZ'])

    if bUseCCMsum:
        if not d.has_key('sumCCM'):
            print 'ERROR: CCM summary are requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['sumCCM'])

    if bUseCCMraw:
        if not d.has_key('rawCCM'):
            print 'ERROR: CCM raw matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['rawCCM'])

    if bUseFullMI:
        if not d.has_key('fullMI'):
            print 'ERROR: full MI matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['fullMI'])

    if bUseFullCov:
        if not d.has_key('fullCov'):
            print 'ERROR: full covariance matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['fullCov'])

    ##add template-based distance and orientation matrices
    if config.UseTemplate(modelSpecs):
        pairfeatures_nomean.extend(CollectTemplateMatrixFeatures(
            d, modelSpecs))

    if returnMode.lower() == 'array':
        matrixFeature = np.dstack(tuple(pairfeatures))
        if len(pairfeatures_nomean) > 0:
            matrixFeature_nomean = np.dstack(tuple(pairfeatures_nomean))
        else:
            seqLen = matrixFeature.shape[0]
            matrixFeature_nomean = np.zeros((seqLen, seqLen, 0),
                                            dtype=config.MyFloat)

        #print 'matrixFeature.shape: ', matrixFeature.shape
        return matrixFeature, matrixFeature_nomean
    else:
        return pairfeatures, pairfeatures_nomean
# Example no. 6
# 0
def main(argv):

    if len(argv) < 4:
        Usage()
        exit(1)

    modelFiles = None

    inputFolders = None

    aliFolders = None
    aliFile = None

    tplFolder = None
    tplFile = None

    nativefolder = None
    savefolder = None

    name = None
    nameFile = None
    inputFeature = None

    nameStr = None
    inputStr = None
    aliStr = None
    tplStr = None

    try:
        opts, args = getopt.getopt(argv, "m:p:i:a:t:g:d:", [
            "model=", "name=", "input=", "alignment=", "template=",
            "nativefolder=", "savefolder="
        ])

#print opts, args
    except getopt.GetoptError as err:
        print err
        Usage()
        exit(1)

    if len(opts) < 2:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-m", "--model"):
            modelFiles = arg.split(';')
            for m in modelFiles:
                if not os.path.isfile(m):
                    print "ERROR: invalid deep model file:", m
                    exit(1)

        elif opt in ("-p", "--name"):
            nameStr = arg

        elif opt in ("-i", "--inputFolders"):
            inputStr = arg

        elif opt in ("-a", "--aliFolders"):
            aliStr = arg

        elif opt in ("-t", "--tplFolder"):
            tplStr = arg

        elif opt in ("-d", "--savefolder"):
            savefolder = arg
            if not os.path.isdir(savefolder):
                print 'ERROR: the specified folder for results does not exist: ', savefolder
                exit(1)

        elif opt in ("-g", "--nativefolder"):
            nativefolder = arg
            if not os.path.isdir(nativefolder):
                print 'ERROR: the specified folder for ground truth does not exist: ', nativefolder
                exit(1)
        else:
            Usage()
            exit(1)

    if nameStr.endswith('.inputFeatures.pkl'):
        if not os.path.isfile(nameStr):
            print 'ERROR: input feature file does not exist: ', nameStr
            exit(1)
        inputFeature = nameStr

    elif nameStr.endswith('.list'):
        if not os.path.isfile(nameStr):
            print 'ERROR: list file for protein names does not exist: ', nameStr
            exit(1)
        nameFile = nameStr
    else:
        name = nameStr

    if name is not None or nameFile is not None:
        assert inputStr is not None

        inputFolders = inputStr.split(';')
        for f in inputFolders:
            if not os.path.isdir(f):
                print "ERROR: one input feature folder does not exist: ", f
                exit(1)

    if aliStr is not None and tplStr is None:
        print "ERROR: aliStr and tplStr shall be simultaneously None or non-None"
        exit(1)
    if aliStr is None and tplStr is not None:
        print "ERROR: aliStr and tplStr shall be simultaneously None or non-None"
        exit(1)

    if tplStr is not None and tplStr.endswith('.tpl.pkl'):
        tplFile = tplStr
        if not os.path.isfile(tplFile):
            print "ERROR: the template file does not exist: ", tplFile
            exit(1)
        tplFolder = os.path.dirname(tplFile)

    elif tplStr is not None:
        tplFolder = tplStr
        if not os.path.isdir(tplFolder):
            print "ERROR: the template folder does not exist: ", tplFolder
            exit(1)

    if aliStr is not None and aliStr.endswith('.fasta'):
        aliFile = aliStr
        if not os.path.isfile(aliFile):
            print "ERROR: the alignment file does not exist: ", aliFile
            exit(1)
        if tplFile is None:
            print "ERROR: a template file shall be provided to build 3D models from the alignment file", aliFile
            exit(1)

    elif aliStr is not None:
        aliFolders = aliStr.split(';')
        for f in aliFolders:
            if not os.path.isdir(f):
                print "ERROR: one alignment folder does not exist: ", f
                exit(1)
        if tplFolder is None:
            print 'ERROR: the template folder is None although aliFolders is not None'
            exit(1)

    ## when a protein list file is provided, aliStr shall be one or multiple folders and aliStr shall be a folder
    if nameFile is not None:
        if aliStr is not None and tplStr is not None:
            if aliFolders is None or tplFolder is None:
                print "ERROR: a protein list file is provided, but aliFolders or tplFolder is empty"
                exit(1)

    assert len(modelFiles) > 0
    print 'modelFiles=', modelFiles

    print 'protein nameFile=', nameFile
    """
	print 'protein name=', name
	print 'input feature file=', inputFeature
	"""

    print 'inputFolders=', inputFolders
    """
	print 'aliFolders=', aliFolders
	print 'aliFile=', aliFile

	print 'tplFolder=', tplFolder
	print 'tplFile=', tplFile
	"""

    print 'savefolder=', savefolder
    """
	print 'nativefolder=', nativefolder
	"""

    ## check consistency between deep models and input
    models = LoadModels(modelFiles)
    if aliStr is not None:
        for model, mfile in zip(models, modelFiles):
            if not config.UseTemplate(model):
                print 'ERROR: alignment information is provided, but deep model not trained to handle alignments is used:', mfile
                exit(1)
    else:
        for model, mfile in zip(models, modelFiles):
            if config.UseTemplate(model):
                print 'ERROR: no alignment information is provided, but deep model trained to handle alignments is used:', mfile
                exit(1)

    predictors = BuildPredictors(models)

    if inputFeature is not None:
        inputFolders = [os.path.dirname(inputFeature)]
        name = os.path.basename(inputFeature)[:-len('.inputFeatures.pkl')]

    if name is not None:
        if aliFile is not None:
            contPredictions = PredictMatrixLabels(models,
                                                  predictors,
                                                  name,
                                                  inputFolders,
                                                  aliFile=aliFile,
                                                  tplFile=tplFile,
                                                  saveFolder=savefolder)[0]
        else:
            contPredictions = PredictMatrixLabels(models,
                                                  predictors, [name],
                                                  inputFolders,
                                                  aliFolders=aliFolders,
                                                  tplFolder=tplFolder,
                                                  saveFolder=savefolder)[0]
        if nativefolder is not None:
            avgacc, allacc = ContactUtils.EvaluateContactPredictions(
                contPredictions, nativefolder)
            ContactUtils.PrintAllContactAccuracy(avgacc, allacc)

    elif nameFile is not None:
        with open(nameFile, 'r') as fh:
            names = [n.strip() for n in list(fh)]

        ## for a batch of proteins, we predict 100 proteins every time to save CPU memory consumption
        if nativefolder is not None:
            allaccuracy = dict()
            avgaccuracy = dict()

        groupSize = 100
        if tplFolder is not None:
            groupSize = 10

        for i in range(0, len(names), groupSize):
            group = names[i:min(i + groupSize, len(names))]
            contPredictions = PredictMatrixLabels(models,
                                                  predictors,
                                                  group,
                                                  inputFolders,
                                                  aliFolders=aliFolders,
                                                  tplFolder=tplFolder,
                                                  saveFolder=savefolder)[0]
            if nativefolder is not None:
                avgacc, allacc = ContactUtils.EvaluateContactPredictions(
                    contPredictions, nativefolder)
                allaccuracy.update(allacc)
                for k, v in avgacc.iteritems():
                    if not avgaccuracy.has_key(k):
                        avgaccuracy[k] = v * len(group)
                    else:
                        avgaccuracy[k] += v * len(group)

        if nativefolder is not None:
            for k, v in avgaccuracy:
                avgaccuracy[k] = v / len(names)
            ContactUtils.PrintAllContactAccuracy(avgaccuracy, allacc)

    else:
        print 'ERROR: at least one of name and nameFile shall not be None'
        exit(1)
# Example no. 7
# 0
def LoadProteinData4OneModel(model,
                             names,
                             inputFolders,
                             aliFolders=None,
                             tplFolder=None):

    for inputFolder in inputFolders:
        if not os.path.isdir(inputFolder):
            print 'ERROR: folder for protein features does not exist: ', inputFolder
            exit(1)

    if config.UseTemplate(model):
        from copy import deepcopy

        assert tplFolder is not None
        assert aliFolders is not None
        if not os.path.isdir(tplFolder):
            print 'ERROR: invalid folde for templates: ', tplFolder
            exit(1)
        for aliFolder in aliFolders:
            if not os.path.isdir(aliFolder):
                print 'ERROR: invalid folder for query-template alignments: ', aliFolder
                exit(1)

    data = []
    for name in names:
        if config.UseTemplate(model):
            aliFiles = FindAllAliFiles(query=name, aliFolders=aliFolders)
            print 'In total find', len(aliFiles), 'alignment files for', name
            #print aliFiles
        if config.UseTemplate(model) and len(aliFiles) < 1:
            continue

        for inputFolder in inputFolders:
            rawData = dict()
            feature = FeatureUtils.LoadFeaturePKL(name,
                                                  location=inputFolder,
                                                  modelSpecs=model)
            rawData.update(feature)
            rawData['length'] = len(rawData['sequence'])
            rawData['name'] = name
            rawData['featureDir'] = inputFolder

            if not config.UseTemplate(model):
                data.append(rawData)
                continue

            for aliFile in aliFiles:
                rawData2 = deepcopy(rawData)
                feature = AlignmentUtils.GenerateAlignmentFeatures4(
                    queryData=rawData,
                    aliFile=aliFile,
                    tplFolder=tplFolder,
                    modelSpecs=model)
                if feature is None:
                    continue
                rawData2.update(feature)
                rawData2['name'] = rchop(os.path.basename(aliFile), '.fasta')
                data.append(rawData2)

    if len(data) < 1:
        print 'ERROR: cannot find any input data for distance/orientation prediction'
        exit(1)
    return data