Python DataProcessor.LoadPropertyFeatures Beispiele

Programmiersprache: Python

Klasse / Typ: DataProcessor

Methode / Funktion: LoadPropertyFeatures

Beispiele auf hotexamples.com: 2

Python DataProcessor.LoadPropertyFeatures – 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode DataProcessor.LoadPropertyFeatures aus dem Programmpaket NeMo, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

DataProcessor(23)

SplitData2Batches(10)

LoadRealData(10)

SampleProteinInfo(8)

File(6)

AssembleOneBatch(5)

InventoryCountProcessor(5)

InventoryProcessor(5)

ProductsProcessor(3)

ProductProcessor(3)

LoadNativeDistMatrixFromFile(2)

LoadPropertyFeatures(2)

unsize_vector(2)

LoadMetaData(2)

LoadNativeDistMatrix(1)

getDailyReviewsCount(1)

get_cell(1)

get_ewma(1)

get_logistic_reg_prediction(1)

get_moving_average(1)

get_randomforest_prediction(1)

partition_data(1)

get_svc_prediction(1)

prepare_data_for_classification(1)

processFile(1)

select_best_param_svc(1)

split_around_nan(1)

time_vector(1)

truncate_data(1)

find_timeshift(1)

butterworth(1)

find_bump(1)

FitLineFromCsv(1)

LoadDistanceLabelMatrices(1)

Process(1)

Processor(1)

LoadDistanceFeatures(1)

InventoryCountsProcessor(1)

Regress32x128Color00PastDataLoader(1)

SaveListToCsv(1)

apply_svc(1)

ExtractFeaturesNLabels(1)

Database(1)

TFIDFConverter(1)

apply_knn(1)

apply_logistic_regressor(1)

apply_random_forest(1)

SplitMetaData(1)

Beispiel #1

Datei anzeigen

Datei: RunPropertyPredictor.py Projekt: zhujianwei31415/RaptorX-3DModeling

def _AppendUnmaskedSlices(store, batchOutput, dimTable, responses, names,
                          seqLens, maxSeqLen):
    """Split one batch output by response and append each protein's slice.

    The last axis of batchOutput is cut into consecutive column ranges, one
    per entry of responses, whose widths are looked up in dimTable through
    Response2LabelType. For every protein only the last seqLen rows are kept
    and appended to store[name][response].

    Args:
        store: nested dict, store[name][response] -> list; every name in
            names must already be a key of store. Modified in place.
        batchOutput: 3-D array indexed as [protein, position, column].
        dimTable: mapping from label type to number of columns.
        responses: response names, in the column order of batchOutput.
        names: protein names for this batch.
        seqLens: number of rows to keep for each protein.
        maxSeqLen: padded sequence length used to locate the kept rows.
    """
    dims = [dimTable[Response2LabelType(res)] for res in responses]
    endPositions = np.cumsum(dims)
    startPositions = endPositions - dims

    for res, start, end in zip(responses, startPositions, endPositions):
        ## make sure every protein of this batch has a list for this response
        for name in names:
            if res not in store[name]:
                store[name][res] = []

        ## remove masked positions: keep only the last seqLen rows
        perResponse = batchOutput[:, :, start:end]
        for name, perProtein, seqLen in zip(names, perResponse, seqLens):
            store[name][res].append(perProtein[maxSeqLen - seqLen:, :])


def PredictProperty(models, predictors, predFiles):
    """Run every model on predFiles and average the predictions per protein.

    Args:
        models: sequence of model specification dicts; this function reads
            the keys 'n_in_seq' and 'responses'.
        predictors: sequence aligned with models; each element is a pair
            (predict, inputVariables) where predict(*inputs) returns
            (result4prob, result).
        predFiles: passed unchanged to DataProcessor.LoadPropertyFeatures.

    Returns:
        A tuple (finalresults4prob, finalresults, allsequences):
        finalresults4prob[name][response] is the average over models of the
        predicted probabilities; finalresults[name][response] is the averaged
        predicted value for non-discrete responses, or the output of
        PropertyUtils.Coding2String applied to the argmax of the averaged
        probabilities for discrete ones; allsequences[name] is the primary
        sequence of the protein.

    Exits the process with status 1 when two feature files disagree on the
    sequence of the same protein.
    """
    ## sys.exit is always available, unlike the builtin exit() injected by the site module
    import sys

    allsequences = dict()

    ##allresults shall be a nested dictionary, e.g, allresults[proteinName][response] = predicted_property_list
    allresults4prob = dict()
    allresults = dict()

    for model, predictor in zip(models, predictors):

        predict, inputVariables = predictor

        ## We shall load these files for each model separately since each model may use a different set of features
        predData = DataProcessor.LoadPropertyFeatures(predFiles,
                                                      modelSpecs=model,
                                                      forTrainValidation=False)

        ##make sure the input has the same number of features as the model
        ##(spot check on one randomly chosen protein; raises ValueError if predData is empty)
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        ## collecting sequences
        for d in predData:
            if d['name'] not in allsequences:
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                sys.exit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData,
            numDataPoints=30,
            modelSpecs=model,
            forTrainValidation=False)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        for onebatch, names4onebatch in zip(predSeqData, names):
            batchInputs = onebatch[:len(inputVariables)]
            result4prob, result = predict(*batchInputs)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = batchInputs[0:2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ##result4prob has shape (batchSize, maxSeqLen, sum( responseProbDims[res] for res in modelSpecs['responses'])  )
            assert result4prob.shape[2] == sum([
                config.responseProbDims[Response2LabelType(res)]
                for res in model['responses']
            ])

            ##result has shape (batchSize, maxSeqLen, sum( responseValueDims[res] for res in modelSpecs['responses'])  )
            assert result.shape[2] == sum([
                config.responseValueDims[Response2LabelType(res)]
                for res in model['responses']
            ])

            for name in names4onebatch:
                if name not in allresults:
                    allresults[name] = dict()
                    allresults4prob[name] = dict()

            _AppendUnmaskedSlices(allresults4prob, result4prob,
                                  config.responseProbDims, model['responses'],
                                  names4onebatch, seqLens, maxSeqLen)
            _AppendUnmaskedSlices(allresults, result,
                                  config.responseValueDims, model['responses'],
                                  names4onebatch, seqLens, maxSeqLen)

    ## calculate the final result, which is the average of all the predicted properties for the same protein and response name
    finalresults = dict()
    for name, results in allresults.items():
        finalresults[name] = dict()
        for response, predictions in results.items():
            ## discrete responses are filled in below from the averaged probabilities
            if Response2LabelType(response).startswith('Discrete'):
                continue
            finalresults[name][response] = np.average(predictions, axis=0)

    finalresults4prob = dict()
    for name, results in allresults4prob.items():
        finalresults4prob[name] = dict()
        for response, predictions in results.items():
            avgProb = np.average(predictions, axis=0)
            finalresults4prob[name][response] = avgProb

            ##convert coding of discrete labels to more meaningful representation
            if Response2LabelType(response).startswith('Discrete'):
                finalresults[name][response] = PropertyUtils.Coding2String(
                    np.argmax(avgProb, axis=1), response)

    ## NOTE: averaging of the per-model label distributions and label weights
    ## was sketched here previously but is disabled; nothing is returned for it.

    return finalresults4prob, finalresults, allsequences

Beispiel #2

Datei anzeigen

Datei: TrainPropertyPredictor.py Projekt: zhujianwei31415/RaptorX-3DModeling

def main(argv):

    #modelSpecs = config.InitializeModelSpecs()
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    ##trainData and validData are a list. Each element corresponds to one protein, which is a dict()
    trainData = DataProcessor.LoadPropertyFeatures(modelSpecs['trainFile'],
                                                   modelSpecs=modelSpecs)
    validData = DataProcessor.LoadPropertyFeatures(modelSpecs['validFile'],
                                                   modelSpecs=modelSpecs)
    print '#trainData: ', len(trainData), '#validData: ', len(validData)

    ## where to add code to assign weight to each residue? We need to deal with the residues without 3D coordinates for angle and SS prediction
    ##a, b = DataProcessor.CalcLabelDistributionAndWeight(trainData, modelSpecs)

    modelSpecs['numOfTrainProteins'] = len(trainData)

    beforeBatchTime = datetime.datetime.now()
    print 'time spent on data loading: ', beforeBatchTime - startTime

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    trainSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=trainData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    validSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=validData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    #validSeqDataset = DataProcessor.SplitData2Batches(data=validData, numDataPoints=20000, modelSpecs=modelSpecs)
    print "#trainData minibatches:", len(
        trainSeqDataset), "#validData minibatches:", len(validSeqDataset)

    predSeqDataset = None
    if modelSpecs['predFile'] is not None:
        predData = DataProcessor.LoadPropertyFeatures(modelSpecs['predFile'],
                                                      modelSpecs=modelSpecs,
                                                      forTrainValidation=False)
        print '#predData: ', len(predData)
        predSeqDataset, _ = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=40, modelSpecs=modelSpecs)
        print "#predData minibatches:", len(predSeqDataset)

## Each protein in trainData contains three or four components: seqFeatures and label
    modelSpecs['n_in_seq'] = trainData[0]['seqFeatures'].shape[1]

    beforeTrainTime = datetime.datetime.now()

    print 'time spent on generating batch data:', beforeTrainTime - beforeBatchTime

    result = TrainModel(modelSpecs=modelSpecs,
                        trainSeqData=trainSeqDataset,
                        validSeqData=validSeqDataset,
                        predSeqData=predSeqDataset)

    ##merge ModelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)