def run(dataName, foldId, imputationMethod, proposedMethod):
    EPOCHS = 2000
    NUMBER_OF_CV_FOLDS = 10
    ALL_WEIGHT_REG_CANDIDATES = [1.0, 0.1, 0.01, 0.001, 0.0001]
    ALL_TRANSFORM_REG_CANDIDATES = [1.0, 0.1, 0.01, 0.001, 0.0]
    # ALL_WEIGHT_REG_CANDIDATES = [1.0]
    # ALL_TRANSFORM_REG_CANDIDATES = [1.0]
    NR_JOBS = 1

    if proposedMethod:
        createModel = createModelProposed
    else:
        createModel = createModelLogReg

    modelForCV = KerasClassifier(build_fn=createModel, epochs=EPOCHS, verbose=True)

    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
        dataName, None, foldId, imputationMethod)

    # grid search over both regularization strengths
    parameters = {"classWeightRegularizer": ALL_WEIGHT_REG_CANDIDATES,
                  "transformWeightRegularizer": ALL_TRANSFORM_REG_CANDIDATES}
    gridSearchObj = sklearn.model_selection.GridSearchCV(
        modelForCV, parameters, scoring=myScorer, cv=NUMBER_OF_CV_FOLDS, n_jobs=NR_JOBS,
        return_train_score=True)  # needed for "mean_train_score" below
    gridSearchObj.fit(trainData, trainLabels)

    cvResult = pandas.DataFrame.from_dict(gridSearchObj.cv_results_)
    meanScoresEval = cvResult["mean_test_score"].to_numpy()
    bestId = numpy.argmax(meanScoresEval)
    bestWeightParam = cvResult.loc[bestId, "param_classWeightRegularizer"]
    bestTransformParam = cvResult.loc[bestId, "param_transformWeightRegularizer"]
    meanScoresTrain = cvResult["mean_train_score"].to_numpy()

    # retrain on the full training set with the best hyperparameters
    finalModel = createModel(transformWeightRegularizer=bestTransformParam,
                             classWeightRegularizer=bestWeightParam)
    finalModel.fit(trainData, trainLabels, epochs=EPOCHS, verbose=True)

    aucTest, logLikelihood = evaluation.eval_NN(finalModel, testData, testLabels)

    return logLikelihood, meanScoresEval[bestId], meanScoresTrain[bestId], aucTest, bestWeightParam, bestTransformParam
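# The grid search above passes a custom scorer `myScorer`, which is defined
# elsewhere in the repository. The sketch below only illustrates the kind of
# callable GridSearchCV accepts, assuming the quantity of interest is the
# held-out log-likelihood; the name `myScorerSketch` is hypothetical and the
# real `myScorer` may be defined differently.
import sklearn.metrics

def myScorerSketch(estimator, X, y):
    # higher is better: the negative log-loss equals the average log-likelihood
    predictedProbs = estimator.predict_proba(X)
    return -sklearn.metrics.log_loss(y, predictedProbs)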
def evalOneFold(foldId):
    definedFeatureCosts = realdata.getFeaturesCosts(dataName)

    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
        dataName, None, foldId, imputationMethod)

    if USE_UNLABELED_DATA:
        assert unlabeledData.shape[0] > 0
    else:
        unlabeledData = numpy.zeros((0, trainData.shape[1]))

    allData = numpy.vstack((trainData, unlabeledData))
    assert definedFeatureCosts.shape[0] == allData.shape[1]

    print("training data size = ", trainData.shape[0])
    print("unlabeled data size = ", unlabeledData.shape[0])
    print("test data size = ", testData.shape[0])
    print("*****************************")
    print("foldId = ", foldId)
    print("*****************************")

    if FULL_MODEL:
        # use all covariates
        bestFixedFeatures = numpy.arange(trainData.shape[1])
        bestModel, misclassificationCosts, totalCostEstimate = prepareFeatureSets.getPredictionModelsAndCosts(
            trainData, trainLabels, bestFixedFeatures, definedFeatureCosts, falsePositiveCost,
            targetRecall, useTargetRecall=True, falseNegativeCost=None,
            classificationModelName=classificationModelName)
    else:
        if USE_L1:
            # candidate feature sets ordered along the L1-regularization path
            allFeatureSetsInOrder, _ = prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg(
                trainData, trainLabels, unlabeledData, None, definedFeatureCosts)
        else:
            print("NOT YET SUPPORTED !!")
            assert False
            # allFeatureSetsInOrder, allEstimatedTotalCosts = prepareFeatureSets.getAllFeatureSetsInOrderWithGreedyMethod(
            #     trainData, trainLabels, unlabeledData, misclassificationCosts, definedFeatureCosts)

        print("GET ALL PREDICTION MODELS AND DETERMINE FALSE NEGATIVE COSTS: ")
        allPredictionModels, allMisclassificationCosts, allEstimatedTotalCosts = prepareFeatureSets.getAllPredictionModelsAndCosts(
            trainData, trainLabels, allFeatureSetsInOrder, definedFeatureCosts, falsePositiveCost,
            targetRecall, useTargetRecall=True, falseNegativeCost=None,
            classificationModelName=classificationModelName)

        # pick the feature set with the lowest estimated total cost
        bestModelId = numpy.argmin(allEstimatedTotalCosts)
        bestModel = allPredictionModels[bestModelId]
        misclassificationCosts = allMisclassificationCosts[bestModelId]
        bestFixedFeatures = allFeatureSetsInOrder[bestModelId]

    return evaluation.getOverallPerformance_fixedCovariateSet(
        bestModel, testData, testLabels, definedFeatureCosts, misclassificationCosts,
        bestFixedFeatures, targetRecall)
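# `prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg` is implemented
# elsewhere in the repository. The function below is a hypothetical sketch of
# the general L1-path idea it is named after (sweep the regularization strength
# of an L1-penalized logistic regression and record the active covariates at
# each strength); it is not the original implementation, and the candidate
# regularization strengths are assumptions.
import numpy
import sklearn.linear_model

def l1FeatureSetsSketch(trainData, trainLabels, regStrengths=(0.01, 0.1, 1.0, 10.0)):
    featureSets = []
    for c in regStrengths:
        model = sklearn.linear_model.LogisticRegression(penalty="l1", C=c, solver="liblinear")
        model.fit(trainData, trainLabels)
        # indices of the covariates with non-zero coefficients at this strength
        activeFeatures = numpy.where(model.coef_[0] != 0.0)[0]
        featureSets.append(activeFeatures)
    return featureSets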
def play(dataName, foldId, imputationMethod):
    EPOCHS = 1000
    # BATCH_SIZE = 100

    createModel = createModelProposed
    # createModel = createModelLogReg

    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
        dataName, None, foldId, imputationMethod)

    finalModel = createModel(transformWeightRegularizer=0.1, classWeightRegularizer=0.001,
                             nrTransformationUnits=10, learningRate=0.01)
    finalModel.fit(trainData, trainLabels, epochs=EPOCHS, verbose=True)

    aucTest, logLikelihoodTest = evaluation.eval_NN(finalModel, testData, testLabels)

    print("TRAIN DATA:")
    aucTrain, logLikelihoodTrain = evaluation.eval_NN(finalModel, trainData, trainLabels)
    print("auc = ", aucTrain)
    print("logLikelihood = ", logLikelihoodTrain)
    print("TEST DATA:")
    print("auc = ", aucTest)
    print("logLikelihood = ", logLikelihoodTest)

    assert False
    return
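# `createModelProposed` and `createModelLogReg` are defined elsewhere in the
# repository. Purely as an illustration of the kind of builder that
# KerasClassifier(build_fn=...) and the call in play() expect, here is a
# hypothetical logistic-regression-style builder; its name, default arguments,
# and choice of optimizer are assumptions, not the original code.
import keras

def createModelLogRegSketch(classWeightRegularizer=0.01, learningRate=0.01, inputDim=10):
    model = keras.models.Sequential()
    # one sigmoid output unit with L2 weight decay = regularized logistic regression
    model.add(keras.layers.Dense(
        1, input_dim=inputDim, activation="sigmoid",
        kernel_regularizer=keras.regularizers.l2(classWeightRegularizer)))
    model.compile(loss="binary_crossentropy",
                  optimizer=keras.optimizers.Adam(lr=learningRate),  # older Keras API; newer versions use learning_rate=
                  metrics=["accuracy"])
    return model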
with open(constants.MODEL_FOLDERNAME + trainedModelsFilenameGreedy + "_features", "rb") as f:
    allFeatureArraysInOrderGreedy_allFolds = pickle.load(f)

startTime = time.time()

runTimesAllFolds = numpy.zeros(constants.NUMBER_OF_FOLDS)

for foldId in range(constants.NUMBER_OF_FOLDS):

    # *******************************************************************************************
    # get feature sets, prediction models, and predictedTrueProbs (on training data)
    # *******************************************************************************************
    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
        dataName, None, foldId, constants.IMPUTATION_METHOD)

    if USE_UNLABELED_DATA:
        assert unlabeledData.shape[0] > 0
    else:
        unlabeledData = numpy.zeros((0, trainData.shape[1]))

    allData = numpy.vstack((trainData, unlabeledData))
    assert definedFeatureCosts.shape[0] == allData.shape[1]

    print("GET ALL PREDICTION MODELS AND DETERMINE FALSE NEGATIVE COSTS: ")

    # ************************************************************
testOperationCostsAllFolds_exactRecall = numpy.zeros(constants.NUMBER_OF_FOLDS)

for testFoldId in range(constants.NUMBER_OF_FOLDS):
    allResultsInMatlab = scipy.io.loadmat(
        experimentSettingBaselines.MATLAB_FOLDER_RESULTS_GREEDY_MISER + dataName
        + "_" + str(int(falsePositiveCost)) + "_forFinalTrainingAndTesting_"
        + str(testFoldId) + "_allResults_" + "asymmetric")

    avgFeatureCosts_allTrees = (allResultsInMatlab['allTotalCost'].transpose())[0]
    scores_allTrees = allResultsInMatlab['allScores']
    assert avgFeatureCosts_allTrees.shape[0] == scores_allTrees.shape[1]

    bestTreeId = allBestSettings[testFoldId, 1]

    predictedTestLabels = evaluation.getLabelsFromGreedyScores(scores_allTrees[:, bestTreeId])
    predictedTestTrueLabelProbs = evaluation.getProbabilitiesFromGreedyScores(scores_allTrees[:, bestTreeId])
    avgTestFeatureCosts = avgFeatureCosts_allTrees[bestTreeId]

    _, _, _, _, testLabels = realdata.loadSubset(dataName, None, testFoldId, constants.IMPUTATION_METHOD)

    # just to ensure that it is really the average and not a sum over all samples
    assert avgTestFeatureCosts <= numpy.sum(definedFeatureCosts)

    testFeatureCostsAllFolds[testFoldId] = avgTestFeatureCosts

# misclassification cost matrix, indexed as [trueClass, predictedClass]
misclassificationCosts = numpy.zeros((2, 2))
misclassificationCosts[0, 1] = falsePositiveCost
misclassificationCosts[1, 0] = falseNegativeCost
misclassificationCosts[0, 0] = sameClassCost
misclassificationCosts[1, 1] = sameClassCost

NUMBER_OF_FOLDS = 5

definedFeatureCosts = realdata.getFeaturesCosts(dataName)

testTotalCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
testFeatureCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
testMisClassificationCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
testAccuracyAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
testAUCAllFolds = numpy.zeros(NUMBER_OF_FOLDS)

for foldId in range(NUMBER_OF_FOLDS):
    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
        dataName, None, foldId, imputationMethod)

    selectedFeatureIds = numpy.arange(trainData.shape[1])

    # allFeatureArraysInOrder, _ = prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg(
    #     trainData, trainLabels, unlabeledData, None, definedFeatureCosts)
    allFeatureArraysInOrder = evaluation.nonLinearFeatureSelection_withGAM(
        trainData, trainLabels, definedFeatureCosts)
    allFeatureArraysInOrder = prepareFeatureSets.filterToEnsureSetInclusionOrder(
        allFeatureArraysInOrder)

    print("found covariate sets = ")
    for i in range(len(allFeatureArraysInOrder)):
        print("covariateIds = " + str(allFeatureArraysInOrder[i]) + " | expected total costs = ?")

    assert False
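# `prepareFeatureSets.filterToEnsureSetInclusionOrder` is implemented elsewhere
# in the repository. Below is a hypothetical sketch of the filtering idea,
# assuming the goal is to keep only those candidate feature sets that form a
# nested chain (each kept set contains the previously kept one); the real
# implementation may differ.
def filterToNestedChainSketch(allFeatureArrays):
    nestedFeatureArrays = []
    for featureArray in allFeatureArrays:
        currentSet = set(featureArray)
        # keep the set only if it contains everything kept so far
        if len(nestedFeatureArrays) == 0 or set(nestedFeatureArrays[-1]).issubset(currentSet):
            nestedFeatureArrays.append(featureArray)
    return nestedFeatureArrays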
dataName + "_" + str(int(falsePositiveCost)) + "_forFinalTrainingAndTesting_" + str(testFoldId) + "_allResults_" + str(targetRecall) + "targetRecall") avgFeatureCosts_allTrees = ( allResultsInMatlab['allTotalCost'].transpose())[0] scores_allTrees = allResultsInMatlab['allScores'] assert ( avgFeatureCosts_allTrees.shape[0] == scores_allTrees.shape[1]) bestTreeId = allBestSettings[testFoldId, 1] predictedTestTrueLabelProbs = evaluation.getProbabilitiesFromGreedyScores( scores_allTrees[:, bestTreeId]) avgTestFeatureCosts = avgFeatureCosts_allTrees[bestTreeId] _, _, _, _, testLabels = realdata.loadSubset( dataName, None, testFoldId, imputationMethod) assert ( avgTestFeatureCosts <= numpy.sum(definedFeatureCosts) ) # just to ensure that it is really the average and not a sum over all samples threshold = allThresholds[testFoldId] # set to same recall as proposed method to allow for fair comparison targetRecall_fromProposedMethod = evaluation.getTargetRecallFromProposedMethod( dataName, falsePositiveCost, targetRecall) threshold_forExactRecall = evaluation.getThresholdFromPredictedProbabilities( testLabels, predictedTestTrueLabelProbs, targetRecall_fromProposedMethod) testRecallAllFolds_exactRecall[testFoldId] = evaluation.getRecall( testLabels, predictedTestTrueLabelProbs,