def run(dataName, foldId, imputationMethod, proposedMethod):
    
    EPOCHS = 2000
    NUMBER_OF_CV_FOLDS = 10
    ALL_WEIGHT_REG_CANDIDATES = [1.0, 0.1, 0.01, 0.001, 0.0001]
    ALL_TRANSFORM_REG_CANDIDATES = [1.0, 0.1, 0.01, 0.001, 0.0]
    
    # ALL_WEIGHT_REG_CANDIDATES = [1.0]
    # ALL_TRANSFORM_REG_CANDIDATES = [1.0]
    
    NR_JOBS = 1


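    # Choose the model builder: either the proposed model or the plain logistic regression baseline.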
    if proposedMethod:
        createModel = createModelProposed
    else:
        createModel = createModelLogReg
    
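    # Wrap the Keras model builder so that scikit-learn's GridSearchCV can use it.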
    modelForCV = KerasClassifier(build_fn=createModel, epochs=EPOCHS, verbose=True)
    
    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(dataName, None, foldId, imputationMethod)

    parameters = {"classWeightRegularizer" : ALL_WEIGHT_REG_CANDIDATES, "transformWeightRegularizer" : ALL_TRANSFORM_REG_CANDIDATES}
    gridSearchObj = sklearn.model_selection.GridSearchCV(modelForCV, parameters, scoring = myScorer, cv = NUMBER_OF_CV_FOLDS, n_jobs = NR_JOBS, return_train_score = True)  # return_train_score is required for "mean_train_score" below
    gridSearchObj.fit(trainData, trainLabels)
    
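    # Pick the hyper-parameter combination with the highest mean cross-validation score.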
    cvResult = pandas.DataFrame.from_dict(gridSearchObj.cv_results_)
    meanScoresEval = cvResult["mean_test_score"].to_numpy()
    bestId = numpy.argmax(meanScoresEval)
    
    bestWeightParam = cvResult.loc[bestId, "param_classWeightRegularizer"]
    bestTransformParam = cvResult.loc[bestId, "param_transformWeightRegularizer"]
    meanScoresTrain = cvResult["mean_train_score"].to_numpy()
    
    
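    # Retrain on the full training set with the selected regularization strengths,
    # then evaluate on the held-out test fold.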
    finalModel = createModel(transformWeightRegularizer=bestTransformParam, classWeightRegularizer=bestWeightParam)
    finalModel.fit(trainData, trainLabels, epochs=EPOCHS, verbose=True)
    aucTest, logLikelihood = evaluation.eval_NN(finalModel, testData, testLabels)
    
#     print("bestWeightParam = ")
#     print(bestWeightParam)
#     print("meanScores = ")
#     print(meanScores)
#     print("TRAIN DATA:")
#     auc, logLikelihood = evaluation.eval_NN(finalModel, trainData, trainLabels)
#     print("auc = ", auc)
#     print("logLikelihood = ", logLikelihood)
         
        
    # print("TEST DATA:")
    # print("auc = ", aucTest)
    # print("logLikelihood = ", logLikelihood)
    
#     print("average training score = ", meanScoresTrain[bestId])
#     print("average eval score = ", meanScoresEval[bestId])
#     print("test score = ", logLikelihood)
    
    return logLikelihood, meanScoresEval[bestId], meanScoresTrain[bestId], aucTest, bestWeightParam, bestTransformParam
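
# Example invocation (illustrative only; the dataset name, fold id, and imputation
# method must match what realdata.loadSubset expects):
# logLik, cvScore, trainScore, aucTest, bestW, bestT = run("someDataset", 0, "mean", proposedMethod=True)
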
def evalOneFold(foldId):
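    # NOTE: evalOneFold relies on names defined outside this function
    # (dataName, imputationMethod, USE_UNLABELED_DATA, FULL_MODEL, USE_L1,
    # falsePositiveCost, targetRecall, classificationModelName).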
     
    definedFeatureCosts = realdata.getFeaturesCosts(dataName)

    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(dataName, None, foldId, imputationMethod)

    if USE_UNLABELED_DATA:
        assert(unlabeledData.shape[0] > 0)
    else:
        unlabeledData = numpy.zeros((0, trainData.shape[1]))

    allData = numpy.vstack((trainData, unlabeledData))

    assert(definedFeatureCosts.shape[0] == allData.shape[1])

    print("training data size = ", trainData.shape[0])
    print("unlabeled data size = ", unlabeledData.shape[0])
    print("test data size = ", testData.shape[0])

    print("*****************************")
    print("foldId = ", foldId)
    print("*****************************")

    if FULL_MODEL:
        bestFixedFeatures = numpy.arange(trainData.shape[1])
        # print("bestFixedFeatures = ", bestFixedFeatures)
        # assert(False)
        bestModel, misclassificationCosts, totalCostEstimate = prepareFeatureSets.getPredictionModelsAndCosts(trainData, trainLabels, bestFixedFeatures, definedFeatureCosts, falsePositiveCost, targetRecall, useTargetRecall = True, falseNegativeCost = None, classificationModelName = classificationModelName)

    else:
        if USE_L1:
            allFeatureSetsInOrder, _ = prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg(trainData, trainLabels, unlabeledData, None, definedFeatureCosts)
        else:
            print("NOT YET SUPPORTED !!")
            assert(False)
            # allFeatureSetsInOrder, allEstimatedTotalCosts = prepareFeatureSets.getAllFeatureSetsInOrderWithGreedyMethod(trainData, trainLabels, unlabeledData, misclassificationCosts, definedFeatureCosts)

        print("GET ALL PREDICTION MODEL AND DETERMINE FALSE NEGATIVE COSTS: ")
        allPredictionModels, allMisclassificationCosts, allEstimatedTotalCosts = prepareFeatureSets.getAllPredictionModelsAndCosts(trainData, trainLabels, allFeatureSetsInOrder, definedFeatureCosts, falsePositiveCost, targetRecall, useTargetRecall = True, falseNegativeCost = None, classificationModelName = classificationModelName)

        bestModelId = numpy.argmin(allEstimatedTotalCosts)
        bestModel = allPredictionModels[bestModelId]
        misclassificationCosts = allMisclassificationCosts[bestModelId]
        bestFixedFeatures = allFeatureSetsInOrder[bestModelId]

    return evaluation.getOverallPerformance_fixedCovariateSet(bestModel, testData, testLabels, definedFeatureCosts, misclassificationCosts, bestFixedFeatures, targetRecall)

def play(dataName, foldId, imputationMethod):
    EPOCHS = 1000
    # BATCH_SIZE=100
    
    createModel = createModelProposed
    # createModel = createModelLogReg
    
    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(dataName, None, foldId, imputationMethod)
    
    
    finalModel = createModel(transformWeightRegularizer=0.1, classWeightRegularizer=0.001, nrTransformationUnits = 10, learningRate = 0.01)
    finalModel.fit(trainData, trainLabels, epochs=EPOCHS, verbose=True)
    aucTest, logLikelihoodTest = evaluation.eval_NN(finalModel, testData, testLabels)

    print("TRAIN DATA:")
    aucTrain, logLikelihoodTrain = evaluation.eval_NN(finalModel, trainData, trainLabels)
    print("auc = ", aucTrain)
    print("logLikelihood = ", logLikelihoodTrain)
         
    print("TEST DATA:")
    print("auc = ", aucTest)
    print("logLikelihood = ", logLikelihoodTest)
    assert(False)
    return
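
# Illustrative call only; the dataset name and imputation method are placeholders
# that must match what realdata.loadSubset provides:
# play("someDataset", 0, "mean")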

# Example 4

            with open(
                    constants.MODEL_FOLDERNAME + trainedModelsFilenameGreedy +
                    "_features", "rb") as f:
                allFeatureArraysInOrderGreedy_allFolds = pickle.load(f)

        startTime = time.time()

        runTimesAllFolds = numpy.zeros(constants.NUMBER_OF_FOLDS)

        for foldId in range(constants.NUMBER_OF_FOLDS):

            # *******************************************************************************************************************
            # ********************************* get feature sets, prediction models, and predictedTrueProbs(on training data)  ******************************
            # *******************************************************************************************************************

            trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
                dataName, None, foldId, constants.IMPUTATION_METHOD)

            if USE_UNLABELED_DATA:
                assert (unlabeledData.shape[0] > 0)
            else:
                unlabeledData = numpy.zeros((0, trainData.shape[1]))

            allData = numpy.vstack((trainData, unlabeledData))

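            # Sanity check: a feature cost must be defined for every column of the combined data matrix.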
            assert (definedFeatureCosts.shape[0] == allData.shape[1])

            print(
                "GET ALL PREDICTION MODEL AND DETERMINE FALSE NEGATIVE COSTS: "
            )

            # ************************************************************
    testOperationCostsAllFolds_exactRecall = numpy.zeros(constants.NUMBER_OF_FOLDS)

    for testFoldId in range(constants.NUMBER_OF_FOLDS):

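        # Load the per-tree results of the Greedy Miser baseline that were computed in MATLAB for this test fold.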
        allResultsInMatlab = scipy.io.loadmat(experimentSettingBaselines.MATLAB_FOLDER_RESULTS_GREEDY_MISER + dataName + "_" + str(int(falsePositiveCost)) + "_forFinalTrainingAndTesting_" + str(testFoldId) + "_allResults_" + "asymmetric")
        avgFeatureCosts_allTrees = (allResultsInMatlab['allTotalCost'].transpose())[0]
        scores_allTrees = allResultsInMatlab['allScores']

        assert(avgFeatureCosts_allTrees.shape[0] == scores_allTrees.shape[1])

        bestTreeId = allBestSettings[testFoldId, 1]
        predictedTestLabels = evaluation.getLabelsFromGreedyScores(scores_allTrees[:, bestTreeId])
        predictedTestTrueLabelProbs = evaluation.getProbabilitiesFromGreedyScores(scores_allTrees[:, bestTreeId])
        avgTestFeatureCosts = avgFeatureCosts_allTrees[bestTreeId]

        _, _, _, _, testLabels = realdata.loadSubset(dataName, None, testFoldId, constants.IMPUTATION_METHOD)

        assert(avgTestFeatureCosts <= numpy.sum(definedFeatureCosts))  # just to ensure that it is really the average and not a sum over all samples

        # testOperationCostsAllFolds_exactRecall[testFoldId], testFDRAllFolds_exactRecall[testFoldId], testRecallAllFolds_exactRecall[testFoldId] = evaluation.getResultsAtTargetRecall(falsePositiveCost, targetRecall, testLabels, predictedTestTrueLabelProbs, avgTestFeatureCosts)
        # threshold_forExactRecall = evaluation.getThresholdFromPredictedProbabilities(testLabels, predictedTestTrueLabelProbs, targetRecall)
        # testRecallAllFolds_exactRecall[testFoldId] = evaluation.getRecall(testLabels, predictedTestTrueLabelProbs, threshold_forExactRecall)
        # testFDRAllFolds_exactRecall[testFoldId] = evaluation.getFDR(testLabels, predictedTestTrueLabelProbs, threshold_forExactRecall)
        # predictedTestLabels_atExactRecall = evaluation.getPredictedLabelsAtThreshold(threshold_forExactRecall, predictedTestTrueLabelProbs)
        # testOperationCostsAllFolds_exactRecall[testFoldId] = evaluation.getAverageOperationCosts(testLabels, predictedTestLabels_atExactRecall, avgTestFeatureCosts, falsePositiveCost)

        testFeatureCostsAllFolds[testFoldId] = avgTestFeatureCosts

        misclassificationCosts = numpy.zeros((2, 2))
        misclassificationCosts[0, 1] = falsePositiveCost
        misclassificationCosts[1, 0] = falseNegativeCost

# Example 6

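    # misclassificationCosts is presumably a 2x2 cost matrix indexed as [true class, predicted class];
    # the diagonal holds the cost of a correct prediction (sameClassCost).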
    misclassificationCosts[0, 0] = sameClassCost
    misclassificationCosts[1, 1] = sameClassCost

    NUMBER_OF_FOLDS = 5

    definedFeatureCosts = realdata.getFeaturesCosts(dataName)

    testTotalCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
    testFeatureCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
    testMisClassificationCostsAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
    testAccuracyAllFolds = numpy.zeros(NUMBER_OF_FOLDS)
    testAUCAllFolds = numpy.zeros(NUMBER_OF_FOLDS)

    for foldId in range(NUMBER_OF_FOLDS):

        trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(
            dataName, None, foldId, imputationMethod)

        selectedFeatureIds = numpy.arange(trainData.shape[1])

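        # Rank the covariate sets with the non-linear, GAM-based selection; the L1 logistic
        # regression variant is left commented out below.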
        # allFeatureArraysInOrder, _ = prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg(trainData, trainLabels, unlabeledData, None, definedFeatureCosts)
        allFeatureArraysInOrder = evaluation.nonLinearFeatureSelection_withGAM(
            trainData, trainLabels, definedFeatureCosts)
        allFeatureArraysInOrder = prepareFeatureSets.filterToEnsureSetInclusionOrder(
            allFeatureArraysInOrder)
        print("found covariate sets = ")
        for i in range(len(allFeatureArraysInOrder)):
            print("covariateIds = " + str(allFeatureArraysInOrder[i]) +
                  " | expected total costs =  ?")

        assert (False)

# Example 7

                dataName + "_" + str(int(falsePositiveCost)) +
                "_forFinalTrainingAndTesting_" + str(testFoldId) +
                "_allResults_" + str(targetRecall) + "targetRecall")
            avgFeatureCosts_allTrees = (
                allResultsInMatlab['allTotalCost'].transpose())[0]
            scores_allTrees = allResultsInMatlab['allScores']

            assert (
                avgFeatureCosts_allTrees.shape[0] == scores_allTrees.shape[1])

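            # Use the tree configuration stored as best for this fold (presumably chosen during
            # validation) together with its average feature acquisition cost.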
            bestTreeId = allBestSettings[testFoldId, 1]
            predictedTestTrueLabelProbs = evaluation.getProbabilitiesFromGreedyScores(
                scores_allTrees[:, bestTreeId])
            avgTestFeatureCosts = avgFeatureCosts_allTrees[bestTreeId]

            _, _, _, _, testLabels = realdata.loadSubset(
                dataName, None, testFoldId, imputationMethod)

            assert (
                avgTestFeatureCosts <= numpy.sum(definedFeatureCosts)
            )  # just to ensure that it is really the average and not a sum over all samples

            threshold = allThresholds[testFoldId]

            # set to same recall as proposed method to allow for fair comparison
            targetRecall_fromProposedMethod = evaluation.getTargetRecallFromProposedMethod(
                dataName, falsePositiveCost, targetRecall)
            threshold_forExactRecall = evaluation.getThresholdFromPredictedProbabilities(
                testLabels, predictedTestTrueLabelProbs,
                targetRecall_fromProposedMethod)
            testRecallAllFolds_exactRecall[testFoldId] = evaluation.getRecall(
                testLabels, predictedTestTrueLabelProbs,