def testInitialBiasAndWeightsAffectResult(prostateDataset):
    """Verify that supplying initial weights and biases changes Deep Learning output.

    Two ``H2ODeepLearning`` estimators are created with the same settings.
    Only the second one receives initial weights and biases; the transformed
    testing frames of both runs are then asserted to have different values.
    """
    trainingFrame, testingFrame = prostateDataset.randomSplit([0.9, 0.1], 1)

    def newDeepLearning():
        # Same configuration for both runs, so the initial weights/biases
        # set below are the only difference between the two estimators.
        return H2ODeepLearning(
            seed=42,
            reproducible=True,
            labelCol="CAPSULE",
            featuresCols=["AGE", "RACE", "DPROS", "DCAPS"],
            hidden=[3],
        )

    # Baseline run: estimator used exactly as created.
    baselinePredictions = newDeepLearning().fit(trainingFrame).transform(testingFrame)

    # Second run: same estimator definition plus explicit initial weights and biases.
    customised = newDeepLearning()
    initialWeights = [
        DenseMatrix(
            3, 4,
            [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 0.6],
            False),
        DenseMatrix(1, 3, [0.2, 0.3, 0.4], False),
    ]
    initialBiases = [DenseVector([0.1, 0.2, 0.3]), DenseVector([0.1])]
    customised.setInitialWeights(initialWeights)
    customised.setInitialBiases(initialBiases)
    customisedPredictions = customised.fit(trainingFrame).transform(testingFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        baselinePredictions, customisedPredictions)
Ejemplo n.º 2
0
def testBetaConstraintsAffectResult(spark, prostateDataset):
    """Verify that setting beta constraints changes the output of an H2OGAM model.

    A reference GAM and a second GAM with the same definition are fitted on
    the same training split. The second one additionally receives a beta
    constraints data frame with one row per feature column. The transformed
    testing frames are asserted to have different values.

    :param spark: Spark session used to build the beta constraints data frame.
    :param prostateDataset: Data frame that is split into training and testing parts.
    """
    [trainingDataset, testingDataset] = prostateDataset.randomSplit([0.9, 0.1],
                                                                    1)
    featuresCols = ["DPROS", "DCAPS", "RACE", "GLEASON"]

    def createInitialGamDefinition():
        return H2OGAM(featuresCols=featuresCols,
                      labelCol="CAPSULE",
                      seed=1,
                      splitRatio=0.8,
                      gamCols=["PSA", "AGE"])

    referenceGam = createInitialGamDefinition()
    referenceModel = referenceGam.fit(trainingDataset)
    referenceResult = referenceModel.transform(testingDataset)

    # Build a concrete list (not a lazy ``map`` iterator) so the rows handed to
    # createDataFrame are materialised and can be inspected while debugging.
    betaConstraints = [(feature, -1000, 1000, 1, 0.2)
                       for feature in featuresCols]
    betaConstraintsFrame = spark.createDataFrame(
        betaConstraints,
        ['names', 'lower_bounds', 'upper_bounds', 'beta_given', 'rho'])

    gam = createInitialGamDefinition()
    gam.setBetaConstraints(betaConstraintsFrame)
    model = gam.fit(trainingDataset)
    result = model.transform(testingDataset)

    unit_test_utils.assert_data_frames_have_different_values(
        referenceResult, result)
Ejemplo n.º 3
0
def testPlugValuesAffectResult(spark, carsDatasetPath):
    """Verify that the "PlugValues" missing-values handling changes H2OGLM output.

    The cars CSV is loaded with a header and inferred schema, and the
    ``economy_20mpg`` column is cast to string. A reference GLM and a GLM
    configured with plug values are fitted on the same training split; their
    transformed testing frames are asserted to have different values.
    """
    cars = spark.read.csv(carsDatasetPath, header=True, inferSchema=True)
    cars = cars.withColumn("economy_20mpg", cars.economy_20mpg.cast("string"))
    trainingFrame, testingFrame = cars.randomSplit([0.9, 0.1], 1)

    def newGlm():
        # Same definition for both the reference and the plug-values run.
        return H2OGLM(
            featuresCols=[
                "economy",
                "displacement",
                "power",
                "weight",
                "acceleration",
                "year",
                "economy_20mpg",
            ],
            labelCol="cylinders",
            seed=1,
            splitRatio=0.8,
        )

    # Reference run without any plug values.
    referencePredictions = newGlm().fit(trainingFrame).transform(testingFrame)

    # One plug value per feature column passed to the estimator.
    plugValues = {
        "economy": 1.1,
        "displacement": 2.2,
        "power": 3.3,
        "weight": 4.4,
        "acceleration": 5.5,
        "year": 2000,
        "economy_20mpg": "0",
    }
    pluggedGlm = newGlm()
    pluggedGlm.setMissingValuesHandling("PlugValues")
    pluggedGlm.setPlugValues(plugValues)
    pluggedPredictions = pluggedGlm.fit(trainingFrame).transform(testingFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        referencePredictions, pluggedPredictions)
Ejemplo n.º 4
0
def testH2OAutoMLClassifierBehavesDiffenrentlyThanH2OAutoMLRegressor(prostateDataset):
    """Verify that the AutoML regressor and classifier produce different outputs.

    Both estimators are passed through ``setParametersForTesting``, fitted on
    the same training split and applied to the same testing split. The
    ``detailed_prediction`` column is dropped before the frames are compared.
    """
    trainingFrame, testingFrame = prostateDataset.randomSplit([0.9, 0.1], 42)

    def fitAndScore(estimator):
        # Shared pipeline: configure, fit, transform, drop the detailed column.
        fitted = setParametersForTesting(estimator).fit(trainingFrame)
        return fitted.transform(testingFrame).drop("detailed_prediction")

    regressionPredictions = fitAndScore(H2OAutoMLRegressor())
    classificationPredictions = fitAndScore(H2OAutoMLClassifier())

    unit_test_utils.assert_data_frames_have_different_values(
        regressionPredictions, classificationPredictions)
Ejemplo n.º 5
0
def testGridSearchWithDRFClassifierBehavesDiffenrentlyThanGridSearchWithDRFRegressor(
        prostateDataset):
    """Verify that grid search over a DRF regressor and a DRF classifier differ.

    Each algorithm is wrapped by ``createGridForProblemSpecificTesting``,
    fitted on the same training split and applied to the same testing split.
    The ``detailed_prediction`` column is dropped before the comparison.
    """
    trainingFrame, testingFrame = prostateDataset.randomSplit([0.9, 0.1], 42)

    def scoreWithGrid(algo):
        # Shared pipeline: build the grid, fit, transform, drop the detailed column.
        gridModel = createGridForProblemSpecificTesting(algo).fit(trainingFrame)
        scored = gridModel.transform(testingFrame)
        return scored.drop("detailed_prediction")

    regressionPredictions = scoreWithGrid(H2ODRFRegressor())
    classificationPredictions = scoreWithGrid(H2ODRFClassifier())

    unit_test_utils.assert_data_frames_have_different_values(
        regressionPredictions, classificationPredictions)
Ejemplo n.º 6
0
def testCalibrationDataFrameCauseGenerationOfCalibratedProbabilities(prostateDataset):
    """Verify that a calibration data frame yields calibrated probabilities.

    ``CAPSULE`` is cast to string and the data is split into training,
    testing and calibration parts. An ``H2ODRF`` with ``calibrateModel=True``
    and the calibration frame is fitted, and the ``probabilities`` and
    ``calibratedProbabilities`` fields of ``detailed_prediction`` are asserted
    to have different values.
    """
    labelled = prostateDataset.withColumn("CAPSULE", prostateDataset.CAPSULE.cast("string"))
    trainingFrame, testingFrame, calibrationFrame = labelled.randomSplit([0.9, 0.05, 0.05], 1)

    drf = H2ODRF(
        featuresCols=["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
        withDetailedPredictionCol=True,
        calibrateModel=True,
        calibrationDataFrame=calibrationFrame,
    )

    # The scored frame is read by both selects below, hence the cache().
    scored = drf.fit(trainingFrame).transform(testingFrame).cache()

    rawColumns = [
        "ID",
        "detailed_prediction.probabilities.0",
        "detailed_prediction.probabilities.1",
    ]
    calibratedColumns = [
        "ID",
        "detailed_prediction.calibratedProbabilities.0",
        "detailed_prediction.calibratedProbabilities.1",
    ]
    rawProbabilities = scored.select(*rawColumns)
    calibratedProbabilities = scored.select(*calibratedColumns)

    unit_test_utils.assert_data_frames_have_different_values(
        rawProbabilities, calibratedProbabilities)
Ejemplo n.º 7
0
def testInteractionConstraintsAffectResult(spark, prostateDataset):
    """Verify that interaction constraints change the output of an H2OXGBoost model.

    A reference estimator and a second estimator with the same definition are
    fitted on the same training split. Only the second one has interaction
    constraints set. The transformed testing frames are asserted to have
    different values.
    """
    trainingFrame, testingFrame = prostateDataset.randomSplit([0.9, 0.1], 1)
    featureCols = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

    def newXGBoost():
        # Same definition for both the reference and the constrained run.
        return H2OXGBoost(
            featuresCols=featureCols,
            labelCol="CAPSULE",
            seed=1,
            splitRatio=0.8,
        )

    # Reference run without interaction constraints.
    unconstrainedPredictions = newXGBoost().fit(trainingFrame).transform(testingFrame)

    # Constrained run: two groups of feature names passed as constraints.
    constrained = newXGBoost()
    constrained.setInteractionConstraints(
        [["DPROS", "DCAPS"], ["PSA", "VOL", "GLEASON"]])
    constrainedPredictions = constrained.fit(trainingFrame).transform(testingFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        unconstrainedPredictions, constrainedPredictions)
Ejemplo n.º 8
0
 def _fit(self, dataset):
     """Check the data frame passed to fit and return a ``DummyTransformer``.

     The incoming ``dataset`` is asserted to be identical to ``expected`` and
     to have different values than ``unexpected``.

     NOTE(review): ``expected`` and ``unexpected`` are not defined in this
     method; presumably they are captured from the enclosing test scope.
     Confirm against the surrounding code.
     """
     unit_test_utils.assert_data_frames_are_identical(expected, dataset)
     unit_test_utils.assert_data_frames_have_different_values(unexpected, dataset)
     return DummyTransformer()