def testInitialBiasAndWeightsAffectResult(prostateDataset):
    """Verify that explicit initial weights and biases change deep learning predictions.

    Two H2ODeepLearning estimators are built from the same seeded, reproducible
    definition. One is trained as-is; the other receives initial weight
    matrices and bias vectors before training. Both score the same hold-out
    split, and the two prediction frames are passed to
    ``assert_data_frames_have_different_values``.

    :param prostateDataset: data frame fixture containing at least the columns
        CAPSULE, AGE, RACE, DPROS and DCAPS.
    """
    trainingFrame, holdoutFrame = prostateDataset.randomSplit([0.9, 0.1], 1)

    def buildDeepLearning():
        # Both estimators must start from an identical definition so that any
        # difference in the output comes only from the initial weights/biases.
        return H2ODeepLearning(
            seed=42,
            reproducible=True,
            labelCol="CAPSULE",
            featuresCols=["AGE", "RACE", "DPROS", "DCAPS"],
            hidden=[3],
        )

    # Baseline: no initial weights or biases supplied.
    baselinePredictions = buildDeepLearning().fit(trainingFrame).transform(holdoutFrame)

    # Estimator under test: one weight matrix and one bias vector per layer.
    customised = buildDeepLearning()
    firstWeights = DenseMatrix(
        3, 4, [.1, .2, .3, .4, .4, .5, .6, .7, .7, .8, .9, .6], False)
    secondWeights = DenseMatrix(1, 3, [.2, .3, .4], False)
    customised.setInitialWeights([firstWeights, secondWeights])
    customised.setInitialBiases([DenseVector([.1, .2, .3]), DenseVector([.1])])
    customisedPredictions = customised.fit(trainingFrame).transform(holdoutFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        baselinePredictions, customisedPredictions)
def testBetaConstraintsAffectResult(spark, prostateDataset):
    """Verify that beta constraints change the predictions of an H2OGAM model.

    A reference GAM and a GAM with beta constraints are built from the same
    definition, trained on the same split and used to score the same hold-out
    data. The two prediction frames are passed to
    ``assert_data_frames_have_different_values``.

    :param spark: Spark session fixture used to build the constraints frame.
    :param prostateDataset: data frame fixture containing at least the columns
        CAPSULE, DPROS, DCAPS, RACE, GLEASON, PSA and AGE.
    """
    [trainingDataset, testingDataset] = prostateDataset.randomSplit([0.9, 0.1], 1)
    featuresCols = ["DPROS", "DCAPS", "RACE", "GLEASON"]

    def createInitialGamDefinition():
        # Shared definition so that the constraints are the only difference
        # between the reference model and the model under test.
        return H2OGAM(
            featuresCols=featuresCols,
            labelCol="CAPSULE",
            seed=1,
            splitRatio=0.8,
            gamCols=["PSA", "AGE"])

    referenceGam = createInitialGamDefinition()
    referenceModel = referenceGam.fit(trainingDataset)
    referenceResult = referenceModel.transform(testingDataset)

    # One constraint row per feature. A concrete list (rather than a lazy,
    # single-use ``map`` iterator) is built so the rows can be iterated more
    # than once and match the documented input of ``createDataFrame``.
    betaConstraints = [(feature, -1000, 1000, 1, 0.2) for feature in featuresCols]
    betaConstraintsFrame = spark.createDataFrame(
        betaConstraints,
        ['names', 'lower_bounds', 'upper_bounds', 'beta_given', 'rho'])

    gam = createInitialGamDefinition()
    gam.setBetaConstraints(betaConstraintsFrame)
    model = gam.fit(trainingDataset)
    result = model.transform(testingDataset)

    unit_test_utils.assert_data_frames_have_different_values(
        referenceResult, result)
def testPlugValuesAffectResult(spark, carsDatasetPath):
    """Verify that plug values change the predictions of an H2OGLM model.

    The cars CSV is loaded with a header and inferred schema, and the
    ``economy_20mpg`` column is cast to string. A reference GLM and a GLM
    configured with ``PlugValues`` missing-value handling are trained on the
    same split; their predictions on the hold-out data are passed to
    ``assert_data_frames_have_different_values``.

    :param spark: Spark session fixture used to read the CSV file.
    :param carsDatasetPath: path of the cars CSV file.
    """
    rawCars = spark.read.csv(carsDatasetPath, header=True, inferSchema=True)
    cars = rawCars.withColumn("economy_20mpg", rawCars.economy_20mpg.cast("string"))
    trainingFrame, holdoutFrame = cars.randomSplit([0.9, 0.1], 1)

    def buildGlm():
        # Shared definition: the plug values are the only difference between
        # the reference model and the model under test.
        return H2OGLM(
            featuresCols=[
                "economy",
                "displacement",
                "power",
                "weight",
                "acceleration",
                "year",
                "economy_20mpg",
            ],
            labelCol="cylinders",
            seed=1,
            splitRatio=0.8,
        )

    baselinePredictions = buildGlm().fit(trainingFrame).transform(holdoutFrame)

    # One replacement value per feature column.
    replacements = {
        "economy": 1.1,
        "displacement": 2.2,
        "power": 3.3,
        "weight": 4.4,
        "acceleration": 5.5,
        "year": 2000,
        "economy_20mpg": "0",
    }
    pluggedGlm = buildGlm()
    pluggedGlm.setMissingValuesHandling("PlugValues")
    pluggedGlm.setPlugValues(replacements)
    pluggedPredictions = pluggedGlm.fit(trainingFrame).transform(holdoutFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        baselinePredictions, pluggedPredictions)
def testH2OAutoMLClassifierBehavesDiffenrentlyThanH2OAutoMLRegressor(prostateDataset):
    """Verify that the AutoML regressor and classifier yield different predictions.

    Both estimators are configured through ``setParametersForTesting``,
    trained on the same split and used to score the same hold-out data. The
    ``detailed_prediction`` column is dropped from each result before the two
    frames are passed to ``assert_data_frames_have_different_values``.

    :param prostateDataset: data frame fixture to split into training and
        hold-out parts.
    """
    trainingFrame, holdoutFrame = prostateDataset.randomSplit([0.9, 0.1], 42)

    def predictionsOf(estimator):
        # Train, score the hold-out data and drop the detailed prediction column.
        fitted = estimator.fit(trainingFrame)
        return fitted.transform(holdoutFrame).drop("detailed_prediction")

    regressionPredictions = predictionsOf(
        setParametersForTesting(H2OAutoMLRegressor()))
    classificationPredictions = predictionsOf(
        setParametersForTesting(H2OAutoMLClassifier()))

    unit_test_utils.assert_data_frames_have_different_values(
        regressionPredictions, classificationPredictions)
def testGridSearchWithDRFClassifierBehavesDiffenrentlyThanGridSearchWithDRFRegressor(
        prostateDataset):
    """Verify that grid search over a DRF regressor and a DRF classifier differ.

    Each algorithm is wrapped by ``createGridForProblemSpecificTesting``,
    trained on the same split and used to score the same hold-out data. The
    ``detailed_prediction`` column is dropped from each result before the two
    frames are passed to ``assert_data_frames_have_different_values``.

    :param prostateDataset: data frame fixture to split into training and
        hold-out parts.
    """
    trainingFrame, holdoutFrame = prostateDataset.randomSplit([0.9, 0.1], 42)

    def predictionsOf(grid):
        # Train, score the hold-out data and drop the detailed prediction column.
        fitted = grid.fit(trainingFrame)
        return fitted.transform(holdoutFrame).drop("detailed_prediction")

    regressionPredictions = predictionsOf(
        createGridForProblemSpecificTesting(H2ODRFRegressor()))
    classificationPredictions = predictionsOf(
        createGridForProblemSpecificTesting(H2ODRFClassifier()))

    unit_test_utils.assert_data_frames_have_different_values(
        regressionPredictions, classificationPredictions)
def testCalibrationDataFrameCauseGenerationOfCalibratedProbabilities(prostateDataset):
    """Verify that a calibration frame produces distinct calibrated probabilities.

    The CAPSULE label is cast to string, the data is split into training,
    hold-out and calibration parts, and an H2ODRF model is trained with
    ``calibrateModel=True``. The raw and calibrated probability columns taken
    from the detailed prediction are passed to
    ``assert_data_frames_have_different_values``.

    :param prostateDataset: data frame fixture containing at least the columns
        ID, CAPSULE, AGE, RACE, DPROS, DCAPS, PSA, VOL and GLEASON.
    """
    labelled = prostateDataset.withColumn(
        "CAPSULE", prostateDataset.CAPSULE.cast("string"))
    trainingFrame, holdoutFrame, calibrationFrame = labelled.randomSplit(
        [0.9, 0.05, 0.05], 1)

    drf = H2ODRF(
        featuresCols=["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
        withDetailedPredictionCol=True,
        calibrateModel=True,
        calibrationDataFrame=calibrationFrame,
    )

    # The scored frame is cached because two projections are selected from it.
    scored = drf.fit(trainingFrame).transform(holdoutFrame).cache()

    rawProbabilities = scored.select(
        "ID",
        "detailed_prediction.probabilities.0",
        "detailed_prediction.probabilities.1")
    calibratedProbabilities = scored.select(
        "ID",
        "detailed_prediction.calibratedProbabilities.0",
        "detailed_prediction.calibratedProbabilities.1")

    unit_test_utils.assert_data_frames_have_different_values(
        rawProbabilities, calibratedProbabilities)
def testInteractionConstraintsAffectResult(spark, prostateDataset):
    """Verify that interaction constraints change H2OXGBoost predictions.

    A reference XGBoost estimator and one with interaction constraints are
    built from the same definition, trained on the same split and used to
    score the same hold-out data. The two prediction frames are passed to
    ``assert_data_frames_have_different_values``.

    :param spark: Spark session fixture; not referenced in the body.
    :param prostateDataset: data frame fixture containing at least the columns
        CAPSULE, AGE, RACE, DPROS, DCAPS, PSA, VOL and GLEASON.
    """
    trainingFrame, holdoutFrame = prostateDataset.randomSplit([0.9, 0.1], 1)
    predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

    def buildXGBoost():
        # Shared definition: the constraints are the only difference between
        # the reference model and the model under test.
        return H2OXGBoost(
            featuresCols=predictors,
            labelCol="CAPSULE",
            seed=1,
            splitRatio=0.8,
        )

    baselinePredictions = buildXGBoost().fit(trainingFrame).transform(holdoutFrame)

    constrained = buildXGBoost()
    constrained.setInteractionConstraints(
        [["DPROS", "DCAPS"], ["PSA", "VOL", "GLEASON"]])
    constrainedPredictions = constrained.fit(trainingFrame).transform(holdoutFrame)

    unit_test_utils.assert_data_frames_have_different_values(
        baselinePredictions, constrainedPredictions)
def _fit(self, dataset):
    """Check the incoming frame against two outside frames and return a dummy stage.

    ``expected`` and ``unexpected`` are not defined in this method; they are
    presumably captured from an enclosing scope (TODO confirm against the
    surrounding definition).

    :param dataset: data frame handed to this stage for fitting.
    :return: a new ``DummyTransformer`` instance.
    """
    # The identity check runs first, so it is the one reported if both fail.
    unit_test_utils.assert_data_frames_are_identical(expected, dataset)
    unit_test_utils.assert_data_frames_have_different_values(unexpected, dataset)
    fittedStage = DummyTransformer()
    return fittedStage