def testIsolationForestModelGiveDifferentPredictionsOnDifferentRecords(prostateDataset):
    """The fitted model should assign different anomaly predictions to distinct records."""
    trainFrame, testFrame = prostateDataset.randomSplit([0.9, 0.1], 42)
    estimator = H2OIsolationForest(seed=1)
    fittedModel = estimator.fit(trainFrame)
    scored = fittedModel.transform(testFrame)
    # Compare the predictions of the first two scored rows.
    firstTwo = scored.select("prediction").take(2)
    assert firstTwo[0][0] != firstTwo[1][0]
def testExplicitValidationFrameOnIsolationForest(spark, prostateDataset):
    """Training with an explicitly labelled validation frame should yield good validation metrics."""
    csvPath = "file://" + os.path.abspath("../examples/smalldata/prostate/prostate_anomaly_validation.csv")
    validationFrame = spark.read.csv(csvPath, header=True, inferSchema=True)
    estimator = H2OIsolationForest(
        seed=1,
        validationDataFrame=validationFrame,
        validationLabelCol="isAnomaly")
    fittedModel = estimator.fit(prostateDataset)
    # Metrics are computed against the explicit validation frame supplied above.
    validationMetrics = fittedModel.getValidationMetrics()
    assert validationMetrics['AUC'] > 0.85
    assert validationMetrics['Logloss'] < 1.0
def testPipelineSerialization(prostateDataset):
    """A pipeline and its fitted model must survive a save/load round trip unchanged."""
    pipelinePath = "file://" + os.path.abspath("build/isolation_forest_pipeline")
    modelPath = "file://" + os.path.abspath("build/isolation_forest_pipeline_model")

    # Round-trip the unfitted pipeline through disk before fitting.
    pipeline = Pipeline(stages=[H2OIsolationForest(seed=1)])
    pipeline.write().overwrite().save(pipelinePath)
    reloadedPipeline = Pipeline.load(pipelinePath)
    fittedModel = reloadedPipeline.fit(prostateDataset)
    expected = fittedModel.transform(prostateDataset)

    # Round-trip the fitted model and verify predictions are unaffected.
    fittedModel.write().overwrite().save(modelPath)
    reloadedModel = PipelineModel.load(modelPath)
    result = reloadedModel.transform(prostateDataset)
    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testIsolationForestParameters(prostateDataset):
    """Parameters set on the estimator should be reflected on the fitted model."""
    selectedFeatures = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2OIsolationForest(seed=1, sampleRate=0.5, featuresCols=selectedFeatures)
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)