Example #1
0
def testIsolationForestModelGiveDifferentPredictionsOnDifferentRecords(prostateDataset):
    """An isolation forest trained on a 90% split should score two distinct
    held-out records with distinct anomaly predictions."""
    train, holdout = prostateDataset.randomSplit([0.9, 0.1], 42)
    model = H2OIsolationForest(seed=1).fit(train)

    # Take the first two scored rows and check they differ.
    firstTwo = model.transform(holdout).select("prediction").take(2)
    assert firstTwo[0][0] != firstTwo[1][0]
Example #2
0
def testExplicitValidationFrameOnIsolationForest(spark, prostateDataset):
    """Training with an explicitly supplied validation frame should produce
    strong validation metrics (high AUC, low logloss)."""
    csvPath = "file://" + os.path.abspath("../examples/smalldata/prostate/prostate_anomaly_validation.csv")
    validationFrame = spark.read.csv(csvPath, header=True, inferSchema=True)

    estimator = H2OIsolationForest(
        seed=1,
        validationDataFrame=validationFrame,
        validationLabelCol="isAnomaly")
    validationMetrics = estimator.fit(prostateDataset).getValidationMetrics()

    assert validationMetrics['AUC'] > 0.85
    assert validationMetrics['Logloss'] < 1.0
Example #3
0
def testPipelineSerialization(prostateDataset):
    """A Pipeline and its fitted PipelineModel must survive a save/load
    round trip: the reloaded model's transform output is identical."""
    # Hoist the two save/load locations so each path is spelled once.
    pipelinePath = "file://" + os.path.abspath("build/isolation_forest_pipeline")
    modelPath = "file://" + os.path.abspath("build/isolation_forest_pipeline_model")

    Pipeline(stages=[H2OIsolationForest(seed=1)]).write().overwrite().save(pipelinePath)
    fitted = Pipeline.load(pipelinePath).fit(prostateDataset)
    expected = fitted.transform(prostateDataset)

    fitted.write().overwrite().save(modelPath)
    reloaded = PipelineModel.load(modelPath)
    result = reloaded.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #4
0
def testIsolationForestParameters(prostateDataset):
    """Constructor parameters set on the estimator must be propagated to
    the fitted model (verified by compareParameterValues)."""
    featureColumns = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2OIsolationForest(seed=1, sampleRate=0.5, featuresCols=featureColumns)
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)