Ejemplo n.º 1
0
def testPipelineWithTargetEncoderIsSerializable():
    """Round-trip a Pipeline holding a target encoder and a GBM through
    save/load and check that every configured parameter survives."""
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123)
    estimator = H2OGBM() \
        .setLabelCol("CAPSULE") \
        .setFeaturesCols(encoder.getOutputCols())

    destination = "file://" + os.path.abspath(
        "build/testPipelineWithTargetEncoderIsSerializable")
    Pipeline(stages=[encoder, estimator]).write().overwrite().save(destination)
    restoredEncoder, restoredGbm = Pipeline.load(destination).getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(encoder, restoredEncoder)
    assert estimator.getLabelCol() == restoredGbm.getLabelCol()
    assert estimator.getFeaturesCols() == restoredGbm.getFeaturesCols()
Ejemplo n.º 2
0
def testPipelineSerialization(craiglistDataset):
    """Save/load a tokenizer -> stop-words -> word2vec -> GBM pipeline (and
    its fitted model) and verify the reloaded model scores identically."""
    trainFrame, testFrame = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                               outputCol="stopWordsRemoved")
    word2vec = H2OWord2Vec(sentSampleRate=0,
                           epochs=10,
                           inputCol=remover.getOutputCol(),
                           outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[word2vec.getOutputCol()])

    pipelinePath = "file://" + os.path.abspath("build/w2v_pipeline")
    modelPath = "file://" + os.path.abspath("build/w2v_pipeline_model")

    Pipeline(stages=[tokenizer, remover, word2vec, gbm]) \
        .write().overwrite().save(pipelinePath)
    model = Pipeline.load(pipelinePath).fit(trainFrame)
    expected = model.transform(testFrame)

    model.write().overwrite().save(modelPath)
    result = PipelineModel.load(modelPath).transform(testFrame)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Ejemplo n.º 3
0
def testPipelineWithTargetEncoderTransformsTrainingAndTestingDatasetWithoutException(trainingDataset, testingDataset):
    """Fit a target-encoder + GBM pipeline and ensure scoring the testing
    frame completes without raising."""
    stages = [
        H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"]),
        H2OGBM(labelCol="CAPSULE"),
    ]
    fitted = Pipeline(stages=stages).fit(trainingDataset)
    fitted.transform(testingDataset).collect()
Ejemplo n.º 4
0
def testMonotoneConstraintsGetProperlyPropagatedToJavaBackend():
    """Constraints set on the Python side must be visible on the Java object
    after the parameter transfer."""
    estimator = H2OGBM(monotoneConstraints={"District": -1, "Group": 1})
    estimator._transfer_params_to_java()

    javaConstraints = estimator._java_obj.getMonotoneConstraints()

    assert javaConstraints.apply("District") == -1.0
    assert javaConstraints.apply("Group") == 1.0
Ejemplo n.º 5
0
def testGetGridModelsNoParams(prostateDataset):
    """Without a hyper-parameter grid, getGridModelsParams() exposes only
    the model-id column and a single row."""
    search = H2OGridSearch(labelCol="AGE", splitRatio=0.8, algo=H2OGBM(),
                           strategy="RandomDiscrete", maxModels=3,
                           maxRuntimeSecs=60, selectBestModelBy="RMSE")
    search.fit(prostateDataset)

    paramsFrame = search.getGridModelsParams()
    assert paramsFrame.count() == 1
    assert paramsFrame.columns == ['MOJO Model ID']
    paramsFrame.collect()  # force materialization
Ejemplo n.º 6
0
def testGetGridModels(prostateDataset):
    """A three-value seed grid should produce exactly three fitted models."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(splitRatio=0.8, labelCol="AGE"),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)

    assert len(search.getGridModels()) == 3
Ejemplo n.º 7
0
def testGetAlgoViaSetter():
    """Regression test for SW-2276: the third getAlgo() call used to fail
    when the algorithm was supplied through the setter."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    algo = H2OGBM().setNtrees(100).setLabelCol("AGE").setSplitRatio(0.8)
    search.setAlgo(algo)
    # Call getAlgo() repeatedly; the bug only surfaced on the third call.
    for _ in range(2):
        search.getAlgo()
    assert search.getAlgo().getNtrees() == 100
Ejemplo n.º 8
0
def gbmModelWithOffset(dataset):
    """Fit a tweedie-distribution GBM that uses the 'Offset' column as an
    offset term and return the fitted model."""
    estimator = H2OGBM(distribution="tweedie",
                       ntrees=600,
                       maxDepth=1,
                       minRows=1,
                       learnRate=0.1,
                       minSplitImprovement=0,
                       featuresCols=["District", "Group", "Age"],
                       labelCol="Claims",
                       offsetCol="Offset")
    return estimator.fit(dataset)
Ejemplo n.º 9
0
def testGetAlgoViaConstructor():
    """Regression test for SW-2276: the third getAlgo() call used to fail
    when the algorithm was supplied through the constructor."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(labelCol="AGE",
                                       ntrees=100,
                                       splitRatio=0.8),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    # Call getAlgo() repeatedly; the bug only surfaced on the third call.
    for _ in range(2):
        search.getAlgo()
    assert search.getAlgo().getNtrees() == 100
Ejemplo n.º 10
0
def testGetGridModelsParams(prostateDataset):
    """getGridModelsParams() lists the model id plus one column per searched
    hyper-parameter, with one row per trained model."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(splitRatio=0.8, labelCol="AGE"),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)

    paramsFrame = search.getGridModelsParams()
    assert paramsFrame.count() == 3
    assert paramsFrame.columns == ['MOJO Model ID', 'seed']
    paramsFrame.collect()  # force materialization
Ejemplo n.º 11
0
def testLoadAndTrainMojo(prostateDataset):
    """A freshly trained GBM (same settings) must score every row identically
    to the pre-built MOJO shipped with the test resources."""
    mojoPath = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoPath)

    model = H2OGBM(ntrees=2, seed=42, distribution="bernoulli",
                   labelCol="capsule").fit(prostateDataset)

    mojoRows = mojo.transform(prostateDataset).repartition(1).collect()
    modelRows = model.transform(prostateDataset).repartition(1).collect()

    assert len(mojoRows) == len(modelRows)
    for mojoRow, modelRow in zip(mojoRows, modelRows):
        assert mojoRow == modelRow
Ejemplo n.º 12
0
def testGetGridModelsMetrics(prostateDataset):
    """getGridModelsMetrics() yields one row per trained model with the
    regression metric columns."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(labelCol="AGE", splitRatio=0.8),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)

    metricsFrame = search.getGridModelsMetrics()
    assert metricsFrame.count() == 3
    expectedColumns = ['MOJO Model ID', 'MSE', 'MeanResidualDeviance', 'R2', 'RMSE']
    assert metricsFrame.columns == expectedColumns
    metricsFrame.collect()  # force materialization
Ejemplo n.º 13
0
    def test_load_mojo_gbm(self):
        """Predictions from a loaded MOJO must match a GBM trained with the
        same settings on the prostate data."""
        from pysparkling.ml import H2OMOJOModel, H2OGBM
        mojo_path = "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)
        prostate_frame = self._hc.as_spark_frame(
            h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        model = H2OGBM(ntrees=2, seed=42, distribution="bernoulli",
                       predictionCol="capsule").fit(prostate_frame)

        pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        pred_model = model.transform(prostate_frame).repartition(1).collect()

        assert len(pred_mojo) == len(pred_model)
        for row_mojo, row_model in zip(pred_mojo, pred_model):
            assert row_mojo == row_model
Ejemplo n.º 14
0
def testMonotoneConstraintsGetProperlyPropagatedFromJavaBackend():
    """Setting new constraint values is visible via the getter, and pulling
    parameters back from Java restores the originally transferred values."""
    estimator = H2OGBM(monotoneConstraints={"District": -1, "Group": 1})
    estimator._transfer_params_to_java()

    estimator.setMonotoneConstraints({"District": 1, "Group": -1})
    updated = estimator.getMonotoneConstraints()
    assert updated["District"] == 1.0
    assert updated["Group"] == -1.0

    # The Java object still holds the values sent in the first transfer, so
    # reading them back overwrites the Python-side update.
    estimator._transfer_params_from_java()
    restored = estimator.getMonotoneConstraints()
    assert restored["District"] == -1.0
    assert restored["Group"] == 1.0
Ejemplo n.º 15
0
    def h2o_gbm(df, label, columns, **kargs):
        """Index the label, assemble the feature columns, fit an H2O GBM and
        return (scored dataframe with a binary 'prediction' column, model).

        Thresholds the model's p1 probability at 0.5 to produce 1.0/0.0.
        """
        H2OContext.getOrCreate(Spark.instance.spark)

        indexed = string_to_index(df, input_cols=label)
        assembled = vector_assembler(indexed, input_cols=columns)
        estimator = H2OGBM(ratio=0.8,
                           seed=1,
                           featuresCols=columns,
                           labelCol=label,
                           **kargs)
        model = estimator.fit(assembled)
        scored = model.transform(assembled)

        binary = when(scored.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0)
        df_pred = scored.withColumn("prediction", binary)

        return df_pred, model
Ejemplo n.º 16
0
def testDomainColumns(prostateDataset):
    """Domain values are exposed only for the categorical label column; all
    numeric columns report None."""
    # NOTE(review): this MOJO is loaded but never used below — presumably a
    # leftover from copy-paste; confirm before removing (the load does file I/O).
    mojo = H2OMOJOModel.createFromMojo(
        "file://" +
        os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))

    model = H2OGBM(ntrees=2,
                   seed=42,
                   distribution="bernoulli",
                   labelCol="capsule").fit(prostateDataset)
    domains = model.getDomainValues()

    assert domains["capsule"] == ["0", "1"]
    for numericColumn in ["DPROS", "DCAPS", "VOL", "AGE", "PSA", "RACE", "ID"]:
        assert domains[numericColumn] is None
Ejemplo n.º 17
0
def testPipelineSerialization(prostateDataset):
    """A grid search wrapped in a Pipeline must survive save/load both as an
    estimator and as a fitted PipelineModel."""
    gridSearch = H2OGridSearch(labelCol="AGE",
                               hyperParameters={"_seed": [1, 2, 3]},
                               splitRatio=0.8,
                               algo=H2OGBM(),
                               strategy="RandomDiscrete",
                               maxModels=3,
                               maxRuntimeSecs=60,
                               selectBestModelBy="RMSE")

    pipelinePath = "file://" + os.path.abspath("build/grid_gbm_pipeline")
    modelPath = "file://" + os.path.abspath("build/grid_gbm_pipeline_model")

    Pipeline(stages=[gridSearch]).write().overwrite().save(pipelinePath)
    model = Pipeline.load(pipelinePath).fit(prostateDataset)

    model.write().overwrite().save(modelPath)
    PipelineModel.load(modelPath).transform(prostateDataset).count()
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words: map each token to one of 1024 (1 << 10) feature buckets
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model; minDocFreq=4 filters terms
## seen in fewer than 4 documents
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(
        convertUnknownCategoricalLevelsToNa=True,
        maxRuntimeSecs=60,  # 1 minutes
Ejemplo n.º 19
0
def testGBMParameters(prostateDataset):
    """The fitted MOJO model must report the same parameter values as the
    estimator it came from."""
    featureColumns = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2OGBM(seed=1,
                       labelCol="CAPSULE",
                       featuresCols=featureColumns,
                       monotoneConstraints={'AGE': 1, 'RACE': -1})
    fitted = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fitted)
Ejemplo n.º 20
0
def testParams():
    """Smoke-test H2OGBM parameter plumbing: every value passed to the
    constructor must be reported back by the corresponding getter.

    Note: distribution is passed as "Auto" but the backend reports it
    upper-cased as "AUTO".
    """
    gbm = H2OGBM(modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="Auto",
                 ntrees=50,
                 maxDepth=5,
                 minRows=10.0,
                 nbins=20,
                 nbinsCats=1024,
                 minSplitImprovement=1e-5,
                 histogramType="AUTO",
                 r2Stopping=1,
                 nbinsTopLevel=1 << 10,
                 buildTreeOneNode=False,
                 scoreTreeInterval=0,
                 sampleRate=1.0,
                 sampleRatePerClass=None,
                 colSampleRateChangePerLevel=1.0,
                 colSampleRatePerTree=1.0,
                 learnRate=0.1,
                 learnRateAnnealing=1.0,
                 colSampleRate=1.0,
                 maxAbsLeafnodePred=1,
                 predNoiseBandwidth=0.0,
                 convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False)

    # Identity checks for None (PEP 8 / E711) and plain truthiness for
    # booleans (E712) instead of == comparisons.
    assert gbm.getModelId() is None
    assert gbm.getSplitRatio() == 1.0
    assert gbm.getLabelCol() == "label"
    assert gbm.getWeightCol() is None
    assert gbm.getFeaturesCols() == []
    assert gbm.getAllStringColumnsToCategorical()
    assert gbm.getColumnsToCategorical() == []
    assert gbm.getNfolds() == 0
    assert not gbm.getKeepCrossValidationPredictions()
    assert not gbm.getKeepCrossValidationFoldAssignment()
    assert gbm.getParallelizeCrossValidation()
    assert gbm.getSeed() == -1
    assert gbm.getDistribution() == "AUTO"
    assert gbm.getNtrees() == 50
    assert gbm.getMaxDepth() == 5
    assert gbm.getMinRows() == 10.0
    assert gbm.getNbins() == 20
    assert gbm.getNbinsCats() == 1024
    assert gbm.getMinSplitImprovement() == 1e-5
    assert gbm.getHistogramType() == "AUTO"
    assert gbm.getR2Stopping() == 1
    assert gbm.getNbinsTopLevel() == (1 << 10)
    assert not gbm.getBuildTreeOneNode()
    assert gbm.getScoreTreeInterval() == 0
    assert gbm.getSampleRate() == 1.0
    assert gbm.getSampleRatePerClass() is None
    assert gbm.getColSampleRateChangePerLevel() == 1.0
    assert gbm.getColSampleRatePerTree() == 1.0
    assert gbm.getLearnRate() == 0.1
    assert gbm.getLearnRateAnnealing() == 1.0
    assert gbm.getColSampleRate() == 1.0
    assert gbm.getMaxAbsLeafnodePred() == 1
    assert gbm.getPredNoiseBandwidth() == 0.0
    assert not gbm.getConvertUnknownCategoricalLevelsToNa()
    assert gbm.getFoldCol() is None
    assert gbm.getPredictionCol() == "prediction"
    assert gbm.getDetailedPredictionCol() == "detailed_prediction"
    assert not gbm.getWithDetailedPredictionCol()
    assert not gbm.getConvertInvalidNumbersToNa()
Ejemplo n.º 21
0
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared grid-search serialization check with a GBM labelled
    on AGE."""
    estimator = H2OGBM().setLabelCol("AGE")
    gridSearchTester(estimator, prostateDataset)
Ejemplo n.º 22
0
def gbmModel(prostateDataset):
    """Fixture: a Bernoulli GBM (2 trees, seed 42) fitted on the prostate
    data with 'capsule' as the label."""
    estimator = H2OGBM(ntrees=2,
                       seed=42,
                       distribution="bernoulli",
                       labelCol="capsule")
    return estimator.fit(prostateDataset)
# Drop common English stop words (and empty tokens) from the tokenizer output
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words: map each token to one of 1024 (1 << 10) feature buckets
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model; minDocFreq=4 filters terms
## seen in fewer than 4 documents
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

# Gradient-boosted-trees candidate trained on the TF-IDF features
gbm = H2OGBM(splitRatio=0.8,
             seed=1,
             featuresCols=[idf.getOutputCol()],
             labelCol="label")

# Deep-learning candidate with two hidden layers of 200 units each
dl = H2ODeepLearning(epochs=10,
                     seed=1,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")

automl = H2OAutoML(
    convertUnknownCategoricalLevelsToNa=True,
    maxRuntimeSecs=60 * 100,  # 100 minutes
    maxModels=10,
    seed=1,
Ejemplo n.º 24
0
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words: map each token to one of 1024 (1 << 10) feature buckets
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model; minDocFreq=4 filters terms
## seen in fewer than 4 documents
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create GBM model scoring the "label" column from the TF-IDF vector
gbm = H2OGBM(ratio=0.8,
             featuresCols=[idf.getOutputCol()],
             predictionCol="label")

## Remove all helper columns so intermediate features don't leak into output
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, gbm, colPruner])

## Train the pipeline model
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared grid-search serialization check with a default GBM."""
    algo = H2OGBM()
    gridSearchTester(algo, prostateDataset)