def testParams():
    """Instantiate H2OAutoML with every exposed parameter and verify each getter.

    The deliberately odd casing in includeAlgos/excludeAlgos exercises the
    wrapper's normalization of algorithm names to their canonical spelling.
    """
    settings = dict(featuresCols=[],
                    labelCol="label",
                    allStringColumnsToCategorical=True,
                    columnsToCategorical=[],
                    splitRatio=1.0,
                    foldCol=None,
                    weightCol=None,
                    ignoredCols=[],
                    includeAlgos=["XGbooST"],
                    excludeAlgos=["DRF", "DeePLeArNING"],
                    projectName="test",
                    maxRuntimeSecs=3600.0,
                    stoppingRounds=3,
                    stoppingTolerance=0.001,
                    stoppingMetric="AUTO",
                    nfolds=5,
                    convertUnknownCategoricalLevelsToNa=True,
                    seed=-1,
                    sortMetric="AUTO",
                    balanceClasses=False,
                    classSamplingFactors=None,
                    maxAfterBalanceSize=5.0,
                    keepCrossValidationPredictions=True,
                    keepCrossValidationModels=True,
                    maxModels=0,
                    predictionCol="prediction",
                    detailedPredictionCol="detailed_prediction",
                    withDetailedPredictionCol=False,
                    convertInvalidNumbersToNa=False)
    automl = H2OAutoML(**settings)

    # Every getter must echo the constructor argument, except the algorithm
    # lists, which the wrapper canonicalizes regardless of input casing.
    expected = dict(settings,
                    includeAlgos=["XGBoost"],
                    excludeAlgos=["DRF", "DeepLearning"])
    for param, value in expected.items():
        getter = "get" + param[0].upper() + param[1:]
        assert getattr(automl, getter)() == value
# Beispiel #2 (Example #2 — separator left over from a code-search export)
    def h2o_automl(df, label, columns, **kargs):
        """Train an H2O AutoML model on *df* and return (predictions_df, model).

        The label column is string-indexed and the feature columns are
        assembled into a single vector before training.  Raw model output
        above a 0.5 threshold is mapped to 1.0, otherwise to 0.0.
        """
        # An H2O context must exist before any H2O estimator is used.
        H2OContext.getOrCreate(Spark.instance.spark)

        indexed = string_to_index(df, input_cols=label)
        assembled = vector_assembler(indexed, input_cols=columns)

        automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                           maxRuntimeSecs=60,  # cap the search at one minute
                           seed=1,
                           maxModels=3,
                           labelCol=label + "_index",
                           **kargs)

        fitted = automl.fit(assembled)
        scored = fitted.transform(assembled)

        # Binarize the raw output with a fixed 0.5 threshold.
        above_threshold = scored.prediction_output["value"] > 0.5
        predictions = scored.withColumn(
            "prediction", when(above_threshold, 1.0).otherwise(0.0))

        return predictions, fitted
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Term-frequency hashing of the filtered tokens
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1024)  # == 1 << 10 hash buckets

## Scale term frequencies by inverse document frequency
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## AutoML stage: searches for the best model within the time budget
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300,  # five-minute search budget
                   predictionCol="label")

## Drop every intermediate feature-engineering column after scoring
colPruner = ColumnPruner(columns=[idf.getOutputCol(),
                                  hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(),
                                  tokenizer.getOutputCol()])

## Assemble the full Spark ML pipeline
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf,
                            automl, colPruner])

## Fit the pipeline on the training data
data = load()
model = pipeline.fit(data)

##
## Make predictions on unlabeled data
## Spam detector
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(
        convertUnknownCategoricalLevelsToNa=True,
        maxRuntimeSecs=60,  # 1 minutes
        seed=1,
        predictionCol="label")

## Prune every intermediate column produced by the feature stages
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(),
                                  tokenizer.getOutputCol()])

## Chain all stages, including whichever algorithm stage was selected above
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf,
                            algoStage, colPruner])
def getAlgorithmForGetLeaderboardTesting():
    """Build an H2OAutoML instance configured for leaderboard tests.

    GLM is excluded, at most five models are trained, and the leaderboard
    is sorted by AUC.
    """
    algorithm = H2OAutoML(labelCol="CAPSULE", ignoredCols=["ID"])
    algorithm.setExcludeAlgos(["GLM"])
    algorithm.setMaxModels(5)
    algorithm.setSortMetric("AUC")
    return algorithm
## Gradient-boosted trees baseline
gbm = H2OGBM(
    splitRatio=0.8,
    seed=1,
    featuresCols=[idf.getOutputCol()],
    labelCol="label")

## Deep-learning baseline: two hidden layers with L1 regularization
dl = H2ODeepLearning(
    epochs=10,
    seed=1,
    l1=0.001,
    l2=0.0,
    hidden=[200, 200],
    featuresCols=[idf.getOutputCol()],
    labelCol="label")

## Automatic model search, capped at ten models within the time budget
automl = H2OAutoML(
    convertUnknownCategoricalLevelsToNa=True,
    maxRuntimeSecs=6000,  # 60 * 100 seconds == 100 minutes
    maxModels=10,
    seed=1,
    labelCol="label")

## XGBoost baseline
xgboost = H2OXGBoost(
    convertUnknownCategoricalLevelsToNa=True,
    featuresCols=[idf.getOutputCol()],
    labelCol="label")

data = load()


def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage,
                       data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),