Esempio n. 1
0
def testGLMParameters(prostateDataset):
    """Fit an H2OGLM with explicit settings and compare estimator vs. model parameters.

    The estimator is trained on ``prostateDataset`` and both the estimator and
    the fitted model are handed to ``compareParameterValues``.
    """
    glmSettings = dict(
        seed=1,
        labelCol="CAPSULE",
        alphaValue=[0.5],
        lambdaValue=[0.5],
        maxIterations=30,
        objectiveEpsilon=0.001,
        gradientEpsilon=0.001,
        objReg=0.001,
        maxActivePredictors=3000,
        lambdaMinRatio=0.001,
        featuresCols=['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA'],
    )
    estimator = H2OGLM(**glmSettings)
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)
Esempio n. 2
0
def createInitialGlmDefinitionForRandomCols():
    """Return an H2OGLM set up with a gaussian family and the HGLM/random-effect options.

    The estimator uses features ``x1``, ``x3``, ``x5``, ``x6`` and label ``y``,
    with ``HGLM`` and ``calcLike`` both switched on.
    """
    # Options passed for the random component, kept in the original keyword order.
    randomComponentOptions = {
        "randomFamily": ["gaussian"],
        "randomLink": ["identity"],
        "HGLM": True,
        "calcLike": True,
    }
    return H2OGLM(
        featuresCols=["x1", "x3", "x5", "x6"],
        labelCol="y",
        family="gaussian",
        **randomComponentOptions
    )
Esempio n. 3
0
 def createInitialGlmDefinition():
     """Return an H2OGLM predicting ``cylinders`` from seven listed feature columns.

     The estimator is created with ``seed=1`` and ``splitRatio=0.8``.
     """
     return H2OGLM(
         featuresCols=[
             "economy",
             "displacement",
             "power",
             "weight",
             "acceleration",
             "year",
             "economy_20mpg",
         ],
         labelCol="cylinders",
         seed=1,
         splitRatio=0.8,
     )
Esempio n. 4
0
def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom ``predictionCol`` name shows up in the transformed output.

    An H2OGLM is fitted on ``prostateDataset`` with a non-default prediction
    column name; the columns of the transformed frame must contain that name.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8,
                  predictionCol=predictionCol)

    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns
    # Assert membership directly instead of comparing the result against True;
    # the message lists the actual columns when the check fails.
    assert predictionCol in columns, \
        "Column '%s' not found in output columns %s" % (predictionCol, columns)
Esempio n. 5
0
def testPipelineSerialization(prostateDataset):
    """Round-trip an H2OGLM pipeline and its fitted model through save/load.

    The unfitted pipeline is saved and reloaded, fitted on ``prostateDataset``,
    then the fitted model is saved and reloaded and used to transform the
    dataset. The test passes when no step raises.
    """

    def localUri(relativePath):
        # Absolute file:// location for a path relative to the working directory.
        return "file://" + os.path.abspath(relativePath)

    estimator = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
    )

    unfitted = Pipeline(stages=[estimator])
    unfitted.write().overwrite().save(localUri("build/glm_pipeline"))
    restoredPipeline = Pipeline.load(localUri("build/glm_pipeline"))
    fitted = restoredPipeline.fit(prostateDataset)

    fitted.write().overwrite().save(localUri("build/glm_pipeline_model"))
    restoredModel = PipelineModel.load(localUri("build/glm_pipeline_model"))

    # count() is called on the transformed frame; its value is not checked.
    restoredModel.transform(prostateDataset).count()
Esempio n. 6
0
def testInteractionColumnNamesArePassedWithoutException(spark):
    """Fit an H2OGLM whose ``plugValues`` contain interaction-style column names.

    A four-row frame with columns ``x``, ``y``, ``z`` (one NaN in ``x``) is
    built from ``spark``; the test passes when ``fit`` completes without raising.
    """
    rows = [
        (0.0, "a", 2.0),
        (float("nan"), "b", 8.0),
        (0.0, "a", 4.0),
        (1.0, "b", 1.0),
    ]
    frame = spark.createDataFrame(rows, ["x", "y", "z"])

    # Keys include names such as "x_y.a" in addition to the plain columns.
    substitutions = {"x": 0, "x_y.a": 1, "x_y.b": 2, "y": "b"}
    estimator = H2OGLM(
        labelCol="z",
        seed=42,
        ignoreConstCols=False,
        standardize=False,
        family="gaussian",
        missingValuesHandling="PlugValues",
        plugValues=substitutions,
    )

    estimator.fit(frame)
Esempio n. 7
0
def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` with an H2OGLM whose label column is set to ``AGE``."""
    # NOTE(review): a second function with this same name is defined right
    # after this one; in a single module the later definition would replace
    # this one - confirm which variant is intended.
    glmWithLabel = H2OGLM().setLabelCol("AGE")
    gridSearchTester(glmWithLabel, prostateDataset)
def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` with a default-constructed H2OGLM."""
    # NOTE(review): this reuses the name of the function defined immediately
    # before it, so in a single module this definition replaces the earlier
    # one - confirm which variant is intended.
    defaultGlm = H2OGLM()
    gridSearchTester(defaultGlm, prostateDataset)
Esempio n. 9
0
def testParams():
    """Check that every H2OGLM constructor argument is readable through its getter.

    An H2OGLM is built with each listed parameter passed explicitly, then each
    getter is asserted to return the value that was passed in.
    """
    glm = H2OGLM(modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 convertUnknownCategoricalLevelsToNa=False,
                 standardize=True,
                 family="gaussian",
                 link="family_default",
                 solver="AUTO",
                 tweedieVariancePower=0.0,
                 tweedieLinkPower=0.0,
                 alpha=[1],
                 lambda_=None,
                 missingValuesHandling="MeanImputation",
                 prior=-1.0,
                 lambdaSearch=False,
                 nlambdas=-1,
                 nonNegative=False,
                 exactLambdas=False,
                 lambdaMinRatio=-1.0,
                 maxIterations=-1,
                 intercept=True,
                 betaEpsilon=1e-4,
                 objectiveEpsilon=-1.0,
                 gradientEpsilon=-1.0,
                 objReg=-1.0,
                 computePValues=False,
                 removeCollinearCols=False,
                 interactions=None,
                 interactionPairs=None,
                 earlyStopping=True,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False)

    # None is a singleton, so unset parameters are checked by identity
    # ("is None") rather than equality. Boolean and numeric getters keep
    # value comparison so the checks are exactly as strict as before.
    assert glm.getModelId() is None
    assert glm.getSplitRatio() == 1.0
    assert glm.getLabelCol() == "label"
    assert glm.getWeightCol() is None
    assert glm.getFeaturesCols() == []
    assert glm.getAllStringColumnsToCategorical() == True
    assert glm.getColumnsToCategorical() == []
    assert glm.getNfolds() == 0
    assert glm.getKeepCrossValidationPredictions() == False
    assert glm.getKeepCrossValidationFoldAssignment() == False
    assert glm.getParallelizeCrossValidation() == True
    assert glm.getSeed() == -1
    assert glm.getDistribution() == "AUTO"
    assert glm.getConvertUnknownCategoricalLevelsToNa() == False
    assert glm.getStandardize() == True
    assert glm.getFamily() == "gaussian"
    assert glm.getLink() == "family_default"
    assert glm.getSolver() == "AUTO"
    assert glm.getTweedieVariancePower() == 0.0
    assert glm.getTweedieLinkPower() == 0.0
    assert glm.getAlpha() == [1.0]
    assert glm.getLambda() is None
    assert glm.getMissingValuesHandling() == "MeanImputation"
    assert glm.getPrior() == -1.0
    assert glm.getLambdaSearch() == False
    assert glm.getNlambdas() == -1
    assert glm.getNonNegative() == False
    assert glm.getExactLambdas() == False
    assert glm.getLambdaMinRatio() == -1.0
    assert glm.getMaxIterations() == -1
    assert glm.getIntercept() == True
    assert glm.getBetaEpsilon() == 1e-4
    assert glm.getObjectiveEpsilon() == -1.0
    assert glm.getGradientEpsilon() == -1.0
    assert glm.getObjReg() == -1.0
    assert glm.getComputePValues() == False
    assert glm.getRemoveCollinearCols() == False
    assert glm.getInteractions() is None
    assert glm.getInteractionPairs() is None
    assert glm.getEarlyStopping() == True
    assert glm.getFoldCol() is None
    assert glm.getPredictionCol() == "prediction"
    assert glm.getDetailedPredictionCol() == "detailed_prediction"
    assert glm.getWithDetailedPredictionCol() == False
    assert glm.getConvertInvalidNumbersToNa() == False
Esempio n. 10
0
 def createInitialGlmDefinition():
     """Return an H2OGLM for label ``CAPSULE`` with ``seed=1`` and ``splitRatio=0.8``.

     ``featuresCols`` is not defined in this function; it is read from the
     surrounding scope when the function is called.
     """
     return H2OGLM(
         featuresCols=featuresCols,
         labelCol="CAPSULE",
         seed=1,
         splitRatio=0.8,
     )