def testGLMParameters(prostateDataset):
    """Fit a GLM with explicit settings and compare estimator and model.

    An ``H2OGLM`` is built with a fixed seed, regularisation values and
    convergence tolerances, fitted on ``prostateDataset``, and then both the
    estimator and the fitted model are handed to ``compareParameterValues``
    (defined elsewhere; presumably it checks that the model reports the
    same parameter values as the estimator).
    """
    glmSettings = {
        "seed": 1,
        "labelCol": "CAPSULE",
        "alphaValue": [0.5],
        "lambdaValue": [0.5],
        "maxIterations": 30,
        "objectiveEpsilon": 0.001,
        "gradientEpsilon": 0.001,
        "objReg": 0.001,
        "maxActivePredictors": 3000,
        "lambdaMinRatio": 0.001,
        "featuresCols": ["AGE", "RACE", "DPROS", "DCAPS", "PSA"],
    }
    estimator = H2OGLM(**glmSettings)
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)
def createInitialGlmDefinitionForRandomCols():
    """Return an unfitted ``H2OGLM`` with ``HGLM`` and ``calcLike`` enabled.

    The estimator uses features ``x1``, ``x3``, ``x5`` and ``x6``, label
    column ``y``, a gaussian family, and a single gaussian random family
    with an identity random link.
    """
    hglmSettings = {
        "featuresCols": ["x1", "x3", "x5", "x6"],
        "labelCol": "y",
        "family": "gaussian",
        "randomFamily": ["gaussian"],
        "randomLink": ["identity"],
        "HGLM": True,
        "calcLike": True,
    }
    return H2OGLM(**hglmSettings)
def createInitialGlmDefinition():
    """Return an unfitted ``H2OGLM`` that uses ``cylinders`` as the label.

    The estimator is seeded with 1 and uses a 0.8 split ratio.

    NOTE(review): a function with this same name is defined again later in
    this file; at import time the later definition rebinds the name.
    """
    return H2OGLM(
        featuresCols=[
            "economy",
            "displacement",
            "power",
            "weight",
            "acceleration",
            "year",
            "economy_20mpg",
        ],
        labelCol="cylinders",
        seed=1,
        splitRatio=0.8,
    )
def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom ``predictionCol`` appears in the transformed output.

    A GLM is fitted on ``prostateDataset`` with a non-default prediction
    column name; the columns of the frame returned by ``transform`` must
    contain that name.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol,
    )
    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns
    # Assert membership directly instead of comparing the result to True.
    assert predictionCol in columns
def testPipelineSerialization(prostateDataset):
    """Round-trip a GLM pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on ``prostateDataset``, the resulting model is saved and
    reloaded, and finally the reloaded model transforms the dataset. The
    test has no explicit assertions; it passes when none of these steps
    raises.
    """
    pipelinePath = "file://" + os.path.abspath("build/glm_pipeline")
    pipelineModelPath = "file://" + os.path.abspath("build/glm_pipeline_model")

    glm = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
    )

    # Persist and restore the unfitted pipeline.
    Pipeline(stages=[glm]).write().overwrite().save(pipelinePath)
    restoredPipeline = Pipeline.load(pipelinePath)

    # Fit the restored pipeline, then persist and restore the fitted model.
    fittedModel = restoredPipeline.fit(prostateDataset)
    fittedModel.write().overwrite().save(pipelineModelPath)
    restoredModel = PipelineModel.load(pipelineModelPath)

    # count() forces the transformation to actually run.
    restoredModel.transform(prostateDataset).count()
def testInteractionColumnNamesArePassedWithoutException(spark):
    """Fit a GLM whose ``plugValues`` include ``x_y.a`` / ``x_y.b`` keys.

    A four-row frame with columns ``x``, ``y`` and ``z`` is created (one
    ``x`` value is NaN) and a gaussian GLM using ``PlugValues``
    missing-value handling is fitted on it. There are no assertions; the
    test passes when ``fit`` does not raise.
    """
    rows = [
        (0.0, "a", 2.0),
        (float("nan"), "b", 8.0),
        (0.0, "a", 4.0),
        (1.0, "b", 1.0),
    ]
    frame = spark.createDataFrame(rows, ["x", "y", "z"])

    estimator = H2OGLM(
        labelCol="z",
        seed=42,
        ignoreConstCols=False,
        standardize=False,
        family="gaussian",
        missingValuesHandling="PlugValues",
        plugValues={"x": 0, "x_y.a": 1, "x_y.b": 2, "y": "b"},
    )
    estimator.fit(frame)
def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` with an ``H2OGLM`` whose label column is ``AGE``.

    NOTE(review): another function with this exact name is defined right
    after this one in the file; the later definition rebinds the name, so
    this version would not be the one collected by the test runner.
    """
    glm = H2OGLM().setLabelCol("AGE")
    gridSearchTester(glm, prostateDataset)
def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` with a default-constructed ``H2OGLM``.

    NOTE(review): a function with this exact name is also defined just
    before this one in the file; this later definition replaces it.
    """
    glm = H2OGLM()
    gridSearchTester(glm, prostateDataset)
def testParams():
    """Check that every constructor argument of ``H2OGLM`` is returned by its getter.

    The estimator is built with an explicit value for each parameter and
    every corresponding ``get*`` accessor is asserted to return that value.
    Parameters set to ``None`` are checked with an identity comparison.
    """
    glm = H2OGLM(
        modelId=None,
        splitRatio=1.0,
        labelCol="label",
        weightCol=None,
        featuresCols=[],
        allStringColumnsToCategorical=True,
        columnsToCategorical=[],
        nfolds=0,
        keepCrossValidationPredictions=False,
        keepCrossValidationFoldAssignment=False,
        parallelizeCrossValidation=True,
        seed=-1,
        distribution="AUTO",
        convertUnknownCategoricalLevelsToNa=False,
        standardize=True,
        family="gaussian",
        link="family_default",
        solver="AUTO",
        tweedieVariancePower=0.0,
        tweedieLinkPower=0.0,
        alpha=[1],
        lambda_=None,
        missingValuesHandling="MeanImputation",
        prior=-1.0,
        lambdaSearch=False,
        nlambdas=-1,
        nonNegative=False,
        exactLambdas=False,
        lambdaMinRatio=-1.0,
        maxIterations=-1,
        intercept=True,
        betaEpsilon=1e-4,
        objectiveEpsilon=-1.0,
        gradientEpsilon=-1.0,
        objReg=-1.0,
        computePValues=False,
        removeCollinearCols=False,
        interactions=None,
        interactionPairs=None,
        earlyStopping=True,
        foldCol=None,
        predictionCol="prediction",
        detailedPredictionCol="detailed_prediction",
        withDetailedPredictionCol=False,
        convertInvalidNumbersToNa=False,
    )

    assert glm.getModelId() is None
    assert glm.getSplitRatio() == 1.0
    assert glm.getLabelCol() == "label"
    assert glm.getWeightCol() is None
    assert glm.getFeaturesCols() == []
    assert glm.getAllStringColumnsToCategorical() == True
    assert glm.getColumnsToCategorical() == []
    assert glm.getNfolds() == 0
    assert glm.getKeepCrossValidationPredictions() == False
    assert glm.getKeepCrossValidationFoldAssignment() == False
    assert glm.getParallelizeCrossValidation() == True
    assert glm.getSeed() == -1
    assert glm.getDistribution() == "AUTO"
    assert glm.getConvertUnknownCategoricalLevelsToNa() == False
    assert glm.getStandardize() == True
    assert glm.getFamily() == "gaussian"
    assert glm.getLink() == "family_default"
    assert glm.getSolver() == "AUTO"
    assert glm.getTweedieVariancePower() == 0.0
    assert glm.getTweedieLinkPower() == 0.0
    assert glm.getAlpha() == [1.0]
    assert glm.getLambda() is None
    assert glm.getMissingValuesHandling() == "MeanImputation"
    assert glm.getPrior() == -1.0
    assert glm.getLambdaSearch() == False
    assert glm.getNlambdas() == -1
    assert glm.getNonNegative() == False
    assert glm.getExactLambdas() == False
    assert glm.getLambdaMinRatio() == -1.0
    assert glm.getMaxIterations() == -1
    assert glm.getIntercept() == True
    assert glm.getBetaEpsilon() == 1e-4
    assert glm.getObjectiveEpsilon() == -1.0
    assert glm.getGradientEpsilon() == -1.0
    assert glm.getObjReg() == -1.0
    assert glm.getComputePValues() == False
    assert glm.getRemoveCollinearCols() == False
    assert glm.getInteractions() is None
    assert glm.getInteractionPairs() is None
    assert glm.getEarlyStopping() == True
    assert glm.getFoldCol() is None
    assert glm.getPredictionCol() == "prediction"
    assert glm.getDetailedPredictionCol() == "detailed_prediction"
    assert glm.getWithDetailedPredictionCol() == False
    assert glm.getConvertInvalidNumbersToNa() == False
def createInitialGlmDefinition():
    """Return an unfitted ``H2OGLM`` that uses ``CAPSULE`` as the label.

    The estimator is seeded with 1 and uses a 0.8 split ratio.

    NOTE(review): ``featuresCols`` is not bound inside this function, so it
    must resolve to a module-level name that is not visible here - confirm
    it exists. An earlier function in this file has the same name; this
    later definition replaces it.
    """
    glm = H2OGLM(
        featuresCols=featuresCols,
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
    )
    return glm