def test_grid_gbm_in_spark_pipeline(self):
    """Round-trip a grid-search pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on the prostate CSV, and the fitted model is saved and reloaded
    as well. Finally the reloaded model transforms the data and ``count()``
    is called on the result; the count itself is not asserted.
    """
    dataset_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(dataset_uri, header=True, inferSchema=True)

    grid_search = H2OGridSearch(
        labelCol="AGE",
        hyperParameters={"_seed": [1, 2, 3]},
        ratio=0.8,
        algo=H2OGBM(),
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )

    # Persist the unfitted pipeline, then work only with the reloaded copy.
    pipeline_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline")
    Pipeline(stages=[grid_search]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Same round trip for the fitted model.
    model_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline_model")
    fitted = restored_pipeline.fit(prostate_frame)
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    restored_model.transform(prostate_frame).count()
def testGetAlgoViaSetter():
    """Regression test for SW-2276: the 3rd call of getAlgo used to fail.

    The algorithm is attached through ``setAlgo`` after the grid search has
    been constructed, and ``getAlgo`` is then called three times in a row.
    """
    grid = H2OGridSearch(
        hyperParameters={"seed": [1, 2, 3]},
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    gbm = H2OGBM().setNtrees(100).setLabelCol("AGE").setSplitRatio(0.8)
    grid.setAlgo(gbm)

    # Two throw-away calls; the third one (below) is the one that failed.
    for _ in range(2):
        grid.getAlgo()

    third_call_result = grid.getAlgo()
    assert third_call_result.getNtrees() == 100
def testGetGridModels(prostateDataset):
    """Fit a three-seed GBM grid and expect three models from getGridModels."""
    seeds = {"seed": [1, 2, 3]}
    gbm = H2OGBM(splitRatio=0.8, labelCol="AGE")
    grid = H2OGridSearch(
        hyperParameters=seeds,
        algo=gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    trained_models = grid.getGridModels()
    assert len(trained_models) == 3
def testGetGridModelsNoParams(prostateDataset):
    """Fit a grid with no hyper-parameters and inspect getGridModelsParams.

    Expects a single row whose only column is the model id.
    """
    gbm = H2OGBM(labelCol="AGE", splitRatio=0.8)
    grid = H2OGridSearch(
        algo=gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    params_frame = grid.getGridModelsParams()
    assert params_frame.count() == 1
    assert params_frame.columns == ['MOJO Model ID']
    # try materializing
    params_frame.collect()
def testGetGridModelsParams(prostateDataset):
    """Fit a three-seed GBM grid and inspect getGridModelsParams.

    Expects three rows with the model id column followed by ``seed``.
    """
    seeds = {"seed": [1, 2, 3]}
    gbm = H2OGBM(splitRatio=0.8, labelCol="AGE")
    grid = H2OGridSearch(
        hyperParameters=seeds,
        algo=gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    params_frame = grid.getGridModelsParams()
    assert params_frame.count() == 3
    assert params_frame.columns == ['MOJO Model ID', 'seed']
    # try materializing
    params_frame.collect()
def testGetAlgoViaConstructor():
    """Regression test for SW-2276: the 3rd call of getAlgo used to fail.

    The algorithm is passed through the ``algo`` constructor argument, and
    ``getAlgo`` is then called three times in a row.
    """
    seeds = {"seed": [1, 2, 3]}
    gbm = H2OGBM(labelCol="AGE", ntrees=100, splitRatio=0.8)
    grid = H2OGridSearch(
        hyperParameters=seeds,
        algo=gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )

    # Two throw-away calls; the third one (below) is the one that failed.
    for _ in range(2):
        grid.getAlgo()

    third_call_result = grid.getAlgo()
    assert third_call_result.getNtrees() == 100
def testGetGridModelsMetrics(prostateDataset):
    """Fit a three-seed GBM grid and inspect getGridModelsMetrics.

    Expects three rows and the exact column list checked below.
    """
    seeds = {"seed": [1, 2, 3]}
    gbm = H2OGBM(labelCol="AGE", splitRatio=0.8)
    grid = H2OGridSearch(
        hyperParameters=seeds,
        algo=gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    expected_columns = ['MOJO Model ID', 'MSE', 'MeanResidualDeviance', 'R2', 'RMSE']
    metrics_frame = grid.getGridModelsMetrics()
    assert metrics_frame.count() == 3
    assert metrics_frame.columns == expected_columns
    # try materializing
    metrics_frame.collect()
def test_load_mojo_gbm(self):
    """Compare predictions of a stored MOJO with a freshly trained GBM.

    Loads ``binom_model_prostate.mojo`` from the test resources, trains an
    ``H2OGBM`` on the prostate data, and asserts that both produce the same
    number of prediction rows and that the rows are pairwise equal.

    NOTE(review): presumably the stored MOJO was exported from a GBM trained
    with the same settings (ntrees=2, seed=42, bernoulli) -- confirm against
    the resource's provenance.
    """
    from pysparkling.ml import H2OMOJOModel, H2OGBM

    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))

    gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    model = gbm.fit(prostate_frame)

    # Both results are repartitioned to a single partition before collecting.
    pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
    pred_model = model.transform(prostate_frame).repartition(1).collect()

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(len(pred_mojo), len(pred_model))
    # Lengths are verified above, so zip() cannot silently drop rows.
    for mojo_row, model_row in zip(pred_mojo, pred_model):
        self.assertEqual(mojo_row, model_row)
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared ``gridSearchTester`` helper with a GBM labelled on AGE.

    NOTE(review): ``gridSearchTester`` is defined outside this view; going by
    this test's name it presumably exercises pipeline serialization -- confirm.
    """
    gbm = H2OGBM().setLabelCol("AGE")
    gridSearchTester(gbm, prostateDataset)