def test_custom_metric(self):
    """Train a weighted GBM with an uploaded custom metric and check what it reports.

    The loan dataset is imported into H2O, ``bad_loan`` is converted to a
    factor and used as the response, and ``loan_amnt`` is copied into a
    ``weight`` column that is passed as the weights column.  The test then
    verifies the custom metric name and value exposed by the model
    performance object.
    """
    from custom_metric_class import WeightedFalseNegativeLossMetric

    train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
    train = h2o.import_file(train_path, destination_frame="loan_train")
    train["bad_loan"] = train["bad_loan"].asfactor()

    # The predictor list is taken before the "weight" column is added, so
    # the weights are never passed as a feature.
    y = "bad_loan"
    x = train.col_names
    x.remove(y)
    x.remove("int_rate")

    train["weight"] = train["loan_amnt"]

    weighted_false_negative_loss_func = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py")

    from h2o.estimators import H2OGradientBoostingEstimator
    gbm = H2OGradientBoostingEstimator(
        model_id="gbm.hex",
        custom_metric_func=weighted_false_negative_loss_func)
    gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

    perf = gbm.model_performance()
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(perf.custom_metric_name(), "WeightedFalseNegativeLoss")
    self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
def test_glm_in_spark_pipeline(self):
    """Round-trip an H2OGLM pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on the prostate data, and the fitted model is saved, reloaded
    and finally used to transform the same frame.
    """
    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True, inferSchema=True)

    feature_names = ["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    glm_stage = H2OGLM(featuresCols=feature_names, labelCol="AGE", seed=1, ratio=0.8)

    # Persist the unfitted pipeline and read it back.
    pipeline_uri = "file://" + os.path.abspath("build/glm_pipeline")
    Pipeline(stages=[glm_stage]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit, persist the fitted model and read it back.
    fitted = restored_pipeline.fit(prostate_frame)
    model_uri = "file://" + os.path.abspath("build/glm_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    # count() forces the transformation to actually execute.
    restored_model.transform(prostate_frame).count()
def test_grid_gbm_in_spark_pipeline(self):
    """Round-trip an H2OGridSearch (GBM) pipeline and its fitted model through disk.

    The grid search is configured with the RandomDiscrete strategy over
    three seeds and selects the best model by RMSE.
    """
    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True, inferSchema=True)

    grid_stage = H2OGridSearch(
        labelCol="AGE",
        hyperParameters={"_seed": [1, 2, 3]},
        ratio=0.8,
        algo=H2OGBM(),
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE")

    # Persist the unfitted pipeline and read it back.
    pipeline_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline")
    Pipeline(stages=[grid_stage]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit, persist the fitted model and read it back.
    fitted = restored_pipeline.fit(prostate_frame)
    model_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    # count() forces the transformation to actually execute.
    restored_model.transform(prostate_frame).count()
def test_h2o_mojo_model_serialization_in_pipeline(self):
    """Check that a pipeline wrapping an H2OMOJOModel can be saved and loaded.

    Both the unfitted pipeline and the pipeline model produced by ``fit``
    are written to disk and read back; the test passes when none of these
    steps raises.
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojo_uri)

    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True)

    # Persist the unfitted pipeline and read it back.
    pipeline_uri = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo")
    Pipeline(stages=[mojo]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit, persist the fitted model and read it back.
    fitted = restored_pipeline.fit(prostate_frame)
    model_uri = "file://" + os.path.abspath(
        "build/test_spark_pipeline_model_mojo_model")
    fitted.write().overwrite().save(model_uri)
    PipelineModel.load(model_uri)
def test_h2o_mojo_pipeline_predictions(self):
    """Verify the first five predictions of the MOJO pipeline on prostate data.

    Predictions are read twice: directly from the ``prediction.preds``
    column and through the UDF returned by ``select_prediction_udf("AGE")``.
    Both must yield the same reference values, and the first output name
    of the MOJO must be ``AGE``.
    """
    # Load the MOJO and predict with it without starting an H2O Context.
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.create_from_mojo(mojo_uri)
    mojo.set_named_mojo_output_columns(False)

    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True)
    preds = mojo.predict(prostate_frame).repartition(1)

    reference = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )

    # Rows are indexed explicitly so that a short result still fails.
    raw_rows = preds.select("prediction.preds").take(5)
    for position, wanted in enumerate(reference):
        assert raw_rows[position][0][0] == wanted

    udf_rows = preds.select(mojo.select_prediction_udf("AGE")).take(5)
    for position, wanted in enumerate(reference):
        assert udf_rows[position][0] == wanted

    assert mojo.get_output_names()[0] == "AGE"
def setUpClass(cls):
    """Create the Spark session and split the prostate data for the test class.

    Stores the Spark configuration in ``cls._conf``, the session in
    ``cls._spark``, and an 80/20 random split (seed 1) of the prostate CSV
    in ``cls._trainingDataset`` and ``cls._testingDataset``.
    """
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()

    reader = cls._spark.read.options(header='true', inferSchema='true')
    dataset = reader.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"))

    training_part, testing_part = dataset.randomSplit([0.8, 0.2], 1)
    cls._trainingDataset = training_part
    cls._testingDataset = testing_part
def test_grid_gbm_in_spark_pipeline(self):
    """Round-trip an H2OGridSearch (GBM) pipeline and its fitted model through disk.

    The grid search runs over three seed values with ``predictionCol`` set
    to ``AGE``.
    """
    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True, inferSchema=True)

    grid_stage = H2OGridSearch(
        predictionCol="AGE",
        hyperParameters={"_seed": [1, 2, 3]},
        ratio=0.8,
        algo=H2OGBM())

    # Persist the unfitted pipeline and read it back.
    pipeline_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline")
    Pipeline(stages=[grid_stage]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit, persist the fitted model and read it back.
    fitted = restored_pipeline.fit(prostate_frame)
    model_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    # count() forces the transformation to actually execute.
    restored_model.transform(prostate_frame).count()
def test_mojo_dai_pipeline_serialize(self):
    """Serialize a MOJO pipeline wrapped in a Spark pipeline and check predictions.

    The single-stage Spark pipeline and its fitted model are both written
    to disk and read back; the reloaded model must reproduce the five
    reference ``AGE`` predictions on the prostate data.
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(mojo_uri)

    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True)

    # Spark pipeline with a single stage: the MOJO pipeline.
    pipeline_uri = "file://" + os.path.abspath(
        "build/test_dai_pipeline_as_spark_pipeline")
    Pipeline(stages=[mojo]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit the reloaded pipeline, then round-trip the resulting model.
    fitted = restored_pipeline.fit(prostate_frame)
    model_uri = "file://" + os.path.abspath(
        "build/test_dai_pipeline_as_spark_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    scored = restored_model.transform(prostate_frame).repartition(1)
    preds = scored.select(mojo.selectPredictionUDF("AGE")).take(5)

    reference = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )
    # Rows are indexed explicitly so that a short result still fails.
    for position, wanted in enumerate(reference):
        assert preds[position][0] == wanted
def test_h2o_mojo_predictions(self):
    """Check that an H2OMOJOModel can score the prostate data without error.

    No prediction values are asserted; the test passes when loading the
    MOJO, predicting and collecting the result all succeed.
    """
    # Load the MOJO and predict with it without starting an H2O Context.
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True)

    scored = mojo.predict(prostate_frame)
    scored.repartition(1).collect()
def setUpClass(cls):
    """Create the Spark session, the H2O context and the iris dataset.

    Stores the Spark configuration in ``cls._conf``, the session in
    ``cls._spark``, an H2OContext configured with a cluster size of 1 in
    ``cls._hc``, and the iris CSV (with header and inferred schema) in
    ``cls.dataset``.
    """
    cls._conf = unit_test_utils.get_default_spark_conf(
        cls._spark_options_from_params)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()

    h2o_conf = H2OConf(cls._spark).set_cluster_size(1)
    cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)

    iris_uri = "file://" + unit_test_utils.locate("smalldata/iris/iris_wheader.csv")
    cls.dataset = cls._spark.read.csv(iris_uri, header=True, inferSchema=True)
def test_h2o_mojo_pipeline_predictions_with_named_cols(self):
    """Verify the first five ``AGE`` predictions selected via the prediction UDF.

    The MOJO pipeline is applied with ``transform`` and the predictions
    are extracted through ``selectPredictionUDF("AGE")``.
    """
    # Load the MOJO and predict with it without starting an H2O Context.
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(mojo_uri)

    data_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(data_uri, header=True)

    scored = mojo.transform(prostate_frame).repartition(1)
    preds = scored.select(mojo.selectPredictionUDF("AGE")).take(5)

    reference = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )
    # Rows are indexed explicitly so that a short result still fails.
    for position, wanted in enumerate(reference):
        assert preds[position][0] == wanted
def test_load_mojo_deeplearning(self):
    """Compare predictions of a stored deep learning MOJO with a freshly trained model.

    A reproducible H2ODeepLearning model (seed 42) is trained on the
    prostate data, and its transformed rows must equal, row by row, the
    rows produced by the MOJO loaded from disk.
    """
    from pysparkling.ml import H2OMOJOModel, H2ODeepLearning

    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_prostate.mojo")
    mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

    h2o_frame = h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv"))
    prostate_frame = self._hc.as_spark_frame(h2o_frame)

    estimator = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")
    model = estimator.fit(prostate_frame)

    pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
    pred_model = model.transform(prostate_frame).repartition(1).collect()

    # Lengths are checked first, so zip() cannot hide a missing row.
    assert len(pred_mojo) == len(pred_model)
    for mojo_row, model_row in zip(pred_mojo, pred_model):
        assert mojo_row == model_row
def test_load_mojo_gbm(self):
    """Compare predictions of a stored GBM MOJO with a freshly trained model.

    An H2OGBM model (2 trees, seed 42, bernoulli distribution) is trained
    on the prostate data, and its transformed rows must equal, row by row,
    the rows produced by the MOJO loaded from disk.
    """
    from pysparkling.ml import H2OMOJOModel, H2OGBM

    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

    h2o_frame = h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv"))
    prostate_frame = self._hc.as_spark_frame(h2o_frame)

    estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", predictionCol="capsule")
    model = estimator.fit(prostate_frame)

    pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
    pred_model = model.transform(prostate_frame).repartition(1).collect()

    # Lengths are checked first, so zip() cannot hide a missing row.
    assert len(pred_mojo) == len(pred_model)
    for mojo_row, model_row in zip(pred_mojo, pred_model):
        assert mojo_row == model_row
def test_propagation_of_prediction_col(self):
    """Check that a custom ``predictionCol`` name appears in the transformed frame.

    An H2OGLM is fitted on the prostate data with ``predictionCol`` set to
    a custom name; the columns of the frame returned by ``transform`` must
    contain that name.
    """
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True, inferSchema=True)

    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol)

    model = algo.fit(prostate_frame)
    columns = model.transform(prostate_frame).columns

    # assertIn replaces the deprecated assertEquals(True, ...) form and
    # reports the actual column list when the check fails.
    self.assertIn(predictionCol, columns)