def test_custom_metric(self):
        from custom_metric_class import WeightedFalseNegativeLossMetric
        train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
        train = h2o.import_file(train_path, destination_frame="loan_train")
        train["bad_loan"] = train["bad_loan"].asfactor()

        y = "bad_loan"
        x = train.col_names
        x.remove(y)
        x.remove("int_rate")

        train["weight"] = train["loan_amnt"]

        weighted_false_negative_loss_func = h2o.upload_custom_metric(
            WeightedFalseNegativeLossMetric,
            func_name="WeightedFalseNegativeLoss",
            func_file="weighted_false_negative_loss.py")
        from h2o.estimators import H2OGradientBoostingEstimator
        gbm = H2OGradientBoostingEstimator(
            model_id="gbm.hex",
            custom_metric_func=weighted_false_negative_loss_func)
        gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

        perf = gbm.model_performance()
        self.assertEqual(perf.custom_metric_name(),
                         "WeightedFalseNegativeLoss")
        self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
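
For context, a minimal sketch of what the imported WeightedFalseNegativeLossMetric could look like. This assumes the standard three-method interface (map/reduce/metric) that h2o.upload_custom_metric expects; the penalty scheme is illustrative, not the actual definition in custom_metric_class.py:

    class WeightedFalseNegativeLossMetric:
        # Called once per row: pred holds the model output, act the actual
        # label, weight the row weight (loan_amnt above).
        def map(self, pred, act, weight, offset, model):
            fn_cost = 5.0  # illustrative penalty for a missed bad loan
            loss = fn_cost * weight if act[0] == 1 and pred[0] == 0 else 0.0
            return [loss, weight]

        # Merge two partial states: (sum of losses, sum of weights).
        def reduce(self, left, right):
            return [left[0] + right[0], left[1] + right[1]]

        # Final metric value: the weighted average loss.
        def metric(self, state):
            return state[0] / state[1]
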
    def test_glm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      ratio=0.8)

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline"))
        loaded_pipeline = Pipeline.load("file://" +
                                        os.path.abspath("build/glm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/glm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
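
After a save/load round trip it can help to confirm the stage actually survived; a small follow-on sketch using only the stock Spark PipelineModel API:

    # The restored PipelineModel should still contain the single H2O stage,
    # and transform() should produce a scored DataFrame.
    assert len(loaded_model.stages) == 1
    loaded_model.transform(prostate_frame).show(5)
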
    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(labelCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM(),
                             strategy="RandomDiscrete",
                             maxModels=3,
                             maxRuntimeSecs=60,
                             selectBestModelBy="RMSE")

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
    def test_h2o_mojo_model_serialization_in_pipeline(self):
        mojo = H2OMOJOModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)

        pipeline = Pipeline(stages=[mojo])

        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))
        loaded_pipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))

        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
        PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    def test_h2o_mojo_pipeline_predictions(self):
        # Try loading the MOJO and predicting with it without starting an H2O Context
        mojo = H2OMOJOPipelineModel.create_from_mojo(
            "file://" +
            os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo")
        )
        mojo.set_named_mojo_output_columns(False)
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        preds = mojo.predict(prostate_frame).repartition(1)

        normalSelection = preds.select("prediction.preds").take(5)

        assert normalSelection[0][0][0] == 65.36320409515132
        assert normalSelection[1][0][0] == 64.96902128114817
        assert normalSelection[2][0][0] == 64.96721023747583
        assert normalSelection[3][0][0] == 65.78772654671035
        assert normalSelection[4][0][0] == 66.11327967814829

        udfSelection = preds.select(mojo.select_prediction_udf("AGE")).take(5)

        assert udfSelection[0][0] == 65.36320409515132
        assert udfSelection[1][0] == 64.96902128114817
        assert udfSelection[2][0] == 64.96721023747583
        assert udfSelection[3][0] == 65.78772654671035
        assert udfSelection[4][0] == 66.11327967814829

        assert mojo.get_output_names()[0] == "AGE"
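
The test above disables named MOJO output columns and indexes into the prediction struct positionally. A sketch of the named variant, assuming (as the get_output_names() call suggests) that named mode exposes each target as a field of the prediction struct:

    mojo.set_named_mojo_output_columns(True)
    named_preds = mojo.predict(prostate_frame).repartition(1)
    # With named output columns the target can be addressed by name
    # rather than by position in "prediction.preds".
    named_preds.select("prediction.AGE").take(5)
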
    @classmethod
    def setUpClass(cls):
        cls._conf = unit_test_utils.get_default_spark_conf(
            cls._spark_options_from_params)
        cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
        dataset = cls._spark.read \
            .options(header='true', inferSchema='true') \
            .csv("file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"))
        [cls._trainingDataset, cls._testingDataset] = dataset.randomSplit([0.8, 0.2], 1)

    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(predictionCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM())

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
    def test_mojo_dai_pipeline_serialize(self):
        mojo = H2OMOJOPipelineModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/mojo2data/pipeline.mojo"))
        prostateFrame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        # Create a Spark pipeline with a single stage: the MOJO pipeline
        pipeline = Pipeline(stages=[mojo])
        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
        loadedPipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

        # Train the pipeline model
        model = loadedPipeline.fit(prostateFrame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
        loadedModel = PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

        preds = loadedModel.transform(prostateFrame).repartition(1).select(
            mojo.selectPredictionUDF("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829
    def test_h2o_mojo_predictions(self):
        # Try loading the MOJO and predicting with it without starting an H2O Context
        mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        mojo.predict(prostate_frame).repartition(1).collect()
    @classmethod
    def setUpClass(cls):
        cls._conf = unit_test_utils.get_default_spark_conf(
            cls._spark_options_from_params)
        cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
        cls._hc = H2OContext.getOrCreate(
            cls._spark,
            H2OConf(cls._spark).set_cluster_size(1))
        cls.dataset = cls._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/iris/iris_wheader.csv"),
            header=True,
            inferSchema=True)
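
Neither fixture tears its sessions down; a minimal counterpart sketch, assuming H2OContext exposes a stop() method as in recent Sparkling Water releases:

    @classmethod
    def tearDownClass(cls):
        # Shut down the H2O cluster first, then the Spark session.
        cls._hc.stop()
        cls._spark.stop()
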
    def test_h2o_mojo_pipeline_predictions_with_named_cols(self):
        # Try loading the MOJO and predicting with it without starting an H2O Context
        mojo = H2OMOJOPipelineModel.createFromMojo(
            "file://" + os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
        prostateFrame = self._spark.read.csv("file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
                                             header=True)
        preds = mojo.transform(prostateFrame).repartition(1).select(mojo.selectPredictionUDF("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829
    def test_load_mojo_deeplearning(self):
        from pysparkling.ml import H2OMOJOModel, H2ODeepLearning
        mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo"))
        prostate_frame = self._hc.as_spark_frame(h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        dl = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")

        model = dl.fit(prostate_frame)

        pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        pred_model = model.transform(prostate_frame).repartition(1).collect()

        assert len(pred_mojo) == len(pred_model)
        for i in range(len(pred_mojo)):
            assert pred_mojo[i] == pred_model[i]
    def test_load_mojo_gbm(self):
        from pysparkling.ml import H2OMOJOModel, H2OGBM
        mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._hc.as_spark_frame(h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", predictionCol="capsule")

        model = gbm.fit(prostate_frame)

        pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        pred_model = model.transform(prostate_frame).repartition(1).collect()

        assert len(pred_mojo) == len(pred_model)
        for i in range(len(pred_mojo)):
            assert pred_mojo[i] == pred_model[i]
    def test_propagation_of_prediction_col(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        predictionCol = "my_prediction_col_name"
        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      splitRatio=0.8,
                      predictionCol=predictionCol)

        model = algo.fit(prostate_frame)
        columns = model.transform(prostate_frame).columns
        self.assertIn(predictionCol, columns)
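
A natural companion check would pin down the default column name; this sketch assumes Sparkling Water defaults predictionCol to "prediction", which is worth verifying against the version in use:

    def test_default_prediction_col(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)
        # Assumed default: no explicit predictionCol yields a "prediction" column.
        algo = H2OGLM(labelCol="AGE", seed=1, splitRatio=0.8)
        model = algo.fit(prostate_frame)
        self.assertIn("prediction", model.transform(prostate_frame).columns)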