def test_h2o_mojo_model_serialization_in_pipeline(self):
    """Round-trip a Pipeline (and the fitted PipelineModel) containing a MOJO model through save/load."""
    mojo_path = "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")
    pipeline_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo")
    fitted_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model")

    mojo = H2OMOJOModel.create_from_mojo(mojo_path)
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"), header=True)

    # Persist and reload the unfitted pipeline, then fit it and round-trip the fitted model.
    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(pipeline_path)
    loaded_pipeline = Pipeline.load(pipeline_path)
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(fitted_path)
    PipelineModel.load(fitted_path)
def test_h2o_mojo_predictions(self):
    """Load a MOJO and score with it without starting an H2O Context."""
    model = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"), header=True)
    # Collecting forces the scoring to actually run.
    model.predict(frame).repartition(1).collect()
def test_h2o_mojo_predictions_unseen_categoricals_dict_input(self):
    """Unseen categorical levels score as the NA-level prediction when conversion is enabled
    (variant building the frame from a list of dicts).

    NOTE(review): this method originally had the same name as a later test in this class,
    so Python silently shadowed it and it never ran; renamed here so both variants execute.
    """
    # Use the same "file://" + abspath convention as the sibling tests (was a bare relative path).
    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/deep_learning_airlines_categoricals.zip"))
    mojo.setConvertUnknownCategoricalLevelsToNa(True)
    rows = [{'sepal_len': 5.1, 'sepal_wid': 3.5, 'petal_len': 1.4, 'petal_wid': 0.2,
             'class': 'Missing_categorical'}]
    df = self._spark.createDataFrame(rows)
    data = mojo.transform(df).collect()[0]
    # Input columns pass through unchanged; the unknown level maps to the NA-level prediction.
    assert data["class"] == "Missing_categorical"
    assert data["petal_len"] == 1.4
    assert data["petal_wid"] == 0.2
    assert data["sepal_len"] == 5.1
    assert data["sepal_wid"] == 3.5
    assert data["prediction_output"][0] == 5.240174068202646
def test_h2o_mojo_unsupervised(self):
    """Score a single-column frame with an unsupervised (isolation forest) MOJO."""
    model = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/isolation_forest.mojo"))
    scoring_row = Row("V1")
    rdd = self._spark.sparkContext.parallelize([(5.1,)]).map(lambda values: scoring_row(*values))
    frame = self._spark.createDataFrame(rdd)
    model.predict(frame).repartition(1).collect()
def load_mojo_model(local_dir, filename, extension=""):
    """Load a saved H2OMOJOModel (usable with Spark without a running H2O Sparkling session).

    :param string local_dir: local directory where the model is saved
    :param string filename: filename with which the model is saved
    :param string extension: optional extension appended to the filename
    :return: the loaded H2OMOJOModel
    """
    import os
    from pysparkling.ml import H2OMOJOModel
    # os.path.join avoids a doubled separator when local_dir already ends with "/".
    return H2OMOJOModel.create_from_mojo(os.path.join(local_dir, filename + extension))
def test_load_mojo_deeplearning(self):
    """Predictions from a loaded deep-learning MOJO match a freshly trained reproducible model."""
    from pysparkling.ml import H2OMOJOModel, H2ODeepLearning
    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    estimator = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")
    trained = estimator.fit(prostate_frame)
    mojo_preds = mojo.predict(prostate_frame).repartition(1).collect()
    model_preds = trained.transform(prostate_frame).repartition(1).collect()
    assert len(mojo_preds) == len(model_preds)
    for mojo_row, model_row in zip(mojo_preds, model_preds):
        assert mojo_row == model_row
def test_load_mojo_gbm(self):
    """Predictions from a loaded GBM MOJO match a freshly trained model with the same seed."""
    from pysparkling.ml import H2OMOJOModel, H2OGBM
    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", predictionCol="capsule")
    trained = estimator.fit(prostate_frame)
    mojo_preds = mojo.predict(prostate_frame).repartition(1).collect()
    model_preds = trained.transform(prostate_frame).repartition(1).collect()
    assert len(mojo_preds) == len(model_preds)
    for mojo_row, model_row in zip(mojo_preds, model_preds):
        assert mojo_row == model_row
def test_h2o_mojo_predictions_unseen_categoricals(self):
    """Unseen categorical levels score as the NA-level prediction when conversion is enabled."""
    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/deep_learning_airlines_categoricals.zip"))
    mojo.setConvertUnknownCategoricalLevelsToNa(True)
    scoring_row = Row("sepal_len", "sepal_wid", "petal_len", "petal_wid", "class")
    rdd = self._spark.sparkContext.parallelize(
        [(5.1, 3.5, 1.4, 0.2, "Missing_categorical")]).map(lambda values: scoring_row(*values))
    df = self._spark.createDataFrame(rdd)
    data = mojo.transform(df).collect()[0]
    # Input columns pass through unchanged alongside the prediction output.
    assert data["class"] == "Missing_categorical"
    assert data["petal_len"] == 1.4
    assert data["petal_wid"] == 0.2
    assert data["sepal_len"] == 5.1
    assert data["sepal_wid"] == 3.5
    assert data["prediction_output"][0] == 5.240174068202646