def testLoadAndTrainMojo(hc, spark):
    """A GBM trained here must produce the same transform output as the reference MOJO."""
    pretrained = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    df = spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True, inferSchema=True)
    training_frame = hc.asH2OFrame(df)
    # The label must be categorical for a bernoulli GBM.
    training_frame["CAPSULE"] = training_frame["CAPSULE"].asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, seed=42)
    estimator.train(y="CAPSULE", training_frame=training_frame)
    mojo_path = estimator.download_mojo(path=os.path.abspath("build/"), get_genmodel_jar=False)
    freshly_trained = H2OMOJOModel.createFromMojo("file://" + mojo_path)
    expected = pretrained.transform(df)
    actual = freshly_trained.transform(df)
    unit_test_utils.assert_data_frames_are_identical(expected, actual)
def test_h2o_mojo_model_serialization_in_pipeline(self):
    """Round-trip a MOJO-backed Pipeline and its fitted PipelineModel through save/load."""
    mojo_model = H2OMOJOModel.createFromMojo("file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)
    pipeline_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo")
    model_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model")
    Pipeline(stages=[mojo_model]).write().overwrite().save(pipeline_path)
    restored_pipeline = Pipeline.load(pipeline_path)
    fitted = restored_pipeline.fit(prostate_frame)
    fitted.write().overwrite().save(model_path)
    # Loading back must not raise.
    PipelineModel.load(model_path)
def testMOJOModelReturnsSameResultAsBinaryModelWhenOffsetColumnsIsSet(hc, dataset):
    """The exported MOJO must score identically to the in-memory binary model,
    and the offset column must be carried over into the MOJO model.

    Fix: removed a stray debug ``print(mojoFile)`` left in the test body.
    """
    [trainingDataset, testingDataset] = dataset.randomSplit([0.8, 0.2], 1)
    trainingFrame = hc.as_h2o_frame(trainingDataset)
    testingFrame = hc.as_h2o_frame(testingDataset)
    gbm = H2OGradientBoostingEstimator(distribution="tweedie",
                                       ntrees=600,
                                       max_depth=1,
                                       min_rows=1,
                                       learn_rate=0.1,
                                       min_split_improvement=0)
    gbm.train(x=["District", "Group", "Age"], y="Claims",
              training_frame=trainingFrame, offset_column="Offset")
    mojoFile = gbm.download_mojo(path=os.path.abspath("build/"), get_genmodel_jar=False)
    mojoModel = H2OMOJOModel.createFromMojo("file://" + mojoFile)
    binaryModelResult = hc.as_spark_frame(gbm.predict(testingFrame))
    mojoResult = mojoModel.transform(testingDataset).select("prediction")
    unit_test_utils.assert_data_frames_are_identical(binaryModelResult, mojoResult)
    assert mojoModel.getOffsetCol() == "Offset", \
        "Offset column must be propagated to the MOJO model."
def func():
    """Load a model from ``local_model_path``, trying formats in order:
    Spark PipelineModel, then H2O MOJO, then XGBoost.

    Returns:
        tuple: ``(model, pipeline_model)``. ``pipeline_model`` is only
        assigned on the XGBoost path.

    Fix: bare ``except:`` narrowed to ``except Exception`` so SystemExit /
    KeyboardInterrupt are no longer swallowed; removed commented-out code.
    """
    try:
        global model
        print("尝试加载PipelineModel")
        model = PipelineModel.load(local_model_path)  # Spark ML pipeline model
        print("加载pipeline模型成功")
    except Exception:
        try:
            # H2O models must go through this branch.
            from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
            print("从加载PipelineModel的try中跳出")
            print("在except的try中尝试加载H2OMOJOModel")
            settings = H2OMOJOSettings(withDetailedPredictionCol=True)
            model = H2OMOJOModel.createFromMojo(
                local_model_path + '/mojo_model', settings)
        except Exception:
            global pipeline_model
            print("从加载H2OMOJOModel的try中跳出")
            print("尝试加载XGBModel")
            model = load_xgb_model(local_model_path,
                                   m_type='XGBoostClassificationModel')
            if not model:
                logging.error('XGBoostClassificationModel没有加载成功')
            pipeline_model = load_xgb_model(local_model_path, "PipelineModel")
            if not pipeline_model:
                logging.error('XGB需要的pipelinemodel没有加载成功')
                logging.error(pipeline_model)
    # NOTE(review): when a non-XGB branch succeeds, `pipeline_model` must
    # already exist at module level or this return raises NameError — confirm.
    return model, pipeline_model
def init():
    """Initialise module-level model state.

    Tries, in order: Spark PipelineModel, H2O MOJO, XGBoost; then loads the
    transform JSON that accompanies the model. Sets globals ``model``
    (and ``pipeline_model`` on the XGBoost path), ``final_transform_json_path``
    and ``model_json``.

    Fix: bare ``except:`` narrowed to ``except Exception`` so SystemExit /
    KeyboardInterrupt are no longer swallowed; removed commented-out code.
    """
    # Load the model, trying the supported formats in order.
    try:
        global model
        print("尝试加载PipelineModel")
        model = PipelineModel.load(local_model_path)  # Spark ML pipeline model
    except Exception:
        try:
            # H2O models must go through this branch.
            from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
            print("从加载PipelineModel的try中跳出")
            print("在except的try中尝试加载H2OMOJOModel")
            settings = H2OMOJOSettings(withDetailedPredictionCol=True)
            model = H2OMOJOModel.createFromMojo(
                local_model_path + '/mojo_model', settings)
        except Exception:
            global pipeline_model
            print("从加载H2OMOJOModel的try中跳出")
            print("尝试加载XGBModel")
            model = load_xgb_model(local_model_path,
                                   m_type='XGBoostClassificationModel')
            if not model:
                logging.error('XGBoostClassificationModel没有加载成功')
            pipeline_model = load_xgb_model(local_model_path, "PipelineModel")
            if not pipeline_model:
                logging.error('XGB需要的pipelinemodel没有加载成功')
                logging.error(pipeline_model)
    global final_transform_json_path
    final_transform_json_path = get_jsonfile_fullname()
    # Read the transform JSON stored alongside the model into `model_json`.
    with open(final_transform_json_path, encoding='utf-8') as f:
        global model_json
        model_json = json.load(f)
def test_h2o_mojo_predictions(self):
    """Score a MOJO on a Spark frame without starting an H2O Context."""
    binom_mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo"))
    scoring_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)
    # Collecting forces the whole scoring pipeline to run.
    binom_mojo.predict(scoring_frame).repartition(1).collect()
def testMojoUnsupervised(spark):
    """Transform a single-column frame with an unsupervised (isolation forest) MOJO."""
    iso_mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/isolation_forest.mojo"))
    scoring_row = Row("V1")
    frame = spark.createDataFrame(
        spark.sparkContext.parallelize([(5.1,)]).map(lambda values: scoring_row(*values)))
    iso_mojo.transform(frame).repartition(1).collect()
def test_h2o_mojo_unsupervised(self):
    """Predict with an unsupervised (isolation forest) MOJO on a one-row frame."""
    iso_mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/isolation_forest.mojo"))
    scoring_row = Row("V1")
    frame = self._spark.createDataFrame(
        self._spark.sparkContext.parallelize(
            [(5.1,)]
        ).map(lambda values: scoring_row(*values)))
    iso_mojo.predict(frame).repartition(1).collect()
def test_h2o_mojo_predictions_unseen_categoricals(self):
    """Unseen categorical levels are converted to NA instead of failing scoring."""
    dl_mojo = H2OMOJOModel.create_from_mojo(
        "../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
    dl_mojo.setConvertUnknownCategoricalLevelsToNa(True)
    rows = [{'sepal_len': 5.1, 'sepal_wid': 3.5, 'petal_len': 1.4,
             'petal_wid': 0.2, 'class': 'Missing_categorical'}]
    frame = self._spark.createDataFrame(rows)
    scored = dl_mojo.transform(frame).collect()[0]
    # Input columns pass through untouched.
    assert scored["class"] == "Missing_categorical"
    assert scored["petal_len"] == 1.4
    assert scored["petal_wid"] == 0.2
    assert scored["sepal_len"] == 5.1
    assert scored["sepal_wid"] == 3.5
    assert scored["prediction_output"][0] == 5.240174068202646
def h2o_model_load(self, path):
    """Load an H2O MOJO model saved under *path*.

    :param path: base directory the model was saved under
    :return: the loaded H2OMOJOModel
    """
    from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
    mojo_dir = self.concat_path(path, self.model_key)
    mojo_settings = H2OMOJOSettings(withDetailedPredictionCol=False)
    return H2OMOJOModel.createFromMojo(mojo_dir + "/mojo_model", mojo_settings)
def testMojoPredictions(prostateDataset, prostateDatasetWithBinomialPrediction):
    """MOJO scoring output matches the reference binomial-prediction dataset."""
    # No running H2O Context is needed to load or score the MOJO.
    binom_mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    scored = (binom_mojo.transform(prostateDataset)
              .repartition(1)
              .withColumn("prob0", col("detailed_prediction.probabilities.0"))
              .withColumn("prob1", col("detailed_prediction.probabilities.1"))
              .drop("detailed_prediction"))
    unit_test_utils.assert_data_frames_are_identical(
        scored, prostateDatasetWithBinomialPrediction)
def load_mojo_model(local_dir, filename, extension=""):
    """Load a saved H2OMOJOModel (usable with Spark without a running
    H2O Sparkling session).

    :param string local_dir: local directory where the model is saved
    :param string filename: filename with which the model is saved
    :param string extension: optional extension appended to the filename
    :return: the loaded H2OMOJOModel
    """
    from pysparkling.ml import H2OMOJOModel
    mojo_path = "/".join([local_dir, filename + extension])
    return H2OMOJOModel.create_from_mojo(mojo_path)
def testLoadAndTrainMojo(prostateDataset):
    """A reproducible DL model trained now scores identically to the stored MOJO."""
    stored_mojo = H2OMOJOModel.createFromMojo("file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_prostate.mojo"))
    estimator = H2ODeepLearning(seed=42, reproducible=True, labelCol="CAPSULE")
    trained = estimator.fit(prostateDataset)
    mojo_rows = stored_mojo.transform(prostateDataset).repartition(1).collect()
    model_rows = trained.transform(prostateDataset).repartition(1).collect()
    assert len(mojo_rows) == len(model_rows)
    for mojo_row, model_row in zip(mojo_rows, model_rows):
        assert mojo_row == model_row
def testLoadAndTrainMojo(prostateDataset):
    """A GBM trained with a fixed seed scores identically to the stored MOJO."""
    stored_mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    trained = estimator.fit(prostateDataset)
    mojo_rows = stored_mojo.transform(prostateDataset).repartition(1).collect()
    model_rows = trained.transform(prostateDataset).repartition(1).collect()
    assert len(mojo_rows) == len(model_rows)
    for mojo_row, model_row in zip(mojo_rows, model_rows):
        assert mojo_row == model_row
def test_load_mojo_deeplearning(self):
    """A reproducible DL model trained now scores identically to the stored MOJO."""
    from pysparkling.ml import H2OMOJOModel, H2ODeepLearning
    stored_mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    estimator = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")
    trained = estimator.fit(prostate_frame)
    mojo_rows = stored_mojo.predict(prostate_frame).repartition(1).collect()
    model_rows = trained.transform(prostate_frame).repartition(1).collect()
    assert len(mojo_rows) == len(model_rows)
    for mojo_row, model_row in zip(mojo_rows, model_rows):
        assert mojo_row == model_row
def test_load_mojo_gbm(self):
    """A GBM trained with a fixed seed scores identically to the stored MOJO."""
    from pysparkling.ml import H2OMOJOModel, H2OGBM
    stored_mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", predictionCol="capsule")
    trained = estimator.fit(prostate_frame)
    mojo_rows = stored_mojo.predict(prostate_frame).repartition(1).collect()
    model_rows = trained.transform(prostate_frame).repartition(1).collect()
    assert len(mojo_rows) == len(model_rows)
    for mojo_row, model_row in zip(mojo_rows, model_rows):
        assert mojo_row == model_row
def testDomainColumns(prostateDataset):
    """Only the categorical label column carries domain values; numeric
    predictors report ``None``.

    Fix: removed the unused local ``mojo`` (a MOJO was loaded from disk and
    never referenced by the test).
    """
    gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    model = gbm.fit(prostateDataset)
    domainValues = model.getDomainValues()
    assert domainValues["DPROS"] is None
    assert domainValues["DCAPS"] is None
    assert domainValues["VOL"] is None
    assert domainValues["AGE"] is None
    assert domainValues["PSA"] is None
    assert domainValues["capsule"] == ["0", "1"]
    assert domainValues["RACE"] is None
    assert domainValues["ID"] is None
def test_h2o_mojo_predictions_unseen_categoricals(self):
    """Unknown categorical levels are mapped to NA when scoring the MOJO."""
    airlines_mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_airlines_categoricals.zip"))
    airlines_mojo.setConvertUnknownCategoricalLevelsToNa(True)
    schema_row = Row("sepal_len", "sepal_wid", "petal_len", "petal_wid", "class")
    frame = self._spark.createDataFrame(
        self._spark.sparkContext.parallelize(
            [(5.1, 3.5, 1.4, 0.2, "Missing_categorical")]
        ).map(lambda values: schema_row(*values)))
    scored = airlines_mojo.transform(frame).collect()[0]
    # Input columns pass through untouched.
    assert scored["class"] == "Missing_categorical"
    assert scored["petal_len"] == 1.4
    assert scored["petal_wid"] == 0.2
    assert scored["sepal_len"] == 5.1
    assert scored["sepal_wid"] == 3.5
    assert scored["prediction_output"][0] == 5.240174068202646
def testMojoModelSerializationInPipeline(prostateDataset):
    """Round-trip a MOJO-backed Pipeline and its fitted PipelineModel through save/load."""
    mojo_model = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    pipeline_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo")
    model_path = "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model")
    Pipeline(stages=[mojo_model]).write().overwrite().save(pipeline_path)
    restored_pipeline = Pipeline.load(pipeline_path)
    fitted = restored_pipeline.fit(prostateDataset)
    fitted.write().overwrite().save(model_path)
    # Loading back must not raise.
    PipelineModel.load(model_path)
def testMojoPredictionsUnseenCategoricals(spark):
    """With convertUnknownCategoricalLevelsToNa enabled, unseen levels still score."""
    mojo_path = "file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
    mojo_settings = H2OMOJOSettings(convertUnknownCategoricalLevelsToNa=True)
    airlines_mojo = H2OMOJOModel.createFromMojo(mojo_path, mojo_settings)
    schema_row = Row("sepal_len", "sepal_wid", "petal_len", "petal_wid", "class")
    frame = spark.createDataFrame(
        spark.sparkContext.parallelize(
            [(5.1, 3.5, 1.4, 0.2, "Missing_categorical")]
        ).map(lambda values: schema_row(*values)))
    scored = airlines_mojo.transform(frame).collect()[0]
    # Input columns pass through untouched.
    assert scored["class"] == "Missing_categorical"
    assert scored["petal_len"] == 1.4
    assert scored["petal_wid"] == 0.2
    assert scored["sepal_len"] == 5.1
    assert scored["sepal_wid"] == 3.5
    assert scored["prediction"] == 5.240174068202646
def testLoadGBMModelAsMOJOModel(savedGbmModel):
    """A GBM saved as a MOJO exposes a positive tree count after loading."""
    loaded = H2OMOJOModel.createFromMojo(savedGbmModel)
    assert loaded.getNtrees() > 0
def init():
    """Initialise module-level model state.

    Tries, in order: PMML file, Spark PipelineModel, H2O MOJO, XGBoost,
    recording which format loaded in ``model_tag`` (1-4), then reads the
    transform JSON that accompanies the model into ``model_json``.

    Fixes: bare ``except:`` narrowed to ``except Exception`` so SystemExit /
    KeyboardInterrupt are no longer swallowed; the PMML branch previously
    relied on a NameError from an unbound ``full_path`` to fall through when
    no PMML artifact existed — that is now an explicit FileNotFoundError.
    """
    global model_tag
    global pmmlFields
    # Download the model archive.
    download_model.download_model(download_model_zip_path, unzip_path)
    try:
        # If a pmml file exists under the model path, load it directly.
        # The pmml archive layout is model/xxx.pmml; because of that layout
        # the unzip helper had to be adapted.
        model_path_childs = os.listdir(local_model_path)
        logging.info(f'模型文件夹下的文件有:{model_path_childs}')
        full_path = None
        for child in model_path_childs:
            if child.endswith(".pmml"):
                full_path = os.path.join(local_model_path, child)
                break
            # Or the pmml model saved as model/model/part-00000.
            elif child == "model":
                for file in os.listdir(os.path.join(local_model_path, "model")):
                    if file.startswith("part"):
                        full_path = local_model_path + "/model/" + file
                        break
        if full_path is None:
            # Explicitly fall through to the non-PMML loaders below.
            raise FileNotFoundError("no pmml artifact under " + local_model_path)
        logging.info(f'获取到的模型路径是:{full_path}')
        print("模型大小是:", os.path.getsize(full_path))
        global pmmlModel
        pmmlModel = loadPmml.fromFile(full_path)
        pmmlFields = parse_xml(full_path)
        logging.info(f'成功加载pmml模型')
        model_tag = 1
    except Exception:
        logging.info("从pmml模型的加载处理中跳出")
        # Resolve the model path.
        get_model_path(local_model_path)
        # Load the model, trying the remaining formats in order.
        try:
            global model
            logging.info("尝试加载PipelineModel")
            model = PipelineModel.load(local_model_path)  # Spark ML pipeline model
            model_tag = 2
        except Exception:
            try:
                # H2O models must go through this branch.
                from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
                logging.info("从加载PipelineModel的try中跳出")
                print("在except的try中尝试加载H2OMOJOModel")
                settings = H2OMOJOSettings(withDetailedPredictionCol=True)
                model = H2OMOJOModel.createFromMojo(
                    local_model_path + '/mojo_model', settings)
                model_tag = 3
            except Exception:
                global pipeline_model
                print("从加载H2OMOJOModel的try中跳出")
                print("尝试加载XGBModel")
                model = load_xgb_model(local_model_path,
                                       m_type='XGBoostClassificationModel')
                if not model:
                    logging.error('XGBoostClassificationModel没有加载成功')
                pipeline_model = load_xgb_model(local_model_path, "PipelineModel")
                if not pipeline_model:
                    logging.error('XGB需要的pipelinemodel没有加载成功')
                    logging.error(pipeline_model)
                model_tag = 4
    global final_transform_json_path
    final_transform_json_path = get_jsonfile_fullname()
    # Read the transform JSON stored alongside the model into `model_json`.
    with open(final_transform_json_path, encoding='utf-8') as f:
        global model_json
        model_json = json.load(f)
def testMojoPredictions(prostateDataset):
    """Loading and scoring a MOJO requires no running H2O Context."""
    binom_mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    binom_mojo.transform(prostateDataset).repartition(1).collect()