Esempio n. 1
0
def testLoadAndTrainMojo(hc, spark):
    """Check that a freshly trained GBM MOJO scores like the stored reference MOJO.

    A two-tree bernoulli GBM (seed 42) is trained on the prostate CSV through
    the H2O frame API with ``CAPSULE`` converted to a factor, downloaded as a
    MOJO into ``build/`` and loaded back.  Its ``transform`` output on the
    prostate data must be identical to that of ``binom_model_prostate.mojo``.

    :param hc: object providing ``asH2OFrame`` (presumably the H2OContext fixture).
    :param spark: object providing ``read.csv`` (presumably the SparkSession fixture).
    """
    referenceUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    referenceMojo = H2OMOJOModel.createFromMojo(referenceUri)

    csvUri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostateDf = spark.read.csv(csvUri, header=True, inferSchema=True)

    # Train the comparison model on the H2O side; the response must be categorical.
    h2oFrame = hc.asH2OFrame(prostateDf)
    h2oFrame["CAPSULE"] = h2oFrame["CAPSULE"].asfactor()
    estimator = H2OGradientBoostingEstimator(
        distribution="bernoulli", ntrees=2, seed=42)
    estimator.train(y="CAPSULE", training_frame=h2oFrame)

    # Export the trained model as a MOJO and load it back through Sparkling Water.
    downloadedPath = estimator.download_mojo(
        path=os.path.abspath("build/"), get_genmodel_jar=False)
    trainedMojo = H2OMOJOModel.createFromMojo("file://" + downloadedPath)

    unit_test_utils.assert_data_frames_are_identical(
        referenceMojo.transform(prostateDf),
        trainedMojo.transform(prostateDf))
Esempio n. 2
0
    def test_h2o_mojo_model_serialization_in_pipeline(self):
        """Save and reload a MOJO pipeline, then do the same for its fitted model.

        The unfitted ``Pipeline`` wrapping the prostate MOJO is written to
        ``build/test_spark_pipeline_model_mojo`` and loaded back.  The loaded
        pipeline is fitted on the prostate CSV and the resulting model goes
        through the same write/load cycle under
        ``build/test_spark_pipeline_model_mojo_model``.  Nothing is asserted;
        the test passes when none of these steps raises.
        """
        mojo_uri = "file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo")
        mojo = H2OMOJOModel.createFromMojo(mojo_uri)

        csv_uri = "file://" + unit_test_utils.locate(
            "smalldata/prostate/prostate.csv")
        prostate_frame = self._spark.read.csv(csv_uri, header=True)

        # Round trip of the unfitted pipeline.
        pipeline_uri = "file://" + os.path.abspath(
            "build/test_spark_pipeline_model_mojo")
        Pipeline(stages=[mojo]).write().overwrite().save(pipeline_uri)
        restored_pipeline = Pipeline.load(pipeline_uri)

        # Round trip of the fitted pipeline model.
        fitted = restored_pipeline.fit(prostate_frame)
        fitted_uri = "file://" + os.path.abspath(
            "build/test_spark_pipeline_model_mojo_model")
        fitted.write().overwrite().save(fitted_uri)
        PipelineModel.load(fitted_uri)
Esempio n. 3
0
def testMOJOModelReturnsSameResultAsBinaryModelWhenOffsetColumnsIsSet(
        hc, dataset):
    """Compare binary-model and MOJO predictions when an offset column is used.

    ``dataset`` is split 80/20 (seed 1).  A tweedie GBM is trained on the
    training part with ``offset_column="Offset"``, exported as a MOJO into
    ``build/`` and loaded back.  Predictions of the in-memory H2O model and of
    the MOJO on the testing part must be identical, and the MOJO must report
    ``"Offset"`` as its offset column.

    :param hc: object providing ``as_h2o_frame`` / ``as_spark_frame``
        (presumably the H2OContext fixture).
    :param dataset: Spark DataFrame containing the columns referenced below.
    """
    trainPart, testPart = dataset.randomSplit([0.8, 0.2], 1)
    h2oTrain = hc.as_h2o_frame(trainPart)
    h2oTest = hc.as_h2o_frame(testPart)

    estimatorParams = dict(
        distribution="tweedie",
        ntrees=600,
        max_depth=1,
        min_rows=1,
        learn_rate=0.1,
        min_split_improvement=0,
    )
    estimator = H2OGradientBoostingEstimator(**estimatorParams)
    estimator.train(
        x=["District", "Group", "Age"],
        y="Claims",
        training_frame=h2oTrain,
        offset_column="Offset",
    )

    downloadedPath = estimator.download_mojo(
        path=os.path.abspath("build/"), get_genmodel_jar=False)
    print(downloadedPath)
    mojoModel = H2OMOJOModel.createFromMojo("file://" + downloadedPath)

    binaryPredictions = hc.as_spark_frame(estimator.predict(h2oTest))
    mojoPredictions = mojoModel.transform(testPart).select("prediction")
    unit_test_utils.assert_data_frames_are_identical(
        binaryPredictions, mojoPredictions)

    offsetColumn = mojoModel.getOffsetCol()
    assert offsetColumn == "Offset", \
        "Offset column must be propagated to the MOJO model."
Esempio n. 4
0
def func():
    """Load the model stored under ``local_model_path``, trying three formats.

    The loaders are tried in order, each one being the fallback of the
    previous one:

    1. Spark ``PipelineModel.load(local_model_path)``
    2. H2O MOJO at ``local_model_path + '/mojo_model'`` (with the detailed
       prediction column enabled)
    3. ``load_xgb_model`` for the XGBoost classification model plus the
       accompanying ``PipelineModel``

    The loaded objects are published through the module globals ``model``
    and, on the XGBoost path, ``pipeline_model``.

    :return: tuple ``(model, pipeline_model)``.  ``pipeline_model`` is
        ``None`` when no such global has been assigned yet; previously this
        case raised ``NameError`` on the return statement.
    """
    global model, pipeline_model
    try:
        print("尝试加载PipelineModel")
        model = PipelineModel.load(local_model_path)  # load the model
        print("加载pipeline模型成功")
    except Exception as pipeline_error:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed.  The reason is logged so a
        # failed load can be diagnosed.
        logging.warning("PipelineModel load failed: %s", pipeline_error)
        try:
            # H2O models must take this path.
            from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
            print("从加载PipelineModel的try中跳出")
            print("在except的try中尝试加载H2OMOJOModel")
            settings = H2OMOJOSettings(withDetailedPredictionCol=True)
            model = H2OMOJOModel.createFromMojo(
                local_model_path + '/mojo_model', settings)
        except Exception as mojo_error:
            logging.warning("H2OMOJOModel load failed: %s", mojo_error)
            print("从加载H2OMOJOModel的try中跳出")
            print("尝试加载XGBModel")
            model = load_xgb_model(local_model_path,
                                   m_type='XGBoostClassificationModel')
            if not model:
                logging.error('XGBoostClassificationModel没有加载成功')
            pipeline_model = load_xgb_model(local_model_path, "PipelineModel")
            if not pipeline_model:
                logging.error('XGB需要的pipelinemodel没有加载成功')
                logging.error(pipeline_model)

    # pipeline_model is only assigned on the XGBoost path; look it up
    # defensively so the other two paths do not fail with NameError.
    return model, globals().get("pipeline_model")
def init():
    """Load the model and its transform JSON into module globals.

    The model under ``local_model_path`` is loaded with the first loader that
    succeeds: Spark ``PipelineModel``, then the H2O MOJO at
    ``local_model_path + '/mojo_model'``, then ``load_xgb_model`` (which also
    fills ``pipeline_model``).  Afterwards the JSON file located by
    ``get_jsonfile_fullname()`` is parsed into ``model_json``.

    Globals written: ``model``, ``pipeline_model`` (XGBoost path only),
    ``final_transform_json_path``, ``model_json``.
    """
    global model, pipeline_model
    global final_transform_json_path, model_json

    # Load the model.
    try:
        print("尝试加载PipelineModel")
        model = PipelineModel.load(local_model_path)  # load the model
    except Exception as pipeline_error:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed, and the reason is logged.
        logging.warning("PipelineModel load failed: %s", pipeline_error)
        try:
            # H2O models must take this path.
            from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
            print("从加载PipelineModel的try中跳出")
            print("在except的try中尝试加载H2OMOJOModel")
            settings = H2OMOJOSettings(withDetailedPredictionCol=True)
            model = H2OMOJOModel.createFromMojo(
                local_model_path + '/mojo_model', settings)
        except Exception as mojo_error:
            logging.warning("H2OMOJOModel load failed: %s", mojo_error)
            print("从加载H2OMOJOModel的try中跳出")
            print("尝试加载XGBModel")
            model = load_xgb_model(local_model_path,
                                   m_type='XGBoostClassificationModel')
            if not model:
                logging.error('XGBoostClassificationModel没有加载成功')
            pipeline_model = load_xgb_model(local_model_path, "PipelineModel")
            if not pipeline_model:
                logging.error('XGB需要的pipelinemodel没有加载成功')
                logging.error(pipeline_model)

    final_transform_json_path = get_jsonfile_fullname()

    # Read the JSON stored alongside the model into model_json.
    with open(final_transform_json_path, encoding='utf-8') as f:
        model_json = json.load(f)
 def test_h2o_mojo_predictions(self):
     """Score the prostate CSV with the stored binomial MOJO.

     Nothing is asserted on the values; the test passes when loading the
     MOJO and collecting its predictions complete without raising.
     """
     # Try loading the Mojo and prediction on it without starting H2O Context
     mojo_uri = "file://" + os.path.abspath(
         "../ml/src/test/resources/binom_model_prostate.mojo")
     mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

     csv_uri = "file://" + unit_test_utils.locate(
         "smalldata/prostate/prostate.csv")
     prostate_frame = self._spark.read.csv(csv_uri, header=True)

     predictions = mojo.predict(prostate_frame)
     predictions.repartition(1).collect()
Esempio n. 7
0
def testMojoUnsupervised(spark):
    """Score a single one-column row with the stored isolation forest MOJO.

    Nothing is asserted on the values; the test passes when loading the MOJO
    and collecting the transformed frame complete without raising.

    :param spark: object providing ``sparkContext`` and ``createDataFrame``
        (presumably the SparkSession fixture).
    """
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/isolation_forest.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    rowForScoring = Row("V1")
    scoringRows = spark.sparkContext.parallelize([(5.1, )]).map(
        lambda values: rowForScoring(*values))
    scoringFrame = spark.createDataFrame(scoringRows)

    mojo.transform(scoringFrame).repartition(1).collect()
Esempio n. 8
0
    def test_h2o_mojo_unsupervised(self):
        """Score a single one-column row with the stored isolation forest MOJO.

        Nothing is asserted on the values; the test passes when loading the
        MOJO and collecting its predictions complete without raising.
        """
        mojo_uri = "file://" + os.path.abspath(
            "../ml/src/test/resources/isolation_forest.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

        row_for_scoring = Row("V1")
        scoring_rows = self._spark.sparkContext.parallelize([(5.1, )]).map(
            lambda values: row_for_scoring(*values))
        scoring_frame = self._spark.createDataFrame(scoring_rows)

        mojo.predict(scoring_frame).repartition(1).collect()
 def test_h2o_mojo_predictions_unseen_categoricals(self):
     """Score a row whose ``class`` value is ``"Missing_categorical"``.

     With ``setConvertUnknownCategoricalLevelsToNa(True)`` the MOJO must
     return the input columns unchanged and the expected value in
     ``prediction_output[0]``.
     """
     mojo = H2OMOJOModel.create_from_mojo("../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
     mojo.setConvertUnknownCategoricalLevelsToNa(True)

     record = {
         'sepal_len': 5.1,
         'sepal_wid': 3.5,
         'petal_len': 1.4,
         'petal_wid': 0.2,
         'class': 'Missing_categorical',
     }
     frame = self._spark.createDataFrame([record])
     scored = mojo.transform(frame).collect()[0]

     # Input columns must pass through untouched.
     for column in ("class", "petal_len", "petal_wid", "sepal_len", "sepal_wid"):
         assert scored[column] == record[column]
     assert scored["prediction_output"][0] == 5.240174068202646
Esempio n. 10
0
 def h2o_model_load(self, path):
     """
     Load an H2O MOJO model.

     :param path: base path; combined with ``self.model_key`` by
         ``self.concat_path`` to locate the model directory
     :return: the ``H2OMOJOModel`` created from ``<full path>/mojo_model``
         with the detailed prediction column disabled
     """
     model_dir = self.concat_path(path, self.model_key)
     from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
     return H2OMOJOModel.createFromMojo(
         model_dir + "/mojo_model",
         H2OMOJOSettings(withDetailedPredictionCol=False))
Esempio n. 11
0
def testMojoPredictions(prostateDataset,
                        prostateDatasetWithBinomialPrediction):
    """Compare MOJO predictions on the prostate data with the expected frame.

    The two class probabilities are pulled out of ``detailed_prediction``
    into the flat columns ``prob0`` and ``prob1`` before the struct column is
    dropped and the result is compared.

    :param prostateDataset: Spark DataFrame to score.
    :param prostateDatasetWithBinomialPrediction: expected scored DataFrame.
    """
    # Try loading the Mojo and prediction on it without starting H2O Context
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    scored = mojo.transform(prostateDataset).repartition(1)
    withProb0 = scored.withColumn(
        "prob0", col("detailed_prediction.probabilities.0"))
    withProb1 = withProb0.withColumn(
        "prob1", col("detailed_prediction.probabilities.1"))
    result = withProb1.drop("detailed_prediction")

    unit_test_utils.assert_data_frames_are_identical(
        result, prostateDatasetWithBinomialPrediction)
Esempio n. 12
0
def load_mojo_model(local_dir, filename, extension=""):
    """
    Load a saved H2OMOJOModel (usable with Spark without a running H2OSparkling session).

    :param string local_dir: Local directory where the model is saved
    :param string filename: Filename with which the model is saved
    :param string extension: Extension appended to the filename
    :return: the model returned by ``H2OMOJOModel.create_from_mojo`` for
        ``<local_dir>/<filename><extension>``
    """

    from pysparkling.ml import H2OMOJOModel

    mojo_path = "/".join((local_dir, filename + extension))
    return H2OMOJOModel.create_from_mojo(mojo_path)
Esempio n. 13
0
def testLoadAndTrainMojo(prostateDataset):
    """Check that a freshly fitted deep learning model matches the stored MOJO.

    An ``H2ODeepLearning`` estimator (seed 42, reproducible, label
    ``CAPSULE``) is fitted on ``prostateDataset``; its row-by-row output must
    equal that of ``deep_learning_prostate.mojo`` on the same data.

    :param prostateDataset: Spark DataFrame used for both training and scoring.
    """
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    estimator = H2ODeepLearning(seed=42, reproducible=True, labelCol="CAPSULE")
    fitted = estimator.fit(prostateDataset)

    mojoRows = mojo.transform(prostateDataset).repartition(1).collect()
    modelRows = fitted.transform(prostateDataset).repartition(1).collect()

    assert len(mojoRows) == len(modelRows)
    for mojoRow, modelRow in zip(mojoRows, modelRows):
        assert mojoRow == modelRow
Esempio n. 14
0
def testLoadAndTrainMojo(prostateDataset):
    """Check that a freshly fitted GBM matches the stored binomial MOJO.

    An ``H2OGBM`` estimator (2 trees, seed 42, bernoulli, label ``capsule``)
    is fitted on ``prostateDataset``; its row-by-row output must equal that
    of ``binom_model_prostate.mojo`` on the same data.

    :param prostateDataset: Spark DataFrame used for both training and scoring.
    """
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    estimator = H2OGBM(
        ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    fitted = estimator.fit(prostateDataset)

    mojoRows = mojo.transform(prostateDataset).repartition(1).collect()
    modelRows = fitted.transform(prostateDataset).repartition(1).collect()

    assert len(mojoRows) == len(modelRows)
    for mojoRow, modelRow in zip(mojoRows, modelRows):
        assert mojoRow == modelRow
Esempio n. 15
0
    def test_load_mojo_deeplearning(self):
        """Check that a freshly fitted deep learning model matches the stored MOJO.

        The prostate CSV is uploaded to H2O and converted to a Spark frame.
        An ``H2ODeepLearning`` estimator fitted on it must produce, row by
        row, the same output as ``deep_learning_prostate.mojo``.
        """
        from pysparkling.ml import H2OMOJOModel, H2ODeepLearning

        mojo_uri = "file://" + os.path.abspath(
            "../ml/src/test/resources/deep_learning_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

        uploaded = h2o.upload_file(
            unit_test_utils.locate("smalldata/prostate/prostate.csv"))
        prostate_frame = self._hc.as_spark_frame(uploaded)

        estimator = H2ODeepLearning(
            seed=42, reproducible=True, predictionCol="CAPSULE")
        fitted = estimator.fit(prostate_frame)

        mojo_rows = mojo.predict(prostate_frame).repartition(1).collect()
        model_rows = fitted.transform(prostate_frame).repartition(1).collect()

        assert len(mojo_rows) == len(model_rows)
        for mojo_row, model_row in zip(mojo_rows, model_rows):
            assert mojo_row == model_row
Esempio n. 16
0
    def test_load_mojo_gbm(self):
        """Check that a freshly fitted GBM matches the stored binomial MOJO.

        The prostate CSV is uploaded to H2O and converted to a Spark frame.
        An ``H2OGBM`` estimator (2 trees, seed 42, bernoulli) fitted on it
        must produce, row by row, the same output as
        ``binom_model_prostate.mojo``.
        """
        from pysparkling.ml import H2OMOJOModel, H2OGBM

        mojo_uri = "file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_uri)

        uploaded = h2o.upload_file(
            unit_test_utils.locate("smalldata/prostate/prostate.csv"))
        prostate_frame = self._hc.as_spark_frame(uploaded)

        estimator = H2OGBM(
            ntrees=2, seed=42, distribution="bernoulli",
            predictionCol="capsule")
        fitted = estimator.fit(prostate_frame)

        mojo_rows = mojo.predict(prostate_frame).repartition(1).collect()
        model_rows = fitted.transform(prostate_frame).repartition(1).collect()

        assert len(mojo_rows) == len(model_rows)
        for mojo_row, model_row in zip(mojo_rows, model_rows):
            assert mojo_row == model_row
Esempio n. 17
0
def testDomainColumns(prostateDataset):
    """Check the domain values reported by a GBM fitted on the prostate data.

    Only the label column ``capsule`` is expected to carry a domain
    (``["0", "1"]``); every other checked column must report ``None``.

    :param prostateDataset: Spark DataFrame used for training.
    """
    # The stored MOJO is loaded as in the original test although its result
    # is not used in the assertions below.
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    estimator = H2OGBM(
        ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    domainValues = estimator.fit(prostateDataset).getDomainValues()

    # (column, expected domain); None means the column has no domain.
    expectedDomains = [
        ("DPROS", None),
        ("DCAPS", None),
        ("VOL", None),
        ("AGE", None),
        ("PSA", None),
        ("capsule", ["0", "1"]),
        ("RACE", None),
        ("ID", None),
    ]
    for column, expected in expectedDomains:
        if expected is None:
            assert domainValues[column] is None
        else:
            assert domainValues[column] == expected
    def test_h2o_mojo_predictions_unseen_categoricals(self):
        """Score a row whose ``class`` value is ``"Missing_categorical"``.

        With ``setConvertUnknownCategoricalLevelsToNa(True)`` the MOJO must
        return the input columns unchanged and the expected value in
        ``prediction_output[0]``.
        """
        mojo_uri = "file://" + os.path.abspath(
            "../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
        mojo = H2OMOJOModel.create_from_mojo(mojo_uri)
        mojo.setConvertUnknownCategoricalLevelsToNa(True)

        columns = ("sepal_len", "sepal_wid", "petal_len", "petal_wid", "class")
        values = (5.1, 3.5, 1.4, 0.2, "Missing_categorical")
        row_for_scoring = Row(*columns)

        scoring_rows = self._spark.sparkContext.parallelize([values]).map(
            lambda r: row_for_scoring(*r))
        frame = self._spark.createDataFrame(scoring_rows)
        scored = mojo.transform(frame).collect()[0]

        # Input columns must pass through untouched.
        expected = dict(zip(columns, values))
        for column in ("class", "petal_len", "petal_wid", "sepal_len",
                       "sepal_wid"):
            assert scored[column] == expected[column]
        assert scored["prediction_output"][0] == 5.240174068202646
Esempio n. 19
0
def testMojoModelSerializationInPipeline(prostateDataset):
    """Save and reload a MOJO pipeline, then do the same for its fitted model.

    The unfitted ``Pipeline`` wrapping the prostate MOJO is written to
    ``build/test_spark_pipeline_model_mojo`` and loaded back.  The loaded
    pipeline is fitted on ``prostateDataset`` and the resulting model goes
    through the same write/load cycle under
    ``build/test_spark_pipeline_model_mojo_model``.  Nothing is asserted; the
    test passes when none of these steps raises.

    :param prostateDataset: Spark DataFrame the loaded pipeline is fitted on.
    """
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    # Round trip of the unfitted pipeline.
    pipelineUri = "file://" + os.path.abspath(
        "build/test_spark_pipeline_model_mojo")
    Pipeline(stages=[mojo]).write().overwrite().save(pipelineUri)
    restoredPipeline = Pipeline.load(pipelineUri)

    # Round trip of the fitted pipeline model.
    fitted = restoredPipeline.fit(prostateDataset)
    fittedUri = "file://" + os.path.abspath(
        "build/test_spark_pipeline_model_mojo_model")
    fitted.write().overwrite().save(fittedUri)
    PipelineModel.load(fittedUri)
Esempio n. 20
0
def testMojoPredictionsUnseenCategoricals(spark):
    """Score a row whose ``class`` value is ``"Missing_categorical"``.

    The MOJO is loaded with ``convertUnknownCategoricalLevelsToNa=True``; it
    must return the input columns unchanged and the expected value in
    ``prediction``.

    :param spark: object providing ``sparkContext`` and ``createDataFrame``
        (presumably the SparkSession fixture).
    """
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
    mojo = H2OMOJOModel.createFromMojo(
        mojoUri, H2OMOJOSettings(convertUnknownCategoricalLevelsToNa=True))

    columns = ("sepal_len", "sepal_wid", "petal_len", "petal_wid", "class")
    values = (5.1, 3.5, 1.4, 0.2, "Missing_categorical")
    rowForScoring = Row(*columns)

    scoringRows = spark.sparkContext.parallelize([values]).map(
        lambda r: rowForScoring(*r))
    scored = mojo.transform(spark.createDataFrame(scoringRows)).collect()[0]

    # Input columns must pass through untouched.
    expected = dict(zip(columns, values))
    for column in ("class", "petal_len", "petal_wid", "sepal_len",
                   "sepal_wid"):
        assert scored[column] == expected[column]
    assert scored["prediction"] == 5.240174068202646
Esempio n. 21
0
def testLoadGBMModelAsMOJOModel(savedGbmModel):
    """Load a saved GBM through ``H2OMOJOModel`` and check it reports trees.

    :param savedGbmModel: value passed straight to
        ``H2OMOJOModel.createFromMojo`` (presumably the path of a saved GBM
        MOJO supplied by a fixture).
    """
    treeCount = H2OMOJOModel.createFromMojo(savedGbmModel).getNtrees()
    assert treeCount > 0
Esempio n. 22
0
def init():
    """Download the model archive and load it in whichever format it uses.

    Formats are tried in this order; ``model_tag`` records which one won:

    1. PMML: a ``*.pmml`` file directly under ``local_model_path``, or a
       ``part*`` file under ``local_model_path/model``.  Fills ``pmmlModel``
       and ``pmmlFields``.
    2. Spark ``PipelineModel``.  Fills ``model``.
    3. H2O MOJO at ``local_model_path + '/mojo_model'``.  Fills ``model``.
    4. XGBoost via ``load_xgb_model``.  Fills ``model`` and
       ``pipeline_model``.

    Only when the PMML path fails (formats 2-4) is the transform JSON located
    by ``get_jsonfile_fullname()`` parsed into ``model_json``.
    """
    global model_tag, pmmlFields, pmmlModel
    global model, pipeline_model
    global final_transform_json_path, model_json

    # Download the model.
    download_model.download_model(download_model_zip_path, unzip_path)
    try:
        # If a pmml file exists under the model path, load the PMML model
        # directly.  The pmml archive layout is model/xxx.pmml.
        full_path = None
        model_path_childs = os.listdir(local_model_path)
        logging.info(f'模型文件夹下的文件有:{model_path_childs}')
        for child in model_path_childs:
            if child.endswith(".pmml"):
                full_path = os.path.join(local_model_path, child)
                break
            # Alternatively the PMML may be stored as model/model/part-00000.
            elif child == "model":
                part_dir = os.path.join(local_model_path, "model")
                for part_name in os.listdir(part_dir):
                    if part_name.startswith("part"):
                        full_path = local_model_path + "/model/" + part_name
                        break

        if full_path is None:
            # Previously this case surfaced as an accidental
            # UnboundLocalError; make the "no PMML here" signal explicit.
            raise FileNotFoundError(
                "no PMML file found under %s" % local_model_path)

        logging.info(f'获取到的模型路径是:{full_path}')
        print("模型大小是:", os.path.getsize(full_path))
        pmmlModel = loadPmml.fromFile(full_path)
        pmmlFields = parse_xml(full_path)
        logging.info('成功加载pmml模型')
        model_tag = 1
    except Exception as pmml_error:
        # Only ordinary errors trigger the fallbacks; KeyboardInterrupt and
        # SystemExit are no longer swallowed, and the reason is logged.
        logging.info("从pmml模型的加载处理中跳出")
        logging.warning("PMML load failed: %s", pmml_error)
        # Resolve the model path.
        get_model_path(local_model_path)
        # Load the model.
        try:
            logging.info("尝试加载PipelineModel")
            model = PipelineModel.load(local_model_path)  # load the model
            model_tag = 2
        except Exception as pipeline_error:
            logging.warning("PipelineModel load failed: %s", pipeline_error)
            try:
                # H2O models must take this path.
                from pysparkling.ml import H2OMOJOSettings, H2OMOJOModel
                logging.info("从加载PipelineModel的try中跳出")
                print("在except的try中尝试加载H2OMOJOModel")
                settings = H2OMOJOSettings(withDetailedPredictionCol=True)
                model = H2OMOJOModel.createFromMojo(
                    local_model_path + '/mojo_model', settings)
                model_tag = 3
            except Exception as mojo_error:
                logging.warning("H2OMOJOModel load failed: %s", mojo_error)
                print("从加载H2OMOJOModel的try中跳出")
                print("尝试加载XGBModel")
                model = load_xgb_model(
                    local_model_path, m_type='XGBoostClassificationModel')
                if not model:
                    logging.error('XGBoostClassificationModel没有加载成功')
                pipeline_model = load_xgb_model(
                    local_model_path, "PipelineModel")
                if not pipeline_model:
                    logging.error('XGB需要的pipelinemodel没有加载成功')
                    logging.error(pipeline_model)

                model_tag = 4

        final_transform_json_path = get_jsonfile_fullname()

        # Read the JSON stored alongside the model into model_json.
        with open(final_transform_json_path, encoding='utf-8') as f:
            model_json = json.load(f)
Esempio n. 23
0
def testMojoPredictions(prostateDataset):
    """Score ``prostateDataset`` with the stored binomial MOJO.

    Nothing is asserted on the values; the test passes when loading the MOJO
    and collecting the transformed frame complete without raising.

    :param prostateDataset: Spark DataFrame to score.
    """
    # Try loading the Mojo and prediction on it without starting H2O Context
    mojoUri = "file://" + os.path.abspath(
        "../ml/src/test/resources/binom_model_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo(mojoUri)

    scored = mojo.transform(prostateDataset)
    scored.repartition(1).collect()