Example #1
    def featureExtractLr(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            # pipeline = PipelineModel.load(ROOT_PATH+'/logistic')
            pipeline = Pipeline.load(ROOT_PATH + '/logistic')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.001).setElasticNetParam(0.8)
            pipeline = Pipeline(stages=[remover, hashingTF, lr])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
        # model.write().overwrite().save(ROOT_PATH+'/logistic')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("id", "features", "probability",
                                          "prediction")

        for row in selected.collect():
            rid, features, prob, prediction = row
            self.logger.info("features: %s", features)
            self.logger.info("prob: %s", str(prob))
            self.logger.info("prediction: %s", str(prediction))
Example #2
def testPipelineWithTargetEncoderIsSerializable():
    targetEncoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123)
    gbm = H2OGBM() \
        .setLabelCol("CAPSULE") \
        .setFeaturesCols(targetEncoder.getOutputCols())
    pipeline = Pipeline(stages=[targetEncoder, gbm])
    path = "file://" + os.path.abspath(
        "build/testPipelineWithTargetEncoderIsSerializable")
    pipeline.write().overwrite().save(path)
    loadedPipeline = Pipeline.load(path)
    [loadedTargetEncoder, loadedGbm] = loadedPipeline.getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder,
                                                  loadedTargetEncoder)
    assert gbm.getLabelCol() == loadedGbm.getLabelCol()
    assert gbm.getFeaturesCols() == loadedGbm.getFeaturesCols()
Example #3
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #4
    def runTest(self):
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")
        tokenizer = RegexTokenizer() \
            .setOutputCol("token")
        lemmatizer = Lemmatizer() \
            .setInputCols(["token"]) \
            .setOutputCol("lemma") \
            .setDictionary({"sad": "unsad"})
        finisher = Finisher() \
            .setInputCols(["token", "lemma"]) \
            .setOutputCols(["token_views", "lemma_views"])
        pipeline = Pipeline(
            stages=[document_assembler, tokenizer, lemmatizer, finisher])
        model = pipeline.fit(self.data)
        token_before_save = model.transform(self.data).select(
            "token_views").take(1)[0].token_views.split("@")[2]
        lemma_before_save = model.transform(self.data).select(
            "lemma_views").take(1)[0].lemma_views.split("@")[2]
        pipe_path = "./tmp_pipeline"
        pipeline.write().overwrite().save(pipe_path)
        loaded_pipeline = Pipeline.read().load(pipe_path)
        token_after_save = model.transform(self.data).select(
            "token_views").take(1)[0].token_views.split("@")[2]
        lemma_after_save = model.transform(self.data).select(
            "lemma_views").take(1)[0].lemma_views.split("@")[2]
        print(token_before_save)
        assert token_before_save == "sad"
        assert lemma_before_save == "unsad"
        assert token_after_save == token_before_save
        assert lemma_after_save == lemma_before_save
        loaded_pipeline.fit(self.data).transform(self.data).show()
Example #5
    def featureExtract(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
            # lr = LogisticRegression(maxIter=10, regParam=0.001)
            pipeline = Pipeline(stages=[remover, hashingTF, idf])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("filtered", "features", "idff")

        for row in selected.collect():
            filtered, features, idff = row
            self.logger.info("features: %s", features)
            self.logger.info("idff: %s", idff)
            self.logger.info("filtered: %s", filtered)
        return selected
    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(predictionCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM())

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage,
                       data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])

    ## Create the pipeline by defining all the stages
    pipeline = Pipeline(stages=[
        tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner
    ])

    ## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
    ## to a local file in the current directory. If HDFS & Hadoop are available, the pipeline is stored in the HDFS home
    ## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    pipeline.write().overwrite().save(pipelinePath)
    loaded_pipeline = Pipeline.load(pipelinePath)

    ## Train the pipeline model
    modelPath = "file://" + os.path.abspath("../build/model")
    model = loaded_pipeline.fit(data)
    model.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
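For context, a hedged usage sketch of the helper above; the column names, the `spark` session, the input `data` frame, and the choice of H2OGBM as `algoStage` are illustrative assumptions, not code from the original script:

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pysparkling.ml import H2OGBM  # assumed Sparkling Water import

# Hypothetical feature stages feeding trainPipelineModel()
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="tf")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf")
algoStage = H2OGBM(labelCol="label", featuresCols=[idf.getOutputCol()])

data = spark.read.csv("data/messages.csv", header=True, inferSchema=True)  # placeholder input
model = trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data)
model.transform(data).show()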
    def test_glm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      ratio=0.8)

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline"))
        loaded_pipeline = Pipeline.load("file://" +
                                        os.path.abspath("build/glm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/glm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/glm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
    def test_grid_gbm_in_spark_pipeline(self):
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        algo = H2OGridSearch(labelCol="AGE",
                             hyperParameters={"_seed": [1, 2, 3]},
                             ratio=0.8,
                             algo=H2OGBM(),
                             strategy="RandomDiscrete",
                             maxModels=3,
                             maxRuntimeSecs=60,
                             selectBestModelBy="RMSE")

        pipeline = Pipeline(stages=[algo])
        pipeline.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline"))
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))

        loaded_model.transform(prostate_frame).count()
Example #10
    def test_mojo_dai_pipeline_serialize(self):
        mojo = H2OMOJOPipelineModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/mojo2data/pipeline.mojo"))
        prostateFrame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        # Create Spark pipeline of single step - mojo pipeline
        pipeline = Pipeline(stages=[mojo])
        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
        loadedPipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

        # Train the pipeline model
        model = loadedPipeline.fit(prostateFrame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
        loadedModel = PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

        preds = loadedModel.transform(prostateFrame).repartition(1).select(
            mojo.selectPredictionUDF("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829
Example #11
    def test_h2o_mojo_model_serialization_in_pipeline(self):
        mojo = H2OMOJOModel.createFromMojo("file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)

        pipeline = Pipeline(stages=[mojo])

        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))
        loaded_pipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo"))

        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
        PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
def process(spark, train_data, test_data):
    df_train = spark.read.parquet(train_data)
    df_test = spark.read.parquet(test_data)

    features = VectorAssembler(inputCols=df_train.columns[1:-1],
                               outputCol='features')
    evaluator = RegressionEvaluator(labelCol='ctr',
                                    predictionCol='prediction',
                                    metricName='rmse')
    lr_model_base = LinearRegression(labelCol='ctr', **LR_PARAMS_BASE)
    lr_model_to_tune = LinearRegression(labelCol='ctr')

    lr_param_grid = ParamGridBuilder() \
        .addGrid(lr_model_to_tune.maxIter, [5, 10, 20, 40, 50]) \
        .addGrid(lr_model_to_tune.regParam, [0.4, 0.1, 0.01, 0.001]) \
        .addGrid(lr_model_to_tune.fitIntercept, [False, True]) \
        .addGrid(lr_model_to_tune.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
        .build()

    tvs = TrainValidationSplit(estimator=lr_model_to_tune,
                               estimatorParamMaps=lr_param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)

    pipeline_model_base = Pipeline(
        stages=[features, lr_model_base]).fit(df_train)
    prediction_base = pipeline_model_base.transform(df_test)
    rmse_base = evaluator.evaluate(prediction_base)
    print(f'Base lr model params: {LR_PARAMS_BASE}')
    print(f'RMSE at base lr model = {rmse_base}')

    print('Tuning lr model...')
    pipeline_model_tuned = Pipeline(stages=[features, tvs]).fit(df_train)
    prediction_tuned = pipeline_model_tuned.transform(df_test)
    rmse_tuned = evaluator.evaluate(prediction_tuned)

    model_java_obj = pipeline_model_tuned.stages[-1].bestModel._java_obj
    lr_params_tuned = {
        'maxIter': model_java_obj.getMaxIter(),
        'regParam': model_java_obj.getRegParam(),
        'elasticNetParam': model_java_obj.getElasticNetParam(),
        'fitIntercept': model_java_obj.getFitIntercept()
    }

    print(f'Tuned lr model params: {lr_params_tuned}')
    print(f'RMSE at tuned lr model = {rmse_tuned}')

    if rmse_tuned < rmse_base:
        pipeline_model_tuned.write().overwrite().save(MODEL_PATH)
        print(f'Tuned model has better RMSE value')
    else:
        pipeline_model_base.write().overwrite().save(MODEL_PATH)
        print(f'Base model has better RMSE value')
    print(f'Model saved at "{MODEL_PATH}"')

    spark.stop()
Example #13
class KMeans:
    def __init__(self, args, args2):
        """
        Standalone version for initializing KMeans clustering
        @param args: dict
        K: int
        init: string one of "k-means++" and "random"
        n_init: int
        max_iter: int
        tol: float
        """
        # init logging
        self.logger = logging.getLogger(self.__class__.__name__)

        # init parameters
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.logger.info("initializing SparkSession")
        # init SparkSession
        self.spark = utils.init_spark()

    def getIn(self):
        return

    def execute(self):
        from pyspark.ml.clustering import KMeans
        from pyspark.ml import Pipeline

        # number of cluster centers
        k = int(self.param["K"])
        # initialization method
        init = self.param["init"]
        if init in ("k-means", "k-means++"):
            # Spark's KMeans calls the parallel k-means++ initialization "k-means||"
            init = "k-means||"
        # number of runs (passed to initSteps below)
        n_init = int(self.param["n_init"])
        # maximum number of iterations per run
        max_iter = int(self.param["max_iter"])
        # convergence tolerance (float)
        tol = float(self.param["tol"])

        # build the model as a single-stage Pipeline so it can be loaded through the unified interface
        self.logger.info("initializing model")
        self.model = Pipeline(stages=[
            KMeans(k=k, initMode=init, initSteps=n_init, maxIter=max_iter, tol=tol)
        ])

    def setOut(self):
        self.logger.info("saving model to %s" % self.outputUrl1)
        self.model.write().overwrite().save(self.outputUrl1)
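For orientation, a minimal driver sketch for the wrapper above; the `args` layout, the output URL, and the parameter values are assumptions inferred from the constructor and `execute()`, not part of the original example:

# Hypothetical driver code for the KMeans wrapper above (all values are illustrative).
args = {
    "output": [{"value": "hdfs:///models/kmeans_pipeline"}],
    "param": {"K": "3", "init": "k-means++", "n_init": "5",
              "max_iter": "20", "tol": "0.0001"},
}
job = KMeans(args, args2=None)  # args2 is not used by this wrapper
job.getIn()
job.execute()                   # builds the unfitted single-stage Pipeline
job.setOut()                    # saves it to args["output"][0]["value"]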
Example #14
class RandomForest:
    def __init__(self, args, args2):
        """
        Spark version for initializing RandomForest multi-class classifier
        @param args: dict
        n_estimators: int
        criterion: string one of "gini" and "entropy"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.dataUtil = utils.dataUtil(args2)

        self.logger.info("initializing SparkSession")

        self.spark = utils.init_spark()

    def getIn(self):
        return

    def execute(self):
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml import Pipeline

        # number of trees
        n_estimators = int(self.param["treeNum"])
        # split criterion ("gini" or "entropy")
        criterion = self.param["criterion"]
        # maximum tree depth
        max_depth = int(self.param["maxDepth"])
        # minimum samples required to split a node
        # (read for completeness; not used below, as Spark has no direct equivalent)
        min_samples_split = int(self.param["minSamplesSplit"])
        # minimum samples per leaf (mapped to minInstancesPerNode)
        min_samples_leaf = int(self.param["minSamplesLeaf"])

        # build the model as a single-stage Pipeline so it can be loaded through the unified interface
        self.logger.info("initializing model")
        self.model = Pipeline(stages=[
            RandomForestClassifier(numTrees=n_estimators,
                                   impurity=criterion,
                                   maxDepth=max_depth,
                                   minInstancesPerNode=min_samples_leaf)
        ])

    def setOut(self):
        self.logger.info("Writing model to %s" % self.outputUrl1)
        self.model.write().overwrite().save(self.outputUrl1)
Example #15
def gridSearchTester(algo, prostateDataset):
    grid = H2OGridSearch(labelCol="AGE", hyperParameters={"seed": [1, 2, 3]}, splitRatio=0.8, algo=algo,
                         strategy="RandomDiscrete", maxModels=3, maxRuntimeSecs=60, selectBestModelBy="RMSE")

    pipeline = Pipeline(stages=[grid])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/grid_pipeline"))
    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/grid_pipeline_model"))

    loadedModel.transform(prostateDataset).count()
Example #16
def testPipelineSerialization(prostateDataset):
    algo = H2OIsolationForest(seed=1)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    expected = model.transform(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    result = loadedModel.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #17
def testPipelineSerialization(prostateDataset):
    algo = H2ODRF(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/drf_pipeline"))
    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/drf_pipeline_model"))

    loadedModel.transform(prostateDataset).count()
def create_random_pipeline():
    print("Creating Data pipeline for regressor")
    assembler = VectorAssembler(inputCols=[
        "Year", "Engine HP", "Engine Cylinders", "Number of Doors",
        "highway MPG", "city mpg", "Popularity"
    ],
                                outputCol="Attributes")

    regressor = RandomForestRegressor(featuresCol="Attributes",
                                      labelCol="MSRP")
    pipeline = Pipeline(stages=[assembler, regressor])

    pipelineStr = "pipeline"
    pipeline.write().overwrite().save(pipelineStr)  #  Save pipeline
    return (pipelineStr, regressor)
Example #19
def train_model(dataFrame, k_value, w2v_value, seed=2137):
    """Train and save model"""

    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    word2Vec = Word2Vec(vectorSize=w2v_value,
                        seed=seed,
                        inputCol="words",
                        outputCol="features_unnormalized")
    scaler = StandardScaler(inputCol="features_unnormalized",
                            outputCol="features",
                            withStd=True,
                            withMean=True)
    kmeans = KMeans(k=k_value, seed=seed)
    pipeline = Pipeline(stages=[tokenizer, remover, word2Vec, scaler, kmeans])
    pipeline = pipeline.fit(dataFrame)
    pipeline.write().overwrite().save("hdfs:///models/model")

    return pipeline
def main(argv):
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

    features_df = ParquetDataFrame(
        f'data/processed/{Phase.train.name}/features', spark)
    test_data_frac = 0.1
    test_features_df, train_features_df = features_df.randomSplit(
        [test_data_frac, 1 - test_data_frac])
    label_col = 'duration_min'
    model = Pipeline(stages=[
        StringIndexer(inputCol='pickup_cell_6',
                      handleInvalid='keep',
                      outputCol='pickup_cell_6_idx'),
        StringIndexer(inputCol='dropoff_cell_6',
                      handleInvalid='keep',
                      outputCol='dropoff_cell_6_idx'),
        VectorAssembler(inputCols=[
            'pickup_cell_6_idx', 'dropoff_cell_6_idx', 'distance', 'month',
            'day_of_month', 'day_of_week', 'hour', 'requests_pickup_cell',
            'requests_dropoff_cell'
        ],
                        outputCol="features"),
        DecisionTreeRegressor(
            maxDepth=7, featuresCol='features', labelCol=label_col)
    ]).fit(train_features_df)

    model_path = 'model/trip_duration_min'
    print(f'Saving model to {model_path}')
    model.write().overwrite().save(model_path)
    print(f'Model saved...')

    model = PipelineModel.load(model_path)
    predictions_df = model.transform(test_features_df)
    mae_cv = RegressionEvaluator(labelCol=label_col,
                                 metricName='mae').evaluate(predictions_df)
    print(f'Mean absolute error: {mae_cv}')

    spark.stop()
Example #21
def testPipelineSerialization(dataset):
    algo = H2OKMeans(
        splitRatio=0.8,
        seed=1,
        k=3,
        featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/kmeans_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/kmeans_pipeline"))
    model = loadedPipeline.fit(dataset)

    model.write().overwrite().save(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))

    loadedModel.transform(dataset).count()
Example #22
def testMojoModelSerializationInPipeline(prostateDataset):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" +
        os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))

    pipeline = Pipeline(stages=[mojo])

    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))

    model = loadedPipeline.fit(prostateDataset)

    model.write().overwrite().save(
        "file://" +
        os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" +
        os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
Example #23
def testPipelineSerialization(heartDataset):
    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event",
                    featuresCols=features,
                    startCol='start',
                    stopCol='stop')

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/cox_ph_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/cox_ph_pipeline"))
    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/cox_ph_pipeline"))
    loadedModel = PipelineModel.load("file://" +
                                     os.path.abspath("build/cox_ph_pipeline"))
    result = loadedModel.transform(heartDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #24
mlSourceDF.printSchema()
mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])
# after creating all lag features, drop rows with NA in the key columns
# to avoid errors in StringIndexer
mlSourceDF = mlSourceDF.na.drop(subset=["ServerIP", "SessionStartHourTime"])
# indexing
columnsForIndex = ['dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth', 'hourofday',
                   'Holiday', 'BusinessHour', 'Morning']

mlSourceDF = mlSourceDF.fillna(0, subset=columnsForIndex)

sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed').setHandleInvalid("skip") for x in columnsForIndex]
indexModel = Pipeline(stages=sIndexers).fit(mlSourceDF)
mlSourceDF = indexModel.transform(mlSourceDF)
# save model for operationalization
indexModel.write().overwrite().save(stringIndexModelFile)

# encoding for categorical features
catVarNames = [x + '_indexed' for x in columnsForIndex]

columnOnlyIndexed = [catVarNames[i] for i in range(len(catVarNames)) if len(indexModel.stages[i].labels) < 2]
columnForEncode = [catVarNames[i] for i in range(len(catVarNames)) if len(indexModel.stages[i].labels) >= 2]

info['columnOnlyIndexed'] = columnOnlyIndexed
info['columnForEncode'] = columnForEncode

# save info to blob storage
write_blob(info, infoFile, storageContainer, storageAccount, storageKey)

ohEncoders = [OneHotEncoder(inputCol=x, outputCol=x + '_encoded')
              for x in columnForEncode]
# Print out the predicted Play Type, Actual Play Type, and the vector of indexed features
predictions.select("predictedLabel", "play_type", "indexedFeatures").show(5)

# Determine the accuracy of the model
# Can specify other evaluation metrics
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

# Calculate the test error
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)

# COMMAND ----------

predictions.select("indexedLabel","prediction","predictedLabel").show(5)

# COMMAND ----------

rfPipeline.write().overwrite().save("nfl-data/pipelines")

# COMMAND ----------

rfModel.write().overwrite().save("/nfl-data/models")

# COMMAND ----------

# MAGIC %fs
# MAGIC ls /nfl-data/pipelines/stages
## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. If HDFS & Hadoop are available, the pipeline is stored in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")


##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
Example #27
pipeline = Pipeline(stages=[stringIndexer, vecAssembler])

# COMMAND ----------

# MAGIC %md
# MAGIC ## Scala
# MAGIC
# MAGIC Distributed XGBoost with Spark only has a Scala API, so we are going to create views of our DataFrames to use in Scala, as well as save our (untrained) pipeline to load into Scala.

# COMMAND ----------

trainDF.createOrReplaceTempView("trainDF")
testDF.createOrReplaceTempView("testDF")

fileName = "/tmp/xgboost_feature_pipeline"
pipeline.write().overwrite().save(fileName)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data/Pipeline in Scala
# MAGIC
# MAGIC This section is only available in Scala because there is no distributed Python API for XGBoost in Spark yet.
# MAGIC
# MAGIC Let's load in our data/pipeline that we defined in Python.

# COMMAND ----------

# MAGIC %scala
# MAGIC import org.apache.spark.ml.Pipeline
# MAGIC
Example #28
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    continuousDataFrame = spark.createDataFrame([(4.0,)], ["feature"])
    binarizer = Binarizer(threshold=5.0, inputCol="feature", outputCol="binarized_feature")
    pipeline = Pipeline(stages=[binarizer])
    pipeline = pipeline.fit(continuousDataFrame)
    pipeline.write().overwrite().save("binarizer")
Example #29
test_model.take(1)

# In[13]:

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')

print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

# In[14]:

pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)

# In[15]:

loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)

# In[16]:

from pyspark.ml import PipelineModel

modelPath = './model/infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)

loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
                       predictionCol="label")
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],
                           predictionCol="label")
## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. If HDFS & Hadoop are available, the pipeline is stored in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")




##
## Make predictions on unlabeled data
## Spam detector
Example #31
assembler = VectorAssembler(inputCols=selected, outputCol="features")

# specify the model:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(featuresCol="features",
                                    labelCol="five_star_rating")

# specify the pipeline:
from pyspark.ml import Pipeline
stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)

# ## Save and load the machine learning pipeline

# save the `Pipeline` instance to HDFS:
pipeline.write().overwrite().save("models/pipeline")

# if we don't want to overwrite it:
#```python
#pipeline.save("models/pipeline")
#```

# read the pipeline from HDFS:
pipeline_loaded = Pipeline.read().load("models/pipeline")

# this other method can also be used:
#```python
#pipeline_loaded = Pipeline.load("models/pipeline")
#```

# ## train the model