def build_model(training):
    #training = read_data()
    training.cache()

    columns = training.columns
    columns.remove("Occupancy")

    assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")

    pipeline = Pipeline(stages=[assembler, lr])

    param_grid = ParamGridBuilder() \
      .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
      .build()

    evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")

    validator = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     trainRatio=0.9)

    validator_model = validator.fit(training)
    return validator_model.bestModel
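This snippet omits its imports and the commented-out read_data() helper; a minimal sketch of the pyspark.ml imports it appears to assume:
# Imports assumed by build_model(); the SparkSession / read_data() setup is not shown in the excerpt.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit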
Example #2
def trainNaiveBayesModel(data, directory=""):
    tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
    remover = StopWordsRemover().setInputCol("words").setOutputCol(
        "filtered").setCaseSensitive(False)
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(0)
    nb = NaiveBayes(labelCol="label", featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

    paramGrid = ParamGridBuilder()\
      .addGrid(hashingTF.numFeatures,[200, 500, 1000, 5000]) \
      .addGrid(nb.smoothing, [0.5, 1, 1.5, 2]) \
      .build()

    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator().setMetricName(
            'areaUnderPR'
        ),  # set area Under precision-recall curve as the evaluation metric
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    cvModel = crossval.fit(data)
    modelName = directory + "NaiveBayesModel"
    cvModel.bestModel.write().overwrite().save(modelName)

    return modelName
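The function returns only the path it saved to. A hedged follow-up, assuming the usual pyspark.ml API: the saved best model of a Pipeline estimator is a fitted PipelineModel, so it can be reloaded and applied to new comments.
from pyspark.ml import PipelineModel

model_path = trainNaiveBayesModel(data, directory="/tmp/models/")  # 'data' needs comment_text and label columns
loaded = PipelineModel.load(model_path)
scored = loaded.transform(data)  # adds rawPrediction / probability / prediction columns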
Example #3
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
Example #4
    def test_fit_maximize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
        validationMetrics = tvsModel.validationMetrics

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
        self.assertEqual(len(grid), len(validationMetrics),
                         "validationMetrics has the same size of grid parameter")
        self.assertEqual(1.0, max(validationMetrics))
def random_forest_tuning(train_samples):
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                cacheNodeIds=True)
    ru = RandomUnderSampler().setIndexCol('id')
    pipeline = Pipeline().setStages([ru, rf])
    paramGrid = \
        (ParamGridBuilder()
         .addGrid(rf.numTrees, [50, 75, 100])
         .addGrid(rf.featureSubsetStrategy, ['sqrt'])
         .addGrid(rf.impurity, ['gini', 'entropy'])
         .addGrid(rf.maxDepth, [5, 15, 30])
         .addGrid(rf.minInstancesPerNode, [1])
         .addGrid(rf.subsamplingRate, [1.0, 0.6, 0.4])
         .addGrid(ru.targetImbalanceRatio, [1.0, 1.5, 2.0])
         .build())
    pr_evaluator = \
        BinaryClassificationEvaluator(labelCol="label",
                                      rawPredictionCol="rawPrediction",
                                      metricName="areaUnderPR")
    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=pr_evaluator,
                               trainRatio=0.8,
                               collectSubModels=True)

    model = tvs.fit(train_samples)

    return model
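random_forest_tuning returns the whole TrainValidationSplitModel (RandomUnderSampler is apparently a custom or third-party stage, not part of pyspark.ml). A hedged sketch of inspecting the result using only standard TrainValidationSplitModel attributes; 'train_samples' is assumed to be a DataFrame with id/label/features columns:
tvs_model = random_forest_tuning(train_samples)

# Pair each parameter map with its validation metric (areaUnderPR here) and pick the best one.
results = sorted(
    zip(tvs_model.getEstimatorParamMaps(), tvs_model.validationMetrics),
    key=lambda pair: pair[1],
    reverse=True,
)
best_params, best_metric = results[0]
print(best_metric, {p.name: v for p, v in best_params.items()})

best_pipeline_model = tvs_model.bestModel  # fitted PipelineModel
sub_models = tvs_model.subModels           # available because collectSubModels=True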
Example #6
def dtRegression(df, conf):
    """
        input : df [spark.dataframe], conf [configuration params]
        output : decisiontree_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    impurity = conf["params"].get("impurity", "variance")

    maxDepth = conf["params"].get("maxDepth", 5)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    seed = conf["params"].get("seed", None)
    varianceCol = conf["params"].get("varianceCol", None)

    dt = DecisionTreeRegressor(maxDepth=maxDepth, featuresCol=featuresCol)
    # featureIndexer is assumed to be defined elsewhere (see the sketch below).
    pipeline = Pipeline(stages=[featureIndexer, dt])

    print("maxDepth : ", dt.getMaxDepth())

    # if ML tuning is used
    if conf["tuning"]:

        # cross-validation tuning
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)

        # train-validation-split tuning
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)

    # no ML tuning
    elif conf["tuning"] is None:
        model = pipeline.fit(df)

    return model
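The pipeline above references a featureIndexer that the snippet never defines; a minimal sketch of what it is presumably meant to be (a VectorIndexer, as in the stock Spark decision-tree examples). The column names and maxCategories value are assumptions:
# Assumed definition of the missing featureIndexer stage; adjust column names to your data.
from pyspark.ml.feature import VectorIndexer

featureIndexer = VectorIndexer(
    inputCol="features",         # assumed raw feature vector column
    outputCol="indexedFeatures", # point the regressor's featuresCol here if you use it
    maxCategories=4,             # treat low-cardinality columns as categorical (assumption)
)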
Example #7
    def test_copy(self):
        dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0),
                                              (100, 100.0), (500, 500.0)] * 10,
                                             ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder().addGrid(iee.inducedError,
                                          [100.0, 0.0, 10000.0]).build()
        tvs = TrainValidationSplit(estimator=iee,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        tvsCopied = tvs.copy()
        tvsModelCopied = tvsModel.copy()

        for param in [
                lambda x: x.getCollectSubModels(),
                lambda x: x.getParallelism(),
                lambda x: x.getSeed(),
                lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvs), param(tvsCopied))

        for param in [
                lambda x: x.getSeed(),
                lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvsModel), param(tvsModelCopied))

        self.assertEqual(
            tvs.getEstimator().uid,
            tvsCopied.getEstimator().uid,
            "Copied TrainValidationSplit has the same uid of Estimator",
        )

        self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
        self.assertEqual(
            len(tvsModel.validationMetrics),
            len(tvsModelCopied.validationMetrics),
            "Copied validationMetrics has the same size of the original",
        )
        for index in range(len(tvsModel.validationMetrics)):
            self.assertEqual(tvsModel.validationMetrics[index],
                             tvsModelCopied.validationMetrics[index])

        tvsModel.validationMetrics[0] = "foo"
        self.assertNotEqual(
            tvsModelCopied.validationMetrics[0],
            "foo",
            "Changing the original validationMetrics should not affect the copied model",
        )
        tvsModel.subModels[0].getInducedError = lambda: "foo"
        self.assertNotEqual(
            tvsModelCopied.subModels[0].getInducedError(),
            "foo",
            "Changing the original subModels should not affect the copied model",
        )
def test_meta_estimator_disable_post_training_autologging(dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    eval_dataset = dataset_regression.sample(fraction=0.3, seed=1)
    lrParamMaps = [
        {lr.maxIter: 1, lr.standardization: False},
        {lr.maxIter: 200, lr.standardization: True},
        {lr.maxIter: 2, lr.standardization: False},
    ]
    eva = RegressionEvaluator(metricName="rmse")
    estimator = TrainValidationSplit(estimator=lr, estimatorParamMaps=lrParamMaps, evaluator=eva)

    with mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.register_model"
    ) as mock_register_model, mock.patch(
        "mlflow.sklearn._AutologgingMetricsManager.is_metric_value_loggable"
    ) as mock_is_metric_value_loggable, mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.log_post_training_metric"
    ) as mock_log_post_training_metric, mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.register_prediction_input_dataset"
    ) as mock_register_prediction_input_dataset:
        with mlflow.start_run():
            model = estimator.fit(dataset_regression)

        model.transform(eval_dataset)

        mock_register_model.assert_called_once()
        mock_is_metric_value_loggable.assert_not_called()
        mock_register_prediction_input_dataset.assert_not_called()
        mock_log_post_training_metric.assert_not_called()
Example #12
    def test_copy(self):
        dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0),
                                              (100, 100.0), (500, 500.0)] * 10,
                                             ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsCopied = tvs.copy()
        tvsModelCopied = tvsModel.copy()

        self.assertEqual(
            tvs.getEstimator().uid,
            tvsCopied.getEstimator().uid,
            "Copied TrainValidationSplit has the same uid of Estimator")

        self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
        self.assertEqual(
            len(tvsModel.validationMetrics),
            len(tvsModelCopied.validationMetrics),
            "Copied validationMetrics has the same size of the original")
        for index in range(len(tvsModel.validationMetrics)):
            self.assertEqual(tvsModel.validationMetrics[index],
                             tvsModelCopied.validationMetrics[index])
Example #13
    def test_copy(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsCopied = tvs.copy()
        tvsModelCopied = tvsModel.copy()

        self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                         "Copied TrainValidationSplit has the same uid of Estimator")

        self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
        self.assertEqual(len(tvsModel.validationMetrics),
                         len(tvsModelCopied.validationMetrics),
                         "Copied validationMetrics has the same size of the original")
        for index in range(len(tvsModel.validationMetrics)):
            self.assertEqual(tvsModel.validationMetrics[index],
                             tvsModelCopied.validationMetrics[index])
Example #14
def __evaluate_algorithm(estimator, params, training, testing):
    training_validator = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=params,
        evaluator=BinaryClassificationEvaluator(),
        trainRatio=TRAINING_PORTION)
    model = training_validator.fit(training)

    predictions = model.transform(testing)
    subset = predictions.select("prediction", "label")

    # Cast labels and predictions to float.
    subset = subset.withColumn(
        "prediction",
        functions.round(subset['prediction']).cast('float'))
    subset = subset.withColumn("label",
                               functions.round(subset['label']).cast('float'))

    # Get some metrics.
    metrics = MulticlassMetrics(
        subset.select("prediction", "label").rdd.map(tuple))

    # Get the AUC value.
    evaluator = BinaryClassificationEvaluator()
    auc = evaluator.evaluate(predictions)

    return {
        'predictions': predictions,
        'model': model,
        'auc': auc,
        'accuracy': metrics.accuracy,
        'metrics': metrics
    }
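__evaluate_algorithm relies on a few names defined elsewhere; a hedged sketch of the imports and the TRAINING_PORTION constant it appears to assume (the value is an assumption):
# Names this snippet expects from the enclosing module.
from pyspark.sql import functions
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.mllib.evaluation import MulticlassMetrics

TRAINING_PORTION = 0.8  # assumed train/validation ratio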
Example #15
def pysparkLR():
    """
        TrainValidationSplit Test
    :return:
    """
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)

    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    lr = RandomForestClassifier()

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxDepth, [4, 5]) \
        .addGrid(lr.numTrees, [10, 20]) \
        .build()


    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)

    model = tvs.fit(train)

    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    model.transform(test) \
        .select("features", "label", "prediction") \
        .show(500)
def cross_validate(train, estimator, param_grid, evaluator, train_ratio=.8):
    """Function that uses TrainValidationSplit to cross validate and tune
    hyper-parameters. It then returns a fitted model with the best
    combination of parameters.

    Args:
        train (DataFrame): the training data
        estimator: in this case a Pipeline object
        param_grid: the grid of hyper-parameter combinations to search
        evaluator: how the model will be evaluated
        train_ratio (float): the fraction of the data used to train the model

    Returns:
        pipeline: a TrainValidationSplitModel object fitted with the best parameters

    """
    print('setting tvs')
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               trainRatio=train_ratio)
    print('fitting pipeline')
    pipeline = tvs.fit(train)
    print('saving pipeline')
    pipeline.bestModel.save('s3://yellowtaxidata/best_model')

    return pipeline
Example #17
def logisticClassifier(df, conf):
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    prob_col = conf["params"].get("probabilityCol", "probability")

    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticNet_param = conf["params"].get("elasticNetParam", 0.0)
    tolr = conf["params"].get("tol", 1e-6)
    fit_intercept = conf["params"].get("fitIntercept", True)
    thres = conf["params"].get("threshold", 0.5)
    thresh = conf["params"].get("thresholds", None)
    std = conf["params"].get("standardization", True)
    weight = conf["params"].get("weightCol", None)
    aggr = conf["params"].get("aggregationDepth", 2)
    fml = conf["params"].get("family", "auto")


    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elasticNet_param, tol=tolr,
                            fitIntercept=fit_intercept, threshold=thres,
                            standardization=std, aggregationDepth=aggr,
                            family=fml)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            logReg = LogisticRegression()
            paramGrids = conf["tuning"].get("paramGrids")
            folds = conf["tuning"].get("methodParam", 2)
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            cv = CrossValidator(estimator=logReg,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            tr = conf["tuning"].get("methodParam", 0.8)
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            tvs = TrainValidationSplit(estimator=lr,
                                       estimatorParamMaps=grid,
                                       evaluator=evaluator,
                                       trainRatio=tr)
            model = tvs.fit(df)

    elif conf["tuning"] is None:
        model = lr.fit(df)
    return model
Example #18
def randomforestRegression (df,conf):
    """input  : - Dataframe train (df)
                - Hyperparameter configuration (conf)
       output : - Random Forest Regression Model
    """     
    # set params with default value (if value isn't set in rfr_params)
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    max_depth = conf["params"].get("maxDepth", 5)
    num_trees = conf["params"].get("numTrees", 20)
    max_bins = conf["params"].get("maxBins", 32)
    seed = conf["params"].get("seed", None)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    impurity = conf["params"].get("impurity", "variance")
    subSamplingRate = conf["params"].get("subsamplingRate", 1.0)
    featureSubsetStrategy = conf["params"].get("featureSubsetStrategy", "auto")

    rfr = RandomForestRegressor(featuresCol=feature_col, labelCol=label_col,
                                predictionCol=pred_col, maxDepth=max_depth,
                                numTrees=num_trees, impurity=impurity)
    
    # featureIndexer is assumed to be defined elsewhere (see the VectorIndexer sketch above).
    pipeline = Pipeline(stages=[featureIndexer, rfr])
    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 4)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=rfr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=rfr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    elif conf["tuning"] is None:
        model = pipeline.fit(df)
    return model
Example #19
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assert_param_maps_equal(
            loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def trainTVS(estimator, paramGrid, evaluator, data):
    """
        Train validation split.
    """
    tvs = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8,
        seed=7)
    return tvs.fit(data)  # model
Example #21
def aftsurvivalRegression(df, conf):
  """ AFT Survival Regression training
        Input  : - Dataframe of training (df)
                 - tuning and hiperparameter configuration (conf)
        output : - AFT survival regression model (model)
  """
  feature_col = conf["params"].get("featuresCol", "features")
  label_col = conf["params"].get("labelCol", "label")
  pred_col = conf["params"].get("predictionCol", "prediction")
  cens_col = conf["params"].get("censorCol", "censor")
  fit_intercept = conf["params"].get("fitIntercept", True)
  max_iter = conf["params"].get("maxIter", 100)
  tol = conf["params"].get("tol", 1e-6)  # default matches AFTSurvivalRegression's tol
  quant_p = conf["params"].get("quantileProbabilities", [0.01, 0.05, 0.1, 0.25, 
                                                        0.5, 0.75, 0.9, 0.95, 0.99])
  quant_col = conf["params"].get("quantilesCol", None)
  agg_depth = conf["params"].get("aggregationDepth", 2)
      
  afts = AFTSurvivalRegression(featuresCol=feature_col,labelCol=label_col,
                          predictionCol=pred_col, censorCol=cens_col,
                          maxIter=max_iter, fitIntercept=fit_intercept,
                          tol=tol, aggregationDepth=agg_depth)

  if conf["tuning"]:
    if conf["tuning"].get("method").lower() == "crossval":
      folds = conf["tuning"].get("methodParam", 2)
      # Set the hiperparameter that we want to grid, incase: maxIter and aggregationDepth
      paramGrids = conf["tuning"].get("paramGrids")
      pg=ParamGridBuilder()
      for key in paramGrids:
          pg.addGrid(key, paramGrids[key])
      grid = pg.build()
      evaluator = RegressionEvaluator()
      cv = CrossValidator(estimator=afts, estimatorParamMaps=grid,
                          evaluator=evaluator, numFolds=folds)
      model = cv.fit(df)
      
    elif conf["tuning"].get("method").lower() == "trainvalsplit":
      tr = conf["tuning"].get("methodParam", 0.8)
      # Set the hiperparameter that we want to grid, incase: maxIter and aggregationDepth
      paramGrids = conf["tuning"].get("paramGrids")
      pg=ParamGridBuilder()
      for key in paramGrids:
          pg.addGrid(key, paramGrids[key])
      grid = pg.build()
      evaluator = RegressionEvaluator()
      tvs = TrainValidationSplit(estimator=afts, estimatorParamMaps=grid,
                                 evaluator=evaluator, trainRatio=tr)
      model = tvs.fit(df)
  elif conf["tuning"] ==  None:
    model = afts.fit(df)
  return model
Example #22
def TVS(estimator, paramGrid, dataTrain, dataTest):
    # Define the TrainValidationSplit
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               # 80% training, 20% validation
                               trainRatio=0.8)
    # Fit the model with the best parameter combination
    # from the default grid
    model = tvs.fit(dataTrain)
    # Get predictions on the test set
    predictions = model.transform(dataTest)
    return predictions, model
def linSVC_with_tol_iter_fixed(df, tolerance, iterations):
    linSVC = LinearSVC(featuresCol='features', labelCol='label')
    grid = ParamGridBuilder().addGrid(linSVC.maxIter, [iterations]).addGrid(
        linSVC.tol, [tolerance]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=linSVC,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('linSVC:tol', tolerance, ':maxIter', iterations, ':result', result)
def binLR_with_tol_iter_fixed(df, tolerance, iterations):
    binLR = LogisticRegression(featuresCol='features', labelCol='label')
    grid = ParamGridBuilder().addGrid(binLR.maxIter, [iterations]).addGrid(
        binLR.tol, [tolerance]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=binLR,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('binLR:tol', tolerance, ':maxIter', iterations, 'result', result)
Example #25
def process(spark, train_data, test_data):
    # train_data - path to the file with the data for training the model
    # test_data - path to the file with the data for evaluating model quality
    # currently only train_data is used
    # run with: python PySparkMLFit.py train.parquet validate.parquet

    # load train_data
    train_data = spark.read.parquet(train_data)

    # train the model
    # assemble the feature vector
    feature = VectorAssembler(inputCols=train_data.columns[:7],
                              outputCol="features")

    # Train a GBT model.
    gbt = GBTRegressor(labelCol="ctr", featuresCol="features", maxIter=10)

    #pipeline
    pipeline = Pipeline(stages=[feature, gbt])

    paramGrid = ParamGridBuilder().addGrid(
        gbt.maxDepth,
        [2, 3, 4, 5, 6, 7, 8, 9]).addGrid(gbt.maxBins,
                                          [10, 16, 20, 24, 32, 36]).build()

    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(labelCol="ctr",
                                      predictionCol="prediction",
                                      metricName="rmse"),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train_data)

    # carve out a split for testing
    (training_data1, test_data) = train_data.randomSplit([0.8, 0.2], seed=42)

    # compute and print RMSE on the test split carved out of train_data
    prediction = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="ctr",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    # save the model
    model.bestModel.write().overwrite().save("model")
Example #26
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example #27
def model_training(training_data, param_info):
    # get the parameter grid and the model
    param_grid, rf = model_setting(param_info)
    # build the evaluator
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    # build the hyperparameter search via train/validation split
    tvs = TrainValidationSplit(estimator=rf,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)
    # train the model
    model = tvs.fit(dataset=training_data)
    # return the best model
    return model.bestModel
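model_setting is not shown here; a purely hypothetical sketch of what it might return, assuming param_info is a dict of candidate value lists for a RandomForestClassifier:
# Hypothetical model_setting: builds the classifier and its parameter grid from param_info.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder

def model_setting(param_info):
    rf = RandomForestClassifier(labelCol='label', featuresCol='features')
    param_grid = ParamGridBuilder() \
        .addGrid(rf.numTrees, param_info.get('numTrees', [20, 50])) \
        .addGrid(rf.maxDepth, param_info.get('maxDepth', [5, 10])) \
        .build()
    return param_grid, rf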
Example #28
def train(train_data):
    feature_columns = train_data.columns[:-1]
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol='features')
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
    grid_search = ParamGridBuilder()\
                .addGrid(dt.impurity, ['gini', 'entropy'])\
                .addGrid(dt.maxBins, [20, 30, 40, 50])\
                .addGrid(dt.maxDepth, [10, 15, 25])\
                .build()
    evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction',  # multiclass evaluation uses the predicted label
        labelCol='label',
        metricName='accuracy')  # or 'f1'
    tvs = TrainValidationSplit(estimator=dt,
                               estimatorParamMaps=grid_search,
                               evaluator=evaluator,
                               trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[assembler, tvs])
    tvs_pipeline_model = tvs_pipeline.fit(train_data)

    best_model = tvs_pipeline_model.stages[-1]
    best_param = get_best_param(best_model)

    return best_param, tvs_pipeline_model
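get_best_param is defined elsewhere; a hedged sketch of one way it could be implemented for the TrainValidationSplitModel that ends up as the last pipeline stage (assumes the metric, accuracy here, is larger-is-better):
# Hypothetical get_best_param: read the winning parameter map off a fitted TrainValidationSplitModel.
def get_best_param(tvs_model):
    metrics_and_params = zip(tvs_model.validationMetrics,
                             tvs_model.getEstimatorParamMaps())
    best_metric, best_param_map = max(metrics_and_params, key=lambda pair: pair[0])
    return {param.name: value for param, value in best_param_map.items()}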
Example #29
def trainAndEvalModelByDecisionTreeRegressorAndTrainValidationSplit(
        stages, train_df, test_df, evaluator):
    '''
    Build an ML Pipeline with DecisionTreeRegressor and TrainValidationSplit to train and validate the model, and find the best model
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    print(
        '======================= Training with a DecisionTreeRegressor + TrainValidationSplit ML Pipeline ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    # Trains 4*4=16 parameter combinations; impurity="variance" is fixed, so it is not tuned.
    # Because vectorIndexer was created with maxCategories=24 (line 108), maxBins must be greater than 24.
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [
        5, 10, 15, 25
    ]).addGrid(dt.maxBins, [25, 35, 45, 50]).build()
    tsv = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)
    tsvPipeline = Pipeline(stages=stages + [tsv])
    tsvPipelineModel = tsvPipeline.fit(train_df)
    bestModel = tsvPipelineModel.stages[2].bestModel
    print(
        '======================= Predicting with the trained DecisionTreeRegressor + TrainValidationSplit pipeline ======================='
    )
    predicts = tsvPipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= Model evaluation after training (rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
Example #30
def make_weather_trainers(trainRatio, estimator_gridbuilders, metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data
       where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder) tuples
       and 0 < `trainRatio` <= 1 determines the fraction of rows used for training.
       The RegressionEvaluator will use a non-default `metricName`, if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")

    getDOY = doy_query()
    sqlTrans = SQLTransformer(statement=getDOY)

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator().setLabelCol(
        column_names["labelCol"]).setPredictionCol(
            column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)
    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)

        pl = Pipeline(stages=[sqlTrans, feature_assembler, est])

        paramGrid = pgb.build()
        tvs_list.append(
            TrainValidationSplit(estimator=pl,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=ev,
                                 trainRatio=trainRatio))
    return tvs_list
def kNN_with_k_fixed(df, k):
    knn = KNNClassifier(featuresCol='features',
                        labelCol='label',
                        topTreeSize=1000,
                        topTreeLeafSize=10,
                        subTreeLeafSize=30)
    grid = ParamGridBuilder().addGrid(knn.k, [k]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=knn,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('kNN:k', k, result)
Example #32
def main(name='Loan_model'):
    logger = logging.getLogger(__name__)
    spark = SparkSession.builder.appName(f'{name}').getOrCreate()

    data = spark.read.csv(path, inferSchema=True, header=True)

    logger.info(f'Vectorising Features')
    data = get_features(data, spark, target)

    logger.info(f'Obtaining Weight balance')
    data = data.withColumn('weights', weight_balance(data, col('label')))

    logger.info(f'Create train and testing split 80-20')
    train, test = data.randomSplit([.8, .2], seed=1234)

    logger.info(f'Training and Optimising model')

    lr = LogisticRegression(
        featuresCol=data.columns[0],
        labelCol=data.columns[1],
        weightCol=data.columns[2],
        maxIter=100,
    )

    pipeline = Pipeline(stages=[lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1]) \
        .addGrid(lr.elasticNetParam, [0.001, 0.01, 0.1, 1]) \
        .build()

    model_tune = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
        trainRatio=0.8)

    model = model_tune.fit(train)

    metrics = evaluate_model(model, test, spark)

    model.bestModel.write().overwrite().save(output_path)
    metrics.toPandas().to_csv(f'{output_path}testset_metrics.csv')

    logger.info(f'Model and metrics exported to {output_path}')

    return model, metrics
Example #33
    def _run_test_save_load_trained_model(self, LogisticRegressionCls,
                                          LogisticRegressionModelCls):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        lr = LogisticRegressionCls()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True,
            seed=42,
        )
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
        for param in [
                lambda x: x.getSeed(),
                lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvsModel), param(loadedTvsModel))

        self.assertTrue(
            all(
                loadedTvsModel.isSet(param)
                for param in loadedTvsModel.params))
Example #34
def validate(estimator, train, grid):
    """ Selects the hyperparameters of "estimator" from "grid", using 20% of the
    "train" data as the validation partition. AUC is used as the comparison
    metric.
    """
    tvs = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(labelCol="class"),
        trainRatio=0.8,
        seed=89)

    model = tvs.fit(train)
    for i, item in enumerate(model.getEstimatorParamMaps()):
        grid = ["%s: %s" % (p.name, str(v)) for p, v in item.items()]
        print(grid,
              model.getEvaluator().getMetricName(), model.validationMetrics[i])
def RForest_with_maxFeatures_maxDepth_fixed(df, max_depth, max_features):
    RForest = DecisionTreeClassifier(featuresCol='features',
                                     labelCol='label',
                                     impurity='gini',
                                     maxMemoryInMB=1024)
    grid = ParamGridBuilder().addGrid(RForest.maxDepth, [max_depth]).addGrid(
        RForest.maxBins, [max_features]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=RForest,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('RForest:maxDepth', max_depth, ':maxBins', max_features, ':result',
          result)
Example #36
    def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvs.setParallelism(1)
        tvsSerialModel = tvs.fit(dataset)
        tvs.setParallelism(2)
        tvsParallelModel = tvs.fit(dataset)
        self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example #37
    def test_save_load_nested_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(100)
        lr2 = LogisticRegression().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

        originalParamMap = tvs.getEstimatorParamMaps()
        loadedParamMap = loadedTvs.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")
    train, test = data.randomSplit([0.7, 0.3])
    lr = LinearRegression(maxIter=10, regParam=0.1)

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    prediction = model.transform(test)
    for row in prediction.take(5):
        print(row)
    # $example off$
    spark.stop()
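This excerpt starts mid-script; a hedged sketch of the preamble it assumes (roughly what the stock Spark ML tuning example uses):
# Assumed imports and SparkSession setup for the excerpt above.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("TrainValidationSplit").getOrCreate()
    # ... the excerpt above continues from here ...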
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()


# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")


# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)


# COMMAND ----------

tvsFitted = tvs.fit(train)


# COMMAND ----------
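A possible follow-up cell (tvsFitted comes from the cell above; params, pipeline, and train are assumed to be defined earlier in the notebook) for inspecting the fitted TrainValidationSplitModel:
print(tvsFitted.validationMetrics)       # one areaUnderROC value per parameter map in `params`
bestPipelineModel = tvsFitted.bestModel  # the pipeline refit on the full training set with the winning params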