def build_model(training):
    #training = read_data()
    training.cache()
    columns = training.columns
    columns.remove("Occupancy")
    assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")
    pipeline = Pipeline(stages=[assembler, lr])
    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
        .build()
    evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")
    validator = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     trainRatio=0.9)
    validator_model = validator.fit(training)
    return validator_model.bestModel
def trainNaiveBayesModel(data, directory=""):
    tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
    remover = StopWordsRemover().setInputCol("words").setOutputCol(
        "filtered").setCaseSensitive(False)
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(0)
    nb = NaiveBayes(labelCol="label", featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [200, 500, 1000, 5000]) \
        .addGrid(nb.smoothing, [0.5, 1, 1.5, 2]) \
        .build()
    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        # set area under precision-recall curve as the evaluation metric
        evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    cvModel = crossval.fit(data)
    modelName = directory + "NaiveBayesModel"
    cvModel.bestModel.write().overwrite().save(modelName)
    return modelName
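# Hedged follow-up sketch for trainNaiveBayesModel above (the DataFrame and
# path names are illustrative assumptions, not from the original source): the
# saved best model is a fitted PipelineModel, so it can be reloaded and applied
# to new comment data.
from pyspark.ml import PipelineModel

# model_path = trainNaiveBayesModel(train_df, directory="/tmp/")
# loaded_model = PipelineModel.load(model_path)
# predictions = loaded_model.transform(new_comments_df)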
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                               evaluator=evaluator, collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # Test the default value for option "persistSubModel" to be "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def test_fit_maximize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    validationMetrics = tvsModel.validationMetrics

    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
    self.assertEqual(len(grid), len(validationMetrics),
                     "validationMetrics has the same size of grid parameter")
    self.assertEqual(1.0, max(validationMetrics))
def random_forest_tuning(train_samples):
    rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                cacheNodeIds=True)
    ru = RandomUnderSampler().setIndexCol('id')
    pipeline = Pipeline().setStages([ru, rf])
    paramGrid = \
        (ParamGridBuilder()
         .addGrid(rf.numTrees, [50, 75, 100])
         .addGrid(rf.featureSubsetStrategy, ['sqrt'])
         .addGrid(rf.impurity, ['gini', 'entropy'])
         .addGrid(rf.maxDepth, [5, 15, 30])
         .addGrid(rf.minInstancesPerNode, [1])
         .addGrid(rf.subsamplingRate, [1.0, 0.6, 0.4])
         .addGrid(ru.targetImbalanceRatio, [1.0, 1.5, 2.0])
         .build())
    pr_evaluator = \
        BinaryClassificationEvaluator(labelCol="label",
                                      rawPredictionCol="rawPrediction",
                                      metricName="areaUnderPR")
    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=pr_evaluator,
                               trainRatio=0.8,
                               collectSubModels=True)
    model = tvs.fit(train_samples)
    return model
def dtRegression(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : decisiontree_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    impurity = conf["params"].get("impurity", "variance")
    maxDepth = conf["params"].get("maxDepth", 5)
    maxBin = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    seed = conf["params"].get("seed", None)
    varianceCol = conf["params"].get("varianceCol", None)

    dt = DecisionTreeRegressor(maxDepth=maxDepth, featuresCol=featuresCol)
    # featureIndexer is assumed to be defined elsewhere in the module.
    pipeline = Pipeline(stages=[featureIndexer, dt])
    print("maxDepth : ", dt.getMaxDepth())

    # if using ml-tuning
    if conf["tuning"]:
        # if using ml-tuning with cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        # if using ml-tuning with train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    # if not using ml-tuning
    elif conf["tuning"] == None:
        print("test")
        model = pipeline.fit(df)
    return model
def test_copy(self):
    dataset = self.spark.createDataFrame([(10, 10.0),
                                          (50, 50.0),
                                          (100, 100.0),
                                          (500, 500.0)] * 10,
                                         ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build()
    tvs = TrainValidationSplit(
        estimator=iee,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
    )
    tvsModel = tvs.fit(dataset)
    tvsCopied = tvs.copy()
    tvsModelCopied = tvsModel.copy()

    for param in [
        lambda x: x.getCollectSubModels(),
        lambda x: x.getParallelism(),
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvs), param(tvsCopied))

    for param in [
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvsModel), param(tvsModelCopied))

    self.assertEqual(
        tvs.getEstimator().uid,
        tvsCopied.getEstimator().uid,
        "Copied TrainValidationSplit has the same uid of Estimator",
    )
    self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
    self.assertEqual(
        len(tvsModel.validationMetrics),
        len(tvsModelCopied.validationMetrics),
        "Copied validationMetrics has the same size of the original",
    )
    for index in range(len(tvsModel.validationMetrics)):
        self.assertEqual(tvsModel.validationMetrics[index],
                         tvsModelCopied.validationMetrics[index])
    tvsModel.validationMetrics[0] = "foo"
    self.assertNotEqual(
        tvsModelCopied.validationMetrics[0],
        "foo",
        "Changing the original validationMetrics should not affect the copied model",
    )
    tvsModel.subModels[0].getInducedError = lambda: "foo"
    self.assertNotEqual(
        tvsModelCopied.subModels[0].getInducedError(),
        "foo",
        "Changing the original subModels should not affect the copied model",
    )
def test_meta_estimator_disable_post_training_autologging(dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    eval_dataset = dataset_regression.sample(fraction=0.3, seed=1)
    lrParamMaps = [
        {lr.maxIter: 1, lr.standardization: False},
        {lr.maxIter: 200, lr.standardization: True},
        {lr.maxIter: 2, lr.standardization: False},
    ]
    eva = RegressionEvaluator(metricName="rmse")
    estimator = TrainValidationSplit(estimator=lr, estimatorParamMaps=lrParamMaps, evaluator=eva)

    with mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.register_model"
    ) as mock_register_model, mock.patch(
        "mlflow.sklearn._AutologgingMetricsManager.is_metric_value_loggable"
    ) as mock_is_metric_value_loggable, mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.log_post_training_metric"
    ) as mock_log_post_training_metric, mock.patch(
        "mlflow.pyspark.ml._AutologgingMetricsManager.register_prediction_input_dataset"
    ) as mock_register_prediction_input_dataset:
        with mlflow.start_run():
            model = estimator.fit(dataset_regression)
            model.transform(eval_dataset)

        mock_register_model.assert_called_once()
        mock_is_metric_value_loggable.assert_not_called()
        mock_register_prediction_input_dataset.assert_not_called()
        mock_log_post_training_metric.assert_not_called()
def test_copy(self):
    dataset = self.spark.createDataFrame([(10, 10.0),
                                          (50, 50.0),
                                          (100, 100.0),
                                          (500, 500.0)] * 10,
                                         ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsCopied = tvs.copy()
    tvsModelCopied = tvsModel.copy()

    self.assertEqual(
        tvs.getEstimator().uid,
        tvsCopied.getEstimator().uid,
        "Copied TrainValidationSplit has the same uid of Estimator")
    self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
    self.assertEqual(
        len(tvsModel.validationMetrics),
        len(tvsModelCopied.validationMetrics),
        "Copied validationMetrics has the same size of the original")
    for index in range(len(tvsModel.validationMetrics)):
        self.assertEqual(tvsModel.validationMetrics[index],
                         tvsModelCopied.validationMetrics[index])
def test_copy(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsCopied = tvs.copy()
    tvsModelCopied = tvsModel.copy()

    self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                     "Copied TrainValidationSplit has the same uid of Estimator")
    self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
    self.assertEqual(len(tvsModel.validationMetrics),
                     len(tvsModelCopied.validationMetrics),
                     "Copied validationMetrics has the same size of the original")
    for index in range(len(tvsModel.validationMetrics)):
        self.assertEqual(tvsModel.validationMetrics[index],
                         tvsModelCopied.validationMetrics[index])
def __evaluate_algorithm(estimator, params, training, testing):
    training_validator = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=params,
        evaluator=BinaryClassificationEvaluator(),
        trainRatio=TRAINING_PORTION)

    model = training_validator.fit(training)
    predictions = model.transform(testing)
    subset = predictions.select("prediction", "label")

    # Cast labels and predictions to float.
    subset = subset.withColumn(
        "prediction", functions.round(subset['prediction']).cast('float'))
    subset = subset.withColumn(
        "label", functions.round(subset['label']).cast('float'))

    # Get some metrics.
    metrics = MulticlassMetrics(
        subset.select("prediction", "label").rdd.map(tuple))

    # Get the AUC value.
    evaluator = BinaryClassificationEvaluator()
    auc = evaluator.evaluate(predictions)

    return {
        'predictions': predictions,
        'model': model,
        'auc': auc,
        'accuracy': metrics.accuracy,
        'metrics': metrics
    }
def pysparkLR():
    """
    TrainValidationSplit Test
    :return:
    """
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)
    lr = RandomForestClassifier()
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxDepth, [4, 5]) \
        .addGrid(lr.numTrees, [10, 20]) \
        .build()
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)
    model = tvs.fit(train)
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    model.transform(test) \
        .select("features", "label", "prediction") \
        .show(500)
def cross_validate(train, estimator, param_grid, evaluator, train_ratio=.8):
    """Function that uses TrainValidationSplit to cross validate and tune
    hyper-parameters. It then returns a fitted model with the best combination
    of parameters.

    Args:
        train (DataFrame): the training data
        estimator: in this case a Pipeline object
        param_grid: the grid of hyper-parameter combinations to search over
        evaluator: how the model will be evaluated
        train_ratio (float): the fraction of the data used to train the model

    Returns:
        pipeline: a TrainValidationSplitModel object fitted with the best parameters
    """
    print('setting tvs')
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               trainRatio=train_ratio)
    print('fitting pipeline')
    pipeline = tvs.fit(train)
    print('saving pipeline')
    pipeline.bestModel.save('s3://yellowtaxidata/best_model')
    return pipeline
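# A minimal usage sketch for cross_validate above, under assumed names
# (`train_df`, the LinearRegression stage, and the grid values are illustrative
# and not part of the original source).
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder

lr = LinearRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[lr])
param_grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
evaluator = RegressionEvaluator(labelCol="label", metricName="rmse")
# fitted = cross_validate(train_df, pipeline, param_grid, evaluator, train_ratio=0.8)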
def logisticClassifier(df, conf):
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    prob_col = conf["params"].get("probabilityCol", "probability")
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticNet_param = conf["params"].get("elasticNetParam", 0.0)
    tolr = conf["params"].get("tol", 1e-6)
    fit_intercept = conf["params"].get("fitIntercept", True)
    thres = conf["params"].get("threshold", 0.5)
    thresh = conf["params"].get("thresholds", None)
    std = conf["params"].get("standardization", True)
    weight = conf["params"].get("weightCol", None)
    aggr = conf["params"].get("aggregationDepth", 2)
    fml = conf["params"].get("family", "auto")

    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elasticNet_param, tol=tolr,
                            fitIntercept=fit_intercept, threshold=thres,
                            standardization=std, aggregationDepth=aggr,
                            family=fml)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            logReg = LogisticRegression()
            paramgGrids = conf["tuning"].get("paramGrids")
            folds = conf["tuning"].get("methodParam", 2)
            pg = ParamGridBuilder()
            for key in paramgGrids:
                pg.addGrid(key, paramgGrids[key])
            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            cv = CrossValidator(estimator=logReg, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramgGrids = conf["tuning"].get("paramGrids")
            tr = conf["tuning"].get("methodParam", 0.8)
            pg = ParamGridBuilder()
            for key in paramgGrids:
                pg.addGrid(key, paramgGrids[key])
            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    elif conf["tuning"] == None:
        model = lr.fit(df)
    return model
def randomforestRegression(df, conf):
    """
        input  : - Dataframe train (df)
                 - Hyperparameter configuration (conf)
        output : - Random Forest Regression Model
    """
    # set params with default value (if value isn't set in rfr_params)
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    max_depth = conf["params"].get("maxDepth", 5)
    num_trees = conf["params"].get("numTrees", 20)
    max_bins = conf["params"].get("maxBins", 32)
    seed = conf["params"].get("seed", None)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    impurity = conf["params"].get("impurity", "variance")
    subSamplingRate = conf["params"].get("subsamplingRate", 1.0)
    featureSubsetStrategy = conf["params"].get("featureSubsetStrategy", "auto")

    rfr = RandomForestRegressor(featuresCol=feature_col, labelCol=label_col,
                                predictionCol=pred_col, maxDepth=max_depth,
                                numTrees=num_trees, impurity=impurity)
    # featureIndexer is assumed to be defined elsewhere in the module.
    pipeline = Pipeline(stages=[featureIndexer, rfr])

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 4)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=rfr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=rfr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    elif conf["tuning"] == None:
        model = pipeline.fit(df)
    return model
def test_save_load_simple_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assert_param_maps_equal(
        loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def trainTVS(estimator, paramGrid, evaluator, data):
    """ Train validation split. """
    tvs = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8,
        seed=7)
    return tvs.fit(data)  # model
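# A minimal usage sketch for trainTVS above (the `train_df` DataFrame and the
# tuned parameter values are illustrative assumptions): tune a
# LogisticRegression over regParam with a binary-classification metric.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()
# tvsModel = trainTVS(lr, grid, BinaryClassificationEvaluator(), train_df)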
def aftsurvivalRegression(df, conf):
    """
        AFT Survival Regression training
        Input  : - Dataframe of training (df)
                 - tuning and hyperparameter configuration (conf)
        Output : - AFT survival regression model (model)
    """
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    cens_col = conf["params"].get("censorCol", "censor")
    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 100)
    tol = conf["params"].get("tol", 1e-6)  # default matches Spark's AFTSurvivalRegression
    quant_p = conf["params"].get("quantileProbabilities",
                                 [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    quant_col = conf["params"].get("quantilesCol", None)
    agg_depth = conf["params"].get("aggregationDepth", 2)

    afts = AFTSurvivalRegression(featuresCol=feature_col, labelCol=label_col,
                                 predictionCol=pred_col, censorCol=cens_col,
                                 maxIter=max_iter, fitIntercept=fit_intercept,
                                 tol=tol, aggregationDepth=agg_depth)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 2)
            # Set the hyperparameters that we want to grid, in this case maxIter and aggregationDepth
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=afts, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Set the hyperparameters that we want to grid, in this case maxIter and aggregationDepth
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=afts, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    elif conf["tuning"] == None:
        model = afts.fit(df)
    return model
def TVS(estimator, paramGrid, dataTrain, dataTest):
    # Define the TrainValidationSplit
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               # 80% training, 20% validation
                               trainRatio=0.8)
    # Train the model; by default the best parameter
    # combination from the grid is kept
    model = tvs.fit(dataTrain)
    # Get predictions on the test set
    predictions = model.transform(dataTest)
    return predictions, model
def linSVC_with_tol_iter_fixed(df, tolerance, iterations):
    linSVC = LinearSVC(featuresCol='features', labelCol='label')
    grid = ParamGridBuilder().addGrid(linSVC.maxIter, [iterations]).addGrid(
        linSVC.tol, [tolerance]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=linSVC,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('linSVC:tol', tolerance, ':maxIter', iterations, ':result', result)
def binLR_with_tol_iter_fixed(df, tolerance, iterations):
    binLR = LogisticRegression(featuresCol='features', labelCol='label')
    grid = ParamGridBuilder().addGrid(binLR.maxIter, [iterations]).addGrid(
        binLR.tol, [tolerance]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=binLR,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('binLR:tol', tolerance, ':maxIter', iterations, 'result', result)
def process(spark, train_data, test_data):
    # train_data - path to the file with the model training data
    # test_data  - path to the file with the data for evaluating model quality
    # currently only train_data is used
    # run: python PySparkMLFit.py train.parquet validate.parquet

    # load train_data
    train_data = spark.read.parquet(train_data)

    # train the model
    # add features
    feature = VectorAssembler(inputCols=train_data.columns[:7], outputCol="features")
    # Train a GBT model.
    gbt = GBTRegressor(labelCol="ctr", featuresCol="features", maxIter=10)
    # pipeline
    pipeline = Pipeline(stages=[feature, gbt])
    paramGrid = ParamGridBuilder().addGrid(
        gbt.maxDepth, [2, 3, 4, 5, 6, 7, 8, 9]).addGrid(
        gbt.maxBins, [10, 16, 20, 24, 32, 36]).build()
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(labelCol="ctr", predictionCol="prediction",
                                      metricName="rmse"),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train_data)

    # create a hold-out split for testing
    (training_data1, test_data) = train_data.randomSplit([0.8, 0.2], seed=42)
    # compute RMSE on the test split taken from train_data and print it
    prediction = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="ctr", predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    # save the model
    model.bestModel.write().overwrite().save("model")
def test_save_load_simple_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def model_training(training_data, param_info):
    # get the parameter grid and the gbt model
    param_grid, rf = model_setting(param_info)
    # build the evaluator, scoring on the prediction column
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    # set up hyperparameter tuning with a train/validation split
    tvs = TrainValidationSplit(estimator=rf,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)
    # train the model
    model = tvs.fit(dataset=training_data)
    # return the best model
    return model.bestModel
def train(train_data):
    feature_columns = train_data.columns[:-1]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
    grid_search = ParamGridBuilder()\
        .addGrid(dt.impurity, ['gini', 'entropy'])\
        .addGrid(dt.maxBins, [20, 30, 40, 50])\
        .addGrid(dt.maxDepth, [10, 15, 25])\
        .build()
    evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction',  # multiclass classification, scored on y_pred
        labelCol='label',
        metricName='accuracy')  # or 'f1'
    tvs = TrainValidationSplit(estimator=dt,
                               estimatorParamMaps=grid_search,
                               evaluator=evaluator,
                               trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[assembler, tvs])
    tvs_pipeline_model = tvs_pipeline.fit(train_data)
    best_model = tvs_pipeline_model.stages[-1]
    best_param = get_best_param(best_model)
    return best_param, tvs_pipeline_model
def trainAndEvalModelByDecisionTreeRegressorAndTrainValidationSplit(stages, train_df, test_df, evaluator):
    '''
    Build an ML Pipeline with DecisionTreeRegressor and TrainValidationSplit,
    train and validate the model, and return the best one.
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    print('======= Building the ML Pipeline with DecisionTreeRegressor and TrainValidationSplit for model training =======')
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    # Trains 4*4=16 parameter combinations; impurity="variance" stays fixed and is not tuned.
    # Because maxCategories=24 was set when creating vectorIndexer (line 108), maxBins here must be greater than 24.
    paramGrid = ParamGridBuilder() \
        .addGrid(dt.maxDepth, [5, 10, 15, 25]) \
        .addGrid(dt.maxBins, [25, 35, 45, 50]) \
        .build()
    tsv = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)
    tsvPipeline = Pipeline(stages=stages + [tsv])
    tsvPipelineModel = tsvPipeline.fit(train_df)
    bestModel = tsvPipelineModel.stages[2].bestModel
    print('======= After training the ML Pipeline, use the model for prediction =======')
    predicts = tsvPipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicts)
    print('======= After training the ML Pipeline, evaluate model accuracy (rmse=' + str(rmse) + ') =======')
    return (bestModel, predicts, rmse)
def make_weather_trainers(trainRatio,
                          estimator_gridbuilders,
                          metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data
    where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder) tuples
    and 0 < `trainRatio` <= 1 determines the fraction of rows used for training.
    The RegressionEvaluator will use a non-default `metricName`, if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")

    getDOY = doy_query()
    sqlTrans = SQLTransformer(statement=getDOY)

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator()
          .setLabelCol(column_names["labelCol"])
          .setPredictionCol(column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)
    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)
        pl = Pipeline(stages=[sqlTrans, feature_assembler, est])
        paramGrid = pgb.build()
        tvs_list.append(TrainValidationSplit(estimator=pl,
                                             estimatorParamMaps=paramGrid,
                                             evaluator=ev,
                                             trainRatio=trainRatio))
    return tvs_list
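# A hedged usage sketch for make_weather_trainers above (the estimator choice,
# grid values, and the `weather_df` DataFrame are illustrative assumptions, not
# part of the original source): each returned TrainValidationSplit can then be
# fitted on the weather data.
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder

gbt = GBTRegressor()
estimator_gridbuilders = [(gbt, ParamGridBuilder().addGrid(gbt.maxDepth, [3, 5]))]
# trainers = make_weather_trainers(0.8, estimator_gridbuilders, metricName="rmse")
# models = [tvs.fit(weather_df) for tvs in trainers]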
def kNN_with_k_fixed(df, k):
    knn = KNNClassifier(featuresCol='features',
                        labelCol='label',
                        topTreeSize=1000,
                        topTreeLeafSize=10,
                        subTreeLeafSize=30)
    grid = ParamGridBuilder().addGrid(knn.k, [k]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=knn,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('kNN:k', k, result)
def main(name='Loan_model'):
    logger = logging.getLogger(__name__)
    spark = SparkSession.builder.appName(f'{name}').getOrCreate()
    data = spark.read.csv(path, inferSchema=True, header=True)

    logger.info('Vectorising Features')
    data = get_features(data, spark, target)

    logger.info('Obtaining Weight balance')
    data = data.withColumn('weights', weight_balance(data, col('label')))

    logger.info('Create train and testing split 80-20')
    train, test = data.randomSplit([.8, .2], seed=1234)

    logger.info('Training and Optimising model')
    lr = LogisticRegression(
        featuresCol=data.columns[0],
        labelCol=data.columns[1],
        weightCol=data.columns[2],
        maxIter=100,
    )
    pipeline = Pipeline(stages=[lr])
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1]) \
        .addGrid(lr.elasticNetParam, [0.001, 0.01, 0.1, 1]) \
        .build()
    model_tune = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
        trainRatio=0.8)
    model = model_tune.fit(train)

    metrics = evaluate_model(model, test, spark)
    model.bestModel.write().overwrite().save(output_path)
    metrics.toPandas().to_csv(f'{output_path}testset_metrics.csv')
    logger.info(f'Model and metrics exported to {output_path}')
    return model, metrics
def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    lr = LogisticRegressionCls()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        seed=42,
    )
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
    for param in [
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvsModel), param(loadedTvsModel))

    self.assertTrue(
        all(loadedTvsModel.isSet(param) for param in loadedTvsModel.params))
def validate(estimator, train, grid):
    """
    Selects the hyperparameters of `estimator` from `grid`, using 20% of the
    `train` data as the validation partition. AUC is used as the comparison
    metric.
    """
    tvs = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(labelCol="class"),
        trainRatio=0.8,
        seed=89)
    model = tvs.fit(train)

    for i, item in enumerate(model.getEstimatorParamMaps()):
        grid = ["%s: %s" % (p.name, str(v)) for p, v in item.items()]
        print(grid, model.getEvaluator().getMetricName(),
              model.validationMetrics[i])
def RForest_with_maxFeatures_maxDepth_fixed(df, max_depth, max_features):
    RForest = DecisionTreeClassifier(featuresCol='features',
                                     labelCol='label',
                                     impurity='gini',
                                     maxMemoryInMB=1024)
    grid = ParamGridBuilder().addGrid(RForest.maxDepth, [max_depth]).addGrid(
        RForest.maxBins, [max_features]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=RForest,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('RForest:maxDepth', max_depth, ':maxBins', max_features, ':result', result)
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics,
                     tvsParallelModel.validationMetrics)
def test_save_load_nested_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

    originalParamMap = tvs.getEstimatorParamMaps()
    loadedParamMap = loadedTvs.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
data = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")
train, test = data.randomSplit([0.7, 0.3])
lr = LinearRegression(maxIter=10, regParam=0.1)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
prediction = model.transform(test)
for row in prediction.take(5):
    print(row)
# $example off$

spark.stop()
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)

# COMMAND ----------

tvsFitted = tvs.fit(train)

# COMMAND ----------
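# COMMAND ----------

# Hedged follow-up (not part of the original notebook): score the fitted
# TrainValidationSplitModel on a held-out `test` DataFrame, assuming one was
# created alongside `train`.
# print(evaluator.evaluate(tvsFitted.transform(test)))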