# COMMAND ----------

from pyspark.ml.regression import GBTRegressor

# Define the GBT model. "Survived" is the label column we want to predict.
gbt = GBTRegressor(labelCol="Survived")

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Define a grid of hyperparameters to test:
#  - maxDepth: maximum depth of each decision tree in the GBT ensemble
#  - maxIter: number of iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small. In practice, to get the
# highest accuracy, you would likely want to try deeper trees (10 or higher) and
# more trees in the ensemble (>100).
paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()

# Define an evaluation metric. This tells CrossValidator how well we are doing
# by comparing the true labels with the predictions.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=gbt.getLabelCol(),
                                predictionCol=gbt.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

# COMMAND ----------

# MAGIC %md Finally, we can tie our feature processing and model training stages together into a single `Pipeline`.
# MAGIC
# MAGIC ![Image of Pipeline](http://training.databricks.com/databricks_guide/5-pipeline.png)

# COMMAND ----------

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
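# COMMAND ----------

# MAGIC %md (Aside) `CrossValidator` defaults to 3-fold cross-validation. The cell below is a minimal
# MAGIC sketch, not part of the original workflow, showing how the standard `numFolds` and (on Spark 2.3+)
# MAGIC `parallelism` parameters could be set if you wanted more folds or concurrent model fitting.

# COMMAND ----------

# Illustrative only -- the rest of this notebook keeps using the `cv` defined above.
cv_5fold = CrossValidator(estimator=gbt,
                          evaluator=evaluator,
                          estimatorParamMaps=paramGrid,
                          numFolds=5,       # 5-fold cross-validation instead of the default 3
                          parallelism=2)    # fit up to 2 models in parallel (Spark 2.3+)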
# COMMAND ----------

# Train the Pipeline on the training set. Because the last stage is a CrossValidator,
# this fits and evaluates one GBT model per hyperparameter combination in paramGrid.
pipelineModel = pipeline.fit(train)

# COMMAND ----------

# Make predictions on the held-out test set.
predictions = pipelineModel.transform(test)
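# COMMAND ----------

# MAGIC %md The cell below is a minimal sketch (not from the original notebook) of how the fitted model
# MAGIC could be inspected: it reuses `evaluator` to compute RMSE on the test predictions, and pulls the
# MAGIC fitted `CrossValidatorModel` out of the pipeline to see which hyperparameter combination won.

# COMMAND ----------

# RMSE on the held-out test set, using the same evaluator defined for cross-validation.
rmse = evaluator.evaluate(predictions)
print("RMSE on the test set: %g" % rmse)

# The last pipeline stage is the fitted CrossValidatorModel.
cvModel = pipelineModel.stages[-1]

# avgMetrics holds the mean cross-validation metric for each entry in paramGrid.
# For RMSE, lower is better, so take the index of the smallest value.
bestIndex = min(range(len(cvModel.avgMetrics)), key=lambda i: cvModel.avgMetrics[i])
print("Best params:", cvModel.getEstimatorParamMaps()[bestIndex])
print("Best CV RMSE: %g" % cvModel.avgMetrics[bestIndex])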