# MAGIC Reference: [Decision Trees](https://en.wikipedia.org/wiki/Decision_tree_learning)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Decision Tree Models

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import ParamGridBuilder

dt = DecisionTreeRegressor()
dt.setLabelCol("PE")
dt.setPredictionCol("Predicted_PE")
dt.setFeaturesCol("features")

# Wrap the vectorizer and the decision tree in a new pipeline.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])

# Let's just reuse our CrossValidator with the new dtPipeline.
crossval.setEstimator(dtPipeline)

# Grid-search over tree depths from 2 through 7.
paramGrid = ParamGridBuilder()\
  .addGrid(dt.maxDepth, range(2, 8))\
  .build()
crossval.setEstimatorParamMaps(paramGrid)

dtModel = crossval.fit(trainingSet)
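
# COMMAND ----------

# MAGIC %md
# MAGIC Once the cross-validation finishes, we can peek at the winning tree and score it on held-out data. This is a quick sketch: it assumes the held-out `testSet` DataFrame and the RMSE-configured `regEval` RegressionEvaluator from earlier cells of this notebook.

# COMMAND ----------

# The best PipelineModel found by the grid search; its last stage is the fitted tree.
bestDTPipeline = dtModel.bestModel
bestDT = bestDTPipeline.stages[-1]
print("Best maxDepth:", bestDT.getOrDefault("maxDepth"))

# Score the tuned pipeline on the held-out test set.
dtPredictionsDF = bestDTPipeline.transform(testSet)
print("Test RMSE:", regEval.evaluate(dtPredictionsDF))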

# COMMAND ----------

# MAGIC %md
# MAGIC In Spark, data is partitioned by row. So when the algorithm needs to make a split, each worker has to compute summary statistics for every feature at each candidate split point, and those statistics then have to be aggregated (via a tree reduce) before a split can be chosen.
# MAGIC
# MAGIC Think about it: if worker 1 has the value `32` but none of the other workers do, how would they communicate how good a split at that value would be? To avoid this, Spark's maxBins parameter discretizes continuous variables into a fixed set of buckets (32 by default), so every worker evaluates the same candidate split points. The same limit applies to categorical features: maxBins must be at least as large as the number of categories in any categorical feature.
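
# COMMAND ----------

# MAGIC %md
# MAGIC To see the categorical constraint in action, here is a self-contained toy sketch (it relies only on the notebook's `spark` session; the data and column names are made up for illustration): the fit fails when a categorical feature has more distinct values than maxBins, and succeeds once maxBins is raised.

# COMMAND ----------

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# A toy DataFrame whose single categorical column has 50 distinct values.
toyDF = spark.createDataFrame([(str(i % 50), float(i)) for i in range(500)],
                              ["category", "label"])
indexedDF = StringIndexer(inputCol="category", outputCol="categoryIdx").fit(toyDF).transform(toyDF)
assembledDF = VectorAssembler(inputCols=["categoryIdx"], outputCol="features").transform(indexedDF)

try:
  # Default maxBins is 32, but the categorical feature has 50 values, so this fails.
  DecisionTreeRegressor(labelCol="label").fit(assembledDF)
except Exception as e:
  print(e)

# Raising maxBins to at least the number of categories fixes it.
treeModel = DecisionTreeRegressor(labelCol="label", maxBins=50).fit(assembledDF)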

# COMMAND ----------

# MAGIC %md
# MAGIC Let's go ahead and increase maxBins from the default of `32` to `40`.

# COMMAND ----------

dt.setMaxBins(40)

# COMMAND ----------

# MAGIC %md
# MAGIC Take two: rerun the cross-validated fit now that maxBins is larger.

# COMMAND ----------

dtModel = crossval.fit(trainingSet)
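
# COMMAND ----------

# MAGIC %md
# MAGIC A quick look at the shape of the refit tree before we visualize it. This sketch uses only the `dtModel` created in the cell above.

# COMMAND ----------

# Pull the fitted tree out of the best pipeline and summarize its shape.
bestTree = dtModel.bestModel.stages[-1]
print("depth = %d, nodes = %d" % (bestTree.depth, bestTree.numNodes))
print(bestTree.featureImportances)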

# COMMAND ----------

# MAGIC %md
# MAGIC ## Visualize the Decision Tree