Example #1
0
# Persist the best decision-tree results as Parquet, replacing any prior run.
# NOTE: dropped header=True — `header` is a CSV-only option and is silently
# ignored by the Parquet writer, so keeping it only misleads readers.
resultsBestDtDf.write.save('/mnt/data/resultsBestDtDf.parquet',
                           format='parquet',
                           mode="overwrite")

# COMMAND ----------

# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor

# Create a RandomForestRegressor
rf = RandomForestRegressor()

# Configure the forest: predictions go to "Prediction_cuisine", the label is
# the column named "6714", and inputs come from the assembled "features"
# vector. Seed fixed for reproducibility.
# FIX: the seed was written as 190088121L — the Python 2 long-literal suffix
# is a SyntaxError in Python 3, where pyspark runs; a plain int is correct.
rf.setPredictionCol("Prediction_cuisine")\
  .setLabelCol("6714")\
  .setFeaturesCol("features")\
  .setSeed(190088121)\
  .setMaxDepth(8)\
  .setNumTrees(25)

# Create a Pipeline
rfPipeline = Pipeline()

# Set the stages of the Pipeline: vectorize features, then fit the forest
rfPipeline.setStages([vectorizer, rf])

# Let's first train on the entire dataset to see what we get
rfModel = rfPipeline.fit(trainingSetDF)

# COMMAND ----------
Example #2
0
# Keep the 80/20 splits under their working names.
trainingSetDF = split80DF
testSetDF = split20DF

# Cache both sets so repeated passes over the data are fast.
for _df in (trainingSetDF, testSetDF):
    _df.cache()

# Random-forest regression

rf = RandomForestRegressor()

# For details on the available parameters: print(rf.explainParams())

rf.setPredictionCol('Predicted_PE') \
  .setNumTrees(20) \
  .setMaxDepth(5) \
  .setLabelCol('PE')

# Forest pipeline: vectorize the features, then fit the forest.

pipeline = Pipeline(stages=[vectorizer, rf])

# Train the model on the training split.

model = pipeline.fit(trainingSetDF)

# Podemos ver los detalles del árbol creado:

"""
    print("Nodos: " + str(model.stages[-1]._java_obj.parent().getNumTrees()))
    print("Profundidad: "+ str(model.stages[-1]._java_obj.parent().getMaxDepth()))