# We need to specify four components to perform hyperparameter tuning using
# grid search:
# * Estimator (i.e. machine learning algorithm)
# * Hyperparameter grid
# * Evaluator
# * Validation method


# ## Specify the estimator

# In this example we will use lasso linear regression as our estimator:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="star_rating", elasticNetParam=1.0)

# Use the `explainParams` method to see the list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to $l1$ (lasso) regularization for linear regression.
# We want to find a reasonable value for the regularization strength, which is held in the `regParam` parameter.
# [Elastic_net](https://en.wikipedia.org/wiki/Elastic_net_regularization)

# ## Specify the hyperparameter grid
# 

# Use the following class to specify the grid of hyperparameters:
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
from pyspark.ml.tuning import ParamGridBuilder
regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
grid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()

# The resulting object is simply a list of param maps:
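
# The remaining two components from the list above are the evaluator and the
# validation method. Below is a minimal sketch, assuming a DataFrame named
# `reviews_df` (a hypothetical name) containing the "features" and
# "star_rating" columns used by `lr`:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

# Evaluator: score each candidate model by RMSE
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="star_rating", metricName="rmse")

# Validation method: k-fold cross-validation over the parameter grid
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)

# cvModel = cv.fit(reviews_df)   # fits one model per param map per fold (hypothetical DataFrame)
# print(cvModel.avgMetrics)      # average RMSE for each value in regParamList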
# MAGIC * Set the features column to "features"
# MAGIC * Set the "ElasticNetParam" to 0.5 (this controlls the mix of l1 and l2 regularization--we'll just use an equal amount of each)
# MAGIC * Print the results of calling `explainParams` on `lrModel`.  This will show you all the possible parameters, and whether or not you have customized them.

# COMMAND ----------

from pyspark.ml.regression import LinearRegression

lrModel = LinearRegression()\
  .setLabelCol("count")\
  .setFeaturesCol("features")\
  .setElasticNetParam(0.5)

print("Printing out the model Parameters:")
print("-" * 20)
print(lrModel.explainParams())
print("-" * 20)

# COMMAND ----------

# MAGIC %md
# MAGIC * Use the `fit` method on `lrModel` to provide the `training` dataset for fitting.
# MAGIC * Store the results in `lrFitted`.

# COMMAND ----------

lrFitted = lrModel.fit(training)
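
# COMMAND ----------

# MAGIC %md
# MAGIC The fitted model is itself a transformer. Below is a minimal sketch (not part of the original exercise) that applies it back to the `training` data to inspect the predictions it produces; the column names follow the setup above.

# COMMAND ----------

predictionsDF = lrFitted.transform(training)
predictionsDF.select("count", "prediction").show(5)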

# COMMAND ----------

# MAGIC %md Load the saved regression dataset.

# COMMAND ----------

df = spark.read.load("/data/regression")


# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)


# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
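
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch (assuming the same `df` loaded above) of fitting the generalized linear model and inspecting the extra link-prediction column requested via `setLinkPredictionCol("linkOut")`:

# COMMAND ----------

glrModel = glr.fit(df)
glrModel.transform(df).select("prediction", "linkOut").show(5)
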
# cache the split datasets
trainSetDF = split85DF.cache()   # assuming the 85% split is the training set
testSetDF = split15DF.cache()    # and the 15% split is the test set

###########################
# model building in spark #
###########################
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# model constructor 
lr = LinearRegression()

# understand model parameters
print(lr.explainParams())

# individually call the setter for each param, like the following:
# rename the prediction column and point the label at the "PE" column in the df
lr.setPredictionCol("Prediction_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.15)

###########################
# create a pipeline
# - a pipeline contains a series of stages executed in sequence
# - each stage is either an estimator or a transformer
# - pipeline.fit() runs each stage in order, calling:
#	* estimator.fit() on estimator stages
#	* transformer.transform() on transformer stages
# - the fitted result is a PipelineModel (see the sketch below)
###########################
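# A minimal sketch of the pipeline described above. The VectorAssembler stage and its
# input column names ("AT", "V", "AP", "RH") are assumptions for illustration; only the
# lr stage comes from the code above.
from pyspark.ml.feature import VectorAssembler

# transformer stage: build the "features" vector that lr expects (assumed input columns)
assembler = VectorAssembler(inputCols=["AT", "V", "AP", "RH"], outputCol="features")

# stages run in the order given: assembler first, then the linear regression estimator
pipeline = Pipeline(stages=[assembler, lr])

# pipelineModel = pipeline.fit(trainSetDF)             # fits each estimator stage in turn
# predictionsDF = pipelineModel.transform(testSetDF)   # applies every fitted stage
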
# COMMAND ----------

# Let's cache these datasets for performance
testSet = split20.cache()
trainingSet = split80.cache()
print "Test count: ", testSet.count()
print "Training count: ", trainingSet.count()

# COMMAND ----------

from pyspark.ml.regression import LinearRegression

lr = LinearRegression()

print(lr.explainParams())

# COMMAND ----------

# MAGIC %md The cell below is based on the Spark ML pipeline API. More information can be found in the Spark ML Programming Guide at https://spark.apache.org/docs/latest/ml-guide.html

# COMMAND ----------

from pyspark.ml import Pipeline

# Now we set the parameters for the method
lr.setPredictionCol("Predicted_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.1)
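
# COMMAND ----------

# MAGIC %md
# MAGIC The `Pipeline` import above is not used yet in this cell. Below is a minimal sketch of wiring `lr` into a pipeline and fitting it; the `vectorizer` stage is a hypothetical feature-building transformer (e.g. a VectorAssembler) assumed to be defined in an earlier cell of the original notebook.

# COMMAND ----------

# `vectorizer` is an assumed name, not defined in the cells shown here
lrPipeline = Pipeline(stages=[vectorizer, lr])

# Fitting the pipeline fits each estimator stage in order and returns a PipelineModel
lrPipelineModel = lrPipeline.fit(trainingSet)
predictionsAndLabelsDF = lrPipelineModel.transform(testSet)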