# We need to specify four components to perform hyperparameter tuning using
# grid search:
# * Estimator (i.e. machine learning algorithm)
# * Hyperparameter grid
# * Evaluator
# * Validation method

# ## Specify the estimator

# In this example we use lasso linear regression as our estimator:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="star_rating", elasticNetParam=1.0)

# Use the `explainParams` method to see the list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to the lasso ($l1$-regularized) linear regression model.
# We want to find a reasonable value for the regularization strength, which is stored in `regParam`.
# [Elastic_net](https://en.wikipedia.org/wiki/Elastic_net_regularization)

# ## Specify the hyperparameter grid

# Use the
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
# class to specify the hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder
regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
grid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()

# The resulting object is simply a list of param maps:
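# For example, iterating over `grid` (a quick sketch using the objects built
# above) prints one param map per candidate `regParam` value:
for paramMap in grid:
    print(paramMap)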
# MAGIC * Set the features column to "features"
# MAGIC * Set the "ElasticNetParam" to 0.5 (this controls the mix of l1 and l2 regularization--we'll just use an equal amount of each)
# MAGIC * Print the results of calling `explainParams` on `lrModel`. This will show you all the possible parameters, and whether or not you have customized them.

# COMMAND ----------

from pyspark.ml.regression import LinearRegression

lrModel = LinearRegression()\
  .setLabelCol("count")\
  .setFeaturesCol("features")\
  .setElasticNetParam(0.5)

print("Printing out the model parameters:")
print("-" * 20)
print(lrModel.explainParams())
print("-" * 20)

# COMMAND ----------

# MAGIC %md
# MAGIC * Use the `fit` method on `lrModel` to provide the `training` dataset for fitting.
# MAGIC * Store the results in `lrFitted`.

# COMMAND ----------

lrFitted = lrModel.fit(training)

# COMMAND ----------

# MAGIC %md
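# MAGIC Once fitted, `lrFitted` is a transformer: `transform` appends a `prediction` column. The cell below is a minimal sketch (it reuses the `training` DataFrame from above; in practice you would score a held-out set):

# COMMAND ----------

# Apply the fitted model; transform() adds a "prediction" column by default
predictions = lrFitted.transform(training)
predictions.select("count", "prediction").show(5)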
df = spark.read.load("/data/regression")

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)

# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
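# COMMAND ----------

# A minimal sketch of fitting the GLR estimator on the same data (assumes the
# `df` loaded above). Its training summary exposes coefficient standard errors
# and p-values:
glrModel = glr.fit(df)
glrSummary = glrModel.summary
print(glrSummary.coefficientStandardErrors)
print(glrSummary.pValues)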
# cache the split datasets
# (split85DF holds 85% of the rows for training, split15DF the remaining 15% for testing)
testSetDF = split15DF.cache()
trainSetDF = split85DF.cache()

###########################
# model building in spark #
###########################

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# model constructor
lr = LinearRegression()

# understand model parameters
print(lr.explainParams())

# individually call a setter for each param, like the following
# (setPredictionCol renames the prediction column; "PE" is the label col in the df;
#  note that comments cannot follow a line-continuation backslash)
lr.setPredictionCol("Prediction_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.15)

###########################
# create a pipeline
# - a pipeline contains a series of stages executed in sequence
# - each stage is either an estimator or a transformer
# - for each stage, pipeline.fit() calls one of the following:
#   * estimator.fit()
#   * transformer.transform()
# - the fitted result is a pipelineModel
###########################
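# A minimal sketch of assembling and fitting such a pipeline. The VectorAssembler
# stage and its input column names are assumptions for illustration (the label
# "PE" suggests the power-plant dataset, but adjust to your schema):
from pyspark.ml.feature import VectorAssembler

vectorizer = VectorAssembler(inputCols=["AT", "V", "AP", "RH"], outputCol="features")

# stages run in order: the assembler's transform() output feeds lr's fit()
lrPipeline = Pipeline(stages=[vectorizer, lr])
pipelineModel = lrPipeline.fit(trainSetDF)

# the fitted pipelineModel is itself a transformer
predictionsDF = pipelineModel.transform(testSetDF)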
# COMMAND ----------

# Let's cache these datasets for performance
testSet = split20.cache()
trainingSet = split80.cache()

print("Test count: ", testSet.count())
print("Training count: ", trainingSet.count())

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression()
print(lr.explainParams())

# COMMAND ----------

# MAGIC %md The cell below is based on the Spark ML pipeline API. More information can be found in the Spark ML Programming Guide at https://spark.apache.org/docs/latest/ml-guide.html

# COMMAND ----------

from pyspark.ml import Pipeline

# Now we set the parameters for the method
lr.setPredictionCol("Predicted_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.1)
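# COMMAND ----------

# A minimal sketch of finishing this pipeline and evaluating it on the held-out
# testSet. The `vectorizer` VectorAssembler and its input columns are assumed
# for illustration; RegressionEvaluator then computes RMSE on the predicted column.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

vectorizer = VectorAssembler(inputCols=["AT", "V", "AP", "RH"], outputCol="features")
lrPipeline = Pipeline(stages=[vectorizer, lr])
lrPipelineModel = lrPipeline.fit(trainingSet)

resultsDF = lrPipelineModel.transform(testSet)
evaluator = RegressionEvaluator(predictionCol="Predicted_PE", labelCol="PE", metricName="rmse")
print("RMSE: ", evaluator.evaluate(resultsDF))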