# Cache both splits; assuming the 85% split is the training set (the original
# assigned it to testSetDF, which looks like a naming slip)
trainSetDF = split85DF.cache()
testSetDF = split15DF.cache()

###########################
# model building in spark #
###########################
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# model constructor
lr = LinearRegression()

# understand model parameters
lr.explainParams()

# call the setter for each parameter individually, like the following
# (inline comments cannot follow a line-continuation backslash, so they go here:
# setPredictionCol renames the prediction column; setLabelCol names the label column in the df)
lr.setPredictionCol("Prediction_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.15)

###########################
# create a pipeline
# - a pipeline contains a series of stages executed in sequence
# - each stage is either an estimator or a transformer
# - pipeline.fit() runs each stage in order:
#   * an estimator stage calls estimator.fit(), producing a transformer
#   * a transformer stage calls transformer.transform() on the DataFrame
# - the fitted pipeline is a PipelineModel
###########################
lrPipeline = Pipeline()
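# A minimal sketch of wiring up and fitting the pipeline described above,
# assuming a VectorAssembler named `vectorizer` (not defined in this snippet)
# produces the "features" column that lr expects:
lrPipeline.setStages([vectorizer, lr])        # transformer stage, then estimator stage
lrPipelineModel = lrPipeline.fit(trainSetDF)  # fit() returns a PipelineModel
predictionsDF = lrPipelineModel.transform(testSetDF)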
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

df_featured = df_featured.select(
    col("dem").alias("label"),
    col("ts"), col("id"),
    col("hour"), col("weekday"), col("pro_lag1"), col("pre_lag1"),
    col("pro"), col("pre")).filter(col("pro_lag1") > 0)
df_featured.printSchema()

training_seti = df_featured.select(col("pro_lag1"), col("pre_lag1"),
                                   col("hour"), col("ts"), col("label"))

# Assemble the feature columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()
lr.setPredictionCol("prediction")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the spark.ml pipeline API. If you have worked with scikit-learn, this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

lrModel = lrPipeline.fit(training_seti)
predicted_df = lrModel.transform(training_seti)
# display(predicted_df)

# Score the current (non-lagged) values by aliasing them to the lagged feature names
test_seti = df_featured.select(col("pro").alias("pro_lag1"),
                               col("pre").alias("pre_lag1"),
                               col("hour"), col("ts"))
predicted_test_df = lrModel.transform(test_seti)
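# A hedged sketch of checking the fit on the training predictions with
# pyspark's RegressionEvaluator; the metric choice (RMSE) is an assumption,
# not part of the original snippet. Note test_seti carries no label column,
# so only predicted_df can be scored here:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predicted_df)  # training-set RMSE
print("Training RMSE: %.3f" % rmse)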
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Convert the list of feature columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(['Atmospheric_Temperature', 'Vacuum_Speed',
                         'Atmospheric_Pressure', 'Relative_Humidity'])
vectorizer.setOutputCol('features')

# Split the dataset into training and test sets in an 80% - 20% ratio
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a Linear Regression model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE')\
  .setLabelCol('Power_Output')\
  .setMaxIter(100)\
  .setRegParam(0.1)

# Create an ML Pipeline and set the stages
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

# Train the model on the training dataset
lrModel = lrPipeline.fit(trainSetDF)

# Get the intercept and coefficients of the fitted equation
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Get the list of column names except the label column
features = [col for col in trainSetDF.columns if col != "Power_Output"]
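# A minimal sketch of printing the fitted linear equation from the intercept,
# weights, and feature names computed above; it assumes the order of
# `features` matches the VectorAssembler's input columns:
equation = "Power_Output = {:.3f}".format(intercept)
for feat, w in zip(features, weights):
    equation += " + ({:.3f} * {})".format(w, feat)
print(equation)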
# ***** LINEAR REGRESSION MODEL *****
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Let's initialize our linear regression learner
lr = LinearRegression()

# Use explainParams to dump the parameters we can set
print(lr.explainParams())

# Now we set the parameters for the method
lr.setPredictionCol("Predicted_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the spark.ml pipeline API. If you have worked with scikit-learn, this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

# Let's first train on the training dataset to see what we get
lrModel = lrPipeline.fit(trainingSetDF)

# The intercept is as follows:
intercept = lrModel.stages[1].intercept

# The coefficients (i.e., weights) are as follows:
weights = lrModel.stages[1].coefficients
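# A hedged sketch of scoring held-out data with the fitted pipeline and
# computing RMSE; the DataFrame name `testSetDF` is an assumption, since this
# snippet only shows training:
from pyspark.ml.evaluation import RegressionEvaluator

predictionsDF = lrModel.transform(testSetDF)
evaluator = RegressionEvaluator(labelCol="PE",
                                predictionCol="Predicted_PE",
                                metricName="rmse")
print("Test RMSE:", evaluator.evaluate(predictionsDF))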
# See which parameters are available
print(lr.explainParams())

# Two parameters are not optional:
# - the label column, set to "PE" (i.e. the known values to learn from)
# - the prediction column, set to "Predicted_PE" (i.e. where the predicted
#   values should be stored)

# In[18]:

lr.setPredictionCol("Predicted_PE") \
  .setLabelCol("PE")

# We will also configure two parameters which are customary for linear
# regression:
# - the maximum number of iterations, set to 100
# - the regularization parameter, set to 0.1

# In[19]:

lr.setMaxIter(100) \
  .setRegParam(0.1)

# ## Part 8 Create a pipeline
#
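# A minimal sketch of the pipeline this part builds, assuming a
# VectorAssembler named `vectorizer` and a training DataFrame named
# `trainingSetDF` were created earlier in the notebook:
from pyspark.ml import Pipeline

lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
lrModel = lrPipeline.fit(trainingSetDF)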
# COMMAND ----------

# ***** LINEAR REGRESSION MODEL *****
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Let's initialize our linear regression learner
lr = LinearRegression()

# COMMAND ----------

# Now we set the parameters for the method
lr.setPredictionCol("predicted_meter_reading")\
  .setLabelCol("meter_reading")\
  .setMaxIter(100)\
  .setRegParam(0.15)

# We will use the spark.ml pipeline API. If you have worked with scikit-learn, this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

# Let's first train on the training dataset to see what we get
lrModel = lrPipeline.fit(trainingSetDF)

# COMMAND ----------

# The intercept is as follows:
intercept = lrModel.stages[1].intercept
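# COMMAND ----------

# A hedged sketch of pulling the fitted LinearRegressionModel out of the
# PipelineModel and inspecting its training summary; availability of the
# summary on a pipeline-fitted stage is an assumption about the Spark
# version in use:
lrStage = lrModel.stages[1]  # the fitted LinearRegressionModel
print("Coefficients:", lrStage.coefficients)
print("Training RMSE:", lrStage.summary.rootMeanSquaredError)
print("Training r2:  ", lrStage.summary.r2)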