# COMMAND ----------

# Build an (initially empty) pipeline and a vector assembler, then list the
# parameters each of them accepts.
from pyspark.ml.feature import VectorAssembler

# NOTE(review): `Pipeline` is not imported in this cell; presumably an earlier
# cell does `from pyspark.ml import Pipeline` -- confirm.
pipeline = Pipeline()
assembler = VectorAssembler()

# Parenthesised single-argument print calls behave the same on Python 2 and 3;
# the original `print x` statement form is a syntax error on Python 3.
print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Set assembler params: combine the two bucketized columns into one vector
# column named 'featuresBucketized'.
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Run both bucketizers first, then the assembler that consumes their outputs.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(
    irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------

# TEST
from pyspark.mllib.linalg import Vectors

firstAssembly = irisAssembled.select('lengthFeatures', 'widthFeatures',
                                     'featuresBucketized').first()
# The assembled vector of the first row must equal the pair of source values.
Test.assertTrue(
    all(firstAssembly[2].toArray() == [firstAssembly[0], firstAssembly[1]]),
    'incorrect value for column featuresBucketized')
# Build an (initially empty) pipeline and a vector assembler, then list the
# parameters each of them accepts.
from pyspark.ml.feature import VectorAssembler

# NOTE(review): `Pipeline` is not imported in this cell; presumably an earlier
# cell does `from pyspark.ml import Pipeline` -- confirm.
pipeline = Pipeline()
assembler = VectorAssembler()

# Parenthesised single-argument print calls behave the same on Python 2 and 3;
# the original `print x` statement form is a syntax error on Python 3.
print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Set assembler params: combine the two bucketized columns into one vector
# column named 'featuresBucketized'.
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Run both bucketizers first, then the assembler that consumes their outputs.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------

# TEST
from pyspark.mllib.linalg import Vectors

firstAssembly = irisAssembled.select('lengthFeatures', 'widthFeatures',
                                     'featuresBucketized').first()
# The assembled vector of the first row must equal the pair of source values.
Test.assertTrue(all(firstAssembly[2].toArray() == [firstAssembly[0], firstAssembly[1]]),
                'incorrect value for column featuresBucketized')

# COMMAND ----------

# MAGIC %md
# MAGIC ## Part 4
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

# Build an (initially empty) pipeline and a vector assembler, then list the
# parameters each of them accepts.
from pyspark.ml.feature import VectorAssembler

# NOTE(review): `Pipeline` is not imported in this cell; presumably an earlier
# cell does `from pyspark.ml import Pipeline` -- confirm.
pipeline = Pipeline()
assembler = VectorAssembler()

# Parenthesised single-argument print calls behave the same on Python 2 and 3;
# the original `print x` statement form is a syntax error on Python 3.
print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Set assembler params: combine the two bucketized columns into one vector
# column named 'featuresBucketized'.
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Run both bucketizers first, then the assembler that consumes their outputs.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------

# TEST
from pyspark.mllib.linalg import Vectors

firstAssembly = irisAssembled.select('lengthFeatures', 'widthFeatures',
                                     'featuresBucketized').first()
# The assembled vector of the first row must equal the pair of source values.
Test.assertTrue(all(firstAssembly[2].toArray() == [firstAssembly[0], firstAssembly[1]]),
                'incorrect value for column featuresBucketized')
# In[18]:

# Root-mean-squared error of the "prediction" column against the "EP" label
# column of df_pred. The value is not stored; as a bare expression it is only
# shown when run as a notebook cell.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="EP",
)
evaluator.evaluate(df_pred)


# In[19]:

from pyspark.ml.pipeline import Pipeline, PipelineModel


# In[20]:

# Print the pipeline's parameters while it is still unconfigured, then set the
# two stages and fit on df in a single chained call (setStages returns the
# pipeline itself).
pipeline = Pipeline()
print(pipeline.explainParams())
pipelineModel = pipeline.setStages([vectorizer, lr]).fit(df)


# In[21]:

# Inspect the configured (unfitted) stages.
pipeline.getStages()


# In[22]:

# Index 1 of the fitted stages corresponds to `lr`, the second stage set above.
lr_model = pipelineModel.stages[1]
lr_model.coefficients


# In[23]:

# Apply every fitted stage to df and print the resulting rows.
pipelineModel.transform(df).show()