Example #1
0
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.
# MAGIC  
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC  
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC  
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Create an empty Pipeline and VectorAssembler; their params (input/output
# columns, stages) are set in the answer cell below.  `Pipeline` is imported
# in an earlier cell of this notebook.
pipeline = Pipeline()
assembler = VectorAssembler()

# Show the configurable params of each estimator.  Use the print() function
# form so this cell matches the rest of the notebook (later cells already
# use Python 3 style print()).
print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Configure the assembler: merge the two bucketized columns into one vector
# column named "featuresBucketized".
assembler.setInputCols(['lengthFeatures', 'widthFeatures'])
assembler.setOutputCol('featuresBucketized')

# Run both bucketizers followed by the assembler as a single pipeline,
# then fit and apply it to produce the assembled DataFrame.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
fittedPipeline = pipeline.fit(irisSeparateFeatures)
irisAssembled = fittedPipeline.transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.  VectorAssembler can be found in [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) for Python and the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler) package for Scala.
# MAGIC
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Create an empty Pipeline and VectorAssembler; their params are configured
# in the answer cell below.  `Pipeline` is imported in an earlier cell.
pipeline = Pipeline()
assembler = VectorAssembler()

# Show the configurable params of each estimator.  Use print() function
# calls for consistency with the Python 3 style used later in this file.
print(assembler.explainParams())
print('\n' + pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Assemble the two bucketized columns into a single "featuresBucketized"
# vector column.
(assembler
 .setInputCols(['lengthFeatures', 'widthFeatures'])
 .setOutputCol('featuresBucketized'))

# Chain both bucketizers and the assembler, then fit the pipeline and
# transform the input to build the assembled DataFrame.
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
fitted = pipeline.fit(irisSeparateFeatures)
irisAssembled = fitted.transform(irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------
Example #3
0
# In[18]:

# Score the predictions: root-mean-squared error of the "prediction" column
# against the "EP" label column.
evaluator = RegressionEvaluator(
    labelCol="EP", predictionCol="prediction", metricName="rmse")
evaluator.evaluate(df_pred)

# In[19]:

from pyspark.ml.pipeline import Pipeline, PipelineModel

# In[20]:

# Build a two-stage pipeline (feature vectorizer -> linear regression).
# explainParams() is printed before the stages are set, so it shows the
# pipeline's params in their unset state.
pipeline = Pipeline()
print(pipeline.explainParams())
stages = [vectorizer, lr]
pipeline.setStages(stages)
pipelineModel = pipeline.fit(df)

# In[21]:

# Inspect the stages currently set on the (unfitted) Pipeline.
pipeline.getStages()

# In[22]:

# The fitted pipeline's stages mirror the stages set above; index 1 is the
# fitted linear-regression model.  Look at its learned coefficients.
fitted_stages = pipelineModel.stages
lr_model = fitted_stages[1]
lr_model.coefficients

# In[23]:

# Apply the fitted pipeline to the DataFrame and display the result.
predictions = pipelineModel.transform(df)
predictions.show()