def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
             (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
             (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
        model = PolynomialExpansion(degree=2,
                                    inputCol="dense",
                                    outputCol="expanded")

        # the input tensor name must match PolynomialExpansion's inputCol ("dense")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml PolynomialExpansion',
            [('dense', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().expanded.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().dense.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPolynomialExpansion")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['expanded'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #2
def polynomial_expansion(self, df, column):
    """
    Build polynomial features column-wise with PolynomialExpansion.
    """
    print('PolynomialExpansionExample')
    # Cross the column's features to build polynomial terms.
    # degree 1: x1 x2
    # degree 2: x1 x2 x1x2 x1^2 x2^2
    # degree 3: x1 x2 x1x2 x1^2 x2^2 x1^2x2 x1x2^2 x1^3 x2^3
    # (a term-count sanity check follows this function)
    polyExpansion = PolynomialExpansion(degree=2,
                                        inputCol=column,
                                        outputCol=column + '_poly')
    polyDF = polyExpansion.transform(df)
    return polyDF
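For reference, the degree listings in the comment above follow a simple count: with n input features and degree d, PolynomialExpansion produces C(n + d, d) - 1 terms (every monomial of total degree 1 through d). A minimal sanity-check sketch, assuming an active SparkSession named spark and Python 3.8+ for math.comb:

from math import comb

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

# two features expanded at degree 3 -> C(2 + 3, 3) - 1 = 9 terms
check_df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])
check_pe = PolynomialExpansion(degree=3, inputCol="features", outputCol="poly")
assert check_pe.transform(check_df).first()["poly"].size == comb(2 + 3, 3) - 1  # 9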
def polynomial_expansion_usecase():
    """
    Expand the data features with polynomial terms.
    """
    spark = getSparkSession()

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                                (Vectors.dense([0.0, 0.0]), ),
                                (Vectors.dense([3.0, -1.0]), )], ["features"])

    polyExpansion = PolynomialExpansion(degree=3,
                                        inputCol="features",
                                        outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)

    polyDF.show(truncate=False)
Example #4
def train_new_feature_pipeline(df: DataFrame,
                               degree: int = 3) -> PipelineModel:
    """Create a new feature pipeline and fit to training data

    :param df: raw Iris spark sql data frame
    :type df: DataFrame
    :param degree: degree of polynomial feature expansion
    :type degree: int

    :returns: fitted feature pipeline
    :rtype: PipelineModel    
    """
    assembler = VectorAssembler(
        inputCols=[
            "sepal_length_cm",
            "sepal_width_cm",
            "petal_length_cm",
            "petal_width_cm",
        ],
        outputCol="features",
    )
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=True)
    polyExpansion = PolynomialExpansion(degree=degree,
                                        inputCol="scaledFeatures",
                                        outputCol="polyFeatures")
    pipeline = Pipeline(stages=[assembler, scaler, polyExpansion])
    pipeline_model = pipeline.fit(df)
    return pipeline_model
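A hedged usage sketch for the pipeline factory above; the CSV path is hypothetical, the file is assumed to contain the four Iris columns listed in the assembler, and spark is an active SparkSession:

# usage sketch (hypothetical path; columns must match the assembler's inputCols)
iris_df = spark.read.csv("data/iris.csv", header=True, inferSchema=True)
feature_pipeline = train_new_feature_pipeline(iris_df, degree=2)
feature_pipeline.transform(iris_df).select("polyFeatures").show(5, truncate=False)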
def fit(season):

    cols = [
        "air_temperature_est", "dew_temperature_est", "wind_speed_est",
        "meter", "month", "day", "hour"
    ]
    imputer = Imputer(
        strategy="median",
        inputCols=["air_temperature", "dew_temperature", "wind_speed"],
        outputCols=[
            "air_temperature_est", "dew_temperature_est", "wind_speed_est"
        ])
    vector = VectorAssembler(inputCols=cols,
                             outputCol="vector",
                             handleInvalid="error")
    poly = PolynomialExpansion(degree=4,
                               inputCol="vector",
                               outputCol="features")
    regression = LinearRegression(featuresCol="features",
                                  labelCol="meter_reading",
                                  predictionCol="prediction")
    pipeline = Pipeline(stages=[imputer, vector, poly, regression])
    evaluator = RegressionEvaluator(labelCol="meter_reading",
                                    predictionCol="prediction",
                                    metricName="rmse")

    params = ParamGridBuilder() \
       .addGrid(imputer.strategy, ["mean", "median"]) \
       .addGrid(poly.degree, [3]) \
       .addGrid(regression.fitIntercept, [True, False]) \
       .addGrid(regression.maxIter, [100]) \
       .addGrid(regression.standardization, [True, False]) \
       .build()

    validator = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=params,
                               evaluator=evaluator,
                               numFolds=3,
                               seed=51)
    model = validator.fit(season)

    return model.bestModel
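A hedged usage sketch for fit(); the season_df frame, its columns, and the holdout split are assumptions, and RegressionEvaluator is the same class already used inside fit():

# usage sketch (assumes season_df carries the weather columns and meter_reading label used above)
train_df, holdout_df = season_df.randomSplit([0.8, 0.2], seed=51)
best_model = fit(train_df)
holdout_pred = best_model.transform(holdout_df)
rmse = RegressionEvaluator(labelCol="meter_reading",
                           predictionCol="prediction",
                           metricName="rmse").evaluate(holdout_pred)
print("holdout RMSE: {:.3f}".format(rmse))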

if __name__ == "__main__":
    train_ratio = 0.8
    test_ratio = 1 - train_ratio

    # create SparkSession - the entry to the cluster
    spark = SparkSession.builder.master("spark://192.168.50.10:7077").appName("Linear regression with pipeline - Boston").getOrCreate()

    data = prepare_data("BostonHousing.csv")

    # split data into train and test DataFrames
    train, test = data.randomSplit([train_ratio, test_ratio])

    poly_exp = PolynomialExpansion(degree=3, inputCol="features", outputCol="poly_features")

    lr = LinearRegression(regParam=0.1, featuresCol="poly_features")

    pipeline = Pipeline(stages=[poly_exp, lr])
    # fit the model
    model = pipeline.fit(train)

    evaluator = RegressionEvaluator()

    prediction_and_labels = model.transform(train).select("prediction", "label")
    print("Precision train: " + str(evaluator.evaluate(prediction_and_labels)))

    prediction_and_labels = model.transform(test).select("prediction", "label")
    print("Precision test: " + str(evaluator.evaluate(prediction_and_labels)))
Example #7
from ast import literal_eval

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import PolynomialExpansion, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def get_spark():
  return SparkSession.builder.appName("polyreg").getOrCreate()



vass = VectorAssembler(inputCols=["x"], outputCol="feat")
poly = PolynomialExpansion(degree=3, inputCol="feat", outputCol="features")
lm   = LinearRegression(maxIter=5, regParam=0.0, solver="normal")

# Configure an ML pipeline
# Assignment by string to simplify the getParams() function
stages=["vass","poly","lm"]
pipe = Pipeline(stages=[eval(s) for s in stages])

def printParams():
  """Print method that will be called in parser script"""
  txt = "Model Pipeline Parameters:"
  for i in range(len(stages)):
    txt += "\n Stage "+str(i).zfill(2)+": "+stages[i]
    pm = pipe.getStages()[i].extractParamMap()
    for p in pm.keys():
      txt += "\n           "+stages[i]+"."+p.name+": "+str(pm[p])
  print(txt)
Example #8
                          "pclass_imputed", "sibsp_imputed", "parch_imputed",
                          "sexIndexed_imputed", "embarkedIndexed_imputed",
                          "age_imputed", "fare_imputed"
                      ])

    # Step - 4: Make Vectors from dataframe's columns using special Vector Assembler
    assembler = VectorAssembler(inputCols=[
        "pclass_imputed", "sibsp_imputed", "parch_imputed",
        "sexIndexed_imputed", "embarkedIndexed_imputed", "age_imputed",
        "fare_imputed"
    ],
                                outputCol="unscaled_features")

    # Step - 5: Define Polynomial Expansion with degree=2
    polyExpansion = PolynomialExpansion(degree=2,
                                        inputCol="unscaled_features",
                                        outputCol="polyFeatures")

    # Step - 6: Define Scaler
    scaler = MinMaxScaler(inputCol="polyFeatures", outputCol="unnorm_features")

    # Step - 7: Define Normalizer
    normalizer = Normalizer(p=1.0,
                            inputCol="unnorm_features",
                            outputCol="features")

    # Step - 8: Set up the Decision Tree Classifier
    trainer = DecisionTreeClassifier(labelCol="survived",
                                     featuresCol="features")

    # Step - 9: Build the Pipeline
Example #9
print(bestDTModel._java_obj.parent().getMaxDepth())

# COMMAND ----------

# MAGIC %md
# MAGIC #### Random forest and `PolynomialExpansion`
# MAGIC  
# MAGIC Next, we'll build a random forest.  Since we only have a few features and random forests tend to work better with a lot of features, we'll expand our features using `PolynomialExpansion`.

# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

px = (PolynomialExpansion()
      .setInputCol('features')
      .setOutputCol('polyFeatures'))

print(px.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll use the `RandomForestClassifier` to build our random forest model.

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

rf = (RandomForestClassifier()
      .setLabelCol('indexed')
list_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())

df_with_vectors = T1_df4.select(
    'B1err', 'ratio', 'T1',
    list_to_vec(T1_df4["B1Knots"]).alias("B1Knots"),
    list_to_vec(T1_df4["RatioKnots"]).alias("RatioKnots"))

vec = VectorAssembler(inputCols=["B1err", "ratio", "B1Knots", "RatioKnots"],
                      outputCol="features")

T1_df5 = vec.transform(df_with_vectors)

# Polynomial expansion with interactions

polyExpansion = PolynomialExpansion(degree=2,
                                    inputCol="features",
                                    outputCol="Interaction")
polyDF = polyExpansion.transform(T1_df5)

#Regression Time!
lr = LinearRegression(labelCol="T1", featuresCol="Interaction")
model = lr.fit(polyDF)

#Now we want to interpolate data onto 100*100 grid:
x1 = np.linspace(0.1, 2, 100)  #B1err
x2 = np.linspace(0.0005, 2.5, 100)  #Ratio
x1_2 = np.zeros([100, 100])
x2_2 = np.zeros([100, 100])
for i in range(0, len(x1)):
    for j in range(0, len(x2)):
        x1_2[i, j] = x1[i]
min_value = data.agg(F.min("elevation")).collect()[0][0]
max_value = data.agg(F.max("elevation")).collect()[0][0]
print("Min/max elevation: " + str(min_value) + " and " + str(max_value))
min_value = data.agg(F.min("ablation_rate")).collect()[0][0]
max_value = data.agg(F.max("ablation_rate")).collect()[0][0]
print("Min/max ablation rate: " + str(min_value) + " and " + str(max_value))

# Transform independent variable columns into vector of features
vectorAssembler = VectorAssembler(inputCols=["elevation", "time"],
                                  outputCol="features")
vector_data = vectorAssembler.transform(data)
vector_data = vector_data.select(["features", "ablation_rate"])
vector_data.show(vector_data.count(), truncate=False)

# Convert to polynomial features
polyExpansion = PolynomialExpansion(degree=1,
                                    inputCol='features',
                                    outputCol='polyFeatures')
poly_data = polyExpansion.transform(vector_data)
poly_data = poly_data.select(["polyFeatures", "ablation_rate"])
poly_data.show(truncate=False)

# Split into training and test data sets
splits = poly_data.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]
print("Train data count")
print(train_df.count())
print("Test data count")
print(test_df.count())

lr = LinearRegression(featuresCol='polyFeatures',
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([3.0, -1.0]),)
    ], ["features"])

    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)

    polyDF.show(truncate=False)
    # $example off$

    spark.stop()

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)
    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
Example #14
def polynomial_expansion(dataset, inputCol, degree = 3):
    from pyspark.ml.feature import PolynomialExpansion
    return PolynomialExpansion(degree=degree, inputCol=inputCol, outputCol=inputCol+'_pe').transform(dataset)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA
pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
Example #16
"""
Created on Sun Jun 25 21:00:59 2017

@author: vishal
"""

from __future__ import print_function
from pyspark.sql import SparkSession

session = SparkSession.builder.appName('Polynomial Expansion').getOrCreate()

from pyspark.ml.linalg import Vectors

df = session.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                              (Vectors.dense([0.0, 0.0]), ),
                              (Vectors.dense([3.0, -1.0]), )], ["features"])

#df.show()
from pyspark.ml.feature import PolynomialExpansion

polyExpansion = PolynomialExpansion(degree=2,
                                    inputCol="features",
                                    outputCol="pe_feature")

ps_df = polyExpansion.transform(df)
print(df.first())
print(ps_df.first())

#ps_df.select('pe_feature').show()

session.stop()
Example #17
# +-----+--------------------------------------------------------------------+
# |label|features                                                            |
# +-----+--------------------------------------------------------------------+
# |24.0 |[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9]  |
# |21.6 |[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9] |
# |34.7 |[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83]|
# |33.4 |[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63]|
# |36.2 |[0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9] |
# +-----+--------------------------------------------------------------------+
# only showing top 5 rows

assembler = VectorAssembler(inputCols=["features"], outputCol="assembled")

# Add interaction terms (polynomial expansion)
pe = PolynomialExpansion().setInputCol("features").setOutputCol("polyfeatures")

regressor = LinearRegression().setStandardization(False).setSolver(
    "l-bfgs").setLabelCol("label")

# Parameter tuning settings
paramGrid = (ParamGridBuilder().addGrid(pe.degree, [2, 3]).addGrid(
    regressor.maxIter,
    [10, 25, 50]).addGrid(regressor.regParam, [0.0, 0.01, 0.1]).addGrid(
        regressor.featuresCol, ["features", "polyfeatures"]).build())

# Use RMSE for evaluation
evaluator = RegressionEvaluator(metricName="rmse")
    # Plotting Dataset
    f, axarr = plt.subplots(2, sharex=True)
    # Converting "features" DenseVector column to NPy Array
    npFeatures = np.array([])
    for i in train_2.collect():
        npFeatures = np.append(npFeatures, i['feature'].toArray())
    # Converting "label" DenseVector column to NPy Array
    npLabels = np.array([])
    for i in train_2.collect():
        npLabels = np.append(npLabels, i['label'])
    axarr[0].plot(npFeatures, npLabels, label="Data", linewidth=2)

    # Pipeline: Polynomial expansion, Linear Regression and label vs. prediction charts for every degree
    for degree in [5, 6, 7]:
        px = PolynomialExpansion(degree=degree,
                                 inputCol="feature",
                                 outputCol="features")
        lr = LinearRegression(maxIter=5)
        pipeline = Pipeline(stages=[px, lr])
        model = pipeline.fit(train_2)
        # lr.write.overwrite().save("D:\\Users\\festevem\Desktop\Modelos\modelo1")    # doesn't work at all
        npPredictions = np.array([])
        for i in model.transform(train_2).collect():
            npPredictions = np.append(npPredictions, (i['prediction']))
        # Model plot
        axarr[0].plot(npFeatures, npPredictions, label="Degree %d" % degree)
        print("Degree " + str(degree) + " model coefficients: " +
              str(model.stages[1].coefficients))
        print("Degree " + str(degree) + " model intercept: " +
              str(model.stages[1].intercept))
        print("Degree " + str(degree) + " model Mean Squared Error: " +
Example #19
def linear_regression_to_predict_number_of_deaths():
    """
    Worldwide cancer mortality figures are considered for females aged 20-70 years old.
        > The mean death numbers by age are visualised initially.
        > As the data is curved, a polynomial regression is performed on it with
        the intention of predicting death numbers in the 50-54 age range
    """
    # Relevant columns for 20-70yrs are Deaths[10-19]
    column_index = [str(x) for x in range(10, 20)]

    sum_columns = (', '.join([f'sum(df.Deaths{x})' for x in column_index]))
    cancer_mortality_data.createOrReplaceTempView('df')
    female_yearly_totals = helper.spark.sql(
        f'select df.Year, {sum_columns} from df '
        'where df.Sex=2 '
        'group by df.Year '
        'order by df.Year asc').toDF('Year',
                                     *[f'Deaths{x}' for x in column_index])
    print(female_yearly_totals.show())

    # Check Pearson Correlation Coefficient between independent variables and target variable (Deaths16)
    for i in female_yearly_totals.columns:
        if i != 'Year':
            correlation = female_yearly_totals.stat.corr('Deaths16', i)
            print(f'Correlation between {i} and Deaths16: {correlation}')

    # plot mean deaths by age to find out the shape of this dataset
    plot_df = female_yearly_totals.toPandas()
    plot_df.drop(
        columns=['Year'],
        inplace=True)  # drop the Year column as it's not needed for this graph
    x_axis = [x for x in range(0, len(plot_df.columns))]
    y_axis = [int(plot_df[y].mean()) for y in plot_df.columns]

    # get age range labels
    age_ranges = [
        helper.age_ranges.select('00').filter(
            (helper.age_ranges['index'] == str(x))).collect()[0][0]
        for x in column_index
    ]
    fig = plt.figure(figsize=(9, 6))  # set plotted figure size
    plt.xticks(np.arange(len(age_ranges)), age_ranges)
    plt.ylabel('Yearly Average Deaths')
    plt.xlabel('Age')
    plt.title('Cancer Deaths (Female, 20-70 years old)')
    plt.plot(x_axis, y_axis)
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    plt.savefig(f'{output_dir}/female_cancer_deaths.png')
    plt.clf()

    # split into training and test sets
    (training, test) = female_yearly_totals.randomSplit([.7, .3])
    training.cache()
    test.cache()

    # exclude 'Deaths16' (50-54 years old range) from features in training
    column_index.remove('16')
    vectorised = VectorAssembler(
        inputCols=[f'Deaths{x}' for x in column_index], outputCol='features')

    poly_expansion = PolynomialExpansion(degree=3,
                                         inputCol='features',
                                         outputCol='poly_features')

    # set label column to 'Deaths16' as this is the column value being predicted
    lr = LinearRegression(
        maxIter=10,
        regParam=0.5).setLabelCol('Deaths16').setPredictionCol('predicted')

    lr_pipeline = Pipeline()
    lr_pipeline.setStages([vectorised, poly_expansion, lr])

    # Fit the model
    model = lr_pipeline.fit(training)

    # predict using test data
    predictions = model.transform(test).select('Year', 'Deaths16',
                                               'poly_features', 'predicted')
    print(predictions.show())

    model_details = model.stages[2]
    print('_____________\nModel details:\n_____________')
    # Print the coefficients and intercept for generalized linear regression model
    print('Coefficients: ' + str(model_details.coefficients))
    print('Intercept: ' + str(model_details.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model_details.summary
    print('Coefficient Standard Errors: ' +
          str(summary.coefficientStandardErrors))
    print('T Values: ' + str(summary.tValues))
    print('P Values: ' + str(summary.pValues))
    print('r^2: ' + str(summary.r2))
    print('Mean Squared Error: ' + str(summary.meanSquaredError))
    print('Mean Absolute Error: ' + str(summary.meanAbsoluteError))
    print('Explained variance: ' + str(summary.explainedVariance))
    print('Degrees Of Freedom: ' + str(summary.degreesOfFreedom))
    print('Deviance Residuals: ' + str(summary.devianceResiduals))

    # Evaluation metrics for test dataset
    # Create an RMSE evaluator using the label and predicted columns
    reg_eval = RegressionEvaluator(predictionCol='predicted',
                                   labelCol='Deaths16',
                                   metricName='rmse')

    # Run the evaluator on the DataFrame
    print('_____________\nPrediction evaluation:\n_____________')
    rmse = reg_eval.evaluate(predictions)
    print(f'Root Mean Squared Error: {rmse}')

    # Mean Square Error
    mse = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mse'})
    print(f'Mean Square Error: {mse}')

    # Mean Absolute Error
    mae = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mae'})
    print(f'Mean Absolute Error: {mae}')

    # r2 - coefficient of determination
    r2 = reg_eval.evaluate(predictions, {reg_eval.metricName: 'r2'})
    print(f'r^2: {r2}')
Example #20
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)

# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
dataDF = sqlContext.createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                                     (Vectors.dense([0.0, 0.0]),), (Vectors.dense([0.6, -1.1]),)], ["features"])
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(dataDF)
for expanded in polyDF.select("polyFeatures").take(3):
    print(expanded)

"""OUTPUT 
Row(polyFeatures=DenseVector([-2.0, 4.0, 2.3, -4.6, 5.29]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, -1.1, -0.66, 1.21]))"""

"""
Row(polyFeatures=DenseVector([-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12,167]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.72
, -1.331]))"""
Example #22
		encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	else:
		encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	dataset = encoder_model.transform(dataset)
	feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
	assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
	dataset = assembler.transform(dataset)
	scaler_model = None
	if args.mode == 'train':
		scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
		scaler_model = scaler.fit(dataset)
		scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	else:
		scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	dataset = scaler_model.transform(dataset)
	polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
	dataset = polyExpansion.transform(dataset)
	dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
	glr = None
	if args.mode == 'train':
		glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures', family='Binomial', linkPredictionCol='link_pred')
		paramGrid = ParamGridBuilder() \
					.addGrid(glr.link, ['logit']) \
					.addGrid(glr.regParam, [1e-5]) \
					.build()
		tvs = TrainValidationSplit(estimator=glr, \
									estimatorParamMaps=paramGrid, \
									evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'), \
									trainRatio=0.7)
		tvs_model = tvs.fit(dataset)
		print('----> {}'.format(tvs_model.validationMetrics))
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":

    spark = SparkSession.builder.appName("polynomial").master(
        "local").getOrCreate()

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                                (Vectors.dense([0.0, 0.0]), ),
                                (Vectors.dense([3.0, -1.0]), )], ["features"])

    polyExpansion = PolynomialExpansion(inputCol="features",
                                        outputCol="polyFeatures",
                                        degree=3)

    polyDf = polyExpansion.transform(df)
    polyDf.show(truncate=False)

    spark.stop()
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

# COMMAND ----------

###Polynomial expansion expands a feature vector into a polynomial feature space. This example expands the given features to polynomial degree 3
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3,
                                    inputCol="features",
                                    outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)

# COMMAND ----------

###The discrete cosine transform (DCT) transforms a real-valued sequence from the time domain into the frequency domain
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ),
                            (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ),
                            (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )],
                           ["features"])
# create a Spark dataframe with two columns: labels and features
# the labels column contains the last element of the list, the features column
# contains dense vectors of all elements except the last
df = splits.map(
    lambda x: Row(labels=float(x[-1]), features=Vectors.dense(x[:-1]))).toDF()

# instantiate a StandardScaler object and set the following parameters:
# withMean and withStd to 'True'
# inputCol to 'features' and outputCol to 'scaledfeatures'
ss = StandardScaler(withMean=True,
                    withStd=True,
                    inputCol='features',
                    outputCol='scaledfeatures')

# instantiate a PolynomialExpansion object and set the following parameters:
# degree = 2
# inputCol to 'scaledfeatures' and outputCol to 'expandedfeatures'
pe = PolynomialExpansion(degree=2,
                         inputCol='scaledfeatures',
                         outputCol='expandedfeatures')

# instantiate a Pipeline object and set stages parameter to a list containing ss and pe
pl = Pipeline(stages=[ss, pe])

# call the fit method of the Pipeline transformer and create a PipelineModel
model = pl.fit(df)

# call the transform method of the PipelineModel, input the Spark dataframe df
# print column attributes to confirm the expected dataframe transformation
transformed = model.transform(df)
print(transformed.columns)
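A small follow-up check (a sketch, not part of the original): with k scaled input features, the degree-2 expansion should hold C(k + 2, 2) - 1 entries.

# sketch: compare the expanded vector length with the closed-form term count
k = transformed.first()['scaledfeatures'].size
expected_terms = (k + 2) * (k + 1) // 2 - 1  # C(k + 2, 2) - 1 for degree 2
print(transformed.first()['expandedfeatures'].size, expected_terms)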