def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
         (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
         (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
    model = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
    # the input name should match PolynomialExpansion.inputCol
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml PolynomialExpansion',
        [('dense', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().expanded.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().dense.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPolynomialExpansion")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['expanded'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def polynomial_expansion(self, df, column):
    """Build polynomial features column-wise with PolynomialExpansion."""
    print('PolynomialExpansionExample')
    # Cross the input columns into polynomial terms, e.g. for two inputs x1, x2:
    # degree 1: x1, x2
    # degree 2: x1, x2, x1*x2, x1^2, x2^2
    # degree 3: x1, x2, x1*x2, x1^2, x2^2, x1^2*x2, x1*x2^2, x1^3, x2^3
    polyExpansion = PolynomialExpansion(degree=2, inputCol=column, outputCol=column + '_poly')
    polyDF = polyExpansion.transform(df)
    return polyDF
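As a quick check of the term ordering described in the comment above, here is a minimal sketch (assuming an active SparkSession bound to a variable named spark); the expected degree-2 row matches the sample output reproduced further down in this collection:

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

# One two-feature row, expanded at degree 2.
demo = spark.createDataFrame([(Vectors.dense([-2.0, 2.3]),)], ["features"])
pe_demo = PolynomialExpansion(degree=2, inputCol="features", outputCol="expanded")
# Spark orders the terms as [x1, x1^2, x2, x1*x2, x2^2],
# so this row expands to [-2.0, 4.0, 2.3, -4.6, 5.29].
pe_demo.transform(demo).select("expanded").show(truncate=False)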
def polynomial_expansion_usecase():
    """Expand data features with polynomial expansion."""
    spark = getSparkSession()
    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                                (Vectors.dense([0.0, 0.0]), ),
                                (Vectors.dense([3.0, -1.0]), )], ["features"])
    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)
    polyDF.show(truncate=False)
def train_new_feature_pipeline(df: DataFrame, degree: int = 3) -> PipelineModel:
    """Create a new feature pipeline and fit to training data

    :param df: raw Iris spark sql data frame
    :type df: DataFrame
    :param degree: degree of polynomial feature expansion
    :type degree: int
    :returns: fitted feature pipeline
    :rtype: PipelineModel
    """
    assembler = VectorAssembler(
        inputCols=[
            "sepal_length_cm",
            "sepal_width_cm",
            "petal_length_cm",
            "petal_width_cm",
        ],
        outputCol="features",
    )
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)
    polyExpansion = PolynomialExpansion(degree=degree, inputCol="scaledFeatures",
                                        outputCol="polyFeatures")
    pipeline = Pipeline(stages=[assembler, scaler, polyExpansion])
    pipeline_model = pipeline.fit(df)
    return pipeline_model
def fit(season):
    cols = [
        "air_temperature_est", "dew_temperature_est", "wind_speed_est",
        "meter", "month", "day", "hour"
    ]
    imputer = Imputer(
        strategy="median",
        inputCols=["air_temperature", "dew_temperature", "wind_speed"],
        outputCols=[
            "air_temperature_est", "dew_temperature_est", "wind_speed_est"
        ])
    vector = VectorAssembler(inputCols=cols, outputCol="vector", handleInvalid="error")
    poly = PolynomialExpansion(degree=4, inputCol="vector", outputCol="features")
    regression = LinearRegression(featuresCol="features",
                                  labelCol="meter_reading",
                                  predictionCol="prediction")
    pipeline = Pipeline(stages=[imputer, vector, poly, regression])
    evaluator = RegressionEvaluator(labelCol="meter_reading",
                                    predictionCol="prediction",
                                    metricName="rmse")
    params = ParamGridBuilder() \
        .addGrid(imputer.strategy, ["mean", "median"]) \
        .addGrid(poly.degree, [3]) \
        .addGrid(regression.fitIntercept, [True, False]) \
        .addGrid(regression.maxIter, [100]) \
        .addGrid(regression.standardization, [True, False]) \
        .build()
    validator = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=params,
                               evaluator=evaluator,
                               numFolds=3,
                               seed=51)
    model = validator.fit(season)
    return model.bestModel
    return v_assembler.transform(data)


if __name__ == "__main__":
    train_ratio = 0.8
    test_ratio = 1 - train_ratio

    # create SparkSession - the entry point to the cluster
    spark = SparkSession.builder.master("spark://192.168.50.10:7077") \
        .appName("Linear regression with pipeline - Boston").getOrCreate()

    data = prepare_data("BostonHousing.csv")

    # split data into train and test DataFrames
    train, test = data.randomSplit([train_ratio, test_ratio])

    poly_exp = PolynomialExpansion(degree=3, inputCol="features", outputCol="poly_features")
    lr = LinearRegression(regParam=0.1, featuresCol="poly_features")
    pipeline = Pipeline(stages=[poly_exp, lr])

    # fit the model
    model = pipeline.fit(train)

    # RegressionEvaluator defaults to RMSE, so label the scores accordingly
    evaluator = RegressionEvaluator()
    prediction_and_labels = model.transform(train).select("prediction", "label")
    print("RMSE train: " + str(evaluator.evaluate(prediction_and_labels)))
    prediction_and_labels = model.transform(test).select("prediction", "label")
    print("RMSE test: " + str(evaluator.evaluate(prediction_and_labels)))
from ast import literal_eval

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def get_spark():
    return SparkSession.builder.appName("polyreg").getOrCreate()


vass = VectorAssembler(inputCols=["x"], outputCol="feat")
poly = PolynomialExpansion(degree=3, inputCol="feat", outputCol="features")
lm = LinearRegression(maxIter=5, regParam=0.0, solver="normal")

# Configure an ML pipeline
# Assignment by string to simplify the getParams() function
stages = ["vass", "poly", "lm"]
pipe = Pipeline(stages=[eval(s) for s in stages])


def printParams():
    """Print method that will be called in the parser script"""
    txt = "Model Pipeline Parameters:"
    for i in range(len(stages)):
        txt += "\n  Stage " + str(i).zfill(2) + ": " + stages[i]
        pm = pipe.getStages()[i].extractParamMap()
        for p in pm.keys():
            txt += "\n    " + stages[i] + "." + p.name + ": " + str(pm[p])
"pclass_imputed", "sibsp_imputed", "parch_imputed", "sexIndexed_imputed", "embarkedIndexed_imputed", "age_imputed", "fare_imputed" ]) # Step - 4: Make Vectors from dataframe's columns using special Vector Assmebler assembler = VectorAssembler(inputCols=[ "pclass_imputed", "sibsp_imputed", "parch_imputed", "sexIndexed_imputed", "embarkedIndexed_imputed", "age_imputed", "fare_imputed" ], outputCol="unscaled_features") # Step - 5: Define Polynomial Expansion with degree=2 polyExpansion = PolynomialExpansion(degree=2, inputCol="unscaled_features", outputCol="polyFeatures") # Step - 5: Define Scaler scaler = MinMaxScaler(inputCol="polyFeatures", outputCol="unnorm_features") # Step - 6: Define Normalizer normalizer = Normalizer(p=1.0, inputCol="unnorm_features", outputCol="features") # Step - 7: Set up the Decision Tree Classifier trainer = DecisionTreeClassifier(labelCol="survived", featuresCol="features") # Step - 8: Build the Pipeline
print(bestDTModel._java_obj.parent().getMaxDepth())

# COMMAND ----------

# MAGIC %md
# MAGIC #### Random forest and `PolynomialExpansion`
# MAGIC
# MAGIC Next, we'll build a random forest.  Since we only have a few features and random forests tend to work better with a lot of features, we'll expand our features using `PolynomialExpansion`.

# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

px = (PolynomialExpansion()
      .setInputCol('features')
      .setOutputCol('polyFeatures'))

print(px.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll use the `RandomForestClassifier` to build our random forest model.

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

rf = (RandomForestClassifier()
      .setLabelCol('indexed')
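For orientation, a minimal sketch of how the expansion and the forest could be chained in one Pipeline; the 'features'/'indexed' column names come from the notebook above, while the training DataFrame name trainDF is a placeholder:

from pyspark.ml import Pipeline

# Sketch only: point the forest at the expanded column and chain both stages.
rf_poly = (RandomForestClassifier()
           .setLabelCol('indexed')
           .setFeaturesCol('polyFeatures'))
poly_rf_pipeline = Pipeline(stages=[px, rf_poly])
# rf_model = poly_rf_pipeline.fit(trainDF)  # trainDF is a hypothetical training DataFrame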
list_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())
df_with_vectors = T1_df4.select(
    'B1err', 'ratio', 'T1',
    list_to_vec(T1_df4["B1Knots"]).alias("B1Knots"),
    list_to_vec(T1_df4["RatioKnots"]).alias("RatioKnots"))

vec = VectorAssembler(inputCols=["B1err", "ratio", "B1Knots", "RatioKnots"],
                      outputCol="features")
T1_df5 = vec.transform(df_with_vectors)

# Polynomial expansion with interactions
polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="Interaction")
polyDF = polyExpansion.transform(T1_df5)

# Regression time!
lr = LinearRegression(labelCol="T1", featuresCol="Interaction")
model = lr.fit(polyDF)

# Now we want to interpolate data onto a 100x100 grid:
x1 = np.linspace(0.1, 2, 100)       # B1err
x2 = np.linspace(0.0005, 2.5, 100)  # Ratio
x1_2 = np.zeros([100, 100])
x2_2 = np.zeros([100, 100])
for i in range(0, len(x1)):
    for j in range(0, len(x2)):
        x1_2[i, j] = x1[i]
max_value = data.agg(F.max("elevation")).collect()[0][0] print("Min/max elevation: " + str(min_value) + " and " + str(max_value)) min_value = data.agg(F.min("ablation_rate")).collect()[0][0] max_value = data.agg(F.max("ablation_rate")).collect()[0][0] print("Min/max ablation rate: " + str(min_value) + " and " + str(max_value)) # Transform independent variable columns into vector of features vectorAssembler = VectorAssembler(inputCols=["elevation", "time"], outputCol="features") vector_data = vectorAssembler.transform(data) vector_data = vector_data.select(["features", "ablation_rate"]) vector_data.show(vector_data.count(), truncate=False) # Convert to polynomial features polyExpansion = PolynomialExpansion(degree=1, inputCol='features', outputCol='polyFeatures') poly_data = polyExpansion.transform(vector_data) poly_data = poly_data.select(["polyFeatures", "ablation_rate"]) poly_data.show(truncate=False) # Split into training and test data sets splits = poly_data.randomSplit([0.7, 0.3]) train_df = splits[0] test_df = splits[1] print("Train data count") print(train_df.count()) print("Test data count") print(test_df.count()) lr = LinearRegression(featuresCol='polyFeatures',
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([3.0, -1.0]),)
    ], ["features"])

    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    polyDF = polyExpansion.transform(df)

    polyDF.show(truncate=False)
    # $example off$

    spark.stop()
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])

    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)

    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
def polynomial_expansion(dataset, inputCol, degree=3):
    from pyspark.ml.feature import PolynomialExpansion
    return PolynomialExpansion(degree=degree, inputCol=inputCol,
                               outputCol=inputCol + '_pe').transform(dataset)
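A hedged usage sketch for the helper above; the DataFrame df and its vector column "features" are placeholders for illustration:

# Hypothetical call: expands df["features"] at degree 3 into a new "features_pe" column.
expanded_df = polynomial_expansion(df, "features", degree=3)
expanded_df.select("features", "features_pe").show(truncate=False)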
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
    .transform(sales.select("Description", "CustomerId"))\
    .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
    .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
    .setFeaturesCol("countVec")\
    .setLabelCol("CustomerId")\
""" Created on Sun Jun 25 21:00:59 2017 @author: vishal """ from __future__ import print_function from pyspark.sql import SparkSession session = SparkSession.builder.appName('Polynomial Expension').getOrCreate() from pyspark.ml.linalg import Vectors df = session.createDataFrame([(Vectors.dense([2.0, 1.0]), ), (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([3.0, -1.0]), )], ["features"]) #df.show() from pyspark.ml.feature import PolynomialExpansion polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="pe_feature") ps_df = polyExpansion.transform(df) print(df.first()) print(ps_df.first()) #ps_df.select('pe_feature').show() session.stop()
# +-----+--------------------------------------------------------------------+
# |label|features                                                            |
# +-----+--------------------------------------------------------------------+
# |24.0 |[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9]  |
# |21.6 |[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9] |
# |34.7 |[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83]|
# |33.4 |[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63]|
# |36.2 |[0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9] |
# +-----+--------------------------------------------------------------------+
# only showing top 5 rows

assembler = VectorAssembler(inputCols=["features"], outputCol="assembled")

# Add interaction terms via polynomial expansion
pe = PolynomialExpansion().setInputCol("features").setOutputCol("polyfeatures")

regressor = LinearRegression().setStandardization(False).setSolver("l-bfgs").setLabelCol("label")

# Parameter tuning configuration: also tune whether the regressor uses the raw or the expanded features
paramGrid = (ParamGridBuilder()
             .addGrid(pe.degree, [2, 3])
             .addGrid(regressor.maxIter, [10, 25, 50])
             .addGrid(regressor.regParam, [0.0, 0.01, 0.1])
             .addGrid(regressor.featuresCol, ["features", "polyfeatures"])
             .build())

# Use RMSE for evaluation
evaluator = RegressionEvaluator(metricName="rmse")
# Plotting dataset
f, axarr = plt.subplots(2, sharex=True)

# Convert the "feature" DenseVector column to a NumPy array
npFeatures = np.array([])
for i in train_2.collect():
    npFeatures = np.append(npFeatures, i['feature'].toArray())

# Convert the "label" column to a NumPy array
npLabels = np.array([])
for i in train_2.collect():
    npLabels = np.append(npLabels, i['label'])

axarr[0].plot(npFeatures, npLabels, label="Data", linewidth=2)

# Pipeline: polynomial expansion, linear regression and label vs. prediction charts for every degree
for degree in [5, 6, 7]:
    px = PolynomialExpansion(degree=degree, inputCol="feature", outputCol="features")
    lr = LinearRegression(maxIter=5)
    pipeline = Pipeline(stages=[px, lr])
    model = pipeline.fit(train_2)
    # lr.write.overwrite().save("D:\\Users\\festevem\Desktop\Modelos\modelo1")  # doesn't work at all

    npPredictions = np.array([])
    for i in model.transform(train_2).collect():
        npPredictions = np.append(npPredictions, (i['prediction']))

    # Model plot
    axarr[0].plot(npFeatures, npPredictions, label="Degree %d" % degree)
    print("Degree " + str(degree) + " model coefficients: " + str(model.stages[1].coefficients))
    print("Degree " + str(degree) + " model intercept: " + str(model.stages[1].intercept))
    print("Degree " + str(degree) + " model Mean Squared Error: " +
def linear_regression_to_predict_number_of_deaths():
    """
    Worldwide cancer mortality figures are considered for females aged 20-70 years old.
    > The mean death numbers by age are visualised initially.
    > As the data is curved, a polynomial regression is performed on it with the
      intention of predicting death numbers in the 50-54 age range
    """
    # Relevant columns for 20-70yrs are Deaths[10-19]
    column_index = [str(x) for x in range(10, 20)]
    sum_columns = (', '.join([f'sum(df.Deaths{x})' for x in column_index]))
    cancer_mortality_data.createOrReplaceTempView('df')
    female_yearly_totals = helper.spark.sql(
        f'select df.Year, {sum_columns} from df '
        'where df.Sex=2 '
        'group by df.Year '
        'order by df.Year asc').toDF('Year', *[f'Deaths{x}' for x in column_index])
    print(female_yearly_totals.show())

    # Check the Pearson correlation coefficient between independent variables and the target variable (Deaths16)
    for i in female_yearly_totals.columns:
        if i != 'Year':
            correlation = female_yearly_totals.stat.corr('Deaths16', i)
            print(f'Correlation between {i} and Deaths16: {correlation}')

    # plot mean deaths by age to find out the shape of this dataset
    plot_df = female_yearly_totals.toPandas()
    plot_df.drop(columns=['Year'],
                 inplace=True)  # drop the Year column as it's not needed for this graph
    x_axis = [x for x in range(0, len(plot_df.columns))]
    y_axis = [int(plot_df[y].mean()) for y in plot_df.columns]

    # get age range labels
    age_ranges = [
        helper.age_ranges.select('00').filter(
            (helper.age_ranges['index'] == str(x))).collect()[0][0]
        for x in column_index
    ]

    fig = plt.figure(figsize=(9, 6))  # set plotted figure size
    plt.xticks(np.arange(len(age_ranges)), age_ranges)
    plt.ylabel('Yearly Average Deaths')
    plt.xlabel('Age')
    plt.title('Cancer Deaths (Female, 20-70 years old)')
    plt.plot(x_axis, y_axis)
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    plt.savefig(f'{output_dir}/female_cancer_deaths.png')
    plt.clf()

    # split into training and test sets
    (training, test) = female_yearly_totals.randomSplit([.7, .3])
    training.cache()
    test.cache()

    # exclude 'Deaths16' (50-54 years old range) from features in training
    column_index.remove('16')
    vectorised = VectorAssembler(
        inputCols=[f'Deaths{x}' for x in column_index], outputCol='features')
    poly_expansion = PolynomialExpansion(degree=3, inputCol='features', outputCol='poly_features')
    # set label column to 'Deaths16' as this is the column value being predicted
    lr = LinearRegression(
        maxIter=10, regParam=0.5).setLabelCol('Deaths16').setPredictionCol('predicted')
    lr_pipeline = Pipeline()
    lr_pipeline.setStages([vectorised, poly_expansion, lr])

    # Fit the model
    model = lr_pipeline.fit(training)

    # predict using test data
    predictions = model.transform(test).select('Year', 'Deaths16', 'poly_features', 'predicted')
    print(predictions.show())

    model_details = model.stages[2]
    print('_____________\nModel details:\n_____________')
    # Print the coefficients and intercept for the linear regression model
    print('Coefficients: ' + str(model_details.coefficients))
    print('Intercept: ' + str(model_details.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model_details.summary
    print('Coefficient Standard Errors: ' + str(summary.coefficientStandardErrors))
    print('T Values: ' + str(summary.tValues))
    print('P Values: ' + str(summary.pValues))
    print('r^2: ' + str(summary.r2))
    print('Mean Squared Error: ' + str(summary.meanSquaredError))
    print('Mean Absolute Error: ' + str(summary.meanAbsoluteError))
    print('Explained variance: ' + str(summary.explainedVariance))
    print('Degrees Of Freedom: ' + str(summary.degreesOfFreedom))
    print('Deviance Residuals: ' + str(summary.devianceResiduals))

    # Evaluation metrics for the test dataset
    # Create an RMSE evaluator using the label and predicted columns
    reg_eval = RegressionEvaluator(predictionCol='predicted', labelCol='Deaths16', metricName='rmse')

    # Run the evaluator on the DataFrame
    print('_____________\nPrediction evaluation:\n_____________')
    rmse = reg_eval.evaluate(predictions)
    print(f'Root Mean Squared Error: {rmse}')

    # Mean Square Error
    mse = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mse'})
    print(f'Mean Square Error: {mse}')

    # Mean Absolute Error
    mae = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mae'})
    print(f'Mean Absolute Error: {mae}')

    # r2 - coefficient of determination
    r2 = reg_eval.evaluate(predictions, {reg_eval.metricName: 'r2'})
    print(f'r^2: {r2}')
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol("polyFeatures")
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
    .transform(sales.select("Description", "CustomerId"))\
    .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
    .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
    .setFeaturesCol("countVec")\
    .setLabelCol("CustomerId")\
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)

dataDF = sqlContext.createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                                     (Vectors.dense([0.0, 0.0]),),
                                     (Vectors.dense([0.6, -1.1]),)], ["features"])

px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(dataDF)

for expanded in polyDF.select("polyFeatures").take(3):
    print(expanded)

"""OUTPUT (degree=2)
Row(polyFeatures=DenseVector([-2.0, 4.0, 2.3, -4.6, 5.29]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, -1.1, -0.66, 1.21]))"""

"""OUTPUT (degree=3, as configured above)
Row(polyFeatures=DenseVector([-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.167]))
Row(polyFeatures=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
Row(polyFeatures=DenseVector([0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331]))"""
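The two output blocks above also show how quickly the expanded dimension grows: the expansion contains every monomial of total degree 1 through d over the n input features, i.e. C(n+d, d) - 1 terms (5 for n=2, d=2; 9 for n=2, d=3). A small standard-library sketch for estimating that size before expanding wide vectors:

from math import comb

def expanded_size(n_features: int, degree: int) -> int:
    # All monomials with total degree between 1 and `degree`
    # over `n_features` variables; the constant term is excluded.
    return comb(n_features + degree, degree) - 1

print(expanded_size(2, 2))  # 5, matches the degree=2 output above
print(expanded_size(2, 3))  # 9, matches the degree=3 output above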
    encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
else:
    encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
dataset = encoder_model.transform(dataset)

feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
dataset = assembler.transform(dataset)

scaler_model = None
if args.mode == 'train':
    scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec',
                            withStd=True, withMean=True)
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)

polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
dataset = polyExpansion.transform(dataset)
dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()

glr = None
if args.mode == 'train':
    glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures',
                                      family='binomial', linkPredictionCol='link_pred')
    paramGrid = ParamGridBuilder() \
        .addGrid(glr.link, ['logit']) \
        .addGrid(glr.regParam, [1e-5]) \
        .build()
    tvs = TrainValidationSplit(estimator=glr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'),
                               trainRatio=0.7)
    tvs_model = tvs.fit(dataset)
    print('----> {}'.format(tvs_model.validationMetrics))
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("polynomial").master("local").getOrCreate()

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                                (Vectors.dense([0.0, 0.0]), ),
                                (Vectors.dense([3.0, -1.0]), )], ["features"])

    polyExpansion = PolynomialExpansion(inputCol="features", outputCol="polyFeatures", degree=3)
    polyDf = polyExpansion.transform(df)
    polyDf.show(truncate=False)

    spark.stop()
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

# COMMAND ----------

### Polynomial expansion expands features into a polynomial space. This example expands the given features into a degree-3 polynomial space.
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)
polyDF.show(truncate=False)

# COMMAND ----------

### Discrete cosine transform (DCT) transforms a real-valued sequence from the time domain into the frequency domain.
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ),
                            (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ),
                            (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )], ["features"])
# create a Spark dataframe with two columns: labels and features
# the labels column contains the last element of the list, the features column
# contains dense vectors of all elements except the last
df = splits.map(
    lambda x: Row(labels=float(x[-1]), features=Vectors.dense(x[:-1]))).toDF()

# instantiate a StandardScaler object and set the following parameters:
# withMean and withStd to 'True'
# inputCol to 'features' and outputCol to 'scaledfeatures'
ss = StandardScaler(withMean=True, withStd=True, inputCol='features', outputCol='scaledfeatures')

# instantiate a PolynomialExpansion object and set the following parameters:
# degree = 2
# inputCol to 'scaledfeatures' and outputCol to 'expandedfeatures'
pe = PolynomialExpansion(degree=2, inputCol='scaledfeatures', outputCol='expandedfeatures')

# instantiate a Pipeline object and set the stages parameter to a list containing ss and pe
pl = Pipeline(stages=[ss, pe])

# call the fit method of the Pipeline estimator and create a PipelineModel
model = pl.fit(df)

# call the transform method of the PipelineModel, input the Spark dataframe df
# print the column attributes to confirm the expected dataframe transformation
transformed = model.transform(df)
print(transformed.columns)