def standardScaler(self):
    # MinMaxScaler and MaxAbsScaler must be imported too, or the calls below fail.
    from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler
    dataFrame = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate a MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)
    # Rescale each feature to the range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate a MaxAbsScalerModel
    scalerModel = scaler.fit(dataFrame)
    # Rescale each feature to the range [-1, 1].
    scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
def test_maxabs_scaler(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),),
    ], ["id", "features"])
    scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)
    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlMaxAbsScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
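The test above leans on helpers from its surrounding test suite (save_data_models, run_onnx_model, compare_results) that are not shown here. A minimal sketch of the other imports it assumes; the exact onnxmltools module paths are an assumption and may differ across versions:

import numpy
import pandas
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
# Assumed onnxmltools entry points for Spark ML conversion.
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType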
def sl_by_libsvm(spark, file_name, in_folder, out_folder):
    rdd_name = os.path.join(out_folder, file_name)
    if os.path.exists(rdd_name):
        data = spark.read.format("libsvm").load(rdd_name)
    else:
        data = read_csv(spark, os.path.join(in_folder, file_name))
        data.show(truncate=False)
        print("reading one file took: " + str(time.time() - exec_start_time))
        data = data.withColumn("failure", data["failure"].cast("double"))
        # ut = MLUtils.convertVectorColumnsToML(df, "indexedFeatures")
        # ut = ut.withColumnRenamed("failure", "label").withColumnRenamed("indexedFeatures", "features")
        # ut = ut.withColumn("label", ut["label"].cast("double"))
        scaler = MaxAbsScaler(inputCol="indexedFeatures", outputCol="features")
        # Compute summary statistics and generate a MaxAbsScalerModel
        scalerModel = scaler.fit(data)
        # Rescale each feature to the range [-1, 1].
        data = scalerModel.transform(data)
        data.show(truncate=False)
        data = data.select("failure", "features")
        data.write.format("libsvm").save(rdd_name)
    data.show(truncate=False)
    return data
def max_abs_scale(dataFrame, inputColNames):
    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler = MaxAbsScaler(inputCol="features",
                          outputCol="scaled features")
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    return scaledDF
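The helper getAssembledDataFrame is not shown above. A minimal sketch of what it presumably does, assuming it only assembles the named columns into the "features" vector that max_abs_scale expects (a hypothetical reconstruction, not the original helper):

from pyspark.ml.feature import VectorAssembler

def getAssembledDataFrame(dataFrame, inputColNames):
    # Combine the listed numeric columns into a single vector column
    # named "features", as max_abs_scale requires.
    assembler = VectorAssembler(inputCols=inputColNames, outputCol="features")
    return assembler.transform(dataFrame)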
def maxAbsScalerModel(df, conf):
    """
    input: spark DataFrame, conf [configuration params]
    return value: scaler, model
    """
    inp = conf.get("inputCol", None)
    output = conf.get("outputCol", None)
    scaler = MaxAbsScaler(inputCol=inp, outputCol=output)
    model = scaler.fit(df)
    return scaler, model
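A minimal usage sketch for maxAbsScalerModel, assuming an active spark session; the conf keys are the two the function reads:

from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([1.0, -3.0]),),
                            (Vectors.dense([2.0, 6.0]),)], ["features"])
conf = {"inputCol": "features", "outputCol": "scaledFeatures"}
scaler, model = maxAbsScalerModel(df, conf)
model.transform(df).show()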
def test_clear_param(self):
    df = self.spark.createDataFrame([(Vectors.dense([1.0]),),
                                     (Vectors.dense([2.0]),)], ["a"])
    maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
    model = maScaler.fit(df)
    self.assertTrue(model.isSet(model.outputCol))
    self.assertEqual(model.getOutputCol(), "scaled")
    model.clear(model.outputCol)
    self.assertFalse(model.isSet(model.outputCol))
    # After clearing, getOutputCol() falls back to the default
    # "<uid>__output", whose uid prefix starts with "MaxAbsScaler".
    self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
    output = model.transform(df)
    self.assertEqual(model.getOutputCol(), output.schema.names[1])
def max_abs_scaler(self, df, column):
    """
    Scale the given column by its maximum absolute value (MaxAbsScaler).
    """
    print('MaxAbsScalerExample')
    # Rescale every value in the column to [-1, 1] by dividing by the
    # column's maximum absolute value.
    scaler = MaxAbsScaler(inputCol=column, outputCol=column + '_maxabs')
    scalerModel = scaler.fit(df)
    scaledData = scalerModel.transform(df)
    return scaledData
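Note that MaxAbsScaler only accepts a vector-typed input column, so the column passed in above must already hold vectors. A sketch of the usual pattern for a plain numeric column; the "price" column name is illustrative:

from pyspark.ml.feature import VectorAssembler, MaxAbsScaler

# Wrap the numeric column in a one-element vector before scaling.
assembler = VectorAssembler(inputCols=["price"], outputCol="price_vec")
df_vec = assembler.transform(df)
scaled = MaxAbsScaler(inputCol="price_vec", outputCol="price_maxabs") \
    .fit(df_vec).transform(df_vec)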
def scaling(dataFrame, inputColName):
    outputColName = "scaled " + inputColName
    assembler = VectorAssembler(inputCols=[inputColName],
                                outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = MaxAbsScaler(inputCol="features",
                          outputCol=outputColName)
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    # Unpack the one-element result vector back into a plain float column.
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' to the range [-1, 1] and created a new column '{1:s}'.".format(inputColName, outputColName))
    return scaledDF
def get_scaler(self, option):
    """Set up a scaler for the dataset."""
    if option == 'standard':
        scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                                withStd=True, withMean=False)
    elif option == 'minmax':
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    elif option == 'maxabs':
        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    else:
        scaler = None
    return scaler
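A minimal usage sketch, assuming a hypothetical instance `dataset` of the enclosing class and a DataFrame whose features are already assembled into a "features" vector column:

scaler = dataset.get_scaler('maxabs')
if scaler is not None:
    scaled_df = scaler.fit(features_df).transform(features_df)
    scaled_df.select("features", "scaledFeatures").show()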
def maxabs_scale(self, columns='*'):
    '''
    Rescale the columns by dividing by the max absolute value.
    '''
    if columns == "*":
        columns = self._df.schema.names
    else:
        assert isinstance(columns, list), "Error: columns argument must be a list!"
    for column in columns:
        outputcol = column + '_scaled'
        assembler = VectorAssembler(inputCols=[column], outputCol='features')
        df = assembler.transform(self._df)
        scaler = MaxAbsScaler(inputCol='features', outputCol=outputcol)
        df = scaler.fit(df).transform(df).drop('features')
        # Declare the return type; without it the udf would default to
        # StringType and the scaled column would come back as strings.
        to_float = udf(lambda x: float(x[0]), FloatType())
        self._df = df.withColumn(outputcol, to_float(outputcol))
    return self._df
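A minimal usage sketch, assuming this method lives on a wrapper class that holds its DataFrame in self._df; `prep` is a hypothetical instance and the column names are illustrative:

# Scale two specific columns; each gains a '<name>_scaled' companion column.
result = prep.maxabs_scale(['age', 'income'])
result.select('age', 'age_scaled', 'income', 'income_scaled').show()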
def ml_pipeline_factory(inputCols, classifier, param_grid=None):
    """
    Helper function to build a Spark ML pipeline around a given classifier
    for the given feature columns. The result is a `CrossValidator` that
    still needs to be fitted to the training/validation set with `.fit(df)`.

    INPUT:
    - inputCols (list [string]): names of the feature columns of `df`
      that shall be considered.
    - classifier: a classifier instance from pyspark.ml.classification.
    - param_grid: a ParamGrid built for the passed classifier with
      pyspark.ml.tuning.ParamGridBuilder.

    OUTPUT:
    - result (CrossValidator): a Spark ML `CrossValidator`.
    """
    # VectorAssembler
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features",
                                   handleInvalid='skip')
    # Normalizer / Scaler
    # TODO: apply standardization instead of scaling to account for outliers.
    maScaler = MaxAbsScaler(inputCol="features", outputCol="features_scaled")
    # Define a pipeline
    pipe = Pipeline(stages=[vecAssembler, maScaler, classifier])
    # Use cross-validation
    cv = CrossValidator(estimator=pipe,
                        evaluator=MulticlassClassificationEvaluator(
                            labelCol='churn', metricName='f1'),
                        estimatorParamMaps=param_grid,
                        numFolds=3, parallelism=4)
    return cv
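A usage sketch for ml_pipeline_factory; the feature names, grid values, and DataFrame df (which must contain a numeric "churn" label) are illustrative. Note the classifier's featuresCol has to point at the pipeline's "features_scaled" output:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

lr = LogisticRegression(featuresCol="features_scaled", labelCol="churn")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = ml_pipeline_factory(["age", "sessions"], lr, grid)
cv_model = cv.fit(df)
best_model = cv_model.bestModel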
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol(
    "features").setOutputCol("features_minmax_scaled")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features").setOutputCol(
    "features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------
).transform(df)

temp_normalized_vector_col = temp_col_name(assembled)

trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,})

scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
    trained_parameters, MinMaxScalerModel, "scaler_model"
)

if scaler_model is None:
    scaler = MinMaxScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
    scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

scaler = MaxAbsScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
output_df = scaler.fit(assembled_wo_nans).transform(assembled)

# convert the resulting vector back to numeric
temp_flattened_vector_col = temp_col_name(output_df)
output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

# keep only the final scaled column.
output_column = input_column if output_column is None or not output_column else output_column
output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
output_df = output_df.withColumn(output_column, output_column_value)
final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
output_df = output_df.select(final_columns)

return default_spark_with_trained_parameters(output_df, trained_parameters)
from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------
def process(self, data_input, data_output):
    """
    A Spark process to do feature engineering.
    :param data_input: data input filename
    :param data_output: data output filename
    """
    df = self.spark.read.parquet(data_input).select('SHP_DATE_CREATED_ID', 'SHP_DATETIME_CREATED_ID',
                                                    'SHP_DATE_HANDLING_ID', 'SHP_DATETIME_HANDLING_ID',
                                                    'SHP_SENDER_ID', 'SHP_ORDER_COST', 'CAT_CATEG_ID_L7',
                                                    'SHP_ADD_ZIP_CODE', 'SHP_DATE_SHIPPED_ID',
                                                    'SHP_DATETIME_SHIPPED_ID', 'HT_REAL')

    # 1. SHP_ORDER_COST_INT: cast the SHP_ORDER_COST column from float to integer.
    df = df.withColumn("SHP_ORDER_COST_INT", (df["SHP_ORDER_COST"].cast(IntegerType())))

    # 2. SHP_DAY: add a column indicating on which day of the week the payment was credited.
    shp_day_udf = udf(self.shp_day, IntegerType())
    df = df.withColumn('SHP_DAY', shp_day_udf(df['SHP_DATE_HANDLING_ID']))

    # 3. WKND_DAY: add a column indicating whether the payment was credited during the weekend.
    weekend_day_udf = udf(self.weekend_day, IntegerType())
    df = df.withColumn('WKND_DAY', weekend_day_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WKND_DAY').show(10)

    # 4. WK_NUM: add a column indicating the week of the year in which the payment was made.
    week_number_udf = udf(self.week_number, IntegerType())
    df = df.withColumn('WK_NUM', week_number_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WK_NUM').show(10)

    # 5. MONTH_NUM: add a column indicating the month of the payment.
    month_number_udf = udf(self.month_number, IntegerType())
    df = df.withColumn('MONTH_NUM', month_number_udf(df['SHP_DATE_HANDLING_ID']))

    # 6. TIMESTAMP: add a timestamp for the dates.
    get_timestamp_udf = udf(self.get_timestamp, IntegerType())
    df = df.withColumn('SHP_DATE_HANDLING_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_HANDLING_ID']))
    df = df.withColumn('SHP_DATE_CREATED_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_CREATED_ID']))

    my_handling_time_udf = udf(self.my_handling_time, IntegerType())
    df = df.withColumn('HT', my_handling_time_udf(array('SHP_DATETIME_SHIPPED_ID', 'SHP_DATETIME_HANDLING_ID')))

    shp_sender_indexer = StringIndexer(inputCol="SHP_SENDER_ID", outputCol="SHP_SENDER_ID_NUM").fit(df)
    df = shp_sender_indexer.transform(df)

    cat_categ_indexer = StringIndexer(inputCol="CAT_CATEG_ID_L7", outputCol="CAT_CATEG_ID_L7_NUM").fit(df)
    df = cat_categ_indexer.transform(df)

    # Create the vector assembler.
    vec_assembler = VectorAssembler(inputCols=['SHP_DATE_HANDLING_TIMESTAMP', 'SHP_DATE_CREATED_TIMESTAMP',
                                               'SHP_SENDER_ID_NUM', 'CAT_CATEG_ID_L7_NUM', 'SHP_ORDER_COST_INT',
                                               'SHP_DAY', 'WKND_DAY', 'WK_NUM', 'MONTH_NUM', 'SHP_ADD_ZIP_CODE'],
                                    outputCol='features')

    # Transform the values.
    features_df = vec_assembler.transform(df)

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate a MaxAbsScalerModel.
    scalerModel = scaler.fit(features_df)
    # Rescale each feature to the range [-1, 1].
    scaledData = scalerModel.transform(features_df)

    # Save the dataset as parquet.
    scaledData.write.format("parquet").mode('overwrite').option("header", "true").save(data_output)
# Precise daily perspective
fc_5 = ["Hour", "Minute"] + [enc.getOutputCol() for enc in encoders]
# Imprecise daily perspective
fc_6 = ["Hour"] + [enc.getOutputCol() for enc in encoders]

fcs = [fc_1, fc_2, fc_3, fc_4, fc_5, fc_6]
#=========== END FC ===========#

standard_scaler = StandardScaler(inputCol="Features", outputCol="scaledFeatures",
                                 withStd=False, withMean=True)
min_max_scaler = MinMaxScaler(inputCol="Features", outputCol="scaledFeatures")
max_abs_scaler = MaxAbsScaler(inputCol="Features", outputCol="scaledFeatures")

norm_standard_scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures",
                                      withStd=False, withMean=True)
norm_min_max_scaler = MinMaxScaler(inputCol="normFeatures", outputCol="scaledFeatures")
norm_max_abs_scaler = MaxAbsScaler(inputCol="normFeatures", outputCol="scaledFeatures")

normalizer = Normalizer(inputCol="Features", outputCol="normFeatures")

###### END PIPELINE

from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier, DecisionTreeClassifier
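A sketch of how one of the scaler combinations above could be chained; the "norm_" scalers read the normalizer's "normFeatures" output, so the two stages compose in a Pipeline. The DataFrame train_df (with a "Features" vector column) is an assumption:

from pyspark.ml import Pipeline

# One hypothetical combination: L2-normalize each row, then scale
# each feature by its maximum absolute value.
pipeline = Pipeline(stages=[normalizer, norm_max_abs_scaler])
scaled_df = pipeline.fit(train_df).transform(train_df)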
"""
@author: luogan
"""
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("dataFrame")\
    .getOrCreate()

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -8.0]),),
    (1, Vectors.dense([2.0, 1.0, -4.0]),),
    (2, Vectors.dense([4.0, 10.0, 8.0]),),
], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate a MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# Rescale each feature to the range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()
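Since the per-column maximum absolute values in this data are 4.0, 10.0, and 8.0, the final show() should print roughly (exact formatting may vary by Spark version):

+--------------+----------------+
|      features|  scaledFeatures|
+--------------+----------------+
|[1.0,0.1,-8.0]|[0.25,0.01,-1.0]|
|[2.0,1.0,-4.0]|  [0.5,0.1,-0.5]|
|[4.0,10.0,8.0]|   [1.0,1.0,1.0]|
+--------------+----------------+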
def scaler(input_features):
    scaler = MaxAbsScaler(inputCol="raw_features", outputCol="features")
    scalerModel = scaler.fit(input_features)
    scaledData = scalerModel.transform(input_features).drop("raw_features")
    # scaledData.show(3)
    return scaledData
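A minimal usage sketch, assuming a DataFrame df whose numeric columns are first assembled into the "raw_features" vector the function expects; the column names are illustrative:

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="raw_features")
scaled = scaler(assembler.transform(df))
scaled.show(3)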