def sl_by_libsvm(spark, file_name, in_folder, out_folder):
    """Load a cached libsvm dataset, or build and cache one from CSV.

    If ``out_folder/file_name`` already exists it is loaded directly in
    libsvm format. Otherwise the CSV in ``in_folder`` is read, the
    "failure" column is cast to double (the label), every feature is
    rescaled to [-1, 1] with MaxAbsScaler, and the result is written
    back in libsvm format so later runs take the fast path.

    :param spark: active SparkSession
    :param file_name: dataset file name (also used as the cache name)
    :param in_folder: folder holding the raw CSV input
    :param out_folder: folder holding the libsvm cache
    :return: DataFrame with columns "failure" and "features"
    """
    rdd_name = os.path.join(out_folder, file_name)
    if os.path.exists(rdd_name):
        # Cached copy exists: skip the expensive CSV parse + scaling.
        data = spark.read.format("libsvm").load(rdd_name)
    else:
        data = read_csv(spark, os.path.join(in_folder, file_name))
        data.show(truncate=False)
        # NOTE: exec_start_time is a module-level timestamp set elsewhere.
        print("read one file use time:" + str(time.time() - exec_start_time))
        # Label must be numeric (double) for downstream ML estimators.
        data = data.withColumn("failure", data["failure"].cast("double"))
        scaler = MaxAbsScaler(inputCol="indexedFeatures", outputCol="features")
        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(data)
        # rescale each feature to range [-1, 1].
        data = scalerModel.transform(data)
        data.show(truncate=False)
        data = data.select("failure", "features")
        # Persist so subsequent calls hit the cache branch above.
        data.write.format("libsvm").save(rdd_name)
        data.show(truncate=False)
    return data
def test_maxabs_scaler(self):
    """Round-trip a fitted MaxAbsScaler through ONNX and compare outputs."""
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([1.0, 0.1, -1.0]),),
         (1, Vectors.dense([2.0, 1.1, 1.0]),),
         (2, Vectors.dense([3.0, 10.1, 3.0]),)],
        ["id", "features"])
    fitted = MaxAbsScaler(inputCol='features', outputCol='scaled_features').fit(df)
    # the input names must match the inputCol(s) above
    onnx_model = convert_sparkml(fitted, 'Sparkml MaxAbsScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(onnx_model is not None)

    # Reference predictions computed by Spark itself.
    spark_out = fitted.transform(df)

    def as_float_rows(series):
        # Explode each vector into a row of float32 values.
        return series.apply(
            lambda v: pandas.Series(v.toArray())).values.astype(numpy.float32)

    expected = as_float_rows(spark_out.toPandas().scaled_features)
    data_np = as_float_rows(df.toPandas().features)

    paths = save_data_models(data_np, expected, fitted, onnx_model,
                             basename="SparkmlMaxAbsScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def max_abs_scale(dataFrame, inputColNames):
    """Assemble *inputColNames* into a vector and scale it into [-1, 1].

    Returns the transformed frame with a "scaled features" column; the
    intermediate "features" vector column is dropped.
    """
    assembled = getAssembledDataFrame(dataFrame, inputColNames)
    model = MaxAbsScaler(inputCol="features",
                         outputCol="scaled features").fit(assembled)
    return model.transform(assembled).drop("features")
def maxAbsScalerModel(df, conf):
    """Fit a MaxAbsScaler configured from a params dict.

    :param df: Spark DataFrame to fit on
    :param conf: configuration dict with optional "inputCol" and
        "outputCol" keys (both default to None)
    :return: (unfitted scaler, fitted model) pair
    """
    scaler = MaxAbsScaler(inputCol=conf.get("inputCol", None),
                          outputCol=conf.get("outputCol", None))
    return scaler, scaler.fit(df)
def test_clear_param(self):
    """Clearing outputCol must fall back to the auto-generated default name."""
    frame = self.spark.createDataFrame(
        [(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
    fitted = MaxAbsScaler(inputCol="a", outputCol="scaled").fit(frame)

    # Explicitly-set value is visible before clearing.
    self.assertTrue(fitted.isSet(fitted.outputCol))
    self.assertEqual(fitted.getOutputCol(), "scaled")

    fitted.clear(fitted.outputCol)
    self.assertFalse(fitted.isSet(fitted.outputCol))
    # Default output column names start with the estimator's UID prefix.
    self.assertEqual(fitted.getOutputCol()[:12], 'MaxAbsScaler')

    # The default name is what actually appears in the output schema.
    result = fitted.transform(frame)
    self.assertEqual(fitted.getOutputCol(), result.schema.names[1])
def max_abs_scaler(self, df, column):
    """Column-wise max-absolute-value normalisation via MaxAbsScaler.

    Adds a ``<column>_maxabs`` vector column with every feature scaled
    into [-1, 1] by its maximum absolute value.
    """
    print('MaxAbsScalerExample')
    model = MaxAbsScaler(inputCol=column,
                         outputCol=column + '_maxabs').fit(df)
    return model.transform(df)
def scaling(dataFrame, inputColName):
    """Scale one numeric column into (-1, 1) as a new float column."""
    outputColName = "scaled " + inputColName
    # MaxAbsScaler works on vector columns, so wrap the scalar first.
    vectorized = VectorAssembler(inputCols=[inputColName],
                                 outputCol="features").transform(dataFrame)
    model = MaxAbsScaler(inputCol="features",
                         outputCol=outputColName).fit(vectorized)
    scaledDF = model.transform(vectorized).drop("features")
    # Unwrap the single-element result vector back into a plain float.
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName,
                                   castVectorToFloat(outputColName))
    print("Successfully scale the column '{0:s}' to range (-1, 1) and create a new column '{1:s}'.".format(inputColName, outputColName))
    return scaledDF
def maxabs_scale(self, columns='*'):
    '''
    Rescale the given columns by dividing each by its max absolute value.

    Each input column gains a ``<col>_scaled`` float companion column.

    :param columns: list of column names, or '*' for every column
    :return: the updated DataFrame (also stored back on self._df)
    :raises TypeError: if *columns* is neither '*' nor a list
    '''
    # Local import keeps the fix self-contained; pyspark is already used here.
    from pyspark.sql.types import FloatType

    if columns == "*":
        columns = self._df.schema.names
    elif not isinstance(columns, list):
        # A real exception beats `assert`, which is stripped under `python -O`.
        raise TypeError("Error: columns argument must be a list!")
    for column in columns:
        outputcol = column + '_scaled'
        # MaxAbsScaler operates on vector columns, so wrap the scalar first.
        assembler = VectorAssembler(inputCols=[column], outputCol='features')
        df = assembler.transform(self._df)
        scaler = MaxAbsScaler(inputCol='features', outputCol=outputcol)
        df = scaler.fit(df).transform(df).drop('features')
        # Declare FloatType explicitly: a bare udf() defaults to StringType,
        # which silently stringifies the scaled values.
        to_float = udf(lambda x: float(x[0]), FloatType())
        self._df = df.withColumn(outputcol, to_float(outputcol))
    return self._df
# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

# Map each feature linearly into the [5, 10] range.
minMax = MinMaxScaler(min=5, max=10, inputCol="features",
                      outputCol="features_minmax_scaled")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

# Divide each feature by its maximum absolute value -> range [-1, 1].
maScaler = MaxAbsScaler(inputCol="features",
                        outputCol="features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Multiply every vector component-wise by a fixed scaling vector.
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct(scalingVec=scaleUpVec, inputCol="features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import MaxAbsScaler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()

    # $example on$
    df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    # Fitting records each feature's maximum absolute value ...
    scaler_model = MaxAbsScaler(inputCol="features",
                                outputCol="scaledFeatures").fit(df)

    # ... and transforming divides by it, mapping features into [-1, 1].
    scaler_model.transform(df).show()
    # $example off$

    spark.stop()
def scaler(input_features):
    """Scale "raw_features" into [-1, 1] and expose it as "features".

    The original "raw_features" column is dropped from the result.
    """
    fitted = MaxAbsScaler(inputCol="raw_features",
                          outputCol="features").fit(input_features)
    return fitted.transform(input_features).drop("raw_features")
# Combine the indexed categorical features into a single vector column.
assembler = VectorAssembler(inputCols=[
    "event_id_index", "age_index", "longitude_index", "device_model_index",
    "latitude_index", "phone_brand_index"
], outputCol="features")
res = assembler.transform(df_r)
# Drop the raw index columns now that they live inside "features".
res = res.drop("event_id_index").drop("age_index").drop(
    "longitude_index").drop("device_model_index").drop("device_id_index").drop(
    "phone_brand_index")
res.show()
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(res)
# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(res)
scaledData.show()
scaledData.select("scaledFeatures").show()
# Cap the sample size to keep training tractable, then split 70/30.
res = scaledData.limit(100000)
(trainingData, testingData) = res.randomSplit([0.7, 0.3])
# Random forest over the scaled features; "group_index" is the label.
rf = RandomForestClassifier(labelCol="group_index", featuresCol="scaledFeatures")
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testingData)
predictions.show()
evaluator = MulticlassClassificationEvaluator(labelCol="group_index",
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

# Map each feature linearly onto [5, 10] (default output column name).
minMax = MinMaxScaler(min=5, max=10, inputCol="features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

# Divide each feature by its largest absolute value -> range [-1, 1].
maScaler = MaxAbsScaler(inputCol="features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Component-wise multiply by a constant scaling vector.
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct(scalingVec=scaleUpVec, inputCol="features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------
def process(self, data_input, data_output):
    """
    A Spark process to do feature engineering.

    Reads the shipments parquet at *data_input*, derives calendar and
    timestamp features, indexes the categorical columns, assembles and
    max-abs-scales a feature vector, and writes the result as parquet
    to *data_output*.

    :param data_input: data input filename
    :param data_output: data output filename
    """
    df = self.spark.read.parquet(data_input).select(
        'SHP_DATE_CREATED_ID', 'SHP_DATETIME_CREATED_ID',
        'SHP_DATE_HANDLING_ID', 'SHP_DATETIME_HANDLING_ID',
        'SHP_SENDER_ID', 'SHP_ORDER_COST', 'CAT_CATEG_ID_L7',
        'SHP_ADD_ZIP_CODE', 'SHP_DATE_SHIPPED_ID',
        'SHP_DATETIME_SHIPPED_ID', 'HT_REAL')
    # 1. SHP_ORDER_COST_INT: cast the float SHP_ORDER_COST column to integer.
    df = df.withColumn("SHP_ORDER_COST_INT", (df["SHP_ORDER_COST"].cast(IntegerType())))
    # 2. SHP_DAY: day of the week on which the payment was credited.
    shp_day_udf = udf(self.shp_day, IntegerType())
    df = df.withColumn('SHP_DAY', shp_day_udf(df['SHP_DATE_HANDLING_ID']))
    # 3. WKND_DAY: flag indicating whether the payment was credited on a weekend.
    weekend_day_udf = udf(self.weekend_day, IntegerType())
    df = df.withColumn('WKND_DAY', weekend_day_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WKND_DAY').show(10)
    # 4. WK_NUM: week of the year in which the payment was made.
    week_number_udf = udf(self.week_number, IntegerType())
    df = df.withColumn('WK_NUM', week_number_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WK_NUM').show(10)
    # 5. MONTH_NUM: month of the payment.
    month_number_udf = udf(self.month_number, IntegerType())
    df = df.withColumn('MONTH_NUM', month_number_udf(df['SHP_DATE_HANDLING_ID']))
    # 6. TIMESTAMP: add timestamps derived from the date columns.
    get_timestamp_udf = udf(self.get_timestamp, IntegerType())
    df = df.withColumn('SHP_DATE_HANDLING_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_HANDLING_ID']))
    df = df.withColumn('SHP_DATE_CREATED_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_CREATED_ID']))
    # HT: handling time computed from the shipped/handling datetimes.
    my_handling_time_udf = udf(self.my_handling_time, IntegerType())
    df = df.withColumn('HT', my_handling_time_udf(array('SHP_DATETIME_SHIPPED_ID', 'SHP_DATETIME_HANDLING_ID')))
    # Index the categorical string columns as numeric features.
    shp_sender_indexer = StringIndexer(inputCol="SHP_SENDER_ID", outputCol="SHP_SENDER_ID_NUM").fit(df)
    df = shp_sender_indexer.transform(df)
    shp_sender_indexer = StringIndexer(inputCol="CAT_CATEG_ID_L7", outputCol="CAT_CATEG_ID_L7_NUM").fit(df)
    df = shp_sender_indexer.transform(df)
    # create the vector assembler
    vec_assembler = VectorAssembler(inputCols=['SHP_DATE_HANDLING_TIMESTAMP', 'SHP_DATE_CREATED_TIMESTAMP',
                                               'SHP_SENDER_ID_NUM', 'CAT_CATEG_ID_L7_NUM', 'SHP_ORDER_COST_INT',
                                               'SHP_DAY', 'WKND_DAY', 'WK_NUM', 'MONTH_NUM', 'SHP_ADD_ZIP_CODE'],
                                    outputCol='features')
    # transform the values
    features_df = vec_assembler.transform(df)
    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    # Compute summary statistics and generate MaxAbsScalerModel
    scalerModel = scaler.fit(features_df)
    # rescale each feature to range [-1, 1].
    scaledData = scalerModel.transform(features_df)
    # Save dataset as parquet
    scaledData.write.format("parquet").mode('overwrite').option("header", "true").save(data_output)
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)","avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Collect the three aggregate signals into one vector column.
dfJoin1 = dfJoin.select("avg-sentiment","avg-followers","avg-volume")
inputFeatures = ["avg-sentiment","avg-followers","avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

#Elbow method
import numpy as np
# NOTE(review): cost[0]/cost[1] stay 0 and k == 10 is never tried —
# presumably intentional for the elbow plot's k range; confirm.
cost = np.zeros(10)
for k in range(2,10):
    kmeans = KMeans().setK(k).setFeaturesCol("scaledFeatures").setPredictionCol("prediction").setMaxIter(1).setSeed(1)
    model = kmeans.fit(scaledData)
    # Within-set sum of squared errors for this choice of k.
    cost[k] = model.computeCost(scaledData)

# COMMAND ----------
@author: luogan """ from pyspark.ml.feature import MaxAbsScaler from pyspark.ml.linalg import Vectors from pyspark.sql import SparkSession spark= SparkSession\ .builder \ .appName("dataFrame") \ .getOrCreate() dataFrame = spark.createDataFrame([( 0, Vectors.dense([1.0, 0.1, -8.0]), ), ( 1, Vectors.dense([2.0, 1.0, -4.0]), ), ( 2, Vectors.dense([4.0, 10.0, 8.0]), )], ["id", "features"]) scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures") # Compute summary statistics and generate MaxAbsScalerModel scalerModel = scaler.fit(dataFrame) # rescale each feature to range [-1, 1]. scaledData = scalerModel.transform(dataFrame) scaledData.select("features", "scaledFeatures").show()
# Per-company slices of the joined sentiment/price frame.
dfGoogle = dfJoin.select('*').where(dfJoin.company == 'GOOGLE')
dfNetflix = dfJoin.select('*').where(dfJoin.company == 'NETFLIX')
dfSnapchat = dfJoin.select('*').where(dfJoin.company == 'SNAPCHAT')
dfMicrosoft = dfJoin.select('*').where(dfJoin.company == 'MICROSOFT')
dfFacebook.describe().toPandas().transpose()
display(dfFacebook)

# COMMAND ----------

#Feature scaling using MaxAbsScaler
# Assemble the three aggregate signals into one vector column.
vectorAssembler = VectorAssembler(
    inputCols=['avg-sentiment', 'avg-followers', 'avg-volume'],
    outputCol='features')
v_dffacebook = vectorAssembler.transform(dfFacebook)
# Rescale each feature into [-1, 1] by its maximum absolute value.
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(v_dffacebook)
scaledData = scalerModel.transform(v_dffacebook)
scaledData.select("features", "scaledFeatures").show()
v_dffacebook1 = scaledData.select(['features', 'scaledFeatures', 'avg-close'])
v_dffacebook1.show()

# COMMAND ----------

#Train test split
train_df, test_df = v_dffacebook1.randomSplit([0.8, 0.2])

# COMMAND ----------

#Linear Regression model
# NOTE(review): trains on the unscaled 'features' column even though
# 'scaledFeatures' was just computed — confirm this is intended.
lr = LinearRegression(featuresCol='features', labelCol='avg-close', maxIter=10)
lr_model = lr.fit(train_df)