def normalize(data_file, train_file, test_file):
    """
    Min-max normalization of the real-valued count features.
    Fits one scaler per column on the full dataset, then applies the same
    scaling to the train and test splits.
    :param data_file: full dataset used to fit the scalers
    :param train_file: training split to transform
    :param test_file: test split to transform
    :return:
    """
    import os
    from pyspark.ml.feature import MinMaxScaler

    # get_spark_sesssion, real_cnt_feats and save_pandas are module-level
    # helpers defined elsewhere in this project.
    sess = get_spark_sesssion()
    dataframe = sess.read.load(
        data_file, format=os.path.splitext(data_file)[1][1:],
        header=True, inferSchema=True)
    # Bug fix: the original loaded data_file for all three frames; the train
    # and test splits must be read from their own paths.
    train_frame = sess.read.load(
        train_file, format=os.path.splitext(train_file)[1][1:],
        header=True, inferSchema=True)
    test_frame = sess.read.load(
        test_file, format=os.path.splitext(test_file)[1][1:],
        header=True, inferSchema=True)

    for c in dataframe.columns:
        if c in real_cnt_feats or c == 'action_installed':
            print(c)
            # Note: Spark's MinMaxScaler expects a Vector input column, so the
            # selected columns must be vector-typed (see the double2vec
            # workaround in minMaxScaler below for plain numeric columns).
            model = MinMaxScaler(inputCol=c, outputCol='std_' + c)
            model = model.fit(dataframe)
            train_frame = model.transform(train_frame)
            train_frame = train_frame.drop(c).withColumnRenamed('std_' + c, c)
            test_frame = model.transform(test_frame)
            test_frame = test_frame.drop(c).withColumnRenamed('std_' + c, c)

    save_pandas(train_frame.toPandas(), 'train_nm.csv', index=False)
    save_pandas(test_frame.toPandas(), 'test_nm.csv', index=False)
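# The selected columns in normalize() are assumed to be vector-typed; for a
# plain numeric column, a self-contained min-max pass looks like the sketch
# below (session, data, and column names are made up for illustration, and
# vector_to_array needs Spark 3.0+). VectorAssembler wraps each scalar into
# the one-element vector that MinMaxScaler requires.
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array

spark = SparkSession.builder.appName('minmax-demo').getOrCreate()
df = spark.createDataFrame([(0.0,), (5.0,), (10.0,)], ['click_cnt'])

assembled = VectorAssembler(inputCols=['click_cnt'],
                            outputCol='click_cnt_vec').transform(df)
scaler = MinMaxScaler(inputCol='click_cnt_vec',
                      outputCol='click_cnt_scaled').fit(assembled)
# Unpack the one-element result vector back into a plain double column
scaler.transform(assembled).select(
    vector_to_array('click_cnt_scaled')[0].alias('click_cnt')).show()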
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, format_number, udf
from pyspark.sql.types import ArrayType, FloatType


def minMaxScaler(df: DataFrame, inputCol) -> DataFrame:
    '''
    Min-max normalization for a numeric column.
    :param df:
    :return: rating userRatingCount userAvgRating userRatingStddev userReleaseYearStddev
    '''
    # MinMaxScaler only accepts Vector columns, so first wrap the scalar
    # value (tmpDouble2vec is a helper that converts a double into a
    # one-element vector).
    outputCol = inputCol + 'Scaler'
    double2vec = udf(f=tmpDouble2vec, returnType=VectorUDT())
    df = df.withColumn(inputCol, double2vec(col(inputCol)))
    ratingScaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol).fit(df)
    df = ratingScaler.transform(df)
    # Unpack the scaled value from the output vector and round to 3 decimals
    to_array = udf(lambda x: x.toArray().tolist(), ArrayType(FloatType()))
    df = df.withColumn(inputCol, to_array(col(outputCol)).getItem(0))
    df = df.withColumn(inputCol, format_number(col(inputCol), 3)).drop(outputCol)
    return df
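# Usage sketch for minMaxScaler(), with assumptions: an active SparkSession,
# made-up sample data, and a stand-in for tmpDouble2vec, which the function
# references but which is defined elsewhere in the original project.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

def tmpDouble2vec(value):
    # Plausible stand-in: wrap a scalar into the one-element DenseVector
    # that MinMaxScaler expects.
    return Vectors.dense(float(value))

spark = SparkSession.builder.appName('minmax-udf-demo').getOrCreate()
sample = spark.createDataFrame([(1, 3.5), (2, 2.0), (3, 5.0)],
                               ['movieId', 'rating'])
# `rating` is rescaled in place to [0, 1] and formatted to 3 decimal places
minMaxScaler(sample, 'rating').show()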
# COMMAND ----------

# MAGIC %md ## scale the features, then split the data into training and test sets

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

# Initialize the `MinMaxScaler`
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Fit the scaler to the DataFrame
scaler = scaler.fit(df_input)

# COMMAND ----------

# Transform the data in `df_input` with the scaler
scaled_df = scaler.transform(df_input)
scaled_df.first()

# COMMAND ----------

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=7)

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

# Initialize the logistic regression and random forest classifiers
lr = LogisticRegression(labelCol="label", featuresCol='features_scaled',
                        maxIter=10, regParam=0.3, elasticNetParam=0.8)
rfr = RandomForestClassifier(labelCol='label', featuresCol='features_scaled')
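# COMMAND ----------

# MAGIC %md A possible continuation (a sketch, not part of the original notebook): fit both classifiers on the training split and compare their AUC on the held-out test data.

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_model = lr.fit(train_data)
rfr_model = rfr.fit(train_data)

# Both models emit a rawPrediction column, which the evaluator reads by default
evaluator = BinaryClassificationEvaluator(labelCol='label',
                                          metricName='areaUnderROC')
print('LogisticRegression AUC:', evaluator.evaluate(lr_model.transform(test_data)))
print('RandomForest AUC:', evaluator.evaluate(rfr_model.transform(test_data)))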