Example #1
def normalize(data_file, train_file, test_file):
    """
    归一化
    :param data_file:
    :param train_file:
    :param test_file:
    :return:
    """
    from pyspark.ml.feature import StandardScaler, MinMaxScaler
    sess = get_spark_sesssion()
    # infer the file format (e.g. 'csv') from each file's extension
    dataframe = sess.read.load(data_file, format=os.path.splitext(data_file)[1][1:],
                               header=True, inferSchema=True)
    train_frame = sess.read.load(train_file, format=os.path.splitext(train_file)[1][1:],
                                 header=True, inferSchema=True)
    test_frame = sess.read.load(test_file, format=os.path.splitext(test_file)[1][1:],
                                header=True, inferSchema=True)

    columns = dataframe.columns
    for c in columns:
        if c in real_cnt_feats or c == 'action_installed':
            print(c)
            # fit the scaler on the full dataset, then apply it to both splits
            # (note: pyspark's MinMaxScaler expects a Vector-typed input column,
            # so scalar columns may need to be wrapped in vectors first, as in
            # Example #2 below)
            scaler = MinMaxScaler(inputCol=c, outputCol='std_' + c)
            model = scaler.fit(dataframe)

            train_frame = model.transform(train_frame)
            train_frame = train_frame.drop(c).withColumnRenamed('std_' + c, c)

            test_frame = model.transform(test_frame)
            test_frame = test_frame.drop(c).withColumnRenamed('std_' + c, c)

    save_pandas(train_frame.toPandas(), 'train_nm.csv', index=False)
    save_pandas(test_frame.toPandas(), 'test_nm.csv', index=False)
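
For reference, pyspark's MinMaxScaler operates on a single Vector-typed column rather than on raw numeric columns, which is why Example #2 below wraps doubles into vectors and Example #3 scales a pre-assembled "features" column. Below is a minimal, self-contained sketch of that standard assemble-then-scale pattern; the column names and sample rows are hypothetical and not taken from the snippets on this page.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)], ["a", "b"])

# assemble the numeric columns into a single Vector column
assembler = VectorAssembler(inputCols=["a", "b"], outputCol="features")
assembled = assembler.transform(df)

# fit the scaler and rescale each feature to [0, 1]
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")
scaler.fit(assembled).transform(assembled).show(truncate=False)
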
Example #2

from pyspark.sql import DataFrame
from pyspark.sql.functions import udf, col, format_number
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import VectorUDT


def minMaxScaler(df: DataFrame, inputCol: str) -> DataFrame:
    '''
    Min-max normalize a numeric column.
    :param df: input DataFrame
    :param inputCol: name of the numeric column to scale, e.g. one of
        rating, userRatingCount, userAvgRating, userRatingStddev, userReleaseYearStddev
    :return: the DataFrame with inputCol replaced by its scaled value
    '''
    # MinMax scaling: wrap the scalar column into a one-element vector first,
    # since MinMaxScaler requires a Vector-typed input column
    outputCol = inputCol + 'Scaler'
    # tmpDouble2vec (defined elsewhere) converts a double to a one-element vector
    double2vec = udf(f=tmpDouble2vec, returnType=VectorUDT())
    df = df.withColumn(inputCol, double2vec(col(inputCol)))
    ratingScaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol).fit(df)
    df = ratingScaler.transform(df)

    # extract the scaled value back out of the output vector
    to_array = udf(lambda x: x.toArray().tolist(), ArrayType(FloatType()))
    df = df.withColumn(inputCol, to_array(col(outputCol)).getItem(0))
    df = df.withColumn(inputCol, format_number(col(inputCol),
                                               3)).drop(outputCol)

    return df
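
A hypothetical usage sketch for the minMaxScaler helper above. The sample DataFrame is made up, and tmpDouble2vec is external to the original snippet; it is assumed here to wrap a double into a one-element dense vector.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

def tmpDouble2vec(value):
    # assumed behavior of the original helper: scalar -> one-element vector
    return Vectors.dense([float(value)])

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame([(1, 3.5), (2, 4.0), (3, 1.0)], ["userId", "rating"])
minMaxScaler(ratings, "rating").show()
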
Example #3
# COMMAND ----------

# MAGIC %md ## scale the features and split the data into training and test sets

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
# Initialize the `MinMaxScaler`
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")
# Fit the scaler to the DataFrame
scaler = scaler.fit(df_input)

# COMMAND ----------

# Transform the data in `df_input` with the fitted scaler
scaled_df = scaler.transform(df_input)
scaled_df.first()

# COMMAND ----------

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=7)

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

# Initialize `lr`
lr = LogisticRegression(labelCol="label",
                        featuresCol='features_scaled',
                        maxIter=10,
                        regParam=0.3,
                        elasticNetParam=0.8)
rfr = RandomForestClassifier(labelCol='label', featuresCol='features_scaled')
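
# COMMAND ----------

# MAGIC %md A possible continuation (not part of the original notebook): fit both classifiers on the training split and compare them by area under the ROC curve on the held-out test split. Assumes `label` is a binary 0/1 column.

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
for name, estimator in [("LogisticRegression", lr), ("RandomForest", rfr)]:
    model = estimator.fit(train_data)
    predictions = model.transform(test_data)
    print(name, "AUC =", evaluator.evaluate(predictions))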