Exemple #1
0
def clean_dist_df(
    dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # filter data
    dist_df = dist_df.select("Name", "Facings", "Capacity", 'Days Supply',
                             'Classification', 'Mat ID', '# POGs')

    ### Rename column
    dist_df = dist_df.withColumnRenamed("Name", "SKU")
    dist_df = dist_df.withColumnRenamed("Days Supply", "DaysSupply")
    dist_df = dist_df.withColumnRenamed("Mat ID", "MatID")
    dist_df = dist_df.withColumnRenamed("# POGs", "POGS")

    # Conver columns to `FloatType()`
    dist_df = dist_df.withColumn("Facings", dist_df.Facings.cast('float'))
    dist_df = dist_df.withColumn("Capacity", dist_df.Capacity.cast('float'))
    dist_df = dist_df.withColumn("DaysSupply",
                                 dist_df.DaysSupply.cast('float'))
    dist_df = dist_df.withColumn("MatID", dist_df.MatID.cast('integer'))
    dist_df = dist_df.withColumn("POGS", dist_df.POGS.cast('integer'))
    return dist_df
def clean_dataset(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:

    ## Select the target features
    df = df.select('Index', 'Date Detail', 'Company', 'Business Unit',
                   'Concept_NEW', 'Product Category',
                   'Company and Cost Centre', 'SKU', 'POS Net Sales',
                   'Rank Total')

    ## Reanme columns
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Date Detail", "Date")
    df = df.withColumnRenamed("Product Category", "Category")
    df = df.withColumnRenamed("Company and Cost Centre", "Store")
    df = df.withColumnRenamed("Business Unit", "BusinessUnit")
    df = df.withColumnRenamed("Rank Total", "rank")

    ## Column type cast
    columns = ['NetSales', 'rank']
    df = convertColumn(df, columns, FloatType())
    # Replace none to 0
    df = df.na.fill(0)
    return df
def clean_data(df: pyspark.sql.dataframe.DataFrame,
               spark: pyspark.sql.session.SparkSession):
    """
    Apply data processing. 
        1)  Rename columns name
        2)  Columns type cast
        3)  Remove the closed store
        4)  Short SKU name by removing itemID
        5)  Remove items if no sales in the whole month, since they are not OOS
        6)  Remove items if no stock in the whole month, since they are not OOS
        7)  Add more rows to ensure each item in each store has the full-month records
        8)  Replace none to 0
        9)  Convert float number between -1 and 1 to 0
        10)  Save the cleaned dataset
    """

    ### 1)  Rename column
    df = df.withColumnRenamed("POS Margin on Net Sales", "Margin")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Stock Balance Qty", "StockQty")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")

    # 2)  Conver the `df` columns to `FloatType()`
    columns = ['NetSales', 'QtySold', 'Margin', 'StockQty']
    df = convertColumn(df, columns, FloatType())
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMMdd"))

    # 3)  Remove the closed store
    df = remove_closed_store(df)

    # 4)  Short SKU name by removing itemID
    """
    short_column_udf = udf(lambda name: short_column(name), StringType())
    count = df.count()
    df = df.withColumn("SKU", short_column_udf(col("SKU")))
    assert df.count() == count, "Some error here" # test on overall dataset
    print(df.count())
    """

    # 5)  Remove items if no sales in the whole month, since they are not OOS
    df = remove_no_sale_item(df)

    # 6)  Remove items if no stock in the whole month, since they are not OOS
    df = remove_no_stock_item(df)

    # 7)  Add more rows to ensure each item in each store has the full-month records
    date_generated = create_list_dates(df)
    df = clean_and_add_date(df, date_generated, spark)

    # 8)  Replace none to 0
    df = df.fillna(0)

    # 9)  convert float number between -1 and 1 to 0
    #clean_numeric_column_udf = udf(lambda name: clean_numeric_column(name), FloatType())
    #df = df.withColumn("StockQty", clean_numeric_column(col("StockQty")))

    # 10)  save the cleaned dataset, overwrite the old one.
    #df.coalesce(1).write.option("header", "true").mode('overwrite').csv("../data/cleanedData") # only specify folder name
    print("Data processing finished.")

    return df, date_generated
Exemple #4
0
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')
    # clean data entry: remove ID
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))
    # filter data
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')

    # Check only one store
    df = df.filter(df.Store == store_name)

    # Remove comma from integer (e.g. 1,333 to 1333)
    udf = UserDefinedFunction(lambda x: re.sub(',', '', x), StringType())
    #num_columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = df.select(*[udf(column).alias(column) for column in df.columns])

    # filter data, and keep only half years
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # separate Item name to SKU and ID
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]",
                                      ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename column
    df = df.withColumnRenamed("Sales Type", "SalesType")
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Assign all column names to `columns`
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    # Conver the `df` columns to `FloatType()`
    df = convertColumn(df, columns, FloatType())

    # drop unnecessary items
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))

    # Create the new columns
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # add subcategory column
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"],
                 how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"],
                 how="left")
    return df