import re

import pyspark
from pyspark.sql.functions import (UserDefinedFunction, avg, col, exp, log,
                                   regexp_replace, split, stddev, to_timestamp)
from pyspark.sql.types import DateType, FloatType, StringType


def clean_and_add_date(
    df: pyspark.sql.dataframe.DataFrame, date_generated: list,
    spark: pyspark.sql.session.SparkSession
) -> pyspark.sql.dataframe.DataFrame:
    """
    Add more rows to ensure each item in each store has the full-month records 
       (since if both stock and sales are 0, the raw date can miss the relevant column)
    """
    # Build a one-column DataFrame from the list of dates, which spans the
    # first through the last day of the dataset
    date_df = spark.createDataFrame(date_generated, DateType())
    date_df = date_df.withColumnRenamed("value", "Date")

    # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("dfView")

    # Get a temporary table with the distinct combinations of SKU and Store
    # (equivalent: "SELECT SKU, Store FROM dfView GROUP BY SKU, Store")
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Cross join the two tables to create the full (SKU, Store, Date) schema
    schema = sqlDF.crossJoin(date_df)
    #assert schema.count() == sqlDF.count() * len(date_generated) # check cross join result
    #assert schema.count() >= df.count(), 'We want ' + str(df.count()) + \
    #'row. But we get '+str(schema.count()) # we need add rows

    # Left join the original dataset with the new schema
    df = df.join(schema, on=['Date', 'Store', 'SKU'], how='right')
    #assert df.count() == count # test on overall dataset
    return df
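
# Usage sketch for clean_and_add_date. The date range below and the existing
# `spark` session / raw `df` are illustrative assumptions, not part of the
# original code:
import datetime

start = datetime.date(2019, 4, 1)   # assumed first day of the dataset
end = datetime.date(2019, 9, 30)    # assumed last day of the dataset
date_generated = [start + datetime.timedelta(days=n)
                  for n in range((end - start).days + 1)]
# df = clean_and_add_date(df, date_generated, spark)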
Example #2
def calculate_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the geometirc mean of qtySold and netSale, by adding the new column called `geo_mean`
    """
    # exp(avg(log(x))) computes the geometric mean; alias() gives the result
    # column a readable name directly
    df_geometric_mean = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyQtySold')))).alias(
            'Qty_GeoMean_by_month_Subcat'))

    df_geometric_mean2 = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyNetSale')))).alias(
            'NS_GeoMean_by_month_Subcat'))

    # Join the new columns back onto the original dataset
    df_new = df.join(df_geometric_mean,
                     on=['month', 'SubCategory'],
                     how='inner')
    df_new = df_new.join(df_geometric_mean2,
                         on=['month', 'SubCategory'],
                         how='inner')
    #assert df.count() == df_new.count()
    return df_new
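
# The aggregation above relies on the identity geomean(x) = exp(mean(log(x))).
# Spark's log() returns null for zero or negative inputs, so such rows simply
# drop out of avg(). A quick sanity check of the identity with made-up values:
import math

values = [2.0, 8.0]
geo_mean = math.exp(sum(math.log(v) for v in values) / len(values))
assert abs(geo_mean - 4.0) < 1e-9  # the geometric mean of 2 and 8 is 4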
Example #3
def find_removed_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                      month_merge_early: pyspark.sql.dataframe.DataFrame,
                      dist_df: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    removed item:
       The items are in distribution report, but have no sales from July-Sep (or Sep-Nov)
    """
    # Left join against the late months; items with no late sales get 0
    Removed_df = dist_df.join(month_merge_late, on=["MatID"],
                              how="left").fillna(
                                  0, subset=['totalMonthlyGrossSale'])
    # Keep only items that also appear in the early months (they sold before);
    # select just MatID so the sales columns are not duplicated by the join
    Removed_df = Removed_df.join(month_merge_early.select("MatID").distinct(),
                                 on=["MatID"],
                                 how="inner")
    Removed_item = Removed_df.filter(Removed_df.totalMonthlyGrossSale == 0)
    Removed_item = Removed_item.select(output_columns)
    return Removed_item
Example #4
def find_check_item(month_merge: pyspark.sql.dataframe.DataFrame,
                    dist_df: pyspark.sql.dataframe.DataFrame,
                    output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    checked item:
       The items are in distribution report, but have no sales from Apr-Sep (or Sep-Nov)
    """
    check_df = dist_df.join(month_merge, on=["MatID"], how="left").fillna(
        0, subset=['totalMonthlyGrossSale'])
    check_item = check_df.filter(check_df.totalMonthlyGrossSale == 0)
    check_item = check_item.select(output_columns)
    return check_item
Example #5
def calculate_mean_std_and_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the mean, std and geometric mean of qtySold and netSale for each subcategory and each month
    """
    df_group = df.groupBy('month', 'SubCategory')
    df = calculate_geometric_mean(df)
    df_group_sum = df_group.avg('totalMonthlyQtySold', 'totalMonthlyNetSale')\
    .withColumnRenamed('avg(totalMonthlyQtySold)', "Qty_mean_by_month_Subcat")\
    .withColumnRenamed('avg(totalMonthlyNetSale)', "NS_mean_by_month_Subcat")

    df_group_std = df_group.agg(stddev('totalMonthlyQtySold'))\
    .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_month_Subcat")

    df_group_std2 = df_group.agg(stddev('totalMonthlyNetSale'))\
    .withColumnRenamed('stddev_samp(totalMonthlyNetSale)', "NS_std_by_month_Subcat")

    # join to get final dataset
    df = df.join(df_group_sum, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std2, on=['month', 'SubCategory'], how='inner')
    return df
Example #6
def find_new_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                  dist_df: pyspark.sql.dataframe.DataFrame,
                  output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    new item: 
        The items are not in distribution report, but have sale history from July-Sep (or Sep-Nov)
    """
    New_df = dist_df.join(month_merge_late, on=["MatID"],
                          how="right").fillna(0,
                                              subset=['totalMonthlyGrossSale'])
    New_item = New_df.filter(
        New_df.totalMonthlyGrossSale != 0)  # new items were sold during July-Sep
    New_item = New_item.filter(col(
        "Classification").isNull())  # new items have no classification record
    New_item = New_item.select(output_columns)
    return New_item
Example #7
def Identify_and_output_issused_items(
    month_merge: pyspark.sql.dataframe.DataFrame,
    month_merge_early: pyspark.sql.dataframe.DataFrame,
    month_merge_late: pyspark.sql.dataframe.DataFrame,
    dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Identify 5 kinds of issued items:
       1. check_item
       2. Removed_item
       3. New_item
       4. Incorrect_record_items
       5. Depth2_items
       
    Save them as 5 csv files into Output/IssuedItem folder
    """
    ## Define the output columns we want to keep in the output dataset
    # for check_item, Removed_item, and New_item
    output_columns = [
        'SKU', 'MatID', 'month', 'Price', 'POGS', 'Facings', 'Capacity',
        'Classification', 'SubCategory', 'totalMonthlyNetSale',
        'totalMonthlyQtySold', 'SellMargin'
    ]

    # for Incorrect_record_items, and Depth2_items
    output_columns2 = [
        'MatID', 'SKU', 'Facings', 'Capacity', 'POGS', 'Depth', 'DaysSupply',
        'Classification', 'month', 'SubCategory', 'Vendor',
        'totalMonthlyNetSale', 'totalMonthlyGrossSale', 'avgCOGS',
        'totalMonthlyQtySold', 'Price', 'SellMargin', 'avgFrontMargin'
    ]

    ## Find the check / removed / new items
    check_item = find_check_item(month_merge, dist_df, output_columns)
    Removed_item = find_removed_item(month_merge_late, month_merge_early,
                                     dist_df, output_columns)
    New_item = find_new_item(month_merge_late, dist_df, output_columns)

    ## Removed_item
    Removed_item = Removed_item.toPandas()
    print("Found {} removed items, saving them to "
          "Output/IssuedItem/removed_SKU.csv".format(
              Removed_item['MatID'].nunique()))
    Removed_item.to_csv('../data/Output/IssuedItem/removed_SKU.csv',
                        index=False,
                        encoding='utf-8')

    ## New_item
    New_item = New_item.toPandas()
    print("Found {} new items, saving them to "
          "Output/IssuedItem/new_SKU.csv".format(
              New_item['MatID'].nunique()))
    New_item.to_csv('../data/Output/IssuedItem/new_SKU.csv',
                    index=False,
                    encoding='utf-8')

    ## check_item
    check_item = check_item.toPandas()
    print("Found {} checked items, saving them to "
          "Output/IssuedItem/checked_SKU.csv".format(
              check_item['MatID'].nunique()))
    check_item.to_csv('../data/Output/IssuedItem/checked_SKU.csv',
                      index=False,
                      encoding='utf-8')

    # Join the distribution report with the late-month sales for the remaining checks
    month_merge = dist_df.join(month_merge_late, on="MatID", how="inner")

    ## Incorrect_record_items
    Incorrect_record_items = find_Incorrect_record_items(
        month_merge, output_columns2).toPandas()
    print("Found {} incorrect-record items, saving them to "
          "Output/IssuedItem/Incorrect_record_items.csv".format(
              Incorrect_record_items['MatID'].nunique()))
    Incorrect_record_items.to_csv(
        '../data/Output/IssuedItem/Incorrect_record_items.csv',
        index=False,
        encoding='utf-8')

    ## Depth < 2 items
    Depth2_items = find_Depth2_items(month_merge, output_columns2).toPandas()
    print("Found {} Depth < 2 items, saving them to "
          "Output/IssuedItem/Depth2_items.csv".format(
              Depth2_items['MatID'].nunique()))
    Depth2_items.to_csv('../data/Output/IssuedItem/Depth2_items.csv',
                        index=False,
                        encoding='utf-8')
    return month_merge  # the full joined dataset
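
# find_Incorrect_record_items and find_Depth2_items are project helpers not
# shown in this listing. Based on the name and the "Depth < 2" message above,
# here is a minimal sketch of what find_Depth2_items plausibly does (an
# assumption, not the project's actual implementation):
def find_Depth2_items_sketch(month_merge: pyspark.sql.dataframe.DataFrame,
                             output_columns: list
                             ) -> pyspark.sql.dataframe.DataFrame:
    """Hypothetical: items whose recorded shelf Depth is less than 2."""
    return month_merge.filter(col("Depth") < 2).select(output_columns)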
Example #8
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')
    # Clean data entries: extract the numeric material ID from the SKU string
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from MatID

    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))
    # Keep only the columns we need
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')

    # Keep only the requested store
    df = df.filter(df.Store == store_name)

    # Remove thousands-separator commas from numbers (e.g. '1,333' -> '1333');
    # guard against null entries so the UDF does not raise
    remove_comma = UserDefinedFunction(
        lambda x: re.sub(',', '', x) if x is not None else x, StringType())
    #num_columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = df.select(*[remove_comma(column).alias(column) for column in df.columns])

    # Keep only about half a year of data:
    # convert the Date column (formatted yyyyMM) to a timestamp, then filter
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep by default

    # Split the item name into MatID and SKU
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]",
                                      ""))  # remove letters from MatID
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename columns
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Convert the numeric columns to FloatType()
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # Drop the raw item column (Date was already converted to a timestamp above)
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)

    # Create the derived columns. COGS appears to be stored as a negative
    # amount, so the margins are computed by adding it to sales
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # Add the SubCategory and Vendor columns
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"],
                 how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"],
                 how="left")
    return df
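
# convertColumn (used above) is a project helper not included in this
# listing. A minimal sketch consistent with how it is called here, assuming
# it simply casts each named column:
def convertColumn_sketch(df: pyspark.sql.dataframe.DataFrame, names: list,
                         new_type) -> pyspark.sql.dataframe.DataFrame:
    """Hypothetical stand-in for convertColumn: cast each named column."""
    for name in names:
        df = df.withColumn(name, df[name].cast(new_type))
    return df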