def clean_and_add_date(
        df: pyspark.sql.dataframe.DataFrame, date_generated: list,
        spark: pyspark.sql.session.SparkSession
) -> pyspark.sql.dataframe.DataFrame:
    """
    Expand df so that every (SKU, Store) pair has one row per date in
    date_generated.

    The raw extract omits a row whenever both stock and sales are zero, so
    the full-month grid has to be rebuilt and the raw data right-joined
    onto it.
    """
    # Single-column DataFrame holding the full run of dates.
    date_df = spark.createDataFrame(date_generated, DateType()) \
                   .withColumnRenamed("value", "Date")

    # Distinct (SKU, Store) combinations, via a temporary SQL view.
    df.createOrReplaceTempView("dfView")
    pairs = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Cross join builds the complete (SKU, Store, Date) grid.
    full_grid = pairs.crossJoin(date_df)

    # Right join keeps every grid row; dates missing from the raw data
    # appear with null measure columns.
    return df.join(full_grid, on=['Date', 'Store', 'SKU'], how='right')
def calculate_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Add per-(month, SubCategory) geometric means of quantity sold and net
    sale.

    Adds the columns 'Qty_GeoMean_by_month_Subcat' and
    'NS_GeoMean_by_month_Subcat' via inner joins on (month, SubCategory).
    The geometric mean is computed as exp(avg(log(x))).
    """
    # FIX: name the aggregates with .alias() instead of withColumnRenamed()
    # on Spark's auto-generated name ('EXP(avg(LOG(...)))').  withColumnRenamed
    # silently does nothing when the source column name does not match, so the
    # original was fragile across Spark versions that change generated names.
    qty_geo = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyQtySold')))).alias(
            'Qty_GeoMean_by_month_Subcat'))
    ns_geo = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyNetSale')))).alias(
            'NS_GeoMean_by_month_Subcat'))

    # Join the aggregate columns back onto the original dataset.
    df_new = df.join(qty_geo, on=['month', 'SubCategory'], how='inner')
    df_new = df_new.join(ns_geo, on=['month', 'SubCategory'], how='inner')
    return df_new
def find_removed_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                      month_merge_early: pyspark.sql.dataframe.DataFrame,
                      dist_df: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    removed item: The items are in distribution report, but have no sales
    from July-Sep (or Sep-Nov)

    Returns the zero-gross-sale rows of the dist_df/month_merge_early inner
    join, restricted to output_columns.
    """
    # FIX: the original first built a left join against month_merge_late and
    # immediately overwrote the result with the join below, so that first join
    # was dead code; it has been removed (behavior unchanged).
    # NOTE(review): given the docstring, the discarded late-period join may
    # have been the intended one — confirm with the data owner.  The
    # month_merge_late parameter is kept so the signature stays compatible.
    Removed_df = dist_df.join(month_merge_early, on=["MatID"],
                              how="inner").fillna(
                                  0, subset=['totalMonthlyGrossSale'])
    # "Removed" items show zero gross sales in the joined window.
    Removed_item = Removed_df.filter(Removed_df.totalMonthlyGrossSale == 0)
    return Removed_item.select(output_columns)
def find_check_item(month_merge: pyspark.sql.dataframe.DataFrame,
                    dist_df: pyspark.sql.dataframe.DataFrame,
                    output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    checked item: The items are in distribution report, but have no sales
    from Apr-Sep (or Sep-Nov)
    """
    # Left join keeps every distribution-report item; items with no sales
    # rows get a null gross sale, which fillna turns into 0.
    joined = dist_df.join(month_merge, on=["MatID"], how="left")
    joined = joined.fillna(0, subset=['totalMonthlyGrossSale'])

    # Zero gross sales over the window means the item needs checking.
    no_sales = joined.filter(joined.totalMonthlyGrossSale == 0)
    return no_sales.select(output_columns)
def calculate_mean_std_and_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the mean, std and geometric mean of qtySold and netSale for
    each subcategory and each month.

    Adds the columns Qty/NS_mean_by_month_Subcat, Qty/NS_std_by_month_Subcat
    (sample stddev) and, via calculate_geometric_mean, the geometric-mean
    columns — all joined on (month, SubCategory).
    """
    # Group on the input before the geo-mean join adds columns.
    df_group = df.groupby('month', 'SubCategory')
    df = calculate_geometric_mean(df)

    # FIX: name aggregates with .alias() instead of renaming Spark's
    # auto-generated column names ('avg(...)', 'stddev_samp(...)'), which
    # withColumnRenamed would silently skip if the generated name differed.
    df_group_sum = df_group.agg(
        avg('totalMonthlyQtySold').alias('Qty_mean_by_month_Subcat'),
        avg('totalMonthlyNetSale').alias('NS_mean_by_month_Subcat'))
    df_group_std = df_group.agg(
        stddev('totalMonthlyQtySold').alias('Qty_std_by_month_Subcat'))
    df_group_std2 = df_group.agg(
        stddev('totalMonthlyNetSale').alias('NS_std_by_month_Subcat'))

    # Join the statistics back to get the final dataset.
    df = df.join(df_group_sum, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std2, on=['month', 'SubCategory'], how='inner')
    return df
def find_new_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                  dist_df: pyspark.sql.dataframe.DataFrame,
                  output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    new item: The items are not in distribution report, but have sale
    history from July-Sep (or Sep-Nov)
    """
    # Right join keeps every late-period sales row, whether or not the item
    # appears in the distribution report.
    joined = dist_df.join(month_merge_late, on=["MatID"], how="right") \
                    .fillna(0, subset=['totalMonthlyGrossSale'])

    # New item was actually sold during the late window...
    sold = joined.filter(joined.totalMonthlyGrossSale != 0)
    # ...and has no classification record (i.e. absent from the report).
    unclassified = sold.filter(col("Classification").isNull())
    return unclassified.select(output_columns)
def Identify_and_output_issused_items(
        month_merge: pyspark.sql.dataframe.DataFrame,
        month_merge_early: pyspark.sql.dataframe.DataFrame,
        month_merge_late: pyspark.sql.dataframe.DataFrame,
        dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Identify 5 kinds of issued items:
    1. check_item
    2. Removed_item
    3. New_item
    4. Incorrect_record_items
    5. Depth2_items
    Save them as 5 csv files into Output/IssuedItem folder

    Returns the dist_df / month_merge_late inner join (the full dataset).
    """
    ## Define the output columns we want to keep in the output dataset
    # for check_item, Removed_item, and New_item
    output_columns = [
        'SKU', 'MatID', 'month', 'Price', 'POGS', 'Facings', 'Capacity',
        'Classification', 'SubCategory', 'totalMonthlyNetSale',
        'totalMonthlyQtySold', 'SellMargin'
    ]
    # for Incorrect_record_items, and Depth2_items
    output_columns2 = [
        'MatID', 'SKU', 'Facings', 'Capacity', 'POGS', 'Depth', 'DaysSupply',
        'Classification', 'month', 'SubCategory', 'Vendor',
        'totalMonthlyNetSale', 'totalMonthlyGrossSale', 'avgCOGS',
        'totalMonthlyQtySold', 'Price', 'SellMargin', 'avgFrontMargin'
    ]

    # Build the three sales-history-based issue frames.
    check_item = find_check_item(month_merge, dist_df, output_columns)
    Removed_item = find_removed_item(month_merge_late, month_merge_early,
                                     dist_df, output_columns)
    New_item = find_new_item(month_merge_late, dist_df, output_columns)

    # Export in the same order as before: removed, new, checked.
    _export_issue_frame(Removed_item, 'Removed items', 'removed_SKU.csv')
    _export_issue_frame(New_item, 'New items', 'new_SKU.csv')
    _export_issue_frame(check_item, 'checked items', 'checked_SKU.csv')

    # Join the DataFrames to build the full dataset.
    month_merge = dist_df.join(month_merge_late, on="MatID", how="inner")

    ## Incorrect_record_items
    _export_issue_frame(
        find_Incorrect_record_items(month_merge, output_columns2),
        'Incorrect_record_items', 'Incorrect_record_items.csv')
    ## Depth less than 2 items
    _export_issue_frame(find_Depth2_items(month_merge, output_columns2),
                        'Depth < 2 items', 'Depth2_items.csv')

    return month_merge  # full dataset


def _export_issue_frame(sdf: pyspark.sql.dataframe.DataFrame, label: str,
                        filename: str) -> None:
    """Collect sdf to pandas, report its distinct-MatID count, and write it
    to ../data/Output/IssuedItem/<filename> as UTF-8 csv without the index."""
    pdf = sdf.toPandas()
    # FIX (consistency): the original mixed len(unique()) and nunique() for
    # the count; nunique() is used uniformly here (they differ only when
    # MatID contains NaN).
    print("Find {} {}, save them in Output/IssuedItem/{}".format(
        pdf['MatID'].nunique(), label, filename))
    pdf.to_csv('../data/Output/IssuedItem/' + filename,
               index=False,
               encoding='utf-8')
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    """
    Clean the raw sales extract for one store and merge in subcategory and
    vendor lookup information.

    Steps: derive MatID from the lookup tables' SKU codes, keep only rows for
    store_name in [begin_date, end_date), strip thousands separators, parse
    dates, rename/cast measure columns, add Price/FrontMargin/SellMargin, and
    left-join SubCategory and Vendor by MatID.
    """
    # --- lookup tables ---------------------------------------------------
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')

    # SKU entries look like '<id>-<name>': take the id part as MatID and
    # strip the letters Z/N/D/F that can appear in it.
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))
    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))

    # SubCategory/Vendor entries look like '<code>-<name>': keep the name.
    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))

    # --- sales data ------------------------------------------------------
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')
    # Check only one store
    df = df.filter(df.Store == store_name)

    # Remove comma from integer (e.g. 1,333 to 1333) in every column.
    # FIX: renamed the local from 'udf' so it no longer shadows
    # pyspark.sql.functions.udf.
    strip_commas = UserDefinedFunction(lambda x: re.sub(',', '', x),
                                       StringType())
    df = df.select(*[strip_commas(c).alias(c) for c in df.columns])

    # Convert Date to timestamp once, then keep only [begin_date, end_date).
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # separate Item name to SKU and ID
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]",
                                      ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename column
    df = df.withColumnRenamed("Sales Type", "SalesType")
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Convert the measure columns to FloatType().
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # drop unnecessary items
    df = df.drop('item')

    # FIX: the original re-applied to_timestamp(df.Date, "yyyyMM") here even
    # though Date was already parsed above; re-parsing an already-converted
    # timestamp with that pattern is at best redundant and can null the
    # column, so the duplicate call has been removed.

    # Create the new columns
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # add subcategory and vendor columns
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"],
                 how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"],
                 how="left")
    return df