Example #1
0
def calculate_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Attach per-(month, SubCategory) geometric means of quantity sold and net sale.

    For every ``(month, SubCategory)`` group the geometric mean is computed as
    ``exp(avg(log(x)))`` for ``totalMonthlyQtySold`` and
    ``totalMonthlyNetSale``; the two results are then joined back onto the
    rows of ``df``.

    Args:
        df: DataFrame that provides the columns ``month``, ``SubCategory``,
            ``totalMonthlyQtySold`` and ``totalMonthlyNetSale``.

    Returns:
        The rows of ``df`` joined with two additional columns,
        ``Qty_GeoMean_by_month_Subcat`` and ``NS_GeoMean_by_month_Subcat``.

    NOTE(review): values <= 0 have no real logarithm, so their contribution
    depends on how Spark's ``log`` treats them -- confirm the inputs are
    strictly positive or filter them upstream.
    """
    group_keys = ['month', 'SubCategory']

    # Name the aggregates with alias() instead of renaming Spark's
    # auto-generated column names afterwards: the generated text is not stable
    # across Spark versions and withColumnRenamed() silently does nothing when
    # the old name is missing. Both means share one grouping pass.
    group_means = df.groupBy(*group_keys).agg(
        exp(avg(log(col('totalMonthlyQtySold')))).alias(
            'Qty_GeoMean_by_month_Subcat'),
        exp(avg(log(col('totalMonthlyNetSale')))).alias(
            'NS_GeoMean_by_month_Subcat'),
    )

    # Join the group-level means back onto the original rows.
    # NOTE(review): an inner join is expected to keep the row count of df
    # unchanged only if the key columns never contain nulls -- confirm.
    return df.join(group_means, on=group_keys, how='inner')
def remove_no_stock_item(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Keep only the rows whose (SKU, Store) pair has a non-zero total StockQty.

    ``StockQty`` is summed per ``(SKU, Store)`` pair; pairs whose sum is not
    different from zero are discarded, and the remaining pairs are inner-joined
    with ``df`` on ``SKU`` and ``Store``.

    Args:
        df: DataFrame that provides the columns ``SKU``, ``Store`` and
            ``StockQty``.

    Returns:
        The result of joining the surviving ``(SKU, Store)`` pairs with ``df``.
    """
    # Column name produced by the dict-style aggregation below.
    stock_total = 'sum(StockQty)'

    # Total stock per (SKU, Store) pair.
    totals = df.groupBy("SKU", "Store").agg({"StockQty": "sum"})

    # Retain the pairs with a non-zero total, keeping only the key columns.
    stocked_pairs = totals.where(col(stock_total) != 0).drop(stock_total)

    # Restrict the original rows to the retained pairs.
    return stocked_pairs.join(df, on=["SKU", "Store"], how='inner')