Beispiel #1
0
def find_and_analysis_atLeastOneMonth_SKU(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    For SKU, which "soldQty < capacity" in at least one month,
       1. Calcuaate the average of REAL NS, Standard dev of avg NS;
          REAL NS means that if one item has 0 sale on one month, avg calculation will only consider another 2 months
       2. Calculate Capacity_to_avg_qty, and Facing_to_avg_qty
    
    Output: 2 dataset: 
       1. df_atLeastOneMonth: fulldataset
       2. unchange Depth SKU
       3. Changed Depth SKU
       4. df_full is the combination of unchanged_SKU and changed_SKU
    """

    ## Find at least one month SKU
    df = df.withColumn(
        'qty_less_than_capacity',
        when((col("totalMonthlyQtySold") < col('Capacity')), 1).otherwise(0))
    df_atLeastOneMonth = df.filter(
        df.qty_less_than_capacity ==
        1)  # find SKU which qtySold> capacity at least on month

    ## Calculate the average of REAL NS;
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')  # Group by each SKU
    ## get the average net-sales of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold'))\
    .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join datasets
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           (col('Capacity') / col("AvgQtySold")))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           (col('Facings') / col("AvgQtySold")))
    # Calculate the ratio of average qty sold to the std of SKU
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           (col('Qty_std_by_SKU') / col("AvgQtySold")))

    # if no standard derivation, means that this SKU is sold only one month
    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()
    # separate SKU to 2 groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)

    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
Beispiel #2
0
def find_Incorrect_record_items(
        month_merge: pyspark.sql.dataframe.DataFrame,
        output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    The items has extremely high ratio of capacity/facing. (Ratio >6)
    """
    Incorrect_record_items = month_merge.filter(
        col('Capacity') / col('Facings') > 6)
    Incorrect_record_items = Incorrect_record_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Incorrect_record_items
Beispiel #3
0
def find_Depth2_items(month_merge: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    same as Capacity < Facings*2. They are issued items: for example:
       1. Capacity = Facings = 1, incorrect
       2. Facing = 3, Capacity = 5, incorrect
       3. Capacity should > Facing. 
    """
    Depth2_items = month_merge.filter(col('Capacity') / col('Facings') < 2)
    Depth2_items = Depth2_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Depth2_items
Beispiel #4
0
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment
    data profiling using entropy-based methods.

    If no target column provided, maximum entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
    default 30
    :param include_columns: additional non-string columns to consider in automatic segmentation. Warning: high cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {col for col in df.columns if (df.select(col).dtypes[0][1] == "string" or col in include_columns)}

    print("Gathering cardinality information...")
    n_uniques = {col: df.agg(F.approx_count_distinct(col)).collect()[0][0] for col in valid_column_names}
    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]

    print(f"Using {target_field} column as target field.")
    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)

    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {col for col in valid_column_names if (col not in segments and n_uniques[col] * segments_used <= (max_segments - segments_used))}
        _, segment_column_name = _find_best_split(
            countdf, current_split_columns, list(valid_column_names), target_column_name=target_field, normalization=n_uniques
        )

        if not segment_column_name:
            break

        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
Beispiel #5
0
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')
    # clean data entry: remove ID
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))
    # filter data
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')

    # Check only one store
    df = df.filter(df.Store == store_name)

    # Remove comma from integer (e.g. 1,333 to 1333)
    udf = UserDefinedFunction(lambda x: re.sub(',', '', x), StringType())
    #num_columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = df.select(*[udf(column).alias(column) for column in df.columns])

    # filter data, and keep only half years
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # separate Item name to SKU and ID
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]",
                                      ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename column
    df = df.withColumnRenamed("Sales Type", "SalesType")
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Assign all column names to `columns`
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    # Conver the `df` columns to `FloatType()`
    df = convertColumn(df, columns, FloatType())

    # drop unnecessary items
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))

    # Create the new columns
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # add subcategory column
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"],
                 how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"],
                 how="left")
    return df
Beispiel #6
0
def compute_summary(df: pyspark.sql.dataframe.DataFrame,
                    valuecol: str = 'amtallowed',
                    groupbycol=None,
                    percentiles=[]):
    """Compute summary statistics.
    
    Computes following columns by default for each category in groupbycol:
      
        - default percentiles: [.01,  .25, .50, .75, .99]   
        - mean
        - num_zeros
        - percent_zeros
        - stddev
        - sum
    
    Args:
        df (`pyspark.sql.dataframe.DataFrame`) 
        valuecol (str) : A numeric column to compute statistics on.
        groupbycol (str) : A categorical column to group the statistics on.
        percentiles (list) : list of percentiles in addition to default percentiles
        
    Returns:
        `pyspark.sql.dataframe.DataFrame`
    """
    assert groupbycol is not None, 'groupby should be a column name (str) in dataframe, not NoneType'
    assert isinstance(
        groupbycol,
        str), 'groupbycol should be a single column name of type str'

    default_percentiles = [.01, .25, .50, .75, .99]
    percentiles = sorted(list(set(default_percentiles + percentiles)))

    # percentile_cols = [f'p{str(p).split(".")[-1]}' for p in percentiles]
    # formatted version - keeps exactly 2 decimal places, above version drops trailing zeros
    def make_alias(p):
        """Returns formatted column name
        
        Example:
            >>>make_alias(.2)
            'p20'
            >>>make_alias(.20)
            'p20'
            >>>make_alias(.201)
            'p20'
        """
        return 'p' + f'{p:.2f}'[-2:]

    percentile_cols = [make_alias(p) for p in percentiles]

    dfagg = (
        df.filter(F.col(valuecol) >= 0).withColumn(
            'is_zero',
            F.when(F.col(valuecol) <= 1e-2,
                   1).otherwise(0)).groupby(groupbycol).agg(
                       F.sum(valuecol).alias('sum'),
                       F.count(groupbycol).alias('count'),
                       *[
                           get_percentile(valuecol, p, make_alias(p))
                           for p in percentiles
                       ],  # expand list of exprs
                       F.mean(valuecol).alias('mean'),
                       F.stddev(valuecol).alias('stddev'),
                       F.sum('is_zero').alias('num_zeros')).withColumn(
                           'percent_zeros',
                           F.col('num_zeros') / F.col('count') * 100).select(
                               groupbycol,
                               'sum',
                               'count',
                               'mean',
                               *percentile_cols,  # expand list of columns
                               'stddev',
                               'num_zeros',
                               'percent_zeros')  #'providerzip', 
    )
    return dfagg