Example #1
def _simple_entropy(df: pyspark.sql.dataframe.DataFrame, column_name: str) -> float:
    count = df.count()
    testdf = df.select(column_name).groupby(column_name).agg((F.count(column_name) / count).alias("p"))
    result = testdf.groupby().agg(-F.sum(F.col("p") * F.log2("p"))).collect()[0][0]
    if not result:
        return 0.0
    return result
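A minimal usage sketch (an illustration, not part of the original example), assuming an active SparkSession named spark and the pyspark.sql.functions-as-F import used above; a 3-to-1 label split comes out near 0.81 bits.
demo_df = spark.createDataFrame([("a",), ("a",), ("a",), ("b",)], ["label"])
print(_simple_entropy(demo_df, "label"))  # -(0.75*log2(0.75) + 0.25*log2(0.25)) ~= 0.811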
Example #2
    def transform_spark_df(self,
                           sdf: pyspark.sql.dataframe.DataFrame,
                           path_to_write: str,
                           parquet_write_mode: str = 'append',
                           repartition_val: int = 22) -> None:
        for categ_col in [x for x in sdf.columns if x in self.features_to_transform]:

            if f'transform_{categ_col}' not in globals().keys() or True:
                tr_func = self.transform_func.format(
                    transformer_path=self.transformer_path,
                    col_name=categ_col,
                    own_module=self.own_module
                )

                exec(f'global transform_{categ_col}\n{tr_func}')

            sdf = eval(f'''sdf.withColumn(
                '{categ_col}',
                transform_{categ_col}(*[F.lower(F.regexp_replace(F.col('{categ_col}'), ' ', ''))]))'''
                       )

        sdf = sdf.cache()

        sdf.repartition(repartition_val).write \
            .mode(parquet_write_mode) \
            .option('compression', 'none') \
            .parquet(path_to_write)
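The exec/eval indirection above builds one transform function per categorical column from a code template. A less dynamic sketch of the same idea, assuming the fitted transformer can be loaded as an ordinary Python object with a .transform(value) method; the load_transformer helper named here is hypothetical, not the author's API.
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def make_transform_udf(transformer):
    # wrap the fitted transformer in a per-row UDF that returns a string
    return F.udf(lambda value: transformer.transform(value), StringType())

# for categ_col in features_to_transform:
#     transformer = load_transformer(transformer_path, categ_col)  # hypothetical loader
#     sdf = sdf.withColumn(
#         categ_col,
#         make_transform_udf(transformer)(
#             F.lower(F.regexp_replace(F.col(categ_col), ' ', ''))))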
Example #3
def clean_and_add_date(
    df: pyspark.sql.dataframe.DataFrame, date_generated: list,
    spark: pyspark.sql.session.SparkSession
) -> pyspark.sql.dataframe.DataFrame:
    """
    Add rows so that each item in each store has a record for every date in the period
       (if both stock and sales are 0 on a day, the raw data may be missing that row entirely)
    """
    # Convert the pre-generated list of dates (first day to last day of the dataset) into a DataFrame
    date_df = spark.createDataFrame(date_generated,
                                    DateType())  # create a Date df
    date_df = date_df.withColumnRenamed("value", "Date")

    # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("dfView")

    # get temporary table with distinct combination of SKU and Store
    ##sqlDF = spark.sql("SELECT SKU, Store FROM dfView GROUP BY SKU, Store") # same
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Cross join the two datasets to create the full schema
    schema = sqlDF.crossJoin(date_df)  # builds the full (SKU, Store) x Date grid
    #assert schema.count() == sqlDF.count() * len(date_generated) # check cross join result
    #assert schema.count() >= df.count(), 'We want ' + str(df.count()) + \
    #'row. But we get '+str(schema.count()) # we need add rows

    # right join the original dataset onto the new schema (keep every (Date, Store, SKU) combination)
    df = df.join(schema, on=['Date', 'Store', 'SKU'], how='right')
    #assert df.count() == count # test on overall dataset
    return df
Example #4
def calculate_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the geometric mean of qtySold and netSale per month and SubCategory,
    adding the columns Qty_GeoMean_by_month_Subcat and NS_GeoMean_by_month_Subcat
    """
    df_geometric_mean = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyQtySold')))))
    df_geometric_mean = df_geometric_mean.withColumnRenamed(
        'EXP(avg(LOG(totalMonthlyQtySold)))', 'Qty_GeoMean_by_month_Subcat')

    df_geometric_mean2 = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyNetSale')))))
    df_geometric_mean2 = df_geometric_mean2.withColumnRenamed(
        'EXP(avg(LOG(totalMonthlyNetSale)))', 'NS_GeoMean_by_month_Subcat')

    # join the column to the original dataset
    df_new = df.join(df_geometric_mean,
                     on=['month', 'SubCategory'],
                     how='inner')
    df_new = df_new.join(df_geometric_mean2,
                         on=['month', 'SubCategory'],
                         how='inner')
    #assert df.count() == df_new.count()
    return df_new
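Renaming the auto-generated EXP(avg(LOG(...))) column names is fragile across Spark versions; below is a sketch of the same computation that aliases the aggregates directly (the function name is just for illustration, and the pyspark.sql.functions imports used above are assumed).
def calculate_geometric_mean_aliased(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # alias the aggregate expressions instead of renaming generated column names
    geo = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyQtySold')))).alias('Qty_GeoMean_by_month_Subcat'),
        exp(avg(log(col('totalMonthlyNetSale')))).alias('NS_GeoMean_by_month_Subcat'))
    return df.join(geo, on=['month', 'SubCategory'], how='inner')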
Example #5
def find_and_analysis_atLeastOneMonth_SKU(
        df: pyspark.sql.dataframe.DataFrame
) -> tuple:
    """
    For SKUs with "soldQty < capacity" in at least one month:
       1. Calculate the average of REAL NS and the standard deviation of avg NS;
          REAL NS means that if an item has 0 sales in one month, the average only considers the other 2 months
       2. Calculate Capacity_to_avg_qty and Facing_to_avg_qty

    Output: 4 datasets:
       1. df_atLeastOneMonth: the full dataset
       2. unchanged_SKU: SKUs whose Depth is unchanged
       3. changed_SKU: SKUs whose Depth is changed
       4. df_full: the combination of unchanged_SKU and changed_SKU
    """

    ## Find at least one month SKU
    df = df.withColumn(
        'qty_less_than_capacity',
        when((col("totalMonthlyQtySold") < col('Capacity')), 1).otherwise(0))
    df_atLeastOneMonth = df.filter(
        df.qty_less_than_capacity ==
        1)  # SKUs with qtySold < capacity in at least one month

    ## Calculate the average of REAL NS;
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')  # Group by each SKU
    ## get the average net-sales of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold'))\
    .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join datasets
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           (col('Capacity') / col("AvgQtySold")))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           (col('Facings') / col("AvgQtySold")))
    # Calculate the ratio of average qty sold to the std of SKU
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           (col('Qty_std_by_SKU') / col("AvgQtySold")))

    # if there is no standard deviation, this SKU was sold in only one month
    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()
    # separate SKU to 2 groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)

    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
Example #6
def get_store_item_concept_list(df: pyspark.sql.dataframe.DataFrame,
                                spark) -> list:
    """
    Get the list of combinations of SKU, concept in stores
    """
    # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("dfView")
    # Query and create new dataframe
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store, Concept_NEW FROM dfView")
    store_item_list = sqlDF.rdd.map(tuple).collect()
    return store_item_list
Example #7
def calculate_Capacity_to_sales(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    1. Capacity / Qty sold
    2. Capacity / NetSales
    """
    df = df.withColumn("Capacity_to_qty",
                       (df.Capacity / df.totalMonthlyQtySold))
    df = df.withColumn("Capacity_to_sales",
                       (df.Capacity / df.totalMonthlyNetSale))
    return df
Example #8
def find_and_analysis_fullMonth_SKU(
        df_atLeastOneMonth: pyspark.sql.dataframe.DataFrame, split_month: int,
        spark) -> pyspark.sql.dataframe.DataFrame:
    """
    Find SKU, which "soldQty < capacity" in every month
    """
    full_month_items = select_full_month_item(
        df_atLeastOneMonth.toPandas(),
        month_list=[split_month, split_month + 1,
                    split_month + 2])  # three month data since split_month
    full_month_SKU_info = get_full_month_SKU_info(
        full_month_items, df_atLeastOneMonth.select(selected_column_fullMonth),
        spark).dropDuplicates()

    return full_month_SKU_info
Example #9
def calculate_Depths(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Depth =  Capacity / Facings
    ProposedDepth = 3, if Depth >= 4. Otherwise empty
    VarianceDepth = ProposedDepth - Depth (should be negative)
    """
    df = df.withColumn("Depth", (df.Capacity / df.Facings))
    df = df.withColumn("ProposedDepth",
                       when(col('Depth') >= 4, 3).otherwise(''))
    df = df.withColumn(
        "VarianceDepth",
        when(col('Depth') >= 4, (df.ProposedDepth - df.Depth)).otherwise(''))
    return df
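Because .otherwise('') mixes strings into otherwise numeric columns, here is a variant sketch (illustrative, not the original author's code) that keeps ProposedDepth and VarianceDepth numeric by using None for the empty case; when and col are assumed imported from pyspark.sql.functions as above.
def calculate_Depths_numeric(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    df = df.withColumn("Depth", df.Capacity / df.Facings)
    # keep the columns numeric: use NULL instead of an empty string
    df = df.withColumn("ProposedDepth",
                       when(col('Depth') >= 4, 3).otherwise(None))
    df = df.withColumn("VarianceDepth",
                       when(col('Depth') >= 4,
                            col('ProposedDepth') - col('Depth')).otherwise(None))
    return df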
Example #10
def get_concept_list(df: pyspark.sql.dataframe.DataFrame) -> list:
    # Query and create new dataframe
    concept_list = [
        row.Concept_NEW
        for row in df.select("Concept_NEW").distinct().collect()
    ]
    return concept_list
Example #11
def get_repartition_value(sdf: pyspark.sql.dataframe.DataFrame,
                          target_size: int = 245,
                          compression: str = 'none') -> int:
    length = sdf.count()
    # write a 10k-row sample to HDFS to estimate the on-disk size per row
    sample_df = sdf.limit(int(1e4))
    tmp_file_name = 'test_file'
    while check_hdfs_file_ex(tmp_file_name):
        tmp_file_name += '_'
    sample_df.coalesce(1).write.option('compression', compression)\
        .mode('overwrite').parquet(tmp_file_name)
    sample_byte_weight = int(sh.hdfs('dfs', '-du', tmp_file_name)\
        .stdout.decode('utf-8').split('\n')[-2].split(' ')[0])
    sh.hdfs('dfs', '-rm', '-R', '-skipTrash', tmp_file_name)
    # bytes per row * total rows / (target_size MiB) -> number of output partitions
    nd_rep_val = int(sample_byte_weight * length / target_size / (1024 * 1024) /
                     1e4)
    return 1 if nd_rep_val < 1 else nd_rep_val
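A usage sketch tying this to the parquet writer in Example #2 (the output path is a placeholder; sdf is assumed to be an existing Spark DataFrame): estimate the partition count for roughly target_size-MiB files, then write.
rep_val = get_repartition_value(sdf, target_size=245, compression='none')
sdf.repartition(rep_val).write \
    .mode('append') \
    .option('compression', 'none') \
    .parquet('path/to/output')  # placeholder path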
Example #12
def _find_best_split(
    countdf: pyspark.sql.dataframe.DataFrame,
    prev_split_columns: List[str],
    valid_column_names: List[str],
    target_column_name: str,
    normalization: Optional[Dict[str, int]] = None,
) -> Tuple[float, str]:
    total_count = countdf.count()

    max_score_tuple = 0.0, None
    pre_split_entropy = _weighted_entropy(countdf, total_count, prev_split_columns, target_column_name, True)

    for column_name in valid_column_names:
        if column_name == target_column_name:
            continue
        new_split_columns = prev_split_columns[:]
        new_split_columns.append(column_name)
        post_split_entropy = _weighted_entropy(countdf, total_count, new_split_columns, target_column_name, True)
        value = pre_split_entropy - post_split_entropy

        if normalization and normalization[column_name] > 0:
            value /= math.log(normalization[column_name])

        if value > max_score_tuple[0]:
            max_score_tuple = value, column_name

    return max_score_tuple
Example #13
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: object,
                  newType: object) -> object:
    """
    Convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
Example #14
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: list,
                  newType) -> pyspark.sql.dataframe.DataFrame:
    """
    A custom function to convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
Example #15
def find_removed_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                      month_merge_early: pyspark.sql.dataframe.DataFrame,
                      dist_df: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    removed item:
       The items appear in the distribution report but have no sales from July-Sep (or Sep-Nov)
    """
    Removed_df = dist_df.join(month_merge_late, on=["MatID"],
                              how="left").fillna(
                                  0, subset=['totalMonthlyGrossSale'])
    # keep only items that also appear in the early period
    Removed_df = Removed_df.join(month_merge_early.select("MatID").distinct(),
                                 on=["MatID"], how="inner")
    Removed_item = Removed_df.filter(Removed_df.totalMonthlyGrossSale == 0)
    Removed_item = Removed_item.select(output_columns)
    return Removed_item
Example #16
def remove_no_stock_item(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # keep only (SKU, Store) pairs whose total StockQty is non-zero
    has_stock_item = df.groupBy("SKU", "Store").agg({
        "StockQty": "sum"
    }).filter(col('sum(StockQty)') != 0).drop('sum(StockQty)')

    new_df = has_stock_item.join(df, on=["SKU", "Store"], how='inner')
    return new_df
Example #17
def create_list_dates(df: pyspark.sql.dataframe.DataFrame) -> list:
    """
    Create a list of dates, 
        start from the first day of dataset
        end with the last day of dataset
    
    :param df: dataframe
    :return: a list of dates
    """
    end = df.agg({"Date": "max"}).collect()[0][0] + timedelta(days=1)
    start = df.agg({"Date": "min"}).collect()[0][0]
    date_generated = [
        start + timedelta(days=x) for x in range(0, (end - start).days)
    ]

    # Test the output
    #test_list_dates(date_generated, end, start)
    return date_generated
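A usage sketch combining this helper with clean_and_add_date shown earlier (a df with Date/Store/SKU columns and an active spark session are assumed):
date_generated = create_list_dates(df)
df_full = clean_and_add_date(df, date_generated, spark)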
Example #18
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame,
                     cols: list,
                     result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all arrays into one `ArrayType[StructType]`

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns: `df.withColumn(result_name, zip(explode(cols)))`

    """
    df = df.withColumn(result_name, f.explode(f.arrays_zip(*cols)))

    if rename_fields:  # create schema of new struct by simply renaming the top-level struct fields
        old_schema: t.StructType = df.schema[result_name].dataType

        # rename a field if it appears in `rename_fields`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]

        new_schema = t.StructType([
            t.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, f.col(result_name).cast(new_schema))

        # # old method using withColumn and a new struct; breaks with PySpark 3.0
        # df = df.withColumn(target_struct, f.struct(*[
        #     f.col(target_struct + "." + actualName).alias(targetName)
        #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
        # ]))

    return df
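A usage sketch (assuming an active SparkSession spark): zip two equally sized array columns into one exploded struct column and rename its fields.
df = spark.createDataFrame([(1, [10, 20], ["a", "b"])], ["id", "values", "labels"])
exploded = zip_explode_cols(df, cols=["values", "labels"], result_name="pair",
                            rename_fields={"values": "value", "labels": "label"})
exploded.select("id", "pair.value", "pair.label").show()
# two rows: (1, 10, "a") and (1, 20, "b")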
Example #19
def merge_dataset(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # Generate sale information for each product in each month
    month_df = df.select('MatID', "SKU",
                         year("Date").alias('year'),
                         month("Date").alias('month'), 'GrossSales',
                         'NetSales', 'COGS', 'QtySold', 'Price', 'SellMargin',
                         'FrontMargin', 'SubCategory',
                         'Vendor').groupBy("month", 'MatID', "SubCategory",
                                           'Vendor')
    ## get the average net-sales of each product
    month_avg_NetSale = month_df.avg("NetSales").withColumnRenamed(
        "avg(NetSales)", "totalMonthlyNetSale")
    ## get the average gross-sales of each product
    month_avg_GrossSale = month_df.avg("GrossSales").withColumnRenamed(
        "avg(GrossSales)", "totalMonthlyGrossSale")
    month_avg_COGS = month_df.avg("COGS").withColumnRenamed(
        "avg(COGS)", "avgCOGS")
    month_avg_QtySold = month_df.avg("QtySold").withColumnRenamed(
        "avg(QtySold)", "totalMonthlyQtySold")
    month_avg_Price = month_df.avg("Price").withColumnRenamed(
        "avg(Price)", "Price")
    month_avg_SM = month_df.avg("SellMargin").withColumnRenamed(
        "avg(SellMargin)", "SellMargin")
    month_avg_FM = month_df.avg("FrontMargin").withColumnRenamed(
        "avg(FrontMargin)", "avgFrontMargin")

    month_merge = month_avg_NetSale.join(
        month_avg_GrossSale,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_COGS,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_QtySold,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_Price,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_SM,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_FM,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")

    return month_merge
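The six joins above can be collapsed into a single aggregation; here is a hedged sketch of an equivalent one-pass version with the same output column names (assuming the pyspark.sql.functions imports used elsewhere in these examples).
month_merge = (df
    .select('MatID', month('Date').alias('month'), 'GrossSales', 'NetSales',
            'COGS', 'QtySold', 'Price', 'SellMargin', 'FrontMargin',
            'SubCategory', 'Vendor')
    .groupBy('month', 'MatID', 'SubCategory', 'Vendor')
    # one agg call replaces the six per-metric DataFrames and their joins
    .agg(avg('NetSales').alias('totalMonthlyNetSale'),
         avg('GrossSales').alias('totalMonthlyGrossSale'),
         avg('COGS').alias('avgCOGS'),
         avg('QtySold').alias('totalMonthlyQtySold'),
         avg('Price').alias('Price'),
         avg('SellMargin').alias('SellMargin'),
         avg('FrontMargin').alias('avgFrontMargin')))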
Example #20
def clean_dist_df(
    dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # filter data
    dist_df = dist_df.select("Name", "Facings", "Capacity", 'Days Supply',
                             'Classification', 'Mat ID', '# POGs')

    ### Rename column
    dist_df = dist_df.withColumnRenamed("Name", "SKU")
    dist_df = dist_df.withColumnRenamed("Days Supply", "DaysSupply")
    dist_df = dist_df.withColumnRenamed("Mat ID", "MatID")
    dist_df = dist_df.withColumnRenamed("# POGs", "POGS")

    # Cast columns to numeric types (`float` / `integer`)
    dist_df = dist_df.withColumn("Facings", dist_df.Facings.cast('float'))
    dist_df = dist_df.withColumn("Capacity", dist_df.Capacity.cast('float'))
    dist_df = dist_df.withColumn("DaysSupply",
                                 dist_df.DaysSupply.cast('float'))
    dist_df = dist_df.withColumn("MatID", dist_df.MatID.cast('integer'))
    dist_df = dist_df.withColumn("POGS", dist_df.POGS.cast('integer'))
    return dist_df
Example #21
def find_Incorrect_record_items(
        month_merge: pyspark.sql.dataframe.DataFrame,
        output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    Items with an extremely high capacity/facings ratio (ratio > 6)
    """
    Incorrect_record_items = month_merge.filter(
        col('Capacity') / col('Facings') > 6)
    Incorrect_record_items = Incorrect_record_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Incorrect_record_items
Example #22
def find_check_item(month_merge: pyspark.sql.dataframe.DataFrame,
                    dist_df: pyspark.sql.dataframe.DataFrame,
                    output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    checked item:
       The items appear in the distribution report but have no sales from Apr-Sep (or Sep-Nov)
    """
    check_df = dist_df.join(month_merge, on=["MatID"], how="left").fillna(
        0, subset=['totalMonthlyGrossSale'])
    check_item = check_df.filter(check_df.totalMonthlyGrossSale == 0)
    check_item = check_item.select(output_columns)
    return check_item
Example #23
def Group_and_save_atLeastOneMonth_SKU(
        unchanged_SKU: pyspark.sql.dataframe.DataFrame,
        changed_SKU: pyspark.sql.dataframe.DataFrame):
    """
    Separate the unadjusted SKUs into three sheets of the same Excel file:
        Capacity_to_avg_qty < 3,
        3 <= Capacity_to_avg_qty < 9,
        Capacity_to_avg_qty >= 9
    """
    # Separate SKU and save to excel files.
    changed_SKU.toPandas().to_csv(
        '../data/Output/atLeastOneMonth/adjusted_SKU.csv',
        index=False,
        encoding='utf-8')
    print(
        "Save adjusted SKU(atLeastOneMonth) to Output/atLeastOneMonth/adjusted_SKU.csv"
    )

    unchanged_SKU = unchanged_SKU.toPandas()
    unchanged_SKU1 = unchanged_SKU.query('Capacity_to_avg_qty<3')
    unchanged_SKU2 = unchanged_SKU.query(
        'Capacity_to_avg_qty<9 and Capacity_to_avg_qty>=3')
    unchanged_SKU3 = unchanged_SKU.query('Capacity_to_avg_qty>=9')
    writer = ExcelWriter('../data/Output/atLeastOneMonth/unadjusted_SKU.xlsx')
    unchanged_SKU1.to_excel(writer, 'lessThan3', index=False)
    unchanged_SKU2.to_excel(writer, 'between3And9', index=False)
    unchanged_SKU3.to_excel(writer, 'moreThan9', index=False)
    writer.save()
    print(
        "Save unadjusted SKU(atLeastOneMonth) to Output/atLeastOneMonth/unadjusted_SKU.xlsx"
    )
Example #24
def find_Depth2_items(month_merge: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    Same as Capacity < Facings * 2. These are problem items, for example:
       1. Capacity = Facings = 1, incorrect
       2. Facings = 3, Capacity = 5, incorrect
       3. Capacity should be greater than Facings.
    """
    Depth2_items = month_merge.filter(col('Capacity') / col('Facings') < 2)
    Depth2_items = Depth2_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Depth2_items
Example #25
def calculate_mean_std_and_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the mean, std and geometric mean of qtySold and netSale for each subcategory and each month
    """
    df_group = df.groupby('month', 'SubCategory')
    df = calculate_geometric_mean(df)
    df_group_sum = df_group.avg('totalMonthlyQtySold', 'totalMonthlyNetSale')\
    .withColumnRenamed('avg(totalMonthlyQtySold)', "Qty_mean_by_month_Subcat")\
    .withColumnRenamed('avg(totalMonthlyNetSale)', "NS_mean_by_month_Subcat")

    df_group_std = df_group.agg(stddev('totalMonthlyQtySold'))\
    .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_month_Subcat")

    df_group_std2 = df_group.agg(stddev('totalMonthlyNetSale'))\
    .withColumnRenamed('stddev_samp(totalMonthlyNetSale)', "NS_std_by_month_Subcat")

    # join to get final dataset
    df = df.join(df_group_sum, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std2, on=['month', 'SubCategory'], how='inner')
    return df
Example #26
def find_new_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                  dist_df: pyspark.sql.dataframe.DataFrame,
                  output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    new item: 
        The items are not in the distribution report, but have sales history from July-Sep (or Sep-Nov)
    """
    New_df = dist_df.join(month_merge_late, on=["MatID"],
                          how="right").fillna(0,
                                              subset=['totalMonthlyGrossSale'])
    New_item = New_df.filter(
        New_df.totalMonthlyGrossSale != 0)  # new item is sold during July- Sep
    New_item = New_item.filter(col(
        "Classification").isNull())  # new item has no classification records
    New_item = New_item.select(output_columns)
    return New_item
Example #27
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    # build a temporary path name unique to this process and host
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    # cast date columns to timestamps so pandas can read the parquet files
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    # write to HDFS, pull the files to the local working directory, then clean up
    sdf.write.mode('overwrite').parquet(name)
    sh.hdfs('dfs', '-get', '{}'.format(name), '{}'.format(os.getcwd()))
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
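For frames that comfortably fit in driver memory, a simpler route is Arrow-backed toPandas; a sketch follows (the config key shown is the Spark 3.x name, Spark 2.x uses spark.sql.execution.arrow.enabled).
# enable Arrow so toPandas() transfers columns efficiently, then collect to the driver
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
pdf = sdf.toPandas()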
Example #28
def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame, total_count: int, split_columns: Optional[List[str]], target_column_name: str, weighted: bool = True
) -> float:
    """Entropy calculation across many ."""
    split_columns_plus_target = split_columns[:]
    split_columns_plus_target.append(target_column_name)
    groupdf = countdf.groupby(split_columns_plus_target).agg(F.sum("count").alias("group_count"))

    w = Window.partitionBy(split_columns)
    groupdf = groupdf.withColumn("p", F.col("group_count") / F.sum(groupdf["group_count"]).over(w)).withColumn(
        "weight", F.sum(groupdf["group_count"] / total_count).over(w)
    )

    entropydf = groupdf.groupby(split_columns).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"), (F.sum(F.col("group_count") / total_count)).alias("weight")
    )

    if weighted:
        result = entropydf.groupby().agg(F.sum(F.col("entropy") * F.col("weight"))).collect()[0][0]
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]

    return result
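A minimal check sketch (assuming an active SparkSession spark, and mirroring how _find_best_split passes total_count as the row count of the pre-aggregated count table): one pure group and one 50/50 group.
rows = [("g1", "x"), ("g1", "x"), ("g2", "x"), ("g2", "y")]
countdf = spark.createDataFrame(rows, ["group", "label"]) \
    .groupby("group", "label").count().cache()
print(_weighted_entropy(countdf, countdf.count(), ["group"], "label", weighted=True))
# the pure group contributes 0 bits; the mixed group contributes 1 bit times its weight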
Example #29
def clean_dataset(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:

    ## Select the target features
    df = df.select('Index', 'Date Detail', 'Company', 'Business Unit',
                   'Concept_NEW', 'Product Category',
                   'Company and Cost Centre', 'SKU', 'POS Net Sales',
                   'Rank Total')

    ## Rename columns
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Date Detail", "Date")
    df = df.withColumnRenamed("Product Category", "Category")
    df = df.withColumnRenamed("Company and Cost Centre", "Store")
    df = df.withColumnRenamed("Business Unit", "BusinessUnit")
    df = df.withColumnRenamed("Rank Total", "rank")

    ## Column type cast
    columns = ['NetSales', 'rank']
    df = convertColumn(df, columns, FloatType())
    # Replace none to 0
    df = df.na.fill(0)
    return df
Example #30
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment the
    data for profiling, using entropy-based methods.

    If no target column is provided, the maximum-entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
    default 30
    :param include_columns: additional non-string columns to consider in automatic segmentation. Warning: high cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {col for col in df.columns if (df.select(col).dtypes[0][1] == "string" or col in include_columns)}

    print("Gathering cardinality information...")
    n_uniques = {col: df.agg(F.approx_count_distinct(col)).collect()[0][0] for col in valid_column_names}
    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]

    print(f"Using {target_field} column as target field.")
    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)

    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {col for col in valid_column_names if (col not in segments and n_uniques[col] * segments_used <= (max_segments - segments_used))}
        _, segment_column_name = _find_best_split(
            countdf, current_split_columns, list(valid_column_names), target_column_name=target_field, normalization=n_uniques
        )

        if not segment_column_name:
            break

        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
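A usage sketch (assuming an active SparkSession spark and that the helpers above, _simple_entropy, _weighted_entropy and _find_best_split, are in scope):
data = [("US", "mobile", "clicked"), ("US", "web", "ignored"),
        ("DE", "mobile", "clicked"), ("DE", "web", "clicked")]
df = spark.createDataFrame(data, ["country", "channel", "outcome"])
segments = estimate_segments(df, target_field="outcome", max_segments=10)
print(segments)  # e.g. ['channel', 'country'], depending on the entropy scores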