def _simple_entropy(df: pyspark.sql.dataframe.DataFrame, column_name: str) -> float:
    count = df.count()
    testdf = df.select(column_name).groupby(column_name).agg(
        (F.count(column_name) / count).alias("p"))
    result = testdf.groupby().agg(
        -F.sum(F.col("p") * F.log2("p"))).collect()[0][0]
    if not result:
        return 0.0
    return result
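# Hedged usage sketch (not part of the original module): _simple_entropy computes the
# Shannon entropy H(X) = -sum_x p(x) * log2(p(x)) of a single column, where p(x) is the
# empirical frequency of each distinct value. Assuming an active SparkSession `spark`
# and a hypothetical "color" column:
#
#   df = spark.createDataFrame([("red",), ("red",), ("blue",), ("green",)], ["color"])
#   _simple_entropy(df, "color")  # 1.5 bits for frequencies {1/2, 1/4, 1/4}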
def transform_spark_df(self,
                       sdf: pyspark.sql.dataframe.DataFrame,
                       path_to_write: str,
                       parquet_write_mode: str = 'append',
                       repartition_val: int = 22) -> None:
    for categ_col in [x for x in sdf.columns if x in self.features_to_transform]:
        # the trailing `or True` forces the transform function to be regenerated on every call
        if f'transform_{categ_col}' not in globals() or True:
            tr_func = self.transform_func.format(
                transformer_path=self.transformer_path,
                col_name=categ_col,
                own_module=self.own_module)
            exec(f'global transform_{categ_col}\n{tr_func}')
        sdf = eval(f'''sdf.withColumn(
            '{categ_col}',
            transform_{categ_col}(*[F.lower(F.regexp_replace(F.col('{categ_col}'), ' ', ''))]))''')
    sdf = sdf.cache()
    sdf.repartition(repartition_val).write \
        .mode(parquet_write_mode) \
        .option('compression', 'none') \
        .parquet(path_to_write)
def clean_and_add_date(
        df: pyspark.sql.dataframe.DataFrame, date_generated: list,
        spark: pyspark.sql.session.SparkSession
) -> pyspark.sql.dataframe.DataFrame:
    """
    Add rows so that each item in each store has records for the full month
    (if both stock and sales are 0, the raw data can miss the relevant row)
    """
    # Create a DataFrame of dates, from the first day of the dataset to the last day
    date_df = spark.createDataFrame(date_generated, DateType())
    date_df = date_df.withColumnRenamed("value", "Date")

    # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("dfView")

    # Get a temporary table with the distinct combinations of SKU and Store
    ##sqlDF = spark.sql("SELECT SKU, Store FROM dfView GROUP BY SKU, Store")  # same result
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Cross join the two datasets to create the full schema
    schema = sqlDF.crossJoin(date_df)
    #assert schema.count() == sqlDF.count() * len(date_generated)  # check cross join result
    #assert schema.count() >= df.count(), 'We want ' + str(df.count()) + \
    #    ' rows. But we get ' + str(schema.count())  # we need to add rows

    # Right-join the original dataset onto the new schema so every (Date, Store, SKU)
    # combination is kept
    df = df.join(schema, on=['Date', 'Store', 'SKU'], how='right')
    #assert df.count() == count  # test on overall dataset
    return df
def calculate_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the geometric mean of qtySold and netSale per month and SubCategory,
    adding the columns `Qty_GeoMean_by_month_Subcat` and `NS_GeoMean_by_month_Subcat`
    """
    df_geometric_mean = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyQtySold')))))
    df_geometric_mean = df_geometric_mean.withColumnRenamed(
        'EXP(avg(LOG(totalMonthlyQtySold)))', 'Qty_GeoMean_by_month_Subcat')

    df_geometric_mean2 = df.groupBy('month', 'SubCategory').agg(
        exp(avg(log(col('totalMonthlyNetSale')))))
    df_geometric_mean2 = df_geometric_mean2.withColumnRenamed(
        'EXP(avg(LOG(totalMonthlyNetSale)))', 'NS_GeoMean_by_month_Subcat')

    # join the new columns to the original dataset
    df_new = df.join(df_geometric_mean, on=['month', 'SubCategory'], how='inner')
    df_new = df_new.join(df_geometric_mean2, on=['month', 'SubCategory'], how='inner')
    #assert df.count() == df_new.count()
    return df_new
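# The aggregation above relies on the identity geometric_mean(x) = exp(mean(log(x))).
# A hedged numeric check with hypothetical values (not from the dataset):
#
#   import math
#   values = [2.0, 8.0]
#   math.exp(sum(math.log(v) for v in values) / len(values))  # 4.0 == sqrt(2 * 8)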
def find_and_analysis_atLeastOneMonth_SKU(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    For SKUs with "soldQty < capacity" in at least one month,
    1. Calculate the average of REAL NS and the standard deviation of avg NS;
       REAL NS means that if an item has 0 sales in one month, the average is
       computed over the remaining 2 months only
    2. Calculate Capacity_to_avg_qty and Facing_to_avg_qty
    Output: 4 datasets:
        1. df_atLeastOneMonth: full dataset
        2. unchanged_SKU: SKUs with unchanged Depth
        3. changed_SKU: SKUs with changed Depth
        4. df_full: the combination of unchanged_SKU and changed_SKU
    """
    ## Find at-least-one-month SKUs
    df = df.withColumn(
        'qty_less_than_capacity',
        when((col("totalMonthlyQtySold") < col('Capacity')), 1).otherwise(0))
    df_atLeastOneMonth = df.filter(
        df.qty_less_than_capacity == 1)  # SKUs with qtySold < Capacity in at least one month

    ## Calculate the average of REAL NS
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')  # group by each SKU

    ## get the average quantity sold of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold'))\
        .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join datasets
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           (col('Capacity') / col("AvgQtySold")))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           (col('Facings') / col("AvgQtySold")))
    # Calculate the ratio of the std of qty sold to the average qty sold per SKU
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           (col('Qty_std_by_SKU') / col("AvgQtySold")))
    # if there is no standard deviation, this SKU was sold in only one month

    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()

    # separate SKUs into 2 groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)
    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
def get_store_item_concept_list(df: pyspark.sql.dataframe.DataFrame,
                                spark) -> list:
    """
    Get the list of distinct combinations of SKU, Store and Concept_NEW
    """
    # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("dfView")
    # Query and create a new dataframe
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store, Concept_NEW FROM dfView")
    store_item_list = sqlDF.rdd.map(tuple).collect()
    return store_item_list
def calculate_Capacity_to_sales(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    1. Capacity / Qty sold
    2. Capacity / NetSales
    """
    df = df.withColumn("Capacity_to_qty", (df.Capacity / df.totalMonthlyQtySold))
    df = df.withColumn("Capacity_to_sales", (df.Capacity / df.totalMonthlyNetSale))
    return df
def find_and_analysis_fullMonth_SKU(
        df_atLeastOneMonth: pyspark.sql.dataframe.DataFrame, split_month: int,
        spark) -> pyspark.sql.dataframe.DataFrame:
    """
    Find SKUs with "soldQty < capacity" in every month
    """
    full_month_items = select_full_month_item(
        df_atLeastOneMonth.toPandas(),
        month_list=[split_month, split_month + 1,
                    split_month + 2])  # three months of data starting from split_month
    full_month_SKU_info = get_full_month_SKU_info(
        full_month_items,
        df_atLeastOneMonth.select(selected_column_fullMonth),
        spark).dropDuplicates()
    return full_month_SKU_info
def calculate_Depths(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Depth = Capacity / Facings
    ProposedDepth = 3 if Depth >= 4, otherwise empty
    VarianceDepth = ProposedDepth - Depth (should be negative)
    """
    df = df.withColumn("Depth", (df.Capacity / df.Facings))
    df = df.withColumn("ProposedDepth", when(col('Depth') >= 4, 3).otherwise(''))
    df = df.withColumn(
        "VarianceDepth",
        when(col('Depth') >= 4, (df.ProposedDepth - df.Depth)).otherwise(''))
    return df
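# Worked example for the rules above (hypothetical numbers): with Capacity = 12 and
# Facings = 3, Depth = 12 / 3 = 4, so ProposedDepth = 3 and VarianceDepth = 3 - 4 = -1.
# With Capacity = 6 and Facings = 3, Depth = 2 < 4, so both columns stay empty.
# Note that because `.otherwise('')` mixes a number with a string, Spark will
# typically resolve these two columns to string type.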
def get_concept_list(df: pyspark.sql.dataframe.DataFrame) -> list:
    # Query and create a new dataframe
    concept_list = [
        row.Concept_NEW
        for row in df.select("Concept_NEW").distinct().collect()
    ]
    return concept_list
def get_repartition_value(sdf: pyspark.sql.dataframe.DataFrame,
                          target_size: int = 245,
                          compression: str = 'none') -> int:
    length = sdf.count()
    df_sample = sdf.limit(int(1e4))  # 10,000-row sample used to estimate bytes per row
    tmp_file_name = 'test_file'
    while check_hdfs_file_ex(tmp_file_name):
        tmp_file_name += '_'
    df_sample.coalesce(1).write.option('compression', compression)\
        .mode('overwrite').parquet(tmp_file_name)
    row_byte_weight = int(sh.hdfs('dfs', '-du', tmp_file_name)\
        .stdout.decode('utf-8').split('\n')[-2].split(' ')[0])
    sh.hdfs('dfs', '-rm', '-R', '-skipTrash', tmp_file_name)
    nd_rep_val = int(row_byte_weight * length / target_size / (1024 * 1024) / 1e4)
    return 1 if nd_rep_val < 1 else nd_rep_val
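# The formula above estimates how many partitions are needed so that each output
# parquet file is roughly `target_size` MB:
#   partitions = (bytes_per_row * row_count) / (target_size * 1024 * 1024)
# where bytes_per_row = row_byte_weight / 1e4 (the sample size). A hedged example with
# hypothetical numbers: a 10,000-row sample weighing 5 MB (~524 bytes/row) and a
# 100-million-row table give ~50,000 MB of data, i.e. about 204 partitions at 245 MB each.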
def _find_best_split(
    countdf: pyspark.sql.dataframe.DataFrame,
    prev_split_columns: List[str],
    valid_column_names: List[str],
    target_column_name: str,
    normalization: Optional[Dict[str, int]] = None,
) -> Tuple[float, str]:
    total_count = countdf.count()
    max_score_tuple = 0.0, None
    pre_split_entropy = _weighted_entropy(countdf, total_count,
                                          prev_split_columns,
                                          target_column_name, True)

    for column_name in valid_column_names:
        if column_name == target_column_name:
            continue
        new_split_columns = prev_split_columns[:]
        new_split_columns.append(column_name)
        post_split_entropy = _weighted_entropy(countdf, total_count,
                                               new_split_columns,
                                               target_column_name, True)
        value = pre_split_entropy - post_split_entropy
        if normalization and normalization[column_name] > 0:
            value /= math.log(normalization[column_name])
        if value > max_score_tuple[0]:
            max_score_tuple = value, column_name

    return max_score_tuple
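# The score above is the information gain of splitting on `column_name`
# (pre-split entropy minus weighted post-split entropy), optionally divided by
# log(cardinality) to penalise high-cardinality columns, similar to a gain ratio.
# Hedged arithmetic example with made-up entropies: pre-split entropy 1.0,
# post-split entropy 0.25, and 4 distinct values in the candidate column:
#   value = (1.0 - 0.25) / math.log(4) ~= 0.75 / 1.386 ~= 0.54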
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: object,
                  newType: object) -> object:
    """
    Convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: list,
                  newType) -> pyspark.sql.dataframe.DataFrame:
    """
    A custom function to convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
def find_removed_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                      month_merge_early: pyspark.sql.dataframe.DataFrame,
                      dist_df: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    removed item: the items are in the distribution report but have no sales
    from July-Sep (or Sep-Nov)
    """
    Removed_df = dist_df.join(month_merge_late, on=["MatID"], how="left").fillna(
        0, subset=['totalMonthlyGrossSale'])
    Removed_df = dist_df.join(month_merge_early, on=["MatID"], how="inner").fillna(
        0, subset=['totalMonthlyGrossSale'])
    Removed_item = Removed_df.filter(Removed_df.totalMonthlyGrossSale == 0)
    Removed_item = Removed_item.select(output_columns)
    return Removed_item
def remove_no_stock_item(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    hassale_item = df.groupBy("SKU", "Store").agg({
        "StockQty": "sum"
    }).filter(col('sum(StockQty)') != 0).drop('sum(StockQty)')
    new_df = hassale_item.join(df, on=["SKU", "Store"], how='inner')
    return new_df
def create_list_dates(df: pyspark.sql.dataframe.DataFrame) -> list:
    """
    Create a list of dates, starting from the first day of the dataset
    and ending with the last day of the dataset
    :param df: dataframe
    :return: a list of dates
    """
    end = df.agg({"Date": "max"}).collect()[0][0] + timedelta(days=1)
    start = df.agg({"Date": "min"}).collect()[0][0]
    date_generated = [
        start + timedelta(days=x) for x in range(0, (end - start).days)
    ]
    # Test the output
    #test_list_dates(date_generated, end, start)
    return date_generated
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame,
                     cols: list,
                     result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all arrays
    into one `ArrayType[StructType]`

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns: `df.withColumn(result_name, zip(explode(cols)))`
    """
    df = df.withColumn(result_name, f.explode(f.arrays_zip(*cols)))

    if rename_fields:
        # create the schema of the new struct by simply renaming the top-level struct fields
        old_schema: t.StructType = df.schema[result_name].dataType

        # rename a field if it is in `old_schema.fieldNames()`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]
        new_schema = t.StructType([
            t.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, f.col(result_name).cast(new_schema))

    # # old method using withColumn and a new struct; breaks with PySpark 3.0
    # df = df.withColumn(target_struct, f.struct(*[
    #     f.col(target_struct + "." + actualName).alias(targetName)
    #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
    # ]))

    return df
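# Hedged usage sketch for zip_explode_cols (assumes an active SparkSession `spark` and
# the `f`/`t` aliases for pyspark.sql.functions/types used above; column names are
# hypothetical):
#
#   df = spark.createDataFrame([([1, 2], ["a", "b"])], ["ids", "labels"])
#   out = zip_explode_cols(df, cols=["ids", "labels"], result_name="pair",
#                          rename_fields={"ids": "id", "labels": "label"})
#   out.select("pair.id", "pair.label").collect()  # rows (1, "a") and (2, "b")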
def merge_dataset(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # Generate sales information for each product in each month
    month_df = df.select('MatID', "SKU",
                         year("Date").alias('year'),
                         month("Date").alias('month'), 'GrossSales', 'NetSales',
                         'COGS', 'QtySold', 'Price', 'SellMargin', 'FrontMargin',
                         'SubCategory', 'Vendor').groupBy("month", 'MatID',
                                                          "SubCategory", 'Vendor')

    ## get the average net-sales of each product
    month_avg_NetSale = month_df.avg("NetSales").withColumnRenamed(
        "avg(NetSales)", "totalMonthlyNetSale")
    ## get the average gross-sales of each product
    month_avg_GrossSale = month_df.avg("GrossSales").withColumnRenamed(
        "avg(GrossSales)", "totalMonthlyGrossSale")
    month_avg_COGS = month_df.avg("COGS").withColumnRenamed(
        "avg(COGS)", "avgCOGS")
    month_avg_QtySold = month_df.avg("QtySold").withColumnRenamed(
        "avg(QtySold)", "totalMonthlyQtySold")
    month_avg_Price = month_df.avg("Price").withColumnRenamed(
        "avg(Price)", "Price")
    month_avg_SM = month_df.avg("SellMargin").withColumnRenamed(
        "avg(SellMargin)", "SellMargin")
    month_avg_FM = month_df.avg("FrontMargin").withColumnRenamed(
        "avg(FrontMargin)", "avgFrontMargin")

    month_merge = month_avg_NetSale.join(
        month_avg_GrossSale,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_COGS,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_QtySold,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_Price,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_SM,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    month_merge = month_merge.join(
        month_avg_FM,
        on=["MatID", 'month', 'SubCategory', 'Vendor'],
        how="inner")
    return month_merge
def clean_dist_df(
        dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # filter data
    dist_df = dist_df.select("Name", "Facings", "Capacity", 'Days Supply',
                             'Classification', 'Mat ID', '# POGs')
    ### Rename columns
    dist_df = dist_df.withColumnRenamed("Name", "SKU")
    dist_df = dist_df.withColumnRenamed("Days Supply", "DaysSupply")
    dist_df = dist_df.withColumnRenamed("Mat ID", "MatID")
    dist_df = dist_df.withColumnRenamed("# POGs", "POGS")
    # Convert columns to numeric types
    dist_df = dist_df.withColumn("Facings", dist_df.Facings.cast('float'))
    dist_df = dist_df.withColumn("Capacity", dist_df.Capacity.cast('float'))
    dist_df = dist_df.withColumn("DaysSupply", dist_df.DaysSupply.cast('float'))
    dist_df = dist_df.withColumn("MatID", dist_df.MatID.cast('integer'))
    dist_df = dist_df.withColumn("POGS", dist_df.POGS.cast('integer'))
    return dist_df
def find_Incorrect_record_items(
        month_merge: pyspark.sql.dataframe.DataFrame,
        output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    The items with an extremely high capacity/facing ratio (ratio > 6)
    """
    Incorrect_record_items = month_merge.filter(
        col('Capacity') / col('Facings') > 6)
    Incorrect_record_items = Incorrect_record_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Incorrect_record_items
def find_check_item(month_merge: pyspark.sql.dataframe.DataFrame,
                    dist_df: pyspark.sql.dataframe.DataFrame,
                    output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    checked item: the items are in the distribution report but have no sales
    from Apr-Sep (or Sep-Nov)
    """
    check_df = dist_df.join(month_merge, on=["MatID"], how="left").fillna(
        0, subset=['totalMonthlyGrossSale'])
    check_item = check_df.filter(check_df.totalMonthlyGrossSale == 0)
    check_item = check_item.select(output_columns)
    return check_item
def Group_and_save_atLeastOneMonth_SKU(
        unchanged_SKU: pyspark.sql.dataframe.DataFrame,
        changed_SKU: pyspark.sql.dataframe.DataFrame):
    """
    Separate unadjusted SKUs into three sheets within the same Excel file:
    Capacity_to_avg_qty < 3, 3 <= Capacity_to_avg_qty < 9, and Capacity_to_avg_qty >= 9
    """
    # Separate SKUs and save to files.
    changed_SKU.toPandas().to_csv(
        '../data/Output/atLeastOneMonth/adjusted_SKU.csv',
        index=False,
        encoding='utf-8')
    print(
        "Save adjusted SKU(atLeastOneMonth) to Output/atLeastOneMonth/adjusted_SKU.csv"
    )

    unchanged_SKU = unchanged_SKU.toPandas()
    unchanged_SKU1 = unchanged_SKU.query('Capacity_to_avg_qty<3')
    unchanged_SKU2 = unchanged_SKU.query(
        'Capacity_to_avg_qty<9 and Capacity_to_avg_qty>=3')
    unchanged_SKU3 = unchanged_SKU.query('Capacity_to_avg_qty>=9')

    writer = ExcelWriter('../data/Output/atLeastOneMonth/unadjusted_SKU.xlsx')
    unchanged_SKU1.to_excel(writer, 'lessThan3', index=False)
    unchanged_SKU2.to_excel(writer, 'between3And9', index=False)
    unchanged_SKU3.to_excel(writer, 'moreThan9', index=False)
    writer.save()
    print(
        "Save unadjusted SKU(atLeastOneMonth) to Output/atLeastOneMonth/unadjusted_SKU.xlsx"
    )
def find_Depth2_items(month_merge: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    Depth < 2 is the same as Capacity < Facings * 2. These are problem items,
    for example:
    1. Capacity = Facings = 1, incorrect
    2. Facings = 3, Capacity = 5, incorrect
    3. Capacity should be > Facings
    """
    Depth2_items = month_merge.filter(col('Capacity') / col('Facings') < 2)
    Depth2_items = Depth2_items.withColumn(
        "Depth",
        col('Capacity') / col('Facings')).select(output_columns)
    return Depth2_items
def calculate_mean_std_and_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Calculate the mean, std and geometric mean of qtySold and netSale
    for each subcategory and each month
    """
    df_group = df.groupby('month', 'SubCategory')
    df = calculate_geometric_mean(df)

    df_group_sum = df_group.avg('totalMonthlyQtySold', 'totalMonthlyNetSale')\
        .withColumnRenamed('avg(totalMonthlyQtySold)', "Qty_mean_by_month_Subcat")\
        .withColumnRenamed('avg(totalMonthlyNetSale)', "NS_mean_by_month_Subcat")
    df_group_std = df_group.agg(stddev('totalMonthlyQtySold'))\
        .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_month_Subcat")
    df_group_std2 = df_group.agg(stddev('totalMonthlyNetSale'))\
        .withColumnRenamed('stddev_samp(totalMonthlyNetSale)', "NS_std_by_month_Subcat")

    # join to get the final dataset
    df = df.join(df_group_sum, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std, on=['month', 'SubCategory'], how='inner')
    df = df.join(df_group_std2, on=['month', 'SubCategory'], how='inner')
    return df
def find_new_item(month_merge_late: pyspark.sql.dataframe.DataFrame,
                  dist_df: pyspark.sql.dataframe.DataFrame,
                  output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    new item: the items are not in the distribution report but have a sales
    history from July-Sep (or Sep-Nov)
    """
    New_df = dist_df.join(month_merge_late, on=["MatID"],
                          how="right").fillna(0, subset=['totalMonthlyGrossSale'])
    New_item = New_df.filter(
        New_df.totalMonthlyGrossSale != 0)  # a new item is sold during July-Sep
    New_item = New_item.filter(col(
        "Classification").isNull())  # a new item has no classification record
    New_item = New_item.select(output_columns)
    return New_item
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    sdf.write.mode('overwrite').parquet(name)
    sh.hdfs('dfs', '-get', '{}'.format(name), '{}'.format(os.getcwd()))
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame,
    total_count: int,
    split_columns: Optional[List[str]],
    target_column_name: str,
    weighted: bool = True
) -> float:
    """Entropy of the target column across many groups, optionally weighted by group size."""
    split_columns_plus_target = split_columns[:]
    split_columns_plus_target.append(target_column_name)
    groupdf = countdf.groupby(split_columns_plus_target).agg(
        F.sum("count").alias("group_count"))
    w = Window.partitionBy(split_columns)
    groupdf = groupdf.withColumn(
        "p", F.col("group_count") / F.sum(groupdf["group_count"]).over(w)
    ).withColumn("weight", F.sum(groupdf["group_count"] / total_count).over(w))

    entropydf = groupdf.groupby(split_columns).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"),
        (F.sum(F.col("group_count") / total_count)).alias("weight"))

    if weighted:
        result = entropydf.groupby().agg(
            F.sum(F.col("entropy") * F.col("weight"))).collect()[0][0]
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]
    return result
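# In formula form, with groups g defined by `split_columns` and N = total_count,
# the weighted variant computes the conditional entropy
#   H(target | split) = sum_g (|g| / N) * H(target within g),
# where each per-group entropy is -sum_v p(v|g) * log2(p(v|g)). The unweighted
# variant simply sums the per-group entropies.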
def clean_dataset(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    ## Select the target features
    df = df.select('Index', 'Date Detail', 'Company', 'Business Unit',
                   'Concept_NEW', 'Product Category', 'Company and Cost Centre',
                   'SKU', 'POS Net Sales', 'Rank Total')
    ## Rename columns
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Date Detail", "Date")
    df = df.withColumnRenamed("Product Category", "Category")
    df = df.withColumnRenamed("Company and Cost Centre", "Store")
    df = df.withColumnRenamed("Business Unit", "BusinessUnit")
    df = df.withColumnRenamed("Rank Total", "rank")
    ## Column type cast
    columns = ['NetSales', 'rank']
    df = convertColumn(df, columns, FloatType())
    # Replace none with 0
    df = df.na.fill(0)
    return df
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment data
    profiling using entropy-based methods.

    If no target column is provided, the maximum-entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments, default 30
    :param include_columns: additional non-string columns to consider in automatic segmentation.
        Warning: high-cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage
        of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage
        of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()
    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {
        col for col in df.columns
        if (df.select(col).dtypes[0][1] == "string" or col in include_columns)
    }

    print("Gathering cardinality information...")
    n_uniques = {
        col: df.agg(F.approx_count_distinct(col)).collect()[0][0]
        for col in valid_column_names
    }

    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]
        print(f"Using {target_field} column as target field.")

    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)
    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {
            col for col in valid_column_names
            if (col not in segments
                and n_uniques[col] * segments_used <= (max_segments - segments_used))
        }

        _, segment_column_name = _find_best_split(
            countdf,
            current_split_columns,
            list(valid_column_names),
            target_column_name=target_field,
            normalization=n_uniques,
        )

        if not segment_column_name:
            break

        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
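# Hedged usage sketch for estimate_segments (hypothetical DataFrame and column names,
# not from the original code; assumes an active SparkSession and an `events`
# DataFrame with mostly string-typed columns):
#
#   segment_columns = estimate_segments(events, target_field="outcome", max_segments=30)
#   # e.g. ["country", "device_type"] -- the entropy-based choice of segmentation columns
#   events.groupby(segment_columns).count().show()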