import math
from datetime import timedelta
from typing import Dict, List, Optional, Tuple, Union

import pyspark
from pyspark.sql import functions as F


def create_list_dates(df: pyspark.sql.dataframe.DataFrame) -> list:
    """
    Create a list of dates, starting from the first day of the dataset
    and ending with the last day of the dataset.

    :param df: dataframe with a "Date" column
    :return: a list of consecutive dates covering the dataset's date range
    """
    # Add one day to the max so the final day is included in the range below.
    end = df.agg({"Date": "max"}).collect()[0][0] + timedelta(days=1)
    start = df.agg({"Date": "min"}).collect()[0][0]
    date_generated = [
        start + timedelta(days=x) for x in range(0, (end - start).days)
    ]
    # Test the output
    # test_list_dates(date_generated, end, start)
    return date_generated
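
# Hypothetical usage sketch (not part of the original module): shows how
# create_list_dates expands a dataframe's "Date" column into every day in
# its range. Assumes an active SparkSession is passed in; the function name
# and sample data below are invented for illustration.
def _demo_create_list_dates(spark: pyspark.sql.SparkSession) -> list:
    from datetime import date

    sample = spark.createDataFrame(
        [(date(2021, 1, 1),), (date(2021, 1, 5),)], ["Date"]
    )
    # Yields [date(2021, 1, 1), ..., date(2021, 1, 5)]: five consecutive days,
    # since the max date is included via the +1 day offset inside the function.
    return create_list_dates(sample)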
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: Optional[str] = None,
    max_segments: int = 30,
    include_columns: Optional[List[str]] = None,
    unique_perc_bounds: Tuple[Optional[float], Optional[float]] = (None, 0.8),
    null_perc_bounds: Tuple[Optional[float], Optional[float]] = (None, 0.2),
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment data
    profiling using entropy-based methods.

    If no target column is provided, the maximum-entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments, default 30
    :param include_columns: additional non-string columns to consider in automatic
        segmentation. Warning: high-cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form (lower, upper) with bounds on the
        percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form (lower, upper) with bounds on the
        percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    include_columns = include_columns or []
    # Replace open bounds with infinities, copying into local lists so the
    # default arguments are never mutated across calls.
    unique_perc_bounds = [
        unique_perc_bounds[0] if unique_perc_bounds[0] is not None else float("-inf"),
        unique_perc_bounds[1] if unique_perc_bounds[1] is not None else float("inf"),
    ]
    null_perc_bounds = [
        null_perc_bounds[0] if null_perc_bounds[0] is not None else float("-inf"),
        null_perc_bounds[1] if null_perc_bounds[1] is not None else float("inf"),
    ]

    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {
        col
        for col in df.columns
        if df.select(col).dtypes[0][1] == "string" or col in include_columns
    }

    print("Gathering cardinality information...")
    n_uniques = {
        col: df.agg(F.approx_count_distinct(col)).collect()[0][0]
        for col in valid_column_names
    }

    print("Gathering missing value information...")
    n_nulls = {
        col: df.filter(df[col].isNull()).count() for col in valid_column_names
    }

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none was specified...")
        for col in valid_column_names:
            # _simple_entropy is a module-level helper defined elsewhere.
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                # Normalize by the maximum possible entropy for this cardinality.
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]
        print(f"Using {target_field} column as target field.")

    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)
    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        # Drop columns already chosen and columns whose cardinality would push
        # the total combination count past the remaining segment budget.
        valid_column_names = {
            col
            for col in valid_column_names
            if col not in segments
            and n_uniques[col] * segments_used <= (max_segments - segments_used)
        }
        # _find_best_split is a module-level helper defined elsewhere.
        _, segment_column_name = _find_best_split(
            countdf,
            current_split_columns,
            list(valid_column_names),
            target_column_name=target_field,
            normalization=n_uniques,
        )
        if not segment_column_name:
            break
        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
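
# Hypothetical usage sketch (not part of the original module): runs
# entropy-based autosegmentation on a toy dataframe, letting the function
# pick its own maximum-entropy target column. Assumes the module's
# _simple_entropy and _find_best_split helpers are available; the sample
# column names and values are invented for illustration.
def _demo_estimate_segments(spark: pyspark.sql.SparkSession) -> List[str]:
    sample = spark.createDataFrame(
        [
            ("US", "mobile", "premium"),
            ("US", "desktop", "free"),
            ("DE", "mobile", "free"),
            ("DE", "desktop", "premium"),
        ],
        ["country", "device", "plan"],
    )
    # With max_segments=8, the product of the chosen columns' cardinalities
    # (each 2 here) must stay within the budget enforced in the while loop.
    return estimate_segments(sample, max_segments=8)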