def hist_numeric(_min_max, _buckets):
    """
    Build the histogram expression for a numeric column.

    ``col_name``, ``df``, ``create_buckets`` and ``create_exprs`` are not
    parameters; they are looked up in the surrounding scope.
    :param _min_max: Mapping with "min" and "max" entries, or None to have
    both aggregated from ``df``
    :param _buckets: Value forwarded as the third argument of create_buckets
    :return: The result of create_exprs, or None when the min or the max is None
    """
    bounds = _min_max
    if bounds is None:
        # No range supplied by the caller: aggregate it from the dataframe.
        lowest = F.min(col_name).alias("min")
        highest = F.max(col_name).alias("max")
        bounds = df.agg(lowest, highest).to_dict()[0]

    # Without both ends of the range no buckets can be derived.
    if bounds["min"] is None or bounds["max"] is None:
        return None

    ranges = create_buckets(bounds["min"], bounds["max"], _buckets)
    return create_exprs(col_name, ranges, F.col)
def hist_date():
    """
    Build the histogram expression for a date column.

    One bucket expression is created per date part and all of them are
    gathered in a single map column keyed by "years", "months", "weekdays",
    "hours", "minutes" and "seconds". ``col_name``, ``create_buckets`` and
    ``create_exprs`` are looked up in the surrounding scope.
    :return: The map column produced by F.create_map
    """
    this_year = datetime.datetime.now().year
    first_year = 1950

    # (map key, lowest value, highest value, third create_buckets argument, extractor)
    # NOTE(review): "weekdays" pairs F.dayofweek with a 1..31 range - confirm
    # that this is intended.
    date_parts = (
        ("years", first_year, this_year, this_year - first_year, F.year),
        ("months", 1, 12, 11, F.month),
        ("weekdays", 1, 31, 31, F.dayofweek),
        ("hours", 0, 23, 23, F.hour),
        ("minutes", 0, 60, 60, F.minute),
        ("seconds", 0, 60, 60, F.second),
    )

    map_entries = []
    for key, lowest, highest, amount, extractor in date_parts:
        ranges = create_buckets(lowest, highest, amount)
        map_entries.append(F.lit(key))
        map_entries.append(create_exprs(col_name, ranges, extractor))

    return F.create_map(*map_entries)
def hist_string(_buckets):
    """
    Build the histogram expression for a string column.

    The histogram is computed over F.length of the column, with buckets
    created between 0 and 50. ``col_name``, ``create_buckets`` and
    ``create_exprs`` are looked up in the surrounding scope.
    :param _buckets: Value forwarded as the third argument of create_buckets
    :return: The result of create_exprs
    """
    length_ranges = create_buckets(0, 50, _buckets)
    return create_exprs(col_name, length_ranges, F.length)
def hist_agg(col_name, df, buckets, min_max=None):
    """
    Create a columns expression to calculate a column histogram
    :param col_name: Name of the column the histogram is calculated for
    :param df: Dataframe holding the column. Used to check the column type
    and, for numeric columns without min_max, to aggregate the min and max
    :param buckets: Value passed as the third argument of create_buckets for
    numeric and string columns (presumably the number of buckets)
    :param min_max: Optional mapping with "min" and "max" entries for numeric
    columns. Aggregated from df when None
    :return: A column expression, or None when the column is not numeric,
    string or date, or when a numeric column has no min or max
    """

    def create_exprs(_input_col, _buckets, _func):
        """
        Build an array column holding one count/lower/upper map per bucket
        :param _input_col: Name of the column to count over
        :param _buckets: Sequence of dicts with "lower", "upper" and "bucket" entries
        :param _func: Function applied to the column name to get the value to compare
        :return: Array column aliased "hist" + _input_col
        """

        def count_exprs(_exprs):
            # Number of rows for which the boolean expression holds.
            return F.sum(F.when(_exprs, 1).otherwise(0))

        last_index = len(_buckets) - 1
        bucket_maps = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)
            if is_numeric(upper):
                upper = round(upper, 2)

            value = _func(_input_col)
            if i == last_index:
                # The last bucket is closed on both ends, otherwise values
                # equal to the overall upper bound would not be counted in
                # any bucket.
                in_bucket = (value >= lower) & (value <= upper)
            else:
                in_bucket = (value >= lower) & (value < upper)

            info = F.create_map(
                F.lit("count"), count_exprs(in_bucket).cast("int"),
                F.lit("lower"), F.lit(lower),
                F.lit("upper"), F.lit(upper),
            ).alias("hist_agg" + "_" + _input_col + "_" + str(b["bucket"]))
            bucket_maps.append(info)

        return F.array(*bucket_maps).alias("hist" + _input_col)

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        if min_max is None:
            min_max = df.agg(
                F.min(col_name).alias("min"),
                F.max(col_name).alias("max")).to_dict()[0]

        # Without both ends of the range no buckets can be derived.
        if min_max["min"] is None or min_max["max"] is None:
            return None

        buckets = create_buckets(min_max["min"], min_max["max"], buckets)
        return create_exprs(col_name, buckets, F.col)

    if is_column_a(df, col_name, "str"):
        # String columns are bucketed by length, between 0 and 50.
        buckets = create_buckets(0, 50, buckets)
        return create_exprs(col_name, buckets, F.length)

    if is_column_a(df, col_name, "date"):
        current_year = datetime.datetime.now().year
        oldest_year = 1950

        # (map key, lowest value, highest value, third create_buckets argument, extractor)
        # NOTE(review): "weekdays" pairs F.dayofweek with a 1..31 range -
        # confirm whether a 1..7 range or F.dayofmonth was intended.
        date_parts = (
            ("years", oldest_year, current_year, current_year - oldest_year, F.year),
            ("months", 1, 12, 11, F.month),
            ("weekdays", 1, 31, 31, F.dayofweek),
            ("hours", 0, 23, 23, F.hour),
            ("minutes", 0, 60, 60, F.minute),
            ("seconds", 0, 60, 60, F.second),
        )

        map_entries = []
        for key, lowest, highest, amount, extractor in date_parts:
            part_buckets = create_buckets(lowest, highest, amount)
            map_entries.append(F.lit(key))
            map_entries.append(create_exprs(col_name, part_buckets, extractor))
        return F.create_map(*map_entries)

    return None