Exemple #1
0
    def hist_numeric(_min_max, _buckets):
        """
        Build the histogram expressions for a numeric column.

        Relies on ``df``, ``col_name``, ``create_buckets`` and ``create_exprs``
        from the enclosing scope.

        :param _min_max: Mapping with "min" and "max" entries, or None to
            aggregate both values from ``df``.
        :param _buckets: Forwarded as the third argument of ``create_buckets``.
        :return: Result of ``create_exprs``, or None when either bound is None.
        """
        if _min_max is None:
            # No bounds supplied: aggregate them from the dataframe.
            bounds = df.agg(F.min(col_name).alias("min"), F.max(col_name).alias("max"))
            _min_max = bounds.to_dict()[0]

        # Without both bounds there is no range to split into buckets.
        if _min_max["min"] is None or _min_max["max"] is None:
            return None

        numeric_buckets = create_buckets(_min_max["min"], _min_max["max"], _buckets)
        return create_exprs(col_name, numeric_buckets, F.col)
Exemple #2
0
    def hist_date():
        """
        Build a map expression holding one histogram per date/time part.

        Relies on ``col_name``, ``create_buckets`` and ``create_exprs`` from
        the enclosing scope. The resulting map has the keys "years", "months",
        "weekdays", "hours", "minutes" and "seconds".

        :return: ``F.create_map`` expression of key -> histogram expression.
        """
        oldest_year = 1950
        current_year = datetime.datetime.now().year

        # (map key, lower bound, upper bound, third create_buckets arg, extractor)
        # NOTE(review): "weekdays" pairs F.dayofweek with 1..31 bounds, which
        # looks like a day-of-month range - confirm the intended extractor.
        parts = (
            ("years", oldest_year, current_year, current_year - oldest_year, F.year),
            ("months", 1, 12, 11, F.month),
            ("weekdays", 1, 31, 31, F.dayofweek),
            ("hours", 0, 23, 23, F.hour),
            ("minutes", 0, 60, 60, F.minute),
            ("seconds", 0, 60, 60, F.second),
        )

        # Build every per-part histogram first, in declaration order.
        histograms = [
            (key, create_exprs(col_name, create_buckets(low, high, size), extractor))
            for key, low, high, size, extractor in parts
        ]

        # create_map expects a flat key, value, key, value, ... argument list.
        map_args = []
        for key, histogram in histograms:
            map_args.append(F.lit(key))
            map_args.append(histogram)

        return F.create_map(*map_args)
Exemple #3
0
 def hist_string(_buckets):
     """
     Build the histogram expressions for a string column, based on the
     length of its values.

     Relies on ``col_name``, ``create_buckets`` and ``create_exprs`` from the
     enclosing scope.

     :param _buckets: Forwarded as the third argument of ``create_buckets``.
     :return: Result of ``create_exprs`` using ``F.length`` as extractor.
     """
     # Lengths are bucketed over the fixed 0..50 range.
     length_buckets = create_buckets(0, 50, _buckets)
     return create_exprs(col_name, length_buckets, F.length)
Exemple #4
0
def hist_agg(col_name, df, buckets, min_max=None):
    """
    Create a column expression to calculate a column histogram
    :param col_name: Name of the column the histogram is built for
    :param df: Dataframe holding the column; used to check the column type
    and, for numeric columns, to aggregate min/max when min_max is None
    :param buckets: Forwarded as the third argument of create_buckets for
    numeric and string columns (presumably the number of buckets); date
    columns use fixed values instead
    :param min_max: Optional mapping with "min" and "max" entries for numeric
    columns; aggregated from df when None
    :return: Column expression with the histogram, or None when the column
    type is not handled or a numeric column has no min/max
    """
    def create_exprs(_input_col, _buckets, _func):
        """
        Build an array column with one count/lower/upper map per bucket.
        :param _input_col: Name of the column to bucket
        :param _buckets: Sequence of dicts with "lower", "upper" and "bucket"
        :param _func: Callable applied to the column name to get the value
        that is compared against the bucket bounds
        :return: Aliased F.array expression
        """
        def count_exprs(_exprs):
            # Count the rows for which the boolean expression holds
            return F.sum(F.when(_exprs, 1).otherwise(0))

        value = _func(_input_col)
        last_index = len(_buckets) - 1

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)
            if is_numeric(upper):
                upper = round(upper, 2)

            if i == last_index:
                # The last bucket is closed on both sides so values equal to
                # the overall upper bound (e.g. the column max) are counted.
                # The previous `i == len(_buckets)` test could never be true
                # under enumerate, which silently dropped those values.
                in_bucket = (value >= lower) & (value <= upper)
            else:
                # Every other bucket is half-open: [lower, upper)
                in_bucket = (value >= lower) & (value < upper)

            count = count_exprs(in_bucket)
            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower), F.lit("upper"),
                                F.lit(upper)).alias("hist_agg" + "_" +
                                                    _input_col + "_" +
                                                    str(b["bucket"]))
            _exprs.append(info)
        return F.array(*_exprs).alias("hist" + _input_col)

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        if min_max is None:
            # No bounds supplied: aggregate them from the dataframe
            min_max = df.agg(
                F.min(col_name).alias("min"),
                F.max(col_name).alias("max")).to_dict()[0]

        if min_max["min"] is not None and min_max["max"] is not None:
            buckets = create_buckets(min_max["min"], min_max["max"], buckets)
            exprs = create_exprs(col_name, buckets, F.col)
        else:
            # Without both bounds there is no range to split into buckets
            exprs = None

    elif is_column_a(df, col_name, "str"):
        # Strings are bucketed by length over the fixed 0..50 range
        buckets = create_buckets(0, 50, buckets)
        exprs = create_exprs(col_name, buckets, F.length)

    elif is_column_a(df, col_name, "date"):
        now = datetime.datetime.now()
        current_year = now.year
        oldest_year = 1950

        # Year
        buckets = create_buckets(oldest_year, current_year,
                                 current_year - oldest_year)
        year = create_exprs(col_name, buckets, F.year)

        # Month
        buckets = create_buckets(1, 12, 11)
        month = create_exprs(col_name, buckets, F.month)

        # Day
        # NOTE(review): F.dayofweek is paired with 1..31 bounds, which looks
        # like a day-of-month range - confirm the intended extractor.
        buckets = create_buckets(1, 31, 31)
        day = create_exprs(col_name, buckets, F.dayofweek)

        # Hour
        buckets = create_buckets(0, 23, 23)
        hour = create_exprs(col_name, buckets, F.hour)

        # Min
        buckets = create_buckets(0, 60, 60)
        minutes = create_exprs(col_name, buckets, F.minute)

        # Second
        buckets = create_buckets(0, 60, 60)
        second = create_exprs(col_name, buckets, F.second)

        exprs = F.create_map(F.lit("years"), year, F.lit("months"), month,
                             F.lit("weekdays"), day, F.lit("hours"), hour,
                             F.lit("minutes"), minutes, F.lit("seconds"),
                             second)
    else:
        # Column type is not handled
        exprs = None

    return exprs