Example #1
    def extra_stats(self, df, col_name, stats):
        """
        Specific Stats for numeric columns
        :param df:
        :param col_name:
        :param stats:
        :return:
        """

        col_info = {}

        max_value = stats[col_name]["max"]
        min_value = stats[col_name]["min"]

        if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
            stddev = stats[col_name]['stddev']
            mean = stats[col_name]['mean']

            quantile = stats[col_name]["percentile"]
            col_info['range'] = max_value - min_value
            col_info['median'] = quantile["0.5"]
            col_info['interquartile_range'] = quantile["0.75"] - quantile["0.25"]

            if mean is not None and mean != 0:
                col_info['coef_variation'] = round((stddev / mean), 5)
            else:
                col_info['coef_variation'] = 0

            col_info['mad'] = round(df.cols.mad(col_name), 5)

        col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2)
        col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)
        return col_info
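
A hedged sketch of the input this method expects. The profiler instance, the column name "price" and all concrete values below are assumptions; stats would normally come from a prior aggregation pass such as the columns_agg shown later:

# Hypothetical shape of the precomputed stats dict (all values are made up)
stats = {
    "price": {
        "max": 120.0, "min": 3.5,
        "stddev": 14.2, "mean": 42.7,
        "percentile": {"0.25": 30.0, "0.5": 41.0, "0.75": 55.0},
        "count_na": 12, "count_uniques": 830,
    }
}

# With rows_count already set on the profiler instance:
# col_info = profiler.extra_stats(df, "price", stats)
# col_info -> {'range': 116.5, 'median': 41.0, 'interquartile_range': 25.0, ...}
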
Example #2
def percentile_agg(col_name, df, values, relative_error):
    """
    Return the percentile of a dataframe
    :param col_name:  '*', list of columns names or a single column name.
    :param df:
    :param values: list of percentiles to be calculated
    :param relative_error:  If set to zero, the exact percentiles are computed, which could be very expensive. 0 to 1 accepted
    :return: percentiles per columns
    """

    # Default the percentiles and render them as strings for the SQL expression

    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]

    values = val_to_list(values)
    values = list(map(str, values))

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # Get percentiles

        p = F.expr(
            "percentile_approx(`{COLUMN}`, array({VALUES}), {ERROR})".format(
                COLUMN=col_name,
                VALUES=" , ".join(values),
                ERROR=relative_error))

        # Zip the arrays
        expr = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
        expr = F.create_map(*list(itertools.chain(*expr)))

    else:
        expr = None
    # print(expr)
    return expr
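
A brief usage sketch, assuming a local SparkSession and that percentile_agg, its helpers (is_column_a, val_to_list) and the RELATIVE_ERROR constant used as a default later in this file are importable from the surrounding library; the column name "price" is made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (5.0,), (9.0,)], ["price"])

# Build the percentile map expression and run it in a single aggregation pass
expr = percentile_agg("price", df, values=[0.25, 0.5, 0.75], relative_error=RELATIVE_ERROR)
if expr is not None:
    df.agg(expr.alias("price_percentiles")).show(truncate=False)
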
Example #3
def count_na_agg(col_name, df):
    """
    Build an aggregation expression that counts null/NaN values in a column
    :param col_name: column name
    :param df: Spark DataFrame
    """
    # If the column type is Struct, cast it to String first; isnan/isNull cannot handle Struct/Boolean
    # if is_column_a(df, col_name, ["struct", "boolean"]):
    #     df = df.cols.cast(col_name, "string")

    # Select the nan/null rows depending on the column's data type
    # If numeric
    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        expr = F.count(F.when(match_nulls_integers(col_name), col_name))
    # If string, also count the literal 'nan' string
    elif is_column_a(df, col_name, PYSPARK_STRING_TYPES):
        expr = F.count(F.when(match_nulls_strings(col_name), col_name))
        # print("Including 'nan' as Null in processing string type column '{}'".format(col_name))
    else:
        expr = F.count(F.when(match_null(col_name), col_name))

    return expr
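
A matching sketch for the null counter, reusing the df from the sketch under Example #2; the match_nulls_integers, match_nulls_strings and match_null helpers belong to the surrounding library and are assumed importable:

# Count nulls/NaNs for one column in a single aggregation pass
expr = count_na_agg("price", df)
df.agg(expr.alias("price_count_na")).show()
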
Example #4
        def extra_columns_stats(df, col_name, stats):
            """
            Specific Stats for numeric columns
            :param df:
            :param col_name:
            :param stats:
            :return:
            """

            col_info = {}

            max_value = stats[col_name]["max"]
            min_value = stats[col_name]["min"]

            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                stddev = stats[col_name]['stddev']
                mean = stats[col_name]['mean']

                quantile = stats[col_name]["percentile"]
                if max_value is not None and min_value is not None:
                    col_info['range'] = max_value - min_value
                else:
                    col_info['range'] = None

                col_info['median'] = quantile["0.5"]

                q1 = quantile["0.25"]
                q3 = quantile["0.75"]

                if q1 is not None and q3 is not None:
                    col_info['interquartile_range'] = q3 - q1
                else:
                    col_info['interquartile_range'] = None

                if mean is not None and mean != 0:
                    col_info['coef_variation'] = round((stddev / mean), 5)
                else:
                    col_info['coef_variation'] = None

                mad = df.cols.mad(col_name)
                if mad is not None:
                    col_info['mad'] = round(mad, 5)
                else:
                    col_info['mad'] = None

            if self.rows_count is None:
                self.rows_count = df.count()

            col_info['p_count_na'] = round(
                (stats[col_name]['count_na'] * 100) / self.rows_count, 2)
            col_info['p_count_uniques'] = round(
                (stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)
            return col_info
Example #5
    def minimal_stats(df, columns, buckets=10, approx_count=True):
        columns = parse_columns(df, columns)
        n = 60
        list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
        # We have problems sending 100+ columns at the same time, so process them in batches

        result = {}
        for i, cols in enumerate(list_columns):
            logger.print("Batch {BATCH_NUMBER}. Processing columns{COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [count_uniques_agg]
            exprs = df.cols.create_exprs(cols, funcs, approx_count)

            funcs = [F.min, F.max]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            funcs = [count_na_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df))
            result.update(df.cols.exec_agg(exprs))

        n = 60
        # Timing notes per batch size: 40 -> 2:46, 50 -> 2:12
        list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [hist_agg]
            exprs = []

            # Build one histogram expression per numeric column, using that column's own min/max
            for col_name in cols:
                if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                    min_max = {"min": result[col_name]["min"], "max": result[col_name]["max"]}
                    exprs.extend(df.cols.create_exprs(col_name, funcs, df, buckets, min_max))

            if exprs:
                result.update(df.cols.exec_agg(exprs))
        return result
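
A hypothetical invocation. minimal_stats is defined without self, so it is presumably reachable as a static helper; df must be wrapped so the df.cols.create_exprs / df.cols.exec_agg accessors used above exist (as in Optimus), and the column name "price" is again made up:

# result is keyed by column name; each entry holds the aggregations built above
result = minimal_stats(df, "*", buckets=10, approx_count=True)
print(result["price"]["min"], result["price"]["max"], result["price"]["count_na"])
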
Example #6
    def columns_agg(df,
                    columns,
                    buckets=10,
                    relative_error=RELATIVE_ERROR,
                    approx_count=True):
        columns = parse_columns(df, columns)
        n = BATCH_SIZE
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        # We have problems sending 100+ columns at the same time, so process them in batches

        result = {}
        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [count_uniques_agg]
            exprs = df.cols.create_exprs(cols, funcs, approx_count)

            # TODO: in basic calculations funcs = [F.min, F.max]
            funcs = [
                F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
                F.variance, zeros_agg
            ]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            # TODO: None in basic calculation
            funcs = [percentile_agg]
            exprs.extend(
                df.cols.create_exprs(cols, funcs, df,
                                     [0.05, 0.25, 0.5, 0.75, 0.95],
                                     relative_error))

            funcs = [count_na_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df))
            result.update(df.cols.exec_agg(exprs))

        exprs = []
        n = BATCH_SIZE
        result_hist = {}
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [hist_agg]
            # min_max = None

            for col_name in cols:
                # Only compute a histogram if the column is numeric. Other data types use frequency instead
                if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                    min_max = {
                        "min": result[col_name]["min"],
                        "max": result[col_name]["max"]
                    }
                    buckets = result[col_name]["count_uniques"] - 1
                    if buckets > MAX_BUCKETS:
                        buckets = MAX_BUCKETS
                    elif buckets == 0:
                        buckets = 1
                    exprs.extend(
                        df.cols.create_exprs(col_name, funcs, df, buckets,
                                             min_max))

            agg_result = df.cols.exec_agg(exprs)
            if agg_result is not None:
                result_hist.update(agg_result)

        # Merge results
        for col_name in result:
            if col_name in result_hist:
                result[col_name].update(result_hist[col_name])
        return result
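
A sketch of how the merged result might be consumed, under the same assumptions about the Optimus-style df.cols accessors; the keys shown ("mean", "stddev", "percentile") are the ones the extra-stats helpers elsewhere in this file read back from this output:

result = columns_agg(df, "*", buckets=10, approx_count=True)

# Each per-column entry combines the batched stats with its histogram, merged in at the end
price_stats = result["price"]
print(price_stats["mean"], price_stats["stddev"], price_stats["percentile"]["0.5"])
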
Example #7
def hist_agg(col_name, df, buckets, min_max=None, dtype=None):
    """
    Create a columns expression to calculate a column histogram
    :param col_name:
    :param df:
    :param buckets:
    :param min_max: Min and max vaule neccesary to calculate the buckets
    :param dtype: Column datatype to calculate the related histogram. Int, String and Dates return different histograms

    :return:
    """
    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)

            if is_numeric(upper):
                upper = round(upper, 2)

            if len(_buckets) == 1:
                count = count_exprs((_func(_input_col) == lower))
            else:
                # Close the last bucket on its upper bound so the maximum value is not dropped
                if i == len(_buckets) - 1:
                    count = count_exprs((_func(_input_col) >= lower)
                                        & (_func(_input_col) <= upper))
                else:
                    count = count_exprs((_func(_input_col) >= lower)
                                        & (_func(_input_col) < upper))

            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower), F.lit("upper"),
                                F.lit(upper)).alias("hist_agg" + "_" +
                                                    _input_col + "_" +
                                                    str(b["bucket"]))
            _exprs.append(info)
        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        return _exprs

    def hist_numeric(_min_max, _buckets):
        if _min_max is None:
            _min_max = df.agg(
                F.min(col_name).alias("min"),
                F.max(col_name).alias("max")).to_dict()[0]

        if _min_max["min"] is not None and _min_max["max"] is not None:
            _buckets = create_buckets(_min_max["min"], _min_max["max"],
                                      _buckets)
            _exprs = create_exprs(col_name, _buckets, F.col)
        else:
            _exprs = None

        return _exprs

    def hist_string(_buckets):
        _buckets = create_buckets(0, 50, _buckets)
        func = F.length
        return create_exprs(col_name, _buckets, func)

    def hist_date():
        now = datetime.datetime.now()
        current_year = now.year
        oldest_year = 1950

        # Year
        _buckets = create_buckets(oldest_year, current_year,
                                  current_year - oldest_year)
        func = F.year
        year = create_exprs(col_name, _buckets, func)

        # Month
        _buckets = create_buckets(1, 12, 11)
        func = F.month
        month = create_exprs(col_name, _buckets, func)

        # Day
        _buckets = create_buckets(1, 31, 31)
        func = F.dayofweek
        day = create_exprs(col_name, _buckets, func)

        # Hour
        _buckets = create_buckets(0, 23, 23)
        func = F.hour
        hour = create_exprs(col_name, _buckets, func)

        # Min
        _buckets = create_buckets(0, 60, 60)
        func = F.minute
        minutes = create_exprs(col_name, _buckets, func)

        # Second
        _buckets = create_buckets(0, 60, 60)
        func = F.second
        second = create_exprs(col_name, _buckets, func)

        exprs = F.create_map(F.lit("years"), year, F.lit("months"), month,
                             F.lit("weekdays"), day, F.lit("hours"), hour,
                             F.lit("minutes"), minutes, F.lit("seconds"),
                             second)

        return exprs

    if dtype is not None:
        col_dtype = dtype[col_name]["dtype"]
        if col_dtype == "int" or col_dtype == "decimal":
            exprs = hist_numeric(min_max, buckets)
        elif col_dtype == "string":
            exprs = hist_string(buckets)
        elif col_dtype == "date":
            exprs = hist_date()
        else:
            exprs = None
    else:
        if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
            exprs = hist_numeric(min_max, buckets)

        elif is_column_a(df, col_name, "str"):
            exprs = hist_string(buckets)

        elif is_column_a(df, col_name, "date") or is_column_a(
                df, col_name, "timestamp"):
            exprs = hist_date()
        else:
            exprs = None

    return exprs
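
A hedged sketch of calling hist_agg directly, assuming hist_agg, create_buckets and is_column_a are importable and df is a Spark DataFrame with a numeric "price" column; passing min_max explicitly avoids the library-specific to_dict() path used when it is None:

# Numeric histogram with 5 buckets over an assumed value range
expr = hist_agg("price", df, buckets=5, min_max={"min": 0.0, "max": 100.0})
if expr is not None:
    df.agg(expr).show(truncate=False)
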
Example #8
def hist_agg(col_name, df, buckets, min_max=None):
    """
    Create a columns expression to calculate a column histogram
    :param col_name:
    :param df:
    :param buckets:
    :return:
    """
    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)
            if is_numeric(upper):
                upper = round(upper, 2)

            # Close the last bucket on its upper bound so the maximum value is not dropped
            if i == len(_buckets) - 1:
                count = count_exprs((_func(_input_col) >= lower)
                                    & (_func(_input_col) <= upper))
            else:
                count = count_exprs((_func(_input_col) >= lower)
                                    & (_func(_input_col) < upper))
            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower), F.lit("upper"),
                                F.lit(upper)).alias("hist_agg" + "_" +
                                                    _input_col + "_" +
                                                    str(b["bucket"]))
            _exprs.append(info)
        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        # print(_exprs)
        return _exprs

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        if min_max is None:
            min_max = df.agg(
                F.min(col_name).alias("min"),
                F.max(col_name).alias("max")).to_dict()[0]

        if min_max["min"] is not None and min_max["max"] is not None:
            buckets = create_buckets(min_max["min"], min_max["max"], buckets)
            func = F.col
            exprs = create_exprs(col_name, buckets, func)
        else:
            exprs = None

    elif is_column_a(df, col_name, "str"):
        buckets = create_buckets(0, 50, buckets)
        func = F.length
        exprs = create_exprs(col_name, buckets, func)

    elif is_column_a(df, col_name, "date"):

        now = datetime.datetime.now()
        current_year = now.year
        oldest_year = 1950

        # Year
        buckets = create_buckets(oldest_year, current_year,
                                 current_year - oldest_year)
        func = F.year
        year = create_exprs(col_name, buckets, func)

        # Month
        buckets = create_buckets(1, 12, 11)
        func = F.month
        month = create_exprs(col_name, buckets, func)

        # Day
        buckets = create_buckets(1, 31, 31)
        func = F.dayofweek
        day = create_exprs(col_name, buckets, func)

        # Hour
        buckets = create_buckets(0, 23, 23)
        func = F.hour
        hour = create_exprs(col_name, buckets, func)

        # Min
        buckets = create_buckets(0, 60, 60)
        func = F.minute
        minutes = create_exprs(col_name, buckets, func)

        # Second
        buckets = create_buckets(0, 60, 60)
        func = F.second
        second = create_exprs(col_name, buckets, func)

        exprs = F.create_map(F.lit("years"), year, F.lit("months"), month,
                             F.lit("weekdays"), day, F.lit("hours"), hour,
                             F.lit("minutes"), minutes, F.lit("seconds"),
                             second)
    else:
        exprs = None

    return exprs
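
For reference, a sketch of the bucket shape that create_exprs consumes. create_buckets belongs to the surrounding library, so its exact output is an assumption inferred from the "lower", "upper" and "bucket" keys read above:

buckets = create_buckets(0, 10, 2)
for b in buckets:
    # each bucket is expected to expose a "bucket" index plus "lower"/"upper" bounds
    print(b["bucket"], b["lower"], b["upper"])
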
Example #9
    def columns_agg(self,
                    df,
                    columns,
                    buckets=10,
                    relative_error=RELATIVE_ERROR,
                    approx_count=True,
                    advanced_stats=True):
        columns = parse_columns(df, columns)
        n = BATCH_SIZE
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        # We have problems sending 100+ columns at the same time, so process them in batches

        result = {}

        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            # Counting uniques is needed to calculate the histogram buckets
            funcs = [count_uniques_agg]
            exprs = df.cols.create_exprs(cols, funcs, approx_count)

            funcs = [F.min, F.max]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            funcs = [count_na_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df))

            if advanced_stats is True:
                funcs = [
                    F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
                    F.variance, zeros_agg
                ]
                exprs.extend(df.cols.create_exprs(cols, funcs))

                # TODO: None in basic calculation
                funcs = [percentile_agg]
                exprs.extend(
                    df.cols.create_exprs(cols, funcs, df,
                                         [0.05, 0.25, 0.5, 0.75, 0.95],
                                         relative_error))

            result.update(df.cols.exec_agg(exprs))

        exprs = []
        n = BATCH_SIZE
        result_hist = {}
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]

        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [hist_agg]

            for col_name in cols:
                # Only compute a histogram if the column is numeric. Other data types use frequency instead
                if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                    min_max = {
                        "min": result[col_name]["min"],
                        "max": result[col_name]["max"]
                    }
                    buckets = result[col_name]["count_uniques"] - 1
                    if buckets > MAX_BUCKETS:
                        buckets = MAX_BUCKETS
                    elif buckets == 0:
                        buckets = 1
                    exprs.extend(
                        df.cols.create_exprs(col_name, funcs, df, buckets,
                                             min_max))
            agg_result = df.cols.exec_agg(exprs)
            if agg_result is not None:
                result_hist.update(agg_result)

        # Merge results
        for col_name in result:
            if col_name in result_hist:
                result[col_name].update(result_hist[col_name])

        def extra_columns_stats(df, col_name, stats):
            """
            Specific Stats for numeric columns
            :param df:
            :param col_name:
            :param stats:
            :return:
            """

            col_info = {}

            max_value = stats[col_name]["max"]
            min_value = stats[col_name]["min"]

            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                stddev = stats[col_name]['stddev']
                mean = stats[col_name]['mean']

                quantile = stats[col_name]["percentile"]
                if max_value is not None and min_value is not None:
                    col_info['range'] = max_value - min_value
                else:
                    col_info['range'] = None

                col_info['median'] = quantile["0.5"]

                q1 = quantile["0.25"]
                q3 = quantile["0.75"]

                if q1 is not None and q3 is not None:
                    col_info['interquartile_range'] = q3 - q1
                else:
                    col_info['interquartile_range'] = None

                if mean is not None and mean != 0:
                    col_info['coef_variation'] = round((stddev / mean), 5)
                else:
                    col_info['coef_variation'] = None

                mad = df.cols.mad(col_name)
                if mad is not None:
                    col_info['mad'] = round(mad, 5)
                else:
                    col_info['mad'] = None

            if self.rows_count is None:
                self.rows_count = df.count()

            col_info['p_count_na'] = round(
                (stats[col_name]['count_na'] * 100) / self.rows_count, 2)
            col_info['p_count_uniques'] = round(
                (stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)
            return col_info

        if advanced_stats is True:
            for col_name in columns:
                result[col_name].update(extra_columns_stats(df, col_name, result))

        return result
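
Finally, a hedged end-to-end sketch; the profiler instance and the column name are assumptions, and the derived keys shown are the ones produced by extra_columns_stats above:

# Assumes profiler.rows_count is set (or None, in which case it is computed) and df exposes df.cols
result = profiler.columns_agg(df, "*", buckets=10, approx_count=True, advanced_stats=True)

# With advanced_stats=True each column entry also carries the derived fields
print(result["price"]["median"], result["price"]["interquartile_range"], result["price"]["coef_variation"])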