Example #1
 def test_dimension(self, targetDimension, testDimension):
     if targetDimension not in self._dataframe_helper.get_string_columns():
         raise BIException.non_string_column(targetDimension)
     chisquare_result = ChiSquareResult()
     pivot_table = self._data_frame.stat.crosstab(
         "{}".format(targetDimension), testDimension)
     # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
     rdd = list(
         chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
     data_matrix = Matrices.dense(pivot_table.count(),
                                  len(pivot_table.columns) - 1, rdd)
     result = Statistics.chiSqTest(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     stat_value = result.statistic
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
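For reference, the same Cramér's V computation can be reproduced outside Spark on a small contingency table; scipy and the 2x3 table below are illustrative assumptions, not part of the original module. Note that the snippet above divides by min(rows, cols), whereas the classical Cramér's V divides by min(rows - 1, cols - 1).

import math
from scipy.stats import chi2_contingency

# hypothetical 2x3 table of observed frequencies
table = [[10, 20, 30],
         [25, 15, 20]]
chi2, p_value, dof, expected = chi2_contingency(table)
n = sum(sum(row) for row in table)    # total number of observations
t = min(len(table), len(table[0]))    # min(rows, cols), mirroring the snippet
v_value = math.sqrt(chi2 / (n * t))
print(chi2, p_value, v_value)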
Example #2
    def stats_for_dimension_column(self, dimension_column):
        if not self._dataframe_helper.is_string_column(dimension_column):
            raise BIException.non_string_column(dimension_column)

        col_non_nulls = FN.count(dimension_column).alias('non_nulls')
        col_nulls = FN.sum(
            FN.col(dimension_column).isNull().cast('integer')).alias('nulls')
        aggregate_columns = (col_non_nulls, col_nulls)
        result = self._data_frame.select(
            *aggregate_columns).collect()[0].asDict()
        cardinality = self._data_frame.select(
            FN.col(dimension_column)).distinct().count()

        # TODO column value frequencies
        descr_stats = DimensionDescriptiveStats(
            num_null_values=result.get('nulls'),
            num_non_null_values=result.get('non_nulls'),
            cardinality=cardinality)

        if cardinality > DescriptiveStats.MAX_NUM_LEVELS:
            return descr_stats

        freq = {}
        level_and_counts = self._data_frame.groupBy(
            dimension_column).count().sort(FN.desc('count')).collect()
        for row in level_and_counts:
            freq[row[0]] = row[1]

        descr_stats.set_value_frequencies(freq)
        return descr_stats
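A minimal pandas sketch of the same null/non-null/cardinality/frequency aggregation, assuming a hypothetical DataFrame df with a string column 'city':

import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', None, 'NY', 'SF']})
non_nulls = int(df['city'].count())            # 4 non-null values
nulls = int(df['city'].isnull().sum())         # 1 null value
cardinality = df['city'].nunique(dropna=True)  # 3 distinct levels
freq = df['city'].value_counts().to_dict()     # {'NY': 2, 'LA': 1, 'SF': 1}
print(non_nulls, nulls, cardinality, freq)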
Example #3
    def stats_for_measure_column(self, measure_column):
        if not self._dataframe_helper.is_numeric_column(measure_column):
            raise BIException.non_numeric_column(measure_column)

        descr_stats = MeasureDescriptiveStats()
        num_values = self._data_frame.select(measure_column).count()
        min_value = Stats.min(self._data_frame, measure_column)
        max_value = Stats.max(self._data_frame, measure_column)
        total_value = Stats.total(self._data_frame, measure_column)
        mean = Stats.mean(self._data_frame, measure_column)
        variance = Stats.variance(self._data_frame, measure_column)
        std_dev = Stats.std_dev(self._data_frame, measure_column)

        if min_value == max_value:
            skewness = 0
            kurtosis = 0
        else:
            skewness = Stats.skew(self._data_frame, measure_column)
            kurtosis = Stats.kurtosis(self._data_frame, measure_column)

        descr_stats.set_summary_stats(num_values=num_values,
                                      min_value=min_value,
                                      max_value=max_value,
                                      total=total_value,
                                      mean=mean,
                                      variance=variance,
                                      std_dev=std_dev,
                                      skew=skewness,
                                      kurtosis=kurtosis)
        descr_stats.set_five_point_summary_stats(
            self.five_point_summary(measure_column))

        descr_stats.set_histogram(
            Binner(self._data_frame,
                   self._dataframe_helper).get_bins(measure_column))

        #descr_stats.set_raw_data([float(row[0]) for row in self._data_frame.select(measure_column).collect()])
        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "statCalculationEnd",\
        #                             "info",\
        #                             self._scriptStages["statCalculationEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "statCalculationEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="script")
        # self._dataframe_context.update_completion_status(self._completionStatus)
        return descr_stats
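The min_value == max_value guard exists because skewness and kurtosis are undefined for a constant column (zero variance). A quick scipy check, illustrative rather than taken from the original module, shows the NaN result the guard avoids:

import numpy as np
from scipy.stats import skew, kurtosis

constant = np.array([5.0, 5.0, 5.0])
print(skew(constant), kurtosis(constant))  # nan nan, with a RuntimeWarning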
Example #4
 def test_dimension(self, targetDimension, testDimension):
     if targetDimension not in self._dataframe_helper.get_string_columns():
         raise BIException.non_string_column(targetDimension)
     chisquare_result = ChiSquareResult()
     if self._pandas_flag:
         pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                   self._data_frame[testDimension])
         try:
             # pandas >= 1.0 removed DataFrame.as_matrix(); fall back to .values
             data_matrix = np.array(
                 pivot_table.as_matrix(columns=None)).astype(int)
         except AttributeError:
             data_matrix = np.array(pivot_table.values).astype(int)
     else:
         pivot_table = self._data_frame.stat.crosstab(
             "{}".format(targetDimension), testDimension)
         # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
         rdd = list(
             chain(*list(
                 zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
         data_matrix = Matrices.dense(pivot_table.count(),
                                      len(pivot_table.columns) - 1, rdd)
         data_matrix = data_matrix.toArray().tolist()
     result = chi2_contingency(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     stat_value = result[0]
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
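scipy's chi2_contingency returns (statistic, p_value, dof, expected_frequencies), which is why result[0] above is the chi-square statistic. A tiny illustrative check:

from scipy.stats import chi2_contingency

stat, p, dof, expected = chi2_contingency([[12, 8], [9, 11]])
print(stat, p, dof)  # statistic, p-value, degrees of freedom (here dof = 1)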
Example #5
    def get_bins(self, column_name, num_bins=10, split_points=None):
        """
        Finds the number of items in each bin. Only one of num_bins or
        split_points needs to be supplied.

        :param column_name: column to be binned
        :param num_bins:    number of bins to create
        :param split_points:    list of tuples [(a, b), (b, c), ...] such that
                                all values in the range [a, b) are assigned to bucket1
        :return: Histogram with one entry per bin
        """
        if column_name not in self._numeric_columns:
            raise BIException.column_does_not_exist(column_name)

        # min_value/max_value are needed later for the constant-column check,
        # so compute them whether or not split points were supplied
        if self._pandas_flag:
            min_value = self._data_frame[column_name].min()
            max_value = self._data_frame[column_name].max()
        else:
            min_max = self._data_frame.agg(
                FN.min(column_name).alias('min'),
                FN.max(column_name).alias('max')).collect()
            min_value = min_max[0]['min']
            max_value = min_max[0]['max']
        if split_points is None:
            # splits = CommonUtils.frange(min_value, max_value, num_bins)
            if self._pandas_flag:
                splits = CommonUtils.return_optimum_bins(
                    self._data_frame[column_name])
            else:
                splits = CommonUtils.return_optimum_bins(
                    self._data_frame.select(column_name).toPandas()
                    [column_name])
            # extend the edges when the optimum bins do not cover the range
            if splits[0] > min_value:
                splits = [min_value - 1] + list(splits)
                print("Min Point Added")
            if splits[-1] < max_value:
                splits = list(splits) + [max_value + 1]
                print("Max Point Added")
        else:
            splits = split_points
        # cast column_name to double type if needed, otherwise Bucketizer does not work
        column_df = None
        if self._pandas_flag:
            binning_df = pd.DataFrame()
            binning_df[BinnerConstants.
                       ORIGINAL_COLUMN_NAME] = self._data_frame[column_name]
        else:
            if self._column_data_types.get(column_name) != DoubleType:
                column_df = self._data_frame.select(
                    FN.col(column_name).cast('double').alias(
                        BinnerConstants.ORIGINAL_COLUMN_NAME))
            else:
                column_df = self._data_frame.select(
                    FN.col(column_name).alias(
                        BinnerConstants.ORIGINAL_COLUMN_NAME))

            bucketizer = Bucketizer(
                inputCol=BinnerConstants.ORIGINAL_COLUMN_NAME,
                outputCol=BinnerConstants.BINNED_COLUMN_NAME)
            bucketizer.setSplits(splits)

        if min_value == max_value:
            histogram = Histogram(column_name, self._num_rows)
            bin_number = 0
            start_value = int(min_value - 0.5)
            end_value = int(max_value + 0.5)
            histogram.add_bin(bin_number, start_value, end_value,
                              self._num_rows)
        else:
            if self._pandas_flag:
                binning_df[BinnerConstants.BINNED_COLUMN_NAME] = pd.cut(
                    self._data_frame[column_name],
                    bins=splits,
                    labels=list(range(len(splits) - 1)),
                    right=False,
                    include_lowest=True)
                buckets_counts_df = binning_df.groupby(
                    BinnerConstants.BINNED_COLUMN_NAME,
                    as_index=False,
                    sort=False).count()
                histogram = Histogram(column_name, self._num_rows)
                for row in buckets_counts_df.iterrows():
                    bin_number = int(row[1][0])
                    start_value = splits[bin_number]
                    end_value = splits[bin_number + 1]
                    try:
                        histogram.add_bin(bin_number, float(start_value),
                                          float(end_value), float(row[1][1]))
                    except (TypeError, ValueError):
                        histogram.add_bin(bin_number, start_value, end_value,
                                          float(row[1][1]))
            else:
                buckets_and_counts = bucketizer.transform(column_df).groupBy(
                    BinnerConstants.BINNED_COLUMN_NAME).agg({
                        '*': 'count'
                    }).collect()
                histogram = Histogram(column_name, self._num_rows)
                for row in buckets_and_counts:
                    bin_number = int(row[0])
                    start_value = splits[bin_number]
                    end_value = splits[bin_number + 1]
                    histogram.add_bin(bin_number, start_value, end_value,
                                      row[1])

        return histogram
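The pandas branch hinges on pd.cut with right=False, which assigns each value to a left-closed bin labelled by index. A standalone illustration with hypothetical values and edges:

import pandas as pd

values = pd.Series([1, 2, 2, 3, 7, 8, 9])
splits = [0, 3, 6, 10]                 # hypothetical bin edges
bins = pd.cut(values, bins=splits,
              labels=range(len(splits) - 1), right=False)
print(bins.value_counts(sort=False))   # bin 0 -> 3, bin 1 -> 1, bin 2 -> 3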
Example #6
    def test(self,
             measure_column_name,
             dimension_column_name,
             max_num_levels=200):
        if measure_column_name not in self._dataframe_helper.get_numeric_columns():
            raise BIException.non_numeric_column(measure_column_name)
        if dimension_column_name not in self._dataframe_helper.get_string_columns():
            raise BIException.non_string_column(dimension_column_name)
        num_levels = self._data_frame.na.drop(
            subset=dimension_column_name).select(
                dimension_column_name).distinct().count()
        num_rows = self._data_frame.count()
        if num_levels > max_num_levels:
            print('Dimension column(%s) has more than %d levels' %
                  (dimension_column_name, max_num_levels))
            return None
        grand_mean_expr = (FN.mean(measure_column_name).alias(
            OneWayAnova.GRAND_MEAN_COLUMN_NAME), )
        grand_mean = self._data_frame.select(
            *grand_mean_expr).collect()[0][OneWayAnova.GRAND_MEAN_COLUMN_NAME]
        agg_expr = (FN.count(measure_column_name).alias(
            OneWayAnova.COUNT_COLUMN_NAME), FN.mean(measure_column_name).alias(
                OneWayAnova.MEAN_COLUMN_NAME))

        groups_data = {}
        aggregated_data = self._data_frame.na.drop(
            subset=dimension_column_name).groupBy(dimension_column_name).agg(
                *agg_expr).collect()
        for row in aggregated_data:
            row_data = row.asDict()
            groups_data[row_data.get(dimension_column_name)] = row_data
        dimension_column = FN.col(dimension_column_name)
        measure_column = FN.col(measure_column_name)
        anova_result = AnovaResult()
        sum_of_squares_error = 0
        for group_name in groups_data.keys():
            group_mean = groups_data.get(group_name).get(
                OneWayAnova.MEAN_COLUMN_NAME)
            group_sum_of_squares = self._data_frame.filter(dimension_column == group_name) \
                .select(((measure_column - group_mean) * (measure_column - group_mean))) \
                .agg({'*': 'sum'}).collect()[0][0]
            sum_of_squares_error += group_sum_of_squares
            num_values_in_group = groups_data.get(group_name).get(
                OneWayAnova.COUNT_COLUMN_NAME)
            column_value_group = ColumnValueGroup(
                {dimension_column_name: group_name})
            descr_stats = DescriptiveStats(
                self._data_frame.filter(
                    dimension_column == group_name).select(measure_column),
                self._dataframe_helper, self._dataframe_context)
            try:
                group_descr_stats = descr_stats.stats_for_measure_column(
                    measure_column_name)
            except Exception as e:
                print(e)
                group_descr_stats = {}
            anova_column_value_group_stats = AnovaColumnValueGroupStats(
                column_value_group, group_descr_stats)
            anova_result.add_group_stats(anova_column_value_group_stats)
Example #7
    def get_bins(self, column_name, num_bins=10, split_points=None):
        """
        Finds the number of items in each bin. Only one of num_bins or
        split_points needs to be supplied.

        :param column_name: column to be binned
        :param num_bins:    number of bins to create
        :param split_points:    list of tuples [(a, b), (b, c), ...] such that
                                all values in the range [a, b) are assigned to bucket1
        :return: Histogram with one entry per bin
        """
        if column_name not in self._numeric_columns:
            raise BIException.column_does_not_exist(column_name)

        # min_value/max_value are needed below to clamp the outer split
        # points, so compute them whether or not split points were supplied
        min_max = self._data_frame.agg(
            FN.min(column_name).alias('min'),
            FN.max(column_name).alias('max')).collect()
        min_value = min_max[0]['min']
        max_value = min_max[0]['max']
        if split_points is None:
            quantile_discretizer = QuantileDiscretizer(numBuckets=10,
                                                       inputCol=column_name,
                                                       outputCol='buckets',
                                                       relativeError=0.01)
            bucketizer = quantile_discretizer.fit(self._data_frame)
            # splits have these values [-Inf, Q1, Median, Q3, Inf]
            splits = bucketizer.getSplits()
        else:
            splits = split_points
        # clamp the infinite outer edges to just outside the observed range
        splits[0] = min_value - 0.1
        splits[-1] = max_value + 0.1
        # cast column_name to double type if needed, otherwise Bucketizer does not work
        column_df = None
        if self._column_data_types.get(column_name) != DoubleType:
            column_df = self._data_frame.select(
                FN.col(column_name).cast('double').alias('values'))
        else:
            column_df = self._data_frame.select(
                FN.col(column_name).alias('values'))

        bucketizer = Bucketizer(inputCol='values', outputCol='bins')
        bucketizer.setSplits(splits)
        if min_value == max_value:
            histogram = Histogram(column_name, self._num_rows)
            bin_number = 0
            start_value = min_value - 0.5
            end_value = max_value + 0.5
            histogram.add_bin(bin_number, start_value, end_value,
                              self._num_rows)
        else:
            buckets_and_counts = bucketizer.transform(column_df).groupBy(
                'bins').agg({
                    '*': 'count'
                }).collect()
            histogram = Histogram(column_name, self._num_rows)
            for row in buckets_and_counts:
                bin_number = int(row[0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                histogram.add_bin(
                    bin_number, start_value, end_value,
                    float(row[1]) * 100.0 / (end_value - start_value))

        return histogram
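QuantileDiscretizer's outer split points are -Inf and +Inf, which would make the first and last bins infinitely wide and break the density-style bin height count * 100 / width used above; the clamping replaces them with values just outside the observed range. A standalone illustration with hypothetical numbers:

splits = [float('-inf'), 2.5, 5.0, 7.5, float('inf')]  # as from getSplits()
min_value, max_value = 1.0, 9.0                        # observed column range
splits[0] = min_value - 0.1
splits[-1] = max_value + 0.1
print(splits)  # [0.9, 2.5, 5.0, 7.5, 9.1]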
Example #8
    def test(self,
             measure_column_name,
             dimension_column_name,
             max_num_levels=200):
        if measure_column_name not in self._dataframe_helper.get_numeric_columns():
            raise BIException.non_numeric_column(measure_column_name)
        if dimension_column_name not in self._dataframe_helper.get_string_columns():
            raise BIException.non_string_column(dimension_column_name)
        num_levels = self._data_frame.na.drop(
            subset=dimension_column_name).select(
                dimension_column_name).distinct().count()
        num_rows = self._data_frame.count()
        if num_levels > max_num_levels:
            print('Dimension column(%s) has more than %d levels' %
                  (dimension_column_name, max_num_levels))
            return None
        grand_mean_expr = (FN.mean(measure_column_name).alias(
            OneWayAnova.GRAND_MEAN_COLUMN_NAME), )
        grand_mean = self._data_frame.select(
            *grand_mean_expr).collect()[0][OneWayAnova.GRAND_MEAN_COLUMN_NAME]
        agg_expr = (FN.count(measure_column_name).alias(
            OneWayAnova.COUNT_COLUMN_NAME), FN.mean(measure_column_name).alias(
                OneWayAnova.MEAN_COLUMN_NAME))

        groups_data = {}
        aggregated_data = self._data_frame.na.drop(
            subset=dimension_column_name).groupBy(dimension_column_name).agg(
                *agg_expr).collect()
        for row in aggregated_data:
            row_data = row.asDict()
            groups_data[row_data.get(dimension_column_name)] = row_data
        dimension_column = FN.col(dimension_column_name)
        measure_column = FN.col(measure_column_name)
        anova_result = AnovaResult()
        sum_of_squares_error = 0
        for group_name in list(groups_data.keys()):
            group_mean = groups_data.get(group_name).get(
                OneWayAnova.MEAN_COLUMN_NAME)
            group_sum_of_squares = self._data_frame.filter(dimension_column == group_name) \
                .select(((measure_column - group_mean) * (measure_column - group_mean))) \
                .agg({'*': 'sum'}).collect()[0][0]
            sum_of_squares_error += group_sum_of_squares
            num_values_in_group = groups_data.get(group_name).get(
                OneWayAnova.COUNT_COLUMN_NAME)
            column_value_group = ColumnValueGroup(
                {dimension_column_name: group_name})
            descr_stats = DescriptiveStats(
                self._data_frame.filter(
                    dimension_column == group_name).select(measure_column),
                self._dataframe_helper, self._dataframe_context)
            try:
                group_descr_stats = descr_stats.stats_for_measure_column(
                    measure_column_name)
            except Exception as e:
                print(e)
                group_descr_stats = {}
            anova_column_value_group_stats = AnovaColumnValueGroupStats(
                column_value_group, group_descr_stats)
            anova_result.add_group_stats(anova_column_value_group_stats)

        sum_of_squares_between = sum([
            data[OneWayAnova.COUNT_COLUMN_NAME] *
            math.pow(grand_mean - data[OneWayAnova.MEAN_COLUMN_NAME], 2)
            for data in list(groups_data.values())
        ])
        mean_sum_of_squares_between = float(sum_of_squares_between) / (
            num_levels - 1)

        mean_sum_of_squares_error = float(sum_of_squares_error) / (num_rows -
                                                                   num_levels)
        f_value = old_div(mean_sum_of_squares_between,
                          mean_sum_of_squares_error)
        p_value = Stats.f_distribution_critical_value(f_value, num_levels - 1,
                                                      num_rows - num_levels)
        anova_result.set_params(
            df1=num_levels - 1,
            df2=(num_rows - num_levels),
            sum_of_squares_between=sum_of_squares_between,
            mean_sum_of_squares_between=mean_sum_of_squares_between,
            sum_of_squares_error=sum_of_squares_error,
            mean_sum_of_squares_error=mean_sum_of_squares_error,
            f_value=f_value,
            p_value=p_value,
            total_number_of_records=num_rows)

        return anova_result
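The F statistic assembled above, SSB/(k-1) divided by SSE/(N-k), can be cross-checked against scipy's one-way ANOVA; scipy and the groups below are assumptions for illustration, not part of the original module.

from scipy.stats import f_oneway

group_a = [23.0, 25.0, 21.0, 24.0]
group_b = [30.0, 31.0, 29.0, 33.0]
group_c = [22.0, 20.0, 24.0, 23.0]
f_value, p_value = f_oneway(group_a, group_b, group_c)
print(f_value, p_value)  # same F as the manual between/within decomposition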
Example #9
    def approxQuantize(data_frame, measure_column, dataframe_helper):
        if not dataframe_helper.is_numeric_column(measure_column):
            raise BIException.non_numeric_column(measure_column)
        st = time.time()
        data_frame = data_frame.select(measure_column)
        splits = data_frame.approxQuantile(measure_column,
                                           Quantizer.QUARTILE_PERCENTAGES,
                                           Quantizer.APPROX_ERROR)
        print(splits)
        print("bucketizer", time.time() - st)
        q1 = splits[0]
        median = splits[1]
        q3 = splits[2]
        iqr = (q3 - q1)
        left_hinge = (q1 - 1.5 * iqr)
        right_hinge = (q3 + 1.5 * iqr)
        # print q1,median,q3,iqr,left_hinge,right_hinge

        mean = data_frame.select(FN.mean(measure_column)).collect()[0][0]
        column = FN.column(measure_column)
        num_left_outliers = data_frame.filter(column < left_hinge).count()
        num_right_outliers = data_frame.filter(column > right_hinge).count()

        q1_stats = data_frame.filter(column < q1).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q2_stats = data_frame.filter(column >= q1).filter(column < median).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q3_stats = data_frame.filter(column >= median).filter(column < q3).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q4_stats = data_frame.filter(column >= q3).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()

        q1_freq = q1_stats[0]['count']
        q2_freq = q2_stats[0]['count']
        q3_freq = q3_stats[0]['count']
        q4_freq = q4_stats[0]['count']

        quartile_sums = {}
        quartile_sums['q1'] = q1_stats[0]['sum'] if q1_stats[0]['sum'] is not None else 0
        quartile_sums['q2'] = q2_stats[0]['sum'] if q2_stats[0]['sum'] is not None else 0
        quartile_sums['q3'] = q3_stats[0]['sum'] if q3_stats[0]['sum'] is not None else 0
        quartile_sums['q4'] = q4_stats[0]['sum'] if q4_stats[0]['sum'] is not None else 0

        quartile_means = {}
        quartile_means['q1'] = quartile_sums['q1'] / q1_freq if q1_freq != 0 else None
        quartile_means['q2'] = quartile_sums['q2'] / q2_freq if q2_freq != 0 else None
        quartile_means['q3'] = quartile_sums['q3'] / q3_freq if q3_freq != 0 else None
        quartile_means['q4'] = quartile_sums['q4'] / q4_freq if q4_freq != 0 else None

        FPS = FivePointSummary(left_hinge_value=left_hinge,
                               q1_value=q1,
                               median=median,
                               q3_value=q3,
                               right_hinge_value=right_hinge,
                               num_left_outliers=num_left_outliers,
                               num_right_outliers=num_right_outliers,
                               q1_freq=q1_freq,
                               q2_freq=q2_freq,
                               q3_freq=q3_freq,
                               q4_freq=q4_freq)
        FPS.set_means(quartile_means)
        FPS.set_sums(quartile_sums)

        return FPS
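The hinge arithmetic is Tukey's fences: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are counted as outliers. A worked example with hypothetical quartiles:

q1, median, q3 = 4.0, 7.0, 10.0  # hypothetical quartiles
iqr = q3 - q1                    # 6.0
left_hinge = q1 - 1.5 * iqr      # -5.0
right_hinge = q3 + 1.5 * iqr     # 19.0
print(left_hinge, right_hinge)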
Example #10
    def quantize(data_frame, measure_column, dataframe_helper):
        if not dataframe_helper.is_numeric_column(measure_column):
            raise BIException.non_numeric_column(measure_column)

        quantile_discretizer = QuantileDiscretizer(
            numBuckets=4,
            inputCol=measure_column,
            outputCol=Quantizer.QUANTIZATION_OUTPUT_COLUMN,
            relativeError=Quantizer.QUANTIZATION_RELATIVE_ERROR)
        bucketizer = quantile_discretizer.fit(data_frame)
        # splits have these values [-Inf, Q1, Median, Q3, Inf]
        splits = bucketizer.getSplits()
        if len(splits) < 5:
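            # fewer than five split points means QuantileDiscretizer could not
            # form four distinct buckets, so the quartiles collapse together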
            q1 = splits[1]
            median = splits[1]
            q3 = splits[1]
            iqr = (q3 - q1)
            left_hinge = (q1 - 1.5 * iqr)
            right_hinge = (q3 + 1.5 * iqr)
        else:
            q1 = splits[1]
            median = splits[2]
            q3 = splits[3]
            iqr = (q3 - q1)
            left_hinge = (q1 - 1.5 * iqr)
            right_hinge = (q3 + 1.5 * iqr)

        mean = data_frame.select(FN.mean(measure_column)).collect()[0][0]
        column = FN.column(measure_column)
        num_left_outliers = data_frame.filter(column < left_hinge).count()
        num_right_outliers = data_frame.filter(column > right_hinge).count()
        # q1_freq = data_frame.filter(column < q1).count()
        # q2_freq = data_frame.filter(column >= q1).filter(column < median).count()
        # q3_freq = data_frame.filter(column >= median).filter(column < q3).count()
        # q4_freq = data_frame.filter(column >= q3).count()
        q1_stats = data_frame.filter(column < q1).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q2_stats = data_frame.filter(column >= q1).filter(column < median).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q3_stats = data_frame.filter(column >= median).filter(column < q3).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()
        q4_stats = data_frame.filter(column >= q3).agg(
            FN.sum(column).alias('sum'),
            FN.count(column).alias('count')).collect()

        q1_freq = q1_stats[0]['count']
        q2_freq = q2_stats[0]['count']
        q3_freq = q3_stats[0]['count']
        q4_freq = q4_stats[0]['count']

        quartile_sums = {}
        quartile_sums['q1'] = q1_stats[0]['sum']
        quartile_sums['q2'] = q2_stats[0]['sum']
        quartile_sums['q3'] = q3_stats[0]['sum']
        quartile_sums['q4'] = q4_stats[0]['sum']

        quartile_means = {}
        quartile_means['q1'] = quartile_sums['q1'] / q1_freq if q1_freq != 0 else None
        quartile_means['q2'] = quartile_sums['q2'] / q2_freq if q2_freq != 0 else None
        quartile_means['q3'] = quartile_sums['q3'] / q3_freq if q3_freq != 0 else None
        quartile_means['q4'] = quartile_sums['q4'] / q4_freq if q4_freq != 0 else None

        FPS = FivePointSummary(left_hinge_value=left_hinge,
                               q1_value=q1,
                               median=median,
                               q3_value=q3,
                               right_hinge_value=right_hinge,
                               num_left_outliers=num_left_outliers,
                               num_right_outliers=num_right_outliers,
                               q1_freq=q1_freq,
                               q2_freq=q2_freq,
                               q3_freq=q3_freq,
                               q4_freq=q4_freq)
        FPS.set_means(quartile_means)
        FPS.set_sums(quartile_sums)

        return FPS
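For comparison, numpy computes the exact quartile triple that QuantileDiscretizer approximates to within relativeError; numpy and the sample data are illustrative assumptions.

import numpy as np

data = np.array([1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0])
q1, median, q3 = np.percentile(data, [25, 50, 75])
print(q1, median, q3)  # 4.5 8.0 11.5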