def test_dimension(self, targetDimension, testDimension):
    if targetDimension not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(targetDimension)
    chisquare_result = ChiSquareResult()
    pivot_table = self._data_frame.stat.crosstab(
        "{}".format(targetDimension), testDimension)
    # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
    # Matrices.dense expects column-major values, hence the transpose via zip(*...)
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramer's V calculation (note: the textbook form divides by n * (min(r, c) - 1))
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
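# --- Illustrative sketch (not part of the original module): why the
# zip(*rows) transpose above is needed. pyspark.mllib's
# Matrices.dense(numRows, numCols, values) reads `values` in column-major
# order, so the row-oriented crosstab output must be transposed before
# flattening. A pure-Python check with hypothetical row data:
from itertools import chain

rows = [(3, 5), (7, 1)]  # two crosstab rows (label column already dropped)
col_major = list(chain(*zip(*rows)))
assert col_major == [3, 7, 5, 1]  # column 0 first, then column 1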
def stats_for_dimension_column(self, dimension_column):
    if not self._dataframe_helper.is_string_column(dimension_column):
        raise BIException.non_string_column(dimension_column)
    col_non_nulls = FN.count(dimension_column).alias('non_nulls')
    col_nulls = FN.sum(
        FN.col(dimension_column).isNull().cast('integer')).alias('nulls')
    aggregate_columns = (col_non_nulls, col_nulls)
    result = self._data_frame.select(
        *aggregate_columns).collect()[0].asDict()
    cardinality = self._data_frame.select(
        FN.col(dimension_column)).distinct().count()
    # TODO: column value frequencies
    descr_stats = DimensionDescriptiveStats(
        num_null_values=result.get('nulls'),
        num_non_null_values=result.get('non_nulls'),
        cardinality=cardinality)
    if cardinality > DescriptiveStats.MAX_NUM_LEVELS:
        return descr_stats
    freq = {}
    level_and_counts = self._data_frame.groupBy(
        dimension_column).count().sort(FN.desc('count')).collect()
    for row in level_and_counts:
        freq[row[0]] = row[1]
    descr_stats.set_value_frequencies(freq)
    return descr_stats
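# --- Illustrative sketch (not part of the original module): the same
# null/cardinality/frequency profile computed with plain pandas, useful for
# checking the Spark aggregations above on a small sample.
import pandas as pd

s = pd.Series(['a', 'b', 'a', None, 'a'])
non_nulls = int(s.count())            # 4
nulls = int(s.isnull().sum())         # 1
cardinality = s.nunique(dropna=True)  # 2
freq = s.value_counts().to_dict()     # {'a': 3, 'b': 1}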
def stats_for_measure_column(self, measure_column):
    if not self._dataframe_helper.is_numeric_column(measure_column):
        raise BIException.non_numeric_column(measure_column)
    descr_stats = MeasureDescriptiveStats()
    num_values = self._data_frame.select(measure_column).count()
    min_value = Stats.min(self._data_frame, measure_column)
    max_value = Stats.max(self._data_frame, measure_column)
    total_value = Stats.total(self._data_frame, measure_column)
    mean = Stats.mean(self._data_frame, measure_column)
    variance = Stats.variance(self._data_frame, measure_column)
    std_dev = Stats.std_dev(self._data_frame, measure_column)
    if min_value == max_value:
        # A constant column has zero variance, so skew and kurtosis are
        # undefined (0/0); report them as 0 instead of computing.
        skewness = 0
        kurtosis = 0
    else:
        skewness = Stats.skew(self._data_frame, measure_column)
        kurtosis = Stats.kurtosis(self._data_frame, measure_column)
    descr_stats.set_summary_stats(num_values=num_values,
                                  min_value=min_value,
                                  max_value=max_value,
                                  total=total_value,
                                  mean=mean,
                                  variance=variance,
                                  std_dev=std_dev,
                                  skew=skewness,
                                  kurtosis=kurtosis)
    descr_stats.set_five_point_summary_stats(
        self.five_point_summary(measure_column))
    descr_stats.set_histogram(
        Binner(self._data_frame,
               self._dataframe_helper).get_bins(measure_column))
    # descr_stats.set_raw_data([float(row[0]) for row in self._data_frame.select(measure_column).collect()])
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "statCalculationEnd",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="script")
    return descr_stats
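# --- Illustrative sketch (not part of the original module): why the
# min == max guard above is needed. For a constant column the third and
# fourth standardized moments divide by zero variance; scipy typically
# returns nan (with a runtime warning) rather than a usable value.
import numpy as np
from scipy import stats

constant = np.array([5.0, 5.0, 5.0, 5.0])
print(stats.skew(constant), stats.kurtosis(constant))  # typically nan nan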
def test_dimension(self, targetDimension, testDimension):
    if targetDimension not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(targetDimension)
    chisquare_result = ChiSquareResult()
    if self._pandas_flag:
        pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                  self._data_frame[testDimension])
        try:
            # DataFrame.as_matrix was removed in pandas >= 1.0
            data_matrix = np.array(
                pivot_table.as_matrix(columns=None)).astype(int)
        except AttributeError:
            data_matrix = np.array(pivot_table.values).astype(int)
    else:
        pivot_table = self._data_frame.stat.crosstab(
            "{}".format(targetDimension), testDimension)
        # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
        # Matrices.dense expects column-major values, hence the transpose
        rdd = list(
            chain(*list(
                zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        data_matrix = data_matrix.toArray().tolist()
    result = chi2_contingency(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramer's V calculation (note: the textbook form divides by n * (min(r, c) - 1))
    stat_value = result[0]  # chi2_contingency returns (chi2, p, dof, expected)
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
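# --- Illustrative sketch (not part of the original module):
# chi2_contingency and the conventional Cramer's V formula on a tiny 2x2
# contingency table, handy for checking the V values produced above.
import math
import numpy as np
from scipy.stats import chi2_contingency

table = np.array([[30, 10],
                  [10, 30]])
chi2, p, dof, expected = chi2_contingency(table)
n = table.sum()
t = min(table.shape)
v = math.sqrt(chi2 / (n * (t - 1)))  # conventional Cramer's V
print(chi2, p, v)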
def get_bins(self, column_name, num_bins=10, split_points=None):
    """
    Finds the number of items in each bin. Only one of num_bins or
    split_points needs to be supplied.

    :param column_name: column to be binned
    :param num_bins: number of bins to create
    :param split_points: list of split points [a, b, c, ...] such that all
        values in the range [a, b) are assigned to the first bucket
    :return:
    """
    if column_name not in self._numeric_columns:
        raise BIException.column_does_not_exist(column_name)
    if self._pandas_flag:
        min_value = self._data_frame[column_name].min()
        max_value = self._data_frame[column_name].max()
    else:
        min_max = self._data_frame.agg(
            FN.min(column_name).alias('min'),
            FN.max(column_name).alias('max')).collect()
        min_value = min_max[0]['min']
        max_value = min_max[0]['max']
    if split_points is None:
        # splits = CommonUtils.frange(min_value, max_value, num_bins)
        if self._pandas_flag:
            splits = CommonUtils.return_optimum_bins(
                self._data_frame[column_name])
        else:
            splits = CommonUtils.return_optimum_bins(
                self._data_frame.select(column_name).toPandas()
                [column_name])
        # Pad the split list so the extreme values fall inside the outer bins.
        if splits[0] > min_value:
            splits = [min_value - 1] + list(splits)
            print("Min Point Added")
        if splits[-1] < max_value:
            splits = list(splits) + [max_value + 1]
            print("Max Point Added")
    else:
        splits = split_points
    column_df = None
    if self._pandas_flag:
        binning_df = pd.DataFrame()
        binning_df[BinnerConstants.
                   ORIGINAL_COLUMN_NAME] = self._data_frame[column_name]
    else:
        # Cast column_name to double if needed; Bucketizer requires doubles.
        if self._column_data_types.get(column_name) != DoubleType:
            column_df = self._data_frame.select(
                FN.col(column_name).cast('double').alias(
                    BinnerConstants.ORIGINAL_COLUMN_NAME))
        else:
            column_df = self._data_frame.select(
                FN.col(column_name).alias(
                    BinnerConstants.ORIGINAL_COLUMN_NAME))
        bucketizer = Bucketizer(
            inputCol=BinnerConstants.ORIGINAL_COLUMN_NAME,
            outputCol=BinnerConstants.BINNED_COLUMN_NAME)
        bucketizer.setSplits(splits)
    if min_value == max_value:
        histogram = Histogram(column_name, self._num_rows)
        bin_number = 0
        start_value = int(min_value - 0.5)
        end_value = int(max_value + 0.5)
        histogram.add_bin(bin_number, start_value, end_value,
                          self._num_rows)
    else:
        if self._pandas_flag:
            binning_df[BinnerConstants.BINNED_COLUMN_NAME] = pd.cut(
                self._data_frame[column_name],
                bins=splits,
                labels=list(range(len(splits) - 1)),
                right=False,
                include_lowest=True)
            buckets_counts_df = binning_df.groupby(
                BinnerConstants.BINNED_COLUMN_NAME,
                as_index=False,
                sort=False).count()
            histogram = Histogram(column_name, self._num_rows)
            for row in buckets_counts_df.iterrows():
                bin_number = int(row[1][0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                try:
                    histogram.add_bin(bin_number, float(start_value),
                                      float(end_value), float(row[1][1]))
                except (TypeError, ValueError):
                    histogram.add_bin(bin_number, start_value, end_value,
                                      float(row[1][1]))
        else:
            buckets_and_counts = bucketizer.transform(column_df).groupBy(
                BinnerConstants.BINNED_COLUMN_NAME).agg({
                    '*': 'count'
                }).collect()
            histogram = Histogram(column_name, self._num_rows)
            for row in buckets_and_counts:
                bin_number = int(row[0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                histogram.add_bin(bin_number, start_value, end_value,
                                  row[1])
    return histogram
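# --- Illustrative sketch (not part of the original module): how pd.cut with
# right=False assigns values to [a, b) bins, and why the code above pads the
# split list so the column min and max land inside the outer bins.
import pandas as pd

values = pd.Series([1, 2, 5, 9, 10])
splits = [0, 5, 11]  # padded: 0 < min(values), 11 > max(values)
binned = pd.cut(values, bins=splits, labels=[0, 1], right=False,
                include_lowest=True)
print(binned.value_counts().sort_index())  # bin 0 -> 2 values, bin 1 -> 3 values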
def test(self, measure_column_name, dimension_column_name,
         max_num_levels=200):
    if measure_column_name not in self._dataframe_helper.get_numeric_columns():
        raise BIException.non_numeric_column(measure_column_name)
    if dimension_column_name not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(dimension_column_name)
    num_levels = self._data_frame.na.drop(
        subset=dimension_column_name).select(
            dimension_column_name).distinct().count()
    num_rows = self._data_frame.count()
    if num_levels > max_num_levels:
        print('Dimension column(%s) has more than %d levels' %
              (dimension_column_name, max_num_levels))
        return None
    grand_mean_expr = (FN.mean(measure_column_name).alias(
        OneWayAnova.GRAND_MEAN_COLUMN_NAME), )
    grand_mean = self._data_frame.select(
        *grand_mean_expr).collect()[0][OneWayAnova.GRAND_MEAN_COLUMN_NAME]
    agg_expr = (FN.count(measure_column_name).alias(
        OneWayAnova.COUNT_COLUMN_NAME),
                FN.mean(measure_column_name).alias(
                    OneWayAnova.MEAN_COLUMN_NAME))
    groups_data = {}
    aggregated_data = self._data_frame.na.drop(
        subset=dimension_column_name).groupBy(dimension_column_name).agg(
            *agg_expr).collect()
    for row in aggregated_data:
        row_data = row.asDict()
        groups_data[row_data.get(dimension_column_name)] = row_data
    dimension_column = FN.col(dimension_column_name)
    measure_column = FN.col(measure_column_name)
    anova_result = AnovaResult()
    sum_of_squares_error = 0
    for group_name in groups_data:
        group_mean = groups_data.get(group_name).get(
            OneWayAnova.MEAN_COLUMN_NAME)
        group_sum_of_squares = self._data_frame.filter(dimension_column == group_name) \
            .select(((measure_column - group_mean) * (measure_column - group_mean))) \
            .agg({'*': 'sum'}).collect()[0][0]
        sum_of_squares_error += group_sum_of_squares
        num_values_in_group = groups_data.get(group_name).get(
            OneWayAnova.COUNT_COLUMN_NAME)
        column_value_group = ColumnValueGroup(
            {dimension_column_name: group_name})
        descr_stats = DescriptiveStats(
            self._data_frame.filter(
                dimension_column == group_name).select(measure_column),
            self._dataframe_helper, self._dataframe_context)
        try:
            group_descr_stats = descr_stats.stats_for_measure_column(
                measure_column_name)
        except Exception as e:
            print(e)
            group_descr_stats = {}
        anova_column_value_group_stats = AnovaColumnValueGroupStats(
            column_value_group, group_descr_stats)
        anova_result.add_group_stats(anova_column_value_group_stats)
    sum_of_squares_between = sum([
        data[OneWayAnova.COUNT_COLUMN_NAME] *
        math.pow(grand_mean - data[OneWayAnova.MEAN_COLUMN_NAME], 2)
        for data in groups_data.values()
    ])
    mean_sum_of_squares_between = float(sum_of_squares_between) / (
        num_levels - 1)
    mean_sum_of_squares_error = float(sum_of_squares_error) / (num_rows -
                                                               num_levels)
    f_value = mean_sum_of_squares_between / mean_sum_of_squares_error
    p_value = Stats.f_distribution_critical_value(f_value, num_levels - 1,
                                                  num_rows - num_levels)
    anova_result.set_params(
        df1=num_levels - 1,
        df2=(num_rows - num_levels),
        sum_of_squares_between=sum_of_squares_between,
        mean_sum_of_squares_between=mean_sum_of_squares_between,
        sum_of_squares_error=sum_of_squares_error,
        mean_sum_of_squares_error=mean_sum_of_squares_error,
        f_value=f_value,
        p_value=p_value,
        total_number_of_records=num_rows)
    return anova_result
def get_bins(self, column_name, num_bins=10, split_points=None):
    """
    Finds the number of items in each bin. Only one of num_bins or
    split_points needs to be supplied.

    :param column_name: column to be binned
    :param num_bins: number of bins to create
    :param split_points: list of split points [a, b, c, ...] such that all
        values in the range [a, b) are assigned to the first bucket
    :return:
    """
    if column_name not in self._numeric_columns:
        raise BIException.column_does_not_exist(column_name)
    min_max = self._data_frame.agg(
        FN.min(column_name).alias('min'),
        FN.max(column_name).alias('max')).collect()
    min_value = min_max[0]['min']
    max_value = min_max[0]['max']
    if split_points is None:
        quantile_discretizer = QuantileDiscretizer(numBuckets=num_bins,
                                                   inputCol=column_name,
                                                   outputCol='buckets',
                                                   relativeError=0.01)
        bucketizer = quantile_discretizer.fit(self._data_frame)
        # splits look like [-Inf, q_1, ..., q_{k-1}, Inf]
        splits = bucketizer.getSplits()
    else:
        splits = split_points
    # Replace the infinite end points with values just outside the data range.
    splits[0] = min_value - 0.1
    splits[-1] = max_value + 0.1
    # Cast column_name to double if needed; Bucketizer requires doubles.
    column_df = None
    if self._column_data_types.get(column_name) != DoubleType:
        column_df = self._data_frame.select(
            FN.col(column_name).cast('double').alias('values'))
    else:
        column_df = self._data_frame.select(
            FN.col(column_name).alias('values'))
    bucketizer = Bucketizer(inputCol='values', outputCol='bins')
    bucketizer.setSplits(splits)
    if min_value == max_value:
        histogram = Histogram(column_name, self._num_rows)
        bin_number = 0
        start_value = min_value - 0.5
        end_value = max_value + 0.5
        histogram.add_bin(bin_number, start_value, end_value,
                          self._num_rows)
    else:
        buckets_and_counts = bucketizer.transform(column_df).groupBy(
            'bins').agg({
                '*': 'count'
            }).collect()
        histogram = Histogram(column_name, self._num_rows)
        for row in buckets_and_counts:
            bin_number = int(row[0])
            start_value = splits[bin_number]
            end_value = splits[bin_number + 1]
            # Normalize the count by bin width so unequal quantile bins are
            # comparable (a density, scaled by 100).
            histogram.add_bin(
                bin_number, start_value, end_value,
                float(row[1]) * 100.0 / (end_value - start_value))
    return histogram
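# --- Illustrative sketch (not part of the original module): quantile-based
# splits and the per-bin density normalization used above, reproduced with
# plain numpy on synthetic data.
import numpy as np

data = np.random.default_rng(0).exponential(scale=2.0, size=1000)
qs = np.quantile(data, [0.25, 0.5, 0.75])
splits = [data.min() - 0.1] + list(qs) + [data.max() + 0.1]
counts, edges = np.histogram(data, bins=splits)
densities = [100.0 * c / (edges[i + 1] - edges[i])
             for i, c in enumerate(counts)]
print(list(zip(counts.tolist(), densities)))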
def test(self, measure_column_name, dimension_column_name,
         max_num_levels=200):
    if measure_column_name not in self._dataframe_helper.get_numeric_columns():
        raise BIException.non_numeric_column(measure_column_name)
    if dimension_column_name not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(dimension_column_name)
    num_levels = self._data_frame.na.drop(
        subset=dimension_column_name).select(
            dimension_column_name).distinct().count()
    num_rows = self._data_frame.count()
    if num_levels > max_num_levels:
        print('Dimension column(%s) has more than %d levels' %
              (dimension_column_name, max_num_levels))
        return None
    grand_mean_expr = (FN.mean(measure_column_name).alias(
        OneWayAnova.GRAND_MEAN_COLUMN_NAME), )
    grand_mean = self._data_frame.select(
        *grand_mean_expr).collect()[0][OneWayAnova.GRAND_MEAN_COLUMN_NAME]
    agg_expr = (FN.count(measure_column_name).alias(
        OneWayAnova.COUNT_COLUMN_NAME),
                FN.mean(measure_column_name).alias(
                    OneWayAnova.MEAN_COLUMN_NAME))
    groups_data = {}
    aggregated_data = self._data_frame.na.drop(
        subset=dimension_column_name).groupBy(dimension_column_name).agg(
            *agg_expr).collect()
    for row in aggregated_data:
        row_data = row.asDict()
        groups_data[row_data.get(dimension_column_name)] = row_data
    dimension_column = FN.col(dimension_column_name)
    measure_column = FN.col(measure_column_name)
    anova_result = AnovaResult()
    sum_of_squares_error = 0
    for group_name in groups_data:
        group_mean = groups_data.get(group_name).get(
            OneWayAnova.MEAN_COLUMN_NAME)
        group_sum_of_squares = self._data_frame.filter(dimension_column == group_name) \
            .select(((measure_column - group_mean) * (measure_column - group_mean))) \
            .agg({'*': 'sum'}).collect()[0][0]
        sum_of_squares_error += group_sum_of_squares
        num_values_in_group = groups_data.get(group_name).get(
            OneWayAnova.COUNT_COLUMN_NAME)
        column_value_group = ColumnValueGroup(
            {dimension_column_name: group_name})
        descr_stats = DescriptiveStats(
            self._data_frame.filter(
                dimension_column == group_name).select(measure_column),
            self._dataframe_helper, self._dataframe_context)
        try:
            group_descr_stats = descr_stats.stats_for_measure_column(
                measure_column_name)
        except Exception as e:
            print(e)
            group_descr_stats = {}
        anova_column_value_group_stats = AnovaColumnValueGroupStats(
            column_value_group, group_descr_stats)
        anova_result.add_group_stats(anova_column_value_group_stats)
    sum_of_squares_between = sum([
        data[OneWayAnova.COUNT_COLUMN_NAME] *
        math.pow(grand_mean - data[OneWayAnova.MEAN_COLUMN_NAME], 2)
        for data in groups_data.values()
    ])
    mean_sum_of_squares_between = float(sum_of_squares_between) / (
        num_levels - 1)
    mean_sum_of_squares_error = float(sum_of_squares_error) / (num_rows -
                                                               num_levels)
    f_value = mean_sum_of_squares_between / mean_sum_of_squares_error
    p_value = Stats.f_distribution_critical_value(f_value, num_levels - 1,
                                                  num_rows - num_levels)
    anova_result.set_params(
        df1=num_levels - 1,
        df2=(num_rows - num_levels),
        sum_of_squares_between=sum_of_squares_between,
        mean_sum_of_squares_between=mean_sum_of_squares_between,
        sum_of_squares_error=sum_of_squares_error,
        mean_sum_of_squares_error=mean_sum_of_squares_error,
        f_value=f_value,
        p_value=p_value,
        total_number_of_records=num_rows)
    return anova_result
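# --- Illustrative sketch (not part of the original module): cross-checking
# the manual between/within sum-of-squares decomposition above with
# scipy.stats.f_oneway on small in-memory groups.
from scipy.stats import f_oneway

group_a = [10.0, 12.0, 11.0, 13.0]
group_b = [20.0, 21.0, 19.0, 22.0]
group_c = [15.0, 14.0, 16.0, 15.5]
f_value, p_value = f_oneway(group_a, group_b, group_c)
print(f_value, p_value)  # F = MS_between / MS_within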
def approxQuantize(data_frame, measure_column, dataframe_helper):
    if not dataframe_helper.is_numeric_column(measure_column):
        raise BIException.non_numeric_column(measure_column)
    st = time.time()
    data_frame = data_frame.select(measure_column)
    splits = data_frame.approxQuantile(measure_column,
                                       Quantizer.QUARTILE_PERCENTAGES,
                                       Quantizer.APPROX_ERROR)
    print(splits)
    print("bucketizer", time.time() - st)
    q1 = splits[0]
    median = splits[1]
    q3 = splits[2]
    iqr = (q3 - q1)
    # Tukey fences: points beyond 1.5 * IQR from the quartiles are outliers.
    left_hinge = (q1 - 1.5 * iqr)
    right_hinge = (q3 + 1.5 * iqr)
    # print(q1, median, q3, iqr, left_hinge, right_hinge)
    mean = data_frame.select(FN.mean(measure_column)).collect()[0][0]
    column = FN.column(measure_column)
    num_left_outliers = data_frame.filter(column < left_hinge).count()
    num_right_outliers = data_frame.filter(column > right_hinge).count()
    q1_stats = data_frame.filter(column < q1).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q2_stats = data_frame.filter(column >= q1).filter(column < median).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q3_stats = data_frame.filter(column >= median).filter(column < q3).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q4_stats = data_frame.filter(column >= q3).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q1_freq = q1_stats[0]['count']
    q2_freq = q2_stats[0]['count']
    q3_freq = q3_stats[0]['count']
    q4_freq = q4_stats[0]['count']
    # An empty quartile yields sum = None; each guard checks its own stats.
    quartile_sums = {}
    quartile_sums['q1'] = q1_stats[0]['sum'] if q1_stats[0]['sum'] is not None else 0
    quartile_sums['q2'] = q2_stats[0]['sum'] if q2_stats[0]['sum'] is not None else 0
    quartile_sums['q3'] = q3_stats[0]['sum'] if q3_stats[0]['sum'] is not None else 0
    quartile_sums['q4'] = q4_stats[0]['sum'] if q4_stats[0]['sum'] is not None else 0
    quartile_means = {}
    quartile_means['q1'] = quartile_sums['q1'] / q1_freq if q1_freq != 0 else None
    quartile_means['q2'] = quartile_sums['q2'] / q2_freq if q2_freq != 0 else None
    quartile_means['q3'] = quartile_sums['q3'] / q3_freq if q3_freq != 0 else None
    quartile_means['q4'] = quartile_sums['q4'] / q4_freq if q4_freq != 0 else None
    FPS = FivePointSummary(left_hinge_value=left_hinge,
                           q1_value=q1,
                           median=median,
                           q3_value=q3,
                           right_hinge_value=right_hinge,
                           num_left_outliers=num_left_outliers,
                           num_right_outliers=num_right_outliers,
                           q1_freq=q1_freq,
                           q2_freq=q2_freq,
                           q3_freq=q3_freq,
                           q4_freq=q4_freq)
    FPS.set_means(quartile_means)
    FPS.set_sums(quartile_sums)
    return FPS
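# --- Illustrative sketch (not part of the original module): the same
# five-point summary computed exactly with numpy, handy for sanity-checking
# the approximate Spark quantiles (approxQuantile trades accuracy for speed
# via its relativeError argument).
import numpy as np

data = np.array([1.0, 2.0, 2.5, 3.0, 4.0, 4.5, 5.0, 50.0])
q1, median, q3 = np.percentile(data, [25, 50, 75])
iqr = q3 - q1
left_hinge, right_hinge = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print((data < left_hinge).sum(), (data > right_hinge).sum())  # outlier counts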
def quantize(data_frame, measure_column, dataframe_helper):
    if not dataframe_helper.is_numeric_column(measure_column):
        raise BIException.non_numeric_column(measure_column)
    quantile_discretizer = QuantileDiscretizer(
        numBuckets=4,
        inputCol=measure_column,
        outputCol=Quantizer.QUANTIZATION_OUTPUT_COLUMN,
        relativeError=Quantizer.QUANTIZATION_RELATIVE_ERROR)
    bucketizer = quantile_discretizer.fit(data_frame)
    # splits have these values: [-Inf, Q1, Median, Q3, Inf]
    splits = bucketizer.getSplits()
    if len(splits) < 5:
        # Degenerate case: duplicate quantiles were collapsed, so treat the
        # single interior split as Q1, the median, and Q3.
        q1 = splits[1]
        median = splits[1]
        q3 = splits[1]
    else:
        q1 = splits[1]
        median = splits[2]
        q3 = splits[3]
    iqr = (q3 - q1)
    left_hinge = (q1 - 1.5 * iqr)
    right_hinge = (q3 + 1.5 * iqr)
    mean = data_frame.select(FN.mean(measure_column)).collect()[0][0]
    column = FN.column(measure_column)
    num_left_outliers = data_frame.filter(column < left_hinge).count()
    num_right_outliers = data_frame.filter(column > right_hinge).count()
    # q1_freq = data_frame.filter(column < q1).count()
    # q2_freq = data_frame.filter(column >= q1).filter(column < median).count()
    # q3_freq = data_frame.filter(column >= median).filter(column < q3).count()
    # q4_freq = data_frame.filter(column >= q3).count()
    q1_stats = data_frame.filter(column < q1).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q2_stats = data_frame.filter(column >= q1).filter(column < median).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q3_stats = data_frame.filter(column >= median).filter(column < q3).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q4_stats = data_frame.filter(column >= q3).agg(
        FN.sum(column).alias('sum'),
        FN.count(column).alias('count')).collect()
    q1_freq = q1_stats[0]['count']
    q2_freq = q2_stats[0]['count']
    q3_freq = q3_stats[0]['count']
    q4_freq = q4_stats[0]['count']
    quartile_sums = {}
    quartile_sums['q1'] = q1_stats[0]['sum']
    quartile_sums['q2'] = q2_stats[0]['sum']
    quartile_sums['q3'] = q3_stats[0]['sum']
    quartile_sums['q4'] = q4_stats[0]['sum']
    quartile_means = {}
    # Guard against empty quartiles (mirrors approxQuantize above).
    quartile_means['q1'] = quartile_sums['q1'] / q1_freq if q1_freq != 0 else None
    quartile_means['q2'] = quartile_sums['q2'] / q2_freq if q2_freq != 0 else None
    quartile_means['q3'] = quartile_sums['q3'] / q3_freq if q3_freq != 0 else None
    quartile_means['q4'] = quartile_sums['q4'] / q4_freq if q4_freq != 0 else None
    FPS = FivePointSummary(left_hinge_value=left_hinge,
                           q1_value=q1,
                           median=median,
                           q3_value=q3,
                           right_hinge_value=right_hinge,
                           num_left_outliers=num_left_outliers,
                           num_right_outliers=num_right_outliers,
                           q1_freq=q1_freq,
                           q2_freq=q2_freq,
                           q3_freq=q3_freq,
                           q4_freq=q4_freq)
    FPS.set_means(quartile_means)
    FPS.set_sums(quartile_sums)
    return FPS
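# --- Illustrative sketch (not part of the original module): why getSplits()
# can return fewer than 5 values. QuantileDiscretizer collapses duplicate
# quantiles, so heavily skewed or low-cardinality data can shrink the split
# list, which is what the len(splits) < 5 branch above handles.
import numpy as np

data = np.array([1.0] * 90 + [2.0] * 10)
quartiles = np.quantile(data, [0.25, 0.5, 0.75])  # all equal to 1.0
splits = [-np.inf] + sorted(set(quartiles.tolist())) + [np.inf]
print(splits)  # [-inf, 1.0, inf] -> fewer than 5 splits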