def get_binned_stat(self, df, colname, col_stat, n_split=10):
    """Bin a numeric Spark column into ``n_split`` buckets and count rows per bin.

    :param df: Spark DataFrame containing the column.
    :param colname: name of the numeric column to bin.
    :param col_stat: dict with precomputed "min" and "max" for the column.
    :param n_split: number of bins; nulls are collected into an extra "null" bin.
    :return: list of {"name": <range label>, "value": <count>} ordered by bin index.
    """
    # Sorted equal-width split points spanning [min, max].
    edges = sorted(CommonUtils.frange(col_stat["min"], col_stat["max"], num_steps=n_split))
    ranges = list(zip(edges[:-1], edges[1:]))
    # Bucketizer requires a DoubleType input column.
    numeric_df = df.withColumn(colname, df[colname].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
    bucketizer.setSplits(edges)
    counts_pdf = bucketizer.transform(numeric_df).groupBy("BINNED_INDEX").count().toPandas()
    # Human-readable label per bin index; index n_split is reserved for nulls.
    labels = {idx: " to ".join([str(lo), str(hi)]) for idx, (lo, hi) in enumerate(ranges)}
    labels[n_split] = "null"
    counts_pdf["orderIndex"] = counts_pdf["BINNED_INDEX"].apply(
        lambda v: n_split if pd.isnull(v) else v)
    counts_pdf["bins"] = counts_pdf["orderIndex"].apply(lambda v: labels[int(v)])
    rows = counts_pdf[["bins", "count", "orderIndex"]].T.to_dict().values()
    ordered = sorted(rows, key=lambda r: r["orderIndex"])
    return [{"name": r["bins"], "value": r["count"]} for r in ordered]
def generateGroupedMeasureDataDict(self, measure_column):
    """Bucketize a measure column into 4 splits and return the binned frame.

    :param measure_column: name of the numeric column to bucketize.
    :return: {"bins": {bin_index: (low, high)}, "data": <binned DataFrame>}.
    """
    splits_data = self.get_measure_column_splits(self._data_frame, measure_column, 4)
    # Bucketizer requires a DoubleType input column.
    casted_df = self._data_frame.withColumn(
        measure_column, self._data_frame[measure_column].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=measure_column, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits_data["splits"])
    binned_df = bucketizer.transform(casted_df)
    # Pair each bin index actually observed in the data with a value range.
    observed = [int(row[0])
                for row in binned_df.select("BINNED_INDEX").distinct().collect()]
    return {"bins": dict(zip(observed, splits_data["splits_range"])),
            "data": binned_df}
def bucketize(self, splits, target_col):
    """Bucket ``target_col`` into the ranges given by ``splits``.

    Sorts ``splits`` in place, casts the target column to double when
    needed (Bucketizer only accepts DoubleType), and replaces
    ``self._data_frame`` with the bucketized frame.

    :param splits: list of split points; sorted in place.
    :param target_col: name of the column to bucketize.
    :return: name of the new bucket column ('bucket_' + target_col).
    """
    self._bucket_name = 'bucket_' + target_col
    bucketizer = Bucketizer(inputCol=target_col, outputCol=self._bucket_name)
    splits.sort()
    bucketizer.setSplits(splits)
    column_data_types = {
        field.name: field.dataType
        for field in self._data_frame.schema.fields
    }
    # BUG FIX: the original compared a DataType *instance* against the
    # DoubleType *class* (`!= DoubleType`), which is always True, so the
    # cast ran redundantly on every call. Use isinstance instead.
    if not isinstance(column_data_types[target_col], DoubleType):
        self._data_frame = self._data_frame.select(*[
            col(target_col).cast('double').alias(target_col)
            if column == target_col else column
            for column in self._data_frame.columns
        ])
    self._data_frame = bucketizer.transform(self._data_frame)
    return self._bucket_name
def bin_columns(self, colsToBin):
    """Replace each numeric column in ``colsToBin`` with labelled bins, in place.

    For every column, 10 equal-width splits between the column min and max
    are computed via ``CommonUtils.get_splits`` and each value is rewritten
    as its bin label. Uses ``pd.cut`` on a pandas frame (``self._pandas_flag``)
    and Bucketizer plus a bin-label mapping on a Spark frame. Binning is
    best-effort: any failure leaves the column untouched and prints a message.

    :param colsToBin: iterable of numeric column names to bin in place.
    """
    for bincol in colsToBin:
        if self._pandas_flag:
            try:
                minval = float(min(self._data_frame[bincol]))
                maxval = float(max(self._data_frame[bincol]))
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                # pd.cut maps each value straight to its bin label.
                self._data_frame[bincol] = pd.cut(
                    self._data_frame[bincol],
                    bins=splitsData["splits"],
                    labels=list(splitsData['bin_mapping'].values()),
                    right=True,
                    include_lowest=True)
            except Exception as e:
                print("Binning failed for : ", bincol)
        else:
            try:
                # BUG FIX: the original selected [max, min] but unpacked the
                # row into (minval, maxval), swapping the two values and
                # feeding get_splits a reversed range. Select min first so
                # the unpack order matches the names.
                minval, maxval = self._data_frame.select([
                    FN.min(bincol).alias("min"),
                    FN.max(bincol).alias("max")
                ]).collect()[0]
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                splits = splitsData["splits"]
                # Bucketizer requires a DoubleType input column.
                self._data_frame = self._data_frame.withColumn(
                    bincol, self._data_frame[bincol].cast(DoubleType()))
                bucketizer = Bucketizer(inputCol=bincol,
                                        outputCol="BINNED_INDEX")
                bucketizer.setSplits(splits)
                self._data_frame = bucketizer.transform(self._data_frame)
                # Map bin index -> human-readable bin label.
                mapping_expr = create_map([
                    lit(x)
                    for x in chain(*list(splitsData["bin_mapping"].items()))
                ])
                self._data_frame = self._data_frame.withColumn(
                    bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                # Re-select the original columns to drop BINNED_INDEX.
                self._data_frame = self._data_frame.select(self.columns)
            except Exception as e:
                print("Binning failed for : ", bincol)
def bin_columns(self, colsToBin):
    """Replace each numeric Spark column in ``colsToBin`` with bin labels.

    Computes 10 equal-width splits between each column's min and max,
    bucketizes the column, and rewrites it with the matching label from
    ``CommonUtils.get_splits``'s ``bin_mapping``.

    :param colsToBin: iterable of numeric column names to bin in place.
    """
    for bincol in colsToBin:
        # BUG FIX: the original selected [max, min] but unpacked the row
        # into (minval, maxval), swapping the two values; select min first
        # so the unpack order matches the names.
        minval, maxval = self._data_frame.select(
            [FN.min(bincol).alias("min"),
             FN.max(bincol).alias("max")]).collect()[0]
        n_split = 10
        splitsData = CommonUtils.get_splits(minval, maxval, n_split)
        splits = splitsData["splits"]
        # Bucketizer requires a DoubleType input column.
        self._data_frame = self._data_frame.withColumn(
            bincol, self._data_frame[bincol].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=bincol, outputCol="BINNED_INDEX")
        bucketizer.setSplits(splits)
        self._data_frame = bucketizer.transform(self._data_frame)
        # Map bin index -> human-readable bin label.
        mapping_expr = create_map(
            [lit(x) for x in chain(*splitsData["bin_mapping"].items())])
        # Removed leftover debug call
        # withColumnRenamed("bincol", bincol + "JJJLLLLKJJ"): renaming a
        # non-existent column is a no-op in Spark.
        self._data_frame = self._data_frame.withColumn(
            bincol, mapping_expr.getItem(col("BINNED_INDEX")))
        # Re-select the original columns to drop BINNED_INDEX.
        self._data_frame = self._data_frame.select(self.columns)
def bucketize(self, splits, target_col):
    """Bucket ``target_col`` into the ranges given by ``splits`` (Spark path only).

    Sorts ``splits`` in place, casts the target column to double when needed
    (Bucketizer only accepts DoubleType), and replaces ``self._data_frame``
    with the bucketized frame. The pandas path is intentionally a no-op
    (method unused there, per the original TODO).

    :param splits: list of split points; sorted in place.
    :param target_col: name of the column to bucketize.
    :return: name of the new bucket column ('bucket_' + target_col).
    """
    self._bucket_name = 'bucket_' + target_col
    if self._pandas_flag:
        '''
        TO DO: this method is not being used anywhere
        '''
        pass
    else:
        bucketizer = Bucketizer(inputCol=target_col,
                                outputCol=self._bucket_name)
        splits.sort()
        bucketizer.setSplits(splits)
        column_data_types = {
            field.name: field.dataType
            for field in self._data_frame.schema.fields
        }
        # BUG FIX: comparing a DataType *instance* against the DoubleType
        # *class* (`!= DoubleType`) is always True, forcing a redundant
        # cast on every call; use isinstance instead.
        if not isinstance(column_data_types[target_col], DoubleType):
            self._data_frame = self._data_frame.select(*[
                col(target_col).cast('double').alias(target_col)
                if column == target_col else column
                for column in self._data_frame.columns
            ])
        self._data_frame = bucketizer.transform(self._data_frame)
    return self._bucket_name
def get_bins(self, column_name, num_bins=10, split_points=None):
    """
    Finds number of items in each bin. Only one of the params num_bins or
    split_points need to be supplied.

    :param column_name: column to be binned
    :param num_bins: number of bins to create
    :param split_points: list of split edges such that all values in
        [splits[i], splits[i+1]) are assigned to bin i
    :return: Histogram with one bin per split interval
    :raises: BIException if column_name is not a known numeric column
    """
    if column_name not in self._numeric_columns:
        raise BIException.column_does_not_exist(column_name)
    # BUG FIX: min/max were only computed when split_points was None, but
    # the `min_value == max_value` check below runs unconditionally and
    # raised NameError whenever split_points was supplied. Compute them
    # up front for both paths.
    if self._pandas_flag:
        min_value = self._data_frame[column_name].min()
        max_value = self._data_frame[column_name].max()
    else:
        min_max = self._data_frame.agg(
            FN.min(column_name).alias('min'),
            FN.max(column_name).alias('max')).collect()
        min_value = min_max[0]['min']
        max_value = min_max[0]['max']
    if split_points is None:
        if self._pandas_flag:
            splits = CommonUtils.return_optimum_bins(
                self._data_frame[column_name])
        else:
            splits = CommonUtils.return_optimum_bins(
                self._data_frame.select(column_name).toPandas()[column_name])
        # Widen the outermost edges so the true min/max fall inside a bin.
        if splits[0] > min_value:
            splits = [min_value - 1] + list(splits)
            print("Min Point Added")
        if splits[-1] < max_value:
            splits = list(splits) + [max_value + 1]
            print("Max Point Added")
    else:
        splits = split_points
    # cast column_name to double type if needed, otherwise Bucketizer
    # does not work
    column_df = None
    if self._pandas_flag:
        binning_df = pd.DataFrame()
        binning_df[BinnerConstants.ORIGINAL_COLUMN_NAME] = \
            self._data_frame[column_name]
    else:
        # BUG FIX: `!= DoubleType` compared an instance to the class
        # (always True); use isinstance. The redundant cast was harmless
        # but misleading.
        if not isinstance(self._column_data_types.get(column_name),
                          DoubleType):
            column_df = self._data_frame.select(
                FN.col(column_name).cast('double').alias(
                    BinnerConstants.ORIGINAL_COLUMN_NAME))
        else:
            column_df = self._data_frame.select(
                FN.col(column_name).alias(
                    BinnerConstants.ORIGINAL_COLUMN_NAME))
        bucketizer = Bucketizer(
            inputCol=BinnerConstants.ORIGINAL_COLUMN_NAME,
            outputCol=BinnerConstants.BINNED_COLUMN_NAME)
        bucketizer.setSplits(splits)
    if min_value == max_value:
        # Degenerate column: one synthetic bin holds every row.
        histogram = Histogram(column_name, self._num_rows)
        histogram.add_bin(0, int(min_value - 0.5), int(max_value + 0.5),
                          self._num_rows)
    else:
        histogram = Histogram(column_name, self._num_rows)
        if self._pandas_flag:
            binning_df[BinnerConstants.BINNED_COLUMN_NAME] = pd.cut(
                self._data_frame[column_name],
                bins=splits,
                labels=list(range(len(splits) - 1)),
                right=False,
                include_lowest=True)
            buckets_counts_df = binning_df.groupby(
                BinnerConstants.BINNED_COLUMN_NAME,
                as_index=False,
                sort=False).count()
            for row in buckets_counts_df.iterrows():
                bin_number = int(row[1][0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                try:
                    histogram.add_bin(bin_number, float(start_value),
                                      float(end_value), float(row[1][1]))
                except Exception:
                    # Edges that refuse the float() cast fall back to the
                    # raw split values (narrowed from a bare `except:`).
                    histogram.add_bin(bin_number, start_value, end_value,
                                      float(row[1][1]))
        else:
            buckets_and_counts = bucketizer.transform(column_df).groupBy(
                BinnerConstants.BINNED_COLUMN_NAME).agg({
                    '*': 'count'
                }).collect()
            for row in buckets_and_counts:
                bin_number = int(row[0])
                histogram.add_bin(bin_number, splits[bin_number],
                                  splits[bin_number + 1], row[1])
    return histogram
def get_bins(self, column_name, num_bins=10, split_points=None):
    """
    Finds number of items in each bin. Only one of the params num_bins or
    split_points need to be supplied.

    :param column_name: column to be binned
    :param num_bins: number of quantile buckets to create when
        split_points is not given
    :param split_points: explicit list of split edges
    :return: Histogram whose bin values are percentage-of-rows divided by
        bin width (a density, not a raw count)
    :raises: BIException if column_name is not a known numeric column
    """
    if column_name not in self._numeric_columns:
        raise BIException.column_does_not_exist(column_name)
    # BUG FIX: min/max were only computed when split_points was None, yet
    # `splits[0] = min_value - 0.1` and the `min_value == max_value` check
    # ran unconditionally, raising NameError whenever split_points was
    # supplied. Compute them up front.
    min_max = self._data_frame.agg(
        FN.min(column_name).alias('min'),
        FN.max(column_name).alias('max')).collect()
    min_value = min_max[0]['min']
    max_value = min_max[0]['max']
    if split_points is None:
        # BUG FIX: num_bins was silently ignored (numBuckets hard-coded
        # to 10); honor it — the default of 10 keeps old behavior.
        quantile_discretizer = QuantileDiscretizer(numBuckets=num_bins,
                                                   inputCol=column_name,
                                                   outputCol='buckets',
                                                   relativeError=0.01)
        discretizer_model = quantile_discretizer.fit(self._data_frame)
        # splits have these values [-Inf, Q1, Median, Q3, Inf]; replace
        # the infinite endpoints with finite edges just beyond the data
        # range. Applied only here so caller-supplied split_points are
        # never mutated.
        splits = discretizer_model.getSplits()
        splits[0] = min_value - 0.1
        splits[-1] = max_value + 0.1
    else:
        splits = split_points
    # cast column_name to double type if needed, otherwise Bucketizer
    # does not work. BUG FIX: `!= DoubleType` compared an instance to the
    # class (always True); use isinstance.
    if not isinstance(self._column_data_types.get(column_name), DoubleType):
        column_df = self._data_frame.select(
            FN.col(column_name).cast('double').alias('values'))
    else:
        column_df = self._data_frame.select(
            FN.col(column_name).alias('values'))
    bucketizer = Bucketizer(inputCol='values', outputCol='bins')
    bucketizer.setSplits(splits)
    if min_value == max_value:
        # Degenerate column: one synthetic bin holds every row.
        histogram = Histogram(column_name, self._num_rows)
        histogram.add_bin(0, min_value - 0.5, max_value + 0.5,
                          self._num_rows)
    else:
        buckets_and_counts = bucketizer.transform(column_df).groupBy(
            'bins').agg({
                '*': 'count'
            }).collect()
        histogram = Histogram(column_name, self._num_rows)
        for row in buckets_and_counts:
            bin_number = int(row[0])
            start_value = splits[bin_number]
            end_value = splits[bin_number + 1]
            # Density: percent of rows scaled by bin width.
            histogram.add_bin(
                bin_number, start_value, end_value,
                float(row[1]) * 100.0 / (end_value - start_value))
    return histogram