Ejemplo n.º 1
0
    def get_binned_stat(self, df, colname, col_stat, n_split=10):
        """Bin a numeric Spark column into ``n_split`` ranges and count rows per bin.

        :param df: Spark DataFrame containing the column
        :param colname: name of the numeric column to bin
        :param col_stat: dict with precomputed "min" and "max" for the column
        :param n_split: number of bins to create
        :return: list of ``{"name": "<lo> to <hi>", "value": <count>}`` dicts,
                 ordered by bin position; rows with a null bin index are
                 reported under the name "null"
        """
        # Evenly spaced split points spanning the column's observed range.
        edges = sorted(CommonUtils.frange(col_stat["min"],
                                          col_stat["max"],
                                          num_steps=n_split))
        ranges = list(zip(edges, edges[1:]))

        # Bucketizer requires a double-typed input column.
        casted = df.withColumn(colname, df[colname].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
        bucketizer.setSplits(edges)
        counts_pdf = bucketizer.transform(casted) \
                               .groupBy("BINNED_INDEX").count().toPandas()

        # Human-readable label per bin index; null indexes get "null".
        labels = {idx: " to ".join([str(lo), str(hi)])
                  for idx, (lo, hi) in enumerate(ranges)}
        labels[n_split] = "null"
        counts_pdf["orderIndex"] = counts_pdf["BINNED_INDEX"].apply(
            lambda v: n_split if pd.isnull(v) else v)
        counts_pdf["bins"] = counts_pdf["orderIndex"].apply(
            lambda v: labels[int(v)])

        rows = sorted(
            counts_pdf[["bins", "count", "orderIndex"]].T.to_dict().values(),
            key=lambda r: r["orderIndex"])
        return [{"name": r["bins"], "value": r["count"]} for r in rows]
 def generateGroupedMeasureDataDict(self, measure_column):
     """Bucketize ``measure_column`` into 4 bins and return bin ranges plus data.

     :param measure_column: name of the numeric column to bucketize
     :return: ``{"bins": {bin_index: (lo, hi), ...}, "data": <binned DataFrame>}``
     """
     split_info = self.get_measure_column_splits(self._data_frame,
                                                 measure_column, 4)
     # Bucketizer only accepts a double-typed input column.
     casted_df = self._data_frame.withColumn(
         measure_column,
         self._data_frame[measure_column].cast(DoubleType()))
     bucketizer = Bucketizer(inputCol=measure_column,
                             outputCol="BINNED_INDEX")
     bucketizer.setSplits(split_info["splits"])
     binned_df = bucketizer.transform(casted_df)
     # Map each bin index that actually occurs in the data to its range.
     observed_bins = [
         int(row[0])
         for row in binned_df.select("BINNED_INDEX").distinct().collect()
     ]
     index_to_range = dict(zip(observed_bins, split_info["splits_range"]))
     return {"bins": index_to_range, "data": binned_df}
Ejemplo n.º 3
0
 def bucketize(self, splits, target_col):
     """Bucket ``target_col`` into ranges given by ``splits``.

     Adds a column named ``bucket_<target_col>`` holding the bin index and
     returns that column's name. ``target_col`` is cast to double first if
     needed, since Bucketizer only accepts double input.

     :param splits: split points (any order; sorted before use)
     :param target_col: name of the numeric column to bucketize
     :return: name of the generated bucket column
     """
     self._bucket_name = 'bucket_' + target_col
     bucketizer = Bucketizer(inputCol=target_col,
                             outputCol=self._bucket_name)
     # Sort a copy so the caller's list is not mutated as a side effect.
     bucketizer.setSplits(sorted(splits))
     column_data_types = {
         field.name: field.dataType
         for field in self._data_frame.schema.fields
     }
     # BUG FIX: the original compared a DataType *instance* against the
     # DoubleType *class* (always unequal), so the cast ran unconditionally.
     if not isinstance(column_data_types[target_col], DoubleType):
         self._data_frame = self._data_frame.select(*[
             col(target_col).cast('double').alias(target_col) if column ==
             target_col else column for column in self._data_frame.columns
         ])
     self._data_frame = bucketizer.transform(self._data_frame)
     return self._bucket_name
Ejemplo n.º 4
0
 def bin_columns(self, colsToBin):
     """Replace each column in ``colsToBin`` with labelled 10-way range bins.

     Pandas path uses ``pd.cut``; Spark path uses ``Bucketizer`` plus a
     bin-index -> label mapping. Binning is best-effort: a column that
     fails is left unchanged and a message is printed.

     :param colsToBin: iterable of numeric column names to bin in place
     """
     for bincol in colsToBin:
         if self._pandas_flag:
             try:
                 minval = float(min(self._data_frame[bincol]))
                 maxval = float(max(self._data_frame[bincol]))
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 self._data_frame[bincol] = pd.cut(
                     self._data_frame[bincol],
                     bins=splitsData["splits"],
                     labels=list(splitsData['bin_mapping'].values()),
                     right=True,
                     include_lowest=True)
             except Exception as e:
                 # Best-effort: keep the original column, surface the cause.
                 print("Binning failed for : ", bincol, e)
         else:
             try:
                 # BUG FIX: the original selected max() first and min()
                 # second, then unpacked the row as (minval, maxval),
                 # swapping the two bounds. Read by alias instead.
                 bounds = self._data_frame.select([
                     FN.min(bincol).alias("min"),
                     FN.max(bincol).alias("max")
                 ]).collect()[0]
                 minval, maxval = bounds["min"], bounds["max"]
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 splits = splitsData["splits"]
                 # Bucketizer requires a double-typed input column.
                 self._data_frame = self._data_frame.withColumn(
                     bincol, self._data_frame[bincol].cast(DoubleType()))
                 bucketizer = Bucketizer(inputCol=bincol,
                                         outputCol="BINNED_INDEX")
                 bucketizer.setSplits(splits)
                 self._data_frame = bucketizer.transform(self._data_frame)
                 # Map each bin index back to its human-readable range label.
                 mapping_expr = create_map([
                     lit(x) for x in chain(
                         *list(splitsData["bin_mapping"].items()))
                 ])
                 self._data_frame = self._data_frame.withColumn(
                     bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                 self._data_frame = self._data_frame.select(self.columns)
             except Exception as e:
                 # Best-effort: keep the original column, surface the cause.
                 print("Binning failed for : ", bincol, e)
Ejemplo n.º 5
0
 def bin_columns(self, colsToBin):
     """Replace each column in ``colsToBin`` with labelled 10-way range bins.

     Spark-only variant: buckets each column with ``Bucketizer`` and maps
     the resulting bin index back to a human-readable range label.

     :param colsToBin: iterable of numeric column names to bin in place
     """
     for bincol in colsToBin:
         # BUG FIX: the original selected max() first and min() second,
         # then unpacked the row as (minval, maxval), swapping the bounds.
         bounds = self._data_frame.select(
             [FN.min(bincol).alias("min"),
              FN.max(bincol).alias("max")]).collect()[0]
         minval, maxval = bounds["min"], bounds["max"]
         n_split = 10
         splitsData = CommonUtils.get_splits(minval, maxval, n_split)
         splits = splitsData["splits"]
         # Bucketizer requires a double-typed input column.
         self._data_frame = self._data_frame.withColumn(
             bincol, self._data_frame[bincol].cast(DoubleType()))
         bucketizer = Bucketizer(inputCol=bincol, outputCol="BINNED_INDEX")
         bucketizer.setSplits(splits)
         self._data_frame = bucketizer.transform(self._data_frame)
         # Map each bin index back to its human-readable range label.
         mapping_expr = create_map(
             [lit(x) for x in chain(*splitsData["bin_mapping"].items())])
         # Removed the original withColumnRenamed("bincol", bincol + "JJJLLLLKJJ"):
         # it passed the literal string "bincol" (not the variable), which is a
         # no-op unless a column is literally named "bincol" — clearly unintended
         # (the sibling implementation has this same line commented out).
         self._data_frame = self._data_frame.withColumn(
             bincol, mapping_expr.getItem(col("BINNED_INDEX")))
         self._data_frame = self._data_frame.select(self.columns)
Ejemplo n.º 6
0
 def bucketize(self, splits, target_col):
     """Bucket ``target_col`` into ranges given by ``splits``.

     Adds a column named ``bucket_<target_col>`` holding the bin index and
     returns that column's name. The pandas path is intentionally a no-op
     (method unused there, per the inline TODO).

     :param splits: split points (any order; sorted before use)
     :param target_col: name of the numeric column to bucketize
     :return: name of the (would-be) bucket column
     """
     self._bucket_name = 'bucket_' + target_col
     if self._pandas_flag:
         ''' TO DO: this method is not being used anywhere '''
         pass
     else:
         bucketizer = Bucketizer(inputCol=target_col,
                                 outputCol=self._bucket_name)
         # Sort a copy so the caller's list is not mutated as a side effect.
         bucketizer.setSplits(sorted(splits))
         column_data_types = {
             field.name: field.dataType
             for field in self._data_frame.schema.fields
         }
         # BUG FIX: the original compared a DataType *instance* against the
         # DoubleType *class* (always unequal), so the cast always ran.
         if not isinstance(column_data_types[target_col], DoubleType):
             self._data_frame = self._data_frame.select(*[
                 col(target_col).cast('double').
                 alias(target_col) if column == target_col else column
                 for column in self._data_frame.columns
             ])
         self._data_frame = bucketizer.transform(self._data_frame)
     return self._bucket_name
Ejemplo n.º 7
0
    def get_bins(self, column_name, num_bins=10, split_points=None):
        """
        Finds number of items in each bin. Only one of the params num_bins or
        split_points need to be supplied.

        :param column_name: column to be binned
        :param num_bins:    number of bins to create (currently unused: the
                            optimum bin count is computed automatically)
        :param split_points: explicit ascending list of split points such that
                             all values in [a, b) go to the first bucket, etc.;
                             used as-is when supplied
        :return: Histogram with one bin per split interval
        :raises BIException: if ``column_name`` is not a numeric column
        """
        if column_name not in self._numeric_columns:
            raise BIException.column_does_not_exist(column_name)

        # Min/max are needed both for padding the computed splits and for the
        # degenerate single-value case below.  BUG FIX: the original computed
        # them only when split_points was None, so supplying split_points
        # crashed with a NameError at the min_value == max_value check.
        if self._pandas_flag:
            min_value = self._data_frame[column_name].min()
            max_value = self._data_frame[column_name].max()
        else:
            min_max = self._data_frame.agg(
                FN.min(column_name).alias('min'),
                FN.max(column_name).alias('max')).collect()
            min_value = min_max[0]['min']
            max_value = min_max[0]['max']

        if split_points is None:
            if self._pandas_flag:
                splits = CommonUtils.return_optimum_bins(
                    self._data_frame[column_name])
            else:
                splits = CommonUtils.return_optimum_bins(
                    self._data_frame.select(column_name).toPandas()
                    [column_name])
            # Widen the outer edges so every observed value falls in a bin.
            if splits[0] > min_value:
                splits = [min_value - 1] + list(splits)
                print("Min Point Added")
            if splits[-1] < max_value:
                splits = list(splits) + [max_value + 1]
                print("Max Point Added")
        else:
            splits = split_points
        # cast column_name to double type if needed, otherwise Bucketizer does not work
        column_df = None
        if self._pandas_flag:
            binning_df = pd.DataFrame()
            binning_df[BinnerConstants.
                       ORIGINAL_COLUMN_NAME] = self._data_frame[column_name]
        else:
            # NOTE(review): this compares against the DoubleType *class*; if
            # _column_data_types stores DataType instances the test is always
            # True and the cast always runs — confirm how the dict is built.
            if self._column_data_types.get(column_name) != DoubleType:
                column_df = self._data_frame.select(
                    FN.col(column_name).cast('double').alias(
                        BinnerConstants.ORIGINAL_COLUMN_NAME))
            else:
                column_df = self._data_frame.select(
                    FN.col(column_name).alias(
                        BinnerConstants.ORIGINAL_COLUMN_NAME))

            bucketizer = Bucketizer(
                inputCol=BinnerConstants.ORIGINAL_COLUMN_NAME,
                outputCol=BinnerConstants.BINNED_COLUMN_NAME)
            bucketizer.setSplits(splits)

        if min_value == max_value:
            # Single-valued column: one synthetic bin holding every row.
            histogram = Histogram(column_name, self._num_rows)
            bin_number = 0
            start_value = int(min_value - 0.5)
            end_value = int(max_value + 0.5)
            histogram.add_bin(bin_number, start_value, end_value,
                              self._num_rows)
        else:
            if self._pandas_flag:
                binning_df[BinnerConstants.BINNED_COLUMN_NAME] = pd.cut(
                    self._data_frame[column_name],
                    bins=splits,
                    labels=list(range(len(splits) - 1)),
                    right=False,
                    include_lowest=True)
                buckets_counts_df = binning_df.groupby(
                    BinnerConstants.BINNED_COLUMN_NAME,
                    as_index=False,
                    sort=False).count()
                histogram = Histogram(column_name, self._num_rows)
                for row in buckets_counts_df.iterrows():
                    bin_number = int(row[1][0])
                    start_value = splits[bin_number]
                    end_value = splits[bin_number + 1]
                    # Bin edges may not be float-convertible; fall back to the
                    # raw edge values.  (Was a bare ``except:`` — narrowed.)
                    try:
                        histogram.add_bin(bin_number, float(start_value),
                                          float(end_value), float(row[1][1]))
                    except (TypeError, ValueError):
                        histogram.add_bin(bin_number, start_value, end_value,
                                          float(row[1][1]))
            else:
                buckets_and_counts = bucketizer.transform(column_df).groupBy(
                    BinnerConstants.BINNED_COLUMN_NAME).agg({
                        '*': 'count'
                    }).collect()
                histogram = Histogram(column_name, self._num_rows)
                for row in buckets_and_counts:
                    bin_number = int(row[0])
                    start_value = splits[bin_number]
                    end_value = splits[bin_number + 1]
                    histogram.add_bin(bin_number, start_value, end_value,
                                      row[1])

        return histogram
Ejemplo n.º 8
0
    def get_bins(self, column_name, num_bins=10, split_points=None):
        """
        Finds number of items in each bin. Only one of the params num_bins or
        split_points need to be supplied.

        :param column_name: column to be binned
        :param num_bins:    number of quantile buckets to create when
                            split_points is not given
        :param split_points: explicit ascending list of split points such that
                             all values in [a, b) go to the first bucket, etc.;
                             used instead of quantile splits when supplied
        :return: Histogram whose bin values are count density
                 (count * 100 / bin width)
        :raises BIException: if ``column_name`` is not a numeric column
        """
        if column_name not in self._numeric_columns:
            raise BIException.column_does_not_exist(column_name)

        # Min/max are needed both for padding the outer splits and for the
        # single-value check below.  BUG FIX: the original computed them only
        # when split_points was None, so supplying split_points crashed with
        # a NameError at ``splits[0] = min_value - 0.1``.
        min_max = self._data_frame.agg(
            FN.min(column_name).alias('min'),
            FN.max(column_name).alias('max')).collect()
        min_value = min_max[0]['min']
        max_value = min_max[0]['max']

        if split_points is None:
            # GENERALIZED: numBuckets now honours the num_bins parameter
            # (was hard-coded to 10; the default preserves old behavior).
            quantile_discretizer = QuantileDiscretizer(numBuckets=num_bins,
                                                       inputCol=column_name,
                                                       outputCol='buckets',
                                                       relativeError=0.01)
            fitted_model = quantile_discretizer.fit(self._data_frame)
            # splits have these values [-Inf, Q1, Median, Q3, Inf]
            splits = fitted_model.getSplits()
        else:
            splits = split_points
        # Work on a copy so a caller-supplied split_points list is not
        # mutated; replace the +/-Inf edges with padded data bounds.
        splits = list(splits)
        splits[0] = min_value - 0.1
        splits[-1] = max_value + 0.1
        # cast column_name to double type if needed, otherwise Bucketizer does not work
        # NOTE(review): this compares against the DoubleType *class*; if
        # _column_data_types stores DataType instances the test is always
        # True and the cast always runs — confirm how the dict is built.
        if self._column_data_types.get(column_name) != DoubleType:
            column_df = self._data_frame.select(
                FN.col(column_name).cast('double').alias('values'))
        else:
            column_df = self._data_frame.select(
                FN.col(column_name).alias('values'))

        bucketizer = Bucketizer(inputCol='values', outputCol='bins')
        bucketizer.setSplits(splits)
        if min_value == max_value:
            # Single-valued column: one synthetic bin holding every row.
            histogram = Histogram(column_name, self._num_rows)
            histogram.add_bin(0, min_value - 0.5, max_value + 0.5,
                              self._num_rows)
        else:
            buckets_and_counts = bucketizer.transform(column_df).groupBy(
                'bins').agg({
                    '*': 'count'
                }).collect()
            histogram = Histogram(column_name, self._num_rows)
            for row in buckets_and_counts:
                bin_number = int(row[0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                histogram.add_bin(
                    bin_number, start_value, end_value,
                    float(row[1]) * 100.0 / (end_value - start_value))

        return histogram