def test_overlapping_range_not_accepted(self):
        """
        Building a ``BinningSpec`` from two bins whose numeric ranges overlap
        (0.5-1.7 and 1.5-2.5) is expected to raise ``ValueError``.
        """
        # Every object is constructed inside the context manager, so a
        # ValueError raised by any of the constructors satisfies the assertion.
        with self.assertRaises(ValueError):
            feature_range = Range(
                numeric_range_start=0.5,
                numeric_range_end=2.5,
                categorical_indicators={"M", "C", "_"}
            )
            feature = Feature(name="feature", range=feature_range)

            lower_bin = Range(
                numeric_range_start=0.5,
                numeric_range_end=1.7,
                categorical_indicators={"M", "C"}
            )
            upper_bin = Range(
                numeric_range_start=1.5,
                numeric_range_end=2.5,
                categorical_indicators={"_"}
            )

            BinningSpec(feature=feature, bins={lower_bin, upper_bin})
 def test_valid_range_accepted_binning(self):
     """
     A single bin (0.4-2.6) that covers the whole feature range (0.5-2.5)
     and all of its categorical indicators should be accepted: the test
     passes as long as constructing the ``BinningSpec`` does not raise.
     """
     feature_range = Range(
         numeric_range_start=0.5,
         numeric_range_end=2.5,
         categorical_indicators={"M", "C", "_"}
     )
     feature = Feature(name="feature", range=feature_range)

     covering_bin = Range(
         numeric_range_start=0.4,
         numeric_range_end=2.6,
         categorical_indicators={"M", "C", "_"}
     )

     BinningSpec(feature=feature, bins={covering_bin})
    def test_invalid_categorical_range_not_accepted_binning(self):
        """
        Bins that leave one of the feature's categorical indicators ("X")
        uncovered are expected to make the construction raise ``ValueError``.
        """
        # Every object is constructed inside the context manager, so a
        # ValueError raised by any of the constructors satisfies the assertion.
        with self.assertRaises(ValueError):
            feature_range = Range(
                numeric_range_start=0.5,
                numeric_range_end=2.5,
                categorical_indicators={"M", "C", "_", "X"}
            )
            feature = Feature(name="feature", range=feature_range)

            bin_missing_x = Range(
                numeric_range_start=0.5,
                numeric_range_end=2.5,
                categorical_indicators={"M", "C", "_"}
            )

            BinningSpec(feature=feature, bins={bin_missing_x})
Esempio n. 4
0
def _get_spec(feature: Feature, sorted_thresholds: List,
              categorical_indicators: FrozenSet) -> BinningSpec:
    """
    Creates a binning spec from provided data.

    One numeric bin is created for every pair of consecutive thresholds and
    one categorical bin for every categorical indicator.

    :param feature: the feature that's being binned
    :param sorted_thresholds: the numeric part thresholds, sorted and unique
    :param categorical_indicators: the categorical indicator set to be used
    :return: a binning spec for ``feature`` holding the numeric and the
        categorical bins
    """

    # Pair each threshold with its successor: (t0, t1), (t1, t2), ...
    numeric_bins = {
        Range(numeric_range_start=start, numeric_range_end=end)
        for start, end in zip(sorted_thresholds, sorted_thresholds[1:])
    }

    categorical_bins = {
        Range(categorical_indicators={char})
        for char in categorical_indicators
    }

    # A plain union of two sets, unlike set.union(*list_of_sets), does not
    # fail with TypeError when there turn out to be no bins at all.
    return BinningSpec(feature=feature, bins=numeric_bins | categorical_bins)
Esempio n. 5
0
def _iteratively_merge_bins(
        x: pd.Series,
        y: pd.Series,
        binning_spec: BinningSpec,
        stat_test: Callable = _proportion_z_test_returning_p_value,
        p_value_threshold: float = P_VALUE_THRESHOLD
) -> BinningSpec:
    """
    Iteratively merges bins whose bad rates are not significantly different.

    On each pass the bins are ordered by their bad rate (events / bin size)
    and walked pairwise in that order. A pair whose ``stat_test`` p-value is
    at least ``p_value_threshold`` is replaced by a single bin covering both;
    every other bin is carried over unchanged. Passes are repeated until one
    of them merges nothing.

    NOTE(review): neighbours are picked in bad-rate order rather than by their
    position on the numeric axis, and a merged bin takes the min/max envelope
    of the pair, so it may span bins lying in between. The original docstring
    carried a "TODO fix the method" marker - confirm whether this is what it
    referred to.

    :param x: the variable being binned
    :param y: the target variable used to determine which bins to merge
    :param binning_spec: the binning specification for the feature; it will be iteratively merged
    :param stat_test: a function that returns a single value, p-value of a statistical test;
        it is called as ``stat_test(events_a, events_b, size_a, size_b)``
    :param p_value_threshold: the threshold to decide if the null hypothesis is rejected
    :return: a binning specification with similar bins merged
    :raises RuntimeError: if a pass ends with more bins than it started with
    """

    current_bins = binning_spec.bins

    while True:
        bins_ordered = list(current_bins)
        bin_masks = [get_mask_from_range(x, bin_range) for bin_range in bins_ordered]
        bin_event_counts = [y[mask].sum() for mask in bin_masks]
        bin_sizes = [mask.sum() for mask in bin_masks]
        bad_rates = [events / size for events, size in zip(bin_event_counts, bin_sizes)]

        # Sort bins according to bad rates.
        sorted_indexes = np.argsort(bad_rates)
        merged_bins = set()
        i = 1

        # Walk neighbouring bins (in bad-rate order) and merge those whose
        # bad rates aren't stat. sign. different.
        while i < len(sorted_indexes):
            prev_idx = sorted_indexes[i - 1]
            idx = sorted_indexes[i]
            prev_bin = bins_ordered[prev_idx]
            this_bin = bins_ordered[idx]

            p_value = stat_test(
                bin_event_counts[prev_idx],
                bin_event_counts[idx],
                bin_sizes[prev_idx],
                bin_sizes[idx]
            )

            if p_value >= p_value_threshold:
                print(
                    'merging bins ({}, {}) - ({}, {})'.format(
                        prev_bin.numeric_range_start,
                        prev_bin.numeric_range_end,
                        this_bin.numeric_range_start,
                        this_bin.numeric_range_end
                    )
                )
                merged_bins.add(
                    Range(
                        numeric_range_start=min(
                            prev_bin.numeric_range_start,
                            this_bin.numeric_range_start
                        ),
                        numeric_range_end=max(
                            prev_bin.numeric_range_end,
                            this_bin.numeric_range_end
                        ),
                        categorical_indicators=frozenset.union(
                            prev_bin.categorical_indicators,
                            this_bin.categorical_indicators
                        )
                    )
                )
                # Both bins of the pair are consumed.
                i += 2

            else:
                # Only the earlier bin is settled; the later one is compared
                # against its own successor on the next step.
                merged_bins.add(prev_bin)
                i += 1

        # When the walk stops with i == len(...), the last bin in bad-rate
        # order has been neither merged nor carried over yet. This covers the
        # case of a trailing unpaired bin after a merge, the case where the
        # final comparison did not merge, and the single-bin case.
        if i == len(sorted_indexes):
            merged_bins.add(bins_ordered[sorted_indexes[-1]])

        if len(merged_bins) > len(current_bins):
            raise RuntimeError(
                'bin merging produced more bins than it started with'
            )

        if len(merged_bins) == len(current_bins):
            # Nothing was merged during this pass: the binning is stable.
            break

        current_bins = merged_bins

    return BinningSpec(
        feature=binning_spec.feature,
        bins=merged_bins
    )