def test_overlapping_range_not_accepted(self):
    """
    Expects a ValueError when two bins have intersecting numeric intervals.

    The bins span 0.5-1.7 and 1.5-2.5, so they share the 1.5-1.7 stretch.
    """
    with self.assertRaises(ValueError):
        # Everything is built inside the context so that a ValueError raised
        # by any of the constructors is captured as well.
        feature = Feature(
            name="feature",
            range=Range(
                numeric_range_start=0.5,
                numeric_range_end=2.5,
                categorical_indicators={"M", "C", "_"},
            ),
        )
        lower_bin = Range(
            numeric_range_start=0.5,
            numeric_range_end=1.7,
            categorical_indicators={"M", "C"},
        )
        upper_bin = Range(
            numeric_range_start=1.5,
            numeric_range_end=2.5,
            categorical_indicators={"_"},
        )
        BinningSpec(feature=feature, bins={lower_bin, upper_bin})
def test_valid_range_accepted_binning(self):
    """
    Builds a BinningSpec whose single bin (0.4-2.6) encloses the feature's
    numeric range (0.5-2.5) and lists the same categorical indicators.

    The test passes as long as construction does not raise.
    """
    indicators = {"M", "C", "_"}
    feature = Feature(
        name="feature",
        range=Range(
            numeric_range_start=0.5,
            numeric_range_end=2.5,
            categorical_indicators=set(indicators),
        ),
    )
    enclosing_bin = Range(
        numeric_range_start=0.4,
        numeric_range_end=2.6,
        categorical_indicators=set(indicators),
    )
    BinningSpec(feature=feature, bins={enclosing_bin})
def test_invalid_categorical_range_not_accepted_binning(self):
    """
    Expects a ValueError when the bins' categorical indicators differ from
    the feature's.

    The feature declares the indicators "M", "C", "_" and "X", while the only
    bin lists "M", "C" and "_" - "X" is not present in any bin.
    """
    with self.assertRaises(ValueError):
        # Everything is built inside the context so that a ValueError raised
        # by any of the constructors is captured as well.
        feature = Feature(
            name="feature",
            range=Range(
                numeric_range_start=0.5,
                numeric_range_end=2.5,
                categorical_indicators={"M", "C", "_", "X"},
            ),
        )
        only_bin = Range(
            numeric_range_start=0.5,
            numeric_range_end=2.5,
            categorical_indicators={"M", "C", "_"},
        )
        BinningSpec(feature=feature, bins={only_bin})
def _get_spec(feature: Feature, sorted_thresholds: List, categorical_indicators: FrozenSet) -> BinningSpec:
    """
    Creates a binning spec from provided data.

    One numeric bin is created for every pair of consecutive thresholds and one
    categorical bin is created for every categorical indicator.

    :param feature: the feature that's being binned
    :param sorted_thresholds: the numeric part thresholds, sorted and unique
    :param categorical_indicators: the categorical indicator set to be used
    :return: a binning spec for the feature containing the numeric and the categorical bins
    """
    # Pair every threshold with its successor: (t0, t1), (t1, t2), ...
    bins = {
        Range(numeric_range_start=start, numeric_range_end=end)
        for start, end in zip(sorted_thresholds, sorted_thresholds[1:])
    }
    # Building a single set directly (instead of `set.union(*sets)`) keeps this working when there
    # are neither numeric nor categorical bins; `set.union()` without arguments raises a TypeError.
    bins.update(Range(categorical_indicators={char}) for char in categorical_indicators)
    return BinningSpec(feature=feature, bins=bins)
def _iteratively_merge_bins(
        x: pd.Series,
        y: pd.Series,
        binning_spec: BinningSpec,
        stat_test: Callable = _proportion_z_test_returning_p_value,
        p_value_threshold: float = P_VALUE_THRESHOLD
) -> BinningSpec:
    """
    Iteratively merges bins whose event rates are not statistically different.

    On every pass the bins are ordered by their event ("bad") rate and each bin is compared with its
    successor in that order. When the test's p-value is at least ``p_value_threshold`` the two bins
    are merged into one and the comparison moves on to the next pair; otherwise the first bin is kept
    as is. Passes are repeated until a pass no longer reduces the number of bins.

    TODO(review): bins are paired by event-rate order, not by numeric adjacency, so a merged range
    (min start, max end) may span other bins - confirm this is the intended behaviour.

    :param x: the variable being binned
    :param y: the target variable used to determine which bins to merge
    :param binning_spec: the binning specification for the feature; it will be iteratively merged
    :param stat_test: a function that returns a single value, p-value of a statistical test; it is
        called with (events of bin 1, events of bin 2, size of bin 1, size of bin 2)
    :param p_value_threshold: the threshold to decide if the null hypothesis is rejected
    :return: a binning specification with similar contiguous bins merged
    :raises RuntimeError: if a pass ends up with more bins than it started with
    """
    bins_merged_at_iteration = True
    new_binning = binning_spec.bins
    while bins_merged_at_iteration:
        old_binning = new_binning
        new_binning = set()
        bins_ordered = list(old_binning)

        # The masks are used both to index `y` and, summed, as the bin sizes.
        bin_masks = [get_mask_from_range(x, bin_range) for bin_range in bins_ordered]
        bin_event_counts = [y[mask].sum() for mask in bin_masks]
        bin_sizes = [mask.sum() for mask in bin_masks]
        bad_rates = [events / size for events, size in zip(bin_event_counts, bin_sizes)]

        # Sort bins according to bad rates.
        sorted_indexes = np.argsort(bad_rates)

        # Invariant: the bin at sorted position `i - 1` has not been added to `new_binning` yet.
        i = 1
        # Merge contiguous bins that have bad rates that aren't stat. sign. different.
        while i < len(sorted_indexes):
            prev_idx = sorted_indexes[i - 1]
            idx = sorted_indexes[i]
            prev_bin = bins_ordered[prev_idx]
            current_bin = bins_ordered[idx]

            p_value = stat_test(
                bin_event_counts[prev_idx],
                bin_event_counts[idx],
                bin_sizes[prev_idx],
                bin_sizes[idx]
            )

            if p_value >= p_value_threshold:
                print(
                    'merging bins ({}, {}) - ({}, {})'.format(
                        prev_bin.numeric_range_start,
                        prev_bin.numeric_range_end,
                        current_bin.numeric_range_start,
                        current_bin.numeric_range_end
                    )
                )
                new_binning.add(
                    Range(
                        numeric_range_start=min(
                            prev_bin.numeric_range_start,
                            current_bin.numeric_range_start
                        ),
                        numeric_range_end=max(
                            prev_bin.numeric_range_end,
                            current_bin.numeric_range_end
                        ),
                        categorical_indicators=frozenset.union(
                            prev_bin.categorical_indicators,
                            current_bin.categorical_indicators
                        )
                    )
                )
                # Both bins of the pair are consumed by the merge.
                i += 2
            else:
                new_binning.add(prev_bin)
                i += 1

        # The bin at the last sorted position is still pending when the loop stops at exactly
        # `len(sorted_indexes)`: either the last comparison did not merge, a merge left one trailing
        # bin, or there was a single bin to begin with. Without this the bin would be silently lost.
        if i == len(sorted_indexes):
            new_binning.add(bins_ordered[sorted_indexes[-1]])

        if len(old_binning) == len(new_binning):
            bins_merged_at_iteration = False
        elif len(old_binning) > len(new_binning):
            bins_merged_at_iteration = True
        else:
            raise RuntimeError()

    return BinningSpec(
        feature=binning_spec.feature,
        bins=new_binning
    )