Ejemplo n.º 1
0
def test_numerical_default_fit_transform():
    """Check the first five values returned by fit_transform with mean WoE."""
    expected_head = [-0.00074357, 0.48973998, 0.02189459, -0.00074357,
                     0.02189459]

    binner = MulticlassOptimalBinning()
    transformed = binner.fit_transform(x, y, metric="mean_woe")

    assert transformed[:5] == approx(expected_head, rel=1e-5)
Ejemplo n.º 2
0
def test_numerical_default():
    """Default settings must solve to optimality with the known splits."""
    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]

    binner = MulticlassOptimalBinning()
    binner.fit(x, y)

    assert binner.status == "OPTIMAL"
    assert binner.splits == approx(expected_splits, rel=1e-6)
Ejemplo n.º 3
0
def test_numerical_default():
    """Default fit: check status, splits and binning-table statistics."""
    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]

    binner = MulticlassOptimalBinning()
    binner.fit(x, y)

    assert binner.status == "OPTIMAL"
    assert binner.splits == approx(expected_splits, rel=1e-6)

    # The table has to be built and analysed before its statistics are read.
    table = binner.binning_table
    table.build()
    table.analysis()

    assert table.js == approx(0.10989515, rel=1e-6)
    assert table.quality_score == approx(0.05279822, rel=1e-6)
Ejemplo n.º 4
0
def test_numerical_default():
    """Default fit: check status, splits, table statistics and plotting."""
    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]

    binner = MulticlassOptimalBinning()
    binner.fit(x, y)

    assert binner.status == "OPTIMAL"
    assert binner.splits == approx(expected_splits, rel=1e-6)

    # The table has to be built and analysed before its statistics are read.
    table = binner.binning_table
    table.build()
    table.analysis()

    assert table.js == approx(0.10989515, rel=1e-6)
    assert table.quality_score == approx(0.05279822, rel=1e-6)

    # Exercise the plot with every bin, without the special bin and without
    # the missing bin; each variant is saved to its own file.
    plot_variants = (
        {"savefig": "test_multiclass_binning.png"},
        {"add_special": False,
         "savefig": "test_multiclass_binning_no_special.png"},
        {"add_missing": False,
         "savefig": "test_multiclass_binning_no_missing.png"},
    )
    for plot_kwargs in plot_variants:
        table.plot(**plot_kwargs)
Ejemplo n.º 5
0
def test_default_transform_multiclass():
    """BinningProcess on the wine data must match a standalone binning."""
    wine = load_wine()
    feature_names = wine.feature_names
    features, target = wine.data, wine.target

    process = BinningProcess(feature_names)
    process.fit(features, target)
    process_output = process.transform(features)

    # The process must have picked the multiclass binning for this target.
    first_binning = process.get_binned_variable(feature_names[0])
    assert isinstance(first_binning, MulticlassOptimalBinning)

    # Binning column 5 on its own must reproduce the process output.
    column = features[:, 5]
    standalone = MulticlassOptimalBinning()
    standalone.fit(column, target)
    assert standalone.transform(column) == approx(process_output[:, 5],
                                                  rel=1e-6)
Ejemplo n.º 6
0
def test_numerical_default_solvers():
    """The MIP (bop) and CP solvers must reach the same optimal splits."""
    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]
    solver_configs = (
        {"solver": "mip", "mip_solver": "bop"},
        {"solver": "cp"},
    )

    # Fit every configuration first, then check all the results.
    fitted = []
    for config in solver_configs:
        binner = MulticlassOptimalBinning(**config)
        binner.fit(x, y)
        fitted.append(binner)

    for binner in fitted:
        assert binner.status == "OPTIMAL"
        assert binner.splits == approx(expected_splits, rel=1e-6)
Ejemplo n.º 7
0
def test_numerical_default_transform():
    """transform must fail before fit and return mean WoE values after it."""
    binner = MulticlassOptimalBinning()

    # Transforming with an unfitted binning is an error.
    with raises(NotFittedError):
        binner.transform(x)

    binner.fit(x, y)

    sample = [0.3, 2.1, 2.5, 3]
    expected = [0.48973998, 0.48973998, -0.00074357, 0.02189459]
    transformed = binner.transform(sample, metric="mean_woe")
    assert transformed == approx(expected, rel=1e-5)
Ejemplo n.º 8
0
    def splitData(self, data, availableAttributes, numericAttrBinning,
                  repeatAttributes, minNumRecordsLeafNode):
        '''Choose the attribute split with the largest gain.

        Every attribute in availableAttributes is evaluated on data, whose
        last column is used as the target. The gain is the information gain
        when self.treeType is 'classification' and the standard deviation
        reduction otherwise. The candidate splits are:

        * object/category attributes: one subset per optimal bin when
          self.classes has two entries and more than one bin was found,
          otherwise one subset per distinct value of the attribute;
        * numeric attributes when numericAttrBinning is True and more than
          one optimal bin was found: one subset per bin range [low, high);
        * any other numeric attribute: a binary split on the midpoint
          between consecutive distinct sorted values.

        A candidate is discarded when any of its subsets holds fewer than
        minNumRecordsLeafNode records (empty subsets included).

        When a range split wins and repeatAttributes is truthy, the outer
        bounds of the winning ranges are replaced by the bounds of the
        parent range recorded in self.numericAttrRanges, and the adjusted
        ranges are stored back in that structure.

        Returns the tuple (splitAttrib, bestSubsets, bestSplitThreshold,
        bestRanges, bestGain). bestSplitThreshold is set only for a binary
        threshold split and bestRanges only for a range split. When no
        candidate is accepted the first four items are None and bestGain
        is -np.inf.
        '''
        bestGain = -np.inf
        bestSubsets = None
        splitAttrib = None
        bestSplitThreshold = None
        bestRanges = None

        # The tree type decides both the binning class and the gain measure.
        isClassification = self.treeType == 'classification'
        if isClassification:
            computeGain = self.calculateInformationGain
        else:
            computeGain = self.calculateStandardDeviationReduction

        for attr in availableAttributes:
            dtypeName = str(data[attr].dtype)

            # if attr is discrete attribute with z values
            if dtypeName == 'object' or dtypeName == 'category':

                if len(set(data[attr])) == 1:
                    continue  # skip if only one category
                grouped = data.groupby(attr)
                # get values for binning
                x = data[attr].values
                y = data.iloc[:, -1].values

                if isClassification:
                    optb = OptimalBinning(dtype='categorical',
                                          min_n_bins=2,
                                          max_n_bins=4)
                else:
                    optb = ContinuousOptimalBinning(dtype='categorical',
                                                    min_n_bins=2,
                                                    max_n_bins=4,
                                                    min_prebin_size=0.001)

                optb.fit(x, y)
                binningResultDt = optb.binning_table.build()
                # the last three rows of the table are not regular bins
                bins = binningResultDt['Bin'].head(-3)

                # create subset for each bin if target var is binary and
                # there are multiple bins
                if len(self.classes) == 2 and len(bins) > 1:
                    subsets = [
                        pd.concat([
                            grouped.get_group(cat) for cat in binCategories
                        ]) for binCategories in bins
                    ]
                else:
                    # otherwise create subset for each value (category)
                    # of the attribute
                    subsets = [
                        grouped.get_group(value)
                        for value in data[attr].unique()
                    ]

                # Compare every subset size with the minimum. Testing the
                # truthiness of the size instead would let empty subsets
                # (size 0) slip through.
                if any(
                        len(subset) < minNumRecordsLeafNode
                        for subset in subsets):
                    continue  # skip if there are too small subsets

                infoGain = computeGain(data, subsets)

                # ties are resolved in favour of the later attribute
                if infoGain >= bestGain:
                    bestGain = infoGain
                    bestSubsets = subsets
                    splitAttrib = attr
                    bestSplitThreshold = None
                    bestRanges = None

            else:  # if attr has numeric values
                # get values for binning
                x = data[attr].values
                y = data.iloc[:, -1].values

                if isClassification:
                    optb = MulticlassOptimalBinning(min_n_bins=2, max_n_bins=4)
                else:
                    if x.min() == x.max():
                        continue  # a constant attribute cannot be split
                    optb = ContinuousOptimalBinning(min_n_bins=2,
                                                    max_n_bins=4,
                                                    min_prebin_size=0.001)

                optb.fit(x, y)
                binningResultDt = optb.binning_table.build()
                # the last three rows of the table are not regular bins
                bins = binningResultDt['Bin'].head(-3)
                hasMultipleBins = len(bins) > 1

                # if user enabled numeric attribute binning and there are
                # multiple bins
                if numericAttrBinning is True and hasMultipleBins:

                    # Work on a list copy so the binning table itself is
                    # not modified. Quote the infinities in the first and
                    # last label so the labels can be parsed as literals.
                    binLabels = list(bins)
                    binLabels[0] = binLabels[0].replace('-inf', "'-inf'")
                    binLabels[-1] = binLabels[-1].replace('inf', "'inf'")
                    # create list of tuples for every range
                    ranges = [
                        literal_eval(label.replace('[', '('))
                        for label in binLabels
                    ]
                    # replace 'inf' strings with np.inf
                    ranges = [(-np.inf, rng[1]) if rng[0] == '-inf' else
                              ((rng[0], np.inf) if rng[1] == 'inf' else
                               (rng[0], rng[1])) for rng in ranges]
                    # create subsets according to the ranges
                    subsets = [
                        data.loc[(data[attr] >= r[0]) & (data[attr] < r[1])]
                        for r in ranges
                    ]

                    # Same size check as for categorical attributes: an
                    # empty range must also disqualify the candidate.
                    if any(
                            len(subset) < minNumRecordsLeafNode
                            for subset in subsets):
                        continue  # skip if there are too small subsets

                    infoGain = computeGain(data, subsets)

                    if infoGain >= bestGain:
                        bestGain = infoGain
                        bestSubsets = subsets
                        splitAttrib = attr
                        bestSplitThreshold = None
                        bestRanges = ranges
                else:  # binary split using threshold
                    sortedData = data.sort_values(attr)  # sort data by attr
                    sortedValues = sortedData[attr]
                    # for each entry (without the last one)
                    for i in range(len(sortedValues) - 1):
                        currentValue = sortedValues.iloc[i]
                        nextValue = sortedValues.iloc[i + 1]
                        # if current and next value of attr are equal -
                        # do nothing
                        if currentValue == nextValue:
                            continue
                        # calculate threshold and use it to create two
                        # subsets
                        currentThreshold = (currentValue + nextValue) / 2
                        lowerSubset = sortedData[
                            sortedValues <= currentThreshold]
                        higherSubset = sortedData[
                            sortedValues > currentThreshold]

                        if len(lowerSubset) < minNumRecordsLeafNode or len(
                                higherSubset) < minNumRecordsLeafNode:
                            continue  # skip if there are too small subsets

                        infoGain = computeGain(data,
                                               [lowerSubset, higherSubset])

                        # strict comparison: a threshold split has to beat
                        # the current best, not merely equal it
                        if infoGain > bestGain:
                            bestGain = infoGain
                            bestSubsets = [lowerSubset, higherSubset]
                            splitAttrib = attr
                            bestSplitThreshold = currentThreshold
                            bestRanges = None

        # fix ranges if repeatingAttributes
        if bestRanges and repeatAttributes:
            # numericAttrRanges[attr] is indexed as [0] for the stored
            # range lists and [1] for the position of the current one
            parentRanges = self.numericAttrRanges[splitAttrib][0][
                self.numericAttrRanges[splitAttrib][1]]
            # any record of data identifies the parent range it came from
            checkValue = data[splitAttrib].iloc[0]
            parentRange = next(rng for rng in parentRanges
                               if checkValue >= rng[0] and checkValue < rng[1])
            # clip the outer bounds to the parent range
            bestRanges[0] = (parentRange[0], bestRanges[0][1])
            bestRanges[-1] = (bestRanges[-1][0], parentRange[-1])

            self.numericAttrRanges[splitAttrib][1] += 1

            # overwrite the entry at the new position if it exists,
            # otherwise append a new one
            if self.numericAttrRanges[splitAttrib][1] in range(
                    0, len(self.numericAttrRanges[splitAttrib][0])):
                self.numericAttrRanges[splitAttrib][0][
                    self.numericAttrRanges[splitAttrib][1]] = bestRanges
            else:
                self.numericAttrRanges[splitAttrib][0].append(bestRanges)

        return (splitAttrib, bestSubsets, bestSplitThreshold, bestRanges,
                bestGain)
Ejemplo n.º 9
0
def test_params():
    """Every invalid constructor argument must be rejected with the right error.

    Each case builds the binning with one invalid setting and fits it; the
    expected exception may come from either step.
    """
    invalid_cases = [
        (TypeError, {"name": 1}),
        (ValueError, {"prebinning_method": "new_method"}),
        (ValueError, {"solver": "new_solver"}),
        (ValueError, {"max_n_prebins": -2}),
        (ValueError, {"min_prebin_size": 0.6}),
        (ValueError, {"min_n_bins": -2}),
        (ValueError, {"max_n_bins": -2.2}),
        (ValueError, {"min_n_bins": 3, "max_n_bins": 2}),
        (ValueError, {"min_bin_size": 0.6}),
        (ValueError, {"max_bin_size": -0.6}),
        (ValueError, {"min_bin_size": 0.5, "max_bin_size": 0.3}),
        (ValueError, {"monotonic_trend": ["new_trend", "auto"]}),
        (ValueError, {"monotonic_trend": "new_trend"}),
        (ValueError, {"max_pvalue": 1.1}),
        (ValueError, {"max_pvalue_policy": "new_policy"}),
        (TypeError, {"user_splits": {"a": [1, 2]}}),
        (TypeError, {"special_codes": {1, 2, 3}}),
        (ValueError, {"split_digits": 9}),
        (ValueError, {"mip_solver": "new_solver"}),
        (ValueError, {"time_limit": -2}),
        (TypeError, {"verbose": 1}),
    ]

    for expected_error, bad_kwargs in invalid_cases:
        with raises(expected_error):
            binner = MulticlassOptimalBinning(**bad_kwargs)
            binner.fit(x, y)
Ejemplo n.º 10
0
def test_verbose():
    """Fitting with verbose output enabled must still reach optimality."""
    binner = MulticlassOptimalBinning(verbose=True)
    binner.fit(x, y)

    solver_status = binner.status
    assert solver_status == "OPTIMAL"
Ejemplo n.º 11
0
def test_classes():
    """After fitting, the binning must expose the three target classes."""
    expected_classes = [0, 1, 2]

    binner = MulticlassOptimalBinning()
    binner.fit(x, y)

    assert binner.classes == approx(expected_classes)
Ejemplo n.º 12
0
def test_numerical_user_splits_fixed():
    """Validate user_splits_fixed handling and a successful fixed-split fit."""

    def fit_with(**params):
        # Build a binning with the given settings, fit it and hand it back.
        binner = MulticlassOptimalBinning(**params)
        binner.fit(x, y)
        return binner

    splits = [2.1, 2.2, 2.3, 2.6, 2.9]

    # user_splits_fixed given without user_splits
    with raises(ValueError):
        fit_with(user_splits_fixed=[False, False, False, True, False])

    # user_splits_fixed given as a tuple
    with raises(TypeError):
        fit_with(user_splits=splits,
                 user_splits_fixed=(False, False, False, True, False))

    # user_splits_fixed given as integers instead of booleans
    with raises(ValueError):
        fit_with(user_splits=splits, user_splits_fixed=[0, 0, 0, 1, 0])

    # user_splits_fixed shorter than user_splits
    with raises(ValueError):
        fit_with(user_splits=splits,
                 user_splits_fixed=[False, False, False, False])

    fixed_flags = [False, False, False, True, True]

    with raises(ValueError):
        # pure pre-bins
        fit_with(user_splits=splits, user_splits_fixed=fixed_flags)

    # With 2.7 as the last split the same flags must fit successfully and
    # the fixed split must be part of the solution.
    splits = [2.1, 2.2, 2.3, 2.6, 2.7]
    fitted = fit_with(user_splits=splits, user_splits_fixed=fixed_flags)

    assert fitted.status == "OPTIMAL"
    assert 2.7 in fitted.splits
Ejemplo n.º 13
0
def test_numerical_user_splits_non_unique():
    """Fitting with a repeated user split (2.2 twice) must raise ValueError."""
    duplicated_splits = [2.1, 2.2, 2.2, 2.6, 2.9]
    binner = MulticlassOptimalBinning(user_splits=duplicated_splits)

    # The error is expected from fit, not from the constructor.
    with raises(ValueError):
        binner.fit(x, y)