Exemple #1
0
    def enumerate_splits(self, target_dist: proba.Multinomial):
        """Yield candidate ``x < threshold`` splits derived from ``self.hists``.

        For every candidate threshold, the samples summarised by
        ``target_dist`` are divided into a left and a right
        ``proba.Multinomial``.

        Parameters:
            target_dist: Class distribution of the samples to split. It is
                iterated for its labels and queried through ``pmf`` and
                ``n_samples``.

        Yields:
            Tuples ``(LT, threshold, left_dist, right_dist)``. Nothing is
            yielded when the threshold range is empty.
        """
        hists = self.hists.values()

        # Thresholds span from the smallest right edge among the first bins
        # to the smallest right edge among the last bins.
        # NOTE(review): the upper bound is a ``min`` as well, not a ``max``;
        # confirm that this is intended.
        lower = min(hist[0].right for hist in hists)
        upper = min(hist[-1].right for hist in hists)

        # With a single observed value the range collapses and no split can be
        # proposed. The ``yield`` in the loop below already makes this function
        # a generator, so a bare ``return`` just ends the iteration.
        if lower >= upper:
            return

        longest = max(len(hist) for hist in hists)
        n_thresholds = min(self.n_splits, longest - 1)
        thresholds = list(
            decimal_range(start=lower, stop=upper, num=n_thresholds)
        )

        # One iterator per class, advanced exactly once per threshold below
        # (presumably yielding the CDF value at each threshold in turn).
        cdf_iters = {
            label: hist.iter_cdf(thresholds)
            for label, hist in self.hists.items()
        }

        for threshold in thresholds:
            left_mass = {}
            right_mass = {}

            for label in target_dist:
                # P(x < threshold | label); a label without a histogram puts
                # all of its mass on the right-hand side.
                if label in cdf_iters:
                    p_left = next(cdf_iters[label])
                else:
                    p_left = 0.
                p_label = target_dist.pmf(label)  # P(label)

                # n_samples * P(label), shared between both sides
                weight = target_dist.n_samples * p_label
                left_mass[label] = weight * p_left
                right_mass[label] = weight * (1 - p_left)

            left_dist = proba.Multinomial(left_mass)
            right_dist = proba.Multinomial(right_mass)

            yield LT, threshold, left_dist, right_dist
Exemple #2
0
    def enumerate_splits(self, target_dist: proba.Multinomial):
        """Yield one ``x == category`` split candidate per observed category.

        For every category found in ``self.P_xy``, the samples summarised by
        ``target_dist`` are divided into the part matching the category (left)
        and the part not matching it (right).

        Parameters:
            target_dist: Class distribution of the samples to split. It is
                iterated for its labels and queried through ``pmf`` and
                ``n_samples``.

        Yields:
            Tuples ``(EQ, category, left_dist, right_dist)`` where both
            distributions are ``proba.Multinomial`` instances. Nothing is
            yielded when fewer than two categories have been observed.
        """
        # Merge the categories seen under every class. The per-class key views
        # have to go through ``union``: unpacking them straight into
        # ``set(...)`` raises ``TypeError`` as soon as two or more classes are
        # present, because ``set`` accepts a single iterable only.
        categories = set().union(*(p_x.keys() for p_x in self.P_xy.values()))

        # There have to be at least two categories for a split to be possible.
        # The ``yield`` in the loop below already makes this function a
        # generator, so a bare ``return`` just ends the iteration.
        if len(categories) < 2:
            return

        for cat in categories:

            l_dist = {}
            r_dist = {}

            for y in target_dist:
                p_xy = self.P_xy[y].pmf(cat)  # P(cat | y)
                p_y = target_dist.pmf(y)  # P(y)
                # n_samples * P(y) * P(cat | y): mass of class y that matches
                # the category; the remainder goes to the right-hand side.
                l_dist[y] = target_dist.n_samples * p_y * p_xy
                r_dist[y] = target_dist.n_samples * p_y * (1. - p_xy)

            l_dist = proba.Multinomial(l_dist)
            r_dist = proba.Multinomial(r_dist)

            yield EQ, cat, l_dist, r_dist
Exemple #3
0
    def __init__(
        self,
        criterion='gini',
        patience=250,
        max_depth=5,
        min_split_gain=0.,
        min_child_samples=20,
        confidence=1e-10,
        tie_threshold=5e-2,
        n_split_points=30,
        max_bins=60,
        curtail_under=10,
    ):
        """Store the hyperparameters and create an empty root leaf.

        Every argument is kept unchanged on the instance under its own name.
        ``criterion`` is additionally used as a key into ``CRITERIA_CLF`` to
        obtain ``criterion_func``; an unknown key raises ``KeyError``.
        """
        # Hyperparameters, stored verbatim.
        self.criterion = criterion
        self.patience = patience
        self.max_depth = max_depth
        self.min_split_gain = min_split_gain
        self.min_child_samples = min_child_samples
        self.confidence = confidence
        self.tie_threshold = tie_threshold
        self.n_split_points = n_split_points
        self.max_bins = max_bins
        self.curtail_under = curtail_under

        # Resolve the criterion name to the matching entry of CRITERIA_CLF.
        self.criterion_func = CRITERIA_CLF[criterion]

        # The root is built last: it receives ``self`` as its tree, so all
        # attributes above are already in place when the leaf is constructed.
        empty_dist = proba.Multinomial()
        self.root = leaf.Leaf(depth=0, tree=self, target_dist=empty_dist)
Exemple #4
0
 def _make_leaf_dist(self):
     """Return a new ``proba.Multinomial`` constructed without arguments.

     Judging by the name, this is the distribution handed to a leaf.
     """
     fresh_dist = proba.Multinomial()
     return fresh_dist