Beispiel #1
0
    def __init__(self,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 subsample=1.0,
                 random_state=1234,
                 n_jobs=-1,
                 **kwarg):
        """Configure the tree and hand its split/leaf rules to ``Bonsai``.

        Args:
            max_depth: a branch whose ``"depth"`` is at least this value is
                treated as a leaf.
            min_samples_split: a branch whose ``"n_samples"`` is below this
                value is treated as a leaf.
            min_samples_leaf: a candidate split is considered only when
                columns 3 and 6 of the candidate table are both strictly
                greater than this value.
            subsample: forwarded unchanged to ``Bonsai.__init__``.
            random_state: forwarded unchanged to ``Bonsai.__init__``.
            n_jobs: forwarded unchanged to ``Bonsai.__init__``.
            **kwarg: accepted but not used by this constructor.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

        def is_leaf(branch, branch_parent):
            """Return True when the branch must not be split further."""
            if branch["depth"] >= self.max_depth:
                return True
            if branch["n_samples"] < self.min_samples_split:
                return True
            return False

        def find_split(avc):
            """Return ``{"selected": row}`` for the best-scoring row, or None.

            None is returned when the table is empty, either on entry or
            after the minimum-leaf-size filter.
            """
            if not avc.shape[0]:
                return None

            # Columns 3 and 6 are used as the left/right sample counts.
            threshold = self.min_samples_leaf
            keep = (avc[:, 3] > threshold) & (avc[:, 6] > threshold)
            avc = avc[keep, :]
            if not avc.shape[0]:
                return None

            count_l, count_r = avc[:, 3], avc[:, 6]
            # Columns 4 and 7 divided by the counts give the per-side means
            # (presumably of the target -- confirm against Bonsai's layout).
            mean_gap = avc[:, 4] / count_l - avc[:, 7] / count_r
            weight = count_l * count_r / (count_l + count_r)
            # Friedman-style score: n_l * n_r / (n_l + n_r) * (mean gap)^2.
            score = weight * (mean_gap * mean_gap)

            # Last index of the ascending sort is the highest-scoring row.
            winner = np.argsort(score)[-1]
            return {"selected": avc[winner, :]}

        Bonsai.__init__(
            self,
            find_split,
            is_leaf,
            subsample=subsample,
            random_state=random_state,
            n_jobs=n_jobs,
            z_type="M2",
        )
Beispiel #2
0
    def __init__(self,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_varsum_decrease=0.0,
                 subsample=1.0,
                 random_state=1234,
                 **kwarg):
        """Configure the regression tree and register its rules with ``Bonsai``.

        Args:
            max_depth: a branch whose ``"depth"`` is at least this value is
                treated as a leaf.
            min_samples_split: a branch whose ``"n_samples"`` is below this
                value is treated as a leaf.
            min_samples_leaf: a candidate split is considered only when
                columns 3 and 6 of the candidate table are both strictly
                greater than this value.
            min_varsum_decrease: when the parent carries a ``"varsum"``
                entry, a branch whose ``"varsum"`` is less than this much
                below the parent's is treated as a leaf.
            subsample: forwarded unchanged to ``Bonsai.__init__``.
            random_state: forwarded unchanged to ``Bonsai.__init__``.
            **kwarg: accepted but not used by this constructor.
        """
        self.regression = True
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_varsum_decrease = min_varsum_decrease

        def find_split(avc):
            """Return the candidate minimizing the count-weighted variance sum.

            Returns None when the table is empty, either on entry or after
            the minimum-leaf-size filter.
            """
            if avc.shape[0] == 0:
                return None

            # Columns 3 and 6 are used as the left/right sample counts.
            valid_splits = np.logical_and(avc[:, 3] > self.min_samples_leaf,
                                          avc[:, 6] > self.min_samples_leaf)
            avc = avc[valid_splits, :]

            if avc.shape[0] == 0:
                return None

            n_l = avc[:, 3]
            n_r = avc[:, 6]
            mu_l = avc[:, 4] / n_l
            mu_r = avc[:, 7] / n_r
            M2_l = avc[:, 5]
            M2_r = avc[:, 8]
            # Variance as E[y^2] - E[y]^2; this treats columns 5 and 8 as
            # per-side sums of squares (TODO confirm against Bonsai).
            var_l = M2_l / n_l - mu_l * mu_l
            var_r = M2_r / n_r - mu_r * mu_r
            varsum = var_l * n_l + var_r * n_r

            # argsort (not argmin) is kept on purpose so that tie-breaking
            # and NaN ordering stay exactly as before.
            best_idx = np.argsort(varsum)[0]

            # NOTE(review): the "varsum@l"/"varsum@r" entries hold the
            # per-side variances, not the count-weighted terms that make up
            # `varsum` above. Confirm which one the consumer expects.
            return {
                "selected": avc[best_idx, :],  # required for Bonsai
                "varsum@l": var_l[best_idx],  # required for RegTree
                "varsum@r": var_r[best_idx],  # required for RegTree
            }

        def is_leaf(branch, branch_parent):
            """Return True when the branch must not be split further."""
            # Without a parent "varsum" the default sits above the threshold,
            # so the decrease test below cannot trigger.
            varsum_dec = 1.0 + self.min_varsum_decrease
            if "varsum" in branch_parent:
                varsum_dec = branch_parent["varsum"] - branch["varsum"]

            if branch["depth"] >= self.max_depth:
                return True
            if branch["n_samples"] < self.min_samples_split:
                return True
            return bool(varsum_dec < self.min_varsum_decrease)

        Bonsai.__init__(
            self,
            find_split,
            is_leaf,
            subsample=subsample,
            random_state=random_state,
            z_type="M2",
        )
Beispiel #3
0
    def __init__(self,
                 alpha=1.0,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 **kwargs):
        """Configure the tree and register its split/leaf rules with ``Bonsai``.

        Args:
            alpha: selects the gain formula used to rank candidate splits.
                Exactly 1.0 uses the ``p * log(p)`` form, exactly 0.0 uses
                the ``-log(p)`` form, and any other value uses the power
                form divided by ``alpha`` and ``(1 - alpha)``.
            max_depth: a branch whose ``"depth"`` is at least this value is
                treated as a leaf.
            min_samples_split: a branch whose ``"n_samples"`` is below this
                value is treated as a leaf.
            min_samples_leaf: a candidate split is considered only when
                columns 3 and 6 of the candidate table are both strictly
                greater than ``max(min_samples_leaf, 1)``.
            **kwargs: forwarded unchanged to ``Bonsai.__init__``.
        """
        self.alpha = alpha
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

        def _bounded(prob):
            """Clamp ``prob`` in place to [PRECISION, 1 - PRECISION]."""
            prob[prob < PRECISION] = PRECISION
            prob[prob > 1.0 - PRECISION] = 1.0 - PRECISION
            return prob

        def is_leaf(branch, branch_parent):
            """Return True when the branch must not be split further."""
            if branch["depth"] >= self.max_depth:
                return True
            if branch["n_samples"] < self.min_samples_split:
                return True
            return False

        def find_split(avc):
            """Return ``{"selected": row}`` for the highest-gain row, or None.

            None is returned when the table is empty, either on entry or
            after the minimum-leaf-size filter.
            """
            if not avc.shape[0]:
                return None

            # Columns 3 and 6 are used as the left/right sample counts.
            floor = max(self.min_samples_leaf, 1)
            keep = (avc[:, 3] > floor) & (avc[:, 6] > floor)
            avc = avc[keep, :]
            if not avc.shape[0]:
                return None

            cnt_l, cnt_r = avc[:, 3], avc[:, 6]

            # Smoothed per-side rates: (column + 1) / (count + 2).
            rate_l = (avc[:, 4] + 1.0) / (cnt_l + 2.0)
            rate_r = (avc[:, 7] + 1.0) / (cnt_r + 2.0)

            # Share of the samples falling on each side.
            share_l = cnt_l / (cnt_l + cnt_r)
            share_r = cnt_r / (cnt_l + cnt_r)

            # Keep the rates away from 0 and 1 before taking logs/powers.
            rate_l = _bounded(rate_l)
            rate_r = _bounded(rate_r)

            sides = ((share_l, rate_l), (share_r, rate_r))
            gain = np.zeros(avc.shape[0])

            if self.alpha == 1.0:  # Information Gain
                for share, rate in sides:
                    gain += share * rate * np.log(rate)
                    gain += share * (1.0 - rate) * np.log(1.0 - rate)
            elif self.alpha == 0.0:
                for share, rate in sides:
                    gain -= share * np.log(rate)
                    gain -= share * np.log(1.0 - rate)
            else:
                for share, rate in sides:
                    gain -= share * np.power(rate, self.alpha)
                    gain -= share * np.power(1.0 - rate, self.alpha)
                gain = gain / self.alpha / (1.0 - self.alpha)

            # Last index of the ascending sort is the highest-gain row.
            winner = np.argsort(gain)[-1]
            return {"selected": avc[winner, :]}

        Bonsai.__init__(self, find_split, is_leaf, z_type="M2", **kwargs)
Beispiel #4
0
    def __init__(
            self,
            max_depth=5,
            min_samples_split=2,
            min_samples_leaf=1,
            subsample=1.0,
            reg_lambda=0.1,  # regularization
            random_state=1234,
            distribution="gaussian",
            **kwarg):
        """Configure the tree and register its split/leaf rules with ``Bonsai``.

        Args:
            max_depth: a branch whose ``"depth"`` is at least this value is
                treated as a leaf.
            min_samples_split: a branch whose ``"n_samples"`` is below this
                value is treated as a leaf.
            min_samples_leaf: a candidate split is considered only when
                columns 3 and 6 of the candidate table are both strictly
                greater than this value.
            subsample: forwarded unchanged to ``Bonsai.__init__``.
            reg_lambda: constant added to every denominator in the split
                objective and in the per-side values.
            random_state: forwarded unchanged to ``Bonsai.__init__``.
            distribution: ``"bernoulli"`` takes the denominators from
                columns 5 and 8; any other value takes them from columns 3
                and 6.
            **kwarg: accepted but not used by this constructor.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.reg_lambda = reg_lambda
        self.distribution = distribution

        def is_leaf(branch, branch_parent):
            """Return True when the branch must not be split further."""
            if branch["depth"] >= self.max_depth:
                return True
            if branch["n_samples"] < self.min_samples_split:
                return True
            return False

        def find_split(avc):
            """Return the candidate row with the largest objective, or None.

            The result also carries the regularized per-side values under
            ``"y@l"`` and ``"y@r"``. None is returned when the table is
            empty, either on entry or after the minimum-leaf-size filter.
            """
            if not avc.shape[0]:
                return None

            threshold = self.min_samples_leaf
            keep = (avc[:, 3] > threshold) & (avc[:, 6] > threshold)
            avc = avc[keep, :]
            if not avc.shape[0]:
                return None

            # Pick which pair of columns supplies the denominators.
            if self.distribution == "bernoulli":
                col_l, col_r = 5, 8
            else:
                col_l, col_r = 3, 6
            denom_l = avc[:, col_l] + self.reg_lambda
            denom_r = avc[:, col_r] + self.reg_lambda

            grad_l = avc[:, 4]
            grad_r = avc[:, 7]

            # Objective: g_l^2 / (h_l + lambda) + g_r^2 / (h_r + lambda).
            objective = grad_l * grad_l / denom_l + grad_r * grad_r / denom_r

            value_l = grad_l / denom_l
            value_r = grad_r / denom_r

            # Last index of the ascending sort is the largest objective.
            winner = np.argsort(objective)[-1]

            return {
                "selected": avc[winner, :],
                "y@l": value_l[winner],
                "y@r": value_r[winner],
            }

        Bonsai.__init__(
            self,
            find_split,
            is_leaf,
            subsample=subsample,
            random_state=random_state,
            z_type="Hessian",
        )