def test_init_raw_predictions_shapes():
    """Check that get_init_raw_predictions returns float64 arrays of shape
    (n_samples, K), where K is 1 for regression and binary classification
    and K = n_classes for multiclass classification."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    regression_losses = (
        LeastSquaresError(n_classes=1),
        LeastAbsoluteError(n_classes=1),
        QuantileLossFunction(n_classes=1),
        HuberLossFunction(n_classes=1),
    )
    for loss in regression_losses:
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    # Binary classification: still a single raw-prediction column.
    y = rng.randint(0, 2, size=n_samples)
    binary_losses = (BinomialDeviance(n_classes=2), ExponentialLoss(n_classes=2))
    for loss in binary_losses:
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    # Multiclass: one raw-prediction column per class.
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, n_classes)
        assert raw_predictions.dtype == np.float64
def test_sample_weight_smoke():
    """Smoke test: a vector of unit sample weights must give the same
    least-squares loss value as passing no weights at all."""
    rng = check_random_state(13)
    y = rng.rand(100)
    pred = rng.rand(100)

    # least squares
    loss = LeastSquaresError(1)
    unit_weights = np.ones(pred.shape[0], dtype=np.float32)
    loss_without_weights = loss(y, pred)
    loss_with_weights = loss(y, pred, unit_weights)
    assert_almost_equal(loss_without_weights, loss_with_weights)
def test_init_raw_predictions_values():
    """Check that get_init_raw_predictions() returns the expected baseline
    value for every loss function."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    # Least squares: the baseline prediction is the mean of the targets.
    loss = LeastSquaresError(n_classes=1)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    assert_almost_equal(raw_predictions, y.mean())

    # Least absolute error and Huber: the baseline is the median.
    for Loss in (LeastAbsoluteError, HuberLossFunction):
        loss = Loss(n_classes=1)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert_almost_equal(raw_predictions, np.median(y))

    # Quantile loss: the baseline is the alpha-quantile of the targets.
    for alpha in (.1, .5, .9):
        loss = QuantileLossFunction(n_classes=1, alpha=alpha)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert_almost_equal(raw_predictions, np.percentile(y, alpha * 100))

    y = rng.randint(0, 2, size=n_samples)

    # Binomial deviance: the baseline must be link_function(p), where p is
    # the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    #     p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # so we want raw_prediction = link_function(p) = log(p / (1 - p)).
    loss = BinomialDeviance(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, np.log(p / (1 - p)))

    # Exponential loss: the baseline is half the log-odds.
    loss = ExponentialLoss(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))

    # Multinomial deviance: each class column is the log of the class prior.
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        for k in range(n_classes):
            p = (y == k).mean()
            assert_almost_equal(raw_predictions[:, k], np.log(p))
def __init__(self,
             nb_trees: int,
             nb_trees_per_ensemble: int,
             n_classes: Optional[int] = None,
             max_depth: int = 6,
             privacy_budget: float = 1.0,
             learning_rate: float = 0.1,
             max_leaves: Optional[int] = None,
             min_samples_split: int = 2,
             balance_partition: bool = True,
             use_bfs: bool = False,
             use_3_trees: bool = False,
             cat_idx: Optional[List[int]] = None,
             num_idx: Optional[List[int]] = None) -> None:
    """Initialize the GradientBoostingEnsemble class.

    Args:
      nb_trees (int): The total number of trees in the model.
      nb_trees_per_ensemble (int): The number of trees in each ensemble.
      n_classes (int): Number of classes. Triggers regression (None) vs
          classification.
      max_depth (int): Optional. The depth for the trees. Default is 6.
      privacy_budget (float): Optional. The privacy budget available for
          the model. Default is 1.0.
      learning_rate (float): Optional. The learning rate. Default is 0.1.
      max_leaves (int): Optional. The max number of leaf nodes for the
          trees. Tree will grow in a best-leaf first fashion until it
          contains max_leaves or until it reaches maximum depth, whichever
          comes first.
      min_samples_split (int): Optional. The minimum number of samples
          required to split an internal node. Default is 2.
      balance_partition (bool): Optional. Balance data repartition for
          training the trees. The default is True, meaning all trees
          within an ensemble will receive an equal amount of training
          samples. If set to False, each tree will receive <x> samples
          where <x> is given in line 8 of the algorithm in the author's
          paper.
      use_bfs (bool): Optional. If max_leaves is specified, then this is
          automatically True. This will build the tree in a BFS fashion
          instead of DFS. Default is False.
      use_3_trees (bool): Optional. If True, only build trees that have 3
          nodes, and then assemble nb_trees based on these sub-trees, at
          random. Default is False.
      cat_idx (List): Optional. List of indices for categorical features.
      num_idx (List): Optional. List of indices for numerical features.
    """
    # Plain attribute copies of the constructor arguments.
    self.nb_trees = nb_trees
    self.nb_trees_per_ensemble = nb_trees_per_ensemble
    self.max_depth = max_depth
    self.privacy_budget = privacy_budget
    self.learning_rate = learning_rate
    self.max_leaves = max_leaves
    self.min_samples_split = min_samples_split
    self.balance_partition = balance_partition
    self.use_bfs = use_bfs
    self.use_3_trees = use_3_trees
    self.cat_idx = cat_idx
    self.num_idx = num_idx
    self.trees = []  # type: List[List[DifferentiallyPrivateTree]]

    # classification vs regression: a truthy n_classes selects the
    # multiclass deviance loss, otherwise fall back to least squares.
    if n_classes:
        self.loss_ = MultinomialDeviance(n_classes)  # type: LossFunction
    else:
        self.loss_ = LeastSquaresError(1)  # type: LossFunction
    self.init_ = self.loss_.init_estimator()

    # Loss parameters
    self.l2_threshold = 1.0
    self.l2_lambda = 0.1

    # Initial score
    self.init_score = None

    if self.use_3_trees and self.use_bfs:
        # Since we're building 3-node trees it's the same anyways.
        self.use_bfs = False