def test_init_raw_predictions_shapes():
    """Check shape and dtype of the initial raw predictions of every loss.

    get_init_raw_predictions must return a float64 array of shape
    (n_samples, K), with K = 1 for regression and binary classification and
    K = n_classes for multiclass classification.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 5))

    def _check(loss, targets, n_columns):
        # Fit the loss' own init estimator, then inspect its raw predictions.
        fitted = loss.init_estimator().fit(X, targets)
        raw = loss.get_init_raw_predictions(targets, fitted)
        assert raw.shape == (n_samples, n_columns)
        assert raw.dtype == np.float64

    # Regression losses: a single output column.
    y = rng.normal(size=n_samples)
    regression_losses = (
        LeastSquaresError(n_classes=1),
        LeastAbsoluteError(n_classes=1),
        QuantileLossFunction(n_classes=1),
        HuberLossFunction(n_classes=1),
    )
    for loss in regression_losses:
        _check(loss, y, 1)

    # Binary classification losses: still a single output column.
    y = rng.randint(0, 2, size=n_samples)
    binary_losses = (
        BinomialDeviance(n_classes=2),
        ExponentialLoss(n_classes=2),
    )
    for loss in binary_losses:
        _check(loss, y, 1)

    # Multiclass classification: one output column per class.
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        _check(MultinomialDeviance(n_classes=n_classes), y, n_classes)
def test_mdl_computation_weighted():
    """Check the weighted MultinomialDeviance value on a tiny fixed input."""
    loss = MultinomialDeviance(3)

    # Two samples, three classes, with the second sample weighted 3x.
    y_true = np.array([0, 1])
    raw_predictions = np.array([
        [1., -1., -.1],
        [-2., 1., 2.],
    ])
    weights = np.array([1, 3])

    expected_loss = 1.0909323
    computed_loss = loss(y_true, raw_predictions, weights)
    assert computed_loss == approx(expected_loss)
def test_multinomial_deviance(n_classes, n_samples):
    """Check multinomial deviance with and without sample weights."""
    rng = np.random.RandomState(13)
    y_true = rng.randint(0, n_classes, size=n_samples)

    # One-hot encode the targets and use them as the raw predictions.
    class_ids = np.arange(n_classes)
    y_pred = (y_true[:, np.newaxis] == class_ids).astype(np.float64)

    loss = MultinomialDeviance(n_classes)
    unweighted = loss(y_true, y_pred)
    assert unweighted > 0

    # Unit weights must give the same value as no weights at all.
    sample_weight = np.ones(n_samples)
    weighted = loss(y_true, y_pred, sample_weight=sample_weight)
    assert unweighted == pytest.approx(weighted)

    # Multinomial deviance uses a weighted average rather than a weighted
    # sum, so the value must stay the same when we divide the weights by 2.
    weighted = loss(y_true, y_pred, sample_weight=0.5 * sample_weight)
    assert unweighted == pytest.approx(weighted)
def test_init_raw_predictions_values():
    """Check that get_init_raw_predictions() returns the expected baseline.

    Each loss has its own baseline: mean, median, alpha-quantile, or a link
    function applied to the class prior.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    def _baseline(loss, targets):
        # Raw predictions of the loss' own init estimator fitted on targets.
        fitted = loss.init_estimator().fit(X, targets)
        return loss.get_init_raw_predictions(targets, fitted)

    # Least squares loss: the baseline is the mean of all targets.
    assert_almost_equal(_baseline(LeastSquaresError(n_classes=1), y),
                        y.mean())

    # Least absolute and huber loss: the baseline is the median of all
    # targets.
    for Loss in (LeastAbsoluteError, HuberLossFunction):
        assert_almost_equal(_baseline(Loss(n_classes=1), y), np.median(y))

    # Quantile loss: the baseline is the alpha-quantile of all targets.
    for alpha in (.1, .5, .9):
        quantile_loss = QuantileLossFunction(n_classes=1, alpha=alpha)
        assert_almost_equal(_baseline(quantile_loss, y),
                            np.percentile(y, alpha * 100))

    y = rng.randint(0, 2, size=n_samples)
    p = y.mean()
    log_odds = np.log(p / (1 - p))

    # Binomial deviance: the baseline equals link_function(p), where p is
    # the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    assert_almost_equal(_baseline(BinomialDeviance(n_classes=2), y),
                        log_odds)

    # Exponential loss: half of the log-odds.
    assert_almost_equal(_baseline(ExponentialLoss(n_classes=2), y),
                        .5 * log_odds)

    # Multinomial deviance loss: log of each class prior, column by column.
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        raw_predictions = _baseline(
            MultinomialDeviance(n_classes=n_classes), y)
        for class_id in range(n_classes):
            prior = (y == class_id).mean()
            assert_almost_equal(raw_predictions[:, class_id], np.log(prior))
def test_mdl_exception(n):
    """MultinomialDeviance must reject n_classes <= 2 with a ValueError."""
    expected_message = 'MultinomialDeviance requires more than 2 classes.'
    with pytest.raises(ValueError, match=expected_message):
        MultinomialDeviance(n)
def __init__(self,
             nb_trees: int,
             nb_trees_per_ensemble: int,
             n_classes: Optional[int] = None,
             max_depth: int = 6,
             privacy_budget: float = 1.0,
             learning_rate: float = 0.1,
             max_leaves: Optional[int] = None,
             min_samples_split: int = 2,
             balance_partition: bool = True,
             use_bfs: bool = False,
             use_3_trees: bool = False,
             cat_idx: Optional[List[int]] = None,
             num_idx: Optional[List[int]] = None) -> None:
    """Set up a GradientBoostingEnsemble.

    Args:
      nb_trees (int): Total number of trees in the model.
      nb_trees_per_ensemble (int): Number of trees in each ensemble.
      n_classes (int): Number of classes. A falsy value (None) selects
          regression, anything else selects classification.
      max_depth (int): Optional. Depth of the trees. Default is 6.
      privacy_budget (float): Optional. Privacy budget available for the
          model. Default is 1.0.
      learning_rate (float): Optional. The learning rate. Default is 0.1.
      max_leaves (int): Optional. Max number of leaf nodes for the trees.
          Tree will grow in a best-leaf first fashion until it contains
          max_leaves or until it reaches maximum depth, whichever comes
          first.
      min_samples_split (int): Optional. Minimum number of samples required
          to split an internal node. Default is 2.
      balance_partition (bool): Optional. Balance data repartition for
          training the trees. The default is True, meaning all trees within
          an ensemble will receive an equal amount of training samples. If
          set to False, each tree will receive <x> samples where <x> is
          given in line 8 of the algorithm in the author's paper.
      use_bfs (bool): Optional. If max_leaves is specified, then this is
          automatically True. This will build the tree in a BFS fashion
          instead of DFS. Default is False.
      use_3_trees (bool): Optional. If True, only build trees that have 3
          nodes, and then assemble nb_trees based on these sub-trees, at
          random. Default is False.
      cat_idx (List): Optional. List of indices for categorical features.
      num_idx (List): Optional. List of indices for numerical features.
    """
    # Ensemble layout.
    self.nb_trees = nb_trees
    self.nb_trees_per_ensemble = nb_trees_per_ensemble
    self.balance_partition = balance_partition

    # Tree growing parameters.
    self.max_depth = max_depth
    self.max_leaves = max_leaves
    self.min_samples_split = min_samples_split
    self.use_bfs = use_bfs
    self.use_3_trees = use_3_trees

    # Feature kinds.
    self.cat_idx = cat_idx
    self.num_idx = num_idx

    # Learning and privacy parameters.
    self.privacy_budget = privacy_budget
    self.learning_rate = learning_rate

    # One inner list of trees per boosting step.
    self.trees = []  # type: List[List[DifferentiallyPrivateTree]]

    # Classification when a number of classes is given, regression
    # otherwise.
    if n_classes:
        self.loss_ = MultinomialDeviance(n_classes)  # type: LossFunction
    else:
        self.loss_ = LeastSquaresError(1)
    self.init_ = self.loss_.init_estimator()

    # Loss parameters
    self.l2_threshold = 1.0
    self.l2_lambda = 0.1

    # Initial score, filled in during training.
    self.init_score = None

    # Since 3-node trees are being built, BFS and DFS are the same anyway.
    if self.use_3_trees and self.use_bfs:
        self.use_bfs = False
class GradientBoostingEnsemble:
    """Implement gradient boosting ensemble of trees.

    Attributes:
      nb_trees (int): The total number of trees in the model.
      nb_trees_per_ensemble (int): The number of trees in each ensemble.
      max_depth (int): The depth for the trees.
      privacy_budget (float): The privacy budget available for the model.
      learning_rate (float): The learning rate.
      l2_threshold (float): Threshold for the loss function. For the square
          loss function (default), this is 1.
      l2_lambda (float): Regularization parameter for l2 loss function. For
          the square loss function (default), this is 0.1.
      trees (List[List[DifferentiallyPrivateTree]]): A list of k-classes DP
          trees.
    """

    # pylint: disable=invalid-name, too-many-arguments, unused-variable

    def __init__(self,
                 nb_trees: int,
                 nb_trees_per_ensemble: int,
                 n_classes: Optional[int] = None,
                 max_depth: int = 6,
                 privacy_budget: float = 1.0,
                 learning_rate: float = 0.1,
                 max_leaves: Optional[int] = None,
                 min_samples_split: int = 2,
                 balance_partition: bool = True,
                 use_bfs: bool = False,
                 use_3_trees: bool = False,
                 cat_idx: Optional[List[int]] = None,
                 num_idx: Optional[List[int]] = None) -> None:
        """Initialize the GradientBoostingEnsemble class.

        Args:
          nb_trees (int): The total number of trees in the model.
          nb_trees_per_ensemble (int): The number of trees in each ensemble.
          n_classes (int): Number of classes. Triggers regression (None) vs
              classification.
          max_depth (int): Optional. The depth for the trees. Default is 6.
          privacy_budget (float): Optional. The privacy budget available for
              the model. Default is 1.0.
          learning_rate (float): Optional. The learning rate. Default is 0.1.
          max_leaves (int): Optional. The max number of leaf nodes for the
              trees. Tree will grow in a best-leaf first fashion until it
              contains max_leaves or until it reaches maximum depth,
              whichever comes first.
          min_samples_split (int): Optional. The minimum number of samples
              required to split an internal node. Default is 2.
          balance_partition (bool): Optional. Balance data repartition for
              training the trees. The default is True, meaning all trees
              within an ensemble will receive an equal amount of training
              samples. If set to False, each tree will receive <x> samples
              where <x> is given in line 8 of the algorithm in the author's
              paper.
          use_bfs (bool): Optional. If max_leaves is specified, then this is
              automatically True. This will build the tree in a BFS fashion
              instead of DFS. Default is False.
          use_3_trees (bool): Optional. If True, only build trees that have
              3 nodes, and then assemble nb_trees based on these sub-trees,
              at random. Default is False.
          cat_idx (List): Optional. List of indices for categorical
              features.
          num_idx (List): Optional. List of indices for numerical features.
        """
        self.nb_trees = nb_trees
        self.nb_trees_per_ensemble = nb_trees_per_ensemble
        self.max_depth = max_depth
        self.privacy_budget = privacy_budget
        self.learning_rate = learning_rate
        self.max_leaves = max_leaves
        self.min_samples_split = min_samples_split
        self.balance_partition = balance_partition
        self.use_bfs = use_bfs
        self.use_3_trees = use_3_trees
        self.cat_idx = cat_idx
        self.num_idx = num_idx
        self.trees = []  # type: List[List[DifferentiallyPrivateTree]]

        # classification vs regression
        self.loss_ = MultinomialDeviance(
            n_classes) if n_classes else LeastSquaresError(
                1)  # type: LossFunction
        self.init_ = self.loss_.init_estimator()

        # Loss parameters
        self.l2_threshold = 1.0
        self.l2_lambda = 0.1

        # Initial score
        self.init_score = None

        if self.use_3_trees and self.use_bfs:
            # Since we're building 3-node trees it's the same anyways.
            self.use_bfs = False

    def Train(self, X: np.array, y: np.array) -> 'GradientBoostingEnsemble':
        """Train the ensembles of gradient boosted trees.

        The data is split into a training part and a held-out part; a tree
        is only kept if it lowers the loss on the held-out part.

        Args:
          X (np.array): The features.
          y (np.array): The label.

        Returns:
          GradientBoostingEnsemble: A GradientBoostingEnsemble object.
        """
        # Init gradients
        self.init_.fit(X, y)
        self.init_score = self.loss_.get_init_raw_predictions(
            X, self.init_)  # (n_samples, K)
        update_gradients = True

        X_train, X_test, y_train, y_test = train_test_split(X, y)
        X, y = X_train, y_train

        # Number of ensembles in the model
        nb_ensembles = int(
            np.ceil(self.nb_trees / self.nb_trees_per_ensemble))

        # Privacy budget allocated to all trees in each ensemble
        tree_privacy_budget = np.divide(self.privacy_budget, nb_ensembles)
        # In multi-class classification the budget for each tree
        # is the same as for the whole K trees but halved
        # As each datapoint is only assigned to one class,
        # it only matters if it is assigned to the considered class or not
        # but not to which other
        # Thus it always remains 2 - independently of how many total classes
        # exists
        privacy_budget_per_tree = 2 if self.loss_.is_multi_class else 1
        tree_privacy_budget = np.divide(tree_privacy_budget,
                                        privacy_budget_per_tree)

        prev_score = np.inf

        # Train all trees
        for tree_index in range(self.nb_trees):
            # Compute sensitivity
            delta_g = 3 * np.square(self.l2_threshold)
            delta_v = min(
                self.l2_threshold / (1 + self.l2_lambda),
                2 * self.l2_threshold * math.pow(
                    (1 - self.learning_rate), tree_index))

            current_tree_for_ensemble = (
                tree_index % self.nb_trees_per_ensemble)
            if current_tree_for_ensemble == 0:
                # Initialize the dataset and the gradients
                X_ensemble = np.copy(X)
                y_ensemble = np.copy(y)
                prev_score = np.inf
                update_gradients = True
                # gradient initialization will happen later in the
                # per-class-loop

            # Compute the number of rows that the current tree will use for
            # training
            if self.balance_partition:
                # All trees will receive same amount of samples
                if self.nb_trees % self.nb_trees_per_ensemble == 0:
                    # Perfect split
                    number_of_rows = int(
                        len(X) / self.nb_trees_per_ensemble)
                else:
                    # Partitioning data across ensembles
                    if np.ceil(tree_index /
                               self.nb_trees_per_ensemble) == np.ceil(
                                   self.nb_trees /
                                   self.nb_trees_per_ensemble):
                        number_of_rows = int(
                            len(X) /
                            (self.nb_trees % self.nb_trees_per_ensemble))
                    else:
                        number_of_rows = int(
                            len(X) / self.nb_trees_per_ensemble) + int(
                                len(X) / (self.nb_trees %
                                          self.nb_trees_per_ensemble))
            else:
                # Line 8 of Algorithm 2 from the paper
                number_of_rows = int(
                    (len(X) * self.learning_rate * math.pow(
                        (1 - self.learning_rate),
                        current_tree_for_ensemble)) /
                    (1 - math.pow(
                        (1 - self.learning_rate),
                        self.nb_trees_per_ensemble)))

            # If using the formula from the algorithm, some trees may not
            # get samples. In that case we skip the tree and issue a
            # warning. This should hint the user to change its parameters
            # (likely the ensembles are too unbalanced)
            if number_of_rows == 0:
                logger.warning(
                    'The choice of trees per ensemble vs. the total number '
                    'of trees is not balanced properly; some trees will '
                    'not get any training samples. Try using '
                    'balance_partition=True or change your parameters.')
                continue

            # Select <number_of_rows> rows at random from the ensemble
            # dataset
            rows = np.random.randint(len(X_ensemble), size=number_of_rows)
            X_tree = X_ensemble[rows, :]
            y_tree = y_ensemble[rows]

            # train for each class a separate tree on the same rows.
            # In regression or binary classification, K has been set to one.
            k_trees = []  # type: List[DifferentiallyPrivateTree]
            for kth_tree in range(self.loss_.K):
                if tree_index == 0:
                    # First tree, start with initial scores (mean of labels)
                    gradients = self.ComputeGradientForLossFunction(
                        y, self.init_score[:len(y)], kth_tree)
                else:
                    # Update gradients of all training instances on loss l
                    if update_gradients:
                        gradients = self.ComputeGradientForLossFunction(
                            y_ensemble, self.Predict(X_ensemble),
                            kth_tree)  # type: ignore

                assert gradients is not None
                gradients_tree = gradients[rows]

                # Gradient based data filtering. The filtered data goes into
                # per-class variables: X_tree / y_tree must stay aligned
                # with `rows` for the next class, otherwise the boolean mask
                # of the next class no longer matches their length.
                norm_1_gradient = np.abs(gradients_tree)
                rows_gbf = norm_1_gradient <= self.l2_threshold
                X_tree_k = X_tree[rows_gbf, :]
                y_tree_k = y_tree[rows_gbf]
                gradients_tree = gradients_tree[rows_gbf]

                # Get back the original row index from the first filtering.
                # With several classes, the value from the last class is the
                # one used after the loop.
                selected_rows = rows[rows_gbf] if tree_index > 0 else rows

                # Fit a differentially private decision tree
                tree = DifferentiallyPrivateTree(
                    tree_index,
                    self.learning_rate,
                    self.l2_threshold,
                    self.l2_lambda,
                    tree_privacy_budget,
                    delta_g,
                    delta_v,
                    self.loss_,
                    max_depth=self.max_depth,
                    max_leaves=self.max_leaves,
                    min_samples_split=self.min_samples_split,
                    use_bfs=self.use_bfs,
                    use_3_trees=self.use_3_trees,
                    cat_idx=self.cat_idx,
                    num_idx=self.num_idx)
                # in multi-class classification, the target has to be binary
                # as each tree is a per-class regressor
                y_target = ((y_tree_k == kth_tree).astype(np.float64)
                            if self.loss_.is_multi_class else y_tree_k)
                tree.Fit(X_tree_k, y_target, gradients_tree)

                # Add the tree to its corresponding ensemble
                k_trees.append(tree)
            self.trees.append(k_trees)

            # i.e. mse or deviance
            score = self.loss_(y_test, self.Predict(X_test))

            if score >= prev_score:
                # This tree doesn't improve overall prediction quality,
                # removing from model.
                # Not reusing gradients in multi-class as they are
                # class-dependent.
                update_gradients = self.loss_.is_multi_class
                self.trees.pop()
            else:
                logger.info('Tree %d kept, validation score: %s',
                            tree_index, score)
                update_gradients = True
                prev_score = score
                # Remove the selected rows from the ensemble's dataset.
                # The instances that were filtered out by GBF can still be
                # used for the training of the next trees.
                # This only happens when the tree is kept: reused gradients
                # are indexed against the unchanged ensemble dataset.
                X_ensemble = np.delete(X_ensemble, selected_rows, axis=0)
                y_ensemble = np.delete(y_ensemble, selected_rows)

        if self.use_3_trees:
            self.Combine_3_trees(self.trees)
        return self

    def Combine_3_trees(
            self, k_trees: List[List['DifferentiallyPrivateTree']]) -> None:
        """Combine 3-trees together to construct bigger decision trees.

        Each 3-node tree is extended, level by level, by grafting the root
        split of other 3-node trees of the same class onto its children.
        The tree to graft is picked with the exponential mechanism.

        Args:
          k_trees (List[List[DifferentiallyPrivateTree]]): A k-list of
              3-trees.
        """
        self.trees = []  # Re-init final predictions trees
        # iterate through the ensemble
        for index, k_three_tree in enumerate(k_trees):
            k_trees_ = []  # type: List[DifferentiallyPrivateTree]
            # iterate through the classes
            for k, three_tree in enumerate(k_three_tree):
                # select a whole ensemble per class; continue as if there
                # were only one class
                candidates = list(np.copy([i[k] for i in k_trees]))
                candidates.pop(index)
                if len(candidates) == 0:
                    continue

                queue_children = Queue()  # type: Queue['DecisionNode']
                queue_children.put(
                    three_tree.root_node.left_child)  # type: ignore
                queue_children.put(
                    three_tree.root_node.right_child)  # type: ignore
                depth = 1
                privacy_budget_for_node = np.around(np.divide(
                    three_tree.privacy_budget / 2,
                    three_tree.max_depth + 1), decimals=7)

                while not queue_children.empty():
                    if depth == self.max_depth or len(candidates) == 0:
                        break

                    left_child = queue_children.get()
                    right_child = queue_children.get()
                    for child in [left_child, right_child]:
                        if (len(candidates) == 0 or not child
                                or not child.X.any()):  # type: ignore
                            continue

                        # Apply exponential mechanism to find sub 3-node
                        # tree
                        probabilities = []
                        max_gain = -np.inf
                        for candidate_index, candidate in enumerate(
                                candidates):
                            if not candidate.root_node.X.any():
                                continue
                            # Compute distance between the two nodes. Lower
                            # is better.
                            gain = np.linalg.norm(
                                np.matmul(np.transpose(child.X), child.X) -
                                np.matmul(
                                    np.transpose(candidate.root_node.X),
                                    candidate.root_node.X))
                            exp_gain = (privacy_budget_for_node * gain) / (
                                2. * three_tree.delta_g)
                            if exp_gain > max_gain:
                                max_gain = exp_gain
                            prob = {
                                'candidate_index': candidate_index,
                                'index': candidate.root_node.index,
                                'value': candidate.root_node.value,
                                'gain': exp_gain
                            }
                            probabilities.append(prob)

                        candidate = ExponentialMechanism(
                            probabilities, max_gain, reverse=True)
                        # Explicit None checks: feature index 0 and split
                        # value 0.0 are valid splits and must not be
                        # discarded as "missing".
                        if (not candidate or candidate['index'] is None
                                or candidate['value'] is None):
                            continue

                        candidates.pop(candidate['candidate_index'])
                        split_index = candidate['index']
                        split_value = candidate['value']
                        left_, right_ = self.SplitNode(
                            child, split_index, split_value,
                            three_tree.privacy_budget, index,
                            three_tree.delta_v)
                        queue_children.put(left_)
                        queue_children.put(right_)
                    depth += 1
                k_trees_.append(three_tree)
            self.trees.append(k_trees_)

        # Nothing could be combined: fall back to the trees as given.
        if not self.trees or self.trees == [[]]:
            self.trees = k_trees

    def SplitNode(
            self, node: 'DecisionNode', index: int, value: float,
            tree_privacy_budget: float, tree_index: int,
            delta_v: float) -> Tuple['DecisionNode', 'DecisionNode']:
        """Split children of a 3-nodes tree based on the (index, value) pair.

        Args:
          node (DecisionNode): The node to split.
          index (int): The feature's index on which to split the node.
          value (float): The feature's value on which to split the node.
          tree_privacy_budget (float): The privacy budget for the current
              tree.
          tree_index (int): The index of the tree.
          delta_v (float): The loss function's sensitivity for the tree.

        Returns:
          Tuple: Children created after the split.
        """
        assert node.X is not None
        assert node.y is not None
        assert node.gradients is not None

        # Split indices of instances from the node's dataset
        lhs_op, rhs_op = self.GetOperators(index)
        lhs = np.where(lhs_op(node.X[:, index], value))[0]
        rhs = np.where(rhs_op(node.X[:, index], value))[0]

        # Compute the associated predictions
        lhs_prediction = ComputePredictions(node.gradients[lhs],
                                            node.y[lhs], self.loss_,
                                            self.l2_lambda)
        rhs_prediction = ComputePredictions(node.gradients[rhs],
                                            node.y[rhs], self.loss_,
                                            self.l2_lambda)

        # Mark current node as split node and not leaf node
        node.prediction = None
        node.index = index
        node.value = value

        # Add children to node
        node.left_child = DecisionNode(X=node.X[lhs],
                                       y=node.y[lhs],
                                       prediction=lhs_prediction,
                                       gradients=node.gradients[lhs])
        node.right_child = DecisionNode(X=node.X[rhs],
                                        y=node.y[rhs],
                                        prediction=rhs_prediction,
                                        gradients=node.gradients[rhs])

        # Apply Geometry leaf clipping
        ClipLeaves([node.left_child, node.right_child], self.l2_threshold,
                   self.learning_rate, tree_index)

        # Add noise to the leaf predictions
        laplace_scale = delta_v / tree_privacy_budget / 2
        AddLaplacianNoise([node.left_child, node.right_child],
                          laplace_scale)

        # Shrink by learning rate
        Shrink([node.left_child, node.right_child], self.learning_rate)

        return node.left_child, node.right_child

    def Predict(self, X: np.array) -> np.array:
        """Predict values from the ensemble of gradient boosted trees.

        See https://github.com/microsoft/LightGBM/issues/1778.

        When the model holds no tree, the initial scores alone are
        returned. When X has more rows than the stored initial scores, the
        initial scores are recomputed for X from the fitted init estimator.

        Args:
          X (np.array): The dataset for which to predict values.

        Returns:
          np.array of shape (n_samples, K): The predictions.
        """
        assert self.init_score is not None
        n_samples = len(X)
        if len(self.init_score) >= n_samples:
            init_score = self.init_score[:n_samples]
        else:
            # Slicing would give too few rows and break the addition below.
            init_score = self.loss_.get_init_raw_predictions(X, self.init_)

        if not self.trees:
            # np.sum over an empty list is a 0-d scalar, which has no
            # length. Return a copy so callers cannot alter the stored
            # scores.
            return np.array(init_score)

        # sum across the ensemble per class
        predictions = np.sum([[tree.Predict(X) for tree in k_trees]
                              for k_trees in self.trees], axis=0).T
        return np.add(init_score, predictions)

    def PredictLabels(self, X: np.ndarray) -> np.ndarray:
        """Predict labels out of the raw prediction values of `Predict`.

        Only defined for classification tasks.

        Args:
          X (np.ndarray): The dataset for which to predict labels.

        Returns:
          np.ndarray: The label predictions.

        Raises:
          ValueError: If the model was not built with a MultinomialDeviance
              loss, i.e. it is a regression model.
        """
        if not isinstance(self.loss_, MultinomialDeviance):
            raise ValueError("Labels are not defined for regression tasks.")

        raw_predictions = self.Predict(X)
        encoded_labels = self.loss_._raw_prediction_to_decision(
            raw_predictions)
        return encoded_labels

    def ComputeGradientForLossFunction(self, y: np.array, y_pred: np.array,
                                       k: int) -> np.array:
        """Compute the gradient of the loss function.

        Args:
          y (np.array): The true values.
          y_pred (np.array): The predictions.
          k (int): the class.

        Returns:
          (np.array): The gradient of the loss function.
        """
        if self.loss_.is_multi_class:
            # One-vs-rest target for class k.
            y = (y == k).astype(np.float64)
        # sklearn's impl is using the negative gradient (i.e. y - F).
        # Here the positive gradient is used though
        return -self.loss_.negative_gradient(y, y_pred, k=k)

    def GetOperators(self, index: int) -> Tuple[Any, Any]:
        """Return operators to use to split a node's dataset.

        Args:
          index (int): The index for the feature to split the data on.

        Returns:
          Tuple[Any, Any]: The operators to use, as a (left, right) pair:
              (eq, ne) for a categorical feature, (lt, ge) otherwise.
        """
        if self.cat_idx and index in self.cat_idx:
            # Categorical feature
            return operator.eq, operator.ne
        # Numerical feature
        return operator.lt, operator.ge