Example #1
def test_init_raw_predictions_shapes():
    # Make sure get_init_raw_predictions returns float64 arrays with shape
    # (n_samples, K) where K is 1 for binary classification and regression, and
    # K = n_classes for multiclass classification
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)
    for loss in (LeastSquaresError(n_classes=1),
                 LeastAbsoluteError(n_classes=1),
                 QuantileLossFunction(n_classes=1),
                 HuberLossFunction(n_classes=1)):
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    y = rng.randint(0, 2, size=n_samples)
    for loss in (BinomialDeviance(n_classes=2), ExponentialLoss(n_classes=2)):
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, n_classes)
        assert raw_predictions.dtype == np.float64
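The shape contract exercised above can also be checked directly. The sketch below is a minimal, hedged example: it assumes the loss classes live in the private module sklearn.ensemble._gb_losses, and both that path and the n_classes constructor argument differ between scikit-learn versions.

import numpy as np
from sklearn.ensemble._gb_losses import LeastSquaresError  # private API, version-dependent

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 5))
y = rng.normal(size=100)

loss = LeastSquaresError(n_classes=1)
init_estimator = loss.init_estimator().fit(X, y)
raw = loss.get_init_raw_predictions(y, init_estimator)
print(raw.shape, raw.dtype)  # expected: (100, 1) float64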
Example #2
def test_sample_weight_smoke():
    rng = check_random_state(13)
    y = rng.rand(100)
    pred = rng.rand(100)

    # least squares
    loss = LeastSquaresError(1)
    loss_wo_sw = loss(y, pred)
    loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
    assert_almost_equal(loss_wo_sw, loss_w_sw)
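The smoke test relies on the identity that unit sample weights leave the loss value unchanged. A small pure-NumPy sketch makes that identity explicit for least squares; the helper weighted_ls_loss is hypothetical and not part of scikit-learn.

import numpy as np

def weighted_ls_loss(y, pred, sample_weight=None):
    # Weighted mean squared error; with sample_weight=None it is the plain MSE.
    if sample_weight is None:
        return np.mean((y - pred) ** 2)
    return np.sum(sample_weight * (y - pred) ** 2) / np.sum(sample_weight)

rng = np.random.RandomState(13)
y, pred = rng.rand(100), rng.rand(100)
assert np.isclose(weighted_ls_loss(y, pred),
                  weighted_ls_loss(y, pred, np.ones_like(y)))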
Example #3
def test_init_raw_predictions_values():
    # Make sure get_init_raw_predictions() returns the expected values for
    # each loss.
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    # Least squares loss
    loss = LeastSquaresError(n_classes=1)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    # Make sure baseline prediction is the mean of all targets
    assert_almost_equal(raw_predictions, y.mean())

    # Least absolute and huber loss
    for Loss in (LeastAbsoluteError, HuberLossFunction):
        loss = Loss(n_classes=1)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        # Make sure baseline prediction is the median of all targets
        assert_almost_equal(raw_predictions, np.median(y))

    # Quantile loss
    for alpha in (.1, .5, .9):
        loss = QuantileLossFunction(n_classes=1, alpha=alpha)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        # Make sure baseline prediction is the alpha-quantile of all targets
        assert_almost_equal(raw_predictions, np.percentile(y, alpha * 100))

    y = rng.randint(0, 2, size=n_samples)

    # Binomial deviance
    loss = BinomialDeviance(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, np.log(p / (1 - p)))

    # Exponential loss
    loss = ExponentialLoss(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))

    # Multinomial deviance loss
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        for k in range(n_classes):
            p = (y == k).mean()
            assert_almost_equal(raw_predictions[:, k], np.log(p))
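The link-function reasoning in the comments above can be verified numerically. The values below are arbitrary illustrations, not taken from the test: the logit baseline is undone by the sigmoid, and the multinomial baseline log(p_k) is recovered by a softmax over the class priors.

import numpy as np

# Binary case: raw = log(p / (1 - p)) implies sigmoid(raw) == p.
p = 0.37  # arbitrary positive-class prior
raw = np.log(p / (1 - p))                           # logit link
assert np.isclose(1.0 / (1.0 + np.exp(-raw)), p)    # sigmoid inverse link

# Multinomial case: raw_k = log(p_k) implies softmax(raw) == priors.
priors = np.array([0.2, 0.3, 0.5])
raw_multi = np.log(priors)
assert np.allclose(np.exp(raw_multi) / np.exp(raw_multi).sum(), priors)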
Example #4
    def __init__(self,
                 nb_trees: int,
                 nb_trees_per_ensemble: int,
                 n_classes: Optional[int] = None,
                 max_depth: int = 6,
                 privacy_budget: float = 1.0,
                 learning_rate: float = 0.1,
                 max_leaves: Optional[int] = None,
                 min_samples_split: int = 2,
                 balance_partition: bool = True,
                 use_bfs: bool = False,
                 use_3_trees: bool = False,
                 cat_idx: Optional[List[int]] = None,
                 num_idx: Optional[List[int]] = None) -> None:
        """Initialize the GradientBoostingEnsemble class.

    Args:
      nb_trees (int): The total number of trees in the model.
      nb_trees_per_ensemble (int): The number of trees in each ensemble.
      n_classes (int): Optional. Number of classes; if None, the model performs
          regression, otherwise classification.
      max_depth (int): Optional. The maximum depth of the trees. Default is 6.
      privacy_budget (float): Optional. The privacy budget available for the
          model. Default is 1.0.
      learning_rate (float): Optional. The learning rate. Default is 0.1.
      max_leaves (int): Optional. The max number of leaf nodes for the trees.
          Tree will grow in a best-leaf first fashion until it contains
          max_leaves or until it reaches maximum depth, whichever comes first.
      min_samples_split (int): Optional. The minimum number of samples required
          to split an internal node. Default is 2.
      balance_partition (bool): Optional. Whether to balance the data partition
          used for training the trees. The default is True, meaning all trees
          within an ensemble will receive an equal number of training samples.
          If set to False, each tree will receive <x> samples where <x> is
          given in line 8 of the algorithm in the author's paper.
      use_bfs (bool): Optional. If max_leaves is specified, then this is
          automatically True. This will build the tree in a BFS fashion instead
          of DFS. Default is False.
      use_3_trees (bool): Optional. If True, only build trees that have 3
          nodes, then randomly assemble nb_trees trees from these sub-trees.
          Default is False.
      cat_idx (List): Optional. List of indices for categorical features.
      num_idx (List): Optional. List of indices for numerical features.
      """
        self.nb_trees = nb_trees
        self.nb_trees_per_ensemble = nb_trees_per_ensemble
        self.max_depth = max_depth
        self.privacy_budget = privacy_budget
        self.learning_rate = learning_rate
        self.max_leaves = max_leaves
        self.min_samples_split = min_samples_split
        self.balance_partition = balance_partition
        self.use_bfs = use_bfs
        self.use_3_trees = use_3_trees
        self.cat_idx = cat_idx
        self.num_idx = num_idx
        self.trees = []  # type: List[List[DifferentiallyPrivateTree]]
        # classification vs regression
        self.loss_ = (MultinomialDeviance(n_classes) if n_classes
                      else LeastSquaresError(1))  # type: LossFunction
        self.init_ = self.loss_.init_estimator()

        # Loss parameters
        self.l2_threshold = 1.0
        self.l2_lambda = 0.1

        # Initial score
        self.init_score = None

        if self.use_3_trees and self.use_bfs:
            # Since we're building 3-node trees it's the same anyways.
            self.use_bfs = False
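A hedged construction sketch for the class above; the argument values are purely illustrative and the class name GradientBoostingEnsemble is taken from the docstring.

# Regression setting: n_classes=None selects LeastSquaresError internally.
ensemble = GradientBoostingEnsemble(
    nb_trees=50,
    nb_trees_per_ensemble=10,
    n_classes=None,
    max_depth=6,
    privacy_budget=1.0,
    learning_rate=0.1,
    cat_idx=[0, 3],      # hypothetical categorical feature indices
    num_idx=[1, 2, 4],   # hypothetical numerical feature indices
)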