Code example #1
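Note: the snippets below are excerpts from scikit-learn's test_grower.py (under sklearn/ensemble/_hist_gradient_boosting/tests/) and omit their module-level setup. A plausible common preamble, inferred from the names the examples use (exact module paths and APIs vary across the scikit-learn versions these snapshots come from):

import numpy as np
import pytest
from pytest import approx
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import (
    G_H_DTYPE, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()

# _make_training_data, _check_children_consistency and _LOSSES are
# version-specific helpers defined in (or imported by) the test module
# itself and are not reproduced here.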
def test_regression_dataset(n_bins):
    X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
                           random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 10
    max_leaf_nodes = 30
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(
        binning_thresholds=mapper.bin_thresholds_)

    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map)
    assert r2_score(y_train, y_pred_train) > 0.82

    y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map)
    assert r2_score(y_test, y_pred_test) > 0.67
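A note on the gradient initialization above: for the least squares loss l(y, f) = 0.5 * (f - y)**2, the gradient at the initial raw prediction f = 0 is f - y = -y and the hessian is the constant 1 (hence the single-element hessians array). This is also why a tree that fits the training data exactly predicts -gradients, as asserted at the end of code example #12. A minimal numeric restatement:

y = np.array([1.0, -2.0, 3.0])
f = np.zeros_like(y)               # initial raw prediction
gradient = f - y                   # d/df of 0.5 * (f - y) ** 2; equals -y at f = 0
hessian = np.ones_like(y)          # second derivative is the constant 1
assert np.allclose(gradient, -y)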
Code example #2
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure the root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    max_bins = 255

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=max_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
Code example #3
File: test_grower.py Project: obulrdy6881/Drowsinss
def test_missing_value_predict_only():
    # Make sure that missing values are supported at predict time even if they
    # were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.

    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
                        has_missing_values=False)
    grower.grow()

    predictor = grower.make_predictor()

    # go from root to a leaf, always following the node with the most samples.
    # That's the path NaNs are supposed to take
    node = predictor.nodes[0]
    while not node['is_leaf']:
        left = predictor.nodes[node['left']]
        right = predictor.nodes[node['right']]
        node = left if left['count'] > right['count'] else right

    prediction_main_path = node['value']

    # now build X_test with only nans, and make sure all predictions are equal
    # to prediction_main_path
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    assert np.all(predictor.predict(all_nans) == prediction_main_path)
Code example #4
def test_regression_dataset(n_bins):
    X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
                           random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 10
    max_leaf_nodes = 30
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(num_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.82
    assert r2_score(y_test, predictor.predict(X_test)) > 0.67
Code example #5
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = _BinMapper(max_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
Code example #6
def test_boston_dataset(max_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes,
                        max_bins=max_bins,
                        actual_n_bins=mapper.actual_n_bins_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
Code example #7
File: test_grower.py Project: suchot/scikit-learn
def test_init_parameters_validation():
    X_binned, all_gradients, all_hessians = _make_training_data()
    with pytest.raises(ValueError, match="min_gain_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians, min_gain_to_split=-1)

    with pytest.raises(ValueError, match="min_hessian_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1)
Code example #8
def test_nodes_values(monotonic_cst, seed):
    # Build a single tree with only one feature, and make sure the nodes
    # values respect the monotonic constraints.

    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #       root
    #      /    \
    #     5     10    # middle = 7.5
    #    / \   / \
    #   a  b  c  d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d (assert_children_values_bounded)
    # a <= b <= c <= d (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but can't hurt to check

    rng = np.random.RandomState(seed)
    n_samples = 1000
    n_features = 1
    X_binned = rng.randint(0,
                           255,
                           size=(n_samples, n_features),
                           dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(X_binned,
                        gradients,
                        hessians,
                        monotonic_cst=[monotonic_cst],
                        shrinkage=.1)
    grower.grow()

    # grow() will shrink the leaves values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, else we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage

    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)))

    # The consistency of the bounds can only be checked on the tree grower
    # as the node bounds are not copied into the predictor tree. The
    # consistency checks on the values of node children and leaves can be
    # done either on the grower tree or on the predictor tree. We only
    # do those checks on the predictor tree as the latter is derived from
    # the former.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)
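For reference, the monotonic_cst values these tests are parametrized over follow scikit-learn's convention: 1 requests a monotonically increasing relationship with the feature, -1 a decreasing one, and 0 no constraint. A short sketch of the same convention expressed through the public estimator API:

from sklearn.ensemble import HistGradientBoostingRegressor

# one constraint per feature: increasing, decreasing, unconstrained
est = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0])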
Code example #9
def test_grow_tree_categories():
    # Check that the grower produces the right predictor tree when a split is
    # categorical
    X_binned = np.array([[0, 1] * 11 + [1]], dtype=X_BINNED_DTYPE).T
    X_binned = np.asfortranarray(X_binned)

    all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    is_categorical = np.ones(1, dtype=np.uint8)

    grower = TreeGrower(X_binned, all_gradients, all_hessians,
                        n_bins=4, shrinkage=1.0, min_samples_leaf=1,
                        is_categorical=is_categorical)
    grower.grow()
    assert grower.n_nodes == 3

    categories = [np.array([4, 9], dtype=X_DTYPE)]
    predictor = grower.make_predictor(binning_thresholds=categories)
    root = predictor.nodes[0]
    assert root['count'] == 23
    assert root['depth'] == 0
    assert root['is_categorical']

    left, right = predictor.nodes[root['left']], predictor.nodes[root['right']]

    # arbitrary validation, but this means the samples with binned category 1
    # go to the left.
    assert left['count'] >= right['count']

    # check binned category value (1)
    expected_binned_cat_bitset = [2**1] + [0] * 7
    binned_cat_bitset = predictor.binned_left_cat_bitsets
    assert_array_equal(binned_cat_bitset[0], expected_binned_cat_bitset)

    # check raw category value (9)
    expected_raw_cat_bitsets = [2**9] + [0] * 7
    raw_cat_bitsets = predictor.raw_left_cat_bitsets
    assert_array_equal(raw_cat_bitsets[0], expected_raw_cat_bitsets)

    # Note that since there were no missing values during training, the
    # missing values aren't part of the bitsets. However, we expect the
    # missing values to go to the biggest child (i.e. the left one).
    # The left child has a value of -1 = negative gradient.
    assert root['missing_go_to_left']

    # make sure binned missing values are mapped to the left child during
    # prediction
    prediction_binned = predictor.predict_binned(
        np.asarray([[6]]).astype(X_BINNED_DTYPE), missing_values_bin_idx=6)
    assert_allclose(prediction_binned, [-1])  # negative gradient

    # make sure raw missing values are mapped to the left child during
    # prediction
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)  # ignored anyway
    f_idx_map = np.array([0], dtype=np.uint32)
    prediction = predictor.predict(np.array([[np.nan]]), known_cat_bitsets,
                                   f_idx_map)
    assert_allclose(prediction, [-1])
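The bitset expectations above follow the layout the hist-GBDT code uses for categorical splits: each bitset is 8 uint32 words (256 bits), with category c stored at bit c % 32 of word c // 32, so [2**1] + [0] * 7 reads as "binned category 1 goes left" and [2**9] + [0] * 7 as "raw category 9 goes left". A small helper (illustrative only, not a scikit-learn API) makes this easy to check:

def in_bitset(bitset, category):
    # illustrative helper: bitset is 8 uint32 words, 256 bits total
    return bool((int(bitset[category // 32]) >> (category % 32)) & 1)

left_bitset = np.array([2**1] + [0] * 7, dtype=X_BITSET_INNER_DTYPE)
assert in_bitset(left_bitset, 1)       # binned category 1 -> left child
assert not in_bitset(left_bitset, 0)   # binned category 0 -> right child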
Code example #10
def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
    # Make sure that native categorical splits are equivalent to using a OHE,
    # when given enough depth

    rng = np.random.RandomState(0)
    n_samples = 10_000
    X_binned = rng.randint(0,
                           n_unique_categories,
                           size=(n_samples, 1),
                           dtype=np.uint8)

    X_ohe = OneHotEncoder(sparse=False).fit_transform(X_binned)
    X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)

    if target == "equal":
        gradients = X_binned.reshape(-1)
    elif target == "binary":
        gradients = (X_binned % 2).reshape(-1)
    else:
        gradients = rng.randn(n_samples)
    gradients = gradients.astype(G_H_DTYPE)

    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower_params = {
        "min_samples_leaf": min_samples_leaf,
        "max_depth": None,
        "max_leaf_nodes": None,
    }

    grower = TreeGrower(X_binned,
                        gradients,
                        hessians,
                        is_categorical=[True],
                        **grower_params)
    grower.grow()
    # We pass undefined binning_thresholds because we won't use predict()
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((1, n_unique_categories)))
    preds = predictor.predict_binned(X_binned,
                                     missing_values_bin_idx=255,
                                     n_threads=n_threads)

    grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params)
    grower_ohe.grow()
    predictor_ohe = grower_ohe.make_predictor(
        binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories)))
    preds_ohe = predictor_ohe.predict_binned(X_ohe,
                                             missing_values_bin_idx=255,
                                             n_threads=n_threads)

    assert predictor.get_max_depth() <= predictor_ohe.get_max_depth()
    if target == "binary" and n_unique_categories > 2:
        # OHE needs more splits to achieve the same predictions
        assert predictor.get_max_depth() < predictor_ohe.get_max_depth()

    np.testing.assert_allclose(preds, preds_ohe)
Code example #11
File: test_grower.py Project: suchot/scikit-learn
def test_input_validation():

    X_binned, all_gradients, all_hessians = _make_training_data()

    X_binned_float = X_binned.astype(np.float32)
    with pytest.raises(NotImplementedError, match="X_binned must be of type uint8"):
        TreeGrower(X_binned_float, all_gradients, all_hessians)

    X_binned_C_array = np.ascontiguousarray(X_binned)
    with pytest.raises(
        ValueError, match="X_binned should be passed as Fortran contiguous array"
    ):
        TreeGrower(X_binned_C_array, all_gradients, all_hessians)
Code example #12
File: test_grower.py Project: suchot/scikit-learn
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)
    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        max_leaf_nodes=3,
        min_samples_leaf=5,
    )
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], n_bins))
    )
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes["is_leaf"].sum() == 3

    # Probe some predictions for each leaf of the tree
    # each group of 3 samples corresponds to a condition in _make_training_data
    input_data = np.array(
        [
            [0, 0],
            [42, 99],
            [128, 254],
            [129, 0],
            [129, 85],
            [254, 85],
            [129, 86],
            [129, 254],
            [242, 100],
        ],
        dtype=np.uint8,
    )
    missing_values_bin_idx = n_bins - 1
    predictions = predictor.predict_binned(
        input_data, missing_values_bin_idx, n_threads
    )
    expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
    assert np.allclose(predictions, expected_targets)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads)
    assert np.allclose(predictions, -all_gradients)
Code example #13
def test_split_on_nan_with_infinite_values():
    # Make sure split-on-NaN situations are handled correctly even when there
    # are samples with +inf values (we set the threshold to +inf when we have
    # a split on NaN, so this test makes sure this does not introduce
    # edge-case bugs). We need to use the private API so that we can also test
    # predict_binned().

    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    has_missing_values = True
    grower = TreeGrower(
        X_binned,
        gradients,
        hessians,
        n_bins_non_missing=n_bins_non_missing,
        has_missing_values=has_missing_values,
        min_samples_leaf=1,
        n_threads=n_threads,
    )

    grower.grow()

    predictor = grower.make_predictor(
        binning_thresholds=bin_mapper.bin_thresholds_)

    # sanity check: this was a split on nan
    assert predictor.nodes[0]["num_threshold"] == np.inf
    assert predictor.nodes[0]["bin_threshold"] == n_bins_non_missing - 1

    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()

    # Make sure in particular that the +inf sample is mapped to the left child
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads)
    predictions_binned = predictor.predict_binned(
        X_binned,
        missing_values_bin_idx=bin_mapper.missing_values_bin_idx_,
        n_threads=n_threads,
    )
    np.testing.assert_allclose(predictions, -gradients)
    np.testing.assert_allclose(predictions_binned, -gradients)
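Why setting the numeric threshold to +inf works: for non-missing values, prediction descends to the left child when x <= num_threshold, while NaNs follow the missing_go_to_left flag. With num_threshold = +inf, every finite value and even +inf itself satisfies the comparison (inf <= inf is True), so only the NaNs can be routed right. A toy restatement of that routing rule (a simplified assumption about the traversal logic, not the real Cython code):

def goes_left(x, num_threshold, missing_go_to_left):
    # simplified sketch of the predictor's routing rule
    if np.isnan(x):
        return missing_go_to_left
    return x <= num_threshold

assert goes_left(np.inf, np.inf, missing_go_to_left=False)      # +inf -> left
assert not goes_left(np.nan, np.inf, missing_go_to_left=False)  # NaN -> right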
Code example #14
def test_plot():
    # just make sure estimators, growers and predictors are equally accepted
    X, y = make_regression()
    est = HistGradientBoostingRegressor()
    est.fit(X, y)
    plot_tree(est, view=False)
    plot_tree(est._predictors[0][0], view=False)

    gradients = np.random.normal(size=100).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)
    grower = TreeGrower(X_binned, gradients, hessians)
    grower.grow()
    plot_tree(grower, view=False)
Code example #15
def test_missing_value_predict_only():
    # Make sure that missing values are supported at predict time even if they
    # were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.

    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(X_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=5,
                        has_missing_values=False)
    grower.grow()

    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)))

    # go from root to a leaf, always following the node with the most samples.
    # That's the path NaNs are supposed to take
    node = predictor.nodes[0]
    while not node["is_leaf"]:
        left = predictor.nodes[node["left"]]
        right = predictor.nodes[node["right"]]
        node = left if left["count"] > right["count"] else right

    prediction_main_path = node["value"]

    # now build X_test with only nans, and make sure all predictions are equal
    # to prediction_main_path
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map,
                               n_threads)
    assert np.all(y_pred == prediction_main_path)
Code example #16
def test_sum_hessians_are_sample_weight(loss_name):
    # For losses with constant hessians, the sum_hessians field of the
    # histograms must be equal to the sum of the sample weight of samples at
    # the corresponding bin.

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           random_state=rng)
    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    # While sample weights are supposed to be positive, this still works.
    sample_weight = rng.normal(size=n_samples)

    loss = _LOSSES[loss_name](sample_weight=sample_weight)
    gradients, hessians = loss.init_gradient_and_hessian(n_samples=n_samples,
                                                         dtype=G_H_DTYPE)
    gradients, hessians = gradients.reshape((-1, 1)), hessians.reshape((-1, 1))
    raw_predictions = rng.normal(size=(n_samples, 1))
    loss.gradient_hessian(
        y_true=y,
        raw_prediction=raw_predictions,
        sample_weight=sample_weight,
        gradient_out=gradients,
        hessian_out=hessians,
        n_threads=n_threads,
    )

    # build sum_sample_weight which contains the sum of the sample weights at
    # each bin (for each feature). This must be equal to the sum_hessians
    # field of the corresponding histogram
    sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
    for feature_idx in range(n_features):
        for sample_idx in range(n_samples):
            sum_sw[feature_idx,
                   X_binned[sample_idx,
                            feature_idx]] += sample_weight[sample_idx]

    # Build histogram
    grower = TreeGrower(X_binned,
                        gradients[:, 0],
                        hessians[:, 0],
                        n_bins=bin_mapper.n_bins)
    histograms = grower.histogram_builder.compute_histograms_brute(
        grower.root.sample_indices)

    for feature_idx in range(n_features):
        for bin_idx in range(bin_mapper.n_bins):
            assert histograms[feature_idx,
                              bin_idx]["sum_hessians"] == (pytest.approx(
                                  sum_sw[feature_idx, bin_idx], rel=1e-5))
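The nested Python loop above that accumulates sum_sw is O(n_samples * n_features); an equivalent vectorized version using NumPy's unbuffered np.add.at (a rewrite for illustration, not the test's code) would be:

sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
for feature_idx in range(n_features):
    # np.add.at accumulates correctly even when bin indices repeat
    np.add.at(sum_sw[feature_idx], X_binned[:, feature_idx], sample_weight)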
Code example #17
def test_max_depth(max_depth):
    # Make sure max_depth parameter works as expected
    rng = np.random.RandomState(seed=0)

    max_bins = 255
    n_samples = 1000

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
    grower.grow()

    depth = max(leaf.depth for leaf in grower.finalized_leaves)
    assert depth == max_depth
Code example #18
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)
    grower = TreeGrower(X_binned,
                        all_gradients,
                        all_hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        max_leaf_nodes=3,
                        min_samples_leaf=5)
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    predictor = grower.make_predictor()
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes['is_leaf'].sum() == 3

    # Probe some predictions for each leaf of the tree
    # each group of 3 samples corresponds to a condition in _make_training_data
    input_data = np.array([
        [0, 0],
        [42, 99],
        [128, 255],
        [129, 0],
        [129, 85],
        [255, 85],
        [129, 86],
        [129, 255],
        [242, 100],
    ],
                          dtype=np.uint8)
    predictions = predictor.predict_binned(input_data)
    expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
    assert np.allclose(predictions, expected_targets)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(X_binned)
    assert np.allclose(predictions, -all_gradients)
Code example #19
def test_sum_hessians_are_sample_weight(loss_name):
    # For losses with constant hessians, the sum_hessians field of the
    # histograms must be equal to the sum of the sample weight of samples at
    # the corresponding bin.

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           random_state=rng)
    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    sample_weight = rng.normal(size=n_samples)

    loss = _LOSSES[loss_name](sample_weight=sample_weight)
    gradients, hessians = loss.init_gradients_and_hessians(
        n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight)
    raw_predictions = rng.normal(size=(1, n_samples))
    loss.update_gradients_and_hessians(gradients, hessians, y, raw_predictions,
                                       sample_weight)

    # build sum_sample_weight which contains the sum of the sample weights at
    # each bin (for each feature). This must be equal to the sum_hessians
    # field of the corresponding histogram
    sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
    for feature_idx in range(n_features):
        for sample_idx in range(n_samples):
            sum_sw[feature_idx,
                   X_binned[sample_idx,
                            feature_idx]] += (sample_weight[sample_idx])

    # Build histogram
    grower = TreeGrower(X_binned,
                        gradients[0],
                        hessians[0],
                        n_bins=bin_mapper.n_bins)
    histograms = grower.histogram_builder.compute_histograms_brute(
        grower.root.sample_indices)

    for feature_idx in range(n_features):
        for bin_idx in range(bin_mapper.n_bins):
            assert histograms[feature_idx,
                              bin_idx]['sum_hessians'] == (pytest.approx(
                                  sum_sw[feature_idx, bin_idx], rel=1e-5))
Code example #20
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian)
    n_samples = X_binned.shape[0]

    if stopping_param == "max_leaf_nodes":
        stopping_param = {"max_leaf_nodes": 3}
    else:
        stopping_param = {"min_gain_to_split": 0.01}

    grower = TreeGrower(X_binned,
                        all_gradients,
                        all_hessians,
                        max_bins=n_bins,
                        shrinkage=shrinkage,
                        min_samples_leaf=1,
                        **stopping_param)

    # The root node has not been split yet, but the best possible split has
    # already been evaluated:
    assert grower.root.left_child is None
    assert grower.root.right_child is None

    root_split = grower.root.split_info
    assert root_split.feature_idx == 0
    assert root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # Calling split_next() applies the next split and computes the best split
    # for each of the two newly introduced child nodes.
    left_node, right_node = grower.split_next()

    # All training samples have been split between the two nodes,
    # approximately 50%/50%
    _check_children_consistency(grower.root, left_node, right_node)
    assert len(left_node.sample_indices) > 0.4 * n_samples
    assert len(left_node.sample_indices) < 0.6 * n_samples

    if grower.min_gain_to_split > 0:
        # The left node is too pure: there is no gain to split it further.
        assert left_node.split_info.gain < grower.min_gain_to_split
        assert left_node in grower.finalized_leaves

    # The right node can still be split further, this time on feature #1
    split_info = right_node.split_info
    assert split_info.gain > 1.
    assert split_info.feature_idx == 1
    assert split_info.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # The right split has not been applied yet. Let's do it now:
    assert len(grower.splittable_nodes) == 1
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)
    assert len(right_left_node.sample_indices) > 0.1 * n_samples
    assert len(right_left_node.sample_indices) < 0.2 * n_samples

    assert len(right_right_node.sample_indices) > 0.2 * n_samples
    assert len(right_right_node.sample_indices) < 0.4 * n_samples

    # All the leaves are pure; it is not possible to split any further:
    assert not grower.splittable_nodes

    # Check the values of the leaves:
    assert grower.root.left_child.value == approx(shrinkage)
    assert grower.root.right_child.left_child.value == approx(shrinkage)
    assert grower.root.right_child.right_child.value == approx(-shrinkage,
                                                               rel=1e-3)
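Code example #20 drives the grower by hand through split_next(). In the scikit-learn versions these tests target, grow() is essentially that loop; a paraphrase (not the exact library source):

# what grower.grow() boils down to, paraphrased:
while grower.splittable_nodes:
    grower.split_next()   # apply the best pending split, evaluate the children
# finalized leaves then have their values multiplied by the shrinkage factor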