Beispiel #1
0
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    """Grow a tree one split at a time and check every intermediate state.

    The grower is built with either ``max_leaf_nodes=3`` or
    ``min_gain_to_split=0.01`` depending on ``stopping_param``. The test
    then applies two splits (root, then the right child) and verifies the
    chosen features/bins, the sample partition sizes and the final leaf
    values, which are expected to be ``+/- shrinkage``.
    """
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian)
    n_samples = X_binned.shape[0]

    # Map the parametrized criterion name onto the matching keyword argument.
    stop_kwargs = ({"max_leaf_nodes": 3}
                   if stopping_param == "max_leaf_nodes"
                   else {"min_gain_to_split": 0.01})

    grower = TreeGrower(
        X_binned, all_gradients, all_hessians,
        max_bins=n_bins, shrinkage=shrinkage, min_samples_leaf=1,
        **stop_kwargs)

    # Right after construction the root has no children yet, although its
    # best candidate split has already been computed.
    assert grower.root.left_child is None
    assert grower.root.right_child is None

    best_root_split = grower.root.split_info
    assert best_root_split.feature_idx == 0
    assert best_root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # split_next() applies the pending split and evaluates the best split
    # of each of the two children it creates.
    assert grower.can_split_further()
    left_node, right_node = grower.split_next()

    # The training samples are shared between both children, roughly
    # half on each side.
    _check_children_consistency(grower.root, left_node, right_node)
    assert 0.4 * n_samples < len(left_node.sample_indices) < 0.6 * n_samples

    min_gain = grower.min_gain_to_split
    if min_gain > 0:
        # The left child is too pure: splitting it again brings no gain,
        # so it must already be registered as a final leaf.
        assert left_node.split_info.gain < min_gain
        assert left_node in grower.finalized_leaves

    # The right child is still worth splitting, now along feature #1.
    pending_split = right_node.split_info
    assert pending_split.gain > 1.0
    assert pending_split.feature_idx == 1
    assert pending_split.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # That split is only a candidate so far; apply it now.
    assert grower.can_split_further()
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)
    assert (0.1 * n_samples
            < len(right_left_node.sample_indices)
            < 0.2 * n_samples)
    assert (0.2 * n_samples
            < len(right_right_node.sample_indices)
            < 0.4 * n_samples)

    # Every leaf is pure at this point, so nothing is left to split.
    assert not grower.can_split_further()

    # Finally, verify the value stored on each of the three leaves.
    tree_root = grower.root
    right_subtree = tree_root.right_child
    assert tree_root.left_child.value == approx(shrinkage)
    assert right_subtree.left_child.value == approx(shrinkage)
    assert right_subtree.right_child.value == approx(-shrinkage)
Beispiel #2
0
# Grow a tiny 3-leaf tree on the first 5 rows only and report how long it
# took since `tic` (set above this excerpt).
# NOTE(review): presumably a warm-up pass (e.g. to pay one-off compilation
# costs before the real measurement) — confirm against the preceding lines.
TreeGrower(np.asfortranarray(binned_features[:5]),
           gradients[:5],
           hessians[:5],
           n_bins=n_bins,
           max_leaf_nodes=3).grow()
toc = time()
print(f"done in {toc - tic:0.3f}s")

# Main measurement: build a grower on the full dataset. `tree_start` marks
# the beginning of the whole tree-growing run, while `tic` is reset before
# each individual step.
print(f"Growing one tree on {binned_features.nbytes / 1e9:0.1f} GB of "
      f"random data ({n_samples:.0e} samples, {n_features} features).")
print("Finding the best split on the root node...")
tree_start = tic = time()
grower = TreeGrower(binned_features,
                    gradients,
                    hessians,
                    n_bins=n_bins,
                    max_leaf_nodes=255)
toc = time()
# Time spent constructing the grower (per the message above, this is where
# the root's best split is searched).
print(f"done in {toc - tic:0.3f}s")

# Apply splits one at a time until the grower reports that no further split
# is possible, timing each split_next() call individually.
while grower.can_split_further():
    print("Splitting next node...")
    tic = time()
    left, right = grower.split_next()
    toc = time()
    print("left node: ", left)
    print("right node: ", right)
    print(f"done in {toc - tic:0.3f}s")

# Summary: number of finalized leaves and total wall time since `tree_start`
# (this total also includes the time spent printing inside the loop).
print(f"{len(grower.finalized_leaves)} leaves in {time() - tree_start:0.3f}s")