def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    """Grow a tree one split at a time and check every intermediate state.

    The training data comes from ``_make_training_data``. The test expects
    the root to split on feature 0, its right child to split on feature 1,
    and growth to stop after these two splits, leaving three leaves whose
    values are ``+shrinkage``, ``+shrinkage`` and ``-shrinkage``.

    ``stopping_param`` is the name of the stopping criterion to exercise:
    ``"max_leaf_nodes"`` caps the tree at 3 leaves, anything else selects a
    ``min_gain_to_split`` threshold of 0.01.
    """
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian)
    n_samples = X_binned.shape[0]

    # Translate the criterion name into the matching keyword argument.
    stopping_kwargs = (
        {"max_leaf_nodes": 3}
        if stopping_param == "max_leaf_nodes"
        else {"min_gain_to_split": 0.01}
    )

    grower = TreeGrower(X_binned, all_gradients, all_hessians,
                        max_bins=n_bins, shrinkage=shrinkage,
                        min_samples_leaf=1, **stopping_kwargs)

    # Right after construction the root has no children yet, but its best
    # candidate split is already available.
    root = grower.root
    assert root.left_child is None
    assert root.right_child is None

    best_root_split = root.split_info
    assert best_root_split.feature_idx == 0
    assert best_root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # split_next() applies the pending split and returns the two new
    # children, each with its own best split already evaluated.
    assert grower.can_split_further()
    left_node, right_node = grower.split_next()

    # The root samples are shared between both children, roughly half each.
    _check_children_consistency(grower.root, left_node, right_node)
    n_left = len(left_node.sample_indices)
    assert 0.4 * n_samples < n_left < 0.6 * n_samples

    if grower.min_gain_to_split > 0:
        # The left child is too pure: its best gain is below the threshold,
        # so it has already been finalized as a leaf.
        assert left_node.split_info.gain < grower.min_gain_to_split
        assert left_node in grower.finalized_leaves

    # The right child is still worth splitting, this time on feature #1,
    # but that split is only a candidate at this point.
    pending_split = right_node.split_info
    assert pending_split.gain > 1.
    assert pending_split.feature_idx == 1
    assert pending_split.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # Apply the split of the right child.
    assert grower.can_split_further()
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)

    n_right_left = len(right_left_node.sample_indices)
    assert 0.1 * n_samples < n_right_left < 0.2 * n_samples

    n_right_right = len(right_right_node.sample_indices)
    assert 0.2 * n_samples < n_right_right < 0.4 * n_samples

    # Every leaf is now pure: nothing is left to split.
    assert not grower.can_split_further()

    # Leaf values, read by walking the tree from the root.
    expected_leaf_values = (
        (grower.root.left_child, shrinkage),
        (grower.root.right_child.left_child, shrinkage),
        (grower.root.right_child.right_child, -shrinkage),
    )
    for leaf, expected_value in expected_leaf_values:
        assert leaf.value == approx(expected_value)
# Grow a tiny tree (at most 3 leaves) on the first 5 samples only. The start
# of this timed section (where `tic` is set) is above this excerpt.
# NOTE(review): presumably a warm-up run so that one-off costs (e.g.
# compilation) are not charged to the real benchmark below -- confirm.
TreeGrower(np.asfortranarray(binned_features[:5]), gradients[:5], hessians[:5],
           n_bins=n_bins, max_leaf_nodes=3).grow()
toc = time()
print(f"done in {toc - tic:0.3f}s")

# Report the size of the full dataset about to be used (in GB, from the
# array's nbytes) together with its sample and feature counts.
print(f"Growing one tree on {binned_features.nbytes / 1e9:0.1f} GB of "
      f"random data ({n_samples:.0e} samples, {n_features} features).")

# Time the construction of the grower on the full data. `tree_start` keeps
# the start of the whole tree-growing run, while `tic` is reset for each
# individual step below.
print("Finding the best split on the root node...")
tree_start = tic = time()
grower = TreeGrower(binned_features, gradients, hessians, n_bins=n_bins,
                    max_leaf_nodes=255)
toc = time()
print(f"done in {toc - tic:0.3f}s")

# Apply the splits one at a time, timing each split_next() call on its own.
while grower.can_split_further():
    print("Splitting next node...")
    tic = time()
    left, right = grower.split_next()
    toc = time()
    # The prints of the new nodes come after `toc`, so they are not part of
    # the per-split duration reported below.
    print("left node: ", left)
    print("right node: ", right)
    print(f"done in {toc - tic:0.3f}s")

# Total wall-clock time since `tree_start`; unlike the per-step timings, it
# also includes the time spent printing inside the loop.
print(f"{len(grower.finalized_leaves)} leaves in {time() - tree_start:0.3f}s")