def test_mse_tree(training_data_1d):
    """Fitting with MSESplitter on the 1-D fixture yields four pure leaves.

    The fixture contains 12 samples that fall into four groups of three;
    each group should end up in its own leaf, predicting that group's mean.
    """
    X, y = training_data_1d
    tree = DecisionTree(min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MSESplitter())

    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        # Prediction index should line up with the input index.
        assert list(preds.index) == list(X.index)
    if isinstance(y, pd.DataFrame):
        assert list(preds.columns) == list(y.columns)
        # Compare the single target column as a plain list below.
        preds = preds[0].tolist()

    assert preds == pytest.approx([
        15.0, 15.0, 15.0,
        25.0, 25.0, 25.0,
        10.0, 10.0, 10.0,
        17.0, 17.0, 17.0,
    ], abs=1e-1)

    # 4 leaves are formed:
    node_ids = tree.apply(X)
    assert len(set(node_ids)) == 4
    for group_id in range(4):
        begin_idx = 3 * group_id
        # BUG FIX: the slice previously ended at ``begin_idx + 2``, which
        # covered only the first TWO of each group's three samples, so the
        # third sample's leaf assignment was never verified.  End at +3 so
        # the whole group is checked to share a single leaf.
        end_idx = begin_idx + 3
        assert len(set(node_ids[begin_idx:end_idx])) == 1
def test_mse_splitter_in_1d(training_data_1d):
    """MSESplitter should cut the 1-D data on feature 'x' at 5.5."""
    X, Y = training_data_1d
    data = TrainingData(X, Y)

    splitter = splitters.MSESplitter()
    coeffs, cutpoint, cost = splitter.select_feature_to_cut(
        data.X, data.Y, 1)

    # The chosen feature is the one with a non-zero coefficient.
    feature_index = np.argwhere(coeffs != 0.0)[0][0]
    assert data.X_names[feature_index] in {'x', 0}
    # Brute force calculation reveals 5.5 results in best MSE reduction:
    assert cutpoint == 5.5
def test_mse_splitter_in_2d(training_data_mrt):
    """MSESplitter should cut the multi-target data on feature 'x' at 13.5."""
    X, Y = training_data_mrt
    data = TrainingData(X, Y)

    splitter = splitters.MSESplitter()
    coeffs, cutpoint, cost = splitter.select_feature_to_cut(
        data.X, data.Y, 1)

    # The chosen feature is the one with a non-zero coefficient.
    feature_index = np.argwhere(coeffs != 0.0)[0][0]
    assert data.X_names[feature_index] in {'x', 0}
    # Brute force revealed X <= 13.5 gives best MSE reduction
    assert cutpoint == 13.5
def test_tree_returns_node_and_edges(training_data_1d):
    """get_nodes_and_edges lists the first two levels depth-first, with
    '<=' / '>' labels on the left / right edges respectively."""
    X, y = training_data_1d
    tree = DecisionTree(min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MSESplitter())

    nodes, edges = tree.get_nodes_and_edges(max_depth=2)

    # Walk the tree structure directly to build the expected ordering.
    root = tree.tree.get_root_id()
    left, right = tree.tree.get_children_ids(root)
    left_left, left_right = tree.tree.get_children_ids(left)

    assert nodes[:4] == [root, left, left_left, left_right]
    expected_edges = [
        (root, left, '<='),
        (root, right, '>'),
        (left, left_left, '<='),
        (left, left_right, '>'),
    ]
    assert edges[:4] == expected_edges
import pytest
import pandas as pd

from pyboretum import (
    MeanNode,
    MedianNode,
    splitters,  # MAE and MSE splitters
    DecisionTree,
)


@pytest.mark.parametrize('splitter, node_class', [
    (splitters.MSESplitter(), MeanNode),
    (splitters.MAESplitter(), MedianNode),
])
def test_with_one_sample_nodes(splitter, node_class, training_data_1d):
    """With min_samples_leaf=1 every sample becomes its own leaf, so the
    tree's predictions should reproduce the training targets exactly
    (within tolerance) for both splitter/node pairings."""
    X, y = training_data_1d
    tree = DecisionTree(node_class=node_class, min_samples_leaf=1)
    tree.fit(X, y, splitter=splitter)

    # All y should form their own leaves:
    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        assert list(preds.index) == list(X.index)
    if isinstance(y, pd.DataFrame):
        assert list(preds.columns) == list(y.columns)
        preds = preds[0].tolist()
    # NOTE(review): `y.tolist()` assumes y exposes .tolist() (Series/ndarray);
    # a DataFrame y would not — confirm against the fixture's actual types.
    assert preds == pytest.approx(y.tolist(), abs=1e-1)
import pytest

from pyboretum import (
    splitters,
    TrainingData,
)


@pytest.mark.parametrize('splitter', [
    splitters.MSESplitter(),
    splitters.MAESplitter(),
])
def test_min_samples_leaf_can_stop_splitting(splitter, training_data_1d):
    """When min_samples_leaf equals the sample count, no valid cut exists:
    the splitter signals this with (None, None, inf)."""
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)
    feature, cutpoint, cost = splitter.select_feature_to_cut(
        training_data.X, training_data.Y, len(training_data.X))
    # FIX: use identity comparison for None (PEP 8).  `== None` is both
    # unidiomatic and unsafe here: if the splitter ever returned a numpy
    # array, `array == None` evaluates elementwise and the assert would
    # raise an ambiguous-truth-value error instead of failing cleanly.
    assert feature is None
    assert cutpoint is None
    assert cost == float('inf')