Ejemplo n.º 1
0
def test_mse_tree(training_data_1d):
    """Fitting with an MSE splitter recovers the four target groups.

    The 1-D fixture supplies 12 samples in four consecutive groups of three
    whose group means are ~15, 25, 10 and 17; with min_samples_leaf=2 the
    tree should carve out exactly one leaf per group.
    """
    X, y = training_data_1d

    tree = DecisionTree(min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MSESplitter())

    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        # DataFrame input should round-trip its index (and, for DataFrame
        # targets, its columns) onto the prediction frame.
        assert list(preds.index) == list(X.index)
        if isinstance(y, pd.DataFrame):
            assert list(preds.columns) == list(y.columns)

        preds = preds[0].tolist()

    assert preds == pytest.approx([
        15.0, 15.0, 15.0, 25.0, 25.0, 25.0, 10.0, 10.0, 10.0, 17.0, 17.0, 17.0
    ],
                                  abs=1e-1)

    # 4 leaves are formed:
    node_ids = tree.apply(X)
    assert len(set(node_ids)) == 4

    for group_id in range(4):
        begin_idx = 3 * group_id
        # BUG FIX: the slice end was `3 * group_id + 2`; Python slices are
        # half-open, so that only checked the first TWO samples of each
        # three-sample group. Use +3 so all three samples must share a leaf.
        end_idx = 3 * group_id + 3
        assert len(set(node_ids[begin_idx:end_idx])) == 1
Ejemplo n.º 2
0
def test_mse_splitter_in_1d(training_data_1d):
    """The MSE splitter picks feature 'x' with cutpoint 5.5 on 1-D data."""
    features, targets = training_data_1d
    data = TrainingData(features, targets)
    splitter = splitters.MSESplitter()

    coeffs, cutpoint, cost = splitter.select_feature_to_cut(
        data.X, data.Y, 1)

    # The chosen feature is the one with the (first) non-zero coefficient.
    chosen = np.nonzero(coeffs)[0][0]
    assert data.X_names[chosen] in {'x', 0}
    # Brute force calculation reveals 5.5 results in best MSE reduction:
    assert cutpoint == 5.5
Ejemplo n.º 3
0
def test_mse_splitter_in_2d(training_data_mrt):
    """The MSE splitter picks feature 'x' with cutpoint 13.5 on MRT data."""
    features, targets = training_data_mrt
    data = TrainingData(features, targets)
    splitter = splitters.MSESplitter()

    coeffs, cutpoint, cost = splitter.select_feature_to_cut(
        data.X, data.Y, 1)

    # The chosen feature is the one with the (first) non-zero coefficient.
    chosen = np.nonzero(coeffs)[0][0]
    assert data.X_names[chosen] in {'x', 0}
    # Brute force revealed X <= 13.5 gives best MSE reduction
    assert cutpoint == 13.5
Ejemplo n.º 4
0
def test_tree_returns_node_and_edges(training_data_1d):
    """get_nodes_and_edges walks the tree in order with '<='/'>' edge labels."""
    X, y = training_data_1d

    tree = DecisionTree(min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MSESplitter())

    nodes, edges = tree.get_nodes_and_edges(max_depth=2)

    # Reconstruct the expected ids straight from the fitted tree structure.
    root = tree.tree.get_root_id()
    left, right = tree.tree.get_children_ids(root)
    left_left, left_right = tree.tree.get_children_ids(left)

    assert nodes[:4] == [root, left, left_left, left_right]
    expected_edges = [
        (root, left, '<='),
        (root, right, '>'),
        (left, left_left, '<='),
        (left, left_right, '>'),
    ]
    assert edges[:4] == expected_edges
Ejemplo n.º 5
0
import pytest
import pandas as pd

from pyboretum import (
    MeanNode,
    MedianNode,
    splitters,  # MAE and MSE splitters
    DecisionTree,
)


@pytest.mark.parametrize('splitter, node_class', [
    (splitters.MSESplitter(), MeanNode),
    (splitters.MAESplitter(), MedianNode),
])
def test_with_one_sample_nodes(splitter, node_class, training_data_1d):
    """With min_samples_leaf=1 each sample becomes its own leaf, so the
    tree reproduces the training targets (within tolerance)."""
    X, y = training_data_1d

    tree = DecisionTree(node_class=node_class, min_samples_leaf=1)
    tree.fit(X, y, splitter=splitter)

    # All y should form their own leaves:
    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        # DataFrame input round-trips index (and columns for frame targets).
        assert preds.index.tolist() == X.index.tolist()
        if isinstance(y, pd.DataFrame):
            assert preds.columns.tolist() == y.columns.tolist()

        preds = preds[0].tolist()

    assert preds == pytest.approx(y.tolist(), abs=1e-1)
Ejemplo n.º 6
0
import pytest

from pyboretum import (
    splitters,
    TrainingData,
)


@pytest.mark.parametrize('splitter', [
    splitters.MSESplitter(),
    splitters.MAESplitter(),
])
def test_min_samples_leaf_can_stop_splitting(splitter, training_data_1d):
    """When min_samples_leaf equals the full sample count, no valid split
    exists: the splitter returns no feature/cutpoint and an infinite cost."""
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)

    feature, cutpoint, cost = splitter.select_feature_to_cut(
        training_data.X, training_data.Y, len(training_data.X))

    # PEP 8: compare against None with `is`, not `==` (behavior-identical
    # here, but `==` can be hijacked by custom __eq__ implementations).
    assert feature is None
    assert cutpoint is None
    assert cost == float('inf')