Example #1
import pytest
import pandas as pd

from pyboretum import (
    MedianNode,
    splitters,
    DecisionTree,
)


def test_mae_tree(training_data_1d):
    X, y = training_data_1d

    tree = DecisionTree(node_class=MedianNode, min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MAESplitter())

    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        assert list(preds.index) == list(X.index)
        if isinstance(y, pd.DataFrame):
            assert list(preds.columns) == list(y.columns)

        preds = preds[0].tolist()

    assert preds == pytest.approx(
        [15.1, 15.1, 15.1, 25.2, 25.2, 25.2, 9.8, 9.8, 9.8, 17.0, 17.0, 17.0],
        abs=1e-1)

    # 4 leaves are formed:
    node_ids = tree.apply(X)
    assert len(set(node_ids)) == 4

    for group_id in range(4):
        # Each consecutive group of 3 samples should land in the same leaf:
        begin_idx = 3 * group_id
        end_idx = 3 * group_id + 3
        assert len(set(node_ids[begin_idx:end_idx])) == 1


# TODO: write a test where the training X is a DataFrame and the predicting X has a different column order (a hedged sketch follows below).
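
Below is a hedged sketch of the test that TODO describes. The two-feature frame, the column names, and the expectation that predict() aligns features by column name rather than by position are assumptions made for illustration; only the DecisionTree/MedianNode/MAESplitter usage mirrors the other examples on this page.

import pandas as pd
import pytest

from pyboretum import (
    MedianNode,
    splitters,
    DecisionTree,
)


def test_predict_with_reordered_columns():
    # Hypothetical two-feature training frame; only 'x1' carries signal.
    X_train = pd.DataFrame({'x1': [0., 1., 2., 10., 11., 12.],
                            'x2': [5., 5., 5., 5., 5., 5.]})
    y = pd.Series([1., 1., 1., 9., 9., 9.])

    tree = DecisionTree(node_class=MedianNode, min_samples_leaf=2)
    tree.fit(X_train, y, splitter=splitters.MAESplitter())

    # Same rows with the columns swapped; predictions should be unchanged
    # if predict() matches features by column name rather than by position.
    X_reordered = X_train[['x2', 'x1']]
    assert tree.predict(X_reordered)[0].tolist() == \
        pytest.approx(tree.predict(X_train)[0].tolist())
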
Example #2
import numpy as np

from pyboretum import (
    splitters,
    TrainingData,
)


def test_min_samples_leaf_affects_mae_split(training_data_1d):
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)
    mae_splitter = splitters.MAESplitter()

    # Requiring each leaf to hold half of the samples rules out the
    # otherwise-best cutpoint of 8.5 (see the next example):
    coeffs, cutpoint, cost = mae_splitter.select_feature_to_cut(
        training_data.X, training_data.Y,
        len(training_data.X) // 2)

    index = np.argwhere(coeffs != 0.0)[0][0]
    assert training_data.X_names[index] in {'x', 0}
    assert cutpoint == 5.5
Example #3
import numpy as np

from pyboretum import (
    splitters,
    TrainingData,
)


def test_mae_splitter(training_data_1d):
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)
    mae_splitter = splitters.MAESplitter()

    coeffs, cutpoint, cost = mae_splitter.select_feature_to_cut(
        training_data.X, training_data.Y, 2)

    index = np.argwhere(coeffs != 0.0)[0][0]
    assert training_data.X_names[index] in {'x', 0}
    # Brute force calculation reveals 8.5 results in best MAE reduction:
    assert cutpoint == 8.5
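
The brute-force calculation behind that comment is not shown in this excerpt. The sketch below is one way to reproduce it with plain numpy, scoring every candidate cutpoint on a single feature by the summed absolute deviation of each side from its own median (the quantity a median-predicting leaf minimizes); the real MAESplitter may normalize or weight its cost differently.

import numpy as np


def brute_force_mae_cutpoint(x, y, min_samples_leaf=1):
    """Score every midpoint between consecutive sorted x values."""
    order = np.argsort(x)
    x_sorted = np.asarray(x, dtype=float)[order]
    y_sorted = np.asarray(y, dtype=float)[order]
    n = len(x_sorted)

    def abs_dev(values):
        # Total absolute deviation from the median of the group.
        return np.abs(values - np.median(values)).sum()

    best_cut, best_cost = None, float('inf')
    for i in range(min_samples_leaf, n - min_samples_leaf + 1):
        if x_sorted[i - 1] == x_sorted[i]:
            continue  # no cutpoint can separate identical feature values
        cost = abs_dev(y_sorted[:i]) + abs_dev(y_sorted[i:])
        if cost < best_cost:
            best_cut = (x_sorted[i - 1] + x_sorted[i]) / 2
            best_cost = cost
    return best_cut, best_cost

Calling this with min_samples_leaf=2 mirrors the arguments used in this test; calling it with min_samples_leaf equal to the sample count leaves the loop empty and returns (None, inf), the stopping behavior asserted in Example #5.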
Example #4
import pytest
import pandas as pd

from pyboretum import (
    MeanNode,
    MedianNode,
    splitters,  # MAE and MSE splitters
    DecisionTree,
)


@pytest.mark.parametrize('splitter, node_class', [
    (splitters.MSESplitter(), MeanNode),
    (splitters.MAESplitter(), MedianNode),
])
def test_with_one_sample_nodes(splitter, node_class, training_data_1d):
    X, y = training_data_1d

    tree = DecisionTree(node_class=node_class, min_samples_leaf=1)
    tree.fit(X, y, splitter=splitter)

    # All y should form their own leaves:
    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        assert list(preds.index) == list(X.index)
        if isinstance(y, pd.DataFrame):
            assert list(preds.columns) == list(y.columns)

        preds = preds[0].tolist()

    assert preds == pytest.approx(y.tolist(), abs=1e-1)
Example #5
import pytest

from pyboretum import (
    splitters,
    TrainingData,
)


@pytest.mark.parametrize('splitter', [
    splitters.MSESplitter(),
    splitters.MAESplitter(),
])
def test_min_samples_leaf_can_stop_splitting(splitter, training_data_1d):
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)

    feature, cutpoint, cost = splitter.select_feature_to_cut(
        training_data.X, training_data.Y, len(training_data.X))

    assert feature is None
    assert cutpoint is None
    assert cost == float('inf')
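
None of these tests show the training_data_1d fixture itself. Purely as a hypothetical illustration of its shape, consistent with the twelve predictions in groups of three and the 'x'/0 feature-name check above, a conftest.py fixture might look like the sketch below; the real fixture's values and its parametrization over numpy vs. pandas inputs may differ, and these values may or may not reproduce the exact cutpoints (5.5, 8.5) asserted above.

import numpy as np
import pandas as pd
import pytest


@pytest.fixture(params=['numpy', 'pandas'])
def training_data_1d(request):
    # Twelve samples along a single feature, in four clusters of three.
    x = np.arange(12, dtype=float)
    y = np.array([15.0, 15.1, 15.2, 25.1, 25.2, 25.3,
                  9.7, 9.8, 9.9, 16.9, 17.0, 17.1])
    if request.param == 'pandas':
        return pd.DataFrame({'x': x}), pd.Series(y)
    return x.reshape(-1, 1), y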