Example #1
def test_dimension_location():
    """
    Test dimension and location of split.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    X[:, 1] *= 100
    y = rng.randn(100)

    mtr = MondrianTreeRegressor(random_state=0, max_depth=1)
    n = 1000

    features = []
    thresholds = []
    for random_state in np.arange(n):
        mtr.set_params(random_state=random_state).fit(X, y)
        features.append(mtr.tree_.feature[0])
        thresholds.append(mtr.tree_.threshold[0])

    # Check that the empirical split-dimension frequencies converge to the
    # actual probabilities p of the Bernoulli, which are proportional to the
    # feature ranges.
    diff = np.max(X, axis=0) - np.min(X, axis=0)
    p_act = diff / np.sum(diff)
    features = np.array(features)
    thresholds = np.array(thresholds)
    counts = np.bincount(features)
    p_sim = counts / np.sum(counts)
    assert_array_almost_equal(p_act, p_sim, 2)

    # Check that the mean split location converges to (u + l) / 2, where
    # u and l are the upper and lower bounds of the feature: the location
    # is drawn uniformly from [l, u].
    u = np.max(X, axis=0)[-1]
    l = np.min(X, axis=0)[-1]
    thresh_sim = np.mean(thresholds[features == 1])
    thresh_act = (u + l) / 2.0
    assert_array_almost_equal(thresh_act, thresh_sim, 2)
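As a hedged illustration of what the test above converges to, here is a
minimal sketch (plain numpy, not the library's internals) of the Mondrian
split rule: the split dimension is drawn with probability proportional to
each feature's range, and the split location is uniform on that feature's
[min, max], so its mean is (l + u) / 2.

import numpy as np

def sample_mondrian_split(X, rng):
    # Dimension ~ Categorical(p), with p proportional to the feature ranges.
    l, u = np.min(X, axis=0), np.max(X, axis=0)
    p = (u - l) / np.sum(u - l)
    feature = rng.choice(len(p), p=p)
    # Location ~ Uniform(l, u) on the chosen feature.
    threshold = rng.uniform(l[feature], u[feature])
    return feature, threshold

rng = np.random.RandomState(0)
X = rng.rand(100, 2)
X[:, 1] *= 100
features = [sample_mondrian_split(X, rng)[0] for _ in range(10000)]
print(np.bincount(features) / 10000.0)  # close to p, i.e. roughly [0.01, 0.99]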
Example #2
def test_min_samples_split():
    iris = load_iris()
    X, y = iris.data, iris.target
    mtr = MondrianTreeRegressor(min_samples_split=10, random_state=0)
    mtr.fit(X, y)
    n_node_samples = mtr.tree_.n_node_samples[mtr.tree_.children_left != -1]
    # Every internal (split) node must contain at least min_samples_split samples.
    assert_greater(np.min(n_node_samples) + 1, 10)
Example #3
def test_tree_attributes():
    rng = np.random.RandomState(0)
    X = rng.randn(20, 5)
    y = np.sum(X[:, :4], axis=1)
    mr = MondrianTreeRegressor(random_state=0)
    mr.fit(X, y)
    check_tree_attributes(X, y, 0, mr.tree_)
Example #4
def test_boston():
    """Consistency on boston house prices"""
    mtr = MondrianTreeRegressor(random_state=0)
    boston = load_boston()
    X, y = boston.data, boston.target
    mtr.fit(X, y)
    score = mean_squared_error(mtr.predict(X), y)
    assert_less(score, 1, "Failed with score = {0}".format(score))
Example #5
def test_tree_predict():
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [-1, -1, -1, 1, 1, 1]
    T = [[-1, -1], [2, 2], [3, 2]]

    # This test depends on the random state, since the feature and the
    # threshold selected at every split are independent of the labels.
    mtr = MondrianTreeRegressor(max_depth=1, random_state=0)
    mtr.fit(X, y)
    mtr_tree = mtr.tree_
    cand_feature = mtr_tree.feature[0]
    cand_thresh = mtr_tree.threshold[0]
    assert_almost_equal(cand_thresh, -0.38669141)
    assert_almost_equal(cand_feature, 0.0)

    # Close to (1.0 / np.sum(np.max(X, axis=0) - np.min(X, axis=0)))
    assert_almost_equal(mtr_tree.tau[0], 0.07112633)

    # For [-1, -1]:
    # P_not_separated = 1.0
    # Root:
    # eta_root = 0.0 (inside the bounding box of the root)
    # P_root = 1 - exp(0.0) = 0.0
    # weight_root = P_root
    # mean_root = 0.0
    # Leaf:
    # P_not_separated = 1.0 * (1 - 0.0) = 1.0
    # weight_leaf = P_not_separated = 1.0
    # mean_leaf = -1.0
    # prediction = weight_root * mean_root + weight_leaf * mean_leaf = -1.0

    # variance = (weight_root * (var_root + mean_root**2) +
    #             weight_leaf * (var_leaf + mean_leaf**2)) - mean**2
    # This reduces to weight_leaf * mean_leaf**2 - mean**2
    # = 1.0 * 1.0 - (-1.0)**2 = 0.0

    # Similarly for [2, 2]:
    # prediction = 0.0 + 1.0 = 1.0
    # Variance reduces to zero

    # For [3, 2]
    # P_not_separated = 1.0
    # Root:
    # Delta_root = 0.07112633
    # eta_root = 1.0
    # weight_root = 1 - exp(-0.07112633) = 0.0686
    # Leaf:
    # weight_leaf = P_not_separated = (1 - 0.0686) = 0.93134421
    # prediction = weight_root * 0.0 + weight_leaf * 1.0 = weight_leaf

    # variance = (weight_root * (var_root + mean_root**2) +
    #             weight_leaf * (var_leaf + mean_leaf**2)) - mean**2
    # = 0.0686 * (1 + 0) + 0.93134 * (0 + 1) - 0.93134421**2 = 0.132597

    T_predict, T_std = mtr.predict(T, return_std=True)
    assert_array_almost_equal(T_predict, [-1.0, 1.0, 0.93134421])
    assert_array_almost_equal(T_std, np.sqrt([0.0, 0.0, 0.132597]))
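To make the comment arithmetic above concrete, here is a minimal sketch of
the mixture computation for the test point [3, 2], using the quantities from
the comments (plain numpy, not the library's internals):

import numpy as np

tau_root = 0.07112633                        # time of the root split
eta_root = 1.0                               # distance of [3, 2] outside the root's box
w_root = 1.0 - np.exp(-tau_root * eta_root)  # P(separated at the root)
w_leaf = 1.0 - w_root                        # P(falls through to the leaf)
mean_root, var_root = 0.0, 1.0               # stats of y = [-1, ..., 1] at the root
mean_leaf, var_leaf = 1.0, 0.0               # pure positive leaf
mean = w_root * mean_root + w_leaf * mean_leaf
var = (w_root * (var_root + mean_root ** 2) +
       w_leaf * (var_leaf + mean_leaf ** 2)) - mean ** 2
print(mean, var)  # ~0.93134421, ~0.132597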
Example #6
def test_std_positive():
    """Sometimes variance can be slightly negative due to numerical errors."""
    X = np.linspace(-np.pi, np.pi, 20)
    y = 2 * np.sin(X)
    X_train = np.reshape(X, (-1, 1))
    mr = MondrianTreeRegressor(random_state=0)
    mr.fit(X_train, y)

    X_test = np.array([[2.87878788], [2.97979798], [3.08080808]])
    _, y_std = mr.predict(X_test, return_std=True)
    assert_false(np.any(np.isnan(y_std)))
    assert_false(np.any(np.isinf(y_std)))
Example #7
def test_tau():
    """
    Test time of split for the root.
    """
    X, y = make_regression(random_state=0, n_features=10)
    rate = np.sum(np.max(X, axis=0) - np.min(X, axis=0))
    mtr = MondrianTreeRegressor(random_state=0, max_depth=1)

    taus = []
    for random_state in np.arange(100):
        mtr.set_params(random_state=random_state).fit(X, y)
        taus.append(mtr.tree_.tau[0])
    assert_almost_equal(np.mean(taus), 1.0 / rate, 2)
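The assertion above relies on the Mondrian process: the root's split time
tau is exponentially distributed with rate equal to the sum of the feature
ranges, so its mean is 1 / rate. A quick sanity check of that claim with
plain numpy (not the library's internals):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 10)
rate = np.sum(np.max(X, axis=0) - np.min(X, axis=0))
taus = rng.exponential(scale=1.0 / rate, size=100000)
print(np.mean(taus), 1.0 / rate)  # the two agree closely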
Example #8
def test_weighted_decision_path_regression():
    X_train, X_test, y_train, y_test = load_scaled_boston()
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.fit(X_train, y_train)
    check_weighted_decision_path_regression(mtr, X_test)
    mtr.partial_fit(X_train, y_train)
    check_weighted_decision_path_regression(mtr, X_test)
Example #9
def test_weighted_decision_path_test_regression():
    X_train, X_test, y_train, y_test = load_scaled_boston()
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.fit(X_train, y_train)
    weights = mtr.weighted_decision_path(X_test)
    node_means = mtr.tree_.mean
    node_variances = mtr.tree_.variance
    variances1 = []
    means1 = []

    for startptr, endptr in zip(weights.indptr[:-1], weights.indptr[1:]):
        curr_nodes = weights.indices[startptr:endptr]
        curr_weights = weights.data[startptr:endptr]
        curr_means = node_means[curr_nodes]
        curr_var = node_variances[curr_nodes]

        means1.append(np.sum(curr_weights * curr_means))
        variances1.append(np.sum(curr_weights * (curr_var + curr_means**2)))

    means1 = np.array(means1)
    variances1 = np.array(variances1)
    variances1 -= means1**2
    means2, std2 = mtr.predict(X_test, return_std=True)
    assert_array_almost_equal(means1, means2, 5)
    assert_array_almost_equal(variances1, std2**2, 3)
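The loop above applies the standard mixture identity: with node weights w_i
summing to 1, mean = sum(w_i * m_i) and var = sum(w_i * (v_i + m_i**2)) -
mean**2 (the law of total variance). A toy check with made-up numbers:

import numpy as np

w = np.array([0.3, 0.7])   # hypothetical node weights (sum to 1)
m = np.array([1.0, -1.0])  # hypothetical node means
v = np.array([0.5, 0.25])  # hypothetical node variances
mean = np.sum(w * m)
var = np.sum(w * (v + m ** 2)) - mean ** 2
print(mean, var)  # -0.4, 1.165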
Example #10
def test_node_weights():
    """
    Test the implementation of node_weights.
    """
    boston = load_boston()
    X, y = boston.data, boston.target
    n_train = 100
    n_test = 100
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[-n_test:], y[-n_test:]

    minmax = MinMaxScaler()
    X_train = minmax.fit_transform(X_train)
    X_test = minmax.transform(X_test)

    # Test that when all samples are in the training data, all the weight
    # is concentrated at the corresponding leaf.
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.fit(X_train, y_train)
    leaf_nodes = mtr.apply(X_train)
    weights_sparse = mtr.weighted_decision_path(X_train)
    assert_array_equal(weights_sparse.data, np.ones(X_train.shape[0]))
    assert_array_equal(weights_sparse.indices, leaf_nodes)
    assert_array_equal(weights_sparse.indptr, np.arange(n_train + 1))

    # Test that prediction via weighted_decision_path gives results similar
    # to those from the predict method.
    weights = mtr.weighted_decision_path(X_test)
    node_means = mtr.tree_.mean
    node_variances = mtr.tree_.variance
    variances1 = []
    means1 = []

    for startptr, endptr in zip(weights.indptr[:-1], weights.indptr[1:]):
        curr_nodes = weights.indices[startptr:endptr]
        curr_weights = weights.data[startptr:endptr]
        curr_means = node_means[curr_nodes]
        curr_var = node_variances[curr_nodes]

        means1.append(np.sum(curr_weights * curr_means))
        variances1.append(np.sum(curr_weights * (curr_var + curr_means**2)))

    means1 = np.array(means1)
    variances1 = np.array(variances1)
    variances1 -= means1**2
    means2, std2 = mtr.predict(X_test, return_std=True)
    assert_array_almost_equal(means1, means2, 5)
    assert_array_almost_equal(variances1, std2**2, 3)
Example #11
def test_mean_std_reg_convergence():
    X_train, _, y_train, _ = load_scaled_boston()
    mr = MondrianTreeRegressor(random_state=0)
    mr.fit(X_train, y_train)
    check_mean_std_reg_convergence(mr, X_train, y_train)

    n_s = len(X_train) // 2
    mr.partial_fit(X_train[:n_s], y_train[:n_s])
    mr.partial_fit(X_train[n_s:], y_train[n_s:])
    check_mean_std_reg_convergence(mr, X_train, y_train)
Example #12
def test_numerical_stability():
    X = np.array([[152.08097839, 140.40744019, 129.75102234, 159.90493774],
                  [142.50700378, 135.81935120, 117.82884979, 162.75781250],
                  [127.28772736, 140.40744019, 129.75102234, 159.90493774],
                  [132.37025452, 143.71923828, 138.35694885, 157.84558105],
                  [103.10237122, 143.71928406, 138.35696411, 157.84559631],
                  [127.71276855, 143.71923828, 138.35694885, 157.84558105],
                  [120.91514587, 140.40744019, 129.75102234, 159.90493774]])

    y = np.array(
        [1., 0.70209277, 0.53896582, 0., 0.90914464, 0.48026916, 0.49622521])

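    # Turn floating-point errors (overflow, underflow, invalid, divide)
    # into exceptions so any numerical instability fails the test.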
    with np.errstate(all="raise"):
        mtr = MondrianTreeRegressor(random_state=0)
        mtr.fit(X, y)
        mtr.fit(X, -y)
        mtr.fit(-X, y)
        mtr.fit(-X, -y)
Example #13
def test_mean_std_reg_convergence():
    X_train, _, y_train, _ = load_scaled_boston()
    mr = MondrianTreeRegressor(random_state=0)
    mr.fit(X_train, y_train)

    # For points in the training data, when the tree is grown to full depth,
    # the mean should converge to the actual target value and the variance
    # should converge to 0.0.
    mean, std = mr.predict(X_train, return_std=True)
    assert_array_almost_equal(mean, y_train, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points far away from the training data, the prediction should
    # converge to the empirical mean and variance of y. X is scaled between
    # -1.0 and 1.0, so +/-20.0 is far outside the training range.
    X_inf = np.vstack(
        (20.0 * np.ones(X_train.shape[1]), -20.0 * np.ones(X_train.shape[1])))
    inf_mean, inf_std = mr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y_train.mean(), 1)
    assert_array_almost_equal(inf_std, y_train.std(), 2)
Example #14
def test_mean_std():
    boston = load_boston()
    X, y = boston.data, boston.target
    X = MinMaxScaler().fit_transform(X)
    mr = MondrianTreeRegressor(random_state=0)
    mr.fit(X, y)

    # For points in the training data, the mean should converge to the
    # actual target value and the variance should converge to 0.0.
    mean, std = mr.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points far away from the training data, the prediction should
    # converge to the empirical mean and variance of y. X is min-max scaled
    # to [0.0, 1.0], so +/-20.0 is far outside the training range.
    X_inf = np.vstack(
        (20.0 * np.ones(X.shape[1]), -20.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = mr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
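Why far-away points recover the prior: the probability of being separated at
the root is 1 - exp(-tau_root * eta) (see test_tree_predict above), and eta
grows with the distance from the training bounding box, so the root's weight
tends to 1 and the prediction tends to the empirical mean and variance of y.
A quick numerical illustration, assuming a hypothetical tau_root of 0.07:

import numpy as np

tau_root = 0.07                             # hypothetical root split time
eta = np.array([1.0, 10.0, 100.0, 1000.0])  # increasing distance from the box
print(1.0 - np.exp(-tau_root * eta))        # root weight -> 1.0 as eta grows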
Example #15
import numpy as np

from sklearn.base import clone
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_true

from skgarden.mondrian import MondrianTreeClassifier
from skgarden.mondrian import MondrianTreeRegressor

estimators = [MondrianTreeRegressor(random_state=0),
              MondrianTreeClassifier(random_state=0)]


def test_tree_predict():
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [-1, -1, -1, 1, 1, 1]
    T = [[-1, -1], [2, 2], [3, 2]]

    # This test depends on the random state, since the feature and the
    # threshold selected at every split are independent of the labels.
    for est_true in estimators:
        est = clone(est_true)
        est.set_params(random_state=0, max_depth=1)
        est.fit(X, y)
Example #16
import numpy as np

from sklearn.base import clone
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from skgarden.utils.testing import assert_array_equal
from skgarden.utils.testing import assert_array_almost_equal
from skgarden.utils.testing import assert_almost_equal
from skgarden.utils.testing import assert_equal
from skgarden.utils.testing import assert_false
from skgarden.utils.testing import assert_less
from skgarden.utils.testing import assert_true

from skgarden.mondrian import MondrianTreeClassifier
from skgarden.mondrian import MondrianTreeRegressor

estimators = [
    MondrianTreeRegressor(random_state=0),
    MondrianTreeClassifier(random_state=0)
]


def test_tree_predict():
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [-1, -1, -1, 1, 1, 1]
    T = [[-1, -1], [2, 2], [3, 2]]

    # This test depends on the random state, since the feature and the
    # threshold selected at every split are independent of the labels.
    for est_true in estimators:
        est = clone(est_true)
        est.set_params(random_state=0, max_depth=1)
Example #17
def test_pure_set():
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y = [1, 1, 1, 1, 1, 1]
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.fit(X, y)
    assert_array_almost_equal(mtr.predict(X), y)
Example #18
def test_array_repr():
    X = np.arange(10000)[:, np.newaxis]
    y = np.arange(10000)
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.fit(X, y)