Esempio n. 1
0
def test_create_Gamma_eta_tree_more_per_classification():
    """
    test for create_Gamma_eta_tree_more, class tree - standard depth only

    Both static and random tests (random tests are more relative to structure
    than exact answers)
    """


    # random - structure output check
    # data creation
    n = 200
    min_size_leaf = 1

    X = np.random.uniform(size = (n, 510), low = -1,high = 1)
    y = 10 * np.sin(np.pi * X[:,0]*X[:,1]) + 20 * ( X[:,2] - .5)**2 +\
        10 * X[:,3] + 5 * X[:,4] + np.random.normal(size = n)

    y_cat = np.array(
                     pd.cut(y, bins = 5, labels = np.arange(5, dtype = np.int)),
                     dtype = np.int)

    y = y_cat

    num_classes = len(Counter(y_cat).keys())

    rf_class = sklearn.ensemble.RandomForestClassifier(n_estimators = 2,
                                            min_samples_leaf = min_size_leaf)
    random_forest = rf_class.fit(X = X,
                                 y = y.ravel())

    tree = random_forest.estimators_[0]

    max_depth_range = np.max(smooth_rf.depth_per_node(tree)) + 1

    G, n, ln, ld, li, fd, fi = smooth_rf.create_Gamma_eta_tree_more_per(tree)

    assert G.shape == (num_classes,
                       np.sum(tree.tree_.children_left == -1),
                       max_depth_range), \
        "Gamma returned does not have the correct shape"

    assert n.shape ==  G.shape[1:3], \
        "eta returned does not have the correct shape"

    assert np.all(n >= 0), \
        "eta returned has negative values"

    assert np.all(n[:,0] ==
        tree.tree_.weighted_n_node_samples[tree.tree_.children_left == -1]),\
        "eta structure doesn't match up with number of observes per leaf"

    # new tests
    assert ln.shape[0] == G.shape[1] and ld.shape[0] == G.shape[1] and \
        li.shape[0] == G.shape[1], \
        "leaf based outputs should have same number of leaves and Gamma"

    assert np.all(np.ceil(ln) == ln) and np.all(ln > 0), \
        "leaf counts should be  strictly positive and integers"

    assert np.all(ln ==
        tree.tree_.weighted_n_node_samples[tree.tree_.children_left == -1]), \
        "number of obs in each leaf not matching tree structure"

    assert np.all(np.ceil(ld) == ld) and np.all(ld >= 0), \
        "leaf depth should be positive and integers"

    # newest tests (fd, fi)
    assert fd.shape == G.shape[1:] and fi.shape == G.shape[1:], \
        "shapes of full depth and impurity should make shape of Gamma"

    assert np.all(fd[:,0]  == ld) and np.all(np.ceil(fd) == fd) and \
        np.all(fd >= 0), \
        "full depth shape should mirror leaf depth structure"

    assert np.all(fi[:,0] == li), \
        "full impurity (gini) should mirror leaf impurity structure"


    # static check

   # tree structure:
    # ~upper: left, lower: right~
    #                       num obs     class 1   class 2    depth
    #    |--1                   10          5       5           1
    # -0-|                       34         21      13          0
    #    |   |--3              9            9       0           2
    #    |-2-|                  24          16      8           1
    #        |   |--5         8             7       1           3
    #        |-4-|             15           7       8           2
    #            |--6         7             0       7           3


    # eta
    # (1) 10 | 24 | 0  | 0
    # (3) 9  | 15 | 10 | 0
    # (5) 8  | 7  | 9  | 10
    # (6) 7  | 8  | 9  | 10


    # Gamma (class 1)
    # (1) 5 | 9+7 = 16| 0 | 0
    # (3) 9 | 7       | 5 | 0
    # (5) 7 | 0       | 9 | 5
    # (6) 0 | 7       | 9 | 5

    # Gamma (class 2)
    # (1) 5 | 1+7 = 8| 0 | 0
    # (3) 0 | 8      | 5 | 0
    # (5) 1 | 7      | 0 | 5
    # (6) 7 | 1      | 0 | 5

    def gini(vec):
        p = vec / vec.sum()
        return p.T @ (1-p)

    class inner_fake_tree():
        def __init__(self, nn, cl, cr, v):
            self.weighted_n_node_samples = nn
            self.children_left = cl
            self.children_right = cr
            self.value = v
            self.impurity = np.array([gini(v[i,:,:].ravel()) for i in range(v.shape[0])])

    class fake_tree():
        def __init__(self, nn, cl, cr, v):
            self.tree_ = inner_fake_tree(nn, cl, cr, v)
            self.__class__ = sklearn.tree.tree.DecisionTreeClassifier

    weighted_n_node_samples = np.array([34,10,24,9,15,8,7], dtype = np.int)
    children_left = np.array([2,-1,4,-1,6,-1,-1], dtype = np.int)
    children_right = np.array([1,-1,3,-1,5,-1,-1], dtype = np.int)
    value = np.array([[21, 13],
                      [5, 5],
                      [16, 8],
                      [9, 0],
                      [7, 8],
                      [7, 1],
                      [0, 7]], dtype = np.float).reshape((-1,1,2))

    test = fake_tree(weighted_n_node_samples,
                     children_left,
                     children_right,
                     value)

    n_leaf = 4

    g_static, n_static, ln_static, ld_static, li_static, \
        fd_static, fi_static =  \
        smooth_rf.create_Gamma_eta_tree_more_per(test)

    n_expected = np.array([[10,24,0,0],
                           [9,15,10,0],
                           [8,7,9,10],
                           [7,8,9,10]])
    g_expected = np.array([[[5,16,0,0],
                            [9,7,5,0],
                            [7,0,9,5],
                            [0,7,9,5]],
                           [[5,8,0,0],
                            [0,8,5,0],
                            [1,7,0,5],
                            [7,1,0,5]]])
    ln_expected = np.array([10,9,8,7])
    ld_expected = np.array([1,2,3,3])
    fd_expected = np.array([[1,0,0,0],
                            [2,1,0,0],
                            [3,2,1,0],
                            [3,2,1,0]])
    li_expected = np.array([gini(value[i,:,:].ravel()) for i in range(value.shape[0])])[np.array([1,3,5,6])]
    li_expected2 = np.array([gini(value[i,:,:].ravel()) for i in range(value.shape[0])])[np.array([0,2,4,4])]
    li_expected3 = np.array([gini(value[i,:,:].ravel()) for i in range(value.shape[0])])[np.array([0,0,2,2])]
    li_expected4 = np.array([gini(value[i,:,:].ravel()) for i in range(value.shape[0])])[np.array([0,0,0,0])]

    fi_expected = np.array([li_expected,li_expected2,li_expected3,li_expected4],
                           ).T

    assert np.all(g_static == g_expected), \
        "static test's Gamma failed to reproduce correct solutions"
    assert np.all(n_static == n_expected), \
        "static test's eta failed to reproduce correct solutions"
    assert np.all(ln_static == ln_expected), \
        "static test's leaf count failed to reproduce correct solutions"
    assert np.all(ld_static == ld_expected), \
        "static test's leaf depth failed to reproduce correct solutions"
    assert np.all(li_static == li_expected), \
        "static test's leaf impurity failed to reproduce correct solutions"
    assert np.all(fd_static == fd_expected), \
        "static test's full depth failed to reproduce correct solutions"
    assert np.all(fi_static == fi_expected), \
        "static test's full impurity failed to reproduce correct solutions"
Esempio n. 2
0
def test_create_Gamma_eta_forest_more_classification():
    """
    test create_Gamma_eta_forestmore, class forests - standard depth only

    compares to what is expected to be returned from create_Gamma_eta_tree -
    mostly just structurally
    """
    n = 200
    n_tree = 10
    min_size_leaf = 1

    X = np.random.uniform(size = (n, 510), low = -1,high = 1)
    y = 10 * np.sin(np.pi * X[:,0]*X[:,1]) + 20 * ( X[:,2] - .5)**2 +\
        10 * X[:,3] + 5 * X[:,4] + np.random.normal(size = n)

    y_cat = np.array(pd.cut(y, bins = 5, labels = np.arange(5, dtype = np.int)),
                     dtype = np.int)

    y = y_cat

    num_classes = len(Counter(y_cat).keys())

    rf_class = sklearn.ensemble.RandomForestClassifier(n_estimators = n_tree,
                                            min_samples_leaf = min_size_leaf)
    random_forest = rf_class.fit(X = X,
                                 y = y.ravel())

    g, n, t, ln, ld, li, fd, fi = \
        smooth_rf.create_Gamma_eta_forest_more(random_forest)

    assert g.shape[1:] == n.shape, \
        "Gamma and eta matrices are not the correct shared size"
    assert g.shape[1] == t.shape[0], \
        "the tree index vector doesn't have the correct number of observations"
    assert g.shape[0] == num_classes, \
        "Gamma matrix dimensions don't match the number of classes correctly"
    # new checks
    assert t.shape == ln.shape and t.shape == ld.shape and t.shape == li.shape,\
        "the leaf number, depth, or impurity don't have the correct dim"

    assert g.shape[1:] == fd.shape and g.shape[1:] == fi.shape, \
        "the full depth or impurity doens't have the correct dim"
    # ----


    assert np.all(
        np.array(list(dict(Counter(t)).keys())) == np.arange(n_tree)),\
        "tree index doesn't contain expected tree index values"

    for t_idx, tree in enumerate(random_forest.estimators_):
        max_depth_range = np.int(np.max(smooth_rf.depth_per_node(tree)) + 1)
        G_tree, n_tree, ln_tree, ld_tree, li_tree, fd_tree, fi_tree = \
            smooth_rf.create_Gamma_eta_tree_more_per(tree)

        assert G_tree.shape[1] == np.sum(t == t_idx), \
            "shape of single Gamma from create_Gamma_eta_tree" +\
            "does not match structure from t_idx output"

        assert np.all(G_tree == g[:,t==t_idx,:][:,:,:max_depth_range]), \
            "doesn't match create_Gamma_eta_tree function for Gamma"
        if max_depth_range != g.shape[1]:
            assert np.all(g[:,t==t_idx,][:,:,max_depth_range:] == 0), \
                "extra dimensions, based on the global forest having larger" +\
                "depth than the individual tree (num %d) in Gamma are "+\
                "non-zero" %t_idx

        assert np.all(n_tree == n[t==t_idx,:][:,:max_depth_range]), \
            "doesn't match create_Gamma_eta_tree function for eta"
        if max_depth_range != g.shape[1]:
            assert np.all(n[t==t_idx,][:,max_depth_range:] == 0), \
                "extra dimensions, based on the global forest having larger" +\
                "depth than the individual tree (num %d) in eta are "+\
                "non-zero" %t_idx

        # new checks
        assert np.all(ln_tree == ln[t==t_idx]), \
            "attributes in leaf number should match the base function"
        assert np.all(ld_tree == ld[t==t_idx]), \
            "attributes in leaf depth should match the base function"
        assert np.all(li_tree == li[t==t_idx]), \
            "attributes in leaf impurity should match the base function"
        assert np.all(ln_tree == ln[t==t_idx]), \
            "attributes in leaf number should match the base function"
        assert np.all(fd_tree == fd[t==t_idx,:][:,:max_depth_range]), \
            "attributes in full depth should match the base function"
        assert np.all(fi_tree == fi[t==t_idx,:][:,:max_depth_range]), \
            "attributes in full impurity should match the base function"
Esempio n. 3
0
def test_create_Gamma_eta_tree_more_per_regression():
    """
    test for create_Gamma_eta_tree_more_per, reg tree - standard depth only

    Both static and random tests (random tests are more relative to structure
    than exact answers)
    """


    # random - structure output check
    # data creation
    n = 200
    min_size_leaf = 1

    X = np.random.uniform(size = (n, 510), low = -1,high = 1)
    y = 10 * np.sin(np.pi * X[:,0]*X[:,1]) + 20 * ( X[:,2] - .5)**2 +\
        10 * X[:,3] + 5 * X[:,4] + np.random.normal(size = n)

    rf_class = sklearn.ensemble.RandomForestRegressor(n_estimators = 2,
                                            min_samples_leaf = min_size_leaf)
    random_forest = rf_class.fit(X = X,
                                 y = y.ravel())

    tree = random_forest.estimators_[0]

    max_depth_range = np.max(smooth_rf.depth_per_node(tree)) + 1

    G, n, ln, ld, li, fd, fi = smooth_rf.create_Gamma_eta_tree_more_per(tree)

    assert G.shape == (np.sum(tree.tree_.children_left == -1),
                       max_depth_range), \
        "Gamma returned does not have the correct shape"

    assert n.shape ==  G.shape, \
        "eta returned does not have the correct shape"

    assert np.all(n >= 0), \
        "eta returned has negative values"

    assert np.all(n[:,0] ==
        tree.tree_.weighted_n_node_samples[tree.tree_.children_left == -1]),\
        "eta structure doesn't match up with number of observes per leaf"

    # new tests (ln,ld,li)
    assert ln.shape[0] == G.shape[0] and ld.shape[0] == G.shape[0] and \
        li.shape[0] == G.shape[0], \
        "leaf based outputs should have same number of leaves and Gamma"

    assert np.all(np.ceil(ln) == ln) and np.all(ln > 0), \
        "leaf counts should be  strictly positive and integers"

    assert np.all(ln ==
        tree.tree_.weighted_n_node_samples[tree.tree_.children_left == -1]), \
        "number of obs in each leaf not matching tree structure"

    assert np.all(np.ceil(ld) == ld) and np.all(ld >= 0), \
        "leaf depth should be positive and integers"

    assert np.all(li >= - 1e-10), \
        "leaf impurity (mse) should be non-negative"

    # newest tests (fd, fi)
    assert fd.shape == G.shape and fi.shape == G.shape, \
        "shapes of full depth and impurity should make shape of Gamma"

    assert np.all(fd[:,0]  == ld) and np.all(np.ceil(fd) == fd) and \
        np.all(fd >= 0), \
        "full depth shape should mirror leaf depth structure"

    assert np.all(fi[:,0] == li) and np.all(fi >= - 1e-10), \
        "full impurity (mse) should mirror leaf impurity structure"

    # for c_idx in range(fi.shape[1] - 1):
    #     assert np.all(fi[:,c_idx] - fi[:,c_idx + 1] <= 1e-10), \
    #         "impurity should be increasing (mse)"

    # static check

   # tree structure:
    # ~upper: left, lower: right~
    #                       num obs   depth
    #    |--1                   10      1
    # -0-|                       34     0
    #    |   |--3              9        2
    #    |-2-|                  24      1
    #        |   |--5         8         3
    #        |-4-|             15       2
    #            |--6         7         3


    # eta
    # (1) 10 | 24 | 0  | 0
    # (3) 9  | 15 | 10 | 0
    # (5) 8  | 7  | 9  | 10
    # (6) 7  | 8  | 9  | 10

    # Gamma
    # (1) 10         | 18+24+28 = 70 | 0  | 0
    # (3) 9 * 2 = 18 | 24+28 = 52    | 10 | 0
    # (5) 8 * 3 = 24 | 28            | 18 | 10
    # (6) 7 * 4 = 28 | 24            | 18 | 10



    class inner_fake_tree():
        def __init__(self, nn, cl, cr, v):
            self.weighted_n_node_samples = nn
            self.children_left = cl
            self.children_right = cr
            self.value = v
            self.impurity = np.zeros(v.shape[0]) # this isn't a good test

    class fake_tree():
        def __init__(self, nn, cl, cr, v):
            self.tree_ = inner_fake_tree(nn, cl, cr, v)
            self.__class__ = sklearn.tree.tree.DecisionTreeRegressor

    weighted_n_node_samples = np.array([34,10,24,9,15,8,7], dtype = np.int)
    children_left = np.array([2,-1,4,-1,6,-1,-1], dtype = np.int)
    children_right = np.array([1,-1,3,-1,5,-1,-1], dtype = np.int)
    value = np.array([-99, 1, -99, 2, -99, 3, 4]).reshape((-1,1,1))

    test = fake_tree(weighted_n_node_samples,
                     children_left,
                     children_right,
                     value)

    n_leaf = 4

    g_static, n_static, ln_static, ld_static, li_static, \
        fd_static, fi_static = \
        smooth_rf.create_Gamma_eta_tree_more_per(test)

    n_expected = np.array([[10,24,0,0],
                           [9,15,10,0],
                           [8,7,9,10],
                           [7,8,9,10]])
    g_expected = np.array([[10,70,0,0],
                           [18,52,10,0],
                           [24,28,18,10],
                           [28,24,18,10]])
    ln_expected = n_expected[:,0]
    ld_expected = np.array([1,2,3,3])
    fd_expected = np.array([[1,0,0,0],
                            [2,1,0,0],
                            [3,2,1,0],
                            [3,2,1,0]])

    assert np.all(g_static == g_expected), \
        "static test's Gamma failed to reproduce correct solutions"
    assert np.all(n_static == n_expected), \
        "static test's eta failed to reproduce correct solutions"
    assert np.all(ln_static == ln_expected), \
        "static test's leaf count failed to reproduce correct solutions"
    assert np.all(ld_static == ld_expected), \
        "static test's leaf depth failed to reproduce correct solutions"
    assert np.all(fd_static == fd_expected), \
        "static test's full depth failed to reproduce correct solutions"