Example #1
def test_apply_until_unbalanced(unbalanced_tree):
    X, _, tree = unbalanced_tree

    # all samples are in the root
    expected = np.ones((9, 1))
    root = tree_utils.apply_until(tree, X, depth=0).toarray()
    np.testing.assert_allclose(root, expected)

    # three samples go left, the rest go right
    expected = np.array([[1, 0],
                         [1, 0],
                         [1, 0],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1]])
    depth_one = tree_utils.apply_until(tree, X, depth=1).toarray()
    np.testing.assert_allclose(depth_one, expected)

    # the remaining six are split at the next level
    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    depth_two = tree_utils.apply_until(tree, X, depth=2).toarray()
    np.testing.assert_allclose(depth_two, expected)
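These tests rely on an `unbalanced_tree` pytest fixture that is not part of
this listing. A minimal sketch of what it might look like, assuming nine
one-dimensional samples and three classes, so that a greedy depth-2 tree
peels off three samples at depth 1 and splits the remaining six at depth 2
(the data, labels, and hyperparameters below are assumptions, not the
original fixture):

import numpy as np
import pytest
from sklearn.tree import DecisionTreeClassifier

@pytest.fixture
def unbalanced_tree():
    # Nine samples on a line, three classes of three samples each.
    X = np.arange(9, dtype=np.float64).reshape(-1, 1)
    y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Greedy splitting first isolates the three class-0 samples (a leaf
    # at depth 1), then separates the remaining six at depth 2.
    tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
    return X, y, tree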
Example #2
def test_node_similarity_XY_long_similarity(unbalanced_tree):
    X, _, tree = unbalanced_tree
    Y = X[:2]

    nodes_X = tree_utils.apply_until(tree, X, depth=2)
    nodes_Y = tree_utils.apply_until(tree, Y, depth=2)

    S_expected = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0, 0, 0, 0]]).T
    S = tree_utils.node_similarity(nodes_X, nodes_Y)

    assert S.shape == (len(X), len(Y))
    np.testing.assert_allclose(S, S_expected)
Example #3
def test_apply_until_balanced(balanced_tree):
    X, _, tree = balanced_tree

    # all samples are in the root
    expected = np.ones((6, 1))
    root = tree_utils.apply_until(tree, X, depth=0).toarray()
    np.testing.assert_allclose(root, expected)

    # three samples go left and three samples go right
    expected = np.array([[1, 0],
                         [1, 0],
                         [1, 0],
                         [0, 1],
                         [0, 1],
                         [0, 1]])
    depth_one = tree_utils.apply_until(tree, X, depth=1).toarray()
    np.testing.assert_allclose(depth_one, expected)
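The `balanced_tree` fixture is likewise not shown; presumably it is a
depth-1 tree over six samples split evenly. A sketch along the same lines
(again an assumed reconstruction, reusing the imports from the sketch
above):

@pytest.fixture
def balanced_tree():
    # Six samples, two classes of three; one split separates them evenly.
    X = np.arange(6, dtype=np.float64).reshape(-1, 1)
    y = np.array([0, 0, 0, 1, 1, 1])
    tree = DecisionTreeClassifier(max_depth=1, random_state=0).fit(X, y)
    return X, y, tree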
Example #4
def test_apply_until_does_not_drop_columns(unbalanced_tree):
    """Test that if X does not occupy all nodes the indicator matrix
    still includes them."""
    X, _, tree = unbalanced_tree

    expected = np.array([[1, 0, 0],
                         [1, 0, 0]])
    depth_two = tree_utils.apply_until(tree, X[:2], depth=2).toarray()
    np.testing.assert_allclose(depth_two, expected)
Example #5
def test_node_similarity_balanced(balanced_tree):
    X, _, tree = balanced_tree

    # this is a balanced tree so targets are grouped together
    S_expected = np.array([[1, 1, 1, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0],
                           [0, 0, 0, 1, 1, 1],
                           [0, 0, 0, 1, 1, 1],
                           [0, 0, 0, 1, 1, 1]])
    node_indicators = tree_utils.apply_until(tree, X, depth=-1)
    S = tree_utils.node_similarity(node_indicators)
    np.testing.assert_allclose(S, S_expected)
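The block structure of S_expected follows directly from the one-hot node
indicators: two samples are similar exactly when they occupy the same node,
so node_similarity presumably reduces to an indicator product. A hedged
equivalence check (assuming apply_until returns a scipy.sparse one-hot
matrix):

# (N @ N.T)[i, j] is 1 exactly when samples i and j share a node,
# reproducing S_expected above.
S_dot = (node_indicators @ node_indicators.T).toarray()
np.testing.assert_allclose(S_dot, S_expected)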
Example #6
def test_apply_until_negative_depth(unbalanced_tree):
    """Test depth == -1 returns the leaf nodes"""
    X, _, tree = unbalanced_tree

    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    leaves = tree_utils.apply_until(tree, X, depth=-1).toarray()
    np.testing.assert_allclose(leaves, expected)
Example #7
def test_apply_until_too_high_depth(unbalanced_tree):
    """Test a depth larger than max_depth returns leaf nodes."""
    X, _, tree = unbalanced_tree

    # any depth beyond max_depth yields the leaf partition
    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    leaves = tree_utils.apply_until(tree, X, depth=100).toarray()
    np.testing.assert_allclose(leaves, expected)
Example #8
import numpy as np
from sklearn.utils import check_array

# tree_utils and sample_depths are assumed to be helpers importable from
# the surrounding package; their imports are omitted in the original listing.


def random_partitions_kernel(forest,
                             X,
                             Y=None,
                             tree_depths='random',
                             random_state=123):
    """Random Partition Kernel induced by an ensemble of decision trees.

    A random partition kernel is a kernel function induced by a distribution
    over partitions (random partitions) of a dataset. Since an ensemble of
    trees such as a random forest partitions a dataset into groups
    (the tree nodes), these models can be thought of as random partition
    generators and so induce a kernel function.

    By repeatedly cutting a dataset into random partitions, we would expect
    data points that are similar to each other to be grouped together more
    often than dissimilar ones. Likewise, nodes in a decision tree should
    contain similar data points. To sample the whole hierarchical structure
    of the forest, a depth is chosen at random for each tree and the
    resulting partitions are added up. The kernel is defined as [1]:

                  number of trees in which x_i and x_j occur in the same node
        K(i, j) = -----------------------------------------------------------
                            total number of trees in the ensemble

    For example, if x_i and x_j share a node in 70 of 100 trees,
    K(i, j) = 0.7.

    Parameters
    ----------
    forest: A class instance derived from `sklearn.ensemble.BaseForest`.
        The forest from which the kernel is calculated.

    X: array-like, shape = [n_samples_x, n_features]
       The data to compute the kernel on.

    Y: array-like, shape = [n_samples_y, n_features], optional (default=None)
       A second dataset. If given, the kernel is evaluated between the
       samples of X and the samples of Y; otherwise X is compared with
       itself.

    tree_depths: list or str, optional (default='random')
        A list of depths, one per tree. If `tree_depths='random'`, the
        depths are sampled from a discrete uniform distribution between
        1 and max_depth.

    random_state : int, RandomState instance or None, optional (default=123)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    K : array-like, shape = [n_samples_x, n_samples_y]
        A kernel matrix K such that K_{i, j} is the similarity between
        the ith sample of X and the jth sample of Y (or of X itself,
        when Y is None).

    References
    ----------
    .. [1] A. Davis, Z. Ghahramani, "The Random Forest Kernel and creating
           other kernels for big data from random partitions",
           CoRR, 2014.
    """
    X = check_array(X, accept_sparse='csc')
    if Y is not None:
        Y = check_array(Y, accept_sparse='csr')

    if tree_depths == 'random':
        tree_depths = sample_depths(forest, random_state=random_state)

    n_samples_x = X.shape[0]
    n_samples_y = Y.shape[0] if Y is not None else n_samples_x
    kernel = np.zeros(shape=(n_samples_x, n_samples_y))
    for tree_idx, tree in enumerate(forest.estimators_):
        node_indicator_X = tree_utils.apply_until(tree,
                                                  X,
                                                  depth=tree_depths[tree_idx])

        if Y is not None:
            node_indicator_Y = tree_utils.apply_until(
                tree, Y, depth=tree_depths[tree_idx])
        else:
            node_indicator_Y = node_indicator_X

        kernel += tree_utils.node_similarity(node_indicator_X,
                                             node_indicator_Y)

    return kernel / len(forest.estimators_)
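
A minimal usage sketch for the kernel, assuming random_partitions_kernel is
importable alongside the helpers it depends on (the dataset and forest
settings here are illustrative, not from the original source):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=50, n_features=4, random_state=0)
forest = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

# Self-similarity matrix: every sample always shares a node with itself,
# so the diagonal of K is exactly 1.
K = random_partitions_kernel(forest, X, random_state=123)
assert K.shape == (50, 50)
assert np.allclose(np.diag(K), 1.0)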