Ejemplo n.º 1
0
def make_decision_split_node(node, feature_indices):
    """
    Split a node on the (feature, threshold) pair with the lowest weighted
    Gini impurity.

    For every feature in 'feature_indices' the candidate thresholds are the
    midpoints between consecutive distinct values of that feature in
    'node.data'. A candidate is scored by summing, over both children,
    (child size) * (Gini impurity of the child's labels). The first candidate
    that reaches the lowest score is kept.

    The chosen split is recorded on 'node' through its 'left', 'right',
    'feature' and 'threshold' attributes. If there is no candidate at all
    (every considered feature is constant within the node), the initial
    choice of feature 0 with threshold 0 is used for the split.

    :param node: Node
        Node to be split. Its 'data' (2-D array, one row per instance) and
        'labels' attributes are read. The labels are passed to np.bincount,
        so they are expected to be non-negative integers.

    :param feature_indices: array-like of shape (D_try, )
        Contains feature indices to be considered in the present split

    :return: tuple
        Tuple of left and right children nodes (to be placed on the stack)
    """
    num_instances, _ = node.data.shape

    # Best split seen so far; the sentinel score is beaten by any real candidate.
    best_score = 1e100
    best_feature, best_threshold = 0, 0

    for feature in feature_indices:
        column = node.data[:, feature]

        # np.unique already returns the distinct values in ascending order,
        # so neighbouring entries can be averaged directly.
        distinct = np.unique(column)
        midpoints = (distinct[1:] + distinct[:-1]) / 2

        for threshold in midpoints:
            goes_left = column <= threshold

            # Size-weighted Gini impurity of the left child
            count_left = np.sum(goes_left)
            labels_left = node.labels[goes_left]
            score_left = count_left * (
                1 - np.sum(np.square(np.bincount(labels_left) / count_left))
            )

            # Size-weighted Gini impurity of the right child
            count_right = num_instances - count_left
            labels_right = node.labels[~goes_left]
            score_right = count_right * (
                1 - np.sum(np.square(np.bincount(labels_right) / count_right))
            )

            # Strict comparison: on ties the earlier candidate stays
            if score_left + score_right < best_score:
                best_score = score_left + score_right
                best_feature = feature
                best_threshold = threshold

    # Partition the instances according to the selected split
    in_left = node.data[:, best_feature] <= best_threshold
    in_right = node.data[:, best_feature] > best_threshold

    left = Node()
    right = Node()

    left.data = node.data[in_left, :]
    left.labels = node.labels[in_left]

    right.data = node.data[in_right, :]
    right.labels = node.labels[in_right]

    # Record the split on the parent
    node.left = left
    node.right = right
    node.feature = best_feature
    node.threshold = best_threshold

    return left, right
Ejemplo n.º 2
0
def make_density_split_node(node, N, feature_indices):
    """
    Split a density-tree node on the (feature, threshold) pair that minimizes
    the summed leave-one-out (LOO) error of the two children.

    For every feature in 'feature_indices' the candidate thresholds are the
    midpoints between consecutive distinct values of that feature in
    'node.data'. For a child holding n_c instances and occupying volume V_c,
    the LOO error of the piecewise-constant density estimate is

        n_c / (N * V_c) * (n_c / N - 2 * (n_c - 1) / (N - 1))

    The chosen split is recorded on 'node' through its 'left', 'right',
    'feature' and 'threshold' attributes. Each child receives its share of
    the data and a copy of the parent's bounding box cut at the threshold.

    :param node: Node
        Node to be split. Its 'data' (2-D array, one row per instance) and
        'box' attributes are read; 'box' is unpacked as (m, M), the arrays of
        lower and upper bounds of the node's bounding box.

    :param N: int
        Number of training instances

    :param feature_indices: array-like of shape (D_try, )
        Contains feature indices to be considered in the present split

    :return: tuple
        Tuple of left and right children nodes (to be placed on the stack)

    :raises ValueError:
        If the bounding box has non-positive volume, or if none of the
        considered features offers a valid threshold (e.g. all of them are
        constant within the node).
    """
    n, _ = node.data.shape
    m, M = node.box
    v = np.prod(M - m)
    if v <= 0:
        raise ValueError("Zero volume (should not happen)")

    # Find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = float("inf")
    j_min, t_min = None, None

    for j in feature_indices:
        column = node.data[:, j]
        extent = M[j] - m[j]

        # Candidate thresholds are the midpoints of consecutive distinct
        # feature values; np.unique already returns them sorted.
        dj = np.unique(column)
        tj = (dj[1:] + dj[:-1]) / 2

        for t in tj:
            # Number of instances in left and right children
            nl = np.sum(column <= t)
            nr = n - nl

            # Volumes of the children as fractions of the parent volume. The
            # left child spans [m[j], t] along feature j, so the lower bound
            # has to be subtracted. The true volumes are v * vl and v * vr;
            # the common factor v does not change which candidate is best.
            vl = (t - m[j]) / extent
            vr = 1.0 - vl

            # Skip thresholds that would give a child no (or negative) volume,
            # which can only happen if data lies on or outside the box.
            if vl <= 0 or vr <= 0:
                continue

            # LOO errors; the leave-one-out estimate is built from the other
            # N - 1 training instances, hence the (N - 1) denominator.
            el = (nl / (N * vl)) * (nl / N - 2.0 * ((nl - 1) / (N - 1)))
            er = (nr / (N * vr)) * (nr / N - 2.0 * ((nr - 1) / (N - 1)))

            # Keep the threshold with the smallest summed LOO error
            loo_error = el + er
            if loo_error < e_min:
                e_min = loo_error
                j_min = j
                t_min = t

    if j_min is None:
        raise ValueError(
            "No valid split found: the considered features offer no "
            "candidate threshold inside the node's bounding box"
        )

    # Create children
    left = Node()
    right = Node()

    # Initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above. The boxes are copies, so
    # the parent's box is left untouched.
    goes_left = node.data[:, j_min] <= t_min

    left.data = node.data[goes_left, :]
    left.box = m.copy(), M.copy()
    left.box[1][j_min] = t_min

    right.data = node.data[~goes_left, :]
    right.box = m.copy(), M.copy()
    right.box[0][j_min] = t_min

    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    return left, right