def make_decision_split_node(node, feature_indices):
    """Split a decision-tree node on the feature/threshold pair that
    minimizes the total (size-weighted) Gini impurity of the two children.

    :param node: Node
        Node to be split; must provide ``data`` of shape (n, D) and
        non-negative integer ``labels`` (required by ``np.bincount``).
    :param feature_indices: array-like of shape (D_try, )
        Contains feature indices to be considered in the present split
    :return: tuple
        Tuple of left and right children nodes (to be placed on the stack)
    """
    n, D = node.data.shape

    # Find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = float("inf")
    j_min, t_min = 0, 0
    for j in feature_indices:
        column = node.data[:, j]
        # Candidate thresholds are midpoints between consecutive *distinct*
        # feature values, so duplicates must be removed first.
        dj = np.sort(np.unique(column))
        tj = (dj[1:] + dj[:-1]) / 2

        # Compute Gini-impurity of resulting children nodes for each candidate threshold.
        # Midpoint thresholds guarantee both children are non-empty (nl >= 1, nr >= 1),
        # so the divisions below are safe.
        for t in tj:
            left_mask = column <= t
            nl = int(left_mask.sum())  # reuse the mask instead of recomputing the comparison
            ll = node.labels[left_mask]
            # n_k * (1 - sum_c p_c^2): size-weighted Gini impurity of the left child.
            el = nl * (1 - np.sum(np.square(np.bincount(ll) / nl)))

            nr = n - nl
            lr = node.labels[~left_mask]
            er = nr * (1 - np.sum(np.square(np.bincount(lr) / nr)))

            if el + er < e_min:
                e_min = el + er
                j_min = j
                t_min = t

    # Create children and initialize them with the data subsets and labels
    # according to the optimal split found above.
    left = Node()
    right = Node()
    split_mask = node.data[:, j_min] <= t_min  # compute the optimal split mask once
    left.data = node.data[split_mask, :]
    left.labels = node.labels[split_mask]
    right.data = node.data[~split_mask, :]
    right.labels = node.labels[~split_mask]

    # Turn 'node' into an interior node that routes queries to its children.
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    return left, right
def make_density_split_node(node, N, feature_indices):
    """Selects dimension and threshold where node is to be split up, by
    minimizing the leave-one-out (LOO) error of the resulting
    piecewise-constant density estimate.

    :param node: Node
        Node to be split; provides ``data`` of shape (n, D) and
        ``box`` = (m, M), the lower/upper corners of its bounding box.
    :param N: int
        Number of training instances
    :param feature_indices: array-like of shape (D_try, )
        Contains feature indices to be considered in the present split
    :return: tuple
        Tuple of left and right children nodes (to be placed on the stack)
    """
    n, D = node.data.shape
    m, M = node.box
    v = np.prod(M - m)
    if v <= 0:
        raise ValueError("Zero volume (should not happen)")

    # Find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = float("inf")
    j_min, t_min = None, None
    for j in feature_indices:
        # Duplicate feature values have to be removed because candidate thresholds are
        # the midpoints of consecutive feature values, not the feature value itself
        dj = np.sort(np.unique(node.data[:, j]))
        # Compute candidate thresholds
        tj = (dj[1:] + dj[:-1]) / 2

        # Compute Leave-One-Out error of resulting children nodes for each candidate threshold
        for t in tj:
            # Compute number of instances in left and right children
            nl = np.sum(node.data[:, j] <= t)
            nr = n - nl

            # Fractional volumes of the children. The left child spans
            # [m[j], t] along dimension j, so its share of the parent box is
            # (t - m[j]) / (M[j] - m[j]).
            # BUGFIX: the previous code used t / (M[j] - m[j]), which is only
            # correct when the box starts at the origin (m[j] == 0).
            # Actual volumes differ from these fractions by the constant
            # factor v, which scales both LOO errors equally and therefore
            # does not change the argmin.
            vl = (t - m[j]) / (M[j] - m[j])
            vr = 1.0 - vl
            if vl == 0 or vr == 0:
                continue

            # Compute LOO errors of the children:
            #   err_k = n_k/(N*v_k) * (n_k/N - 2*(n_k - 1)/(N - 1))
            # BUGFIX: the left-one-out density seen by each of the n_k points
            # is (n_k - 1)/((N - 1) * v_k) -- the denominator is N - 1 (one
            # instance removed from the whole training set), not n - 1.
            el = (nl / (N * vl)) * (nl / N - 2.0 * ((nl - 1) / (N - 1)))
            er = (nr / (N * vr)) * (nr / N - 2.0 * ((nr - 1) / (N - 1)))

            # Choose best threshold that minimizes sum of LOO error
            loo_error = el + er
            if loo_error < e_min:
                e_min = loo_error
                j_min = j
                t_min = t

    # Create children
    left = Node()
    right = Node()

    # Initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above. (If no candidate threshold
    # existed at all, j_min is still None and the indexing below raises.)
    split_mask = node.data[:, j_min] <= t_min  # compute the optimal split mask once
    left.data = node.data[split_mask, :]
    left.box = m.copy(), M.copy()
    left.box[1][j_min] = t_min  # left child's upper corner moves down to t
    right.data = node.data[~split_mask, :]
    right.box = m.copy(), M.copy()
    right.box[0][j_min] = t_min  # right child's lower corner moves up to t

    # Turn 'node' into an interior node that routes queries to its children.
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    return left, right