def test_list_to_discrete_rv(self):
    """Check list_to_discrete_rv against hand-computed (values, probabilities) pairs.

    Each case is ``(input_list, (expected_values, expected_probabilities))``.
    """
    cases = [
        ([0, 2, 2, 3], ([0, 2, 3], [0.25, 0.5, 0.25])),
    ]
    for arg, (expected_values, expected_probabilities) in cases:
        result_actual = list_to_discrete_rv(np.array(arg))
        # np.testing reports the differing elements on failure and rejects
        # shape mismatches, unlike assertTrue(np.all(a == b)).
        np.testing.assert_array_equal(result_actual[0], expected_values)
        # Probabilities are floats: compare with a tolerance, not with ==.
        np.testing.assert_allclose(result_actual[1], expected_probabilities)
def _build_tree_recursive_oblivious(self, tree, cur_level):
    """Process one level of the tree in oblivious mode, then recurse to the next level.

    Steps, in order:

    1. Record the sample count of every node on ``cur_level`` in
       ``tree._leaf_n_samples``.
    2. Mark the level as terminal if ``self.max_depth`` is reached or
       ``self.find_best_layer_split`` returns ``None`` (the latter also sets
       ``self._no_split_found``).
    3. If ``self.switch_criterion.should_switch(self)`` is true, set
       ``self.mode`` to ``TreeType.CART``, hand every node of the level to
       ``_build_tree_recursive_cart`` and stop.
    4. If the level is terminal, store a leaf prediction for every node and
       stop; otherwise apply the found splits and recurse to ``cur_level + 1``.

    Args:
        tree: Tree being built; mutated in place.
        cur_level: Index of the level to process.

    Raises:
        ValueError: If ``self.leaf_prediction_rule`` is neither ``'majority'``
            nor ``'distribution'`` when a classification leaf is created.
    """
    nodes_on_current_level = tree.nodes_at_level(cur_level, kind='all')

    for node_id in nodes_on_current_level:
        _, y = self._data_per_node[node_id]
        tree._leaf_n_samples[node_id] = len(y)

    leaves_reached = False
    if self.max_depth is not None and cur_level >= self.max_depth:
        if TreeBuilderObliviousCart.debug:
            TreeBuilderObliviousCart.logger.debug('Max depth reached at level {}'.format(cur_level))
        leaves_reached = True

    # The split search runs even when max depth was reached: its outcome
    # updates self._no_split_found before the switch criterion is consulted.
    best_layer_split = self.find_best_layer_split(nodes_on_current_level)
    if best_layer_split is None:
        if TreeBuilderObliviousCart.debug:
            TreeBuilderObliviousCart.logger.debug('No split found at level {}'.format(cur_level))
        leaves_reached = True
        self._no_split_found = True

    if self.switch_criterion.should_switch(self):
        if TreeBuilderObliviousCart.debug:
            TreeBuilderObliviousCart.logger.debug('Switching to CART tree at level {}'.format(cur_level))
        self.mode = TreeType.CART
        for node in nodes_on_current_level:
            node_X, node_y = self._data_per_node[node][0], self._data_per_node[node][1]
            self._build_tree_recursive_cart(tree, node, node_X, node_y)
        return

    if leaves_reached:
        # Nodes on this level will not be split; they become leaves of the final tree.
        for node_id in nodes_on_current_level:
            _, y = self._data_per_node[node_id]
            if self.is_regression:
                tree._leaf_values[node_id] = np.mean(y)
            elif self.leaf_prediction_rule == 'majority':
                # scipy.stats.mode returns a scalar for 1-D input on SciPy >= 1.11
                # and a length-1 array on older releases; atleast_1d covers both.
                tree._leaf_values[node_id] = np.atleast_1d(scipy.stats.mode(y).mode)[0]
            elif self.leaf_prediction_rule == 'distribution':
                values, probabilities = list_to_discrete_rv(y)
                distribution = scipy.stats.rv_discrete(values=(values, probabilities))
                func = lambda d: d.rvs()
                tree._leaf_functions[node_id] = (func, distribution)
            else:
                raise ValueError('Invalid value for leaf_prediction_rule: {}'.format(self.leaf_prediction_rule))
        return

    for node_id, node_split in zip(nodes_on_current_level, best_layer_split):
        self.apply_node_split(tree, node_split)

    # NOTE(review): when max_depth is None this guard is false, so the build
    # stops after applying a single level of splits and the new children are
    # not processed by this method -- confirm whether that is intended.
    if self.max_depth is not None and cur_level < self.max_depth:
        self._build_tree_recursive_oblivious(tree, cur_level + 1)
def _build_tree_recursive_cart(self, tree, cur_node, X, y):
    """Recursively grow the subtree rooted at ``cur_node`` in CART mode.

    The node becomes a leaf when it holds at most
    ``self.min_samples_per_leaf`` samples, when its depth reaches
    ``self.max_depth`` (if set), or when ``self.find_best_split`` returns
    ``None``.  Otherwise the node is split and both children are built
    recursively on the corresponding parts of the data.

    Args:
        tree: Tree being built; mutated in place.
        cur_node: Identifier of the node to process.
        X: 2-D feature array of the samples that reached ``cur_node``.
        y: Targets for the rows of ``X``.

    Raises:
        ValueError: If ``self.leaf_prediction_rule`` is neither ``'majority'``
            nor ``'distribution'`` when a classification leaf is created.
    """
    n_samples, _ = X.shape
    if n_samples < 1:
        # Nothing reached this node; leave it untouched.
        return

    leaf_reached = n_samples <= self.min_samples_per_leaf

    depth = tree.depth(cur_node)
    if self.max_depth is not None and depth >= self.max_depth:
        leaf_reached = True

    best_split = None
    if not leaf_reached:
        if TreeBuilderObliviousCart.debug:
            TreeBuilderObliviousCart.logger.debug('Split at node {}, n = {}'.format(cur_node, n_samples))
        best_split = self.find_best_split(X, y)
        if best_split is None:
            if TreeBuilderObliviousCart.debug:
                TreeBuilderObliviousCart.logger.debug('No split found for at node {}'.format(cur_node))
            leaf_reached = True

    tree._leaf_n_samples[cur_node] = len(y)

    if leaf_reached:
        if self.is_regression:
            tree._leaf_values[cur_node] = np.mean(y)
        elif self.leaf_prediction_rule == 'majority':
            # scipy.stats.mode returns a scalar for 1-D input on SciPy >= 1.11
            # and a length-1 array on older releases; atleast_1d covers both.
            tree._leaf_values[cur_node] = np.atleast_1d(scipy.stats.mode(y).mode)[0]
        elif self.leaf_prediction_rule == 'distribution':
            values, probabilities = list_to_discrete_rv(y)
            distribution = scipy.stats.rv_discrete(values=(values, probabilities))
            func = lambda d: d.rvs()
            tree._leaf_functions[cur_node] = (func, distribution)
        else:
            raise ValueError('Invalid value for leaf_prediction_rule: {}'.format(self.leaf_prediction_rule))
        return

    tree.split_node(cur_node, best_split)
    left_child = tree.left_child(cur_node)
    right_child = tree.right_child(cur_node)
    X_left, X_right, y_left, y_right = split_dataset(
        X, y, best_split.feature_id, best_split.value)
    self._build_tree_recursive_cart(tree, left_child, X_left, y_left)
    self._build_tree_recursive_cart(tree, right_child, X_right, y_right)