def _kill_subtree(self, node: AnyTimeSplitNode):
    """Replace the subtree rooted at ``node`` with a single new leaf.

    The replacement leaf inherits the observed class distribution and
    the attribute observers accumulated at ``node``, so the statistics
    gathered so far are preserved.

    Parameters
    ----------
    node: AnyTimeSplitNode
        Root of the subtree being discarded.

    Returns
    -------
    AnyTimeActiveLearningNode
        The replacement leaf.
    """
    replacement = self._new_learning_node()
    replacement.set_observed_class_distribution(
        node.get_observed_class_distribution())
    replacement.set_attribute_observers(node.get_attribute_observers())
    return replacement
def _activate_learning_node(self, to_activate: AnyTimeInactiveLearningNode,
                            parent: AnyTimeSplitNode, parent_branch: int):
    """Reactivate an inactive learning node.

    A fresh active leaf is built from the inactive node's observed
    class distribution and spliced into the tree at the same position;
    the active/inactive leaf counters are updated accordingly.

    Parameters
    ----------
    to_activate: AnyTimeInactiveLearningNode
        The node to activate.

    parent: AnyTimeSplitNode
        The node's parent (``None`` when the node is the tree root).

    parent_branch: int
        Parent node's branch index.
    """
    replacement = self._new_learning_node(
        to_activate.get_observed_class_distribution())
    if parent is not None:
        parent.set_child(parent_branch, replacement)
    else:
        # The node being activated is the root itself.
        self._tree_root = replacement
    self._active_leaf_node_cnt += 1
    self._inactive_leaf_node_cnt -= 1
def _reevaluate_best_split(self, node: AnyTimeSplitNode, parent, branch_index):
    """ Reevaluate the best split for a node.

            If the samples seen so far are not from the same class then:

            1. Find split candidates and select the best one.

            2. Compute the Hoeffding bound.

            3. If the don't split candidate is higher than the top split candidate:

                3.1 Kill subtree and replace it with a leaf.

                3.2 Update the tree.

                3.3 Update tree's metrics

            4. If the difference between the top split candidate and the current split is larger than
            the Hoeffding bound:

               4.1 Create a new split node.

               4.2 Update the tree.

               4.3 Update tree's metrics

            5. If the top split candidate is the current split but with different split test:

               5.1 Update the split test of the current split.

            Parameters
            ----------
            node: AnyTimeSplitNode
                The node to reevaluate.
            parent: AnyTimeSplitNode
                The node's parent.
            branch_index: int
                Parent node's branch index.

            Returns
            -------
            boolean
                flag to stop moving in depth.
            """
    stop_flag = False
    # Pure nodes (all samples of one class) have nothing to reevaluate.
    if not node.observed_class_distribution_is_pure():
        # Select the split criterion configured for the tree; unknown
        # values fall back to information gain.
        if self._split_criterion == GINI_SPLIT:
            split_criterion = GiniSplitCriterion()
        elif self._split_criterion == INFO_GAIN_SPLIT:
            split_criterion = InfoGainSplitCriterion()
        else:
            split_criterion = InfoGainSplitCriterion()

        best_split_suggestions = node.get_best_split_suggestions(
            split_criterion, self)
        if len(best_split_suggestions) > 0:
            # Compute Gini (or Information Gain) for each attribute (except the null one)
            best_split_suggestions.sort(key=attrgetter('merit'))
            # x_best is the attribute with the highest G_int
            x_best = best_split_suggestions[-1]
            id_best = x_best.split_test.get_atts_test_depends_on()[0]
            # x_current is the current attribute used in this SplitNode
            id_current = node.get_split_test().get_atts_test_depends_on(
            )[0]
            x_current = node.find_attribute(id_current, best_split_suggestions)
            # Get x_null (the "don't split" candidate)
            x_null = node.get_null_split(split_criterion)
            # Force x_null merit to get 0 instead of -infinity
            if x_null.merit == -np.inf:
                x_null.merit = 0.0
            # Compute Hoeffding bound
            hoeffding_bound = self.compute_hoeffding_bound(
                split_criterion.get_range_of_merit(
                    node.get_observed_class_distribution()),
                self.split_confidence, node.get_weight_seen())

            # Case 3: not splitting at all is confidently better than
            # the best split -> collapse the subtree into a leaf.
            if x_null.merit - x_best.merit > hoeffding_bound:
                # Kill subtree & replace the AnyTimeSplitNode by AnyTimeActiveLearningNode
                best_split = self._kill_subtree(node)
                # update EFDT
                if parent is None:
                    # Root case : replace the root node by a new split node
                    self._tree_root = best_split
                else:
                    parent.set_child(branch_index, best_split)
                # Adjust node counters: +1 for the new leaf, minus the
                # leaves and split nodes of the removed subtree.
                deleted_node_cnt = node.count_nodes()
                self._active_leaf_node_cnt += 1
                self._active_leaf_node_cnt -= deleted_node_cnt[1]
                self._decision_node_cnt -= deleted_node_cnt[0]
                stop_flag = True
                # Manage memory
                self.enforce_tracker_limit()

            # Case 4: a *different* attribute is confidently better (or
            # the bound is below the tie threshold) -> re-split here.
            elif (x_best.merit - x_current.merit > hoeffding_bound or
                  hoeffding_bound < self.tie_threshold) and (
                      id_current != id_best):
                # Create a new branch
                new_split = self.new_split_node(
                    x_best.split_test,
                    node.get_observed_class_distribution(),
                    node.get_attribute_observers())
                # Update weights in new_split
                new_split.update_weight_seen_at_last_split_reevaluation()
                # Update EFDT: one child leaf per outcome of the new test.
                for i in range(x_best.num_splits()):
                    new_child = self._new_learning_node(
                        x_best.resulting_class_distribution_from_split(i))
                    new_split.set_child(i, new_child)
                # Counter bookkeeping: subtract the replaced subtree,
                # add the new split node and its fresh leaves.
                deleted_node_cnt = node.count_nodes()
                self._active_leaf_node_cnt -= deleted_node_cnt[1]
                self._decision_node_cnt -= deleted_node_cnt[0]
                self._decision_node_cnt += 1
                self._active_leaf_node_cnt += x_best.num_splits()
                if parent is None:
                    # Root case : replace the root node by a new split node
                    self._tree_root = new_split
                else:
                    parent.set_child(branch_index, new_split)
                stop_flag = True
                # Manage memory
                self.enforce_tracker_limit()

            # Case 5: same attribute wins but with a different split
            # test -> just swap the test in place.
            elif (x_best.merit - x_current.merit > hoeffding_bound or
                  hoeffding_bound < self.tie_threshold) and (
                      id_current == id_best):
                node._split_test = x_best.split_test

    return stop_flag
def new_split_node(self, split_test, class_observations, attribute_observers):
    """Factory for split nodes.

    All arguments are forwarded unchanged to the
    ``AnyTimeSplitNode`` constructor; the freshly built node is
    returned.
    """
    split_node = AnyTimeSplitNode(split_test, class_observations,
                                  attribute_observers)
    return split_node