def _attempt_to_split(self, node, parent, branch_index):
    """ Attempt to split a node.

    If the samples seen so far are not all from the same class:

    1. Find split candidates and pick the one with the highest merit.
    2. Compute the Hoeffding bound.
    3. If the best candidate beats the "don't split" candidate by more
       than the Hoeffding bound (or the bound is below the tie
       threshold):
       3.1 Replace the leaf node by a split node.
       3.2 Add a new leaf node on each branch of the new split node.
       3.3 Update the tree's node counters.

    Parameters
    ----------
    node: AnyTimeActiveLearningNode
        The node to reevaluate.
    parent: AnyTimeSplitNode
        The node's parent.
    branch_index: int
        Parent node's branch index.
    """
    # A pure node (single observed class) cannot benefit from a split.
    if node.observed_class_distribution_is_pure():
        return

    if self._split_criterion == GINI_SPLIT:
        criterion = GiniSplitCriterion()
    else:
        # INFO_GAIN_SPLIT and any unrecognized setting both use info gain.
        criterion = InfoGainSplitCriterion()

    suggestions = node.get_best_split_suggestions(criterion, self)
    if not suggestions:
        return

    # Stable sort + [-1] keeps the original tie-breaking: the last of
    # equally-merited candidates wins.
    suggestions.sort(key=attrgetter('merit'))
    x_best = suggestions[-1]

    # The "don't split" candidate; clamp -inf merit to 0 so the
    # comparison below is well defined.
    x_null = node.get_null_split(criterion)
    if x_null.merit == -np.inf:
        x_null.merit = 0.0

    bound = self.compute_hoeffding_bound(
        criterion.get_range_of_merit(
            node.get_observed_class_distribution()),
        self.split_confidence,
        node.get_weight_seen())

    if not (x_best.merit - x_null.merit > bound
            or bound < self.tie_threshold):
        return

    # Split: replace the leaf by a split node with one fresh leaf per
    # outgoing branch.
    new_split = self.new_split_node(
        x_best.split_test,
        node.get_observed_class_distribution(),
        node.get_attribute_observers())
    # Record the weight at which this split was (re)evaluated.
    new_split.update_weight_seen_at_last_split_reevaluation()

    for child_idx in range(x_best.num_splits()):
        leaf = self._new_learning_node(
            x_best.resulting_class_distribution_from_split(child_idx))
        new_split.set_child(child_idx, leaf)

    # Counter bookkeeping: one active leaf became a decision node that
    # carries num_splits() new active leaves.
    self._active_leaf_node_cnt += x_best.num_splits() - 1
    self._decision_node_cnt += 1

    if parent is None:
        # Root case: replace the root node by the new split node.
        self._tree_root = new_split
    else:
        parent.set_child(branch_index, new_split)

    # Manage memory
    self.enforce_tracker_limit()
def _attempt_to_split(self, node: ActiveLearningNode, parent: SplitNode,
                      parent_idx: int):
    """ Attempt to split a node.

    If the samples seen so far are not all from the same class:

    1. Find split candidates and select the top 2.
    2. Compute the Hoeffding bound.
    3. If the difference between the top 2 split candidates is larger
       than the Hoeffding bound (or the bound is below the tie
       threshold):
       3.1 Replace the leaf node by a split node.
       3.2 Add a new leaf node on each branch of the new split node.
       3.3 Update the tree's node counters.

    Optional: Disable poor attributes. Depends on the tree's
    configuration (``remove_poor_atts``).

    Parameters
    ----------
    node: ActiveLearningNode
        The node to evaluate.
    parent: SplitNode
        The node's parent.
    parent_idx: int
        Parent node's branch index.
    """
    # Nothing to do on a pure node: all samples share one class.
    if node.observed_class_distribution_is_pure():
        return

    if self._split_criterion == GINI_SPLIT:
        split_criterion = GiniSplitCriterion()
    elif self._split_criterion == INFO_GAIN_SPLIT:
        split_criterion = InfoGainSplitCriterion()
    elif self._split_criterion == HELLINGER:
        split_criterion = HellingerDistanceCriterion()
    else:
        # Unknown criterion name: fall back to information gain.
        split_criterion = InfoGainSplitCriterion()

    best_split_suggestions = node.get_best_split_suggestions(
        split_criterion, self)
    best_split_suggestions.sort(key=attrgetter('merit'))

    should_split = False
    if len(best_split_suggestions) < 2:
        # No runner-up to compare against: split iff there is at least
        # one candidate at all.
        should_split = len(best_split_suggestions) > 0
    else:
        hoeffding_bound = self.compute_hoeffding_bound(
            split_criterion.get_range_of_merit(
                node.get_observed_class_distribution()),
            self.split_confidence, node.get_weight_seen())
        best_suggestion = best_split_suggestions[-1]
        second_best_suggestion = best_split_suggestions[-2]
        if (best_suggestion.merit - second_best_suggestion.merit
                > hoeffding_bound
                or hoeffding_bound < self.tie_threshold):
            should_split = True
        if self.remove_poor_atts is not None and self.remove_poor_atts:
            poor_atts = set()
            # Scan 1 - single-attribute candidates whose merit trails
            # the best by more than the bound are provisionally poor.
            for suggestion in best_split_suggestions:
                if suggestion is not None:
                    split_atts = \
                        suggestion.split_test.get_atts_test_depends_on()
                    if len(split_atts) == 1:
                        if (best_suggestion.merit - suggestion.merit
                                > hoeffding_bound):
                            poor_atts.add(int(split_atts[0]))
            # Scan 2 - attributes whose merit gap is within the bound
            # are kept. BUGFIX: use discard(), not remove() — an
            # attribute with a gap below the bound was never added in
            # Scan 1, so remove() would raise KeyError here.
            for suggestion in best_split_suggestions:
                if suggestion is not None:
                    split_atts = \
                        suggestion.split_test.get_atts_test_depends_on()
                    if len(split_atts) == 1:
                        if (best_suggestion.merit - suggestion.merit
                                < hoeffding_bound):
                            poor_atts.discard(int(split_atts[0]))
            for poor_att in poor_atts:
                node.disable_attribute(poor_att)

    if should_split:
        split_decision = best_split_suggestions[-1]
        if split_decision.split_test is None:
            # Preprune - null wins
            self._deactivate_learning_node(node, parent, parent_idx)
        else:
            new_split = self.new_split_node(
                split_decision.split_test,
                node.get_observed_class_distribution())
            for i in range(split_decision.num_splits()):
                new_child = self._new_learning_node(
                    split_decision.
                    resulting_class_distribution_from_split(i))
                new_split.set_child(i, new_child)
            # One active leaf becomes a decision node carrying
            # num_splits() new active leaves.
            self._active_leaf_node_cnt -= 1
            self._decision_node_cnt += 1
            self._active_leaf_node_cnt += split_decision.num_splits()
            if parent is None:
                self._tree_root = new_split
            else:
                parent.set_child(parent_idx, new_split)
        # Manage memory
        self.enforce_tracker_limit()
def _reevaluate_best_split(self, node: AnyTimeSplitNode, parent,
                           branch_index):
    """ Reevaluate the best split for an internal node.

    If the samples seen so far are not all from the same class:

    1. Find split candidates and select the best one.
    2. Compute the Hoeffding bound.
    3. If the "don't split" candidate beats the top split candidate by
       more than the bound:
       3.1 Kill the subtree and replace it with a leaf.
       3.2 Update the tree and its metrics.
    4. Else, if the top candidate beats the current split by more than
       the bound (or the bound is below the tie threshold) and tests a
       different attribute:
       4.1 Create a new split node and replace the subtree.
       4.2 Update the tree and its metrics.
    5. Else, if the top candidate is on the current split attribute but
       with a different split test:
       5.1 Update the split test of the current split node.

    Parameters
    ----------
    node: AnyTimeSplitNode
        The node to reevaluate.
    parent: AnyTimeSplitNode
        The node's parent.
    branch_index: int
        Parent node's branch index.

    Returns
    -------
    boolean
        Flag to stop moving in depth.
    """
    # A pure node needs no re-evaluation; keep descending.
    if node.observed_class_distribution_is_pure():
        return False

    if self._split_criterion == GINI_SPLIT:
        criterion = GiniSplitCriterion()
    else:
        # INFO_GAIN_SPLIT and any unrecognized setting use info gain.
        criterion = InfoGainSplitCriterion()

    suggestions = node.get_best_split_suggestions(criterion, self)
    if not suggestions:
        return False

    # Rank candidates by merit (Gini or information gain); the stable
    # sort preserves the original tie-breaking of sort + [-1].
    suggestions.sort(key=attrgetter('merit'))
    x_best = suggestions[-1]
    id_best = x_best.split_test.get_atts_test_depends_on()[0]

    # x_current: the candidate for the attribute this split node
    # currently tests.
    # NOTE(review): assumes that attribute is always among the
    # suggestions — find_attribute returning None would fail below;
    # confirm against AnyTimeSplitNode.
    id_current = node.get_split_test().get_atts_test_depends_on()[0]
    x_current = node.find_attribute(id_current, suggestions)

    # The "don't split" candidate; clamp -inf merit to 0.
    x_null = node.get_null_split(criterion)
    if x_null.merit == -np.inf:
        x_null.merit = 0.0

    # Compute Hoeffding bound
    bound = self.compute_hoeffding_bound(
        criterion.get_range_of_merit(
            node.get_observed_class_distribution()),
        self.split_confidence,
        node.get_weight_seen())

    stop_flag = False
    if x_null.merit - x_best.merit > bound:
        # Not splitting now beats every split: kill the subtree and
        # replace this AnyTimeSplitNode by a learning node.
        replacement = self._kill_subtree(node)
        if parent is None:
            # Root case: replace the root node.
            self._tree_root = replacement
        else:
            parent.set_child(branch_index, replacement)
        # Counters: the whole subtree is gone, one leaf took its place.
        removed = node.count_nodes()
        self._active_leaf_node_cnt += 1 - removed[1]
        self._decision_node_cnt -= removed[0]
        stop_flag = True
        # Manage memory
        self.enforce_tracker_limit()
    else:
        challenger_wins = (x_best.merit - x_current.merit > bound
                           or bound < self.tie_threshold)
        if challenger_wins and id_current != id_best:
            # A different attribute now wins: grow a fresh split node.
            new_split = self.new_split_node(
                x_best.split_test,
                node.get_observed_class_distribution(),
                node.get_attribute_observers())
            new_split.update_weight_seen_at_last_split_reevaluation()
            for child_idx in range(x_best.num_splits()):
                leaf = self._new_learning_node(
                    x_best.resulting_class_distribution_from_split(
                        child_idx))
                new_split.set_child(child_idx, leaf)
            # Counters: drop the old subtree, add one decision node and
            # num_splits() new leaves.
            removed = node.count_nodes()
            self._active_leaf_node_cnt -= removed[1]
            self._decision_node_cnt += 1 - removed[0]
            self._active_leaf_node_cnt += x_best.num_splits()
            if parent is None:
                # Root case: replace the root node.
                self._tree_root = new_split
            else:
                parent.set_child(branch_index, new_split)
            stop_flag = True
            # Manage memory
            self.enforce_tracker_limit()
        elif challenger_wins and id_current == id_best:
            # Same attribute, better test: just swap the split test.
            node._split_test = x_best.split_test
    return stop_flag