Beispiel #1
0
    def _attempt_to_split(self, node, parent, branch_index):
        """ Try to replace a leaf by a split node.

        Nothing is done when the class distribution observed at ``node`` is pure
        or when the node proposes no split candidate. Otherwise:

        1. Sort the split candidates by merit and keep the highest one (``x_best``).
        2. Fetch the "don't split" candidate (``x_null``); a merit of -infinity is
           replaced by 0.
        3. Compute the Hoeffding bound from the criterion's merit range, the split
           confidence and the weight seen by the node.
        4. If ``x_best`` beats ``x_null`` by more than the bound, or the bound is
           below the tie threshold:
            4.1 Build a split node from ``x_best`` and refresh its reevaluation weight.
            4.2 Attach a new learning node to every branch of the split node.
            4.3 Update the tree's node counters.
            4.4 Hook the split node into the tree (as root when ``parent`` is None).
            4.5 Enforce the tracker limit.

        Parameters
        ----------
        node: AnyTimeActiveLearningNode
            The leaf to evaluate.
        parent: AnyTimeSplitNode
            The leaf's parent, or None when the leaf is the root.
        branch_index: int
            Index of the parent's branch that holds ``node``.

        """
        if node.observed_class_distribution_is_pure():
            return

        # Every setting other than GINI_SPLIT ends up with information gain.
        if self._split_criterion == GINI_SPLIT:
            split_criterion = GiniSplitCriterion()
        else:
            split_criterion = InfoGainSplitCriterion()

        candidates = node.get_best_split_suggestions(split_criterion, self)
        if len(candidates) == 0:
            return

        # Ascending sort: the candidate with the highest merit ends up last.
        candidates.sort(key=attrgetter('merit'))
        x_best = candidates[-1]

        x_null = node.get_null_split(split_criterion)
        if x_null.merit == -np.inf:
            # Keep the comparison below meaningful.
            x_null.merit = 0.0

        hoeffding_bound = self.compute_hoeffding_bound(
            split_criterion.get_range_of_merit(
                node.get_observed_class_distribution()),
            self.split_confidence,
            node.get_weight_seen())

        split_is_justified = (x_best.merit - x_null.merit > hoeffding_bound
                              or hoeffding_bound < self.tie_threshold)
        if not split_is_justified:
            return

        new_split = self.new_split_node(
            x_best.split_test,
            node.get_observed_class_distribution(),
            node.get_attribute_observers())
        new_split.update_weight_seen_at_last_split_reevaluation()

        for branch in range(x_best.num_splits()):
            child = self._new_learning_node(
                x_best.resulting_class_distribution_from_split(branch))
            new_split.set_child(branch, child)

        # One active leaf became a decision node; each branch adds a new leaf.
        self._active_leaf_node_cnt -= 1
        self._decision_node_cnt += 1
        self._active_leaf_node_cnt += x_best.num_splits()

        if parent is None:
            # The leaf was the root of the tree.
            self._tree_root = new_split
        else:
            parent.set_child(branch_index, new_split)

        # Manage memory
        self.enforce_tracker_limit()
    def _attempt_to_split(self, node: ActiveLearningNode, parent: SplitNode,
                          parent_idx: int):
        """ Attempt to split a node.

        Nothing is done when the samples seen so far are all from the same class.
        Otherwise:

        1. Find split candidates and sort them by merit.
        2. With fewer than two candidates, a lone candidate is accepted as is.
        3. With two or more candidates, compute the Hoeffding bound and accept the
           best candidate if it beats the second best by more than the bound, or
           if the bound is below the tie threshold.
        4. Optional: disable poor attributes (see below).
        5. If a candidate was accepted:
           5.1 If it carries no split test (the null split won), deactivate the leaf.
           5.2 Otherwise replace the leaf by a split node, add a new leaf on each
               branch and update the tree's node counters.
           5.3 Enforce the tracker limit.

        Poor attribute removal only runs when ``self.remove_poor_atts`` is truthy
        and at least two candidates exist. An attribute is disabled on ``node``
        when some single-attribute candidate on it trails the best candidate by
        more than the Hoeffding bound and no single-attribute candidate on it
        trails by less than the bound. Candidates without a split test are ignored.

        Parameters
        ----------
        node: ActiveLearningNode
            The node to evaluate.
        parent: SplitNode
            The node's parent, or None when the node is the root.
        parent_idx: int
            Parent node's branch index.

        """
        if node.observed_class_distribution_is_pure():
            return

        if self._split_criterion == GINI_SPLIT:
            split_criterion = GiniSplitCriterion()
        elif self._split_criterion == HELLINGER:
            split_criterion = HellingerDistanceCriterion()
        else:
            # INFO_GAIN_SPLIT, and the fallback for any unrecognised setting.
            split_criterion = InfoGainSplitCriterion()

        best_split_suggestions = node.get_best_split_suggestions(
            split_criterion, self)
        # Ascending sort: the best candidate is last, the runner-up second to last.
        best_split_suggestions.sort(key=attrgetter('merit'))

        if len(best_split_suggestions) < 2:
            # No runner-up to compare against: accept the lone candidate, if any.
            should_split = len(best_split_suggestions) > 0
        else:
            hoeffding_bound = self.compute_hoeffding_bound(
                split_criterion.get_range_of_merit(
                    node.get_observed_class_distribution()),
                self.split_confidence, node.get_weight_seen())
            best_suggestion = best_split_suggestions[-1]
            second_best_suggestion = best_split_suggestions[-2]
            should_split = (
                best_suggestion.merit - second_best_suggestion.merit > hoeffding_bound
                or hoeffding_bound < self.tie_threshold)

            if self.remove_poor_atts:
                # (attribute index, merit gap to the best candidate) for every
                # candidate that tests exactly one attribute. The null candidate
                # has no split test and therefore no attribute to judge.
                single_att_gaps = []
                for suggestion in best_split_suggestions:
                    if suggestion.split_test is None:
                        continue
                    split_atts = suggestion.split_test.get_atts_test_depends_on()
                    if len(split_atts) == 1:
                        single_att_gaps.append(
                            (int(split_atts[0]),
                             best_suggestion.merit - suggestion.merit))

                # Scan 1 - add any poor attribute to the set
                poor_atts = {att for att, gap in single_att_gaps
                             if gap > hoeffding_bound}
                # Scan 2 - remove good attributes from the set. discard() is
                # required: most good attributes were never added in scan 1 and
                # set.remove() would raise KeyError for them.
                for att, gap in single_att_gaps:
                    if gap < hoeffding_bound:
                        poor_atts.discard(att)

                for poor_att in poor_atts:
                    node.disable_attribute(poor_att)

        if not should_split:
            return

        split_decision = best_split_suggestions[-1]
        if split_decision.split_test is None:
            # Preprune - null wins
            self._deactivate_learning_node(node, parent, parent_idx)
        else:
            new_split = self.new_split_node(
                split_decision.split_test,
                node.get_observed_class_distribution())

            for i in range(split_decision.num_splits()):
                new_child = self._new_learning_node(
                    split_decision.resulting_class_distribution_from_split(i))
                new_split.set_child(i, new_child)

            # One active leaf became a decision node; each branch adds a new leaf.
            self._active_leaf_node_cnt -= 1
            self._decision_node_cnt += 1
            self._active_leaf_node_cnt += split_decision.num_splits()

            if parent is None:
                # The leaf was the root of the tree.
                self._tree_root = new_split
            else:
                parent.set_child(parent_idx, new_split)

        # Manage memory
        self.enforce_tracker_limit()
Beispiel #3
0
    def _reevaluate_best_split(self, node: AnyTimeSplitNode, parent,
                               branch_index):
        """ Reevaluate the split currently installed at a split node.

        Nothing is done (and False is returned) when the class distribution
        observed at ``node`` is pure or when no split candidate is proposed.
        Otherwise:

        1. Sort the split candidates by merit; the highest one is ``x_best``.
        2. Look up the candidate for the attribute currently tested by the node
           (``x_current``) and the "don't split" candidate (``x_null``, whose
           merit is set to 0 when it is -infinity).
        3. Compute the Hoeffding bound.
        4. If ``x_null`` beats ``x_best`` by more than the bound:
            4.1 Kill the subtree and put the node returned by ``_kill_subtree``
                in its place.
            4.2 Update the tree's node counters and enforce the tracker limit.
        5. Else, if ``x_best`` beats ``x_current`` by more than the bound, or the
           bound is below the tie threshold:
            5.1 When ``x_best`` is on a different attribute, replace the subtree
                by a new split node with fresh leaves, update the counters and
                enforce the tracker limit.
            5.2 When ``x_best`` is on the same attribute, only swap in its split
                test.

        Parameters
        ----------
        node: AnyTimeSplitNode
            The node to reevaluate.
        parent: AnyTimeSplitNode
            The node's parent, or None when the node is the root.
        branch_index: int
            Parent node's branch index.

        Returns
        -------
        boolean
            Flag to stop moving in depth: True when the subtree rooted at ``node``
            was replaced (steps 4 and 5.1), False otherwise.
        """
        if node.observed_class_distribution_is_pure():
            return False

        # Every setting other than GINI_SPLIT ends up with information gain.
        if self._split_criterion == GINI_SPLIT:
            split_criterion = GiniSplitCriterion()
        else:
            split_criterion = InfoGainSplitCriterion()

        candidates = node.get_best_split_suggestions(split_criterion, self)
        if len(candidates) == 0:
            return False

        # Ascending sort: the candidate with the highest merit ends up last.
        candidates.sort(key=attrgetter('merit'))
        x_best = candidates[-1]
        id_best = x_best.split_test.get_atts_test_depends_on()[0]

        # Candidate matching the attribute this split node currently tests.
        id_current = node.get_split_test().get_atts_test_depends_on()[0]
        x_current = node.find_attribute(id_current, candidates)

        x_null = node.get_null_split(split_criterion)
        if x_null.merit == -np.inf:
            # Keep the comparison below meaningful.
            x_null.merit = 0.0

        hoeffding_bound = self.compute_hoeffding_bound(
            split_criterion.get_range_of_merit(
                node.get_observed_class_distribution()),
            self.split_confidence,
            node.get_weight_seen())

        if x_null.merit - x_best.merit > hoeffding_bound:
            # Not splitting wins: the subtree is replaced by whatever
            # _kill_subtree returns for this node.
            replacement = self._kill_subtree(node)

            if parent is None:
                # The node was the root of the tree.
                self._tree_root = replacement
            else:
                parent.set_child(branch_index, replacement)

            # count_nodes() is indexed as [decision nodes, leaves] below.
            removed = node.count_nodes()
            self._active_leaf_node_cnt += 1
            self._active_leaf_node_cnt -= removed[1]
            self._decision_node_cnt -= removed[0]

            # Manage memory
            self.enforce_tracker_limit()
            return True

        # Evaluated only after the null-split check above, as x_current is not
        # needed when the subtree is killed.
        best_beats_current = (x_best.merit - x_current.merit > hoeffding_bound
                              or hoeffding_bound < self.tie_threshold)
        if not best_beats_current:
            return False

        if id_current != id_best:
            # A different attribute wins: rebuild this position from scratch.
            new_split = self.new_split_node(
                x_best.split_test,
                node.get_observed_class_distribution(),
                node.get_attribute_observers())
            new_split.update_weight_seen_at_last_split_reevaluation()

            for branch in range(x_best.num_splits()):
                child = self._new_learning_node(
                    x_best.resulting_class_distribution_from_split(branch))
                new_split.set_child(branch, child)

            # Drop the old subtree from the counters, then add the new split
            # node and its leaves.
            removed = node.count_nodes()
            self._active_leaf_node_cnt -= removed[1]
            self._decision_node_cnt -= removed[0]
            self._decision_node_cnt += 1
            self._active_leaf_node_cnt += x_best.num_splits()

            if parent is None:
                # The node was the root of the tree.
                self._tree_root = new_split
            else:
                parent.set_child(branch_index, new_split)

            # Manage memory
            self.enforce_tracker_limit()
            return True

        if id_current == id_best:
            # Same attribute, possibly a different test: keep the subtree and
            # only swap the split test.
            node._split_test = x_best.split_test

        return False