def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
                                        att_idx, binary_only):
    """ Pick the highest-merit split for this nominal attribute.

    Parameters
    ----------
    criterion: SplitCriterion
        Object whose ``get_merit_of_split(pre, post)`` scores a split.
    pre_split_dist:
        Distribution before splitting; forwarded untouched to the criterion.
    att_idx:
        Identifier of the attribute, stored in the generated split test.
    binary_only: bool
        When false, a multiway split over every observed value is also
        considered.

    Returns
    -------
    AttributeSplitSuggestion or None
        Best candidate found. ``None`` when no attribute value has been
        observed and ``binary_only`` is true.

    """
    # Every distinct attribute value seen under any class, in sorted order.
    observed_values = set()
    for values_for_class in self._att_val_dist_per_class.values():
        observed_values.update(values_for_class)
    ordered_values = sorted(observed_values)

    winner = None
    if not binary_only:
        multiway_dist = self.get_class_dist_from_multiway_split()
        multiway_merit = criterion.get_merit_of_split(pre_split_dist,
                                                      multiway_dist)
        # Branch ids follow the sorted order of the attribute values.
        value_to_branch = dict(
            zip(ordered_values, range(len(ordered_values))))
        winner = AttributeSplitSuggestion(
            NominalAttributeMultiwayTest(att_idx, value_to_branch),
            multiway_dist, multiway_merit)

    for candidate_value in ordered_values:
        binary_dist = self.get_class_dist_from_binary_split(candidate_value)
        binary_merit = criterion.get_merit_of_split(pre_split_dist,
                                                    binary_dist)
        # Strict comparison: on ties the earlier candidate is kept.
        if winner is not None and not binary_merit > winner.merit:
            continue
        winner = AttributeSplitSuggestion(
            NominalAttributeBinaryTest(att_idx, candidate_value),
            binary_dist, binary_merit)
    return winner
Exemple #2
0
    def get_best_evaluated_split_suggestion(self,
                                            criterion,
                                            pre_split_dist,
                                            att_idx,
                                            binary_only=True):
        """ Search the observer's tree for the best binary split.

        The arguments are parked on the instance so that
        ``_find_best_split`` can read them, and are cleared again before
        returning.

        Parameters
        ----------
        criterion: SplitCriterion
            Stored on ``self._criterion`` for the duration of the search.
        pre_split_dist:
            Indexable statistics; entries ``1`` and ``2`` give the shape of
            the auxiliary sum accumulators.
        att_idx:
            Attribute identifier, stored on ``self._att_idx``.
        binary_only: bool
            Not used by this method.

        Returns
        -------
        AttributeSplitSuggestion
            Whatever ``_find_best_split`` returns, seeded with a placeholder
            candidate whose merit is ``-inf``.

        """
        self._criterion, self._pre_split_dist, self._att_idx = (
            criterion, pre_split_dist, att_idx)
        self._aux_sum_weight = 0

        # Handles both single-target and multi-target tasks: a 0-dimensional
        # entry gets scalar accumulators, anything else gets zeroed arrays.
        is_scalar_target = np.ndim(pre_split_dist[1]) == 0
        self._aux_sum = (0.0 if is_scalar_target
                         else np.zeros_like(pre_split_dist[1]))
        self._aux_sum_sq = (0.0 if is_scalar_target
                            else np.zeros_like(pre_split_dist[2]))

        seed = AttributeSplitSuggestion(None, [{}], -float('inf'))
        winner = self._find_best_split(self._root, seed)

        # Reset auxiliary variables
        for scratch_name in ('_criterion', '_pre_split_dist', '_att_idx',
                             '_aux_sum_weight', '_aux_sum', '_aux_sum_sq'):
            setattr(self, scratch_name, None)

        return winner
Exemple #3
0
    def search_for_best_binary_split_option(self, current_node,
                                            current_best_option, criterion,
                                            att_idx):
        """ Evaluate "this value vs. the rest" splits along a node chain.

        Nodes further down the ``_child`` chain are evaluated first, then
        the current node. For each node the one/rest statistics are stored
        on ``self`` and scored with ``criterion``.

        Parameters
        ----------
        current_node:
            Node exposing ``_statistics`` (keys ``0``, ``1``, ``2`` read as
            count, sum and sum of squares), ``_cut_point`` and ``_child``.
        current_best_option: AttributeSplitSuggestion or None
            Best candidate so far.
        criterion: SplitCriterion
            Object whose ``get_merit_of_split(pre, post)`` scores a split.
        att_idx:
            Attribute identifier stored in the generated split test.

        Returns
        -------
        AttributeSplitSuggestion or None
            The best candidate after considering this chain. The input
            candidate is returned unchanged when ``current_node`` is
            ``None`` or ``self._count_rest`` is zero.

        """
        if current_node is None or self._count_rest == 0:
            return current_best_option

        # Follow the same link that is tested; the nodes are chained through
        # ``_child`` (recursing into ``_left`` did not match the check).
        if current_node._child is not None:
            current_best_option = self.search_for_best_binary_split_option(
                current_node._child, current_best_option, criterion, att_idx)

        node_stats = current_node._statistics
        self._sum_one = node_stats.get(1)
        self._sum_rest = self._sum_total - self._sum_one
        self._sum_sq_one = node_stats.get(2)
        self._sum_sq_rest = self._sum_sq_total - self._sum_sq_one
        self._count_one = node_stats.get(0)
        self._count_rest = self._count - self._count_one

        # Distributions must be indexable by 0/1/2; a ``{a, b, c}`` literal
        # builds an unordered set that also collapses equal values.
        one_dist = {0: self._count_one,
                    1: self._sum_one,
                    2: self._sum_sq_one}
        rest_dist = {0: self._count_rest,
                     1: self._sum_rest,
                     2: self._sum_sq_rest}
        post_split_dists = [one_dist, rest_dist]
        pre_split_dist = {0: self._count,
                          1: self._sum_total,
                          2: self._sum_sq_total}

        merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

        if current_best_option is None or merit > current_best_option.merit:
            nom_att_binary_test = NominalAttributeBinaryTest(
                att_idx, current_node._cut_point)
            current_best_option = AttributeSplitSuggestion(
                nom_att_binary_test, post_split_dists, merit)

        return current_best_option
    def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
                                            att_idx, binary_only):
        """ Suggest a binary numeric split scored by Hellinger distance.

        The merit is the Hellinger distance between the estimators stored
        under class keys ``0`` and ``1``; it is ``-inf`` when either one is
        missing or ``None``. The merit does not depend on the candidate
        split point, so the first suggested split value is the one kept.

        Parameters
        ----------
        criterion: SplitCriterion
            Not used; the merit comes from
            ``GaussianHellingerDistanceCriterion.compute_hellinger``.
        pre_split_dist:
            Not used.
        att_idx:
            Attribute identifier stored in the generated split test.
        binary_only: bool
            Not used.

        Returns
        -------
        AttributeSplitSuggestion or None
            ``None`` when there are no suggested split points.

        """
        # Membership and lookup must use the same keys: the lookups use the
        # ints 0 and 1, so the membership test does too (it used to test the
        # strings '0' and '1').
        merit = -np.inf
        dists = self._att_val_dist_per_class
        if 0 in dists and 1 in dists:
            negative = dists[0]
            positive = dists[1]
            if negative is not None and positive is not None:
                n_mean = negative.get_mean()
                n_variance = negative.get_variance()
                p_mean = positive.get_mean()
                p_variance = positive.get_variance()
                merit = GaussianHellingerDistanceCriterion.compute_hellinger(
                    p_mean, p_variance, n_mean, n_variance)

        best_suggestion = None
        for split_value in self.get_split_point_suggestions():
            post_split_dist = self.get_class_dists_from_binary_split(
                split_value)
            if best_suggestion is None or merit > best_suggestion.merit:
                num_att_binary_test = NumericAttributeBinaryTest(
                    att_idx, split_value, True)
                best_suggestion = AttributeSplitSuggestion(
                    num_att_binary_test, post_split_dist, merit)
        return best_suggestion
    def get_best_split_suggestions(self, criterion, ht):
        """ Collect one split candidate per attribute observer.

        Parameters
        ----------
        criterion: SplitCriterion
            The splitting criterion to be used.
        ht: HoeffdingTreeClassifier
            Tree object; its ``no_preprune`` and ``binary_split`` attributes
            are read.

        Returns
        -------
        list
            Split candidates. Unless ``ht.no_preprune`` is set, the first
            entry is the null ("do not split") option.

        """
        class_dist = self._observed_class_distribution
        suggestions = []

        if not ht.no_preprune:
            # Offer "do not split" as a competing option.
            null_merit = criterion.get_merit_of_split(class_dist,
                                                      [class_dist])
            suggestions.append(
                AttributeSplitSuggestion(None, [{}], null_merit))

        for att_idx, observer in self._attribute_observers.items():
            proposal = observer.get_best_evaluated_split_suggestion(
                criterion, class_dist, att_idx, ht.binary_split)
            if proposal is None:
                # Observer had nothing to propose.
                continue
            suggestions.append(proposal)

        return suggestions
Exemple #6
0
 def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
                                         att_idx, binary_only):
     """ Pick the highest-merit split for this nominal attribute.

     Parameters
     ----------
     criterion: SplitCriterion
         Object whose ``get_merit_of_split(pre, post)`` scores a split.
     pre_split_dist:
         Distribution before splitting; forwarded to the criterion.
     att_idx:
         Attribute identifier stored in the generated split test.
     binary_only: bool
         When false, the multiway split is evaluated as well.

     Returns
     -------
     AttributeSplitSuggestion or None
         ``None`` when ``binary_only`` is true and there is nothing to
         iterate over.

     """
     winner = None

     if not binary_only:
         multiway_dist = self.get_class_dist_from_multiway_split()
         multiway_merit = criterion.get_merit_of_split(pre_split_dist,
                                                       multiway_dist)
         winner = AttributeSplitSuggestion(
             NominalAttributeMultiwayTest(att_idx), multiway_dist,
             multiway_merit)

     # NOTE(review): the keys of ``_att_val_dist_per_class`` are used as the
     # values to split on; confirm they are attribute values and not class
     # labels.
     for value_index in self._att_val_dist_per_class.keys():
         binary_dist = self.get_class_dist_from_binary_split(value_index)
         binary_merit = criterion.get_merit_of_split(pre_split_dist,
                                                     binary_dist)
         # Strict comparison: on ties the earlier candidate is kept.
         if winner is not None and not binary_merit > winner.merit:
             continue
         winner = AttributeSplitSuggestion(
             NominalAttributeBinaryTest(att_idx, value_index),
             binary_dist, binary_merit)

     return winner
Exemple #7
0
 def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
     """ Pick the best binary split among the suggested split points.

     Each value from ``get_split_point_suggestions()`` is scored with
     ``criterion.get_merit_of_split``; the first value with the strictly
     highest merit wins. ``binary_only`` is not used.

     Returns
     -------
     AttributeSplitSuggestion or None
         ``None`` when there are no suggested split points.

     """
     winner = None
     for threshold in self.get_split_point_suggestions():
         resulting_dists = self.get_class_dists_from_binary_split(threshold)
         score = criterion.get_merit_of_split(pre_split_dist, resulting_dists)
         if winner is not None and not score > winner.merit:
             continue
         winner = AttributeSplitSuggestion(
             NumericAttributeBinaryTest(att_idx, threshold, True),
             resulting_dists, score)
     return winner
Exemple #8
0
 def get_best_split_suggestions(self, criterion, hot):
     """ Collect split candidates, always starting with the null split.

     Parameters
     ----------
     criterion: SplitCriterion
         Object whose ``get_merit_of_split(pre, post)`` scores a split.
     hot:
         Tree object; only its ``binary_split`` attribute is read.

     Returns
     -------
     list
         The null ("do not split") option followed by every non-``None``
         suggestion returned by the attribute observers.

     """
     class_dist = self._observed_class_distribution
     # The "do not split" option is always offered first.
     null_merit = criterion.get_merit_of_split(class_dist, [class_dist])
     suggestions = [AttributeSplitSuggestion(None, [{}], null_merit)]
     for att_idx, observer in self._attribute_observers.items():
         proposal = observer.get_best_evaluated_split_suggestion(
             criterion, class_dist, att_idx, hot.binary_split)
         if proposal is None:
             continue
         suggestions.append(proposal)
     return suggestions
    def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
                                            att_idx, binary_only):
        """ Pick the best split over the values stored in ``_statistics``.

        Parameters
        ----------
        criterion: SplitCriterion
            Object whose ``get_merit_of_split(pre, post)`` scores a split.
        pre_split_dist:
            Statistics before splitting, indexable by ``0``, ``1`` and ``2``.
        att_idx:
            Attribute identifier stored in the generated split test.
        binary_only: bool
            When false, a multiway split over all values is also evaluated.

        Returns
        -------
        AttributeSplitSuggestion or None
            ``None`` when ``_statistics`` is empty and ``binary_only`` is
            true.

        """
        feature_values = sorted(self._statistics.keys())
        winner = None

        if not binary_only:
            per_value_dists = [self._statistics[v] for v in feature_values]
            multiway_merit = criterion.get_merit_of_split(pre_split_dist,
                                                          per_value_dists)
            # Branch ids follow the sorted order of the feature values.
            value_to_branch = dict(
                zip(feature_values, range(len(feature_values))))
            winner = AttributeSplitSuggestion(
                NominalAttributeMultiwayTest(att_idx, value_to_branch),
                per_value_dists, multiway_merit)

        for feature_value in feature_values:
            own_dist = self._statistics[feature_value]
            # "Everything else" is the pre-split total minus this value.
            others_dist = {
                stat: pre_split_dist[stat] - own_dist[stat]
                for stat in (0, 1, 2)
            }
            binary_dists = [own_dist, others_dist]
            binary_merit = criterion.get_merit_of_split(pre_split_dist,
                                                        binary_dists)

            if winner is not None and not binary_merit > winner.merit:
                continue
            winner = AttributeSplitSuggestion(
                NominalAttributeBinaryTest(att_idx, feature_value),
                binary_dists, binary_merit)

        return winner
    def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
                                            att_idx, binary_only=True):
        """ Search the observer's tree for the best binary split.

        The arguments and zeroed accumulators are stored on the instance for
        ``_find_best_split`` to read; they are left in place afterwards.
        ``binary_only`` is not used.

        Returns
        -------
        AttributeSplitSuggestion
            Whatever ``_find_best_split`` returns, seeded with a placeholder
            candidate whose merit is ``-inf``.

        """
        self._att_idx = att_idx
        self._pre_split_dist = pre_split_dist
        self._criterion = criterion

        # Accumulators start at zero, shaped like entries 1 and 2 of the
        # pre-split statistics.
        self._aux_k = 0
        self._aux_sum = np.zeros_like(pre_split_dist[1])
        self._aux_sq_sum = np.zeros_like(pre_split_dist[2])

        seed = AttributeSplitSuggestion(None, [{}], -float('inf'))
        return self._find_best_split(self._root, seed)
    def search_for_best_split_option(self, current_node, current_best_option,
                                     criterion, att_idx):
        """ In-order walk of the tree, scoring a split at each cut point.

        While a node is being processed its ``_left_statistics`` are moved
        from the running "right" totals to the running "left" totals on
        ``self``; the move is undone before returning.

        Parameters
        ----------
        current_node:
            Node exposing ``_left``, ``_right``, ``_cut_point`` and
            ``_left_statistics`` (keys ``0``, ``1``, ``2``).
        current_best_option: AttributeSplitSuggestion or None
            Best candidate so far.
        criterion: SplitCriterion
            Object whose ``get_merit_of_split(pre, post)`` scores a split.
        att_idx:
            Attribute identifier stored in the generated split test.

        Returns
        -------
        AttributeSplitSuggestion or None
            Best candidate after visiting this subtree. The input candidate
            is returned unchanged when ``current_node`` is ``None`` or
            ``self._count_right_total`` is zero.

        """
        if current_node is None or self._count_right_total == 0:
            return current_best_option

        if current_node._left is not None:
            current_best_option = self.search_for_best_split_option(
                current_node._left, current_best_option, criterion, att_idx)

        node_stats = current_node._left_statistics

        # Move this node's statistics from the right side to the left side.
        self._sum_total_left += node_stats[1]
        self._sum_total_right -= node_stats[1]
        self._sum_sq_total_left += node_stats[2]
        self._sum_sq_total_right -= node_stats[2]
        self._count_right_total -= node_stats[0]
        self._count_left_total += node_stats[0]

        lhs_dist = {
            0: self._count_left_total,
            1: self._sum_total_left,
            2: self._sum_sq_total_left,
        }
        rhs_dist = {
            0: self._count_right_total,
            1: self._sum_total_right,
            2: self._sum_sq_total_right,
        }
        post_split_dists = [lhs_dist, rhs_dist]
        pre_split_dist = [
            self._count_left_total + self._count_right_total,
            self._sum_total_left + self._sum_total_right,
            self._sum_sq_total_left + self._sum_sq_total_right,
        ]

        merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

        is_better = (current_best_option is None
                     or merit > current_best_option.merit)
        if is_better:
            current_best_option = AttributeSplitSuggestion(
                NumericAttributeBinaryTest(att_idx, current_node._cut_point,
                                           True),
                post_split_dists, merit)

        if current_node._right is not None:
            current_best_option = self.search_for_best_split_option(
                current_node._right, current_best_option, criterion, att_idx)

        # Undo the move so the caller sees the totals it started with.
        self._sum_total_left -= node_stats[1]
        self._sum_total_right += node_stats[1]
        self._sum_sq_total_left -= node_stats[2]
        self._sum_sq_total_right += node_stats[2]
        self._count_left_total -= node_stats[0]
        self._count_right_total += node_stats[0]

        return current_best_option
Exemple #12
0
        def get_null_split(self, criterion):
            """ Build the suggestion that represents not splitting at all.

            Parameters
            ----------
            criterion: SplitCriterion
                The splitting criterion to be used.

            Returns
            -------
            AttributeSplitSuggestion
                Suggestion with no split test, a single empty resulting
                distribution, and the merit of keeping the observed class
                distribution as it is.

            """
            observed = self._observed_class_distribution
            # "Splitting" into one branch that holds everything.
            no_split_merit = criterion.get_merit_of_split(observed, [observed])
            return AttributeSplitSuggestion(None, [{}], no_split_merit)
    def _find_best_split(self, node, candidate):
        """ In-order walk of the tree, scoring a binary split at each node.

        Reads ``self._criterion``, ``self._pre_split_dist``,
        ``self._att_idx`` and the ``_aux_*`` accumulators.

        Parameters
        ----------
        node:
            Tree node exposing ``_left``, ``_right``, ``att_val``, ``k``,
            ``sum_target`` and ``sum_sq_target``. Must not be ``None``.
        candidate: AttributeSplitSuggestion
            Best candidate so far; replaced only by a strictly higher merit.

        Returns
        -------
        AttributeSplitSuggestion
            Best candidate after visiting this subtree.

        """
        if node._left is not None:
            candidate = self._find_best_split(node._left, candidate)

        totals = self._pre_split_dist

        # Left branch: this node's statistics plus what was accumulated
        # while descending into right subtrees.
        left_dist = {
            0: node.k + self._aux_k,
            1: node.sum_target + self._aux_sum,
            2: node.sum_sq_target + self._aux_sq_sum,
        }
        # Right branch: the pre-split totals minus the left branch.
        right_dist = {
            stat: totals[stat] - left_dist[stat] for stat in (0, 1, 2)
        }
        post_split_dists = [left_dist, right_dist]

        merit = self._criterion.get_merit_of_split(totals, post_split_dists)
        if merit > candidate.merit:
            split_test = NumericAttributeBinaryTest(self._att_idx,
                                                    node.att_val, True)
            candidate = AttributeSplitSuggestion(split_test,
                                                 post_split_dists, merit)

        if node._right is None:
            return candidate

        # Carry this node's statistics into the right subtree ...
        self._aux_k += node.k
        self._aux_sum += node.sum_target
        self._aux_sq_sum += node.sum_sq_target

        from_right = self._find_best_split(node._right, candidate)
        if from_right.merit > candidate.merit:
            candidate = from_right

        # ... and take them back out once it has been explored.
        self._aux_k -= node.k
        self._aux_sum -= node.sum_target
        self._aux_sq_sum -= node.sum_sq_target

        return candidate
Exemple #14
0
    def search_for_best_multiway_split_option(self, current_node,
                                              current_best_option, criterion,
                                              att_idx):
        """ Score the multiway split over every value in the node chain.

        Starting at ``current_node`` and following ``_child``, the
        statistics of ``self._number_of_possible_values`` nodes are copied
        into one row each of an ``(n, 3)`` array, which is scored against
        the totals held on ``self``.

        Parameters
        ----------
        current_node:
            First node of the chain; each node exposes ``_statistics``
            (keys ``0``, ``1``, ``2``) and ``_child``.
        current_best_option: AttributeSplitSuggestion or None
            Best candidate so far.
        criterion: SplitCriterion
            Object whose ``get_merit_of_split(pre, post)`` scores a split.
        att_idx:
            Attribute identifier stored in the generated split test.

        Returns
        -------
        AttributeSplitSuggestion or None
            The input candidate when ``current_node`` is ``None`` or
            ``self._count_rest`` is zero, or when the multiway split is not
            strictly better; otherwise the multiway suggestion.

        """
        if current_node is None or self._count_rest == 0:
            return current_best_option

        # Allocate only once we know there is something to evaluate.
        post_split_dists = np.zeros([self._number_of_possible_values, 3])
        # NOTE(review): assumes the chain holds at least
        # ``_number_of_possible_values`` nodes.
        for i in range(self._number_of_possible_values):
            post_split_dists[i, 0] = current_node._statistics.get(0)
            post_split_dists[i, 1] = current_node._statistics.get(1)
            post_split_dists[i, 2] = current_node._statistics.get(2)
            current_node = current_node._child

        # The pre-split distribution must be indexable by 0/1/2; a
        # ``{a, b, c}`` literal builds an unordered set instead.
        pre_split_dist = {0: self._count,
                          1: self._sum_total,
                          2: self._sum_sq_total}
        merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
        if current_best_option is None or merit > current_best_option.merit:
            nom_att_multiway_test = NominalAttributeMultiwayTest(att_idx)
            current_best_option = AttributeSplitSuggestion(
                nom_att_multiway_test, post_split_dists, merit)

        return current_best_option
    def search_for_best_split_option(self, current_node, current_best_option,
                                     actual_parent_left, parent_left,
                                     parent_right, left_child, criterion,
                                     pre_split_dist, att_idx):
        """ Pre-order walk of the tree, scoring a split at each cut point.

        For every node the left/right class distributions are derived from
        the distributions computed for its parent and then scored with
        ``criterion``.

        Parameters
        ----------
        current_node:
            Node exposing ``_class_count_left``, ``_class_count_right``,
            ``_cut_point``, ``_left`` and ``_right``; ``None`` ends the
            recursion.
        current_best_option: AttributeSplitSuggestion or None
            Best candidate so far.
        actual_parent_left:
            The parent's own ``_class_count_left``.
        parent_left, parent_right:
            Left/right distributions computed for the parent node;
            ``parent_left is None`` marks the first (root) call.
        left_child: bool
            Whether ``current_node`` is its parent's ``_left`` child.
        criterion: SplitCriterion
            Object whose ``get_merit_of_split(pre, post)`` scores a split.
        pre_split_dist:
            Distribution before splitting; forwarded to the criterion.
        att_idx:
            Attribute identifier stored in the generated split test.

        Returns
        -------
        AttributeSplitSuggestion or None
            Best candidate after visiting this subtree.

        """

        if current_node is None:
            return current_best_option
        left_dist = {}
        right_dist = {}

        # NOTE(review): throughout this method distributions are combined
        # with Counter arithmetic, which keeps only positive counts, and the
        # result is merged back with dict.update, which never removes keys.
        # A key whose count drops to zero or below after a subtraction
        # therefore keeps its previous value -- confirm this is intended.
        if parent_left is None:
            # Root call: start directly from this node's own counts.

            left_dist.update(
                dict(
                    Counter(left_dist) +
                    Counter(current_node._class_count_left)))

            right_dist.update(
                dict(
                    Counter(right_dist) +
                    Counter(current_node._class_count_right)))
        else:
            # Start from the distributions computed for the parent.
            left_dist.update(dict(Counter(left_dist) + Counter(parent_left)))

            right_dist.update(
                dict(Counter(right_dist) + Counter(parent_right)))

            if left_child:
                """get the exact statistics of the parent value"""
                # parent's left counts minus this node's left and right
                # counts
                exact_parent_dist = {}
                exact_parent_dist.update(
                    dict(
                        Counter(exact_parent_dist) +
                        Counter(actual_parent_left)))

                exact_parent_dist.update(
                    dict(
                        Counter(exact_parent_dist) -
                        Counter(current_node._class_count_left)))

                exact_parent_dist.update(
                    dict(
                        Counter(exact_parent_dist) -
                        Counter(current_node._class_count_right)))
                """move the subtrees"""
                # this node's right counts go from the left side to the
                # right side
                left_dist.update(
                    dict(
                        Counter(left_dist) -
                        Counter(current_node._class_count_right)))

                right_dist.update(
                    dict(
                        Counter(right_dist) +
                        Counter(current_node._class_count_right)))
                """move the exact value from the parent"""
                right_dist.update(
                    dict(Counter(right_dist) + Counter(exact_parent_dist)))

                left_dist.update(
                    dict(Counter(left_dist) - Counter(exact_parent_dist)))

            else:
                # Right child: this node's left counts go from the right
                # side to the left side.
                left_dist.update(
                    dict(
                        Counter(left_dist) +
                        Counter(current_node._class_count_left)))

                right_dist.update(
                    dict(
                        Counter(right_dist) -
                        Counter(current_node._class_count_left)))

        post_split_dists = [left_dist, right_dist]
        merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

        # Strict comparison: on ties the earlier candidate is kept.
        if current_best_option is None or merit > current_best_option.merit:
            num_att_binary_test = \
                NumericAttributeBinaryTest(
                    att_idx=att_idx,
                    att_value=current_node._cut_point,
                    equal_passes_test=True
                )

            current_best_option = \
                AttributeSplitSuggestion(
                    split_test=num_att_binary_test,
                    resulting_class_distributions=post_split_dists,
                    merit=merit
                )

        # Descend into the left subtree, handing down this node's
        # distributions as the parent's.
        current_best_option = \
            self.search_for_best_split_option(
                current_node=current_node._left,
                current_best_option=current_best_option,
                actual_parent_left=current_node._class_count_left,
                parent_left=post_split_dists[0],
                parent_right=post_split_dists[1],
                left_child=True,
                criterion=criterion,
                pre_split_dist=pre_split_dist,
                att_idx=att_idx
            )

        # Then the right subtree, with the same parent distributions.
        current_best_option = \
            self.search_for_best_split_option(
                current_node=current_node._right,
                current_best_option=current_best_option,
                actual_parent_left=current_node._class_count_left,
                parent_left=post_split_dists[0],
                parent_right=post_split_dists[1],
                left_child=False,
                criterion=criterion,
                pre_split_dist=pre_split_dist,
                att_idx=att_idx
            )

        return current_best_option