Beispiel #1
0
    def _find_best_split(self, feature):
        """Pick the threshold on @feature that yields the lowest split error.

        Nothing is actually split here; the method only scans the candidate
        thresholds and reports the best one.

        Returns a (less_than_threshold, split_err) tuple, meant to be used as:
            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...
        Both entries are None when no candidate leaves examples on both sides
        (for instance, when every example has the same value for @feature).

        Side effect: the entry for @feature in self.examples_sorted_by_feature
        is deleted before returning.
        """
        self._sort_by_features(feature)
        below_stats = SummaryStats()
        above_stats = SummaryStats()
        assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

        # Start with every example on the right-hand ("not below") side.
        for row in self.examples:
            above_stats.add(float(row["_OUTPUT"]))

        # Sweep through the examples in sorted order (presumably ascending by
        # feature value -- see _sort_by_features), shifting them one at a time
        # to the left-hand side. Each new distinct feature value is a candidate
        # threshold; it is scored *before* its own example is shifted, so the
        # left side only holds examples that came earlier in the ordering.
        winning_threshold = None
        winning_err = None
        previous_value = None
        for row in self.examples_sorted_by_feature[feature]:
            value = row[feature]
            target = float(row["_OUTPUT"])

            # A repeated feature value cannot be a new threshold, so only the
            # first example of each run of equal values gets scored.
            same_as_previous = value == previous_value
            if not same_as_previous:
                previous_value = value

                n_below = below_stats.count()
                n_above = above_stats.count()

                # A candidate with an empty side is not a real split.
                if n_below != 0 and n_above != 0:
                    # Score: sum over both sides of (count - 1) * variance;
                    # a side with at most one example contributes nothing.
                    below_err = (
                        0 if n_below <= 1
                        else (n_below - 1) * below_stats.var()
                    )
                    above_err = (
                        0 if n_above <= 1
                        else (n_above - 1) * above_stats.var()
                    )
                    candidate_err = below_err + above_err

                    if winning_err is None or candidate_err < winning_err:
                        winning_threshold = value
                        winning_err = candidate_err

            # Whatever happened above, this example now moves to the left.
            below_stats.add(target)
            above_stats.remove(target)

        # The sorted list is not needed after this point; drop it to save memory.
        del self.examples_sorted_by_feature[feature]
        return winning_threshold, winning_err