def _find_best_split(self, feature):
    """Pick the threshold on @feature that best partitions this node's examples.

    Nothing is actually split here: the method only scans the candidate
    thresholds and reports the one with the lowest error.

    Returns:
        A (less_than_threshold, split_err) tuple. The threshold is meant to
        be used like this:

            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...

        Both entries are None when no candidate leaves examples on both
        sides (e.g. there are no examples, or they all share one value).

    Side effects:
        Calls self._sort_by_features(feature), and deletes
        self.examples_sorted_by_feature[feature] before returning.
    """
    self._sort_by_features(feature)
    ordered_examples = self.examples_sorted_by_feature[feature]

    left_stats = SummaryStats()
    right_stats = SummaryStats()
    assert len(self.examples) == len(ordered_examples)

    # Start with every example's output in the right child.
    for row in self.examples:
        right_stats.add(float(row["_OUTPUT"]))

    # Walk the examples in sorted order, shifting them to the left child one
    # at a time. Each distinct feature value is a candidate threshold, scored
    # *before* the example carrying it is moved (it would go right under "<").
    best_threshold = None
    best_err = None
    previous_value = None
    for row in ordered_examples:
        value = row[feature]
        target = float(row["_OUTPUT"])

        # A repeated feature value is not a new candidate; just move it.
        if not (value == previous_value):
            previous_value = value
            n_left = left_stats.count()
            n_right = right_stats.count()

            # Only a split with examples on both sides is a true split.
            if n_left != 0 and n_right != 0:
                # Per-child error is (count - 1) * var(); a child holding a
                # single example contributes 0.
                # NOTE(review): presumably var() is the sample variance,
                # making this the sum of squared deviations -- confirm
                # against SummaryStats.
                left_err = (n_left - 1) * left_stats.var() if n_left > 1 else 0
                right_err = (n_right - 1) * right_stats.var() if n_right > 1 else 0
                err = left_err + right_err
                if best_err is None or err < best_err:
                    best_threshold, best_err = value, err

        # Whatever happened above, this example now belongs to the left child.
        left_stats.add(target)
        right_stats.remove(target)

    # The sorted list is never needed again; drop it to save memory.
    del self.examples_sorted_by_feature[feature]
    return best_threshold, best_err