def _calculate_variance_reduction(self, y, y_1, y_2):
    """Return the reduction in target variance gained by splitting y into y_1 and y_2.

    Each subset's variance is weighted by the fraction of the samples that
    fell into that subset before being subtracted from the total variance.
    """
    # calculate_variance is fed 2-D column arrays, so add a feature axis.
    total_var = calculate_variance(np.expand_dims(y, axis=1))
    left_var = calculate_variance(np.expand_dims(y_1, axis=1))
    right_var = calculate_variance(np.expand_dims(y_2, axis=1))

    # Weights: share of samples that went to each side of the split.
    w_left = len(y_1) / len(y)
    w_right = len(y_2) / len(y)

    return total_var - (w_left * left_var + w_right * right_var)
def _calculate_variance_reduction(self, y, y1, y2):
    """Return a single variance-reduction score for splitting y into y1 and y2.

    The subset variances are weighted by their sample fractions, subtracted
    from the total variance, and the result is summed into one number.
    """
    var_total = calculate_variance(y)
    var_left = calculate_variance(y1)
    var_right = calculate_variance(y2)

    weight_left = len(y1) / len(y)
    weight_right = len(y2) / len(y)

    reduction = var_total - (weight_left * var_left + weight_right * var_right)
    # calculate_variance presumably yields one value per column — collapse
    # to a scalar score (TODO confirm against calculate_variance's contract).
    return sum(reduction)
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the regression tree and return its root node.

    Searches every feature/threshold pair for the split with the largest
    variance reduction in the *targets* y, then recurses on both subsets.
    Growth stops at max_depth, below min_samples_split, or when the best
    reduction does not exceed min_var_red; a leaf then predicts mean(y).

    Parameters:
    -----------
    X: array of shape (n_samples, n_features) - feature matrix
    y: array of shape (n_samples,) - target values
    current_depth: int - depth of this node (root = 0); replaces the
        original's broken self.current_depth mutation, which incremented
        shared state *after* recursing and was never reset
    """
    largest_variance_reduction = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X so both halves of a split stay aligned
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Evaluate every candidate split of every feature
        for feature_i in range(n_features):
            unique_values = np.unique(X[:, feature_i])
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                # A split that leaves one side empty cannot reduce variance
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    y_1 = Xy_1[:, -1]
                    y_2 = Xy_2[:, -1]
                    # Weighted variance reduction of the targets. The
                    # original measured the variance of the feature
                    # matrices (X, X_1, X_2) and skipped the subset
                    # weighting, which is not the CART criterion and
                    # disagrees with _calculate_variance_reduction.
                    var_tot = calculate_variance(np.expand_dims(y, axis=1))
                    var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
                    var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
                    frac_1 = len(y_1) / len(y)
                    frac_2 = len(y_2) / len(y)
                    variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)
                    # Keep the best split seen so far
                    if variance_reduction > largest_variance_reduction:
                        largest_variance_reduction = variance_reduction
                        best_criteria = {
                            "feature_i": feature_i,
                            "threshold": threshold}
                        best_sets = {
                            "left_branch": Xy_1,
                            "right_branch": Xy_2}

    # Grow deeper only while depth allows and the split is worthwhile
    if current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
        # X - all cols. but last, y - last
        leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
        rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
        # Depth is passed down the recursion instead of mutating self
        true_branch = self._build_tree(leftX, leftY, current_depth + 1)
        false_branch = self._build_tree(rightX, rightY, current_depth + 1)
        return RegressionNode(feature_i=best_criteria["feature_i"],
                              threshold=best_criteria["threshold"],
                              true_branch=true_branch,
                              false_branch=false_branch)

    # Set y prediction for this leaf as the mean
    # of the y training data values of this leaf
    return RegressionNode(value=np.mean(y))
def _build_tree(self, X, y, current_depth=0):
    """Recursively grow the regression tree and return the node for (X, y).

    Candidate thresholds are the midpoints between consecutive unique
    values of each feature. The split with the largest sample-weighted
    variance reduction in y is chosen; recursion stops at max_depth, below
    min_samples_split, or when the reduction does not beat min_var_red,
    and the node becomes a leaf predicting mean(y).
    """
    best_reduction = 0
    best_criteria = None    # {"feature_i": ..., "threshold": ...}
    best_sets = None        # {"left_branch": ..., "right_branch": ...}

    # Append y as the last column so split rows stay paired with targets
    Xy = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)
    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        for feature_i in range(n_features):
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)
            # Candidate thresholds: midpoints of consecutive unique values
            split_points = [(lo + hi) / 2
                            for lo, hi in zip(unique_values[:-1], unique_values[1:])]

            for threshold in split_points:
                Xy_left, Xy_right = divide_on_feature(Xy, feature_i, threshold)
                # Skip degenerate splits that leave one side empty
                if not (len(Xy_left) > 0 and len(Xy_right) > 0):
                    continue

                y_left = Xy_left[:, -1]
                y_right = Xy_right[:, -1]

                # Sample-weighted variance reduction of the targets
                var_total = calculate_variance(np.expand_dims(y, axis=1))
                var_left = calculate_variance(np.expand_dims(y_left, axis=1))
                var_right = calculate_variance(np.expand_dims(y_right, axis=1))
                frac_left = len(y_left) / len(y)
                frac_right = len(y_right) / len(y)
                reduction = var_total - (frac_left * var_left + frac_right * var_right)

                # Remember the strongest split found so far
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_criteria = {"feature_i": feature_i, "threshold": threshold}
                    best_sets = {"left_branch": Xy_left, "right_branch": Xy_right}

    # Recurse only while the depth budget remains and the split pays off
    if current_depth < self.max_depth and best_reduction > self.min_var_red:
        left = best_sets["left_branch"]
        right = best_sets["right_branch"]
        # Features are all columns but the last; targets are the last column
        true_branch = self._build_tree(left[:, :-1], left[:, -1], current_depth + 1)
        false_branch = self._build_tree(right[:, :-1], right[:, -1], current_depth + 1)
        return RegressionNode(feature_i=best_criteria["feature_i"],
                              threshold=best_criteria["threshold"],
                              true_branch=true_branch,
                              false_branch=false_branch)

    # Leaf: predict the mean of the targets that reached this node
    return RegressionNode(value=np.mean(y))