def _build_tree(self, X, y, current_depth=0):
    largest_impurity = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    expand_needed = len(np.shape(y)) == 1
    if expand_needed:
        y = np.expand_dims(y, axis=1)

    # Add y as last column of X
    X_y = np.concatenate((X, y), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
        # Calculate the impurity for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the impurity
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    y_1 = Xy_1[:, n_features:]
                    y_2 = Xy_2[:, n_features:]

                    # Calculate impurity
                    impurity = self._impurity_calculation(y, y_1, y_2)

                    # If this threshold resulted in a higher information gain than
                    # previously recorded, save the threshold value and the feature index
                    if impurity > largest_impurity:
                        largest_impurity = impurity
                        best_criteria = {"feature_i": feature_i, "threshold": threshold}
                        best_sets = {"left_branch": Xy_1, "right_branch": Xy_2}

    if largest_impurity > self.min_impurity:
        # X - all cols. but last, y - last
        leftX = best_sets["left_branch"][:, :n_features]
        leftY = best_sets["left_branch"][:, n_features:]
        rightX = best_sets["right_branch"][:, :n_features]
        rightY = best_sets["right_branch"][:, n_features:]
        true_branch = self._build_tree(leftX, leftY, current_depth + 1)
        false_branch = self._build_tree(rightX, rightY, current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # We're at leaf => determine value
    leaf_value = self._leaf_value_calculation(y)
    return DecisionNode(value=leaf_value)
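# Every variant in this section calls a divide_on_feature helper that is not
# shown here. The sketch below is a minimal reconstruction inferred from the
# call sites (two subsets unpacked from the return value, len() checks on
# each); the >= / == split convention for numeric vs. categorical thresholds
# is an assumption, not code from the source. All snippets assume numpy is
# imported as np.
import numpy as np

def divide_on_feature(X, feature_i, threshold):
    """Divide the rows of X into two subsets by comparing column feature_i
    against threshold."""
    if isinstance(threshold, (int, float, np.number)):
        # Numeric feature: left subset holds samples meeting the threshold
        split_func = lambda sample: sample[feature_i] >= threshold
    else:
        # Categorical feature: split on equality
        split_func = lambda sample: sample[feature_i] == threshold

    X_1 = np.array([sample for sample in X if split_func(sample)])
    X_2 = np.array([sample for sample in X if not split_func(sample)])
    return X_1, X_2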
def _build_tree(self, X, y, current_depth=0):
    largest_impurity = 0
    best_criteria = None
    best_sets = None

    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)

    # Add y as last column of X
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    if current_depth <= self.max_depth and n_samples >= self.min_samples_split:
        # Calculate the impurity for each feature
        for feature_i in range(n_features):
            # Get all unique values of the current feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            for threshold in unique_values:
                # Divide X and y depending on whether the feature value of X
                # at index feature_i meets the threshold
                Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                # Confirm that the divide produced two non-empty subsets;
                # if there is no subtree here, we just skip this threshold
                if len(Xy1) > 0 and len(Xy2) > 0:
                    y1 = Xy1[:, n_features:]
                    y2 = Xy2[:, n_features:]

                    # Calculate the current impurity
                    impurity = self._impurity_calculation(y, y1, y2)

                    if impurity > largest_impurity:
                        largest_impurity = impurity
                        best_criteria = {'feature_i': feature_i, 'threshold': threshold}
                        best_sets = {
                            'leftX': Xy1[:, :n_features],
                            'lefty': Xy1[:, n_features:],
                            'rightX': Xy2[:, :n_features],
                            'righty': Xy2[:, n_features:]
                        }

    # We don't require the impurity improvement to drop to exactly zero before
    # stopping; instead we stop once it falls below the min_impurity threshold
    if largest_impurity > self.min_impurity:
        # Build subtrees for the left and right branches
        # (true_branch is the 'left' branch)
        true_branch = self._build_tree(best_sets['leftX'], best_sets['lefty'], current_depth + 1)
        false_branch = self._build_tree(best_sets['rightX'], best_sets['righty'], current_depth + 1)
        return DecisionNode(feature_i=best_criteria['feature_i'],
                            threshold=best_criteria['threshold'],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # We are at a leaf node, so we determine the value
    leaf_value = self._leaf_value_calculation(y)
    return DecisionNode(value=leaf_value)
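# The classification variants return DecisionNode objects, whose class is not
# shown in this section. A minimal sketch, assuming it is a plain container for
# the split criteria and subtrees; the field names come from the keyword
# arguments used in the calls (both `value` and `label` appear, depending on
# the variant), everything else is an assumption.
class DecisionNode:
    def __init__(self, feature_i=None, threshold=None, value=None, label=None,
                 true_branch=None, false_branch=None):
        self.feature_i = feature_i          # Index of the feature tested at this node
        self.threshold = threshold          # Threshold the feature value is compared against
        self.value = value                  # Prediction stored at a leaf
        self.label = label                  # Class label stored at a leaf (older variants)
        self.true_branch = true_branch      # Subtree for samples meeting the threshold
        self.false_branch = false_branch    # Subtree for the remaining samples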
def _build_tree(self, X, y):
    largest_variance_reduction = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the variance reduction for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the variance reduction
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                # If one subset is empty there is no use in calculating
                # the variance reduction
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    X_1 = Xy_1[:, :-1]
                    X_2 = Xy_2[:, :-1]

                    # Calculate the variance of the data set and of the
                    # two split sets
                    var_tot = calculate_variance(X)
                    var_1 = calculate_variance(X_1)
                    var_2 = calculate_variance(X_2)

                    # Calculate the variance reduction
                    variance_reduction = var_tot - (var_1 + var_2)

                    # If this threshold resulted in a larger variance reduction than
                    # previously recorded, save the threshold value and the feature index
                    if variance_reduction > largest_variance_reduction:
                        largest_variance_reduction = variance_reduction
                        best_criteria = {"feature_i": feature_i, "threshold": threshold}
                        best_sets = {"left_branch": Xy_1, "right_branch": Xy_2}

    # If we have any variance reduction to go by we build the tree deeper
    if self.current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
        # X - all cols. but last, y - last
        leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
        rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
        # Track the depth on the instance before recursing so the max_depth
        # check applies to the subtrees (note: this counter is shared across
        # the whole build and is never decremented)
        self.current_depth += 1
        true_branch = self._build_tree(leftX, leftY)
        false_branch = self._build_tree(rightX, rightY)
        return RegressionNode(feature_i=best_criteria["feature_i"],
                              threshold=best_criteria["threshold"],
                              true_branch=true_branch,
                              false_branch=false_branch)

    # Set y prediction for this leaf as the mean
    # of the y training data values of this leaf
    return RegressionNode(value=np.mean(y))
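# The regression variants rely on calculate_variance and RegressionNode, which
# are not shown. A minimal sketch under the assumption that the helper returns
# a scalar variance for the array it is given and that RegressionNode mirrors
# DecisionNode; both reconstructions are assumptions inferred from the calls above.
def calculate_variance(X):
    """Mean variance of the columns of X around their column means."""
    return np.mean(np.var(X, axis=0))

class RegressionNode(DecisionNode):
    """Assumed identical to DecisionNode apart from the name."""
    pass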
def _build_tree(self, X, y):
    # Calculate the entropy of the label values
    entropy = calculate_entropy(y)

    highest_info_gain = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the information gain for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the information gain
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                # If one subset is empty there is no use in calculating
                # the information gain
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    # Calculate information gain
                    p = len(Xy_1) / n_samples
                    y1 = Xy_1[:, -1]
                    y2 = Xy_2[:, -1]
                    info_gain = entropy - p * calculate_entropy(y1) - \
                        (1 - p) * calculate_entropy(y2)

                    # If this threshold resulted in a higher information gain than
                    # previously recorded, save the threshold value and the feature index
                    if info_gain > highest_info_gain:
                        highest_info_gain = info_gain
                        best_criteria = {"feature_i": feature_i, "threshold": threshold}
                        best_sets = {"left_branch": Xy_1, "right_branch": Xy_2}

    # If we have any information gain to go by we build the tree deeper
    if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
        # X - all cols. but last, y - last
        leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
        rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
        # Track the depth on the instance before recursing so the max_depth
        # check applies to the subtrees
        self.current_depth += 1
        true_branch = self._build_tree(leftX, leftY)
        false_branch = self._build_tree(rightX, rightY)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # There's no recorded information gain so we are at a leaf;
    # label it with the most common class in y
    most_common = None
    max_count = 0
    for label in np.unique(y):
        count = len(y[y == label])
        if count > max_count:
            most_common = label
            max_count = count
    return DecisionNode(label=most_common)
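# The entropy-based variants use a calculate_entropy helper, also not shown.
# A minimal sketch computing the Shannon entropy of a label array; the base-2
# logarithm is an assumption, while the formula itself follows from the
# information gain computed above.
def calculate_entropy(y):
    """Shannon entropy of the class labels in y."""
    entropy = 0
    for label in np.unique(y):
        p = len(y[y == label]) / len(y)    # Proportion of samples with this label
        entropy += -p * np.log2(p)
    return entropy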
def _build_tree(self, X, y):
    # Calculate the entropy of the label values
    entropy = calculate_entropy(y)

    # Save the best information gain
    highest_info_gain = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the information gain for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the information gain
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                # Sanity check: the two subsets should together contain
                # every sample of the original set
                assert np.shape(X_y)[0] == np.shape(Xy_1)[0] + np.shape(Xy_2)[0], "Aj"

                # If one subset is empty there is no use in calculating
                # the information gain
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    # Calculate information gain
                    p = len(Xy_1) / n_samples
                    y1 = Xy_1[:, -1]
                    y2 = Xy_2[:, -1]
                    info_gain = entropy - p * calculate_entropy(y1) - \
                        (1 - p) * calculate_entropy(y2)

                    # If this threshold resulted in a higher information gain than
                    # previously recorded, save the threshold value and the feature index
                    if info_gain > highest_info_gain:
                        highest_info_gain = info_gain
                        best_criteria = {"feature_i": feature_i, "threshold": threshold}
                        # Keep the subsets in a plain list (a ragged np.array
                        # would fail on modern NumPy)
                        best_sets = [Xy_1, Xy_2]

    # If we have any information gain to go by we build the tree deeper
    if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
        X_1, y_1 = best_sets[0][:, :-1], best_sets[0][:, -1]
        X_2, y_2 = best_sets[1][:, :-1], best_sets[1][:, -1]
        # Track the depth on the instance before recursing so the max_depth
        # check applies to the subtrees
        self.current_depth += 1
        true_branch = self._build_tree(X_1, y_1)
        false_branch = self._build_tree(X_2, y_2)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # There's no recorded information gain so we are at a leaf;
    # label it with the most common class in y
    most_common = None
    max_count = 0
    for label in np.unique(y):
        count = len(y[y == label])
        if count > max_count:
            most_common = label
            max_count = count
    return DecisionNode(label=most_common)
def _build_tree(self, X, y, current_depth=0):
    largest_variance_reduction = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the variance reduction for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Find points to split at as the mean of every following
            # pair of points
            x = unique_values
            split_points = [(x[i - 1] + x[i]) / 2 for i in range(1, len(x))]

            # Iterate through all split points of feature column i and
            # calculate the variance reduction
            for threshold in split_points:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    y_1 = Xy_1[:, -1]
                    y_2 = Xy_2[:, -1]

                    var_tot = calculate_variance(np.expand_dims(y, axis=1))
                    var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
                    var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
                    frac_1 = len(y_1) / len(y)
                    frac_2 = len(y_2) / len(y)

                    # Calculate the variance reduction
                    variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

                    # If this threshold resulted in a larger variance reduction than
                    # previously registered we save the feature index and threshold
                    # and the two sets
                    if variance_reduction > largest_variance_reduction:
                        largest_variance_reduction = variance_reduction
                        best_criteria = {"feature_i": feature_i, "threshold": threshold}
                        best_sets = {"left_branch": Xy_1, "right_branch": Xy_2}

    # If we have any variance reduction to go by we build the tree deeper
    if current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
        # X - all cols. but last, y - last
        leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
        rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
        true_branch = self._build_tree(leftX, leftY, current_depth + 1)
        false_branch = self._build_tree(rightX, rightY, current_depth + 1)
        return RegressionNode(feature_i=best_criteria["feature_i"],
                              threshold=best_criteria["threshold"],
                              true_branch=true_branch,
                              false_branch=false_branch)

    # Set y prediction for this leaf as the mean
    # of the y training data values of this leaf
    return RegressionNode(value=np.mean(y))
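# Once built, the trees are used by walking from the root and comparing a
# sample's feature value against each node's threshold until a leaf is reached.
# A minimal traversal sketch; predict_value is a hypothetical name and the
# >= / == convention mirrors the divide_on_feature sketch above.
def predict_value(x, node):
    # Leaf node: return the stored prediction (value or class label,
    # depending on the variant that built the tree)
    if node.true_branch is None and node.false_branch is None:
        return node.value if node.value is not None else node.label
    # Internal node: follow the branch the sample's feature value selects
    feature_value = x[node.feature_i]
    if isinstance(feature_value, (int, float, np.number)):
        branch = node.true_branch if feature_value >= node.threshold else node.false_branch
    else:
        branch = node.true_branch if feature_value == node.threshold else node.false_branch
    return predict_value(x, branch)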