def _find_best_split(self, X, target, n_features):
    """Find the best feature and value for a split. Greedy algorithm."""
    # Sample a random subset of features
    subset = random.sample(range(X.shape[1]), n_features)
    max_gain, max_col, max_val = None, None, None

    for column in subset:
        split_values = self._find_splits(X[:, column])
        for value in split_values:
            if self.loss is None:
                # Random forest
                splits = split(X[:, column], target["y"], value)
                gain = self.criterion(target["y"], splits)
            else:
                # Gradient boosting
                left, right = split_dataset(X, target, column, value, return_X=False)
                gain = xgb_criterion(target, left, right, self.loss)

            if (max_gain is None) or (gain > max_gain):
                max_col, max_val, max_gain = column, value, gain
    return max_col, max_val, max_gain
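# `_find_splits` is used above but not shown in this section; `random`,
# `split`, `split_dataset`, and `xgb_criterion` are likewise assumed to be
# imported at module level. A minimal sketch of `_find_splits`, assuming
# candidate thresholds are the midpoints between consecutive unique values of
# the feature column (a common choice; the real helper may differ) and
# `numpy` imported as `np`:
def _find_splits(self, X):
    """Find all candidate split values for a single feature column."""
    split_values = set()

    # np.unique returns the values in sorted order
    x_unique = list(np.unique(X))
    for i in range(1, len(x_unique)):
        # Midpoint between two neighboring values
        average = (x_unique[i - 1] + x_unique[i]) / 2.0
        split_values.add(average)
    return list(split_values)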
def train(self, X, target, max_features=None, min_samples_split=10,
          max_depth=None, minimum_gain=0.01, loss=None):
    """Build a decision tree from a training set.

    Parameters
    ----------
    X : array-like
        Feature dataset.
    target : dictionary or array-like
        Target values.
    max_features : int or None
        The number of features to consider when looking for the best split.
        None means use all features.
    min_samples_split : int
        The minimum number of samples required to split an internal node.
    max_depth : int or None
        Maximum depth of the tree. None means no depth limit.
    minimum_gain : float, default 0.01
        Minimum gain required for splitting.
    loss : function, default None
        Loss function for gradient boosting.
    """
    if not isinstance(target, dict):
        target = {"y": target}

    # Loss for gradient boosting
    if loss is not None:
        self.loss = loss

    # Treat max_depth=None as "no limit"; otherwise the depth check below
    # would raise a TypeError instead of stopping the recursion.
    if max_depth is None:
        max_depth = float("inf")

    try:
        # Exit from recursion using assert syntax
        assert X.shape[0] > min_samples_split
        assert max_depth > 0

        if max_features is None:
            max_features = X.shape[1]

        column, value, gain = self._find_best_split(X, target, max_features)
        assert gain is not None
        if self.regression:
            assert gain != 0
        else:
            assert gain > minimum_gain

        self.column_index = column
        self.threshold = value
        self.impurity = gain

        # Split dataset
        left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)

        # Grow left and right child
        self.left_child = Tree(self.regression, self.criterion)
        self.left_child.train(left_X, left_target, max_features, min_samples_split,
                              max_depth - 1, minimum_gain, loss)

        self.right_child = Tree(self.regression, self.criterion)
        self.right_child.train(right_X, right_target, max_features, min_samples_split,
                               max_depth - 1, minimum_gain, loss)
    except AssertionError:
        self._calculate_leaf_value(target)
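# Usage sketch for `train` (the `Tree` constructor keywords and the `gini`
# criterion are illustrative assumptions, not defined in this section):
#
#   tree = Tree(regression=False, criterion=gini)
#   tree.train(X_train, y_train, max_features=None,
#              min_samples_split=10, max_depth=5)
#
# Internal nodes end up with `column_index`/`threshold`/`impurity` set;
# leaves get their value from `_calculate_leaf_value`.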
def _train(self, X, target, max_features=None, min_samples_split=10,
           max_depth=None, minimum_gain=0.01):
    """Recursive counterpart of `train`; expects `target` to already be a
    dictionary and `self.loss` to already be set."""
    # Treat max_depth=None as "no depth limit"
    if max_depth is None:
        max_depth = float("inf")

    try:
        # Exit from recursion using assert syntax
        assert X.shape[0] > min_samples_split
        assert max_depth > 0

        if max_features is None:
            max_features = X.shape[1]

        column, value, gain = self._find_best_split(X, target, max_features)
        assert gain is not None
        if self.regression:
            assert gain != 0
        else:
            assert gain > minimum_gain

        self.column_index = column
        self.threshold = value
        self.impurity = gain

        # Split dataset
        left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)

        # Grow left and right child; propagate the boosting loss so that
        # child splits keep using xgb_criterion when a loss is set
        self.left_child = Tree(self.regression, self.criterion, self.n_classes)
        self.left_child.loss = self.loss
        self.left_child._train(left_X, left_target, max_features, min_samples_split,
                               max_depth - 1, minimum_gain)

        self.right_child = Tree(self.regression, self.criterion, self.n_classes)
        self.right_child.loss = self.loss
        self.right_child._train(right_X, right_target, max_features, min_samples_split,
                                max_depth - 1, minimum_gain)
    except AssertionError:
        self._calculate_leaf_value(target)
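# `_calculate_leaf_value` ends the recursion above but is not shown in this
# section. A minimal sketch, assuming the usual leaf values (mean target for
# regression, class-frequency vector for classification), `numpy` imported as
# `np`, and an illustrative `outcome` attribute; a gradient-boosting leaf
# would instead derive its value from `self.loss` (not sketched here):
def _calculate_leaf_value(self, target):
    """Store the prediction made by this leaf."""
    if self.regression:
        # Mean of the target values that reached this leaf
        self.outcome = np.mean(target["y"])
    else:
        # Per-class probability estimate (assumes integer-coded labels)
        counts = np.bincount(target["y"].astype(int), minlength=self.n_classes)
        self.outcome = counts / target["y"].shape[0]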