Example #1
    def _find_best_split(self, X, target, n_features):
        """Find best feature and value for a split. Greedy algorithm."""

        # Sample random subset of features
        subset = random.sample(range(X.shape[1]), n_features)
        max_gain, max_col, max_val = None, None, None

        for column in subset:
            split_values = self._find_splits(X[:, column])
            for value in split_values:
                if self.loss is None:
                    # Random forest: score the split on the raw labels
                    splits = split(X[:, column], target['y'], value)
                    gain = self.criterion(target['y'], splits)
                else:
                    # Gradient boosting: score the split on gradient statistics
                    left, right = split_dataset(X, target, column, value,
                                                return_X=False)
                    gain = xgb_criterion(target, left, right, self.loss)

                if (max_gain is None) or (gain > max_gain):
                    max_col, max_val, max_gain = column, value, gain
        return max_col, max_val, max_gain
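
_find_splits is referenced above but not included in these examples. A common way to enumerate candidate thresholds, and a plausible guess at what it does here, is to take the midpoint between each pair of consecutive unique values of the column; a minimal sketch under that assumption:

    import numpy as np

    def _find_splits(self, column):
        """Candidate thresholds: midpoints between consecutive unique values.

        Sketch only; the original implementation is not included in these
        examples.
        """
        unique = np.unique(column)               # sorted unique values
        return (unique[:-1] + unique[1:]) / 2.0  # pairwise midpoints
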
Example #2
    def train(self, X, target, max_features=None, min_samples_split=10,
              max_depth=None, minimum_gain=0.01, loss=None):
        """Build a decision tree from the training set.

        Parameters
        ----------
        X : array-like
            Feature dataset.
        target : dictionary or array-like
            Target values.
        max_features : int or None
            The number of features to consider when looking for the best split.
            If None, all features are considered.
        min_samples_split : int
            The minimum number of samples required to split an internal node.
        max_depth : int or None
            Maximum depth of the tree. If None, the depth is unlimited.
        minimum_gain : float, default 0.01
            Minimum gain required for splitting.
        loss : function, default None
            Loss function for gradient boosting.
        """

        if not isinstance(target, dict):
            target = {'y': target}

        # Loss function for gradient boosting
        if loss is not None:
            self.loss = loss

        # None means "use all features" / "no depth limit"; normalize both so
        # that random.sample and the depth comparison below work as intended.
        if max_features is None:
            max_features = X.shape[1]
        if max_depth is None:
            max_depth = float('inf')

        try:
            # Exit from recursion: any failed assertion makes this node a leaf
            assert X.shape[0] > min_samples_split
            assert max_depth > 0

            column, value, gain = self._find_best_split(X, target, max_features)
            assert gain is not None
            if self.regression:
                assert gain != 0
            else:
                assert gain > minimum_gain

            self.column_index = column
            self.threshold = value
            self.impurity = gain

            # Split dataset
            left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)

            self.left_child = Tree(self.regression, self.criterion)
            self.left_child.train(left_X, left_target, max_features, min_samples_split, max_depth - 1,
                                  minimum_gain, loss)

            self.right_child = Tree(self.regression, self.criterion)
            self.right_child.train(right_X, right_target, max_features, min_samples_split, max_depth - 1,
                                   minimum_gain, loss)
        except AssertionError:
            self._calculate_leaf_value(target)
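
For context, here is a hedged usage sketch of train(). It assumes the surrounding module is importable (the Tree class with the constructor used in the recursive calls above, a loss attribute that defaults to None, plus the split, split_dataset, and _find_splits helpers) and that the criterion follows the criterion(y, splits) signature seen in _find_best_split. The gini and gini_gain helpers below are illustrative stand-ins, not part of the original module:

    import numpy as np

    def gini(y):
        """Gini impurity of a label vector."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return 1.0 - np.sum(p ** 2)

    def gini_gain(y, splits):
        """Weighted impurity decrease; splits is the (left_y, right_y)
        pair produced by split() inside _find_best_split."""
        n = float(len(y))
        children = sum(len(s) / n * gini(s) for s in splits)
        return gini(y) - children

    X = np.array([[2.0, 1.0], [3.0, 1.5], [10.0, 8.0], [11.0, 9.0]])
    y = np.array([0, 0, 1, 1])

    tree = Tree(False, gini_gain)  # (regression, criterion), as above
    tree.train(X, y, max_features=2, min_samples_split=1, max_depth=3)

With four linearly separable points, the root splits once on one of the two features; each child is then pure, yields zero gain, and becomes a leaf on the next recursive call.
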
Example #3
    def _train(self,
               X,
               target,
               max_features=None,
               min_samples_split=10,
               max_depth=None,
               minimum_gain=0.01):
        # None means "use all features" / "no depth limit"
        if max_features is None:
            max_features = X.shape[1]
        if max_depth is None:
            max_depth = float('inf')

        try:
            # Exit from recursion: any failed assertion makes this node a leaf
            assert X.shape[0] > min_samples_split
            assert max_depth > 0

            column, value, gain = self._find_best_split(
                X, target, max_features)
            assert gain is not None
            if self.regression:
                assert gain != 0
            else:
                assert gain > minimum_gain

            self.column_index = column
            self.threshold = value
            self.impurity = gain

            # Split dataset
            left_X, right_X, left_target, right_target = split_dataset(
                X, target, column, value)

            # Grow left and right child
            self.left_child = Tree(self.regression, self.criterion,
                                   self.n_classes)
            self.left_child._train(left_X, left_target, max_features,
                                   min_samples_split, max_depth - 1,
                                   minimum_gain)

            self.right_child = Tree(self.regression, self.criterion,
                                    self.n_classes)
            self.right_child._train(right_X, right_target, max_features,
                                    min_samples_split, max_depth - 1,
                                    minimum_gain)
        except AssertionError:
            self._calculate_leaf_value(target)
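
Both train() and _train() above lean on a split_dataset helper that these examples do not include. Given how it is called (a threshold test on one column, target carried as a dict of row-aligned arrays, and an optional return of the feature matrices), it plausibly looks like the sketch below; the comparison direction is an assumption:

    def split_dataset(X, target, column, value, return_X=True):
        """Partition rows on X[:, column] < value (direction assumed)."""
        left_mask = X[:, column] < value
        right_mask = ~left_mask

        # target is a dict of row-aligned arrays: {'y': ...} for random
        # forests, or per-row gradient statistics for gradient boosting.
        left = {key: values[left_mask] for key, values in target.items()}
        right = {key: values[right_mask] for key, values in target.items()}

        if return_X:
            return X[left_mask], X[right_mask], left, right
        return left, right
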
Example #4
    def train(self, X, target, max_features=None, min_samples_split=10,
              max_depth=None, minimum_gain=0.01, loss=None):
        """Build a decision tree from the training set.

        Parameters
        ----------
        X : array-like
            Feature dataset.
        target : dictionary or array-like
            Target values.
        max_features : int or None
            The number of features to consider when looking for the best split.
            If None, all features are considered.
        min_samples_split : int
            The minimum number of samples required to split an internal node.
        max_depth : int or None
            Maximum depth of the tree. If None, the depth is unlimited.
        minimum_gain : float, default 0.01
            Minimum gain required for splitting.
        loss : function, default None
            Loss function for gradient boosting.
        """

        if not isinstance(target, dict):
            target = {'y': target}

        # Loss function for gradient boosting
        if loss is not None:
            self.loss = loss

        # None means "use all features" / "no depth limit"
        if max_features is None:
            max_features = X.shape[1]
        if max_depth is None:
            max_depth = float('inf')

        try:
            # Exit from recursion: any failed assertion makes this node a leaf
            assert X.shape[0] > min_samples_split
            assert max_depth > 0

            column, value, gain = self._find_best_split(X, target, max_features)
            assert gain is not None
            if self.regression:
                assert gain != 0
            else:
                assert gain > minimum_gain

            self.column_index = column
            self.threshold = value
            self.impurity = gain

            # Split dataset
            left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)

            # Grow left and right child
            self.left_child = Tree(self.regression, self.criterion)
            self.left_child.train(left_X, left_target, max_features, min_samples_split, max_depth - 1,
                                  minimum_gain, loss)

            self.right_child = Tree(self.regression, self.criterion)
            self.right_child.train(right_X, right_target, max_features, min_samples_split, max_depth - 1,
                                   minimum_gain, loss)
        except AssertionError:
            self._calculate_leaf_value(target)
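
Finally, the gradient-boosting branch of _find_best_split delegates to xgb_criterion, which is also not shown. In XGBoost-style boosting, a split's gain is the children's structure scores minus the parent's, where a node's score is roughly G**2 / (H + lambda) computed from its summed gradients G and hessians H. The sketch below assumes the target dicts carry per-row 'grad' and 'hess' arrays; those key names, the reg_lambda default, and the unused loss parameter (kept only to match the call site) are all assumptions:

    def xgb_criterion(target, left, right, loss, reg_lambda=1.0):
        """XGBoost-style split gain (sketch; key names are assumptions)."""
        def score(node):
            g = node['grad'].sum()  # sum of first-order gradients in the node
            h = node['hess'].sum()  # sum of second-order hessians in the node
            return g ** 2 / (h + reg_lambda)

        # Positive when splitting improves the regularized structure score
        return 0.5 * (score(left) + score(right) - score(target))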