Example #1
def deserialize_tree(tree_dict, n_features, n_classes, n_outputs):
    tree_dict['nodes'] = [tuple(lst) for lst in tree_dict['nodes']]

    names = ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples']
    tree_dict['nodes'] = np.array(tree_dict['nodes'], dtype=np.dtype({'names': names, 'formats': tree_dict['nodes_dtype']}))
    tree_dict['values'] = np.array(tree_dict['values'])

    tree = Tree(n_features, np.array([n_classes], dtype=np.intp), n_outputs)
    tree.__setstate__(tree_dict)

    return tree
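
A minimal round-trip sketch of how deserialize_tree above might be used. The serialize_tree helper here is hypothetical (not part of the snippet) and assumes a scikit-learn version whose node records carry exactly the seven fields listed in names; it simply dumps Tree.__getstate__ into plain Python types and relies on the snippet's own imports (numpy as np, sklearn.tree._tree.Tree) being in scope.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier


def serialize_tree(tree):
    # Hypothetical counterpart of deserialize_tree: dump the Cython Tree state
    # into plain Python types (lists and dtype strings).
    state = tree.__getstate__()
    nodes = state['nodes']
    return {
        'max_depth': state['max_depth'],
        'node_count': state['node_count'],
        'nodes': [list(row) for row in nodes.tolist()],
        'nodes_dtype': [nodes.dtype[name].str for name in nodes.dtype.names],
        'values': state['values'].tolist(),
    }


X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

tree_dict = serialize_tree(clf.tree_)
restored = deserialize_tree(tree_dict, clf.n_features_in_, clf.n_classes_, clf.n_outputs_)
assert np.array_equal(restored.predict(X.astype(np.float32)),
                      clf.tree_.predict(X.astype(np.float32)))
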
Example #2
    def estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_

        if LooseVersion(sklearn_version) >= LooseVersion("0.22"):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        # convert model to estimators
        est = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)
        # we need to set the est.tree_ field with Trees constructed from the Intel(R) DAAL solution
        estimators_ = []
        for i in range(self.n_estimators):
            # print("Tree #{}".format(i))
            est_i = clone(est)
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = self.classes_
            est_i.n_classes_ = self.n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth', 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i,
                                                      self.n_classes_)

            node_ndarray = tree_i_state_class.node_ar
            value_ndarray = tree_i_state_class.value_ar
            value_shape = (node_ndarray.shape[0], self.n_outputs_,
                           self.n_classes_)
            # assert np.allclose(value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')), "Value array is non-integer"
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }
            #
            est_i.tree_ = Tree(self.n_features_,
                               np.array([self.n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self._cached_estimators_ = estimators_
        return estimators_
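
Once converted, each element behaves like an ordinary fitted DecisionTreeClassifier, so standard scikit-learn tooling applies to it. A hedged sketch, where model stands for a fitted daal4py-accelerated forest exposing the property above (the name is illustrative):

from sklearn.tree import export_text

first_tree = model.estimators_[0]      # converted DecisionTreeClassifier
print(first_tree.tree_.node_count, first_tree.tree_.max_depth)
print(export_text(first_tree))         # human-readable dump of the reconstructed splits
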
Example #3
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_
        if sklearn_check_version('0.22'):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        # convert model to estimators
        params = {
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'random_state': None,
        }
        if not sklearn_check_version('1.0'):
            params['min_impurity_split'] = self.min_impurity_split
        est = DecisionTreeClassifier(**params)

        # we need to set the est.tree_ field with Trees constructed from the
        # Intel(R) oneAPI Data Analytics Library solution
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_

            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_in_, np.array([1],
                                                             dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        return estimators_
Example #4
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_
        if LooseVersion(sklearn_version) >= LooseVersion("0.22"):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        # convert model to estimators
        est = DecisionTreeRegressor(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # we need to set the est.tree_ field with Trees constructed from the
        # Intel(R) oneAPI Data Analytics Library solution
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_

            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_, np.array([1], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        return estimators_
Example #5
    def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking steps.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        # Convert data
        if check_input:
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # which [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    "Invalid value for max_features. Allowed string " 'values are "auto", "sqrt" or "log2".'
                )
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous:
                sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError(
                    "Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)
                )

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features, self.min_samples_leaf, random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(
            self.n_features_,
            self.n_classes_,
            self.n_outputs_,
            splitter,
            max_depth,
            min_samples_split,
            self.min_samples_leaf,
            random_state,
        )

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
Example #6
class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator, _LearntSelectorMixin)):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self, criterion, splitter, max_depth, min_samples_split, min_samples_leaf, max_features, random_state):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None

        self.splitter_ = None
        self.tree_ = None

    def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking steps.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        # Convert data
        if check_input:
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # which [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    "Invalid value for max_features. Allowed string " 'values are "auto", "sqrt" or "log2".'
                )
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous:
                sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError(
                    "Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)
                )

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features, self.min_samples_leaf, random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(
            self.n_features_,
            self.n_classes_,
            self.n_outputs_,
            splitter,
            max_depth,
            min_samples_split,
            self.min_samples_leaf,
            random_state,
        )

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X is
        returned. For a regression model, the predicted value based on X is
        returned.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes, or the predict values.
        """
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise Exception("Tree not initialized. Perform a fit first")

        if self.n_features_ != n_features:
            raise ValueError(
                "Number of features of the model must "
                " match the input. Model n_features is %s and "
                " input n_features is %s " % (self.n_features_, n_features)
            )

        proba = self.tree_.predict(X)

        # Classification
        if isinstance(self, ClassifierMixin):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)

            else:
                predictions = np.zeros((n_samples, self.n_outputs_))

                for k in xrange(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)

                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]

            else:
                return proba[:, :, 0]

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.tree_ is None:
            raise ValueError("Estimator not fitted, " "call `fit` before `feature_importances_`.")

        return self.tree_.compute_feature_importances()
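
The fit, predict and feature_importances_ docstrings above describe the usual workflow. A minimal sketch against the current scikit-learn API (the deprecated sample_mask and X_argsorted arguments are simply omitted):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
weights = np.ones(len(y))          # per-sample weights, as described for sample_weight above

clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X, y, sample_weight=weights)

print(clf.predict(X[:5]))          # predicted class labels
print(clf.feature_importances_)    # normalized (Gini) importances, shape (n_features,)
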
Example #7
        def fit(self, X, y, sample_weight=None, check_input=True,
                X_idx_sorted=None):

            random_state = check_random_state(self.random_state)

            if self.ccp_alpha < 0.0:
                raise ValueError(
                    "ccp_alpha must be greater than or equal to 0")

            if check_input:
                # Need to validate separately here.
                # We can't pass multi_output=True because that would allow y to be
                # csr.
                check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
                check_y_params = dict(ensure_2d=False, dtype=None)
                X, y = self._validate_data(X, y,
                                           validate_separately=(check_X_params,
                                                                check_y_params))
                if issparse(X):
                    X.sort_indices()

                    if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                        raise ValueError("No support for np.int64 index based "
                                         "sparse matrices")

            # Determine output settings
            n_samples, self.n_features_ = X.shape
            is_classification = is_classifier(self)

            y = np.atleast_1d(y)
            expanded_class_weight = None

            if y.ndim == 1:
                # reshape is necessary to preserve the data contiguity,
                # which [:, np.newaxis] does not.
                y = np.reshape(y, (-1, 1))

            self.n_outputs_ = y.shape[1]

            if is_classification:
                check_classification_targets(y)
                y = np.copy(y)
                # print(y)
                self.classes_ = []
                self.n_classes_ = []

                if self.class_weight is not None:
                    y_original = np.copy(y)

                y_encoded = np.zeros(y.shape, dtype=int)
                for k in range(self.n_outputs_):
                    classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                           return_inverse=True)
                    self.classes_.append(classes_k)
                    self.n_classes_.append(classes_k.shape[0])
                y = y_encoded

                if self.class_weight is not None:
                    expanded_class_weight = compute_sample_weight(
                        self.class_weight, y_original)

                self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
                y = np.ascontiguousarray(y, dtype=DOUBLE)

            # Check parameters
            max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                         else self.max_depth)
            max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                              else self.max_leaf_nodes)

            if isinstance(self.min_samples_leaf, numbers.Integral):
                if not 1 <= self.min_samples_leaf:
                    raise ValueError("min_samples_leaf must be at least 1 "
                                     "or in (0, 0.5], got %s"
                                     % self.min_samples_leaf)
                min_samples_leaf = self.min_samples_leaf
            else:  # float
                if not 0. < self.min_samples_leaf <= 0.5:
                    raise ValueError("min_samples_leaf must be at least 1 "
                                     "or in (0, 0.5], got %s"
                                     % self.min_samples_leaf)
                min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

            if isinstance(self.min_samples_split, numbers.Integral):
                if not 2 <= self.min_samples_split:
                    raise ValueError("min_samples_split must be an integer "
                                     "greater than 1 or a float in (0.0, 1.0]; "
                                     "got the integer %s"
                                     % self.min_samples_split)
                min_samples_split = self.min_samples_split
            else:  # float
                if not 0. < self.min_samples_split <= 1.:
                    raise ValueError("min_samples_split must be an integer "
                                     "greater than 1 or a float in (0.0, 1.0]; "
                                     "got the float %s"
                                     % self.min_samples_split)
                min_samples_split = int(
                    ceil(self.min_samples_split * n_samples))
                min_samples_split = max(2, min_samples_split)

            min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

            if isinstance(self.max_features, str):
                if self.max_features == "auto":
                    if is_classification:
                        max_features = max(1, int(np.sqrt(self.n_features_)))
                    else:
                        max_features = self.n_features_
                elif self.max_features == "sqrt":
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                elif self.max_features == "log2":
                    max_features = max(1, int(np.log2(self.n_features_)))
                else:
                    raise ValueError("Invalid value for max_features. "
                                     "Allowed string values are 'auto', "
                                     "'sqrt' or 'log2'.")
            elif self.max_features is None:
                max_features = self.n_features_
            elif isinstance(self.max_features, numbers.Integral):
                max_features = self.max_features
            else:  # float
                if self.max_features > 0.0:
                    max_features = max(1,
                                       int(self.max_features * self.n_features_))
                else:
                    max_features = 0

            self.max_features_ = max_features

            if len(y) != n_samples:
                raise ValueError("Number of labels=%d does not match "
                                 "number of samples=%d" % (len(y), n_samples))
            if not 0 <= self.min_weight_fraction_leaf <= 0.5:
                raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
            if max_depth <= 0:
                raise ValueError("max_depth must be greater than zero. ")
            if not (0 < max_features <= self.n_features_):
                raise ValueError("max_features must be in (0, n_features]")
            if not isinstance(max_leaf_nodes, numbers.Integral):
                raise ValueError("max_leaf_nodes must be integral number but was "
                                 "%r" % max_leaf_nodes)
            if -1 < max_leaf_nodes < 2:
                raise ValueError(("max_leaf_nodes {0} must be either None "
                                  "or larger than 1").format(max_leaf_nodes))

            if sample_weight is not None:
                sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

            if expanded_class_weight is not None:
                if sample_weight is not None:
                    sample_weight = sample_weight * expanded_class_weight
                else:
                    sample_weight = expanded_class_weight

            # Set min_weight_leaf from min_weight_fraction_leaf
            if sample_weight is None:
                min_weight_leaf = (self.min_weight_fraction_leaf *
                                   n_samples)
            else:
                min_weight_leaf = (self.min_weight_fraction_leaf *
                                   np.sum(sample_weight))

            min_impurity_split = self.min_impurity_split
            if min_impurity_split is not None:
                warnings.warn("The min_impurity_split parameter is deprecated. "
                              "Its default value has changed from 1e-7 to 0 in "
                              "version 0.23, and it will be removed in 0.25. "
                              "Use the min_impurity_decrease parameter instead.",
                              FutureWarning)

                if min_impurity_split < 0.:
                    raise ValueError("min_impurity_split must be greater than "
                                     "or equal to 0")
            else:
                min_impurity_split = 0

            if self.min_impurity_decrease < 0.:
                raise ValueError("min_impurity_decrease must be greater than "
                                 "or equal to 0")

            if self.presort != 'deprecated':
                warnings.warn("The parameter 'presort' is deprecated and has no "
                              "effect. It will be removed in v0.24. You can "
                              "suppress this warning by not passing any value "
                              "to the 'presort' parameter.",
                              FutureWarning)

            # Build tree
            criterion = self.criterion
            if not isinstance(criterion, Criterion):
                if is_classification:
                    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                             self.n_classes_)
                else:
                    criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                             n_samples)

            SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

            splitter = self.splitter
            if not isinstance(self.splitter, Splitter):
                splitter = SPLITTERS[self.splitter](criterion,
                                                    self.max_features_,
                                                    min_samples_leaf,
                                                    min_weight_leaf,
                                                    random_state)

            if is_classifier(self):
                self.tree_ = Tree(self.n_features_,
                                  self.n_classes_, self.n_outputs_)

            else:
                self.tree_ = Tree(self.n_features_,
                                  # TODO: tree shouldn't need this in this case
                                  np.array([1] * self.n_outputs_,
                                           dtype=np.intp),
                                  self.n_outputs_)

            # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
            if max_leaf_nodes < 0:
                builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                                min_samples_leaf,
                                                min_weight_leaf,
                                                max_depth,
                                                self.min_impurity_decrease,
                                                min_impurity_split)
            else:
                builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                               min_samples_leaf,
                                               min_weight_leaf,
                                               max_depth,
                                               max_leaf_nodes,
                                               self.min_impurity_decrease,
                                               min_impurity_split)

            builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
            # print(self.tree_.children_left.shape)
            if self.n_outputs_ == 1 and is_classifier(self):
                self.n_classes_ = self.n_classes_[0]
                self.classes_ = self.classes_[0]
            # print(self.tree_.weighted_n_node_samples)

            e = self.e
            # print(e)
            # for i in range(self.tree_.value.shape[0]):
            #
            # 	for j in range(self.tree_.value.shape[2]):
            #
            # 		self.e = e /((self.tree_.value[i][0][j] + max_depth))
            # 		#print(self.tree_.value[i][0][j])
            # 		self.tree_.value[i][0][j] = self.addNoise(self.tree_.value[i][0][j])
            # 		#print(self.tree_.value[i][0][j])

            # print(self.tree_.value[0][0])

            for i in range(self.tree_.value.shape[0]):
                fr = np.sum(self.tree_.value[i][0])
                self.e = e / (fr + max_depth)
                self.tree_.value[i][0] = self.addNoise(self.tree_.value[i][0])

            self._prune_tree()
            # print(self.tree_.value[0][0])
            return self
Example #8
    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted="deprecated"):
        """Build a survival tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        check_input : boolean, default: True
            Allow to bypass several input checking steps.
            Don't use this parameter unless you know what you're doing.

        X_idx_sorted : deprecated, default="deprecated"
            This parameter is deprecated and has no effect

        Returns
        -------
        self
        """
        random_state = check_random_state(self.random_state)

        if check_input:
            X, event, time = check_arrays_survival(X, y)
            time = time.astype(np.float64)
            self.event_times_ = np.unique(time[event])

            y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
            y_numeric[:, 0] = time
            y_numeric[:, 1] = event.astype(np.float64)
        else:
            y_numeric, self.event_times_ = y

        n_samples, self.n_features_ = X.shape
        self.n_features_in_ = self.n_features_
        params = self._check_params(n_samples)

        if not isinstance(X_idx_sorted, str) or X_idx_sorted != "deprecated":
            warnings.warn(
                "The parameter 'X_idx_sorted' is deprecated and has no "
                "effect. It will be removed in sklearn 1.1 (renaming of 0.26). "
                "You can suppress this warning by not passing any value to the "
                "'X_idx_sorted' parameter.",
                FutureWarning
            )

        self.n_outputs_ = self.event_times_.shape[0]
        # one "class" for CHF, one for survival function
        self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

        # Build tree
        criterion = LogrankCriterion(self.n_outputs_, n_samples, self.event_times_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = DENSE_SPLITTERS[self.splitter](
                criterion,
                self.max_features_,
                params["min_samples_leaf"],
                params["min_weight_leaf"],
                random_state)

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if params["max_leaf_nodes"] < 0:
            builder = DepthFirstTreeBuilder(splitter,
                                            params["min_samples_split"],
                                            params["min_samples_leaf"],
                                            params["min_weight_leaf"],
                                            params["max_depth"],
                                            0.0,  # min_impurity_decrease
                                            params["min_impurity_split"])
        else:
            builder = BestFirstTreeBuilder(splitter,
                                           params["min_samples_split"],
                                           params["min_samples_leaf"],
                                           params["min_weight_leaf"],
                                           params["max_depth"],
                                           params["max_leaf_nodes"],
                                           0.0,  # min_impurity_decrease
                                           params["min_impurity_split"])

        builder.build(self.tree_, X, y_numeric, sample_weight)

        return self
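
The docstring above expects y as a structured array whose first field is the binary event indicator and whose second field is the event or censoring time. A hedged NumPy sketch of building such an array; the field names 'event' and 'time' are illustrative choices, only the field order is prescribed by the docstring:

import numpy as np

event = np.array([True, False, True, True])   # True = event observed, False = censored
time = np.array([5.0, 12.0, 6.5, 3.2])        # time of event or of censoring

y = np.empty(event.shape[0], dtype=[('event', '?'), ('time', '<f8')])
y['event'] = event
y['time'] = time

# tree.fit(X, y) would then take the check_input branch above (X assumed available).
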
Example #9
    def fit(self,
            X,
            y,
            sample_mask=None,
            X_argsorted=None,
            check_input=True,
            sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking steps.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        # Convert data
        if check_input:
            X, = check_arrays(X,
                              dtype=DTYPE,
                              sparse_format="dense",
                              check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # which [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2**31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE
                    or not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features,
                                                self.min_samples_leaf,
                                                random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                          splitter, max_depth, min_samples_split,
                          self.min_samples_leaf, random_state)

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
Example #10
 def fit(self, X, y, check_input=True, sample_weight=None):
     # Pull the random state from the estimator
     random_state = check_random_state(self.random_state)
     # If the input hasn't been validated yet
     if check_input:
         # Then convert the X data
         X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
     # Get the dimensions of X
     n_samples, self.n_features_ = X.shape
     # Make sure that y is at least 1-d
     y = np.atleast_1d(y)
     # If our output is 1d
     if y.ndim == 1:
         # Reshape y to preserve the data contiguity
         y = np.reshape(y, (-1, 1))
     # Get the number of outputs
     self.n_outputs_ = y.shape[1]
     y = np.copy(y)
     # Make a container for all unique classes
     self.classes_ = []
     # Make a container for the number of unique classes per output
     self.n_classes_ = []
     # For each output of y
     for k in xrange(self.n_outputs_):
         # Get the unique class labels and an array of indices pointing to them
         classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
         # Store the unique class labels
         self.classes_.append(classes_k)
         # And store how many unique class labels there are
         self.n_classes_.append(classes_k.shape[0])
     # Store the class counts as a numpy array of intp
     self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
     if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
         y = np.ascontiguousarray(y, dtype=DOUBLE)
     # Check parameters
     # If no max_depth was given
     max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth
     # If a string value was given
     if isinstance(self.max_features, six.string_types):
         # then use the square root of the number of features
         max_features = max(1, int(np.sqrt(self.n_features_)))
     # If None was given
     elif self.max_features is None:
         # Just use all of them
         max_features = self.n_features_
     # Otherwise
     else:
         # Use what was given
         max_features = self.max_features
     # If we were given a sample weight
     if sample_weight is not None:
         # Then we'll need to make sure it's contiguous double precision
         if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous):
             sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
     min_samples_split = self.min_samples_split
     criterion = self.criterion
     # If we have not yet initialized our tree criterion
     if criterion is None:
         # Let's initialize our entropy criterion
         criterion = Entropy(self.n_outputs_, self.n_classes_)
     splitter = self.splitter
     # If we have not yet initialized our tree splitter
     if splitter is None:
         # Let's initialize our best binary splitter
         splitter = BestSplitter(criterion, max_features, self.min_samples_leaf, random_state)
     # We'll save these so we don't have to initialize them again when retraining
     self.criterion_ = criterion
     self.splitter_ = splitter
     # Now let's initialize the tree
     self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_, splitter, max_depth, min_samples_split, self.min_samples_leaf, random_state)
     # and build the tree on the data
     self.tree_.build(X, y, sample_weight=sample_weight)
     # If we only have one output
     if self.n_outputs_ == 1:
         # Then unwrap the single output's classes
         self.n_classes_ = self.n_classes_[0]
         self.classes_ = self.classes_[0]
     # And return the fitted estimator
     return self
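
When criterion and splitter are None, the snippet above falls back to a hard-coded entropy criterion and a best binary splitter with sqrt(n_features) candidate features. With the public scikit-learn API the equivalent configuration is reached through constructor parameters alone; a sketch (not the snippet's own class):

from sklearn.tree import DecisionTreeClassifier

# Public-API equivalent of the defaults above: entropy impurity, exhaustive
# best splits, sqrt(n_features) features considered at each split.
clf = DecisionTreeClassifier(criterion="entropy", splitter="best",
                             max_features="sqrt", random_state=0)
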
Example #11
    def daal_fit(self, X, y):
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.single, np.double]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # which [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if self.n_outputs_ != 1:
            _class_name = self.__class__.__name__
            raise ValueError(
                _class_name +
                " does not currently support multi-output data. Consider using OneHotEncoder"
            )

        y = check_array(y, ensure_2d=False, dtype=None)
        y, _ = self._validate_y_class_weight(y)
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

        self.n_features_ = X.shape[1]

        rs_ = check_random_state(self.random_state)
        seed_ = rs_.randint(0, np.iinfo('i').max)

        if self.n_classes_ < 2:
            raise ValueError(
                "Training data only contain information about one class.")

        # create algorithm
        X_fptype = getFPType(X)
        daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
        _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                     X.shape[1],
                                                     is_classification=False)

        dfc_algorithm = daal4py.decision_forest_classification_training(
            nClasses=int(self.n_classes_),
            fptype=X_fptype,
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine_,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                    self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap))
        # compute
        dfc_trainingResult = dfc_algorithm.compute(X, y)

        # get resulting model
        model = dfc_trainingResult.model
        self.daal_model_ = model

        # convert model to estimators
        est = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # we need to set the est.tree_ field with Trees constructed from the Intel(R) DAAL solution
        estimators_ = []
        for i in range(self.n_estimators):
            # print("Tree #{}".format(i))
            est_i = clone(est)
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = self.classes_
            est_i.n_classes_ = self.n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth', 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(model, i,
                                                      self.n_classes_)

            node_ndarray = tree_i_state_class.node_ar
            value_ndarray = tree_i_state_class.value_ar
            value_shape = (node_ndarray.shape[0], self.n_outputs_,
                           self.n_classes_)

            # assert np.allclose(value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')), "Value array is non-integer"

            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }
            #
            est_i.tree_ = Tree(self.n_features_,
                               np.array([self.n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self.estimators_ = estimators_

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Example #12
if getattr(y_train, "dtype", None) != DOUBLE or not y_train.flags.contiguous:
    y_train = np.ascontiguousarray(y_train, dtype=DOUBLE)

max_depth = (np.iinfo(np.int32).max if max_depth is None else max_depth)
max_leaf_nodes = (-1 if max_leaf_nodes is None else max_leaf_nodes)

max_features = max(1, int(np.sqrt(n_features_)))

criterion = CRITERIA_CLF[criterion](n_outputs_, n_classes_)

SPLITTERS = DENSE_SPLITTERS

splitter = SPLITTERS[splitter](criterion, max_features, min_samples_leaf,
                               min_weight_leaf, random_state)

tree_ = Tree(n_features_, n_classes_, n_outputs_)

builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf,
                                min_weight_leaf, max_depth,
                                min_impurity_decrease, min_impurity_split)

builder.build(tree_, X_train, y_train)

classes_ = classes_[0]

n_classes_ = np.atleast_1d(n_classes_)
pruned_tree = Tree(n_features_, n_classes_, n_outputs_)
_build_pruned_tree_ccp(pruned_tree, tree_, 0)
tree_ = pruned_tree

X_test = check_array(X_test, dtype=DTYPE, accept_sparse="csr")
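Example #12 above prunes the freshly built tree with the private
_build_pruned_tree_ccp helper. Outside such internal code, the same cost-complexity
pruning is exposed through the public ccp_alpha parameter; a short sketch on
synthetic data (illustrative only):

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=8, random_state=0)

unpruned = DecisionTreeClassifier(random_state=0).fit(X, y)
pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=0.02).fit(X, y)

# Pruning collapses subtrees whose effective alpha is below ccp_alpha,
# so the pruned tree can never have more nodes than the unpruned one.
assert pruned.tree_.node_count <= unpruned.tree_.node_count
print(unpruned.tree_.node_count, "->", pruned.tree_.node_count)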
Example #13
0
class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
    """A survival tree.

    The quality of a split is measured by the
    log-rank splitting rule.

    See [1]_, [2]_ and [3]_ for further description.

    Parameters
    ----------
    splitter : string, optional, default: "best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int or None, optional, default: None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, optional, default: 6
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int, float, optional, default: 3
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches.  This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, optional, default: 0.
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, string or None, optional, default: None
        The number of features to consider when looking for the best split:

            - If int, then consider `max_features` features at each split.
            - If float, then `max_features` is a fraction and
              `int(max_features * n_features)` features are considered at each
              split.
            - If "auto", then `max_features=sqrt(n_features)`.
            - If "sqrt", then `max_features=sqrt(n_features)`.
            - If "log2", then `max_features=log2(n_features)`.
            - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    max_leaf_nodes : int or None, optional, default: None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    presort : deprecated, optional, default: 'deprecated'
        This parameter is deprecated and will be removed in a future version.

    Attributes
    ----------
    event_times_ : array of shape = (n_event_times,)
        Unique time points where events occurred.

    max_features_ : int,
        The inferred value of max_features.

    n_features_ : int
        The number of features when ``fit`` is performed.

    tree_ : Tree object
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object.

    See also
    --------
    sksurv.ensemble.RandomSurvivalForest
        An ensemble of SurvivalTrees.

    References
    ----------
    .. [1] Leblanc, M., & Crowley, J. (1993). Survival Trees by Goodness of Split.
           Journal of the American Statistical Association, 88(422), 457–467.

    .. [2] Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008).
           Random survival forests. The Annals of Applied Statistics, 2(3), 841–860.

    .. [3] Ishwaran, H., Kogalur, U. B. (2007). Random survival forests for R.
           R News, 7(2), 25–31. https://cran.r-project.org/doc/Rnews/Rnews_2007-2.pdf.
    """
    def __init__(self,
                 splitter="best",
                 max_depth=None,
                 min_samples_split=6,
                 min_samples_leaf=3,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 presort='deprecated'):
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.presort = presort

    def fit(self,
            X,
            y,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None):
        """Build a survival tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        check_input : boolean, default: True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        X_idx_sorted : array-like, shape = (n_samples, n_features), optional
            The indexes of the sorted training input samples. If many trees
            are grown on the same dataset, this allows the ordering to be
            cached between trees. If None, the data will be sorted here.
            Don't use this parameter unless you know what to do.

        Returns
        -------
        self
        """
        random_state = check_random_state(self.random_state)

        if check_input:
            X, event, time = check_arrays_survival(X, y)
            time = time.astype(np.float64)
            self.event_times_ = np.unique(time[event])

            y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
            y_numeric[:, 0] = time
            y_numeric[:, 1] = event.astype(np.float64)
        else:
            y_numeric, self.event_times_ = y

        n_samples, self.n_features_ = X.shape
        params = self._check_params(n_samples)

        self.n_outputs_ = self.event_times_.shape[0]
        # one "class" for CHF, one for survival function
        self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

        # Build tree
        criterion = LogrankCriterion(self.n_outputs_, n_samples,
                                     self.event_times_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = DENSE_SPLITTERS[self.splitter](
                criterion, self.max_features_, params["min_samples_leaf"],
                params["min_weight_leaf"], random_state)

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if params["max_leaf_nodes"] < 0:
            builder = DepthFirstTreeBuilder(
                splitter,
                params["min_samples_split"],
                params["min_samples_leaf"],
                params["min_weight_leaf"],
                params["max_depth"],
                0.0,  # min_impurity_decrease
                params["min_impurity_split"])
        else:
            builder = BestFirstTreeBuilder(
                splitter,
                params["min_samples_split"],
                params["min_samples_leaf"],
                params["min_weight_leaf"],
                params["max_depth"],
                params["max_leaf_nodes"],
                0.0,  # min_impurity_decrease
                params["min_impurity_split"])

        builder.build(self.tree_, X, y_numeric, sample_weight, X_idx_sorted)

        return self

    def _check_params(self, n_samples):
        # Check parameters
        max_depth = ((2**31) - 1 if self.max_depth is None else self.max_depth)
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero.")

        max_leaf_nodes = self._check_max_leaf_nodes()
        min_samples_leaf = self._check_min_samples_leaf(n_samples)
        min_samples_split = self._check_min_samples_split(n_samples)
        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        self._check_max_features()

        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")

        min_weight_leaf = self.min_weight_fraction_leaf * n_samples
        min_impurity_split = 1e-7

        if self.presort != 'deprecated':
            warnings.warn(
                "The parameter 'presort' is deprecated and has no "
                "effect. It will be removed in v0.24. You can "
                "suppress this warning by not passing any value "
                "to the 'presort' parameter.", DeprecationWarning)

        return {
            "max_depth": max_depth,
            "max_leaf_nodes": max_leaf_nodes,
            "min_samples_leaf": min_samples_leaf,
            "min_samples_split": min_samples_split,
            "min_impurity_split": min_impurity_split,
            "min_weight_leaf": min_weight_leaf,
        }

    def _check_max_leaf_nodes(self):
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None else
                          self.max_leaf_nodes)
        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {} must be either None "
                              "or larger than 1").format(max_leaf_nodes))
        return max_leaf_nodes

    def _check_min_samples_leaf(self, n_samples):
        if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0. < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))
        # FIXME throw exception if min_samples_leaf < 2
        return min_samples_leaf

    def _check_min_samples_split(self, n_samples):
        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s" % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s" % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)
        return min_samples_split

    def _check_max_features(self):
        if isinstance(self.max_features, str):
            if self.max_features in ("auto", "sqrt"):
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        self.max_features_ = max_features

    def _validate_X_predict(self, X, check_input):
        """Validate X whenever one tries to predict"""
        if check_input:
            X = check_array(X, dtype=DTYPE)

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s." %
                             (self.n_features_, n_features))

        return X

    def predict(self, X, check_input=True):
        """Predict risk score.

        The risk score is the total number of events, which can
        be estimated by the sum of the estimated cumulative
        hazard function :math:`\\hat{H}_h` in terminal node :math:`h`.

        .. math::

            \\sum_{j=1}^{n(h)} \\hat{H}_h(T_{j} \\mid x) ,

        where :math:`n(h)` denotes the number of distinct event times
        of samples belonging to the same terminal node as :math:`x`.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        risk_scores : ndarray, shape = (n_samples,)
            Predicted risk scores.
        """
        chf = self.predict_cumulative_hazard_function(X,
                                                      check_input,
                                                      return_array=True)
        return chf.sum(1)

    def predict_cumulative_hazard_function(self,
                                           X,
                                           check_input=True,
                                           return_array="warn"):
        """Predict cumulative hazard function.

        The cumulative hazard function (CHF) for an individual
        with feature vector :math:`x` is computed from
        all samples of the training data that are in the
        same terminal node as :math:`x`.
        It is estimated by the Nelson–Aalen estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        return_array : boolean
            If set, return an array with the cumulative hazard rate
            for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction`.

        Returns
        -------
        cum_hazard : ndarray
            If `return_array` is set, an array with the cumulative hazard rate
            for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction` will be returned.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_whas500
        >>> from sksurv.tree import SurvivalTree

        Load and prepare the data.

        >>> X, y = load_whas500()
        >>> X = X.astype(float)

        Fit the model.

        >>> estimator = SurvivalTree().fit(X, y)

        Estimate the cumulative hazard function for the first 5 samples.

        >>> chf_funcs = estimator.predict_cumulative_hazard_function(X.iloc[:5], return_array=False)

        Plot the estimated cumulative hazard functions.

        >>> for fn in chf_funcs:
        ...    plt.step(fn.x, fn(fn.x), where="post")
        ...
        >>> plt.ylim(0, 1)
        >>> plt.show()
        """
        if return_array == "warn":
            warnings.warn(
                "predict_cumulative_hazard_function will return an array of StepFunction instances in 0.14. "
                "Use return_array=True to keep the old behavior.",
                FutureWarning)

        check_is_fitted(self, 'tree_')
        X = self._validate_X_predict(X, check_input)

        pred = self.tree_.predict(X)
        arr = pred[..., 0]
        if return_array:
            return arr
        return _array_to_step_function(self.event_times_, arr)

    def predict_survival_function(self,
                                  X,
                                  check_input=True,
                                  return_array="warn"):
        """Predict survival function.

        The survival function for an individual
        with feature vector :math:`x` is computed from
        all samples of the training data that are in the
        same terminal node as :math:`x`.
        It is estimated by the Kaplan-Meier estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        return_array : boolean
            If set, return an array with the probability
            of survival for each `self.event_times_`,
            otherwise an array of :class:`sksurv.functions.StepFunction`.

        Returns
        -------
        survival : ndarray
            If `return_array` is set, an array with the probability
            of survival for each `self.event_times_`,
            otherwise an array of :class:`sksurv.functions.StepFunction`
            will be returned.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_whas500
        >>> from sksurv.tree import SurvivalTree

        Load and prepare the data.

        >>> X, y = load_whas500()
        >>> X = X.astype(float)

        Fit the model.

        >>> estimator = SurvivalTree().fit(X, y)

        Estimate the survival function for the first 5 samples.

        >>> surv_funcs = estimator.predict_survival_function(X.iloc[:5], return_array=False)

        Plot the estimated survival functions.

        >>> for fn in surv_funcs:
        ...    plt.step(fn.x, fn(fn.x), where="post")
        ...
        >>> plt.ylim(0, 1)
        >>> plt.show()
        """
        if return_array == "warn":
            warnings.warn(
                "predict_survival_function will return an array of StepFunction instances in 0.14. "
                "Use return_array=True to keep the old behavior.",
                FutureWarning)

        check_is_fitted(self, 'tree_')
        X = self._validate_X_predict(X, check_input)

        pred = self.tree_.predict(X)
        arr = pred[..., 1]
        if return_array:
            return arr
        return _array_to_step_function(self.event_times_, arr)
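As the predict docstring above explains, the risk score is the cumulative hazard
summed over event_times_. A short sketch of that relationship, following the
data-loading pattern of the docstring examples (illustrative; exact values depend
on the installed sksurv version):

import numpy as np
from sksurv.datasets import load_whas500
from sksurv.tree import SurvivalTree

X, y = load_whas500()
X = X.astype(float)

est = SurvivalTree(max_depth=3).fit(X, y)

# predict() is the row-wise sum of the cumulative hazard array.
chf = est.predict_cumulative_hazard_function(X.iloc[:5], return_array=True)
risk = est.predict(X.iloc[:5])
assert np.allclose(risk, chf.sum(axis=1))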
Example #15
0
def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result as
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        `bins[i-1] <= x < bins[i]` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the
        function throws an exception when `right=False`. However,
        this could be overcome by using :epkg:`ONNX` where all
        kind of decision rules are implemented. Default value for
        right is still *False* to follow *numpy* API even though
        this value raises an exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))

    See also example :ref:`l-example-digitize`.

    .. versionadded:: 0.4
    """
    if not right:
        raise RuntimeError(
            "right must be True not right=%r" % right)
    ascending = len(bins) <= 1 or bins[0] < bins[1]

    if not ascending:
        bins2 = bins[::-1]
        cl = digitize2tree(bins2, right=right)
        n = len(bins)
        for i in range(cl.tree_.value.shape[0]):
            cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def add_root(index):
        if index < 0 or index >= len(bins):
            raise IndexError(  # pragma: no cover
                "Unexpected index %d / len(bins)=%d." % (
                    index, len(bins)))
        parent = -1
        is_left = False
        is_leaf = False
        threshold = bins[index]
        n = tree_add_node(
            tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.)
        values.append(UNUSED)
        n_nodes.append(n)
        return n

    def add_nodes(parent, i, j, is_left):
        # add for bins[i:j] (j excluded)
        if is_left:
            # it means j is the parent split
            if i == j:
                # leaf
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.)
                n_nodes.append(n)
                values.append(i)
                return n
            if i + 1 == j:
                # split
                values.append(UNUSED)
                th = bins[i]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, i, True)
                add_nodes(n, i, j, False)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        else:
            # it means i is the parent split
            if i + 1 == j:
                # leaf
                values.append(j)
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.)
                n_nodes.append(n)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        raise NotImplementedError(  # pragma: no cover
            "Unexpected case where i=%r, j=%r, is_left=%r." % (
                i, j, is_left))

    index = len(bins) // 2
    add_root(index)
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(  # pylint: disable=E1137
        values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    try:
        # scikit-learn >= 0.24
        cl.n_features_in_ = 1
    except AttributeError:
        # scikit-learn < 0.24
        cl.n_features_ = 1
    try:
        # for scikit-learn<=0.23.2
        cl.n_features_ = 1
    except AttributeError:
        pass
    return cl
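The "not ascending" branch above handles monotonically decreasing bins by building
the tree for the reversed bins and remapping the leaf values. A sketch mirroring
the docstring comparison for that case (illustrative; assumes digitize2tree is
importable from mlinsights.mltree as in the docstring):

import numpy
from mlinsights.mltree import digitize2tree

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins = numpy.array([7.0, 4.0, 2.5, 1.0, 0.0])  # monotonically decreasing

expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x.reshape((-1, 1)))
print("numpy:", expected)
print("tree :", pred)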
Example #16
0
class myDecisionTreeClassifier(six.with_metaclass(ABCMeta, BaseEstimator, _LearntSelectorMixin, ClassifierMixin)):
    
    def __init__(self,
                 # Max depth for Decision Tree
                 max_depth=None,
                 # Min number of samples per split
                 min_samples_split=2,
                 # Min samples per leaf node
                 min_samples_leaf=1,
                 # Max number of features to consider when looking for the best split
                 max_features=None,
                 # Init the random state of the tree
                 random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        # We'll wait until we fit to init these:
        # Learning criterion for training tree
        self.criterion = None
        # Split method
        self.splitter = None
        # Number of features
        self.n_features_ = None
        # Number of outputs
        self.n_outputs_ = None
        # Labels of classes
        self.classes_ = None
        # Number of classes
        self.n_classes_ = None
        # Tree data structure
        self.tree_ = None
            
    def fit(self, X, y, check_input=True, sample_weight=None):
        # Pull the random state from the tree
        random_state = check_random_state(self.random_state)
        # If the data hasn't yet been formatted
        if check_input:
            # Then convert the X data
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        # Get the dimensions of X
        n_samples, self.n_features_ = X.shape
        # Make sure that y is 1d and not a column vector
        y = np.atleast_1d(y)
        # If our output is 1d
        if y.ndim == 1:
            # Reshape y to preserve the data contiguity
            y = np.reshape(y, (-1, 1))
        # Get the number of outputs
        self.n_outputs_ = y.shape[1]
        y = np.copy(y)
        # Make a container for all unique classes
        self.classes_ = []
        # Make a container for the number of instances of each unique class
        self.n_classes_ = []
        # For each output of y
        for k in xrange(self.n_outputs_):
            # Get the unique class labels and an array of indexes pointing to each label
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            # Store the unique class labels
            self.classes_.append(classes_k)
            # And store the number of unique class labels
            self.n_classes_.append(classes_k.shape[0])
        # Let's make this a numpy array of ints for speed
        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)
        # Check parameters
        # If no max_depth was given
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth
        # If a string was given
        if isinstance(self.max_features, six.string_types):
            # then set it to the sqrt of the number of features
            max_features = max(1, int(np.sqrt(self.n_features_)))
        # If None was given
        elif self.max_features is None:
            # Just use all of them
            max_features = self.n_features_
        # Otherwise
        else:
            # Use what's given
            max_features = self.max_features
        # If we were given a sample weight
        if sample_weight is not None:
            # Then we'll need to make sure it's double precision
            if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
        min_samples_split = self.min_samples_split
        criterion = self.criterion
        # If we have not yet initialized our tree criterion
        if criterion is None:
            # Let's init our entropy criterion
            criterion = Entropy(self.n_outputs_, self.n_classes_)
        splitter = self.splitter
        # If we have not yet initialized our tree splitter
        if splitter is None:
            # Let's init our best binary splitter
            splitter = BestSplitter(criterion, max_features, self.min_samples_leaf, random_state)
        # We'll save these so we don't have to init them again for retraining
        self.criterion_ = criterion
        self.splitter_ = splitter
        # Now let's init the tree
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_, splitter, max_depth, min_samples_split, self.min_samples_leaf, random_state)
        # and fit our tree database
        self.tree_.build(X, y, sample_weight=sample_weight)
        # If we only have one output
        if self.n_outputs_ == 1:
            # Then just save the first class
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]
        # Then save our tree
        return self

    def predict(self, X):
        """Predict class for a given X"""
        # Make sure the data is DTYPE for the tree and is 2D
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)
        # Get the dimensions of X
        n_samples, n_features = X.shape
        # Predict class from tree database
        proba = self.tree_.predict(X)
        # If we only have one output
        if self.n_outputs_ == 1:
            # Then use the index of the max prob to pick the class from classes_
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)
        # If we were trained with multiple outputs
        else:
            # Make an empty 2D array to hold predictions
            predictions = np.zeros((n_samples, self.n_outputs_))
            # For each output
            for k in xrange(self.n_outputs_):
                # Then use the index of the max prob to pick the class from classes_
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)
            # Return the results
            return predictions

    def predict_proba(self, X):
        """Predict class probabilities from the given X"""
        # Make sure the data is DTYPE for the tree and is 2D
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)
        # Get the dimensions of X
        n_samples, n_features = X.shape
        # Predict class from tree database
        proba = self.tree_.predict(X)
        # If we only have one output
        if self.n_outputs_ == 1:
            # Grab the predictions for the available classes
            proba = proba[:, :self.n_classes_]
            # Generate a normalizer from all the proba weights
            normalizer = proba.sum(axis=1)[:, np.newaxis]
            # Remap all of the zero normalizer elements to one
            # This is just so we can avoid dividing by zero
            normalizer[normalizer == 0.0] = 1.0
            # Now normalize the proba by the total weight sum of each sample
            proba /= normalizer
            # Return the results
            return proba
        # If we were trained with multiple outputs
        else:
            # Make an empty container to hold all proba
            all_proba = []
            # For each output
            for k in xrange(self.n_outputs_):
                # Grab the predictions for the available classes
                proba_k = proba[:, k, :self.n_classes_[k]]
                # Generate a normalizer from all the proba weights
                normalizer = proba_k.sum(axis=1)[:, np.newaxis]
                # Remap all of the zero normalizer elements to one
                # This is just so we can avoid dividing by zero
                normalizer[normalizer == 0.0] = 1.0
                # Now normalize the proba by the total weight sum of each sample
                proba_k /= normalizer
                # Collect the results for this output
                all_proba.append(proba_k)
            # Return the results
            return all_proba
Example #17
0
    def daal_fit(self, X, y):
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.double, np.single]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        self.n_features_ = X.shape[1]
        rs_ = check_random_state(self.random_state)

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        X_fptype = getFPType(X)
        seed_ = rs_.randint(0, np.iinfo('i').max)
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

        _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                     X.shape[1],
                                                     is_classification=False)

        # create algorithm
        dfr_algorithm = daal4py.decision_forest_regression_training(
            fptype=getFPType(X),
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                    self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap))

        dfr_trainingResult = dfr_algorithm.compute(X, y)

        # get resulting model
        model = dfr_trainingResult.model
        self.daal_model_ = model

        # convert model to estimators
        est = DecisionTreeRegressor(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # we need to set est.tree_ field with Trees constructed from Intel(R) DAAL solution
        estimators_ = []
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_

            tree_i_state_class = daal4py.getTreeState(model, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_, np.array([1], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self.estimators_ = estimators_
        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
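For the regression case the reconstructed Tree is created with a single "class" per
output (np.array([1], dtype=np.intp)), because a regression tree stores exactly one
value per node and output in tree_.value. A minimal sketch of that layout on
synthetic data (illustrative only):

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
reg = DecisionTreeRegressor(max_depth=2, random_state=0).fit(X, y)

# value has shape (node_count, n_outputs, 1): one mean target per node.
print(reg.tree_.value.shape)
# The root node holds the overall training mean.
print(reg.tree_.value[0, 0, 0], "~", y.mean())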
Example #18
0
class ModelTreeRegressor(DecisionTreeRegressor):
    """TODO"""
    def __init__(self, max_depth=3, min_samples_leaf=25, debug=False):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None  # the underlying sklearn Tree is built in fit, once n_features is known
        self.debug = debug

    def fit(self, X, y):
        # Build the underlying sklearn Tree now that the number of features is known
        self.tree = Tree(np.size(X, axis=1), np.array([1], dtype=np.intp), 1)

        self.children_left = [-1]
        self.children_right = [-1]
        self.node_count = 1
        self.feature = []
        self.threshold = []
        self.n_node_samples = []
        self.linear_models = [Ridge(alpha=0.01)]
        self.linear_models[0].fit(X, y)
        self.mse = [((self.linear_models[0].predict(X) - y)**2).sum()]

        self._split(X, y, 0)
        state = {
            'max_depth': self.max_depth,  # required by Tree.__setstate__
            'node_count': self.node_count,
            'values': np.array([[self.mse]], order='C').reshape((-1, 1, 1)),
            'nodes': np.array(
                [(a, z, e, r, t, y, u) for a, z, e, r, t, y, u in zip(
                    self.children_left, self.children_right, self.feature,
                    self.threshold, self.mse, self.n_node_samples,
                    self.n_node_samples)],
                dtype=self.tree.__getstate__()['nodes'].dtype)
        }
        # return state
        self.tree.__setstate__(state)
        return self

    def _split(self, X, y, node, depth=0):

        if depth >= self.max_depth:
            self.feature.append(-1)
            self.threshold.append(0.)
            self.n_node_samples.append(len(X))
            return

        if self.debug:
            print("")
            print("left {}".format(self.children_left))
            print("right {}".format(self.children_right))
            print("splitting node {}, {} points, mse={}".format(
                node, len(X), self.mse[node]))
            # print X
            # print y

        best_a = -1
        best_threshold = 0
        best_mse = self.mse[node]
        best_mask = None
        for a in range(np.size(X, axis=1)):
            if self.debug: print("attribute {}".format(a))
            arg = np.argsort(X[:, a])
            mask = np.ones(len(X), dtype=bool)  #1 to the right, 0 to the left
            for i in range(0, len(X) - self.min_samples_leaf):
                if i < self.min_samples_leaf or X[arg[i], a] == X[arg[i + 1],
                                                                  a]:
                    mask[arg[i]] = False
                    continue
                # if self.debug: print("  threshold {}".format(X[arg[i],a]))
                # L1=LinearRegression()
                # L2=LinearRegression()
                L1 = Ridge(alpha=0.01)
                L2 = Ridge(alpha=0.01)
                L1.fit(X[mask == False], y[mask == False])
                L2.fit(X[mask], y[mask])
                y1 = L1.predict(X[mask == False])
                y2 = L2.predict(X[mask])
                mse1 = ((y1 - y[mask == False])**2).sum()
                mse2 = ((y2 - y[mask])**2).sum()

                # print "    {} points, mse1={}".format(i,mse1)
                # print "    {} points, mse2={}".format(len(X)-i,mse2)
                mse = (float(i) / len(X)) * mse1 + \
                      (float(len(X) - i) / len(X)) * mse2
                # if self.debug: print("  total mse={}".format(mse))
                if mse < best_mse:
                    # print mask
                    best_a = a
                    best_threshold = (X[arg[i], a] + X[arg[i + 1], a]) / 2
                    best_mse = mse
                    best_mse1 = mse1
                    best_mse2 = mse2
                    best_l1 = L1
                    best_l2 = L2
                    best_mask = np.array(mask)
                mask[arg[i]] = False

                ####### ?????????????? To remove?########
                if self.debug: time.sleep(0.001)

        if best_a == -1:
            self.feature.append(-2)
            self.threshold.append(-2.)
            self.n_node_samples.append(len(X))
            return

        self.feature.append(best_a)
        self.threshold.append(best_threshold)
        self.n_node_samples.append(len(X))

        # create left child node
        self.children_left.append(-1)
        self.children_right.append(-1)
        self.mse.append(best_mse1)
        self.linear_models.append(best_l1)
        self.children_left[node] = self.node_count
        self.node_count = self.node_count + 1
        self._split(X[best_mask == False],
                    y[best_mask == False],
                    self.node_count - 1,
                    depth=depth + 1)

        # create right child node
        self.children_left.append(-1)
        self.children_right.append(-1)
        self.mse.append(best_mse2)
        self.linear_models.append(best_l2)
        self.children_right[node] = self.node_count
        self.node_count = self.node_count + 1
        self._split(X[best_mask],
                    y[best_mask],
                    self.node_count - 1,
                    depth=depth + 1)

    def predict(self, X):
        if len(np.shape(X)) == 1:
            X = np.array(X).reshape(1, -1)

        predicted = np.empty(len(X))
        nodes = self.tree.apply(np.array(X, dtype=np.float32))
        # print nodes
        for i, n in enumerate(nodes):
            # Ridge.predict expects a 2D array, so pass a single-row slice
            predict = self.linear_models[n].predict(np.asarray(X)[i:i + 1])[0]
            if predict < 0:
                predict = 0
            elif predict > 1:
                predict = 1
            predicted[i] = predict

        return predicted
Example #20
0
class BaseDecisionTree(
        six.with_metaclass(ABCMeta, BaseEstimator, _LearntSelectorMixin)):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """
    @abstractmethod
    def __init__(self, criterion, splitter, max_depth, min_samples_split,
                 min_samples_leaf, max_features, random_state):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None

        self.splitter_ = None
        self.tree_ = None

    def fit(self,
            X,
            y,
            sample_mask=None,
            X_argsorted=None,
            check_input=True,
            sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        # Convert data
        if check_input:
            X, = check_arrays(X,
                              dtype=DTYPE,
                              sparse_format="dense",
                              check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2**31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE
                    or not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features,
                                                self.min_samples_leaf,
                                                random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                          splitter, max_depth, min_samples_split,
                          self.min_samples_leaf, random_state)

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X is
        returned. For a regression model, the predicted value based on X is
        returned.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes, or the predict values.
        """
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise Exception("Tree not initialized. Perform a fit first")

        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             " match the input. Model n_features is %s and "
                             " input n_features is %s " %
                             (self.n_features_, n_features))

        proba = self.tree_.predict(X)

        # Classification
        if isinstance(self, ClassifierMixin):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)

            else:
                predictions = np.zeros((n_samples, self.n_outputs_))

                for k in range(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1), axis=0)

                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]

            else:
                return proba[:, :, 0]

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.tree_ is None:
            raise ValueError("Estimator not fitted, "
                             "call `fit` before `feature_importances_`.")

        return self.tree_.compute_feature_importances()
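The property above documents feature importance as the normalized total reduction of the splitting criterion (Gini importance). As a hedged illustration of that definition, not of `compute_feature_importances()` itself, the sketch below recomputes importances from a stock scikit-learn tree via the public `tree_` attributes and compares them to `feature_importances_`:

# Hedged sketch: recompute Gini importance from a fitted scikit-learn tree to
# make "normalized total criterion reduction" concrete. This illustrates the
# standard sklearn formula, not the custom class in the example above.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
t = clf.tree_

importances = np.zeros(clf.n_features_in_)
for node in range(t.node_count):
    left, right = t.children_left[node], t.children_right[node]
    if left == -1:  # leaf nodes contribute nothing
        continue
    # weighted impurity decrease produced by the split at this node
    importances[t.feature[node]] += (
        t.weighted_n_node_samples[node] * t.impurity[node]
        - t.weighted_n_node_samples[left] * t.impurity[left]
        - t.weighted_n_node_samples[right] * t.impurity[right])

importances /= t.weighted_n_node_samples[0]  # normalize by total sample weight
importances /= importances.sum()             # normalize to sum to one

print(np.round(importances, 3))              # should closely match the line below
print(np.round(clf.feature_importances_, 3))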
Example #21
0
class DecisionTreeClassifier(sk.DecisionTreeClassifier):

    def __init__(
            self,
            *,
            criterion="gini",
            splitter="best",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features=None,
            random_state=None,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            class_weight=None,
            ccp_alpha=0.0, e, s):
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state,
            min_impurity_decrease=min_impurity_decrease,
            ccp_alpha=ccp_alpha)

        self.e = e  # privacy budget (epsilon) consumed by addNoise below
        self.s = s

        def fit(self, X, y, sample_weight=None, check_input=True,
                X_idx_sorted=None):

            random_state = check_random_state(self.random_state)

            if self.ccp_alpha < 0.0:
                raise ValueError(
                    "ccp_alpha must be greater than or equal to 0")

            if check_input:
                # Need to validate separately here.
                # We can't pass multi_output=True because that would allow y to be
                # csr.
                check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
                check_y_params = dict(ensure_2d=False, dtype=None)
                X, y = self._validate_data(X, y,
                                           validate_separately=(check_X_params,
                                                                check_y_params))
                if issparse(X):
                    X.sort_indices()

                    if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                        raise ValueError("No support for np.int64 index based "
                                         "sparse matrices")

            # Determine output settings
            n_samples, self.n_features_ = X.shape
            is_classification = is_classifier(self)

            y = np.atleast_1d(y)
            expanded_class_weight = None

            if y.ndim == 1:
                # reshape is necessary to preserve the data contiguity;
                # [:, np.newaxis] would not preserve it.
                y = np.reshape(y, (-1, 1))

            self.n_outputs_ = y.shape[1]

            if is_classification:
                check_classification_targets(y)
                y = np.copy(y)
                # print(y)
                self.classes_ = []
                self.n_classes_ = []

                if self.class_weight is not None:
                    y_original = np.copy(y)

                y_encoded = np.zeros(y.shape, dtype=int)
                for k in range(self.n_outputs_):
                    classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                           return_inverse=True)
                    self.classes_.append(classes_k)
                    self.n_classes_.append(classes_k.shape[0])
                y = y_encoded

                if self.class_weight is not None:
                    expanded_class_weight = compute_sample_weight(
                        self.class_weight, y_original)

                self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
                y = np.ascontiguousarray(y, dtype=DOUBLE)

            # Check parameters
            max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                         else self.max_depth)
            max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                              else self.max_leaf_nodes)

            if isinstance(self.min_samples_leaf, numbers.Integral):
                if not 1 <= self.min_samples_leaf:
                    raise ValueError("min_samples_leaf must be at least 1 "
                                     "or in (0, 0.5], got %s"
                                     % self.min_samples_leaf)
                min_samples_leaf = self.min_samples_leaf
            else:  # float
                if not 0. < self.min_samples_leaf <= 0.5:
                    raise ValueError("min_samples_leaf must be at least 1 "
                                     "or in (0, 0.5], got %s"
                                     % self.min_samples_leaf)
                min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

            if isinstance(self.min_samples_split, numbers.Integral):
                if not 2 <= self.min_samples_split:
                    raise ValueError("min_samples_split must be an integer "
                                     "greater than 1 or a float in (0.0, 1.0]; "
                                     "got the integer %s"
                                     % self.min_samples_split)
                min_samples_split = self.min_samples_split
            else:  # float
                if not 0. < self.min_samples_split <= 1.:
                    raise ValueError("min_samples_split must be an integer "
                                     "greater than 1 or a float in (0.0, 1.0]; "
                                     "got the float %s"
                                     % self.min_samples_split)
                min_samples_split = int(
                    ceil(self.min_samples_split * n_samples))
                min_samples_split = max(2, min_samples_split)

            min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

            if isinstance(self.max_features, str):
                if self.max_features == "auto":
                    if is_classification:
                        max_features = max(1, int(np.sqrt(self.n_features_)))
                    else:
                        max_features = self.n_features_
                elif self.max_features == "sqrt":
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                elif self.max_features == "log2":
                    max_features = max(1, int(np.log2(self.n_features_)))
                else:
                    raise ValueError("Invalid value for max_features. "
                                     "Allowed string values are 'auto', "
                                     "'sqrt' or 'log2'.")
            elif self.max_features is None:
                max_features = self.n_features_
            elif isinstance(self.max_features, numbers.Integral):
                max_features = self.max_features
            else:  # float
                if self.max_features > 0.0:
                    max_features = max(1,
                                       int(self.max_features * self.n_features_))
                else:
                    max_features = 0

            self.max_features_ = max_features

            if len(y) != n_samples:
                raise ValueError("Number of labels=%d does not match "
                                 "number of samples=%d" % (len(y), n_samples))
            if not 0 <= self.min_weight_fraction_leaf <= 0.5:
                raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
            if max_depth <= 0:
                raise ValueError("max_depth must be greater than zero. ")
            if not (0 < max_features <= self.n_features_):
                raise ValueError("max_features must be in (0, n_features]")
            if not isinstance(max_leaf_nodes, numbers.Integral):
                raise ValueError("max_leaf_nodes must be integral number but was "
                                 "%r" % max_leaf_nodes)
            if -1 < max_leaf_nodes < 2:
                raise ValueError(("max_leaf_nodes {0} must be either None "
                                  "or larger than 1").format(max_leaf_nodes))

            if sample_weight is not None:
                sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

            if expanded_class_weight is not None:
                if sample_weight is not None:
                    sample_weight = sample_weight * expanded_class_weight
                else:
                    sample_weight = expanded_class_weight

            # Set min_weight_leaf from min_weight_fraction_leaf
            if sample_weight is None:
                min_weight_leaf = (self.min_weight_fraction_leaf *
                                   n_samples)
            else:
                min_weight_leaf = (self.min_weight_fraction_leaf *
                                   np.sum(sample_weight))

            min_impurity_split = self.min_impurity_split
            if min_impurity_split is not None:
                warnings.warn("The min_impurity_split parameter is deprecated. "
                              "Its default value has changed from 1e-7 to 0 in "
                              "version 0.23, and it will be removed in 0.25. "
                              "Use the min_impurity_decrease parameter instead.",
                              FutureWarning)

                if min_impurity_split < 0.:
                    raise ValueError("min_impurity_split must be greater than "
                                     "or equal to 0")
            else:
                min_impurity_split = 0

            if self.min_impurity_decrease < 0.:
                raise ValueError("min_impurity_decrease must be greater than "
                                 "or equal to 0")

            if getattr(self, 'presort', 'deprecated') != 'deprecated':
                warnings.warn("The parameter 'presort' is deprecated and has no "
                              "effect. It will be removed in v0.24. You can "
                              "suppress this warning by not passing any value "
                              "to the 'presort' parameter.",
                              FutureWarning)

            # Build tree
            criterion = self.criterion
            if not isinstance(criterion, Criterion):
                if is_classification:
                    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                             self.n_classes_)
                else:
                    criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                             n_samples)

            SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

            splitter = self.splitter
            if not isinstance(self.splitter, Splitter):
                splitter = SPLITTERS[self.splitter](criterion,
                                                    self.max_features_,
                                                    min_samples_leaf,
                                                    min_weight_leaf,
                                                    random_state)

            if is_classifier(self):
                self.tree_ = Tree(self.n_features_,
                                  self.n_classes_, self.n_outputs_)

            else:
                self.tree_ = Tree(self.n_features_,
                                  # TODO: tree shouldn't need this in this case
                                  np.array([1] * self.n_outputs_,
                                           dtype=np.intp),
                                  self.n_outputs_)

            # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
            if max_leaf_nodes < 0:
                builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                                min_samples_leaf,
                                                min_weight_leaf,
                                                max_depth,
                                                self.min_impurity_decrease,
                                                min_impurity_split)
            else:
                builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                               min_samples_leaf,
                                               min_weight_leaf,
                                               max_depth,
                                               max_leaf_nodes,
                                               self.min_impurity_decrease,
                                               min_impurity_split)

            builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
            # print(self.tree_.children_left.shape)
            if self.n_outputs_ == 1 and is_classifier(self):
                self.n_classes_ = self.n_classes_[0]
                self.classes_ = self.classes_[0]
            # print(self.tree_.weighted_n_node_samples)

            e = self.e
            # print(e)
            # for i in range(self.tree_.value.shape[0]):
            #
            # 	for j in range(self.tree_.value.shape[2]):
            #
            # 		self.e = e /((self.tree_.value[i][0][j] + max_depth))
            # 		#print(self.tree_.value[i][0][j])
            # 		self.tree_.value[i][0][j] = self.addNoise(self.tree_.value[i][0][j])
            # 		#print(self.tree_.value[i][0][j])

            # print(self.tree_.value[0][0])

            for i in range(self.tree_.value.shape[0]):
                fr = np.sum(self.tree_.value[i][0])
                self.e = e / (fr + max_depth)
                self.tree_.value[i][0] = self.addNoise(self.tree_.value[i][0])

            self._prune_tree()
            # print(self.tree_.value[0][0])
            return self

        def _validate_X_predict(self, X, check_input):
            """Validate X whenever one tries to predict, apply, predict_proba"""
            if check_input:
                X = check_array(X, dtype=DTYPE, accept_sparse="csr")
                if issparse(X) and (X.indices.dtype != np.intc or
                                    X.indptr.dtype != np.intc):
                    raise ValueError("No support for np.int64 index based "
                                     "sparse matrices")

            n_features = X.shape[1]
            if self.n_features_ != n_features:
                raise ValueError("Number of features of the model must "
                                 "match the input. Model n_features is %s and "
                                 "input n_features is %s "
                                 % (self.n_features_, n_features))
            #print(self.tree_.n_node_samples[self.tree_.children_left != -1])
            return X

        def predict(self, X, check_input=True):
            """Predict class or regression value for X.

            For a classification model, the predicted class for each sample in X is
            returned. For a regression model, the predicted value based on X is
            returned.

            Parameters
            ----------
            X : {array-like, sparse matrix} of shape (n_samples, n_features)
                    The input samples. Internally, it will be converted to
                    ``dtype=np.float32`` and if a sparse matrix is provided
                    to a sparse ``csr_matrix``.

            check_input : bool, default=True
                    Allows bypassing several input checks.
                    Don't use this parameter unless you know what you are doing.

            Returns
            -------
            y : array-like of shape (n_samples,) or (n_samples, n_outputs)
                    The predicted classes, or the predict values.
            """
            check_is_fitted(self)
            X = self._validate_X_predict(X, check_input)
            proba = self.tree_.predict(X)

            #proba = self.addNoise(proba)
            # print(proba)
            n_samples = X.shape[0]

            # Classification
            if is_classifier(self):
                if self.n_outputs_ == 1:
                    return self.classes_.take(np.argmax(proba, axis=1), axis=0)

                else:
                    class_type = self.classes_[0].dtype
                    predictions = np.zeros((n_samples, self.n_outputs_),
                                           dtype=class_type)
                    for k in range(self.n_outputs_):
                        predictions[:, k] = self.classes_[k].take(
                            np.argmax(proba[:, k], axis=1),
                            axis=0)

                    return predictions

        def addNoise(self, value):
            # print(proba)
            lp = laplace.Laplace().set_epsilon(
                self.e).set_epsilon_delta(self.e, 0).set_sensitivity(1)
            noisy_counts = np.zeros(value.shape[0])
            for i in range(noisy_counts.shape[0]):
                noisy_counts[i] = lp.randomise(value[i])

            return noisy_counts
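In the example above, addNoise perturbs each node's class counts with a Laplace mechanism, and the privacy budget is rescaled per node as e / (sum of the node's counts + max_depth). The following is a minimal, self-contained sketch of the same noise step; the laplace.Laplace() mechanism is replaced by NumPy's Laplace sampler, sensitivity 1 is taken from the example, and the helper name add_laplace_noise is hypothetical.

# Minimal sketch of the Laplace-noise step, assuming sensitivity 1 and the
# per-node epsilon rescaling used above; draws noise with NumPy instead of the
# example's laplace.Laplace() mechanism.
import numpy as np

def add_laplace_noise(counts, epsilon, sensitivity=1.0, rng=None):
    """Return counts perturbed with Laplace(scale = sensitivity / epsilon) noise."""
    rng = np.random.default_rng() if rng is None else rng
    return counts + rng.laplace(loc=0.0, scale=sensitivity / epsilon, size=counts.shape)

# One leaf's class counts, with epsilon rescaled as e / (node sample count + max_depth)
leaf_counts = np.array([30.0, 5.0, 15.0])
e, max_depth = 1.0, 5
epsilon_node = e / (leaf_counts.sum() + max_depth)
print(add_laplace_noise(leaf_counts, epsilon_node))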
Example #22
0
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_

        if LooseVersion(sklearn_version) >= LooseVersion("0.22"):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        classes_ = self.classes_[0]
        n_classes_ = self.n_classes_[0]
        # convert model to estimators
        params = {
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'random_state': None,
        }
        if not sklearn_check_version('1.0'):
            params['min_impurity_split'] = self.min_impurity_split
        est = DecisionTreeClassifier(**params)
        # we need to set est.tree_ field with Trees constructed from Intel(R)
        # oneAPI Data Analytics Library solution
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            # print("Tree #{}".format(i))
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = classes_
            est_i.n_classes_ = n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth',
            # 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i,
                                                      n_classes_)

            # node_ndarray = tree_i_state_class.node_ar
            # value_ndarray = tree_i_state_class.value_ar
            # value_shape = (node_ndarray.shape[0], self.n_outputs_,
            #                n_classes_)
            # assert np.allclose(
            #     value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')
            # ), "Value array is non-integer"
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }
            est_i.tree_ = Tree(self.n_features_in_,
                               np.array([n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self._cached_estimators_ = estimators_
        return estimators_

    def fit(self,
            X,
            y,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None):
        """Build a newsvendor decision tree regressor from the training set (X, y).

        Method is based on [1] and was adapted to enable usage of the newsvendor criterion

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (real numbers). Use ``dtype=np.float64`` and
            ``order='C'`` for maximum efficiency.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node.
        check_input : bool, default=True
            Allows bypassing several input checks.
            Don't use this parameter unless you know what you are doing.
        X_idx_sorted : array-like of shape (n_samples, n_features), \
            default=None
            The indexes of the sorted training input samples. If many trees
            are grown on the same dataset, this allows the ordering to be
            cached between trees. If None, the data will be sorted here.
            Don't use this parameter unless you know what you are doing.
        Returns
        -------
        self : NewsvendorDecisionTreeRegressor
            Fitted estimator.

        References
        ----------
        [1] scikit-learn, BaseDecisionTree.fit()
            <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_classes.py>
        """

        random_state = check_random_state(self.random_state)

        if self.ccp_alpha < 0.0:
            raise ValueError("ccp_alpha must be greater than or equal to 0")

        # Need to validate separately here.
        # We can't pass multi_output=True because that would allow y to be
        # csr.
        check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
        check_y_params = dict(ensure_2d=False, dtype=None)

        X, y = self._validate_data(X,
                                   y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))

        if issparse(X):
            X.sort_indices()
            if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

        # Determine output settings
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # [:, np.newaxis] would not preserve it.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        self.cu_, self.co_ = check_cu_co(self.cu, self.co, self.n_outputs_)

        max_depth = (np.iinfo(np.int32).max
                     if self.max_depth is None else self.max_depth)
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None else
                          self.max_leaf_nodes)

        if isinstance(self.min_samples_leaf, numbers.Integral):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0. < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, numbers.Integral):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s" % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s" % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError("Invalid value for max_features. "
                                 "Allowed string values are 'auto', "
                                 "'sqrt' or 'log2'.")
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if not isinstance(max_leaf_nodes, numbers.Integral):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {0} must be either None "
                              "or larger than 1").format(max_leaf_nodes))

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = (self.min_weight_fraction_leaf * n_samples)
        else:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))

        min_impurity_split = self.min_impurity_split
        if min_impurity_split is not None:
            warnings.warn(
                "The min_impurity_split parameter is deprecated. "
                "Its default value has changed from 1e-7 to 0 in "
                "version 0.23, and it will be removed in 0.25. "
                "Use the min_impurity_decrease parameter instead.",
                FutureWarning)

            if min_impurity_split < 0.:
                raise ValueError("min_impurity_split must be greater than "
                                 "or equal to 0")
        else:
            min_impurity_split = 0

        if self.min_impurity_decrease < 0.:
            raise ValueError("min_impurity_decrease must be greater than "
                             "or equal to 0")

        # Build tree
        criterion = NewsvendorCriterion(self.n_outputs_, n_samples, self.cu_,
                                        self.co_)

        SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, self.max_features_,
                                                min_samples_leaf,
                                                min_weight_leaf, random_state)

        self.tree_ = Tree(
            self.n_features_,
            # TODO: tree shouldn't need this in this case
            np.array([1] * self.n_outputs_, dtype=np.intp),
            self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                            min_samples_leaf, min_weight_leaf,
                                            max_depth,
                                            self.min_impurity_decrease,
                                            min_impurity_split)
        else:
            builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                           min_samples_leaf, min_weight_leaf,
                                           max_depth, max_leaf_nodes,
                                           self.min_impurity_decrease,
                                           min_impurity_split)

        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted=None)

        self._prune_tree()

        return self
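The fit above plugs a NewsvendorCriterion (not shown here) into the standard tree builder, so splits are scored by newsvendor cost with underage costs cu and overage costs co instead of squared error. As a hedged illustration of that objective rather than of the criterion implementation, the sketch below computes the cost-minimizing prediction for the demands in one leaf, which is the cu / (cu + co) empirical quantile, and compares its average cost against predicting the leaf mean:

# Hedged illustration of the newsvendor objective (not the NewsvendorCriterion
# implementation): within a leaf, the cost-minimizing prediction is the
# cu / (cu + co) empirical quantile of the observed demands.
import numpy as np

def newsvendor_cost(y_leaf, q, cu, co):
    """Average underage/overage cost of predicting quantity q for demands y_leaf."""
    return np.mean(cu * np.maximum(y_leaf - q, 0) + co * np.maximum(q - y_leaf, 0))

y_leaf = np.array([4.0, 7.0, 9.0, 12.0, 20.0])  # demands falling into one leaf
cu, co = 3.0, 1.0                                # underage 3x as costly as overage

q_star = np.quantile(y_leaf, cu / (cu + co))     # newsvendor-optimal quantity
q_mean = y_leaf.mean()                           # what a squared-error tree would predict

print(q_star, newsvendor_cost(y_leaf, q_star, cu, co))  # lower average cost
print(q_mean, newsvendor_cost(y_leaf, q_mean, cu, co))  # higher average cost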