def deserialize_tree(tree_dict, n_features, n_classes, n_outputs):
    tree_dict['nodes'] = [tuple(lst) for lst in tree_dict['nodes']]
    names = [
        'left_child', 'right_child', 'feature', 'threshold', 'impurity',
        'n_node_samples', 'weighted_n_node_samples'
    ]
    tree_dict['nodes'] = np.array(
        tree_dict['nodes'],
        dtype=np.dtype({'names': names, 'formats': tree_dict['nodes_dtype']}))
    tree_dict['values'] = np.array(tree_dict['values'])

    tree = Tree(n_features, np.array([n_classes], dtype=np.intp), n_outputs)
    tree.__setstate__(tree_dict)
    return tree
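A minimal round-trip sketch for deserialize_tree, assuming the serializer mirrors it by dumping Tree.__getstate__() into JSON-friendly lists, and assuming a scikit-learn build whose node record has exactly the seven fields named above (newer releases add an eighth field, missing_go_to_left):

# Hedged round-trip check: serialize a fitted tree to plain lists, then
# rebuild it with deserialize_tree. The field layout is an assumption.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(*load_iris(return_X_y=True))

state = clf.tree_.__getstate__()
tree_dict = {
    'max_depth': state['max_depth'],
    'node_count': state['node_count'],
    'nodes': [list(rec) for rec in state['nodes']],  # structured array -> lists
    'nodes_dtype': [state['nodes'].dtype[i].str
                    for i in range(len(state['nodes'].dtype))],
    'values': state['values'].tolist(),
}

rebuilt = deserialize_tree(tree_dict, clf.n_features_in_,
                           clf.n_classes_, clf.n_outputs_)
assert np.allclose(rebuilt.value, clf.tree_.value)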
def estimators_(self):
    if hasattr(self, '_cached_estimators_'):
        if self._cached_estimators_:
            return self._cached_estimators_

    if LooseVersion(sklearn_version) >= LooseVersion("0.22"):
        check_is_fitted(self)
    else:
        check_is_fitted(self, 'daal_model_')
    # convert model to estimators
    est = DecisionTreeRegressor(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ field with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_

        tree_i_state_class = daal4py.getTreeState(self.daal_model_, i)
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': tree_i_state_class.node_ar,
            'values': tree_i_state_class.value_ar
        }

        est_i.tree_ = Tree(self.n_features_,
                           np.array([1], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    # cache the result so the check at the top of the property is not dead
    # code and repeated accesses do not rebuild the trees
    self._cached_estimators_ = estimators_
    return estimators_
def _estimators_(self):
    if hasattr(self, '_cached_estimators_'):
        if self._cached_estimators_:
            return self._cached_estimators_

    if LooseVersion(sklearn_version) >= LooseVersion("0.22"):
        check_is_fitted(self)
    else:
        check_is_fitted(self, 'daal_model_')
    classes_ = self.classes_[0]
    n_classes_ = self.n_classes_[0]
    # convert model to estimators
    params = {
        'criterion': self.criterion,
        'max_depth': self.max_depth,
        'min_samples_split': self.min_samples_split,
        'min_samples_leaf': self.min_samples_leaf,
        'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
        'max_features': self.max_features,
        'max_leaf_nodes': self.max_leaf_nodes,
        'min_impurity_decrease': self.min_impurity_decrease,
        'random_state': None,
    }
    if not sklearn_check_version('1.0'):
        params['min_impurity_split'] = self.min_impurity_split
    est = DecisionTreeClassifier(**params)

    # we need to set est.tree_ field with Trees constructed from the
    # Intel(R) oneAPI Data Analytics Library solution
    estimators_ = []
    random_state_checked = check_random_state(self.random_state)
    for i in range(self.n_estimators):
        est_i = clone(est)
        est_i.set_params(
            random_state=random_state_checked.randint(np.iinfo(np.int32).max))
        if sklearn_check_version('1.0'):
            est_i.n_features_in_ = self.n_features_in_
        else:
            est_i.n_features_ = self.n_features_in_
        est_i.n_outputs_ = self.n_outputs_
        est_i.classes_ = classes_
        est_i.n_classes_ = n_classes_
        # treeState members: 'class_count', 'leaf_count', 'max_depth',
        # 'node_ar', 'node_count', 'value_ar'
        tree_i_state_class = daal4py.getTreeState(self.daal_model_, i,
                                                  n_classes_)
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': tree_i_state_class.node_ar,
            'values': tree_i_state_class.value_ar
        }

        est_i.tree_ = Tree(self.n_features_in_,
                           np.array([n_classes_], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self._cached_estimators_ = estimators_
    return estimators_
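Once either property has materialized the forest, each element is a plain scikit-learn decision tree, so the usual introspection tools apply. A hedged sketch, where `model` stands for a hypothetical fitted daal4py-backed forest exposing the property above:

from sklearn.tree import export_text

first = model.estimators_[0]
print(first.tree_.node_count)  # nodes copied from daal4py.getTreeState
print(export_text(first,
                  feature_names=[f"f{i}" for i in range(model.n_features_in_)]))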
def fit(self, X, y, sample_mask=None, X_argsorted=None,
        check_input=True, sample_weight=None):
    """Build a decision tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples. Use ``dtype=np.float32``
        for maximum efficiency.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression). Use
        ``dtype=np.float64`` and ``order='C'`` for maximum efficiency.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    check_input : boolean, (default=True)
        Allow to bypass several input checks.
        Don't use this parameter unless you know what you do.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    # Deprecations
    if sample_mask is not None:
        warn("The sample_mask parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    if X_argsorted is not None:
        warn("The X_argsorted parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    # Convert data
    if check_input:
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                          check_ccontiguous=True)

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = isinstance(self, ClassifierMixin)

    y = np.atleast_1d(y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not guarantee
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        for k in xrange(self.n_outputs_):
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
    else:
        self.classes_ = [None] * self.n_outputs_
        self.n_classes_ = [1] * self.n_outputs_

    self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

    if isinstance(self.max_features, six.string_types):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if self.min_samples_split <= 0:
        raise ValueError("min_samples_split must be greater than zero.")
    if self.min_samples_leaf <= 0:
        raise ValueError("min_samples_leaf must be greater than zero.")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if sample_weight is not None:
        if (getattr(sample_weight, "dtype", None) != DOUBLE or
                not sample_weight.flags.contiguous):
            sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
        if len(sample_weight.shape) > 1:
            raise ValueError("Sample weights array has more "
                             "than one dimension: %d" %
                             len(sample_weight.shape))
        if len(sample_weight) != n_samples:
            raise ValueError("Number of weights=%d does not match "
                             "number of samples=%d" %
                             (len(sample_weight), n_samples))

    # Set min_samples_split sensibly
    min_samples_split = max(self.min_samples_split,
                            2 * self.min_samples_leaf)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion, max_features,
                                            self.min_samples_leaf,
                                            random_state)

    self.criterion_ = criterion
    self.splitter_ = splitter
    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                      splitter, max_depth, min_samples_split,
                      self.min_samples_leaf, random_state)

    self.tree_.build(X, y, sample_weight=sample_weight)

    if self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def __init__(self, n_features, max_depth=3, min_samples_leaf=25, debug=False):
    self.max_depth = max_depth
    self.min_samples_leaf = min_samples_leaf
    # The training data X is not in scope inside __init__, so the caller
    # must supply the feature count explicitly; Tree also expects an intp
    # array of per-output class counts.
    self.tree = Tree(n_features, np.array([1], dtype=np.intp), 1)
    self.debug = debug
def fit(self, X, y, sample_weight=None, check_input=True,
        X_idx_sorted=None):

    random_state = check_random_state(self.random_state)

    if self.ccp_alpha < 0.0:
        raise ValueError("ccp_alpha must be greater than or equal to 0")

    if check_input:
        # Need to validate separately here.
        # We can't pass multi_output=True because that would allow y to be
        # csr.
        check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
        check_y_params = dict(ensure_2d=False, dtype=None)
        X, y = self._validate_data(X, y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))
        if issparse(X):
            X.sort_indices()

            if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = is_classifier(self)

    y = np.atleast_1d(y)
    expanded_class_weight = None

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not guarantee
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        check_classification_targets(y)
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        if self.class_weight is not None:
            y_original = np.copy(y)

        y_encoded = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                   return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_encoded

        if self.class_weight is not None:
            expanded_class_weight = compute_sample_weight(
                self.class_weight, y_original)

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                 else self.max_depth)
    max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes)

    if isinstance(self.min_samples_leaf, numbers.Integral):
        if not 1 <= self.min_samples_leaf:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = self.min_samples_leaf
    else:  # float
        if not 0. < self.min_samples_leaf <= 0.5:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

    if isinstance(self.min_samples_split, numbers.Integral):
        if not 2 <= self.min_samples_split:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the integer %s"
                             % self.min_samples_split)
        min_samples_split = self.min_samples_split
    else:  # float
        if not 0. < self.min_samples_split <= 1.:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the float %s"
                             % self.min_samples_split)
        min_samples_split = int(ceil(self.min_samples_split * n_samples))
        min_samples_split = max(2, min_samples_split)

    min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

    if isinstance(self.max_features, str):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError("Invalid value for max_features. "
                             "Allowed string values are 'auto', "
                             "'sqrt' or 'log2'.")
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1,
                               int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")
    if not isinstance(max_leaf_nodes, numbers.Integral):
        raise ValueError("max_leaf_nodes must be integral number but was "
                         "%r" % max_leaf_nodes)
    if -1 < max_leaf_nodes < 2:
        raise ValueError(("max_leaf_nodes {0} must be either None "
                          "or larger than 1").format(max_leaf_nodes))

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Set min_weight_leaf from min_weight_fraction_leaf
    if sample_weight is None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           n_samples)
    else:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))

    min_impurity_split = self.min_impurity_split
    if min_impurity_split is not None:
        warnings.warn("The min_impurity_split parameter is deprecated. "
                      "Its default value has changed from 1e-7 to 0 in "
                      "version 0.23, and it will be removed in 0.25. "
                      "Use the min_impurity_decrease parameter instead.",
                      FutureWarning)

        if min_impurity_split < 0.:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")
    else:
        min_impurity_split = 0

    if self.min_impurity_decrease < 0.:
        raise ValueError("min_impurity_decrease must be greater than "
                         "or equal to 0")

    if self.presort != 'deprecated':
        warnings.warn("The parameter 'presort' is deprecated and has no "
                      "effect. It will be removed in v0.24. You can "
                      "suppress this warning by not passing any value "
                      "to the 'presort' parameter.",
                      FutureWarning)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                     n_samples)

    SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            self.max_features_,
                                            min_samples_leaf,
                                            min_weight_leaf,
                                            random_state)

    if is_classifier(self):
        self.tree_ = Tree(self.n_features_,
                          self.n_classes_, self.n_outputs_)
    else:
        self.tree_ = Tree(self.n_features_,
                          # TODO: the tree shouldn't need this in this case
                          np.array([1] * self.n_outputs_, dtype=np.intp),
                          self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if max_leaf_nodes < 0:
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        min_samples_leaf,
                                        min_weight_leaf,
                                        max_depth,
                                        self.min_impurity_decrease,
                                        min_impurity_split)
    else:
        builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                       min_samples_leaf,
                                       min_weight_leaf,
                                       max_depth,
                                       max_leaf_nodes,
                                       self.min_impurity_decrease,
                                       min_impurity_split)

    builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

    if self.n_outputs_ == 1 and is_classifier(self):
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    # rescale the privacy budget e by each node's total sample count plus
    # max_depth, then perturb the per-node class counts before pruning
    e = self.e
    for i in range(self.tree_.value.shape[0]):
        fr = np.sum(self.tree_.value[i][0])
        self.e = e / (fr + max_depth)
        self.tree_.value[i][0] = self.addNoise(self.tree_.value[i][0])

    self._prune_tree()

    return self
def daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not guarantee
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(_class_name + " does not currently support "
                         "multi-output data. Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap))
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # convert model to estimators
    est = DecisionTreeClassifier(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ field with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_
        est_i.classes_ = self.classes_
        est_i.n_classes_ = self.n_classes_
        # treeState members: 'class_count', 'leaf_count', 'max_depth',
        # 'node_ar', 'node_count', 'value_ar'
        tree_i_state_class = daal4py.getTreeState(model, i, self.n_classes_)

        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': tree_i_state_class.node_ar,
            'values': tree_i_state_class.value_ar
        }

        # the Tree must be constructed before __setstate__ can populate it
        est_i.tree_ = Tree(self.n_features_,
                           np.array([self.n_classes_], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self.estimators_ = estimators_

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.double, np.single]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    y = check_array(y, ensure_2d=False, dtype=X.dtype)
    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not guarantee
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    self.n_features_ = X.shape[1]
    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap))

    dfr_trainingResult = dfr_algorithm.compute(X, y)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # convert model to estimators
    est = DecisionTreeRegressor(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ field with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_

        tree_i_state_class = daal4py.getTreeState(model, i)
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': tree_i_state_class.node_ar,
            'values': tree_i_state_class.value_ar
        }

        est_i.tree_ = Tree(self.n_features_,
                           np.array([1], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self.estimators_ = estimators_

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
if getattr(y_train, "dtype", None) != DOUBLE or not y_train.flags.contiguous:
    y_train = np.ascontiguousarray(y_train, dtype=DOUBLE)

max_depth = np.iinfo(np.int32).max if max_depth is None else max_depth
max_leaf_nodes = -1 if max_leaf_nodes is None else max_leaf_nodes
max_features = max(1, int(np.sqrt(n_features_)))

criterion = CRITERIA_CLF[criterion](n_outputs_, n_classes_)
SPLITTERS = DENSE_SPLITTERS
splitter = SPLITTERS[splitter](criterion, max_features, min_samples_leaf,
                               min_weight_leaf, random_state)

tree_ = Tree(n_features_, n_classes_, n_outputs_)
builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                min_samples_leaf, min_weight_leaf,
                                max_depth, min_impurity_decrease,
                                min_impurity_split)
builder.build(tree_, X_train, y_train)

classes_ = classes_[0]
n_classes_ = np.atleast_1d(n_classes_)
pruned_tree = Tree(n_features_, n_classes_, n_outputs_)
_build_pruned_tree_ccp(pruned_tree, tree_, 0)
tree_ = pruned_tree

X_test = check_array(X_test, dtype=DTYPE, accept_sparse="csr")
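To make the fragment's intent concrete, here is a hedged sketch of scoring the pruned tree on X_test: the low-level Tree.predict returns each sample's leaf value array, and for a classifier the predicted label is the argmax over class counts mapped back through classes_ (y_test is assumed to exist alongside X_test):

# hedged continuation: evaluate the CCP-pruned tree on the test split
proba = tree_.predict(X_test)  # shape (n_samples, n_outputs, n_classes)
y_pred = classes_.take(np.argmax(proba[:, 0, :], axis=1))
print("accuracy:", np.mean(y_pred == y_test))  # y_test is an assumption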
def fit(self, X, y, sample_weight=None, check_input=True,
        X_idx_sorted=None):
    """Build a survival tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    check_input : boolean, default: True
        Allow to bypass several input checks.
        Don't use this parameter unless you know what you do.

    X_idx_sorted : array-like, shape = (n_samples, n_features), optional
        The indexes of the sorted training input samples. If many trees
        are grown on the same dataset, this allows the ordering to be
        cached between trees. If None, the data will be sorted here.
        Don't use this parameter unless you know what to do.

    Returns
    -------
    self
    """
    random_state = check_random_state(self.random_state)

    if check_input:
        X, event, time = check_arrays_survival(X, y)
        time = time.astype(np.float64)
        self.event_times_ = np.unique(time[event])

        y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
        y_numeric[:, 0] = time
        y_numeric[:, 1] = event.astype(np.float64)
    else:
        y_numeric, self.event_times_ = y

    n_samples, self.n_features_ = X.shape
    params = self._check_params(n_samples)

    self.n_outputs_ = self.event_times_.shape[0]
    # one "class" for CHF, one for survival function
    self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

    # Build tree
    criterion = LogrankCriterion(self.n_outputs_, n_samples,
                                 self.event_times_)

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = DENSE_SPLITTERS[self.splitter](
            criterion, self.max_features_, params["min_samples_leaf"],
            params["min_weight_leaf"], random_state)

    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if params["max_leaf_nodes"] < 0:
        builder = DepthFirstTreeBuilder(
            splitter,
            params["min_samples_split"],
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            params["max_depth"],
            0.0,  # min_impurity_decrease
            params["min_impurity_split"])
    else:
        builder = BestFirstTreeBuilder(
            splitter,
            params["min_samples_split"],
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            params["max_depth"],
            params["max_leaf_nodes"],
            0.0,  # min_impurity_decrease
            params["min_impurity_split"])

    builder.build(self.tree_, X, y_numeric, sample_weight, X_idx_sorted)

    return self
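A hedged usage sketch, assuming the fit method above belongs to scikit-survival's SurvivalTree; load_whas500 supplies the structured (event, time) target:

from sksurv.datasets import load_whas500
from sksurv.tree import SurvivalTree

X, y = load_whas500()
X = X.select_dtypes("number")      # keep numeric columns only
tree = SurvivalTree(max_depth=4)
tree.fit(X.values, y)
risk = tree.predict(X.values[:5])  # higher score = higher risk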
def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result
    as `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        ``bins[i-1] <= x < bins[i]`` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the function
        throws an exception when `right=False`. However, this could be
        overcome by using :epkg:`ONNX` where all kinds of decision rules
        are implemented. The default value for *right* is still *False*
        to follow the *numpy* API even though this value raises an
        exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))

    See also example :ref:`l-example-digitize`.

    .. versionadded:: 0.4
    """
    if not right:
        raise RuntimeError(
            "right must be True not right=%r" % right)
    ascending = len(bins) <= 1 or bins[0] < bins[1]

    if not ascending:
        bins2 = bins[::-1]
        cl = digitize2tree(bins2, right=right)
        n = len(bins)
        for i in range(cl.tree_.value.shape[0]):
            cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def add_root(index):
        if index < 0 or index >= len(bins):
            raise IndexError(  # pragma: no cover
                "Unexpected index %d / len(bins)=%d." % (
                    index, len(bins)))
        parent = -1
        is_left = False
        is_leaf = False
        threshold = bins[index]
        n = tree_add_node(
            tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.)
        values.append(UNUSED)
        n_nodes.append(n)
        return n

    def add_nodes(parent, i, j, is_left):
        # add for bins[i:j] (j excluded)
        if is_left:
            # it means j is the parent split
            if i == j:
                # leaf
                n = tree_add_node(tree, parent, is_left, True,
                                  0, 0, 0, 1, 1.)
                n_nodes.append(n)
                values.append(i)
                return n
            if i + 1 == j:
                # split
                values.append(UNUSED)
                th = bins[i]
                n = tree_add_node(tree, parent, is_left, False,
                                  0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, i, True)
                add_nodes(n, i, j, False)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False,
                                  0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        else:
            # it means i is the parent split
            if i + 1 == j:
                # leaf
                values.append(j)
                n = tree_add_node(tree, parent, is_left, True,
                                  0, 0, 0, 1, 1.)
                n_nodes.append(n)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False,
                                  0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        raise NotImplementedError(  # pragma: no cover
            "Unexpected case where i=%r, j=%r, is_left=%r." % (
                i, j, is_left))

    index = len(bins) // 2
    add_root(index)
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(  # pylint: disable=E1137
        values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    try:
        # scikit-learn >= 0.24
        cl.n_features_in_ = 1
    except AttributeError:
        # scikit-learn < 0.24
        cl.n_features_ = 1
    try:
        # for scikit-learn <= 0.23.2
        cl.n_features_ = 1
    except AttributeError:
        pass
    return cl
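The descending-bins branch above reverses the bins, builds the ascending tree, and flips each leaf value to `n - i`. A short check in the style of the docstring example, exercising that branch:

import numpy
from mlinsights.mltree import digitize2tree

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins_desc = numpy.array([7.0, 4.0, 2.5, 1.0, 0.0])  # monotonically decreasing
tree = digitize2tree(bins_desc, right=True)
print(numpy.digitize(x, bins_desc, right=True))
print(tree.predict(x.reshape((-1, 1))))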
def fit(self, X, y, sample_weight=None, check_input=True,
        X_idx_sorted="deprecated"):
    """Build a survival tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    check_input : boolean, default: True
        Allow to bypass several input checks.
        Don't use this parameter unless you know what you do.

    X_idx_sorted : deprecated, default="deprecated"
        This parameter is deprecated and has no effect.

    Returns
    -------
    self
    """
    random_state = check_random_state(self.random_state)

    if check_input:
        X, event, time = check_arrays_survival(X, y)
        time = time.astype(np.float64)
        self.event_times_ = np.unique(time[event])

        y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
        y_numeric[:, 0] = time
        y_numeric[:, 1] = event.astype(np.float64)
    else:
        y_numeric, self.event_times_ = y

    n_samples, self.n_features_ = X.shape
    self.n_features_in_ = self.n_features_
    params = self._check_params(n_samples)

    if not isinstance(X_idx_sorted, str) or X_idx_sorted != "deprecated":
        warnings.warn(
            "The parameter 'X_idx_sorted' is deprecated and has no "
            "effect. It will be removed in sklearn 1.1 (renaming of "
            "0.26). You can suppress this warning by not passing any "
            "value to the 'X_idx_sorted' parameter.",
            FutureWarning)

    self.n_outputs_ = self.event_times_.shape[0]
    # one "class" for CHF, one for survival function
    self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

    # Build tree
    criterion = LogrankCriterion(self.n_outputs_, n_samples,
                                 self.event_times_)

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = DENSE_SPLITTERS[self.splitter](
            criterion, self.max_features_, params["min_samples_leaf"],
            params["min_weight_leaf"], random_state)

    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if params["max_leaf_nodes"] < 0:
        builder = DepthFirstTreeBuilder(
            splitter,
            params["min_samples_split"],
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            params["max_depth"],
            0.0,  # min_impurity_decrease
            params["min_impurity_split"])
    else:
        builder = BestFirstTreeBuilder(
            splitter,
            params["min_samples_split"],
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            params["max_depth"],
            params["max_leaf_nodes"],
            0.0,  # min_impurity_decrease
            params["min_impurity_split"])

    builder.build(self.tree_, X, y_numeric, sample_weight)

    return self
def fit(self, X, y, sample_weight=None, check_input=True,
        X_idx_sorted=None):
    """Build a newsvendor decision tree regressor from the training set
    (X, y). The method is based on [1] and was adapted to enable usage
    of the newsvendor criterion.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (real numbers). Use ``dtype=np.float64`` and
        ``order='C'`` for maximum efficiency.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node.

    check_input : bool, default=True
        Allow to bypass several input checks.
        Don't use this parameter unless you know what you do.

    X_idx_sorted : array-like of shape (n_samples, n_features), \
            default=None
        The indexes of the sorted training input samples. If many trees
        are grown on the same dataset, this allows the ordering to be
        cached between trees. If None, the data will be sorted here.
        Don't use this parameter unless you know what to do.

    Returns
    -------
    self : NewsvendorDecisionTreeRegressor
        Fitted estimator.

    References
    ----------
    [1] scikit-learn, BaseDecisionTree.fit()
        <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_classes.py>
    """
    random_state = check_random_state(self.random_state)

    if self.ccp_alpha < 0.0:
        raise ValueError("ccp_alpha must be greater than or equal to 0")

    # Need to validate separately here.
    # We can't pass multi_output=True because that would allow y to be
    # csr.
    check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
    check_y_params = dict(ensure_2d=False, dtype=None)
    X, y = self._validate_data(X, y,
                               validate_separately=(check_X_params,
                                                    check_y_params))
    if issparse(X):
        X.sort_indices()

        if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
            raise ValueError("No support for np.int64 index based "
                             "sparse matrices")

    # Determine output settings
    n_samples, self.n_features_ = X.shape

    y = np.atleast_1d(y)
    expanded_class_weight = None

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not guarantee
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    self.cu_, self.co_ = check_cu_co(self.cu, self.co, self.n_outputs_)

    max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                 else self.max_depth)
    max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes)

    if isinstance(self.min_samples_leaf, numbers.Integral):
        if not 1 <= self.min_samples_leaf:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = self.min_samples_leaf
    else:  # float
        if not 0. < self.min_samples_leaf <= 0.5:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

    if isinstance(self.min_samples_split, numbers.Integral):
        if not 2 <= self.min_samples_split:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the integer %s"
                             % self.min_samples_split)
        min_samples_split = self.min_samples_split
    else:  # float
        if not 0. < self.min_samples_split <= 1.:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the float %s"
                             % self.min_samples_split)
        min_samples_split = int(ceil(self.min_samples_split * n_samples))
        min_samples_split = max(2, min_samples_split)

    min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

    if isinstance(self.max_features, str):
        if self.max_features == "auto":
            max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError("Invalid value for max_features. "
                             "Allowed string values are 'auto', "
                             "'sqrt' or 'log2'.")
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1,
                               int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")
    if not isinstance(max_leaf_nodes, numbers.Integral):
        raise ValueError("max_leaf_nodes must be integral number but was "
                         "%r" % max_leaf_nodes)
    if -1 < max_leaf_nodes < 2:
        raise ValueError(("max_leaf_nodes {0} must be either None "
                          "or larger than 1").format(max_leaf_nodes))

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Set min_weight_leaf from min_weight_fraction_leaf
    if sample_weight is None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           n_samples)
    else:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))

    min_impurity_split = self.min_impurity_split
    if min_impurity_split is not None:
        warnings.warn("The min_impurity_split parameter is deprecated. "
                      "Its default value has changed from 1e-7 to 0 in "
                      "version 0.23, and it will be removed in 0.25. "
                      "Use the min_impurity_decrease parameter instead.",
                      FutureWarning)

        if min_impurity_split < 0.:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")
    else:
        min_impurity_split = 0

    if self.min_impurity_decrease < 0.:
        raise ValueError("min_impurity_decrease must be greater than "
                         "or equal to 0")

    # Build tree
    criterion = NewsvendorCriterion(self.n_outputs_, n_samples,
                                    self.cu_, self.co_)

    SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            self.max_features_,
                                            min_samples_leaf,
                                            min_weight_leaf,
                                            random_state)

    self.tree_ = Tree(self.n_features_,
                      # TODO: the tree shouldn't need this in this case
                      np.array([1] * self.n_outputs_, dtype=np.intp),
                      self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if max_leaf_nodes < 0:
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        min_samples_leaf,
                                        min_weight_leaf,
                                        max_depth,
                                        self.min_impurity_decrease,
                                        min_impurity_split)
    else:
        builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                       min_samples_leaf,
                                       min_weight_leaf,
                                       max_depth,
                                       max_leaf_nodes,
                                       self.min_impurity_decrease,
                                       min_impurity_split)

    builder.build(self.tree_, X, y, sample_weight, X_idx_sorted=None)

    self._prune_tree()

    return self