def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
    """Build a decision tree from (X, y), then perturb its node values.

    After the tree has been grown, the value array of every node is
    replaced by ``self.addNoise(value)``. While a node is processed,
    ``self.e`` is temporarily set to ``e / (sum(value) + max_depth)``,
    where ``e`` is the value ``self.e`` had when ``fit`` was called; the
    original value is put back once all nodes are done. Finally
    ``self._prune_tree()`` is called.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training samples. When ``check_input`` is true it is validated
        with ``dtype=DTYPE`` and ``accept_sparse="csc"``.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Targets. For classifiers the labels are encoded per output column
        with ``np.unique``.
    sample_weight : array-like of shape (n_samples,), default=None
        Per-sample weights; multiplied with the expanded class weights
        when ``self.class_weight`` is set.
    check_input : bool, default=True
        When false, validation of ``X``/``y`` and the sparse index checks
        are skipped.
    X_idx_sorted : object, default=None
        Forwarded unchanged to ``builder.build``.

    Returns
    -------
    self
        The fitted estimator.

    Raises
    ------
    ValueError
        If a hyper-parameter is outside its allowed range, if a sparse
        ``X`` uses indices that are not ``np.intc``, or if the number of
        labels differs from the number of samples.
    """
    random_state = check_random_state(self.random_state)

    if self.ccp_alpha < 0.0:
        raise ValueError(
            "ccp_alpha must be greater than or equal to 0")

    if check_input:
        # Need to validate separately here.
        # We can't pass multi_output=True because that would allow y to be
        # csr.
        check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
        check_y_params = dict(ensure_2d=False, dtype=None)
        X, y = self._validate_data(X, y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))
        if issparse(X):
            X.sort_indices()

            if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = is_classifier(self)

    y = np.atleast_1d(y)
    expanded_class_weight = None

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        check_classification_targets(y)
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        if self.class_weight is not None:
            y_original = np.copy(y)

        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the same dtype.
        y_encoded = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                   return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_encoded

        if self.class_weight is not None:
            expanded_class_weight = compute_sample_weight(
                self.class_weight, y_original)

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                 else self.max_depth)
    max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes)

    if isinstance(self.min_samples_leaf, numbers.Integral):
        if not 1 <= self.min_samples_leaf:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = self.min_samples_leaf
    else:  # float
        if not 0. < self.min_samples_leaf <= 0.5:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

    if isinstance(self.min_samples_split, numbers.Integral):
        if not 2 <= self.min_samples_split:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the integer %s"
                             % self.min_samples_split)
        min_samples_split = self.min_samples_split
    else:  # float
        if not 0. < self.min_samples_split <= 1.:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the float %s"
                             % self.min_samples_split)
        min_samples_split = int(
            ceil(self.min_samples_split * n_samples))
        min_samples_split = max(2, min_samples_split)

    # A split must leave at least min_samples_leaf samples on each side.
    min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

    if isinstance(self.max_features, str):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError("Invalid value for max_features. "
                             "Allowed string values are 'auto', "
                             "'sqrt' or 'log2'.")
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1,
                               int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero. ")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")
    if not isinstance(max_leaf_nodes, numbers.Integral):
        raise ValueError("max_leaf_nodes must be integral number but was "
                         "%r" % max_leaf_nodes)
    if -1 < max_leaf_nodes < 2:
        raise ValueError(("max_leaf_nodes {0} must be either None "
                          "or larger than 1").format(max_leaf_nodes))

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Set min_weight_leaf from min_weight_fraction_leaf
    if sample_weight is None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           n_samples)
    else:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))

    min_impurity_split = self.min_impurity_split
    if min_impurity_split is not None:
        warnings.warn("The min_impurity_split parameter is deprecated. "
                      "Its default value has changed from 1e-7 to 0 in "
                      "version 0.23, and it will be removed in 0.25. "
                      "Use the min_impurity_decrease parameter instead.",
                      FutureWarning)

        if min_impurity_split < 0.:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")
    else:
        min_impurity_split = 0

    if self.min_impurity_decrease < 0.:
        raise ValueError("min_impurity_decrease must be greater than "
                         "or equal to 0")

    if self.presort != 'deprecated':
        warnings.warn("The parameter 'presort' is deprecated and has no "
                      "effect. It will be removed in v0.24. You can "
                      "suppress this warning by not passing any value "
                      "to the 'presort' parameter.",
                      FutureWarning)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                     n_samples)

    SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            self.max_features_,
                                            min_samples_leaf,
                                            min_weight_leaf,
                                            random_state)

    if is_classification:
        self.tree_ = Tree(self.n_features_,
                          self.n_classes_, self.n_outputs_)
    else:
        self.tree_ = Tree(self.n_features_,
                          # TODO: tree shouldn't need this in this case
                          np.array([1] * self.n_outputs_, dtype=np.intp),
                          self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if max_leaf_nodes < 0:
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        min_samples_leaf,
                                        min_weight_leaf,
                                        max_depth,
                                        self.min_impurity_decrease,
                                        min_impurity_split)
    else:
        builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                       min_samples_leaf,
                                       min_weight_leaf,
                                       max_depth,
                                       max_leaf_nodes,
                                       self.min_impurity_decrease,
                                       min_impurity_split)

    builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

    if self.n_outputs_ == 1 and is_classification:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    # Perturb the value array of each node. ``self.e`` is rescaled per node
    # for the duration of the ``addNoise`` call (presumably addNoise reads
    # it -- confirm against its definition).
    e = self.e
    node_values = self.tree_.value
    try:
        for i in range(node_values.shape[0]):
            fr = np.sum(node_values[i, 0])
            self.e = e / (fr + max_depth)
            node_values[i, 0] = self.addNoise(node_values[i, 0])
    finally:
        # Put the user-supplied value back so that fitting does not
        # permanently change the hyper-parameter (a second ``fit`` would
        # otherwise start from the value left behind by the last node).
        self.e = e

    self._prune_tree()

    return self
# Flat script fragment: grow a tree on (X_train, y_train), rebuild it through
# _build_pruned_tree_ccp, then predict labels for X_test. Every name read here
# is expected to exist already in the enclosing scope.

# Replace unset limits with sentinels: the int32 maximum for the depth and -1
# for the leaf-node cap (max_leaf_nodes is not read again in this fragment).
max_depth = (np.iinfo(np.int32).max if max_depth is None else max_depth)
max_leaf_nodes = (-1 if max_leaf_nodes is None else max_leaf_nodes)

# Integer part of sqrt(n_features_), but never less than one.
max_features = max(1, int(np.sqrt(n_features_)))

# ``criterion`` and ``splitter`` are used as lookup keys and then rebound to
# the objects constructed from the looked-up entries.
criterion = CRITERIA_CLF[criterion](n_outputs_, n_classes_)
SPLITTERS = DENSE_SPLITTERS
splitter = SPLITTERS[splitter](criterion, max_features, min_samples_leaf,
                               min_weight_leaf, random_state)

# Grow the tree; the depth-first builder is used unconditionally.
tree_ = Tree(n_features_, n_classes_, n_outputs_)
builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf,
                                min_weight_leaf, max_depth,
                                min_impurity_decrease, min_impurity_split)
builder.build(tree_, X_train, y_train)

# Keep only the first entry of classes_; n_classes_ is made at least 1-D
# before being handed to the second Tree below.
classes_ = classes_[0]
n_classes_ = np.atleast_1d(n_classes_)

# Fill a fresh Tree from the grown one and use it from here on.
# NOTE(review): the meaning of the third argument (0) is defined by
# _build_pruned_tree_ccp, which is not visible here -- confirm.
pruned_tree = Tree(n_features_, n_classes_, n_outputs_)
_build_pruned_tree_ccp(pruned_tree, tree_, 0)
tree_ = pruned_tree

# Predict: validate X_test, take the tree output and map the arg-max along
# axis 1 back onto classes_.
X_test = check_array(X_test, dtype=DTYPE, accept_sparse="csr")
proba = tree_.predict(X_test)
n_samples = X_test.shape[0]
predictions = classes_.take(np.argmax(proba, axis=1), axis=0)
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated"):
    """Grow a survival tree on the training data (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix.

    y : structured array, shape = (n_samples,)
        Binary event indicator as first field, time of event or time of
        censoring as second field. When ``check_input`` is false, ``y`` is
        instead unpacked as the pair ``(y_numeric, event_times)``.

    sample_weight : optional
        Passed on to the tree builder unchanged.

    check_input : boolean, default: True
        Set to False to skip input checking. Don't use this parameter
        unless you know what you do.

    X_idx_sorted : deprecated, default="deprecated"
        Has no effect; passing anything other than the default emits a
        ``FutureWarning``.

    Returns
    -------
    self
    """
    rng = check_random_state(self.random_state)

    if not check_input:
        # The caller hands over the numeric target and the event times
        # directly.
        y_numeric, self.event_times_ = y
    else:
        X, event, time = check_arrays_survival(X, y)
        time = time.astype(np.float64)
        self.event_times_ = np.unique(time[event])

        # Column 0 holds the time, column 1 the event indicator as float.
        y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
        y_numeric[:, 0] = time
        y_numeric[:, 1] = event.astype(np.float64)

    n_samples, self.n_features_ = X.shape
    self.n_features_in_ = self.n_features_
    params = self._check_params(n_samples)

    left_at_default = (isinstance(X_idx_sorted, str)
                       and X_idx_sorted == "deprecated")
    if not left_at_default:
        warnings.warn(
            "The parameter 'X_idx_sorted' is deprecated and has no "
            "effect. It will be removed in sklearn 1.1 (renaming of 0.26). "
            "You can suppress this warning by not passing any value to the "
            "'X_idx_sorted' parameter.",
            FutureWarning
        )

    self.n_outputs_ = self.event_times_.shape[0]
    # one "class" for CHF, one for survival function
    self.n_classes_ = np.full(self.n_outputs_, 2, dtype=np.intp)

    # Assemble criterion, splitter and the (still empty) tree.
    criterion = LogrankCriterion(self.n_outputs_, n_samples, self.event_times_)

    if isinstance(self.splitter, Splitter):
        splitter = self.splitter
    else:
        splitter_factory = DENSE_SPLITTERS[self.splitter]
        splitter = splitter_factory(
            criterion,
            self.max_features_,
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            rng)

    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

    # Depth-first growth when max_leaf_nodes is negative, best-first
    # otherwise.
    leaf_cap = params["max_leaf_nodes"]
    grow_depth_first = leaf_cap < 0
    shared_args = (
        splitter,
        params["min_samples_split"],
        params["min_samples_leaf"],
        params["min_weight_leaf"],
        params["max_depth"],
    )
    no_impurity_decrease = 0.0  # min_impurity_decrease
    if grow_depth_first:
        builder = DepthFirstTreeBuilder(
            *shared_args,
            no_impurity_decrease,
            params["min_impurity_split"])
    else:
        builder = BestFirstTreeBuilder(
            *shared_args,
            params["max_leaf_nodes"],
            no_impurity_decrease,
            params["min_impurity_split"])

    builder.build(self.tree_, X, y_numeric, sample_weight)

    return self
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
    """Grow a survival tree on the training data (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix.

    y : structured array, shape = (n_samples,)
        Binary event indicator as first field, time of event or time of
        censoring as second field. When ``check_input`` is false, ``y`` is
        instead unpacked as the pair ``(y_numeric, event_times)``.

    sample_weight : optional
        Passed on to the tree builder unchanged.

    check_input : boolean, default: True
        Set to False to skip input checking. Don't use this parameter
        unless you know what you do.

    X_idx_sorted : array-like, shape = (n_samples, n_features), optional
        The indexes of the sorted training input samples; handed to the
        tree builder as given. Don't use this parameter unless you know
        what to do.

    Returns
    -------
    self
    """
    rng = check_random_state(self.random_state)

    if not check_input:
        # The caller hands over the numeric target and the event times
        # directly.
        y_numeric, self.event_times_ = y
    else:
        X, event, time = check_arrays_survival(X, y)
        time = time.astype(np.float64)
        self.event_times_ = np.unique(time[event])

        # Column 0 holds the time, column 1 the event indicator as float.
        y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
        y_numeric[:, 0] = time
        y_numeric[:, 1] = event.astype(np.float64)

    n_samples, self.n_features_ = X.shape
    params = self._check_params(n_samples)

    self.n_outputs_ = self.event_times_.shape[0]
    # one "class" for CHF, one for survival function
    self.n_classes_ = np.full(self.n_outputs_, 2, dtype=np.intp)

    # Assemble criterion, splitter and the (still empty) tree.
    criterion = LogrankCriterion(self.n_outputs_, n_samples, self.event_times_)

    if isinstance(self.splitter, Splitter):
        splitter = self.splitter
    else:
        splitter_factory = DENSE_SPLITTERS[self.splitter]
        splitter = splitter_factory(
            criterion,
            self.max_features_,
            params["min_samples_leaf"],
            params["min_weight_leaf"],
            rng)

    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

    # Depth-first growth when max_leaf_nodes is negative, best-first
    # otherwise.
    leaf_cap = params["max_leaf_nodes"]
    grow_depth_first = leaf_cap < 0
    shared_args = (
        splitter,
        params["min_samples_split"],
        params["min_samples_leaf"],
        params["min_weight_leaf"],
        params["max_depth"],
    )
    no_impurity_decrease = 0.0  # min_impurity_decrease
    if grow_depth_first:
        builder = DepthFirstTreeBuilder(
            *shared_args,
            no_impurity_decrease,
            params["min_impurity_split"])
    else:
        builder = BestFirstTreeBuilder(
            *shared_args,
            params["max_leaf_nodes"],
            no_impurity_decrease,
            params["min_impurity_split"])

    builder.build(self.tree_, X, y_numeric, sample_weight, X_idx_sorted)

    return self
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
    """Build a newsvendor decision tree regressor from the training set (X, y).

    Method is based on [1] and was adapted to enable usage of the newsvendor
    criterion

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (real numbers). Use ``dtype=np.float64`` and
        ``order='C'`` for maximum efficiency.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node.

    check_input : bool, default=True
        Allow to bypass several input checking. When False, ``X`` and ``y``
        are used as passed: ``self._validate_data`` and the sparse index
        checks are skipped. Don't use this parameter unless you know what
        you do.

    X_idx_sorted : array-like of shape (n_samples, n_features), default=None
        Accepted for signature compatibility only. The value is not used:
        the tree builder is always called with ``X_idx_sorted=None``.

    Returns
    -------
    self : NewsvendorDecisionTreeRegressor
        Fitted estimator.

    Raises
    ------
    ValueError
        If a hyper-parameter is outside its allowed range, if a sparse
        ``X`` uses indices that are not ``np.intc``, or if the number of
        labels differs from the number of samples.

    References
    ----------
    [1] scikit-learn, BaseDecisionTree.fit()
        <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_classes.py>
    """
    random_state = check_random_state(self.random_state)

    if self.ccp_alpha < 0.0:
        raise ValueError("ccp_alpha must be greater than or equal to 0")

    if check_input:
        # Need to validate separately here.
        # We can't pass multi_output=True because that would allow y to be
        # csr.
        check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
        check_y_params = dict(ensure_2d=False, dtype=None)
        X, y = self._validate_data(X, y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))
        if issparse(X):
            X.sort_indices()

            if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

    # Determine output settings
    n_samples, self.n_features_ = X.shape

    y = np.atleast_1d(y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    self.cu_, self.co_ = check_cu_co(self.cu, self.co, self.n_outputs_)

    max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                 else self.max_depth)
    max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes)

    if isinstance(self.min_samples_leaf, numbers.Integral):
        if not 1 <= self.min_samples_leaf:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = self.min_samples_leaf
    else:  # float
        if not 0. < self.min_samples_leaf <= 0.5:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

    if isinstance(self.min_samples_split, numbers.Integral):
        if not 2 <= self.min_samples_split:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the integer %s"
                             % self.min_samples_split)
        min_samples_split = self.min_samples_split
    else:  # float
        if not 0. < self.min_samples_split <= 1.:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the float %s"
                             % self.min_samples_split)
        min_samples_split = int(ceil(self.min_samples_split * n_samples))
        min_samples_split = max(2, min_samples_split)

    # A split must leave at least min_samples_leaf samples on each side.
    min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

    if isinstance(self.max_features, str):
        if self.max_features == "auto":
            max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError("Invalid value for max_features. "
                             "Allowed string values are 'auto', "
                             "'sqrt' or 'log2'.")
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1,
                               int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero. ")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")
    if not isinstance(max_leaf_nodes, numbers.Integral):
        raise ValueError("max_leaf_nodes must be integral number but was "
                         "%r" % max_leaf_nodes)
    if -1 < max_leaf_nodes < 2:
        raise ValueError(("max_leaf_nodes {0} must be either None "
                          "or larger than 1").format(max_leaf_nodes))

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

    # Set min_weight_leaf from min_weight_fraction_leaf
    if sample_weight is None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           n_samples)
    else:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))

    min_impurity_split = self.min_impurity_split
    if min_impurity_split is not None:
        warnings.warn(
            "The min_impurity_split parameter is deprecated. "
            "Its default value has changed from 1e-7 to 0 in "
            "version 0.23, and it will be removed in 0.25. "
            "Use the min_impurity_decrease parameter instead.",
            FutureWarning)

        if min_impurity_split < 0.:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")
    else:
        min_impurity_split = 0

    if self.min_impurity_decrease < 0.:
        raise ValueError("min_impurity_decrease must be greater than "
                         "or equal to 0")

    # Build tree
    criterion = NewsvendorCriterion(self.n_outputs_, n_samples,
                                    self.cu_, self.co_)

    SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            self.max_features_,
                                            min_samples_leaf,
                                            min_weight_leaf,
                                            random_state)

    self.tree_ = Tree(
        self.n_features_,
        # TODO: tree shouldn't need this in this case
        np.array([1] * self.n_outputs_, dtype=np.intp),
        self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if max_leaf_nodes < 0:
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        min_samples_leaf,
                                        min_weight_leaf,
                                        max_depth,
                                        self.min_impurity_decrease,
                                        min_impurity_split)
    else:
        builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                       min_samples_leaf,
                                       min_weight_leaf,
                                       max_depth,
                                       max_leaf_nodes,
                                       self.min_impurity_decrease,
                                       min_impurity_split)

    # The caller's X_idx_sorted is deliberately not forwarded.
    builder.build(self.tree_, X, y, sample_weight, X_idx_sorted=None)

    self._prune_tree()

    return self