class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator,
                                          _LearntSelectorMixin)):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self,
                 criterion,
                 splitter,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 random_state):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None
        self.splitter_ = None
        self.tree_ = None

    def fit(self, X, y, sample_mask=None, X_argsorted=None,
            check_input=True, sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression). Use
            ``dtype=np.float64`` and ``order='C'`` for maximum efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Splits that would create child nodes with net zero or negative
            weight are ignored while searching for a split in each node. In
            the case of classification, splits are also ignored if they
            would result in any single class carrying a negative weight in
            either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn("The sample_mask parameter is deprecated as of version 0.14 "
                 "and will be removed in 0.16.", DeprecationWarning)

        if X_argsorted is not None:
            warn("The X_argsorted parameter is deprecated as of version 0.14 "
                 "and will be removed in 0.16.", DeprecationWarning)

        # Convert data
        if check_input:
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                              check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # np.reshape preserves the data contiguity,
            # whereas y[:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero.")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                max_features,
                                                self.min_samples_leaf,
                                                random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                          splitter, max_depth, min_samples_split,
                          self.min_samples_leaf, random_state)

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X
        is returned. For a regression model, the predicted value based on X
        is returned.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes, or the predicted values.
        """
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise ValueError("Tree not initialized. Perform a fit first.")

        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s" %
                             (self.n_features_, n_features))

        proba = self.tree_.predict(X)

        # Classification
        if isinstance(self, ClassifierMixin):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)
            else:
                predictions = np.zeros((n_samples, self.n_outputs_))
                for k in xrange(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1), axis=0)
                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]
            else:
                return proba[:, :, 0]

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature. It is also known
        as the Gini importance.

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.tree_ is None:
            raise ValueError("Estimator not fitted, "
                             "call `fit` before `feature_importances_`.")

        return self.tree_.compute_feature_importances()
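

# Hedged usage sketch: exercises the public surface defined above (fit,
# predict, feature_importances_). It assumes a concrete subclass named
# DecisionTreeClassifier, as in scikit-learn's sklearn.tree module; the
# import path is illustrative and not guaranteed by this file.
def _demo_base_tree_usage():
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier  # assumed subclass

    # float32 input avoids an internal copy (see the fit docstring)
    X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])

    clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)
    print(clf.predict(X))            # predicted class labels
    print(clf.feature_importances_)  # normalized criterion reduction per feature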
class myDecisionTreeClassifier(six.with_metaclass(ABCMeta, BaseEstimator,
                                                  _LearntSelectorMixin,
                                                  ClassifierMixin)):

    def __init__(self,
                 # Max depth for the decision tree
                 max_depth=None,
                 # Min number of samples per split
                 min_samples_split=2,
                 # Min samples per leaf node
                 min_samples_leaf=1,
                 # Max number of features to consider when looking for
                 # the best split
                 max_features=None,
                 # Seed for the random state of the tree
                 random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

        # We'll wait until fit to initialize these:
        # Learning criterion for training the tree
        self.criterion = None
        # Split method
        self.splitter = None
        # Number of features
        self.n_features_ = None
        # Number of outputs
        self.n_outputs_ = None
        # Labels of classes
        self.classes_ = None
        # Number of classes
        self.n_classes_ = None
        # The underlying tree structure
        self.tree_ = None

    def fit(self, X, y, check_input=True, sample_weight=None):
        # Draw the random state for the tree
        random_state = check_random_state(self.random_state)

        # If the data hasn't been validated yet
        if check_input:
            # then convert the X data
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                              check_ccontiguous=True)

        # Get the dimensions of X
        n_samples, self.n_features_ = X.shape

        # Make sure y is at least 1-D
        y = np.atleast_1d(y)
        # If our output is 1-D
        if y.ndim == 1:
            # reshape y to preserve the data contiguity
            y = np.reshape(y, (-1, 1))

        # Get the number of outputs
        self.n_outputs_ = y.shape[1]

        y = np.copy(y)
        # Container for the unique class labels of each output
        self.classes_ = []
        # Container for the number of unique classes of each output
        self.n_classes_ = []

        # For each output of y
        for k in xrange(self.n_outputs_):
            # Get the unique class labels and an array of indices
            # pointing to them
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            # Store the unique class labels
            self.classes_.append(classes_k)
            # and store how many there are
            self.n_classes_.append(classes_k.shape[0])

        # Use a NumPy integer array for speed
        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters

        # If no max_depth was given, default to the deepest possible tree
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        # If a string default was given
        if isinstance(self.max_features, six.string_types):
            # then use the square root of the number of features
            max_features = max(1, int(np.sqrt(self.n_features_)))
        # If None was given
        elif self.max_features is None:
            # just use all of them
            max_features = self.n_features_
        # Otherwise
        else:
            # use what was given
            max_features = self.max_features

        # If we were given sample weights
        if sample_weight is not None:
            # then we need to make sure they are contiguous double precision
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)

        min_samples_split = self.min_samples_split

        criterion = self.criterion
        # If we have not yet initialized the tree criterion
        if criterion is None:
            # initialize an entropy criterion
            criterion = Entropy(self.n_outputs_, self.n_classes_)

        splitter = self.splitter
        # If we have not yet initialized the tree splitter
        if splitter is None:
            # initialize a best-split splitter
            splitter = BestSplitter(criterion, max_features,
                                    self.min_samples_leaf, random_state)

        # Save these so we don't have to initialize them again when retraining
        self.criterion_ = criterion
        self.splitter_ = splitter

        # Now initialize the tree
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                          splitter, max_depth, min_samples_split,
                          self.min_samples_leaf, random_state)

        # and build it
        self.tree_.build(X, y, sample_weight=sample_weight)

        # If we only have one output
        if self.n_outputs_ == 1:
            # then unwrap the single-output containers
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X):
        """Predict class for a given X."""
        # Make sure the data is DTYPE for the tree and is 2-D
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)

        # Get the dimensions of X
        n_samples, n_features = X.shape

        # Get per-class weights from the tree
        proba = self.tree_.predict(X)

        # If we only have one output
        if self.n_outputs_ == 1:
            # use the index of the max weight to pick the class from classes_
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)
        # If we were trained with multiple outputs
        else:
            # make an empty 2-D array to hold the predictions
            predictions = np.zeros((n_samples, self.n_outputs_))
            # For each output
            for k in xrange(self.n_outputs_):
                # use the index of the max weight to pick the class
                # from classes_
                predictions[:, k] = self.classes_[k].take(
                    np.argmax(proba[:, k], axis=1), axis=0)
            # Return the results
            return predictions

    def predict_proba(self, X):
        """Predict class probabilities for the given X."""
        # Make sure the data is DTYPE for the tree and is 2-D
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = array2d(X, dtype=DTYPE)

        # Get the dimensions of X
        n_samples, n_features = X.shape

        # Get per-class weights from the tree
        proba = self.tree_.predict(X)

        # If we only have one output
        if self.n_outputs_ == 1:
            # grab the columns for the available classes
            proba = proba[:, :self.n_classes_]
            # Build a normalizer from the total weight of each sample
            normalizer = proba.sum(axis=1)[:, np.newaxis]
            # Remap all zero normalizer elements to one,
            # so we avoid dividing by zero
            normalizer[normalizer == 0.0] = 1.0
            # Normalize by the total weight of each sample
            proba /= normalizer
            # Return the results
            return proba
        # If we were trained with multiple outputs
        else:
            # make an empty container to hold all the probabilities
            all_proba = []
            # For each output
            for k in xrange(self.n_outputs_):
                # grab the columns for the available classes
                proba_k = proba[:, k, :self.n_classes_[k]]
                # Build a normalizer from the total weight of each sample
                normalizer = proba_k.sum(axis=1)[:, np.newaxis]
                # Remap all zero normalizer elements to one,
                # so we avoid dividing by zero
                normalizer[normalizer == 0.0] = 1.0
                # Normalize by the total weight of each sample
                proba_k /= normalizer
                # Store the results
                all_proba.append(proba_k)
            # Return the results
            return all_proba
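

# The normalization in predict_proba above is a row-wise division guarded
# against empty leaves. A minimal, self-contained sketch of that step with
# hypothetical per-class weights (plain NumPy, no tree required):
def _demo_proba_normalization():
    import numpy as np

    # Per-class weights as tree_.predict might return them for 3 samples
    proba = np.array([[3.0, 1.0],
                      [0.0, 0.0],   # degenerate row: zero total weight
                      [2.0, 2.0]])

    normalizer = proba.sum(axis=1)[:, np.newaxis]
    normalizer[normalizer == 0.0] = 1.0  # avoid dividing by zero
    proba /= normalizer

    print(proba)
    # [[0.75 0.25]
    #  [0.   0.  ]
    #  [0.5  0.5 ]]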
class DecisionTreeClassifier(sk.DecisionTreeClassifier):

    def __init__(self, *,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 class_weight=None,
                 ccp_alpha=0.0,
                 e,
                 s):
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state,
            min_impurity_decrease=min_impurity_decrease,
            ccp_alpha=ccp_alpha)
        # Total privacy budget (epsilon) and sensitivity for the
        # Laplace mechanism
        self.e = e
        self.s = s

    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        random_state = check_random_state(self.random_state)

        if self.ccp_alpha < 0.0:
            raise ValueError("ccp_alpha must be greater than or equal to 0")

        if check_input:
            # Need to validate separately here.
            # We can't pass multi_output=True because that would allow y
            # to be csr.
            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
            check_y_params = dict(ensure_2d=False, dtype=None)
            X, y = self._validate_data(X, y,
                                       validate_separately=(check_X_params,
                                                            check_y_params))
            if issparse(X):
                X.sort_indices()

                if (X.indices.dtype != np.intc or
                        X.indptr.dtype != np.intc):
                    raise ValueError("No support for np.int64 index based "
                                     "sparse matrices")

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = is_classifier(self)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # np.reshape preserves the data contiguity,
            # whereas y[:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

            self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                     else self.max_depth)
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

        if isinstance(self.min_samples_leaf, numbers.Integral):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s"
                                 % self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0. < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s"
                                 % self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, numbers.Integral):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s"
                                 % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s"
                                 % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError("Invalid value for max_features. "
                                 "Allowed string values are 'auto', "
                                 "'sqrt' or 'log2'.")
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero.")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if not isinstance(max_leaf_nodes, numbers.Integral):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {0} must be either None "
                              "or larger than 1").format(max_leaf_nodes))

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               n_samples)
        else:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))

        min_impurity_split = self.min_impurity_split
        if min_impurity_split is not None:
            warnings.warn("The min_impurity_split parameter is deprecated. "
                          "Its default value has changed from 1e-7 to 0 in "
                          "version 0.23, and it will be removed in 0.25. "
                          "Use the min_impurity_decrease parameter instead.",
                          FutureWarning)

            if min_impurity_split < 0.:
                raise ValueError("min_impurity_split must be greater than "
                                 "or equal to 0")
        else:
            min_impurity_split = 0

        if self.min_impurity_decrease < 0.:
            raise ValueError("min_impurity_decrease must be greater than "
                             "or equal to 0")

        if self.presort != 'deprecated':
            warnings.warn("The parameter 'presort' is deprecated and has no "
                          "effect. It will be removed in v0.24. You can "
                          "suppress this warning by not passing any value "
                          "to the 'presort' parameter.",
                          FutureWarning)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)

        SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                self.max_features_,
                                                min_samples_leaf,
                                                min_weight_leaf,
                                                random_state)

        if is_classifier(self):
            self.tree_ = Tree(self.n_features_,
                              self.n_classes_, self.n_outputs_)
        else:
            self.tree_ = Tree(self.n_features_,
                              # TODO: the tree shouldn't need this in
                              # this case
                              np.array([1] * self.n_outputs_, dtype=np.intp),
                              self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                            min_samples_leaf,
                                            min_weight_leaf,
                                            max_depth,
                                            self.min_impurity_decrease,
                                            min_impurity_split)
        else:
            builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                           min_samples_leaf,
                                           min_weight_leaf,
                                           max_depth,
                                           max_leaf_nodes,
                                           self.min_impurity_decrease,
                                           min_impurity_split)

        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

        if self.n_outputs_ == 1 and is_classifier(self):
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        # Perturb the class counts stored in each node with Laplace noise.
        # The total budget e is split per node: each node gets
        # e / (weighted sample count in the node + max_depth).
        e = self.e
        for i in range(self.tree_.value.shape[0]):
            fr = np.sum(self.tree_.value[i][0])
            self.e = e / (fr + max_depth)
            self.tree_.value[i][0] = self.addNoise(self.tree_.value[i][0])
        # Restore the total budget so repeated fits don't shrink epsilon
        self.e = e

        self._prune_tree()

        return self

    def _validate_X_predict(self, X, check_input):
        """Validate X whenever one tries to predict, apply, predict_proba."""
        if check_input:
            X = check_array(X, dtype=DTYPE, accept_sparse="csr")
            if issparse(X) and (X.indices.dtype != np.intc or
                                X.indptr.dtype != np.intc):
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s "
                             % (self.n_features_, n_features))

        return X

    def predict(self, X, check_input=True):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X
        is returned. For a regression model, the predicted value based on X
        is returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The predicted classes, or the predicted values.
        """
        check_is_fitted(self)
        X = self._validate_X_predict(X, check_input)
        proba = self.tree_.predict(X)
        n_samples = X.shape[0]

        # Classification
        if is_classifier(self):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)
            else:
                class_type = self.classes_[0].dtype
                predictions = np.zeros((n_samples, self.n_outputs_),
                                       dtype=class_type)
                for k in range(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1), axis=0)
                return predictions

    def addNoise(self, value):
        # Randomise each count with a Laplace mechanism calibrated to the
        # current per-node budget self.e and sensitivity 1
        lp = laplace.Laplace().set_epsilon(self.e).set_epsilon_delta(
            self.e, 0).set_sensitivity(1)
        noisy_counts = np.zeros(value.shape[0])
        for i in range(noisy_counts.shape[0]):
            noisy_counts[i] = lp.randomise(value[i])
        return noisy_counts
# Build a single classification tree by hand from scikit-learn's low-level
# components, prune it with cost-complexity pruning, and evaluate it on
# held-out data. Assumes the usual training artifacts (X_train, y_train,
# X_test, y_test) and the hyperparameters are already in scope.
max_features = max(1, int(np.sqrt(n_features_)))

criterion = CRITERIA_CLF[criterion](n_outputs_, n_classes_)

SPLITTERS = DENSE_SPLITTERS
splitter = SPLITTERS[splitter](criterion, max_features, min_samples_leaf,
                               min_weight_leaf, random_state)

tree_ = Tree(n_features_, n_classes_, n_outputs_)

builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                min_samples_leaf, min_weight_leaf,
                                max_depth, min_impurity_decrease,
                                min_impurity_split)
builder.build(tree_, X_train, y_train)

classes_ = classes_[0]
n_classes_ = np.atleast_1d(n_classes_)

# Cost-complexity pruning: copy the fitted tree into a pruned tree
pruned_tree = Tree(n_features_, n_classes_, n_outputs_)
_build_pruned_tree_ccp(pruned_tree, tree_, 0)
tree_ = pruned_tree

# Predict on the test set and score
X_test = check_array(X_test, dtype=DTYPE, accept_sparse="csr")
proba = tree_.predict(X_test)
n_samples = X_test.shape[0]
predictions = classes_.take(np.argmax(proba, axis=1), axis=0)
metrics.accuracy_score(y_test, predictions)
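

# The manual pipeline above can be reproduced through scikit-learn's public
# API, where ccp_alpha is the entry point to cost-complexity pruning. A
# sketch on a bundled dataset (dataset choice and alpha are illustrative):
def _demo_public_api_pruning():
    from sklearn import metrics
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), random_state=0)

    clf = DecisionTreeClassifier(max_features="sqrt", ccp_alpha=0.01,
                                 random_state=0).fit(X_train, y_train)
    print(metrics.accuracy_score(y_test, clf.predict(X_test)))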
class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
    """A survival tree.

    The quality of a split is measured by the
    log-rank splitting rule.

    See [1]_, [2]_ and [3]_ for further description.

    Parameters
    ----------
    splitter : string, optional, default: "best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to
        choose the best random split.

    max_depth : int or None, optional, default: None
        The maximum depth of the tree. If None, then nodes are expanded
        until all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, optional, default: 6
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int, float, optional, default: 3
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, optional, default: 0.
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, string or None, optional, default: None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=sqrt(n_features)`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires
        to effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    max_leaf_nodes : int or None, optional, default: None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    presort : deprecated, optional, default: 'deprecated'
        This parameter is deprecated and will be removed in a future
        version.

    Attributes
    ----------
    event_times_ : array of shape = (n_event_times,)
        Unique time points where events occurred.

    max_features_ : int,
        The inferred value of max_features.

    n_features_ : int
        The number of features when ``fit`` is performed.

    tree_ : Tree object
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object.

    See also
    --------
    sksurv.ensemble.RandomSurvivalForest
        An ensemble of SurvivalTrees.

    References
    ----------
    .. [1] Leblanc, M., & Crowley, J. (1993). Survival Trees by Goodness
           of Split. Journal of the American Statistical Association,
           88(422), 457–467.

    .. [2] Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S.
           (2008). Random survival forests. The Annals of Applied
           Statistics, 2(3), 841–860.

    .. [3] Ishwaran, H., Kogalur, U. B. (2007). Random survival forests
           for R. R News, 7(2), 25–31.
           https://cran.r-project.org/doc/Rnews/Rnews_2007-2.pdf.
    """

    def __init__(self,
                 splitter="best",
                 max_depth=None,
                 min_samples_split=6,
                 min_samples_leaf=3,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 presort='deprecated'):
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.presort = presort

    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        """Build a survival tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights for each sample. If None, samples are equally weighted.

        check_input : boolean, default: True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.

        X_idx_sorted : array-like, shape = (n_samples, n_features), optional
            The indices of the sorted training input samples. If many trees
            are grown on the same dataset, this allows the ordering to be
            cached between trees. If None, the data will be sorted here.
            Don't use this parameter unless you know what to do.
        Returns
        -------
        self
        """
        random_state = check_random_state(self.random_state)

        if check_input:
            X, event, time = check_arrays_survival(X, y)
            time = time.astype(np.float64)
            self.event_times_ = np.unique(time[event])

            y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
            y_numeric[:, 0] = time
            y_numeric[:, 1] = event.astype(np.float64)
        else:
            y_numeric, self.event_times_ = y

        n_samples, self.n_features_ = X.shape
        params = self._check_params(n_samples)

        self.n_outputs_ = self.event_times_.shape[0]
        # one "class" for CHF, one for survival function
        self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

        # Build tree
        criterion = LogrankCriterion(self.n_outputs_, n_samples,
                                     self.event_times_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = DENSE_SPLITTERS[self.splitter](
                criterion, self.max_features_, params["min_samples_leaf"],
                params["min_weight_leaf"], random_state)

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if params["max_leaf_nodes"] < 0:
            builder = DepthFirstTreeBuilder(
                splitter,
                params["min_samples_split"],
                params["min_samples_leaf"],
                params["min_weight_leaf"],
                params["max_depth"],
                0.0,  # min_impurity_decrease
                params["min_impurity_split"])
        else:
            builder = BestFirstTreeBuilder(
                splitter,
                params["min_samples_split"],
                params["min_samples_leaf"],
                params["min_weight_leaf"],
                params["max_depth"],
                params["max_leaf_nodes"],
                0.0,  # min_impurity_decrease
                params["min_impurity_split"])

        builder.build(self.tree_, X, y_numeric, sample_weight, X_idx_sorted)

        return self

    def _check_params(self, n_samples):
        # Check parameters
        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                     else self.max_depth)
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero.")

        max_leaf_nodes = self._check_max_leaf_nodes()

        min_samples_leaf = self._check_min_samples_leaf(n_samples)

        min_samples_split = self._check_min_samples_split(n_samples)
        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        self._check_max_features()

        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
        min_weight_leaf = self.min_weight_fraction_leaf * n_samples

        min_impurity_split = 1e-7

        if self.presort != 'deprecated':
            warnings.warn(
                "The parameter 'presort' is deprecated and has no "
                "effect. It will be removed in v0.24. You can "
                "suppress this warning by not passing any value "
                "to the 'presort' parameter.",
                DeprecationWarning)

        return {
            "max_depth": max_depth,
            "max_leaf_nodes": max_leaf_nodes,
            "min_samples_leaf": min_samples_leaf,
            "min_samples_split": min_samples_split,
            "min_impurity_split": min_impurity_split,
            "min_weight_leaf": min_weight_leaf,
        }

    def _check_max_leaf_nodes(self):
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)
        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {} must be either None "
                              "or larger than 1").format(max_leaf_nodes))
        return max_leaf_nodes

    def _check_min_samples_leaf(self, n_samples):
        if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s"
                                 % self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0. < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s"
                                 % self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))
        # FIXME throw exception if min_samples_leaf < 2
        return min_samples_leaf

    def _check_min_samples_split(self, n_samples):
        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s"
                                 % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s"
                                 % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)
        return min_samples_split

    def _check_max_features(self):
        if isinstance(self.max_features, str):
            if self.max_features in ("auto", "sqrt"):
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        self.max_features_ = max_features

    def _validate_X_predict(self, X, check_input):
        """Validate X whenever one tries to predict."""
        if check_input:
            X = check_array(X, dtype=DTYPE)

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s."
                             % (self.n_features_, n_features))

        return X

    def predict(self, X, check_input=True):
        """Predict risk score.

        The risk score is the total number of events, which can
        be estimated by the sum of the estimated cumulative
        hazard function :math:`\\hat{H}_h` in terminal node :math:`h`.

        .. math::

            \\sum_{j=1}^{n(h)} \\hat{H}_h(T_{j} \\mid x) ,

        where :math:`n(h)` denotes the number of distinct event times
        of samples belonging to the same terminal node as :math:`x`.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        risk_scores : ndarray, shape = (n_samples,)
            Predicted risk scores.
        """
        chf = self.predict_cumulative_hazard_function(
            X, check_input, return_array=True)
        return chf.sum(1)

    def predict_cumulative_hazard_function(self, X, check_input=True,
                                           return_array="warn"):
        """Predict cumulative hazard function.

        The cumulative hazard function (CHF) for an individual
        with feature vector :math:`x` is computed from
        all samples of the training data that are in the
        same terminal node as :math:`x`.
        It is estimated by the Nelson–Aalen estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.
        return_array : boolean
            If True, return an array with the cumulative hazard rate
            for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction`.

        Returns
        -------
        cum_hazard : ndarray
            If `return_array` is True, an array with the cumulative hazard
            rate for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction` will be returned.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_whas500
        >>> from sksurv.tree import SurvivalTree

        Load and prepare the data.

        >>> X, y = load_whas500()
        >>> X = X.astype(float)

        Fit the model.

        >>> estimator = SurvivalTree().fit(X, y)

        Estimate the cumulative hazard function for the first 5 samples.

        >>> chf_funcs = estimator.predict_cumulative_hazard_function(
        ...     X.iloc[:5], return_array=False)

        Plot the estimated cumulative hazard functions.

        >>> for fn in chf_funcs:
        ...     plt.step(fn.x, fn(fn.x), where="post")
        ...
        >>> plt.ylim(0, 1)
        >>> plt.show()
        """
        if return_array == "warn":
            warnings.warn(
                "predict_cumulative_hazard_function will return an array "
                "of StepFunction instances in 0.14. Use return_array=True "
                "to keep the old behavior.",
                FutureWarning)
        check_is_fitted(self, 'tree_')
        X = self._validate_X_predict(X, check_input)

        pred = self.tree_.predict(X)
        arr = pred[..., 0]
        if return_array:
            return arr
        return _array_to_step_function(self.event_times_, arr)

    def predict_survival_function(self, X, check_input=True,
                                  return_array="warn"):
        """Predict survival function.

        The survival function for an individual
        with feature vector :math:`x` is computed from
        all samples of the training data that are in the
        same terminal node as :math:`x`.
        It is estimated by the Kaplan-Meier estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        check_input : boolean, default: True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you are doing.

        return_array : boolean
            If True, return an array with the probability of
            survival for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction`.

        Returns
        -------
        survival : ndarray
            If `return_array` is True, an array with the probability of
            survival for each `self.event_times_`, otherwise an array of
            :class:`sksurv.functions.StepFunction` will be returned.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_whas500
        >>> from sksurv.tree import SurvivalTree

        Load and prepare the data.

        >>> X, y = load_whas500()
        >>> X = X.astype(float)

        Fit the model.

        >>> estimator = SurvivalTree().fit(X, y)

        Estimate the survival function for the first 5 samples.

        >>> surv_funcs = estimator.predict_survival_function(
        ...     X.iloc[:5], return_array=False)

        Plot the estimated survival functions.

        >>> for fn in surv_funcs:
        ...     plt.step(fn.x, fn(fn.x), where="post")
        ...
        >>> plt.ylim(0, 1)
        >>> plt.show()
        """
        if return_array == "warn":
            warnings.warn(
                "predict_survival_function will return an array of "
                "StepFunction instances in 0.14. Use return_array=True "
                "to keep the old behavior.",
                FutureWarning)
        check_is_fitted(self, 'tree_')
        X = self._validate_X_predict(X, check_input)

        pred = self.tree_.predict(X)
        arr = pred[..., 1]
        if return_array:
            return arr
        return _array_to_step_function(self.event_times_, arr)
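

# The docstrings above show the step-function predictions; predict itself
# returns a per-sample risk score (the summed CHF). A short sketch on the
# same WHAS500 data used in the docstring examples (assumes scikit-survival
# is installed):
def _demo_survival_tree_risk_scores():
    from sksurv.datasets import load_whas500
    from sksurv.tree import SurvivalTree

    X, y = load_whas500()
    X = X.astype(float)

    tree = SurvivalTree().fit(X, y)

    # Higher risk score means more expected events for that sample
    print(tree.predict(X.iloc[:5]))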