def _get_terminal_node_forest(self, X):
    """Run the fitted forest over ``X`` in terminal-node prediction mode.

    The stored forest (``self.ranger_forest_["forest"]``) is handed back to
    ``ranger.ranger`` with ``prediction_mode`` enabled and
    ``prediction_type`` set to 2, and the result of that call is returned
    unchanged.

    The result is a dictionary. Its ``predictions`` entry holds a list of
    lists: the outer list has one entry per record of ``X`` and each inner
    list holds the terminal node id of that record in each tree.

    :param array2d X: prediction input features
    """
    # The extension is called positionally, so the order below matters.
    # Most entries are training-only settings given neutral placeholders.
    ranger_args = (
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray([[]]),  # no targets in prediction mode
        self.feature_names_,  # variable_names
        0,  # m_try
        # falls back to 1 when the estimator has no ``n_estimators``
        getattr(self, "n_estimators", 1),  # num_trees
        self.verbose,
        self.seed,
        # falls back to 1 when the estimator has no ``n_jobs_``
        getattr(self, "n_jobs_", 1),  # num_threads
        False,  # write_forest
        0,  # importance_mode
        0,  # min_node_size
        [],  # split_select_weights
        False,  # use_split_select_weights
        [],  # always_split_feature_names
        False,  # use_always_split_feature_names
        True,  # prediction_mode
        self.ranger_forest_["forest"],  # loaded_forest
        True,  # sample_with_replacement
        False,  # probability
        [],  # unordered_feature_names
        False,  # use_unordered_features
        False,  # save_memory
        1,  # split_rule
        [],  # case_weights
        False,  # use_case_weights
        {},  # class_weights
        False,  # predict_all
        self.keep_inbag,
        [1],  # sample_fraction
        0,  # alpha
        0,  # minprop
        self.holdout,
        2,  # prediction_type (terminal nodes)
        1,  # num_random_splits
        False,  # oob_error
        0,  # max_depth
        [],  # inbag
        False,  # use_inbag
        [],  # regularization_factor_
        False,  # use_regularization_factor_
        False,  # regularization_usedepth
    )
    return ranger.ranger(*ranger_args)
def predict_proba(self, X):
    """Predict class probabilities for each record of ``X``.

    The estimator must already be fitted. ``X`` is validated with
    ``check_array`` and its feature count is checked against the fitted
    one before the stored forest is run in prediction mode.

    :param array2d X: prediction input features
    :returns: 2d array of the forest's predictions, with columns reordered
        by ``self.ranger_class_order_``
    """
    check_is_fitted(self)
    X = check_array(X)
    self._check_n_features(X, reset=False)

    # Positional arguments for the extension call; order matters.
    ranger_args = (
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray([[]]),  # no targets in prediction mode
        self.feature_names_,  # variable_names
        self.mtry_,
        1,  # num_trees
        self.verbose,
        self.seed,
        1,  # num_threads
        False,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        [],  # split_select_weights
        False,  # use_split_select_weights
        [],  # always_split_variable_names
        False,  # use_always_split_variable_names
        True,  # prediction_mode
        self.ranger_forest_["forest"],  # loaded_forest
        self.replace,  # sample_with_replacement
        False,  # probability
        [],  # unordered_feature_names
        False,  # use_unordered_features
        self.save_memory,
        self.split_rule_,
        [],  # case_weights
        False,  # use_case_weights
        {},  # class_weights
        False,  # predict_all
        self.keep_inbag,
        [1],  # sample_fraction
        0.5,  # alpha
        0.1,  # minprop
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        self.use_regularization_factor_,
        self.regularization_usedepth,
    )
    result = ranger.ranger(*ranger_args)

    # atleast_2d guarantees the column indexing below is valid even when
    # the raw predictions come back one-dimensional.
    probabilities = np.array(result["predictions"])
    probabilities = np.atleast_2d(probabilities)
    return probabilities[:, self.ranger_class_order_]
def fit(
    self,
    X,
    y,
    sample_weight=None,
    split_select_weights=None,
    always_split_features=None,
    categorical_features=None,
):
    """Fit the ranger random forest using training data.

    :param array2d X: training input features
    :param array2d y: training input targets, rows of (bool, float)
        representing (survival, time). A structured array, a 2d array, or
        any sequence of such pairs that ``np.asarray`` can convert is
        accepted.
    :param array1d sample_weight: optional weights for input samples
    :param list split_select_weights: Vector of weights between 0 and 1 of
        probabilities to select features for splitting. Can be a single
        vector or a vector of vectors with one vector per tree.
    :param list always_split_features: Features which should always be
        selected for splitting. A list of column index values.
    :param list categorical_features: A list of column index values which
        should be considered categorical, or unordered.
    :returns: the fitted estimator (``self``)
    """
    self.tree_type_ = 5  # tree_type, TREE_SURVIVAL

    # Check input
    X = check_array(X)

    # Convert the 1d array of 2-tuples to a 2d array. ``np.asarray`` lets
    # plain sequences of (status, time) pairs through as well; previously a
    # list raised AttributeError on ``.tolist()``.
    # ranger expects the time first and the status second; since we follow
    # the scikit-survival convention (status, time), flip the columns.
    y = np.fliplr(np.array(np.asarray(y).tolist()))

    # Check the init parameters
    self._validate_parameters(X, y, sample_weight)

    # Set X info: features are named by their column index, as bytes
    self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    # Check weights
    sample_weight, use_sample_weight = self._check_sample_weight(sample_weight, X)
    (
        always_split_features,
        use_always_split_features,
    ) = self._check_always_split_features(always_split_features)
    (
        categorical_features,
        use_categorical_features,
    ) = self._check_categorical_features(categorical_features)
    (
        split_select_weights,
        use_split_select_weights,
    ) = self._check_split_select_weights(split_select_weights)

    # Fit the forest (arguments are positional; order matters)
    self.ranger_forest_ = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray(y.astype("float64")),
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        split_select_weights,
        use_split_select_weights,
        always_split_features,  # always_split_variable_names
        use_always_split_features,  # use_always_split_variable_names
        False,  # prediction_mode
        {},  # loaded_forest
        self.replace,  # sample_with_replacement
        False,  # probability
        categorical_features,  # unordered_feature_names
        use_categorical_features,  # use_unordered_features
        False,  # save_memory
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        {},  # class_weights
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        self.alpha,
        self.minprop,
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        # NOTE(review): the predict paths pass
        # ``self.use_regularization_factor_`` here; confirm that a literal
        # False is intended for fitting.
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )
    self.event_times_ = np.array(
        self.ranger_forest_["forest"]["unique_death_times"]
    )
    # dtype to suppress warning about ragged nested sequences
    self.cumulative_hazard_function_ = np.array(
        self.ranger_forest_["forest"]["cumulative_hazard_function"],
        dtype=object,
    )

    if self.enable_tree_details:
        # An empty weight vector means "unweighted": use unit weights.
        sample_weight = (
            sample_weight if len(sample_weight) > 0 else np.ones(len(X))
        )
        terminal_node_forest = self._get_terminal_node_forest(X)
        terminal_nodes = np.atleast_2d(
            terminal_node_forest["predictions"]
        ).astype(int)
        self._set_leaf_samples(terminal_nodes)
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
    return self
def fit(
    self,
    X,
    y,
    sample_weight=None,
    class_weights=None,
    split_select_weights=None,
    always_split_features=None,
    categorical_features=None,
):
    """Fit a single ranger regression tree on the training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d sample_weight: optional weights for input samples
    :param class_weights: accepted for signature compatibility; this method
        never reads it and always passes empty class weights to ranger.
    :param list split_select_weights: Vector of weights between 0 and 1 of
        probabilities to select features for splitting. Can be a single
        vector or a vector of vectors with one vector per tree.
    :param list always_split_features: Features which should always be
        selected for splitting. A list of column index values.
    :param list categorical_features: A list of column index values which
        should be considered categorical, or unordered.
    :returns: the fitted estimator (``self``)
    """
    self.tree_type_ = 3  # tree_type, TREE_REGRESSION

    # Validate the inputs, then the constructor parameters.
    X, y = self._validate_data(X, y)
    self._validate_parameters(X, y, sample_weight)

    # Features are named by their column index, encoded as bytes.
    self.feature_names_ = [str(col).encode() for col in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    # Normalise the optional weight / feature-list arguments. Each checker
    # returns the cleaned value together with a "use it" flag.
    sample_weight, use_sample_weight = self._check_sample_weight(sample_weight, X)
    # The flag from this checker is not forwarded; the call below derives
    # its own from the truthiness of the cleaned list.
    always_split_features, _unused_flag = self._check_always_split_features(
        always_split_features
    )
    categorical_features, use_categorical_features = (
        self._check_categorical_features(categorical_features)
    )
    split_select_weights, use_split_select_weights = (
        self._check_split_select_weights(split_select_weights)
    )

    # Grow the tree. Arguments are positional, so the order matters.
    ranger_args = (
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        # targets as a single float64 column
        np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
        self.feature_names_,  # variable_names
        self.mtry_,
        1,  # num_trees
        self.verbose,
        self.seed,
        1,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        split_select_weights,
        use_split_select_weights,
        always_split_features,  # always_split_feature_names
        bool(always_split_features),  # use_always_split_feature_names
        False,  # prediction_mode
        {},  # loaded_forest
        self.replace,  # sample_with_replacement
        False,  # probability
        categorical_features,  # unordered_feature_names
        use_categorical_features,  # use_unordered_features
        self.save_memory,
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        {},  # class_weights
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        self.alpha,
        self.minprop,
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )
    self.ranger_forest_ = ranger.ranger(*ranger_args)

    # An empty weight vector means "unweighted": use unit weights.
    if len(sample_weight) == 0:
        sample_weight = np.ones(len(X))

    # Record tree details from the terminal node reached by each sample.
    leaf_forest = self._get_terminal_node_forest(X)
    leaf_ids = np.atleast_2d(leaf_forest["predictions"]).astype(int)
    self._set_leaf_samples(leaf_ids)
    self._set_node_values(y, sample_weight)
    self._set_n_classes()
    return self
def fit(
    self,
    X,
    y,
    sample_weight=None,
    class_weights=None,
    split_select_weights=None,
    always_split_features=None,
    categorical_features=None,
):
    """Fit a single ranger classification tree on the training data.

    :param array2d X: training input features
    :param array1d y: training input target classes
    :param array1d sample_weight: optional weights for input samples
    :param dict class_weights: A dictionary of outcome classes to weights.
        When given, it must contain an entry for every class found in ``y``.
    :param list split_select_weights: Vector of weights between 0 and 1 of
        probabilities to select features for splitting. Can be a single
        vector or a vector of vectors with one vector per tree.
    :param list always_split_features: Features which should always be
        selected for splitting. A list of column index values.
    :param list categorical_features: A list of column index values which
        should be considered categorical, or unordered.
    :returns: the fitted estimator (``self``)
    :raises ValueError: if ``class_weights`` lacks a weight for some class
    """
    self.tree_type_ = 9  # tree_type, TREE_PROBABILITY enables predict_proba

    # Validate the inputs, then the constructor parameters.
    X, y = self._validate_data(X, y)
    check_classification_targets(y)
    self._validate_parameters(X, y, sample_weight)

    # Replace class labels by their index into the sorted unique labels.
    self.classes_, y = np.unique(np.copy(y), return_inverse=True)
    self.n_classes_ = len(self.classes_)

    # Features are named by their column index, encoded as bytes.
    self.feature_names_ = [str(col).encode() for col in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    # Re-key the class weights from labels to class indices.
    if class_weights is None:
        class_weights = {}
    else:
        indexed_weights = {}
        try:
            for idx, label in enumerate(self.classes_):
                indexed_weights[idx] = class_weights[label]
        except KeyError:
            raise ValueError(
                "class weights must have a weight for each class"
            ) from None
        class_weights = indexed_weights

    # Normalise the optional weight / feature-list arguments. Each checker
    # returns the cleaned value together with a "use it" flag.
    sample_weight, use_sample_weight = self._check_sample_weight(sample_weight, X)
    # The flag from this checker is not forwarded; the call below derives
    # its own from the truthiness of the cleaned list.
    always_split_features, _unused_flag = self._check_always_split_features(
        always_split_features
    )
    categorical_features, use_categorical_features = (
        self._check_categorical_features(categorical_features)
    )
    split_select_weights, use_split_select_weights = (
        self._check_split_select_weights(split_select_weights)
    )

    # Grow the tree. Arguments are positional, so the order matters.
    ranger_args = (
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        # class indices as a single float64 column
        np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
        self.feature_names_,  # variable_names
        self.mtry_,
        1,  # num_trees
        self.verbose,
        self.seed,
        1,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        split_select_weights,
        use_split_select_weights,
        always_split_features,  # always_split_variable_names
        bool(always_split_features),  # use_always_split_variable_names
        False,  # prediction_mode
        {},  # loaded_forest
        self.replace,  # sample_with_replacement
        False,  # probability
        categorical_features,  # unordered_variable_names
        use_categorical_features,  # use_unordered_variable_names
        self.save_memory,
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        class_weights,
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        0.5,  # alpha, ignored because maxstat can't be used on classification
        0.1,  # minprop, ignored because maxstat can't be used on classification
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )
    self.ranger_forest_ = ranger.ranger(*ranger_args)

    # Column permutation that sorts the forest's reported class values;
    # predict_proba uses it to reorder the prediction columns.
    class_values = np.array(self.ranger_forest_["forest"]["class_values"])
    self.ranger_class_order_ = np.argsort(class_values.astype(int))

    # An empty weight vector means "unweighted": use unit weights.
    if len(sample_weight) == 0:
        sample_weight = np.ones(len(X))

    # Record tree details from the terminal node reached by each sample.
    leaf_forest = self._get_terminal_node_forest(X)
    leaf_ids = np.atleast_2d(leaf_forest["predictions"]).astype(int)
    self._set_leaf_samples(leaf_ids)
    self._set_node_values(y, sample_weight)
    self._set_n_classes()
    return self
def predict(self, X, quantiles=None):
    """Predict the regression target for ``X``, or quantiles if requested.

    Quantile prediction is delegated to ``predict_quantiles`` when either
    the ``quantiles`` argument is given or ``self.quantiles`` is a list or
    ndarray; the explicit argument takes precedence.

    :param array2d X: prediction input features
    :param list(float) quantiles: a list of quantiles on which to predict.
        If the list contains a single quantile, the result will be a
        1darray. If there are multiple quantiles, the result will be a
        2darray with columns corresponding to respective quantiles. If
        quantiles are not provided the result is the regression target
        estimate.
    """
    # Decide which quantiles, if any, to hand to predict_quantiles.
    requested = quantiles
    if requested is None and isinstance(self.quantiles, (list, np.ndarray)):
        requested = self.quantiles
    if requested is not None:
        return self.predict_quantiles(X, requested)

    check_is_fitted(self)
    X = check_array(X)
    self._check_n_features(X, reset=False)

    # Positional arguments for the extension call; order matters.
    ranger_args = (
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray([[]]),  # no targets in prediction mode
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        False,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        self.split_select_weights or [],
        bool(self.split_select_weights),  # use_split_select_weights
        [],  # always_split_feature_names
        False,  # use_always_split_feature_names
        True,  # prediction_mode
        self.ranger_forest_["forest"],  # loaded_forest
        self.replace,  # sample_with_replacement
        False,  # probability
        [],  # unordered_feature_names
        False,  # use_unordered_features
        self.save_memory,
        self.split_rule_,
        [],  # case_weights
        False,  # use_case_weights
        {},  # class_weights
        False,  # predict_all
        self.keep_inbag,
        [1],  # sample_fraction
        self.alpha,
        self.minprop,
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        self.use_regularization_factor_,
        self.regularization_usedepth,
    )
    result = ranger.ranger(*ranger_args)
    return np.array(result["predictions"])