def predict_proba(self, X):
    """Predict probabilities for classes from X.

    :param array2d X: prediction input features
    """
    check_is_fitted(self)
    X = check_array(X)
    self._check_n_features(X, reset=False)

    result = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray([[]]),
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        False,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        self.split_select_weights or [],
        bool(self.split_select_weights),  # use_split_select_weights
        [],  # always_split_variable_names
        False,  # use_always_split_variable_names
        True,  # prediction_mode
        self.ranger_forest_["forest"],  # loaded_forest
        np.asfortranarray([[]]),  # snp_data
        self.replace,  # sample_with_replacement
        False,  # probability
        self.categorical_features_,  # unordered_feature_names
        bool(self.categorical_features_),  # use_unordered_features
        self.save_memory,
        self.split_rule_,
        [],  # case_weights
        False,  # use_case_weights
        self.class_weights or [],
        False,  # predict_all
        self.keep_inbag,
        [1],  # sample_fraction
        0.5,  # alpha
        0.1,  # minprop
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        False,  # use_sparse_data
        self.order_snps_,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        self.use_regularization_factor_,
        self.regularization_usedepth,
    )
    predictions = np.atleast_2d(np.array(result["predictions"]))
    return predictions[:, self.ranger_class_order_]
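
# A hedged usage sketch for the method above. The estimator class name
# ``RangerForestClassifier`` is an assumption for illustration; any
# scikit-learn-style wrapper exposing this ``predict_proba`` behaves the
# same way:
#
#     import numpy as np
#     from sklearn.datasets import load_iris
#
#     X, y = load_iris(return_X_y=True)
#     clf = RangerForestClassifier(n_estimators=100)  # assumed class name
#     clf.fit(X, y)
#     proba = clf.predict_proba(X)  # shape (n_samples, n_classes)
#     assert np.allclose(proba.sum(axis=1), 1.0)
#     # columns follow self.classes_ because the result is reindexed
#     # with self.ranger_class_order_ on the way out
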
def _get_terminal_node_forest(self, X):
    """Get a terminal node forest for X.

    :param array2d X: prediction input features
    """
    # many fields are defaulted here because they are unused in prediction mode
    forest = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray([[]]),
        self.feature_names_,  # variable_names
        0,  # m_try
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        False,  # write_forest
        0,  # importance_mode
        0,  # min_node_size
        [],  # split_select_weights
        False,  # use_split_select_weights
        [],  # always_split_feature_names
        False,  # use_always_split_feature_names
        True,  # prediction_mode
        self.ranger_forest_["forest"],  # loaded_forest
        np.asfortranarray([[]]),  # snp_data
        True,  # sample_with_replacement
        False,  # probability
        [],  # unordered_feature_names
        False,  # use_unordered_features
        False,  # save_memory
        1,  # split_rule
        [],  # case_weights
        False,  # use_case_weights
        [],  # class_weights
        False,  # predict_all
        self.keep_inbag,
        [1],  # sample_fraction
        0,  # alpha
        0,  # minprop
        self.holdout,
        2,  # prediction_type (terminal nodes)
        1,  # num_random_splits
        False,  # use_sparse_data
        False,  # order_snps_
        False,  # oob_error
        0,  # max_depth
        [],  # inbag
        False,  # use_inbag
        [],  # regularization_factor_
        False,  # use_regularization_factor_
        False,  # regularization_usedepth
    )
    return forest
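
# With prediction_type=2, the returned forest's "predictions" entry holds the
# terminal node index of each sample in each tree rather than a prediction.
# A hedged sketch of consuming it (``est`` is an assumed fitted estimator):
#
#     nodes = np.array(
#         est._get_terminal_node_forest(X)["predictions"]
#     ).astype(int)
#     nodes.shape  # (n_samples, n_estimators)
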
def fit(self, X, y, sample_weight=None):
    """Fit the ranger random forest using training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d sample_weight: optional weights for input samples
    """
    self.tree_type_ = 3  # tree_type, TREE_REGRESSION

    # Check input
    X, y = self._validate_data(X, y)

    # Check the init parameters
    self._validate_parameters(X, y, sample_weight)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        use_sample_weight = True
        # ranger does additional rng on samples if weights are passed.
        # if the weights are all ones, we don't want that extra rng.
        if np.array_equal(np.unique(sample_weight), np.array([1.0])):
            sample_weight = []
            use_sample_weight = False
    else:
        sample_weight = []
        use_sample_weight = False

    # Set X info
    self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    if self.always_split_features is not None:
        always_split_features = [
            str(c).encode() for c in self.always_split_features
        ]
    else:
        always_split_features = []

    # Fit the forest
    self.ranger_forest_ = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        self.split_select_weights or [],
        bool(self.split_select_weights),  # use_split_select_weights
        always_split_features,  # always_split_feature_names
        bool(always_split_features),  # use_always_split_feature_names
        False,  # prediction_mode
        {},  # loaded_forest
        np.asfortranarray([[]]),  # snp_data
        self.replace,  # sample_with_replacement
        False,  # probability
        self.categorical_features_,  # unordered_feature_names
        bool(self.categorical_features_),  # use_unordered_features
        self.save_memory,
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        [],  # class_weights
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        self.alpha,
        self.minprop,
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        False,  # use_sparse_data
        self.order_snps_,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )

    if self.quantiles:
        # record one shuffled training target per terminal node per tree,
        # to be used for quantile estimation at prediction time
        forest = self._get_terminal_node_forest(X)
        terminal_nodes = np.array(forest["predictions"]).astype(int)
        self.random_node_values_ = np.empty(
            (np.max(terminal_nodes) + 1, self.n_estimators)
        )
        self.random_node_values_[:] = np.nan
        for tree in range(self.n_estimators):
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            self.random_node_values_[terminal_nodes[idx, tree], tree] = y[idx]
    return self
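
# Hedged sketch of how the ``random_node_values_`` table built above can be
# turned into quantile predictions for new data; the method name
# ``predict_quantiles`` and its exact signature are assumptions:
#
#     def predict_quantiles(self, X, quantiles=(0.1, 0.5, 0.9)):
#         forest = self._get_terminal_node_forest(X)
#         nodes = np.array(forest["predictions"]).astype(int)
#         # gather the recorded training target for each sample's terminal
#         # node in each tree; shape (n_samples, n_estimators)
#         values = self.random_node_values_[nodes, np.arange(self.n_estimators)]
#         # nanquantile skips nodes that never received a value
#         return np.nanquantile(values, quantiles, axis=1).transpose()
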
def fit(self, X, y, sample_weight=None):
    """Fit the ranger random forest using training data.

    :param array2d X: training input features
    :param array2d y: training input targets, rows of (bool, float)
        representing (survival, time)
    :param array1d sample_weight: optional weights for input samples
    """
    self.tree_type_ = 5  # tree_type, TREE_SURVIVAL

    # Check input
    X = check_array(X)

    # Convert the 1d array of 2-tuples to a 2d array.
    # ranger expects the time first and the status second; since we follow
    # the scikit-survival convention (status first), we fliplr.
    y = np.fliplr(np.array(y.tolist()))

    # Check the init parameters
    self._validate_parameters(X, y, sample_weight)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        use_sample_weight = True
        # ranger does additional rng on samples if weights are passed.
        # if the weights are all ones, we don't want that extra rng.
        if np.array_equal(np.unique(sample_weight), np.array([1.0])):
            sample_weight = []
            use_sample_weight = False
    else:
        sample_weight = []
        use_sample_weight = False

    # Set X info
    self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    if self.always_split_features is not None:
        always_split_features = [
            str(c).encode() for c in self.always_split_features
        ]
    else:
        always_split_features = []

    # Fit the forest
    self.ranger_forest_ = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray(y.astype("float64")),
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        self.split_select_weights or [],
        bool(self.split_select_weights),  # use_split_select_weights
        always_split_features,  # always_split_variable_names
        bool(always_split_features),  # use_always_split_variable_names
        False,  # prediction_mode
        {},  # loaded_forest
        np.asfortranarray([[]]),  # snp_data
        self.replace,  # sample_with_replacement
        False,  # probability
        self.categorical_features_,  # unordered_feature_names
        bool(self.categorical_features_),  # use_unordered_features
        False,  # save_memory
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        [],  # class_weights
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        self.alpha,
        self.minprop,
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        False,  # use_sparse_data
        self.order_snps_,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )
    self.event_times_ = np.array(
        self.ranger_forest_["forest"]["unique_death_times"]
    )
    # dtype=object to suppress the numpy warning about ragged nested sequences
    self.cumulative_hazard_function_ = np.array(
        self.ranger_forest_["forest"]["cumulative_hazard_function"],
        dtype=object,
    )
    return self
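
# After fitting, the survival attributes can be read directly; a hedged
# illustration (``est`` is an assumed fitted estimator, and ``y`` follows
# the scikit-survival convention of (event, time) records):
#
#     est.fit(X, y)
#     est.event_times_                 # sorted unique death times, shape (k,)
#     est.cumulative_hazard_function_  # the forest's stored cumulative hazard
#                                      # values, ragged across trees, hence
#                                      # dtype=object
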
def fit(self, X, y, sample_weight=None):
    """Fit the ranger random forest using training data.

    :param array2d X: training input features
    :param array1d y: training input target classes
    :param array1d sample_weight: optional weights for input samples
    """
    self.tree_type_ = 9  # tree_type, TREE_PROBABILITY enables predict_proba

    # Check input
    X, y = self._validate_data(X, y)
    check_classification_targets(y)

    # Check the init parameters
    self._validate_parameters(X, y, sample_weight)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        use_sample_weight = True
        # ranger does additional rng on samples if weights are passed.
        # if the weights are all ones, we don't want that extra rng.
        if np.array_equal(np.unique(sample_weight), np.array([1.0])):
            sample_weight = []
            use_sample_weight = False
    else:
        sample_weight = []
        use_sample_weight = False

    # Map classes to indices
    y = np.copy(y)
    self.classes_, y = np.unique(y, return_inverse=True)
    self.n_classes_ = len(self.classes_)

    # Set X info
    self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
    self._check_n_features(X, reset=True)

    if self.always_split_features is not None:
        always_split_features = [
            str(c).encode() for c in self.always_split_features
        ]
    else:
        always_split_features = []

    # Fit the forest
    self.ranger_forest_ = ranger.ranger(
        self.tree_type_,
        np.asfortranarray(X.astype("float64")),
        np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
        self.feature_names_,  # variable_names
        self.mtry_,
        self.n_estimators,  # num_trees
        self.verbose,
        self.seed,
        self.n_jobs_,  # num_threads
        True,  # write_forest
        self.importance_mode_,
        self.min_node_size,
        self.split_select_weights or [],
        bool(self.split_select_weights),  # use_split_select_weights
        always_split_features,  # always_split_variable_names
        bool(always_split_features),  # use_always_split_variable_names
        False,  # prediction_mode
        {},  # loaded_forest
        np.asfortranarray([[]]),  # snp_data
        self.replace,  # sample_with_replacement
        False,  # probability
        self.categorical_features_,  # unordered_variable_names
        bool(self.categorical_features_),  # use_unordered_variable_names
        self.save_memory,
        self.split_rule_,
        sample_weight,  # case_weights
        use_sample_weight,  # use_case_weights
        self.class_weights or [],
        False,  # predict_all
        self.keep_inbag,
        self.sample_fraction_,
        0.5,  # alpha, ignored; maxstat splitting can't be used for classification
        0.1,  # minprop, ignored; maxstat splitting can't be used for classification
        self.holdout,
        1,  # prediction_type
        self.num_random_splits,
        False,  # use_sparse_data
        self.order_snps_,
        self.oob_error,
        self.max_depth,
        self.inbag or [],
        bool(self.inbag),  # use_inbag
        self.regularization_factor_,
        False,  # use_regularization_factor
        self.regularization_usedepth,
    )
    # ranger stores class_values in encounter order; argsort recovers the
    # permutation back to the sorted order of self.classes_
    self.ranger_class_order_ = np.argsort(
        np.array(self.ranger_forest_["forest"]["class_values"]).astype(int)
    )
    return self
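
# Why the argsort at the end matters: ranger records "class_values" in the
# order it first encountered the (already index-encoded) labels, while
# self.classes_ is sorted by np.unique. A hedged worked example:
#
#     # suppose ranger saw the encoded labels in the order [2., 0., 1.]
#     class_values = np.array([2.0, 0.0, 1.0])
#     order = np.argsort(class_values.astype(int))  # -> [1, 2, 0]
#     # predictions[:, order] then puts the probability columns back into
#     # the 0, 1, 2 order that matches self.classes_
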