Exemple #1
0
    def predict_proba(self, X):
        """Predict probabilities for classes from X.

        :param array2d X: prediction input features
        """
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)

        result = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray([[]]),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            False,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            [],  # always_split_variable_names
            False,  # use_always_split_variable_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            np.asfortranarray([[]]),  # snp_data
            self.replace,  # sample_with_replacement
            False,  # probability
            self.categorical_features_,  # unordered_feature_names
            bool(self.categorical_features_),  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            [],  # case_weights
            False,  # use_case_weights
            self.class_weights or [],
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            0.5,  # alpha
            0.1,  # minprop
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            False,  # use_sparse_data
            self.order_snps_,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            self.use_regularization_factor_,
            self.regularization_usedepth,
        )
        predictions = np.atleast_2d(np.array(result["predictions"]))
        return predictions[:, self.ranger_class_order_]
Exemple #2
0
    def _get_terminal_node_forest(self, X):
        """Get a terminal node forest for X.

        :param array2d X: prediction input features
        """
        # many fields defaulted here which are unused
        forest = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray([[]]),
            self.feature_names_,  # variable_names
            0,  # m_try
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            False,  # write_forest
            0,  # importance_mode
            0,  # min_node_size
            [],  # split_select_weights
            False,  # use_split_select_weights
            [],  # always_split_feature_names
            False,  # use_always_split_feature_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            np.asfortranarray([[]]),  # snp_data
            True,  # sample_with_replacement
            False,  # probability
            [],  # unordered_feature_names
            False,  # use_unordered_features
            False,  # save_memory
            1,  # split_rule
            [],  # case_weights
            False,  # use_case_weights
            [],  # class_weights
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            0,  # alpha
            0,  # minprop
            self.holdout,
            2,  # prediction_type (terminal nodes)
            1,  # num_random_splits
            False,  # use_sparse_data
            False,  # order_snps_
            False,  # oob_error
            0,  # max_depth
            [],  # inbag
            False,  # use_inbag
            [],  # regularization_factor_
            False,  # use_regularization_factor_
            False,  # regularization_usedepth
        )
        return forest
Exemple #3
0
    def fit(self, X, y, sample_weight=None):
        """Fit the ranger random forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        """
        self.tree_type_ = 3  # tree_type, TREE_REGRESSION

        # Check input
        X, y = self._validate_data(X, y)

        # Check the init parameters
        self._validate_parameters(X, y, sample_weight)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)
            use_sample_weight = True
            # ranger does additional rng on samples if weights are passed.
            # if the weights are ones, then we dont want that extra rng.
            if np.array_equal(np.unique(sample_weight), np.array([1.0])):
                sample_weight = []
                use_sample_weight = False
        else:
            sample_weight = []
            use_sample_weight = False

        # Set X info
        self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
        self._check_n_features(X, reset=True)

        if self.always_split_features is not None:
            always_split_features = [
                str(c).encode() for c in self.always_split_features
            ]
        else:
            always_split_features = []

        # Fit the forest
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            always_split_features,  # always_split_feature_names
            bool(always_split_features),  # use_always_split_feature_names
            False,  # prediction_mode
            {},  # loaded_forest
            np.asfortranarray([[]]),  # snp_data
            self.replace,  # sample_with_replacement
            False,  # probability
            self.categorical_features_,  # unordered_feature_names
            bool(self.categorical_features_),  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            [],  # class_weights
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            False,  # use_sparse_data
            self.order_snps_,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )

        if self.quantiles:
            forest = self._get_terminal_node_forest(X)
            terminal_nodes = np.array(forest["predictions"]).astype(int)
            self.random_node_values_ = np.empty(
                (np.max(terminal_nodes) + 1, self.n_estimators))
            self.random_node_values_[:] = np.nan
            for tree in range(self.n_estimators):
                idx = np.arange(X.shape[0])
                np.random.shuffle(idx)
                self.random_node_values_[terminal_nodes[idx, tree],
                                         tree] = y[idx]

        return self
    def fit(self, X, y, sample_weight=None):
        """Fit the ranger random forest using training data.

        :param array2d X: training input features
        :param array2d y: training input targets, rows of (bool, float)
            representing (survival, time)
        :param array1d sample_weight: optional weights for input samples
        """
        self.tree_type_ = 5  # tree_type, TREE_SURVIVAL

        # Check input
        X = check_array(X)

        # convert 1d array of 2tuples to 2d array
        # ranger expects the time first, and status second
        # since we follow the scikit-survival convention, we fliplr
        y = np.fliplr(np.array(y.tolist()))

        # Check the init parameters
        self._validate_parameters(X, y, sample_weight)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)
            use_sample_weight = True
            # ranger does additional rng on samples if weights are passed.
            # if the weights are ones, then we dont want that extra rng.
            if np.array_equal(np.unique(sample_weight), np.array([1.0])):
                sample_weight = []
                use_sample_weight = False
        else:
            sample_weight = []
            use_sample_weight = False

        # Set X info
        self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
        self._check_n_features(X, reset=True)

        if self.always_split_features is not None:
            always_split_features = [
                str(c).encode() for c in self.always_split_features
            ]
        else:
            always_split_features = []

        # Fit the forest
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray(y.astype("float64")),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            always_split_features,  # always_split_variable_names
            bool(always_split_features),  # use_always_split_variable_names
            False,  # prediction_mode
            {},  # loaded_forest
            np.asfortranarray([[]]),  # snp_data
            self.replace,  # sample_with_replacement
            False,  # probability
            self.categorical_features_,  # unordered_feature_names
            bool(self.categorical_features_),  # use_unordered_features
            False,  # save_memory
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            [],  # class_weights
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            False,  # use_sparse_data
            self.order_snps_,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )
        self.event_times_ = np.array(
            self.ranger_forest_["forest"]["unique_death_times"])
        # dtype to suppress warning about ragged nested sequences
        self.cumulative_hazard_function_ = np.array(
            self.ranger_forest_["forest"]["cumulative_hazard_function"],
            dtype=object)
        return self
Exemple #5
0
    def fit(self, X, y, sample_weight=None):
        """Fit the ranger random forest using training data.

        :param array2d X: training input features
        :param array1d y: training input target classes
        :param array1d sample_weight: optional weights for input samples
        """
        self.tree_type_ = 9  # tree_type, TREE_PROBABILITY enables predict_proba

        # Check input
        X, y = self._validate_data(X, y)
        check_classification_targets(y)

        # Check the init parameters
        self._validate_parameters(X, y, sample_weight)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)
            use_sample_weight = True
            # ranger does additional rng on samples if weights are passed.
            # if the weights are ones, then we dont want that extra rng.
            if np.array_equal(np.unique(sample_weight), np.array([1.0])):
                sample_weight = []
                use_sample_weight = False
        else:
            sample_weight = []
            use_sample_weight = False

        # Map classes to indices
        y = np.copy(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)

        # Set X info
        self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
        self._check_n_features(X, reset=True)

        if self.always_split_features is not None:
            always_split_features = [
                str(c).encode() for c in self.always_split_features
            ]
        else:
            always_split_features = []

        # Fit the forest
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            always_split_features,  # always_split_variable_names
            bool(always_split_features),  # use_always_split_variable_names
            False,  # prediction_mode
            {},  # loaded_forest
            np.asfortranarray([[]]),  # snp_data
            self.replace,  # sample_with_replacement
            False,  # probability
            self.categorical_features_,  # unordered_variable_names
            bool(self.categorical_features_),  # use_unordered_variable_names
            self.save_memory,
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            self.class_weights or [],
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            0.5,  # alpha, ignored because maxstat can't be used on classification
            0.1,  # minprop, ignored because maxstat can't be used on classification
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            False,  # use_sparse_data
            self.order_snps_,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )
        self.ranger_class_order_ = np.argsort(
            np.array(
                self.ranger_forest_["forest"]["class_values"]).astype(int))
        return self