Example #1
0
    def _get_terminal_node_forest(self, X):
        """Compute the terminal node ids of ``X`` for every tree.

        Runs ``ranger.ranger`` in prediction mode against the forest stored in
        ``self.ranger_forest_["forest"]`` with the prediction type set to
        terminal nodes, and returns the result of that call as-is.

        The returned dictionary's ``predictions`` entry is a list of lists: one
        outer entry per record of ``X``, each holding the terminal node reached
        in every tree.

        :param array2d X: prediction input features
        """
        # ranger takes everything positionally. Most of these settings only
        # matter while training, so they are filled with neutral placeholders.
        call_args = (
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray([[]]),
            self.feature_names_,  # variable_names
            0,  # m_try
            getattr(self, "n_estimators", 1),  # num_trees, 1 when the attribute is absent
            self.verbose,
            self.seed,
            getattr(self, "n_jobs_", 1),  # num_threads, 1 when the attribute is absent
            False,  # write_forest
            0,  # importance_mode
            0,  # min_node_size
            [],  # split_select_weights
            False,  # use_split_select_weights
            [],  # always_split_feature_names
            False,  # use_always_split_feature_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            True,  # sample_with_replacement
            False,  # probability
            [],  # unordered_feature_names
            False,  # use_unordered_features
            False,  # save_memory
            1,  # split_rule
            [],  # case_weights
            False,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            0,  # alpha
            0,  # minprop
            self.holdout,
            2,  # prediction_type (terminal nodes)
            1,  # num_random_splits
            False,  # oob_error
            0,  # max_depth
            [],  # inbag
            False,  # use_inbag
            [],  # regularization_factor_
            False,  # use_regularization_factor_
            False,  # regularization_usedepth
        )
        return ranger.ranger(*call_args)
Example #2
0
    def predict_proba(self, X):
        """Predict class probabilities for each record of X.

        The columns of the result are reordered with
        ``self.ranger_class_order_`` before being returned.

        :param array2d X: prediction input features
        """
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)

        features = np.asfortranarray(X.astype("float64"))
        no_targets = np.asfortranarray([[]])  # prediction carries no targets

        raw = ranger.ranger(
            self.tree_type_,
            features,
            no_targets,
            self.feature_names_,  # variable_names
            self.mtry_,
            1,  # num_trees
            self.verbose,
            self.seed,
            1,  # num_threads
            False,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            [],  # split_select_weights
            False,  # use_split_select_weights
            [],  # always_split_variable_names
            False,  # use_always_split_variable_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            [],  # unordered_feature_names
            False,  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            [],  # case_weights
            False,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            0.5,  # alpha
            0.1,  # minprop
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            self.use_regularization_factor_,
            self.regularization_usedepth,
        )

        # Force a 2d layout so the column reordering below always applies.
        probabilities = np.array(raw["predictions"])
        probabilities = np.atleast_2d(probabilities)
        column_order = self.ranger_class_order_
        return probabilities[:, column_order]
Example #3
0
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        split_select_weights=None,
        always_split_features=None,
        categorical_features=None,
    ):
        """Fit the ranger random forest using training data.

        After fitting, ``event_times_`` and ``cumulative_hazard_function_`` are
        populated from the trained forest. When ``self.enable_tree_details`` is
        truthy, the leaf samples, node values and class count are set as well.

        :param array2d X: training input features
        :param array2d y: training input targets, rows of (bool, float)
            representing (survival, time). Any array-like of such rows is
            accepted, e.g. a structured array or a plain list of 2-tuples.
        :param array1d sample_weight: optional weights for input samples
        :param list split_select_weights: Vector of weights between 0 and 1 of
            probabilities to select features for splitting. Can be a single vector or a
            vector of vectors with one vector per tree.
        :param list always_split_features:  Features which should always be selected for
            splitting. A list of column index values.
        :param list categorical_features: A list of column index values which should be
            considered categorical, or unordered.
        :return: the fitted estimator itself
        """
        self.tree_type_ = 5  # tree_type, TREE_SURVIVAL

        # Check input
        X = check_array(X)

        # Convert the rows of y to a plain 2d array. Going through np.asarray
        # first means plain sequences (which have no ``tolist`` method) are
        # accepted as well as arrays.
        # ranger expects the time first, and status second
        # since we follow the scikit-survival convention, we fliplr
        y = np.fliplr(np.array(np.asarray(y).tolist()))

        # Check the init parameters
        self._validate_parameters(X, y, sample_weight)

        # Set X info
        self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
        self._check_n_features(X, reset=True)

        # Check weights
        sample_weight, use_sample_weight = self._check_sample_weight(
            sample_weight, X)
        (
            always_split_features,
            use_always_split_features,
        ) = self._check_always_split_features(always_split_features)

        (
            categorical_features,
            use_categorical_features,
        ) = self._check_categorical_features(categorical_features)

        (
            split_select_weights,
            use_split_select_weights,
        ) = self._check_split_select_weights(split_select_weights)

        # Fit the forest; every argument is positional, so the order matters.
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray(y.astype("float64")),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            split_select_weights,
            use_split_select_weights,
            always_split_features,  # always_split_variable_names
            use_always_split_features,  # use_always_split_variable_names
            False,  # prediction_mode
            {},  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            categorical_features,  # unordered_feature_names
            use_categorical_features,  # use_unordered_features
            False,  # save_memory
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            # NOTE(review): this flag is hard-coded to False, so
            # regularization_factor_ looks unused while training - confirm
            # that this is intended.
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )
        self.event_times_ = np.array(
            self.ranger_forest_["forest"]["unique_death_times"])
        # dtype to suppress warning about ragged nested sequences
        self.cumulative_hazard_function_ = np.array(
            self.ranger_forest_["forest"]["cumulative_hazard_function"],
            dtype=object)

        if self.enable_tree_details:
            # An empty weight vector is replaced by a weight of one per record.
            sample_weight = sample_weight if len(
                sample_weight) > 0 else np.ones(len(X))
            terminal_node_forest = self._get_terminal_node_forest(X)
            terminal_nodes = np.atleast_2d(
                terminal_node_forest["predictions"]).astype(int)
            self._set_leaf_samples(terminal_nodes)
            self._set_node_values(y, sample_weight)
            self._set_n_classes()
        return self
Example #4
0
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        class_weights=None,
        split_select_weights=None,
        always_split_features=None,
        categorical_features=None,
    ):
        """Train a single ranger regression tree on the given data.

        Once the tree is fitted, the training records are run through it again
        to collect their terminal nodes, from which the leaf samples and node
        values are set.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param dict class_weights: not used by this method; an empty dict is
            always handed to ranger.
        :param list split_select_weights: Vector of weights between 0 and 1 of
            probabilities to select features for splitting. Can be a single vector or a
            vector of vectors with one vector per tree.
        :param list always_split_features:  Features which should always be selected for
            splitting. A list of column index values.
        :param list categorical_features: A list of column index values which should be
            considered categorical, or unordered.
        :return: the fitted estimator itself
        """
        self.tree_type_ = 3  # tree_type, TREE_REGRESSION

        # Validate the data, then the init parameters
        X, y = self._validate_data(X, y)
        self._validate_parameters(X, y, sample_weight)

        # Features are named by their column position, encoded as bytes
        n_columns = X.shape[1]
        self.feature_names_ = [str(idx).encode() for idx in range(n_columns)]
        self._check_n_features(X, reset=True)

        # Normalise the optional weight / feature-selection inputs
        sample_weight, use_sample_weight = self._check_sample_weight(
            sample_weight, X
        )
        # The flag returned here is not consulted; the call below derives it
        # from the truthiness of the returned feature list instead.
        always_split_features, _ = self._check_always_split_features(
            always_split_features
        )
        categorical_features, use_categorical_features = (
            self._check_categorical_features(categorical_features)
        )
        split_select_weights, use_split_select_weights = (
            self._check_split_select_weights(split_select_weights)
        )

        features = np.asfortranarray(X.astype("float64"))
        # Targets go in as a single float column
        target_column = np.atleast_2d(y).astype("float64").transpose()
        targets = np.asfortranarray(target_column)

        # Fit the tree; every argument is positional, so the order matters.
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            features,
            targets,
            self.feature_names_,  # variable_names
            self.mtry_,
            1,  # num_trees
            self.verbose,
            self.seed,
            1,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            split_select_weights,
            use_split_select_weights,
            always_split_features,  # always_split_feature_names
            bool(always_split_features),  # use_always_split_feature_names
            False,  # prediction_mode
            {},  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            categorical_features,  # unordered_feature_names
            use_categorical_features,  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )

        # An empty weight vector is replaced by a weight of one per record.
        if len(sample_weight) == 0:
            sample_weight = np.ones(len(X))

        leaf_lookup = self._get_terminal_node_forest(X)
        leaf_ids = np.atleast_2d(leaf_lookup["predictions"]).astype(int)
        self._set_leaf_samples(leaf_ids)
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
        return self
Example #5
0
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        class_weights=None,
        split_select_weights=None,
        always_split_features=None,
        categorical_features=None,
    ):
        """Train a single ranger probability tree on the given data.

        Target classes are mapped to integer indices before training. Once the
        tree is fitted, the training records are run through it again to
        collect their terminal nodes, from which the leaf samples and node
        values are set.

        :param array2d X: training input features
        :param array1d y: training input target classes
        :param array1d sample_weight: optional weights for input samples
        :param dict class_weights: A dictionary of outcome classes to weights.
        :param list split_select_weights: Vector of weights between 0 and 1 of
            probabilities to select features for splitting. Can be a single vector or a
            vector of vectors with one vector per tree.
        :param list always_split_features:  Features which should always be selected for
            splitting. A list of column index values.
        :param list categorical_features: A list of column index values which should be
            considered categorical, or unordered.
        :raises ValueError: if ``class_weights`` is given but lacks a weight for
            one of the classes found in ``y``.
        :return: the fitted estimator itself
        """
        self.tree_type_ = 9  # tree_type, TREE_PROBABILITY enables predict_proba

        # Validate the data, then the init parameters
        X, y = self._validate_data(X, y)
        check_classification_targets(y)
        self._validate_parameters(X, y, sample_weight)

        # Replace each class label by its index into the sorted unique labels
        self.classes_, y = np.unique(np.copy(y), return_inverse=True)
        self.n_classes_ = len(self.classes_)

        # Features are named by their column position, encoded as bytes
        n_columns = X.shape[1]
        self.feature_names_ = [str(idx).encode() for idx in range(n_columns)]
        self._check_n_features(X, reset=True)

        # Re-key the class weights by class index rather than class label
        if class_weights is not None:
            try:
                class_weights = {
                    position: class_weights[label]
                    for position, label in enumerate(self.classes_)
                }
            except KeyError:
                raise ValueError(
                    "class weights must have a weight for each class"
                ) from None
        else:
            class_weights = {}

        # Normalise the optional weight / feature-selection inputs
        sample_weight, use_sample_weight = self._check_sample_weight(
            sample_weight, X
        )
        # The flag returned here is not consulted; the call below derives it
        # from the truthiness of the returned feature list instead.
        always_split_features, _ = self._check_always_split_features(
            always_split_features
        )
        categorical_features, use_categorical_features = (
            self._check_categorical_features(categorical_features)
        )
        split_select_weights, use_split_select_weights = (
            self._check_split_select_weights(split_select_weights)
        )

        features = np.asfortranarray(X.astype("float64"))
        # Targets go in as a single float column
        target_column = np.atleast_2d(y).astype("float64").transpose()
        targets = np.asfortranarray(target_column)

        # Fit the tree; every argument is positional, so the order matters.
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            features,
            targets,
            self.feature_names_,  # variable_names
            self.mtry_,
            1,  # num_trees
            self.verbose,
            self.seed,
            1,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            split_select_weights,
            use_split_select_weights,
            always_split_features,  # always_split_variable_names
            bool(always_split_features),  # use_always_split_variable_names
            False,  # prediction_mode
            {},  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            categorical_features,  # unordered_variable_names
            use_categorical_features,  # use_unordered_variable_names
            self.save_memory,
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            class_weights,
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            0.5,  # alpha, ignored because maxstat can't be used on classification
            0.1,  # minprop, ignored because maxstat can't be used on classification
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )

        # Sorting positions of the forest's class values, kept for reordering
        # prediction columns later.
        forest_classes = np.array(self.ranger_forest_["forest"]["class_values"])
        self.ranger_class_order_ = np.argsort(forest_classes.astype(int))

        # An empty weight vector is replaced by a weight of one per record.
        if len(sample_weight) == 0:
            sample_weight = np.ones(len(X))

        leaf_lookup = self._get_terminal_node_forest(X)
        leaf_ids = np.atleast_2d(leaf_lookup["predictions"]).astype(int)
        self._set_leaf_samples(leaf_ids)
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
        return self
Example #6
0
    def predict(self, X, quantiles=None):
        """Predict the regression target for each record of X.

        Quantiles are predicted instead when either the ``quantiles`` argument
        is given or ``self.quantiles`` is a list or array; the argument takes
        precedence over the attribute.

        :param array2d X: prediction input features
        :param list(float) quantiles: a list of quantiles on which to predict.
          If the list contains a single quantile, the result will be a 1darray.
          If there are multiple quantiles, the result will be a 2darray with
          columns corresponding to respective quantiles. If quantiles are not provided
          the result is the regression target estimate.
        """
        # Fall back on the estimator-level quantiles only when none were passed
        if quantiles is None and isinstance(self.quantiles, (list, np.ndarray)):
            quantiles = self.quantiles
        if quantiles is not None:
            return self.predict_quantiles(X, quantiles)

        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)

        features = np.asfortranarray(X.astype("float64"))
        no_targets = np.asfortranarray([[]])  # prediction carries no targets

        raw = ranger.ranger(
            self.tree_type_,
            features,
            no_targets,
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            False,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            [],  # always_split_feature_names
            False,  # use_always_split_feature_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            [],  # unordered_feature_names
            False,  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            [],  # case_weights
            False,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            self.use_regularization_factor_,
            self.regularization_usedepth,
        )
        estimates = raw["predictions"]
        return np.array(estimates)