Ejemplo n.º 1
0
    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=False,
        index_id=None,
        time_bins=None,
    ):
        """
        Fit XGBoost model to predict a value that is interpreted as a risk metric.
        Fit Weibull Regression model using risk metric as only independent variable.

        Args:
            X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data in the format of a list of tuples [(X, y)]
                if user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions

        Returns:
            XGBSEStackedWeibull: Trained XGBSEStackedWeibull instance
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(X_val, y_val,
                                                self.xgb_params["objective"])
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )

        # predicting risk from XGBoost
        train_risk = self.bst.predict(dtrain)

        # replacing 0 by minimum positive value in df
        # so Weibull can be fitted
        min_positive_value = T_train[T_train > 0].min()
        T_train = np.clip(T_train, min_positive_value, None)

        # creating df to use lifelines API
        weibull_train_df = pd.DataFrame({
            "risk": train_risk,
            "duration": T_train,
            "event": E_train
        })

        # fitting weibull aft
        self.weibull_aft = WeibullAFTFitter(**self.weibull_params)
        self.weibull_aft.fit(weibull_train_df,
                             "duration",
                             "event",
                             ancillary=True)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()

            index_leaves = self.bst.predict(dtrain, pred_leaf=True)
            self.tree = BallTree(index_leaves, metric="hamming")

        self.index_id = index_id

        return self
    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=False,
        index_id=None,
        time_bins=None,
    ):
        """
        Transform feature space by fitting a XGBoost model and returning its leaf indices.
        Leaves are transformed and considered as dummy variables to fit multiple logistic
        regression models to each evaluated time bin.

        Args:
            X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data in the format of a list of tuples [(X, y)]
                if user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions


        Returns:
            XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(
                X_val, y_val, self.xgb_params["objective"]
            )
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )

        # predicting and encoding leaves
        self.encoder = OneHotEncoder()
        leaves = self.bst.predict(dtrain, pred_leaf=True)
        leaves_encoded = self.encoder.fit_transform(leaves)

        # convert targets for using with logistic regression
        self.targets, self.time_bins = _build_multi_task_targets(
            E_train, T_train, self.time_bins
        )

        # fitting LR for several targets
        self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()

            index_leaves = self.bst.predict(dtrain, pred_leaf=True)
            self.tree = BallTree(index_leaves, metric="hamming")

        self.index_id = index_id

        return self
    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=True,
        index_id=None,
        time_bins=None,
    ):
        """
        Transform feature space by fitting a XGBoost model and outputting its leaf indices.
        Build search index in the new space to allow nearest neighbor queries at scoring time.

        Args:
            X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data in the format of a list of tuples [(X, y)]
                if user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions

        Returns:
            XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors
        """

        self.E_train, self.T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(self.T_train, self.E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(
                X_val, y_val, self.xgb_params["objective"]
            )
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )
        self.feature_importances_ = self.bst.get_score()

        # creating nearest neighbor index
        leaves = self.bst.predict(dtrain, pred_leaf=True)

        self.tree = BallTree(leaves, metric="hamming", leaf_size=40)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()
        self.index_id = index_id

        return self
    def fit(
        self,
        X,
        y,
        persist_train=True,
        index_id=None,
        time_bins=None,
        ci_width=0.683,
        **xgb_kwargs,
    ):
        """
        Fit a single decision tree using xgboost. For each leaf in the tree,
        build a Kaplan-Meier estimator.

        !!! Note
            * Differently from `XGBSEKaplanNeighbors`, in `XGBSEKaplanTree`, the width of
            the confidence interval (`ci_width`) must be specified at fit time.

        Args:

            X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions

            ci_width (Float): Width of confidence interval

        Returns:
            XGBSEKaplanTree: Trained instance of XGBSEKaplanTree
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # training XGB
        self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs)
        self.feature_importances_ = self.bst.get_score()

        # getting leaves
        leaves = self.bst.predict(dtrain, pred_leaf=True)

        # organizing elements per leaf
        leaf_neighs = (
            pd.DataFrame({"leaf": leaves})
            .groupby("leaf")
            .apply(lambda x: list(x.index))
        )

        # getting T and E for each leaf
        T_leaves = _align_leaf_target(leaf_neighs, T_train)
        E_leaves = _align_leaf_target(leaf_neighs, E_train)

        # calculating z-score from width
        z = st.norm.ppf(0.5 + ci_width / 2)

        # vectorized (very fast!) implementation of Kaplan Meier curves
        (
            self._train_survival,
            self._train_upper_ci,
            self._train_lower_ci,
        ) = calculate_kaplan_vectorized(T_leaves, E_leaves, time_bins, z)

        # adding leaf indexes
        self._train_survival = self._train_survival.set_index(leaf_neighs.index)
        self._train_upper_ci = self._train_upper_ci.set_index(leaf_neighs.index)
        self._train_lower_ci = self._train_lower_ci.set_index(leaf_neighs.index)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()
            self.tree = BallTree(leaves.reshape(-1, 1), metric="hamming", leaf_size=40)
        self.index_id = index_id

        return self
    X_valid,
    T_train,
    T_test,
    T_valid,
    E_train,
    E_test,
    E_valid,
    y_train,
    y_test,
    y_valid,
    features,
) = get_data()

# generating Kaplan Meier for all tests

time_bins = get_time_bins(T_train, E_train, 100)

mean, high, low = calculate_kaplan_vectorized(T_train.values.reshape(1, -1),
                                              E_train.values.reshape(1, -1),
                                              time_bins)

km_survival = pd.concat([mean] * len(y_train))
km_survival = km_survival.reset_index(drop=True)

# generating xgbse predictions for all tests

xgbse_model = XGBSEDebiasedBCE()

xgbse_model.fit(
    X_train,
    y_train,