def fit(
    self,
    X,
    y,
    num_boost_round=1000,
    validation_data=None,
    early_stopping_rounds=None,
    verbose_eval=0,
    persist_train=False,
    index_id=None,
    time_bins=None,
):
    """
    Fit an XGBoost model to predict a value that is interpreted as a risk metric.
    Fit a Weibull Regression model using the risk metric as its only independent variable.

    Args:
        X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

        y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
            and time of event or time of censoring as second field.

        num_boost_round (Int): Number of boosting iterations.

        validation_data (Tuple): Validation data in the format (X_val, y_val)
            if the user desires to use early stopping

        early_stopping_rounds (Int): Activates early stopping.
            Validation metric needs to improve at least once
            in every **early_stopping_rounds** round(s) to continue training.
            See xgboost.train documentation.

        verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

        persist_train (Bool): Whether or not to persist training data to use
            explainability through prototypes

        index_id (pd.Index): User defined index if intended to use
            explainability through prototypes

        time_bins (np.array): Specified time windows to use when making survival predictions

    Returns:
        XGBSEStackedWeibull: Trained XGBSEStackedWeibull instance
    """

    E_train, T_train = convert_y(y)
    if time_bins is None:
        time_bins = get_time_bins(T_train, E_train)
    self.time_bins = time_bins

    # converting data to xgb format
    dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

    # converting validation data to xgb format
    evals = ()
    if validation_data:
        X_val, y_val = validation_data
        dvalid = convert_data_to_xgb_format(
            X_val, y_val, self.xgb_params["objective"]
        )
        evals = [(dvalid, "validation")]

    # training XGB
    self.bst = xgb.train(
        self.xgb_params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        evals=evals,
        verbose_eval=verbose_eval,
    )

    # predicting risk from XGBoost
    train_risk = self.bst.predict(dtrain)

    # replacing 0 with the minimum positive duration
    # so the Weibull AFT model can be fitted
    min_positive_value = T_train[T_train > 0].min()
    T_train = np.clip(T_train, min_positive_value, None)

    # creating df to use lifelines API
    weibull_train_df = pd.DataFrame(
        {"risk": train_risk, "duration": T_train, "event": E_train}
    )

    # fitting weibull aft
    self.weibull_aft = WeibullAFTFitter(**self.weibull_params)
    self.weibull_aft.fit(weibull_train_df, "duration", "event", ancillary=True)

    if persist_train:
        self.persist_train = True
        if index_id is None:
            index_id = X.index.copy()

        index_leaves = self.bst.predict(dtrain, pred_leaf=True)
        self.tree = BallTree(index_leaves, metric="hamming")

    self.index_id = index_id

    return self
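# Hedged usage sketch for the method above, not part of the library source.
# It assumes the enclosing class is xgbse's XGBSEStackedWeibull, that
# convert_to_structured is available from xgbse.converters, and that the
# DataFrame/column names ("duration", "event") are illustrative.
#
#     from xgbse import XGBSEStackedWeibull
#     from xgbse.converters import convert_to_structured
#
#     # build the structured target expected by fit()
#     y_train = convert_to_structured(df_train["duration"], df_train["event"])
#     y_val = convert_to_structured(df_val["duration"], df_val["event"])
#
#     model = XGBSEStackedWeibull()
#     model.fit(
#         X_train,
#         y_train,
#         num_boost_round=500,
#         validation_data=(X_val, y_val),
#         early_stopping_rounds=10,
#     )
#     survival_curves = model.predict(X_test)  # one column per time bin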
def fit(
    self,
    X,
    y,
    persist_train=True,
    index_id=None,
    time_bins=None,
    ci_width=0.683,
    **xgb_kwargs,
):
    """
    Fit a single decision tree using xgboost. For each leaf in the tree,
    build a Kaplan-Meier estimator.

    !!! Note
        * Unlike `XGBSEKaplanNeighbors`, in `XGBSEKaplanTree` the width of
        the confidence interval (`ci_width`) must be specified at fit time.

    Args:
        X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

        y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
            and time of event or time of censoring as second field.

        persist_train (Bool): Whether or not to persist training data to use
            explainability through prototypes

        index_id (pd.Index): User defined index if intended to use
            explainability through prototypes

        time_bins (np.array): Specified time windows to use when making survival predictions

        ci_width (Float): Width of the confidence interval

    Returns:
        XGBSEKaplanTree: Trained instance of XGBSEKaplanTree
    """

    E_train, T_train = convert_y(y)
    if time_bins is None:
        time_bins = get_time_bins(T_train, E_train)
    self.time_bins = time_bins

    # converting data to xgb format
    dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

    # training XGB
    self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs)
    self.feature_importances_ = self.bst.get_score()

    # getting leaves
    leaves = self.bst.predict(dtrain, pred_leaf=True)

    # organizing elements per leaf
    leaf_neighs = (
        pd.DataFrame({"leaf": leaves})
        .groupby("leaf")
        .apply(lambda x: list(x.index))
    )

    # getting T and E for each leaf
    T_leaves = _align_leaf_target(leaf_neighs, T_train)
    E_leaves = _align_leaf_target(leaf_neighs, E_train)

    # calculating z-score from width
    z = st.norm.ppf(0.5 + ci_width / 2)

    # vectorized (very fast!) implementation of Kaplan-Meier curves
    (
        self._train_survival,
        self._train_upper_ci,
        self._train_lower_ci,
    ) = calculate_kaplan_vectorized(T_leaves, E_leaves, time_bins, z)

    # adding leaf indexes
    self._train_survival = self._train_survival.set_index(leaf_neighs.index)
    self._train_upper_ci = self._train_upper_ci.set_index(leaf_neighs.index)
    self._train_lower_ci = self._train_lower_ci.set_index(leaf_neighs.index)

    if persist_train:
        self.persist_train = True
        if index_id is None:
            index_id = X.index.copy()
        self.tree = BallTree(leaves.reshape(-1, 1), metric="hamming", leaf_size=40)

    self.index_id = index_id

    return self
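# Hedged usage sketch, not part of the library source. It assumes the
# enclosing class is xgbse's XGBSEKaplanTree, that convert_to_structured is
# available from xgbse.converters, and that predict() accepts return_ci;
# column names are illustrative.
#
#     from xgbse import XGBSEKaplanTree
#     from xgbse.converters import convert_to_structured
#
#     y_train = convert_to_structured(df_train["duration"], df_train["event"])
#
#     # ci_width is fixed at fit time (see the note in the docstring above)
#     tree = XGBSEKaplanTree()
#     tree.fit(X_train, y_train, ci_width=0.95)
#     mean, upper_ci, lower_ci = tree.predict(X_test, return_ci=True)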
def fit(
    self,
    X,
    y,
    num_boost_round=1000,
    validation_data=None,
    early_stopping_rounds=None,
    verbose_eval=0,
    persist_train=False,
    index_id=None,
    time_bins=None,
):
    """
    Transform the feature space by fitting an XGBoost model and returning its leaf indices.
    Leaves are transformed and treated as dummy variables to fit multiple
    logistic regression models, one for each evaluated time bin.

    Args:
        X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

        y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
            and time of event or time of censoring as second field.

        num_boost_round (Int): Number of boosting iterations.

        validation_data (Tuple): Validation data in the format (X_val, y_val)
            if the user desires to use early stopping

        early_stopping_rounds (Int): Activates early stopping.
            Validation metric needs to improve at least once
            in every **early_stopping_rounds** round(s) to continue training.
            See xgboost.train documentation.

        verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

        persist_train (Bool): Whether or not to persist training data to use
            explainability through prototypes

        index_id (pd.Index): User defined index if intended to use
            explainability through prototypes

        time_bins (np.array): Specified time windows to use when making survival predictions

    Returns:
        XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
    """

    E_train, T_train = convert_y(y)
    if time_bins is None:
        time_bins = get_time_bins(T_train, E_train)
    self.time_bins = time_bins

    # converting data to xgb format
    dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

    # converting validation data to xgb format
    evals = ()
    if validation_data:
        X_val, y_val = validation_data
        dvalid = convert_data_to_xgb_format(
            X_val, y_val, self.xgb_params["objective"]
        )
        evals = [(dvalid, "validation")]

    # training XGB
    self.bst = xgb.train(
        self.xgb_params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        evals=evals,
        verbose_eval=verbose_eval,
    )

    # predicting and encoding leaves
    self.encoder = OneHotEncoder()
    leaves = self.bst.predict(dtrain, pred_leaf=True)
    leaves_encoded = self.encoder.fit_transform(leaves)

    # convert targets for using with logistic regression
    self.targets, self.time_bins = _build_multi_task_targets(
        E_train, T_train, self.time_bins
    )

    # fitting LR for several targets
    self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)

    if persist_train:
        self.persist_train = True
        if index_id is None:
            index_id = X.index.copy()

        index_leaves = self.bst.predict(dtrain, pred_leaf=True)
        self.tree = BallTree(index_leaves, metric="hamming")

    self.index_id = index_id

    return self
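# Hedged usage sketch, not part of the library source. Assumes xgbse's public
# API (XGBSEDebiasedBCE, convert_to_structured), numpy imported as np, and
# illustrative DataFrame/column names.
#
#     from xgbse import XGBSEDebiasedBCE
#     from xgbse.converters import convert_to_structured
#
#     y_train = convert_to_structured(df_train["duration"], df_train["event"])
#
#     bce = XGBSEDebiasedBCE()
#     bce.fit(
#         X_train,
#         y_train,
#         num_boost_round=200,
#         time_bins=np.arange(30, 360, 30),  # explicit monthly windows
#     )
#     survival_curves = bce.predict(X_test)  # P(T > t) for each supplied time bin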
def fit(
    self,
    X,
    y,
    num_boost_round=1000,
    validation_data=None,
    early_stopping_rounds=None,
    verbose_eval=0,
    persist_train=True,
    index_id=None,
    time_bins=None,
):
    """
    Transform the feature space by fitting an XGBoost model and outputting its leaf indices.
    Build a search index in the new space to allow nearest neighbor queries at scoring time.

    Args:
        X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

        y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
            and time of event or time of censoring as second field.

        num_boost_round (Int): Number of boosting iterations.

        validation_data (Tuple): Validation data in the format (X_val, y_val)
            if the user desires to use early stopping

        early_stopping_rounds (Int): Activates early stopping.
            Validation metric needs to improve at least once
            in every **early_stopping_rounds** round(s) to continue training.
            See xgboost.train documentation.

        verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

        persist_train (Bool): Whether or not to persist training data to use
            explainability through prototypes

        index_id (pd.Index): User defined index if intended to use
            explainability through prototypes

        time_bins (np.array): Specified time windows to use when making survival predictions

    Returns:
        XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors
    """

    self.E_train, self.T_train = convert_y(y)
    if time_bins is None:
        time_bins = get_time_bins(self.T_train, self.E_train)
    self.time_bins = time_bins

    # converting data to xgb format
    dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

    # converting validation data to xgb format
    evals = ()
    if validation_data:
        X_val, y_val = validation_data
        dvalid = convert_data_to_xgb_format(
            X_val, y_val, self.xgb_params["objective"]
        )
        evals = [(dvalid, "validation")]

    # training XGB
    self.bst = xgb.train(
        self.xgb_params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        evals=evals,
        verbose_eval=verbose_eval,
    )
    self.feature_importances_ = self.bst.get_score()

    # creating nearest neighbor index
    leaves = self.bst.predict(dtrain, pred_leaf=True)
    self.tree = BallTree(leaves, metric="hamming", leaf_size=40)

    if persist_train:
        self.persist_train = True
        if index_id is None:
            index_id = X.index.copy()

    self.index_id = index_id

    return self
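# Hedged usage sketch, not part of the library source. Assumes xgbse's
# XGBSEKaplanNeighbors with an n_neighbors constructor argument, a predict()
# that can return confidence intervals, and convert_to_structured from
# xgbse.converters; data names are illustrative.
#
#     from xgbse import XGBSEKaplanNeighbors
#     from xgbse.converters import convert_to_structured
#
#     y_train = convert_to_structured(df_train["duration"], df_train["event"])
#     y_val = convert_to_structured(df_val["duration"], df_val["event"])
#
#     knn = XGBSEKaplanNeighbors(n_neighbors=50)
#     knn.fit(
#         X_train,
#         y_train,
#         validation_data=(X_val, y_val),
#         early_stopping_rounds=10,
#     )
#     mean, upper_ci, lower_ci = knn.predict(X_test, return_ci=True)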