Beispiel #1
0
    def predict(self, X, return_interval_probs=False):
        """
        Predict survival probabilities with the XGBoost + Weibull AFT stacking pipeline.

        The booster output for each sample is used as the single "risk" feature
        of the second-stage Weibull AFT model, which is evaluated at
        self.time_bins.

        Args:
            X (pd.DataFrame): Dataframe of features to be used as input for the
                XGBoost model.

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.
                Default is False.

        Returns:
            pd.DataFrame: A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are returned
            instead of the cumulative survival probabilities.
        """

        # first stage: booster prediction on the xgb-native container
        risk_scores = self.bst.predict(xgb.DMatrix(X))

        # second stage: the Weibull AFT model takes a single "risk" column
        stacked_features = pd.DataFrame({"risk": risk_scores})
        survival = self.weibull_aft.predict_survival_function(
            stacked_features, self.time_bins
        )

        # transpose so that samples are rows and time bins are columns
        survival = survival.T

        if not return_interval_probs:
            return survival

        return calculate_interval_failures(survival)
    def predict(self, X, return_interval_probs=False):
        """
        Predict survival probabilities with the XGBoost + Logistic Regression pipeline.

        Each sample is mapped to its leaf indices in the booster, the leaf
        indices are passed through self.encoder, and the encoded leaves are fed
        to the fitted logistic regressions (self.lr_estimators_) together with
        self.time_bins.

        Args:
            X (pd.DataFrame): Dataframe of features to be used as input for the
                XGBoost model.

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.
                Default is False.

        Returns:
            pd.DataFrame: A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are returned
            instead of the cumulative survival probabilities.
        """

        # leaf index reached by every sample (pred_leaf=True), on the
        # xgb-native container
        leaf_indices = self.bst.predict(xgb.DMatrix(X), pred_leaf=True)

        # same encoding step the logistic regressions expect as input
        encoded_leaves = self.encoder.transform(leaf_indices)

        survival = self._predict_from_lr_list(
            self.lr_estimators_, encoded_leaves, self.time_bins
        )

        if not return_interval_probs:
            return survival

        return calculate_interval_failures(survival)
    def predict(self, X, return_ci=False, return_interval_probs=False):
        """
        Run samples through tree until terminal nodes. Predict the Kaplan-Meier
        estimator associated to the leaf node each sample ended into.

        Args:
            X (pd.DataFrame): Data frame with samples to generate predictions

            return_ci (Bool): Whether to return confidence intervals via the Exponential Greenwood formula

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.
                Cannot be combined with return_ci.


        Returns:
            preds_df (pd.DataFrame): A dataframe of survival probabilities
                for all times (columns), from a time_bins array, for all samples of X
                (rows). If return_interval_probs is True, the interval probabilities are returned
                instead of the cumulative survival probabilities.

            upper_ci: Upper confidence interval for the survival probability
                values, looked up per leaf from self._train_upper_ci. Only
                returned when return_ci is True.

            lower_ci: Lower confidence interval for the survival probability
                values, looked up per leaf from self._train_lower_ci. Only
                returned when return_ci is True.

        Raises:
            ValueError: If both return_ci and return_interval_probs are True.
        """

        # reject the unsupported flag combination up front, before spending
        # time on the booster prediction and the per-leaf lookups
        if return_ci and return_interval_probs:
            raise ValueError(
                "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
            )

        # converting to xgb format
        d_matrix = xgb.DMatrix(X)

        # leaf index each sample ends in, limited to the trees up to (and
        # including) the booster's best iteration
        leaves = self.bst.predict(
            d_matrix,
            pred_leaf=True,
            iteration_range=(0, self.bst.best_iteration + 1),
        )

        # searching for kaplan meier curves in leaves
        preds_df = self._train_survival.loc[leaves].reset_index(drop=True)

        if return_interval_probs:
            return calculate_interval_failures(preds_df)

        if return_ci:
            # confidence bounds are only looked up when actually requested
            upper_ci = self._train_upper_ci.loc[leaves].reset_index(drop=True)
            lower_ci = self._train_lower_ci.loc[leaves].reset_index(drop=True)
            return preds_df, upper_ci, lower_ci

        return preds_df
    def predict(
        self,
        X,
        time_bins=None,
        return_ci=False,
        ci_width=0.683,
        return_interval_probs=False,
    ):
        """
        Make queries to nearest neighbor search index build on the transformed XGBoost space.
        Compute a Kaplan-Meier estimator for each neighbor-set. Predict the KM estimators.

        Args:
            X (pd.DataFrame): Dataframe with samples to generate predictions

            time_bins (np.array): Specified time windows to use when making survival predictions.
                Falls back to self.time_bins when None.

            return_ci (Bool): Whether to return confidence intervals via the Exponential Greenwood formula

            ci_width (Float): Width of confidence interval, given as the central
                probability mass of a normal distribution; it is converted to a
                z-score with st.norm.ppf(0.5 + ci_width / 2), so the default
                0.683 corresponds to roughly one standard deviation.

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.
                Cannot be combined with return_ci.


        Returns:
            (pd.DataFrame): A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are returned
            instead of the cumulative survival probabilities.

            upper_ci (np.array): Upper confidence interval for the survival
            probability values. Only returned when return_ci is True.

            lower_ci (np.array): Lower confidence interval for the survival
            probability values. Only returned when return_ci is True.

        Raises:
            ValueError: If both return_ci and return_interval_probs are True.
        """

        # reject the unsupported flag combination up front, before the
        # neighbor search and the Kaplan-Meier computation are run
        if return_ci and return_interval_probs:
            raise ValueError(
                "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
            )

        # converting to xgb format
        d_matrix = xgb.DMatrix(X)

        # getting leaves and extracting neighbors
        leaves = self.bst.predict(d_matrix, pred_leaf=True)

        # a falsy radius (None or 0) selects the fixed-size k-neighbors search
        if self.radius:
            # NOTE(review): assert is stripped under `python -O`; kept so the
            # exception type seen by callers (AssertionError) does not change
            assert self.radius > 0, "Radius must be positive"

            neighs, _ = self.tree.query_radius(
                leaves, r=self.radius, return_distance=True
            )

            # NOTE(review): radius queries can yield neighbor sets of different
            # sizes; confirm the fancy indexing below handles ragged `neighs`

            if any(len(neigh) == 1 for neigh in neighs):
                # If there is at least one sample without neighbors apart from itself
                # a warning is raised suggesting a radius increase
                warnings.warn(
                    "Warning: Some samples don't have neighbors apart from itself. Increase the radius",
                    RuntimeWarning,
                )
        else:
            _, neighs = self.tree.query(leaves, k=self.n_neighbors)

        # gathering times and events/censors for neighbor sets
        T_neighs = self.T_train[neighs]
        E_neighs = self.E_train[neighs]

        if time_bins is None:
            time_bins = self.time_bins

        # calculating z-score from width
        z = st.norm.ppf(0.5 + ci_width / 2)

        # vectorized implementation of Kaplan Meier curves
        preds_df, upper_ci, lower_ci = calculate_kaplan_vectorized(
            T_neighs, E_neighs, time_bins, z
        )

        if return_interval_probs:
            return calculate_interval_failures(preds_df)

        if return_ci:
            return preds_df, upper_ci, lower_ci

        return preds_df