Beispiel #1
0
def fit_aft_model(data, formula_, yvar_="mainline_vol", event_var="failure"):
    """Fit a Weibull accelerated-failure-time model and return the fitter.

    Args:
        data: Data set handed unchanged to ``WeibullAFTFitter.fit``.
        formula_: Regression formula, forwarded as the ``formula`` keyword.
        yvar_: Name of the duration column.
        event_var: Name of the event-indicator column.

    Returns:
        The ``WeibullAFTFitter`` instance that ``fit`` was called on.
    """
    fit_options = {
        "duration_col": yvar_,
        "event_col": event_var,
        "formula": formula_,
    }
    model = WeibullAFTFitter()
    model.fit(data, **fit_options)
    return model
def _train_aft(x, t, e, folds, l2):

  fold_model = {}

  for f in set(folds):
    df = convert_to_data_frame(x[folds != f], t[folds != f], e[folds != f])
    aft = WeibullAFTFitter(penalizer=l2).fit(df, duration_col='T',
                                             event_col='E')
    fold_model[f] = copy.deepcopy(aft)
  return fold_model
Beispiel #3
0
 def test_weibull_aft_plot_covariate_groups(self, block):
     """Plot covariate groups for ``age`` from a Weibull AFT fit on ``load_rossi()`` data."""
     rossi = load_rossi()
     fitter = WeibullAFTFitter().fit(rossi, "week", "arrest")
     fitter.plot_covariate_groups("age", [10, 50, 80])

     # Layout first, then title, then display (blocking as requested).
     plt = self.plt
     plt.tight_layout()
     plt.title("test_weibull_aft_plot_covariate_groups")
     plt.show(block=block)
Beispiel #4
0
 def test_weibull_aft_plot_partial_effects_on_outcome_with_multiple_columns(self, block):
     """Plot partial effects while varying ``age`` and ``prio`` together."""
     covariates = ["age", "prio"]
     # One [age, prio] pair per plotted curve.
     value_rows = [[10, 0], [50, 10], [80, 50]]

     fitter = WeibullAFTFitter().fit(load_rossi(), "week", "arrest")
     fitter.plot_partial_effects_on_outcome(covariates, value_rows)

     plt = self.plt
     plt.tight_layout()
     plt.title("test_weibull_aft_plot_partial_effects_on_outcome_with_multiple_columns")
     plt.show(block=block)
Beispiel #5
0
 def test_weibull_aft_plotting_with_subset_of_columns(self, block):
     """Plot a fitted Weibull AFT model restricted to ``var1`` and ``var2``."""
     regression_df = load_regression_dataset()
     fitter = WeibullAFTFitter().fit(regression_df, "T", "E")
     fitter.plot(columns=["var1", "var2"])

     plt = self.plt
     plt.tight_layout()
     plt.title("test_weibull_aft_plotting_with_subset_of_columns")
     plt.show(block=block)
Beispiel #6
0
 def test_weibull_aft_plot_partial_effects_on_outcome(self, block):
     """Plot partial effects of ``age`` at 10, 50 and 80 for a Weibull AFT fit."""
     age_values = [10, 50, 80]

     fitter = WeibullAFTFitter().fit(load_rossi(), "week", "arrest")
     fitter.plot_partial_effects_on_outcome("age", age_values)

     plt = self.plt
     plt.tight_layout()
     plt.title("test_weibull_aft_plot_partial_effects_on_outcome")
     plt.show(block=block)
Beispiel #7
0
 def test_aft_plot_partial_effects_on_outcome_with_categorical(self, block):
     """Plot partial effects of a randomly assigned categorical covariate."""
     levels = ["a", "b", "c"]

     rossi = load_rossi()
     # Attach a random category to every row; the draw is unseeded.
     rossi["cat"] = np.random.choice(levels, size=len(rossi))

     fitter = WeibullAFTFitter().fit(rossi, "week", "arrest", formula="cat + age + fin")
     fitter.plot_partial_effects_on_outcome("cat", values=["a", "b", "c"])

     plt = self.plt
     plt.title("test_aft_plot_partial_effects_on_outcome_with_categorical")
     plt.show(block=block)
Beispiel #8
0
 def test_weibull_aft_plotting(self, block):
     """Plot a Weibull AFT model fitted to ``load_regression_dataset()`` data."""
     regression_df = load_regression_dataset()
     fitter = WeibullAFTFitter().fit(regression_df, "T", "E")
     fitter.plot()

     plt = self.plt
     plt.tight_layout()
     plt.title("test_weibull_aft_plotting")
     plt.show(block=block)
    def __init__(self):
        """Initialise the base class, then set name, fitter, flags and image metadata."""
        super(Weibull, self).__init__()

        self.name = 'Weibull'

        # The fitter is instantiated here; per the original author's note,
        # an error occurred otherwise.
        self.model = WeibullAFTFitter()
        # NOTE(review): the meaning of `direction` and `prob_FLAG` is defined by
        # code outside this snippet -- confirm before relying on it.
        self.direction, self.prob_FLAG = 1, True

        self.explained = "*Parameteric model - Weibull"
        self.image_name, self.image_size = "Weibull.png", (500, 500)
def home_vary_survival(df, DURATION, EVENT, option):
    """Render a survival plot in Streamlit, either baseline or per covariate value.

    Args:
        df: Data set containing the duration, event and covariate columns.
        DURATION: Name of the duration column.
        EVENT: Name of the event-indicator column.
        option: ``"Baseline"`` for a Kaplan-Meier curve of the whole data set,
            otherwise the name of a column of ``df`` whose effect is plotted
            with a Weibull AFT model. The column must hold integers because
            the plotted values are built with ``range``.

    Returns:
        None. The figure is sent to ``st.pyplot`` and then closed.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))
    try:
        if option == "Baseline":
            kmf = KaplanMeierFitter().fit(df[DURATION], df[EVENT])
            kmf.survival_function_.plot(ax=ax)
        else:
            # Up to four evenly spaced values spanning the observed range.
            min_value = min(df[option].values)
            max_value = max(df[option].values) + 1
            interval = math.ceil((max_value - min_value) / 4)
            value_range = range(min_value, max_value, interval)

            times = np.arange(0, 120)
            wft = WeibullAFTFitter().fit(df,
                                         DURATION,
                                         EVENT,
                                         ancillary=True,
                                         timeline=times)
            wft.plot_partial_effects_on_outcome(option,
                                                value_range,
                                                cmap='coolwarm',
                                                ax=ax)
        # Hand over the explicit figure rather than the pyplot module, so the
        # render does not depend on matplotlib's global "current figure".
        st.pyplot(fig)
    finally:
        # Without this every call leaves one more open figure behind.
        plt.close(fig)
Beispiel #11
0
# -*- coding: utf-8 -*-
# weibull aft


if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import WeibullAFTFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit a right-censored Weibull AFT model and print
    # the elapsed wall-clock time, the summary and two scores.
    df = load_rossi()
    # The multiplier stacks copies of the data set; 1 leaves it as is.
    df = pd.concat([df] * 1)

    # df["start"] = df["week"]
    # df["stop"] = np.where(df["arrest"], df["start"], np.inf)
    # df = df.drop("week", axis=1)

    fitter = WeibullAFTFitter()
    started_at = time.time()
    fitter.fit_right_censoring(df, "week", event_col="arrest")
    elapsed = time.time() - started_at
    print(f"--- {elapsed} seconds ---")
    fitter.print_summary()
    for scoring_method in ("log_likelihood", "concordance_index"):
        print(fitter.score(df, scoring_method=scoring_method))
Beispiel #12
0
# -*- coding: utf-8 -*-
# weibull aft

if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import WeibullAFTFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit a right-censored Weibull AFT model and print
    # the elapsed wall-clock time followed by the summary.
    df = load_rossi()
    # The multiplier stacks copies of the data set; 1 leaves it as is.
    df = pd.concat([df] * 1)

    # df["start"] = df["week"]
    # df["stop"] = np.where(df["arrest"], df["start"], np.inf)
    # df = df.drop("week", axis=1)

    fitter = WeibullAFTFitter()
    started_at = time.time()
    # The preview print happens inside the timed section, as before.
    print(df.head())
    fitter.fit_right_censoring(df, "week", event_col="arrest")
    elapsed = time.time() - started_at
    print(f"--- {elapsed} seconds ---")
    fitter.print_summary()
import numpy as np
import pandas as pd
from lifelines import WeibullAFTFitter, CoxPHFitter
from scipy.stats import weibull_min

# This is an implementation of https://uwspace.uwaterloo.ca/bitstream/handle/10012/10265/Cook_Richard-10265.pdf
#
# Simulates log-linear survival times driven by two binary covariates and
# then fits a Weibull AFT model and a Cox model to the result.

N = 50000
p = 0.5
bX = np.log(0.5)
bZ = np.log(4)

# The draws below consume the global NumPy RNG in this order.
Z = np.random.binomial(1, p, size=N)
X = np.random.binomial(1, 0.5, size=N)
# Large-valued column (mean 20000, sd 10) that does not enter Y.
X_ = 20000 + 10 * np.random.randn(N)

# Weibull noise with shape parameter 1 and unit scale.
W = weibull_min.rvs(1, scale=1, loc=0, size=N)

Y = bX * X + bZ * Z + np.log(W)
T = np.exp(Y)

#######################################

# NOTE(review): Z drives T but is not included in the frame -- presumably the
# omitted-covariate setting of the referenced document; confirm.
df = pd.DataFrame({"T": T, "x": X, "x_": X_})


wf = WeibullAFTFitter().fit(df, "T")
wf.print_summary(4)


cph = CoxPHFitter().fit(df, "T", show_progress=True, step_size=1.0)
cph.print_summary(4)
Beispiel #14
0
# -*- coding: utf-8 -*-
# weibull aft

if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import WeibullAFTFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit an interval-censored Weibull AFT model.
    df = load_rossi()
    # The multiplier stacks copies of the data set; 1 leaves it as is.
    df = pd.concat([df] * 1)

    # Turn the single duration into an interval: observed events get
    # start == stop, all other rows get an open upper bound.
    df["start"] = df["week"]
    df["stop"] = np.where(df["arrest"], df["start"], np.inf)
    df = df.drop("week", axis=1)

    wp = WeibullAFTFitter()
    start_time = time.time()
    print(df.head())
    wp.fit_interval_censoring(df,
                              lower_bound_col="start",
                              upper_bound_col="stop",
                              event_col="arrest")
    print("--- %s seconds ---" % (time.time() - start_time))
    wp.print_summary()

    # This lookup used to be a bare expression whose result was thrown away;
    # print it so running the script actually shows the entry.
    print(wp.summary.loc["rho_", "_intercept"])
import streamlit as st
import pandas as pd
from pandas import CategoricalDtype
from lifelines.datasets import load_rossi
from lifelines import WeibullAFTFitter, CoxPHFitter
from utils import plotter, read_config
from joblib import dump, load
import json
import matplotlib.pyplot as plt
import numpy as np
import math

# SETUP
#st.set_page_config(layout="wide")

# Duration and event-indicator columns used for both fits below.
DURATION = 'week'
EVENT = 'arrest'

df = load_rossi()
# Both models are fitted once, at import time.
model = WeibullAFTFitter().fit(df, DURATION, EVENT)
cph = CoxPHFitter().fit(df, DURATION, EVENT)


def home_vary_survival(df, DURATION, EVENT, option):
    """Plot a Kaplan-Meier baseline, then run a simulation loop (see note).

    NOTE(review): this definition looks like two unrelated snippets joined
    together. After the "Baseline" branch the body uses `n`, `X`, `S`,
    `T_actual` and `root_scalar`, none of which are defined in this function,
    and it returns a data frame instead of rendering the figure. Compare with
    the other `home_vary_survival` definition in this file before using it.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))
    #plt.ylim(0, 1.01)
    #plt.xlim(0, 60)
    times = np.arange(0, 120)
    if option == "Baseline":
        # Kaplan-Meier estimate over the whole data set, drawn on `ax`.
        kmf = KaplanMeierFitter().fit(df[DURATION], df[EVENT])
        kmf.survival_function_.plot(ax=ax)
    # For each i: draw u uniformly and solve S(t, x) = u for t.
    # `root_scalar` is presumably scipy.optimize.root_scalar -- confirm.
    for i in range(n):
        u = np.random.random()
        x = X[i]
        sol = root_scalar(lambda t: S(t, x) - u, x0=1, x1=3)
        assert sol.converged
        T_actual[i] = sol.root

    # Cap every time at MAX_TIME; E marks the times that fell below the cap.
    MAX_TIME = 5
    T_observed = np.minimum(MAX_TIME, T_actual)
    E = T_actual < MAX_TIME
    return pd.DataFrame({"E": E, "T": T_observed, "X": X})


df = generate_data()

# Weibull AFT fit on the generated data, printed before the spline fit.
WeibullAFTFitter().fit(df, "T", "E").print_summary()

# "beta_" is regressed on X without an intercept; each of the four
# gamma parameters gets an intercept-only formula.
regressors = {"beta_": "X - 1"}
regressors.update({"gamma%d_" % k: "1" for k in range(4)})

cf = CRCSplineFitter(4).fit(df, "T", "E", regressors=regressors)
# beta_   X should be around 0.5

cf.print_summary()
# Plot the predicted hazards of the columns labelled 0 to 3.
cf.predict_hazard(df)[list(range(4))].plot()
        [
            prebreakdown_merge_len_acc_1500_model_df.drop(
                columns="geometry_type"), temp
        ],
        axis=1,
    )

    fig = px.histogram(
        prebreakdown_merge_len_acc_1500_model_df_one_hot,
        x="mainline_vol",
        color="failure",
    )
    prebreakdown_merge_len_acc_1500_model_df_one_hot_no_censor = prebreakdown_merge_len_acc_1500_model_df_one_hot.query(
        "failure==1")

    aft = WeibullAFTFitter()

    aft.fit(
        prebreakdown_merge_len_acc_1500_model_df_one_hot,
        duration_col="mainline_vol",
        event_col="failure",
        formula=
        "ramp_metering+length_of_acceleration_lane+ffs_cap_df+number_of_mainline_lane_downstream+simple_merge",
    )

    aft.print_summary()
    aft.plot()
    aft.median_survival_time_
    aft.mean_survival_time_

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 4))
import numpy as np
from scipy.stats import weibull_min
import pandas as pd
from lifelines import WeibullAFTFitter, CoxPHFitter

# This is an implementation of https://uwspace.uwaterloo.ca/bitstream/handle/10012/10265/Cook_Richard-10265.pdf
#
# Simulates log-linear survival times driven by two binary covariates and
# then fits a Weibull AFT model and a Cox model to the result.

N = 50000
p = 0.5
bX = np.log(0.5)
bZ = np.log(4)

# The draws below consume the global NumPy RNG in this order.
Z = np.random.binomial(1, p, size=N)
X = np.random.binomial(1, 0.5, size=N)
# Large-valued column (mean 20000, sd 10) that does not enter Y.
X_ = 20000 + 10 * np.random.randn(N)

# Weibull noise with shape parameter 1 and unit scale.
W = weibull_min.rvs(1, scale=1, loc=0, size=N)

Y = bX * X + bZ * Z + np.log(W)
T = np.exp(Y)

#######################################

# NOTE(review): Z drives T but is not included in the frame -- presumably the
# omitted-covariate setting of the referenced document; confirm.
df = pd.DataFrame({"T": T, "x": X, "x_": X_})

# No event column is passed to either fit.
wf = WeibullAFTFitter().fit(df, "T")
wf.print_summary(4)

cph = CoxPHFitter().fit(df, "T", show_progress=True, step_size=1.0)
cph.print_summary(4)
Beispiel #19
0
class XGBSEStackedWeibull(XGBSEBaseEstimator):
    """
    Perform stacking of a XGBoost survival model with a Weibull AFT parametric model.
    The XGBoost fits the data and then predicts a value that is interpreted as a risk metric.
    This risk metric is fed to the Weibull regression which uses it as its only independent variable.

    Thus, we can get the benefit of XGBoost discrimination power alongside the Weibull AFT
    statistical rigor (e.g. calibrated survival curves).

    !!! Note
        * As we're stacking XGBoost with a single, one-variable parametric model
        (as opposed to `XGBSEDebiasedBCE`), the model can be much faster (especially in training).
        * We also have better extrapolation capabilities, as opposed to the cure fraction
        problem in `XGBSEKaplanNeighbors` and `XGBSEKaplanTree`.
        * However, we also have stronger assumptions about the shape of the survival curve.

    Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html).

    """
    def __init__(
        self,
        xgb_params=None,
        weibull_params=None,
    ):
        """
        Args:
            xgb_params (Dict, None): Parameters for XGBoost model.
                If not passed, the following default parameters will be used:

                ```
                DEFAULT_PARAMS = {
                    "objective": "survival:aft",
                    "eval_metric": "aft-nloglik",
                    "aft_loss_distribution": "normal",
                    "aft_loss_distribution_scale": 1,
                    "tree_method": "hist",
                    "learning_rate": 5e-2,
                    "max_depth": 8,
                    "booster": "dart",
                    "subsample": 0.5,
                    "min_child_weight": 50,
                    "colsample_bynode": 0.5,
                }
                ```

                Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for more options.

            weibull_params (Dict): Parameters for Weibull Regression model.
                If not passed, will use the default parameters as shown in the Lifelines documentation.

                Check <https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html>
                for more options.


        """
        if xgb_params is None:
            xgb_params = DEFAULT_PARAMS
        if weibull_params is None:
            weibull_params = DEFAULT_PARAMS_WEIBULL

        self.xgb_params = xgb_params
        self.weibull_params = weibull_params
        self.persist_train = False

    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=False,
        index_id=None,
        time_bins=None,
    ):
        """
        Fit XGBoost model to predict a value that is interpreted as a risk metric.
        Fit Weibull Regression model using risk metric as only independent variable.

        Args:
            X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data as a single ``(X, y)`` pair,
                if user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes. When omitted and ``persist_train`` is set, the index
                of ``X`` is used, or a positional ``pd.RangeIndex`` if ``X`` has no
                pandas index (e.g. a NumPy array).

            time_bins (np.array): Specified time windows to use when making survival predictions

        Returns:
            XGBSEStackedWeibull: Trained XGBSEStackedWeibull instance
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(X_val, y_val,
                                                self.xgb_params["objective"])
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )

        # predicting risk from XGBoost
        train_risk = self.bst.predict(dtrain)

        # replacing 0 by minimum positive value in df
        # so Weibull can be fitted
        min_positive_value = T_train[T_train > 0].min()
        T_train = np.clip(T_train, min_positive_value, None)

        # creating df to use lifelines API
        weibull_train_df = pd.DataFrame({
            "risk": train_risk,
            "duration": T_train,
            "event": E_train
        })

        # fitting weibull aft
        self.weibull_aft = WeibullAFTFitter(**self.weibull_params)
        self.weibull_aft.fit(weibull_train_df,
                             "duration",
                             "event",
                             ancillary=True)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                # Arrays carry no pandas index; fall back to positional ids
                # instead of failing on ``X.index``.
                source_index = getattr(X, "index", None)
                if isinstance(source_index, pd.Index):
                    index_id = source_index.copy()
                else:
                    index_id = pd.RangeIndex(len(X))

            # leaf assignments of the training rows, indexed for neighbor search
            index_leaves = self.bst.predict(dtrain, pred_leaf=True)
            self.tree = BallTree(index_leaves, metric="hamming")

        self.index_id = index_id

        return self

    def predict(self, X, return_interval_probs=False):
        """
        Predicts survival probabilities using the XGBoost + Weibull AFT stacking pipeline.

        Args:
            X (pd.DataFrame): Dataframe of features to be used as input for the
                XGBoost model.

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.
                Default is False.

        Returns:
            pd.DataFrame: A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are returned
            instead of the cumulative survival probabilities.
        """

        # converting to xgb format
        d_matrix = xgb.DMatrix(X)

        # XGBoost risk score, the single covariate of the Weibull model
        risk = self.bst.predict(d_matrix)
        weibull_score_df = pd.DataFrame({"risk": risk})

        # survival curves from the fitted Weibull AFT model at time_bins,
        # transposed so that rows are samples and columns are times
        preds_df = self.weibull_aft.predict_survival_function(
            weibull_score_df, self.time_bins).T

        if return_interval_probs:
            preds_df = calculate_interval_failures(preds_df)

        return preds_df
Beispiel #20
0
    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=False,
        index_id=None,
        time_bins=None,
    ):
        """
        Fit XGBoost model to predict a value that is interpreted as a risk metric.
        Fit Weibull Regression model using risk metric as only independent variable.

        Args:
            X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data as a single ``(X, y)`` pair,
                if user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes. When omitted and ``persist_train`` is set, the index
                of ``X`` is used, or a positional ``pd.RangeIndex`` if ``X`` has no
                pandas index (e.g. a NumPy array).

            time_bins (np.array): Specified time windows to use when making survival predictions

        Returns:
            XGBSEStackedWeibull: Trained XGBSEStackedWeibull instance
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(X_val, y_val,
                                                self.xgb_params["objective"])
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )

        # predicting risk from XGBoost
        train_risk = self.bst.predict(dtrain)

        # replacing 0 by minimum positive value in df
        # so Weibull can be fitted
        min_positive_value = T_train[T_train > 0].min()
        T_train = np.clip(T_train, min_positive_value, None)

        # creating df to use lifelines API
        weibull_train_df = pd.DataFrame({
            "risk": train_risk,
            "duration": T_train,
            "event": E_train
        })

        # fitting weibull aft
        self.weibull_aft = WeibullAFTFitter(**self.weibull_params)
        self.weibull_aft.fit(weibull_train_df,
                             "duration",
                             "event",
                             ancillary=True)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                # Arrays carry no pandas index; fall back to positional ids
                # instead of failing on ``X.index``.
                source_index = getattr(X, "index", None)
                if isinstance(source_index, pd.Index):
                    index_id = source_index.copy()
                else:
                    index_id = pd.RangeIndex(len(X))

            # leaf assignments of the training rows, indexed for neighbor search
            index_leaves = self.bst.predict(dtrain, pred_leaf=True)
            self.tree = BallTree(index_leaves, metric="hamming")

        self.index_id = index_id

        return self
Beispiel #21
0
# -*- coding: utf-8 -*-
# weibull aft

if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import WeibullAFTFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit an interval-censored Weibull AFT model, then refit
    # the same fitter with right censoring on the untouched data set.
    df = load_rossi()
    # The multiplier stacks copies of the data set; 1 leaves it as is.
    df = pd.concat([df] * 1)

    # Turn the single duration into an interval: observed events get
    # start == stop, all other rows get an open upper bound.
    df["start"] = df["week"]
    df["stop"] = np.where(df["arrest"], df["start"], np.inf)
    df = df.drop("week", axis=1)

    wp = WeibullAFTFitter()
    start_time = time.time()
    print(df.head())
    # The interval bounds are passed as lower_bound_col / upper_bound_col,
    # matching the other interval-censoring script in this file.
    wp.fit_interval_censoring(df,
                              lower_bound_col="start",
                              upper_bound_col="stop",
                              event_col="arrest")
    print("--- %s seconds ---" % (time.time() - start_time))
    wp.print_summary()

    wp.fit_right_censoring(load_rossi(), "week", event_col="arrest")
    wp.print_summary()
Beispiel #22
0
# -*- coding: utf-8 -*-
# weibull aft

if __name__ == "__main__":
    import pandas as pd
    import time

    from lifelines import WeibullAFTFitter
    from lifelines.datasets import load_rossi

    # Timing script: fit a Weibull AFT model and print the elapsed
    # wall-clock time followed by the summary.
    df = load_rossi()
    # The multiplier stacks copies of the data set; 1 leaves it as is.
    df = pd.concat([df] * 1)
    # df = df.reset_index()
    # df['week'] = np.random.exponential(1, size=df.shape[0])

    fitter = WeibullAFTFitter()
    started_at = time.time()
    fitter.fit(df, duration_col="week", event_col="arrest")
    elapsed = time.time() - started_at
    print(f"--- {elapsed} seconds ---")
    fitter.print_summary()
# Sidebar inputs. Each slider value is later placed in the prediction row
# under the column named like its variable (fin, age, mar, paro).
# NOTE(review): `week`, `rossi` and `DURATION` are used below but are not
# defined in the visible part of this script -- presumably set earlier.
fin = st.sidebar.slider(
    'Discount',
    0, 1
)
age = st.sidebar.slider(
    'Age',
    17, 75
)

mar = st.sidebar.slider(
    'Marital Status',
    0, 1
)

paro = st.sidebar.slider(
    'Referral',
    0, 1
)





# The model is refitted every time this script runs.
wf = WeibullAFTFitter().fit(rossi, "week", "arrest")
# Build a one-row frame: the list becomes a single column, which is then
# transposed and labelled. arrest is fixed to 0; race, wexp and prio to 1.
predict_input = pd.DataFrame([week, 0, fin, age, 1, 1, mar, paro, 1]).T
predict_input.columns = ['week', 'arrest', 'fin', 'age', 'race', 'wexp', 'mar', 'paro', 'prio']
# The row's own duration value is passed as `conditional_after`.
prediction_output = wf.predict_median(predict_input, conditional_after=predict_input[DURATION])

# Show the rounded prediction for the row labelled 0.
st.sidebar.write("## Weeks until churn:", round(prediction_output[0]))