def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Draw the Kaplan-Meier (empirical) CDF and the fitted model's CDF on the same axes.

    Parameters
    ------------
    model:
        a lifelines model; its ``durations`` and ``event_observed`` attributes
        (and ``timeline`` when no timeline is given) are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both ``plot_cumulative_density`` and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that were drawn on.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Choose the Kaplan-Meier fitting routine that matches the model's censoring.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    km_estimate = fit_empirical(
        model.durations, model.event_observed, label=empirical_label, timeline=timeline
    )
    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    # Parametric counterpart: evaluate the fitted distribution's CDF on the same grid.
    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
def _ll_null(self):
    """
    Log-likelihood of the intercept-only ("null") version of this model.

    A new instance of the same class is fit with every parameter regressed on
    a constant column only. The value is cached on ``self._ll_null_`` so the
    refit happens at most once per instance.

    Returns
    --------
    the ``log_likelihood_`` attribute of the intercept-only fit.

    Raises
    -------
    NotImplementedError
        if this model was fit with left censoring.
    """
    # Serve the cached value when a previous call already computed it.
    try:
        return self._ll_null_
    except AttributeError:
        pass

    parameter_names = self._fitted_parameter_names
    starting_point = np.zeros(len(parameter_names))
    # NOTE(review): the last breakpoint is dropped before re-instantiating;
    # presumably the constructor re-appends it -- confirm against __init__.
    null_model = self.__class__(breakpoints=self.breakpoints[:-1], penalizer=self.penalizer)
    intercept_only = {name: ["_intercept"] for name in parameter_names}

    if CensoringType.is_right_censoring(self):
        frame = pd.DataFrame(
            {
                "T": self.durations,
                "E": self.event_observed,
                "entry": self.entry,
                "_intercept": 1.0,
            }
        )
        null_model.fit_right_censoring(
            frame,
            "T",
            "E",
            initial_point=starting_point,
            entry_col="entry",
            regressors=intercept_only,
        )
    elif CensoringType.is_interval_censoring(self):
        frame = pd.DataFrame(
            {
                "lb": self.lower_bound,
                "ub": self.upper_bound,
                "E": self.event_observed,
                "entry": self.entry,
                "_intercept": 1.0,
            }
        )
        null_model.fit_interval_censoring(
            frame,
            "lb",
            "ub",
            "E",
            initial_point=starting_point,
            entry_col="entry",
            regressors=intercept_only,
        )

    if CensoringType.is_left_censoring(self):
        raise NotImplementedError()

    self._ll_null_ = null_model.log_likelihood_
    return self._ll_null_
def cdf_plot(model, timeline=None, **plot_kwargs):
    """
    Plot the empirical (Kaplan-Meier) CDF against the CDF of the fitted model.

    Parameters
    ------------
    model:
        a lifelines model; its ``durations`` and ``event_observed`` attributes
        (and ``timeline`` when no timeline is given) are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    plot_kwargs:
        may carry an ``ax`` entry naming the axes to draw on; the remaining
        entries are forwarded to both ``plot_cumulative_density`` and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that were drawn on.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    # set_kwargs_ax is relied on to leave an "ax" entry in plot_kwargs
    # (presumably defaulting to the current axes -- confirm against its definition).
    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    if timeline is None:
        timeline = model.timeline

    # This curve is a CDF, so label it as one (the previous "empirical quantiles"
    # label belonged to the QQ plot).
    COL_EMP = "empirical CDF"

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
        )
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, dist_object.cdf(timeline), label="fitted %s" % dist, **plot_kwargs)
    ax.legend()
    return ax
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting parameter vector from the log of the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring, ``Ts[1]``
        under left censoring and ``Ts[1] - Ts[0]`` under interval censoring
        (presumably the lower/upper bounds -- confirm against the caller).
    E:
        unused here.
    args:
        unused here.

    Returns
    --------
    np.array of three values: mean of the log-data, log of its standard
    deviation, and the constant 0.1.
    """
    # Small offset that keeps the logarithms below finite in degenerate cases.
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # Ts[1] == Ts[0] would give log(0) = -inf, so widen the interval slightly.
        log_data = log(Ts[1] - Ts[0] + fudge)

    # A zero spread would likewise give log(0); offset it the same way.
    return np.array([log_data.mean(), log(log_data.std() + fudge), 0.1])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's two-element starting point from the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring, ``Ts[1]``
        under left censoring and ``Ts[1] - Ts[0]`` under interval censoring.
    E:
        unused here.
    args:
        unused here.

    Returns
    --------
    np.array holding the median of the selected values followed by 1.0.
    """
    # NOTE(review): under interval censoring the width Ts[1] - Ts[0] may be 0 or
    # infinite for some rows -- confirm the median is still a sensible start.
    if CensoringType.is_right_censoring(self):
        reference_times = Ts[0]
    elif CensoringType.is_left_censoring(self):
        reference_times = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        reference_times = Ts[1] - Ts[0]

    first_guess = np.median(reference_times)
    return np.array([first_guess, 1.0])
def _get_initial_values(self, Ts, E, *args):
    """
    Build the optimizer's starting parameter vector from the log of the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring, ``Ts[1]``
        under left censoring and ``Ts[1] - Ts[0]`` under interval censoring
        (presumably the lower/upper bounds -- confirm against the caller).
    E:
        unused here.
    args:
        unused here.

    Returns
    --------
    np.array of three values: mean of the log-data, log of its standard
    deviation, and the constant 1.0.
    """
    # Small offset that keeps the logarithms below finite in degenerate cases.
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = np.log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = np.log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # Ts[1] == Ts[0] would give log(0) = -inf, so widen the interval slightly.
        log_data = np.log(Ts[1] - Ts[0] + fudge)

    # A zero spread would likewise give log(0); offset it the same way.
    return np.array([log_data.mean(), np.log(log_data.std() + fudge), 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's two-element starting point from the log of the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring and ``Ts[1]``
        under left or interval censoring.
    E:
        unused here.
    args:
        unused here.

    Returns
    --------
    np.array holding the median of the log-values followed by 1.0.
    """
    # Left and interval censoring both start from the second element of Ts.
    if CensoringType.is_right_censoring(self):
        observed = Ts[0]
    elif CensoringType.is_left_censoring(self) or CensoringType.is_interval_censoring(self):
        observed = Ts[1]

    log_observed = np.log(observed)
    return np.array([np.median(log_observed), 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's starting parameter vector from the log of the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring, ``Ts[1]``
        under left censoring and ``Ts[1] - Ts[0]`` (plus a small offset)
        under interval censoring.
    E:
        unused here.
    args:
        unused here.

    Returns
    --------
    np.array of three values: mean of the log-data, log of its (offset)
    standard deviation, and the constant 0.1.
    """
    # Offset applied wherever a logarithm could otherwise receive zero.
    fudge = 0.01

    if CensoringType.is_right_censoring(self):
        log_data = log(Ts[0])
    elif CensoringType.is_left_censoring(self):
        log_data = log(Ts[1])
    elif CensoringType.is_interval_censoring(self):
        # identical bounds (Ts[1] == Ts[0]) would otherwise break the log
        log_data = log(Ts[1] - Ts[0] + fudge)

    location = log_data.mean()
    log_spread = log(log_data.std() + fudge)
    return np.array([location, log_spread, 0.1])
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Overlay the empirical CDF (estimated with Kaplan-Meier) and the model's CDF.

    Parameters
    ------------
    model: lifelines univariate model
        its data attributes (``durations``/``event_observed`` or
        ``lower_bound``/``upper_bound``), ``weights`` and ``entry`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both ``plot_cumulative_density`` and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that were drawn on.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib import pyplot as plt

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    def fit_empirical(method_name, first, second):
        # Every censoring flavour shares the same label/timeline/weights/entry options.
        fit = getattr(KaplanMeierFitter(), method_name)
        return fit(
            first,
            second,
            label=empirical_label,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )

    if CensoringType.is_left_censoring(model):
        km_estimate = fit_empirical("fit_left_censoring", model.durations, model.event_observed)
    elif CensoringType.is_right_censoring(model):
        km_estimate = fit_empirical("fit_right_censoring", model.durations, model.event_observed)
    elif CensoringType.is_interval_censoring(model):
        km_estimate = fit_empirical("fit_interval_censoring", model.lower_bound, model.upper_bound)

    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    # Parametric counterpart: evaluate the fitted distribution's CDF on the same grid.
    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
def _create_initial_point(self, Ts, E, *args):
    """
    Build the optimizer's two-element starting point from the data.

    Parameters
    -----------
    Ts:
        indexable pair; ``Ts[0]`` is used under right censoring and ``Ts[1]``
        under left or interval censoring.
    E:
        event indicators; only consulted under interval censoring, where a
        zero sum means no usable data and a default of 1.0 is used.
    args:
        unused here.

    Returns
    --------
    np.array holding the median of the selected values followed by 1.0.
    """
    if CensoringType.is_right_censoring(self):
        T = Ts[0]
    elif CensoringType.is_left_censoring(self):
        # np.clip takes the data first, then (a_min, a_max). The previous argument
        # order clipped the scalar 0.0001 instead and returned Ts[1] unchanged.
        T = np.clip(Ts[1], 0.0001, np.inf)
    elif CensoringType.is_interval_censoring(self):
        if E.sum() > 0:
            # Ts[1] can contain infs, so ignore this data
            okay_data = Ts[1] < 1e10
            T = Ts[1][okay_data]
            if np.size(T) == 0:
                # nothing finite survived the filter; avoid the median of an empty array (nan)
                T = np.array([1.0])
        else:
            T = np.array([1.0])

    return np.array([np.median(T), 1.0])
def _ll_null(self):
    """
    Log-likelihood of the intercept-only ("null") version of this model.

    A default-constructed instance of the same class is fit with every
    parameter regressed on a constant column only, with warnings silenced
    during the fit. The value is cached on ``self._ll_null_`` so the refit
    happens at most once per instance.

    Returns
    --------
    the ``_log_likelihood`` attribute of the intercept-only fit.

    Raises
    -------
    NotImplementedError
        if this model was fit with left censoring.
    """
    # Serve the cached value when a previous call already computed it.
    try:
        return self._ll_null_
    except AttributeError:
        pass

    parameter_names = self._fitted_parameter_names
    starting_point = np.zeros(len(parameter_names))
    intercept_only = {name: ["intercept"] for name in parameter_names}
    null_model = self.__class__()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if CensoringType.is_right_censoring(self):
            frame = pd.DataFrame(
                {
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                }
            )
            null_model.fit_right_censoring(
                frame,
                "T",
                "E",
                initial_point=starting_point,
                entry_col="entry",
                regressors=intercept_only,
            )
        elif CensoringType.is_interval_censoring(self):
            frame = pd.DataFrame(
                {
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                }
            )
            null_model.fit_interval_censoring(
                frame,
                "lb",
                "ub",
                "E",
                initial_point=starting_point,
                entry_col="entry",
                regressors=intercept_only,
            )

        if CensoringType.is_left_censoring(self):
            raise NotImplementedError()

    self._ll_null_ = null_model._log_likelihood
    return self._ll_null_
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Cumulative Distribution Function plot: the empirical (Kaplan-Meier) CDF
    drawn against the CDF of the fitted model.

    Parameters
    ------------
    model:
        a lifelines model; its data attributes (``durations``/``event_observed``
        or ``lower_bound``/``upper_bound``), ``weights`` and ``entry`` are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both ``plot_cumulative_density`` and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that were drawn on.
    """
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()
    if timeline is None:
        timeline = model.timeline

    CDL_EMP = "empirical CDF"

    # The censoring types are mutually exclusive, so a single elif chain suffices.
    if CensoringType.is_left_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_right_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_interval_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )

    # Previously the fitted estimate was discarded and nothing was drawn or
    # returned; draw both curves and hand the axes back to the caller.
    emp_kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, dist_object.cdf(timeline), label="fitted %s" % dist, **plot_kwargs)
    ax.legend()
    return ax
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Quantile-quantile plot: quantiles of the Kaplan-Meier estimate on the
    y-axis against the same quantiles of the fitted parametric distribution on
    the x-axis, with the line y=x drawn for reference. Points far from that
    line suggest a poor fit (some deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        accepted for API compatibility; not used by this function.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.

    Examples
    ---------
    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    theoretical_col = "fitted %s quantiles" % distribution_name

    # Choose the Kaplan-Meier fitting routine that matches the model's censoring.
    if CensoringType.is_left_censoring(model):
        fit_km = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_km = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")
    km = fit_km(model.durations, model.event_observed, label=empirical_col)

    # Distinct probability levels reached by the empirical CDF.
    probabilities = np.unique(km.cumulative_density_.values[:, 0])

    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    table = qth_survival_times(1 - probabilities, km.survival_function_)
    table[theoretical_col] = scipy_distribution.ppf(probabilities)

    # Rows with infinite or zero quantiles cannot be placed on the plot.
    table = table.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = table[empirical_col]
    upper, lower = empirical.max(), empirical.min()

    table.plot.scatter(theoretical_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax)

    # Reference diagonal, with both axes restricted to the empirical range.
    diagonal = [lower, upper]
    ax.plot(diagonal, diagonal, c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Quantile-quantile plot: quantiles of the Kaplan-Meier estimate on the
    y-axis against the same quantiles of the fitted parametric distribution on
    the x-axis, with the line y=x drawn for reference. Points far from that
    line suggest a poor fit (some deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        accepted for API compatibility; not used by this function.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    Under interval censoring the survival curve is the row-wise mean of the
    Kaplan-Meier survival-function columns, and the probability levels come
    from the "_lower" cumulative-density column.
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    theoretical_col = "fitted %s quantiles" % distribution_name

    if CensoringType.is_left_censoring(model):
        km = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        survival = km.survival_function_[empirical_col]
        cumulative = km.cumulative_density_[empirical_col]
    elif CensoringType.is_right_censoring(model):
        km = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        survival = km.survival_function_[empirical_col]
        cumulative = km.cumulative_density_[empirical_col]
    elif CensoringType.is_interval_censoring(model):
        km = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        # Collapse the survival-function columns into one curve by averaging them.
        survival = km.survival_function_.mean(1)
        cumulative = km.cumulative_density_[empirical_col + "_lower"]

    # Distinct probability levels reached by the empirical CDF.
    probabilities = np.unique(cumulative.values)

    table = qth_survival_times(1 - probabilities, survival)
    table[theoretical_col] = scipy_distribution.ppf(probabilities)

    # Rows with infinite or zero quantiles cannot be placed on the plot.
    table = table.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = table[empirical_col]
    upper, lower = empirical.max(), empirical.min()

    table.plot.scatter(theoretical_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax)

    # Reference diagonal, with both axes restricted to the empirical range.
    diagonal = [lower, upper]
    ax.plot(diagonal, diagonal, c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def survival_probability_calibration(model: RegressionFitter, training_df: pd.DataFrame, t0: float, ax=None):
    r"""
    Smoothed calibration curve for a time-to-event model.

    The analogue of a classifier's calibration curve, adapted to survival
    probabilities and censoring: the model's predicted :math:`P(T < \text{t0})`
    is compared against a smoothed estimate of the observed frequency. Draws
    onto a matplotlib axes, prints two summary metrics and returns them.

    Parameters
    -------------
    model:
        a fitted lifelines regression model to be evaluated
    training_df: DataFrame
        the DataFrame used to train the model
    t0: float
        the time at which the probability of a prior event is evaluated
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.

    Returns
    ----------
    ax:
        mpl axes
    ICI:
        mean absolute difference between predicted and observed
    E50:
        median absolute difference between predicted and observed

    https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8570
    """

    def ccl(p):
        # complementary log-log transform
        return np.log(-np.log(1 - p))

    if ax is None:
        ax = plt.gca()

    T = model.duration_col
    E = model.event_col
    ccl_column = "ccl_at_%d" % t0

    # Predicted event probabilities at t0, kept strictly inside (0, 1) so the
    # transform above stays finite.
    event_probability = 1 - model.predict_survival_function(training_df, times=[t0]).T.squeeze()
    predictions_at_t0 = np.clip(event_probability, 1e-10, 1 - 1e-10)

    # create new dataset with the predictions
    prediction_df = pd.DataFrame(
        {
            ccl_column: ccl(predictions_at_t0),
            "constant": 1,
            T: model.durations,
            E: model.event_observed,
        }
    )

    # Fit the new dataset to a flexible spline model. It links predicted
    # probabilities to actual survival and is meant to be very flexible, almost
    # to the point of overfitting: its only job is to smooth out the data.
    knots = 3
    regressors = {
        "beta_": [ccl_column],
        "gamma0_": ["constant"],
        "gamma1_": ["constant"],
        "gamma2_": ["constant"],
    }

    # this model is from examples/royson_crowther_clements_splines.py
    crc = CRCSplineFitter(knots, penalizer=0)
    if CensoringType.is_right_censoring(model):
        crc.fit_right_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_left_censoring(model):
        crc.fit_left_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_interval_censoring(model):
        crc.fit_interval_censoring(prediction_df, T, E, regressors=regressors)

    # Evaluate the smoother on a grid spanning the predictions (slightly padded,
    # limited to [0, 1]); the grid has to be ccl-transformed like the training data.
    grid_start = np.clip(predictions_at_t0.min() - 0.01, 0, 1)
    grid_stop = np.clip(predictions_at_t0.max() + 0.01, 0, 1)
    x = np.linspace(grid_start, grid_stop, 100)
    grid_df = pd.DataFrame({ccl_column: ccl(x), "constant": 1})
    y = 1 - crc.predict_survival_function(grid_df, times=[t0]).T.squeeze()

    # plot our results
    ax.set_title(
        "Smoothed calibration curve of \npredicted vs observed probabilities of t ≤ %d mortality" % t0
    )

    curve_color = "tab:red"
    ax.plot(x, y, label="smoothed calibration curve", color=curve_color)
    ax.set_xlabel("Predicted probability of \nt ≤ %d mortality" % t0)
    ax.set_ylabel("Observed probability of \nt ≤ %d mortality" % t0, color=curve_color)
    ax.tick_params(axis="y", labelcolor=curve_color)

    # plot x=y line
    ax.plot(x, x, c="k", ls="--")
    ax.legend()

    # Histogram of the original predictions on a secondary y-axis; the x-label
    # was already set on the primary axes.
    histogram_color = "tab:blue"
    twin_ax = ax.twinx()
    twin_ax.set_ylabel("Count of \npredicted probabilities", color=histogram_color)
    twin_ax.tick_params(axis="y", labelcolor=histogram_color)
    twin_ax.hist(predictions_at_t0, alpha=0.3, bins="sqrt", color=histogram_color)

    plt.tight_layout()

    # Absolute gap between the smoothed and the originally predicted probabilities.
    smoothed = (1 - crc.predict_survival_function(prediction_df, times=[t0])).T.squeeze()
    deltas = (smoothed - predictions_at_t0).abs()
    ICI = deltas.mean()
    E50 = np.percentile(deltas, 50)
    print("ICI = ", ICI)
    print("E50 = ", E50)

    return ax, ICI, E50
def print_summary(self, decimals=2, **kwargs):
    """
    Print a report of the fit: metadata about the data and model, the
    coefficient table from ``self.summary``, and goodness-of-fit statistics.

    Parameters
    -----------
    decimals: int, optional (default=2)
        number of decimal places to show
    kwargs:
        extra ``name = value`` rows to print with the metadata (useful for
        model names, dataset names, etc. when comparing multiple outputs).
    """

    justify = string_justify(18)

    def show(label, value, template="{} = {}"):
        # one aligned "label = value" metadata row
        print(template.format(justify(label), value))

    # Print information about data first
    print(self)
    if self.event_col:
        show("event col", self.event_col, "{} = '{}'")
    if self.weights_col:
        show("weights col", self.weights_col, "{} = '{}'")
    if self.penalizer > 0:
        show("penalizer", self.penalizer)
    if self.robust:
        show("robust variance", True)

    show("number of subjects", self._n_examples)
    show("number of events", self.event_observed.sum())
    print("{} = {:.{prec}f}".format(justify("log-likelihood"), self._log_likelihood, prec=decimals))
    show("time fit was run", self._time_fit_was_called)

    # caller-supplied metadata rows
    for key, value in kwargs.items():
        show(key, value, "{} = {}\n")

    print()
    print("---")

    # coefficient table
    table = self.summary
    column_formatters = {
        "p": format_p_value(decimals),
        "exp(coef)": format_exp_floats(decimals),
    }
    print(table.to_string(float_format=format_floats(decimals), formatters=column_formatters))

    print("---")
    if CensoringType.is_right_censoring(self):
        print("Concordance = {:.{prec}f}".format(self.score_, prec=decimals))

    # Floating-point warnings are silenced while the test is computed and
    # -log2(p) is formatted.
    with np.errstate(invalid="ignore", divide="ignore"):
        ratio_test = self.log_likelihood_ratio_test()
        print(
            "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}".format(
                ratio_test.test_statistic,
                ratio_test.degrees_freedom,
                -np.log2(ratio_test.p_value),
                prec=decimals,
            )
        )