def _ll_null(self):
    """
    Log-likelihood of the intercept-only ("null") counterpart of this model.

    A fresh instance of the same class (built with all but the last breakpoint
    and the same penalizer) is refit with a single ``_intercept`` regressor for
    every fitted parameter. The result is cached on ``self._ll_null_`` so the
    refit happens at most once.

    Returns
    -------
    The ``log_likelihood_`` of the refitted intercept-only model.

    Raises
    ------
    NotImplementedError
        if this model was fit with left censoring.
    """
    if not hasattr(self, "_ll_null_"):
        start = np.zeros(len(self._fitted_parameter_names))

        null_model = self.__class__(breakpoints=self.breakpoints[:-1], penalizer=self.penalizer)
        intercept_only = {name: ["_intercept"] for name in self._fitted_parameter_names}

        if CensoringType.is_right_censoring(self):
            frame = pd.DataFrame(
                {
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "_intercept": 1.0,
                }
            )
            null_model.fit_right_censoring(
                frame, "T", "E", initial_point=start, entry_col="entry", regressors=intercept_only
            )
        elif CensoringType.is_interval_censoring(self):
            frame = pd.DataFrame(
                {
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "_intercept": 1.0,
                }
            )
            null_model.fit_interval_censoring(
                frame, "lb", "ub", "E", initial_point=start, entry_col="entry", regressors=intercept_only
            )

        # Left censoring has no null-model fit here.
        if CensoringType.is_left_censoring(self):
            raise NotImplementedError()

        self._ll_null_ = null_model.log_likelihood_

    return self._ll_null_
def cdf_plot(model, timeline=None, **plot_kwargs):
    """
    Plot the Kaplan-Meier empirical CDF next to the CDF of a fitted parametric model.

    Parameters
    ----------
    model:
        a fitted lifelines univariate parametric model.
    timeline: iterable, optional
        times at which both curves are evaluated. Defaults to ``model.timeline``.
    plot_kwargs:
        forwarded to both plotting calls. The axis is taken from the ``ax``
        entry after ``set_kwargs_ax`` has processed the kwargs (presumably it
        fills in a default axis when none was given -- confirm against its
        definition).

    Returns
    -------
    ax:
        the axis that was drawn on.

    Raises
    ------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    if timeline is None:
        timeline = model.timeline

    # This curve is an empirical CDF, not a set of quantiles; label it as such
    # so the legend agrees with what is drawn.
    COL_EMP = "empirical CDF"

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
        )
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("cdf_plot does not support interval censored models.")

    kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, dist_object.cdf(timeline), label="fitted %s" % dist, **plot_kwargs)
    ax.legend()
    return ax
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Overlay the Kaplan-Meier empirical CDF and the fitted model's CDF on one axis.

    Parameters
    ----------
    model:
        a fitted lifelines univariate parametric model.
    timeline: iterable, optional
        times at which both curves are evaluated; falls back to ``model.timeline``.
    ax: matplotlib axis, optional
        axis to draw on; falls back to ``plt.gca()``.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    -------
    ax:
        the axis that was drawn on.

    Raises
    ------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Pick the fitting routine that matches how the model itself was fit.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    km_estimate = fit_empirical(
        model.durations, model.event_observed, label=empirical_label, timeline=timeline
    )
    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_model = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, scipy_model.cdf(timeline), label="fitted %s" % distribution_name, **plot_kwargs)
    ax.legend()
    return ax
def _create_initial_point(self, Ts, E, *args):
    """
    Build the starting parameter vector from the log of the observed durations.

    ``Ts[0]`` is used under right censoring, ``Ts[1]`` under left censoring and
    the interval width ``Ts[1] - Ts[0]`` under interval censoring. ``E`` and
    ``*args`` are accepted but not used.

    Returns
    -------
    np.ndarray
        ``[mean of the logs, log of their standard deviation, 0.1]``.
    """
    if CensoringType.is_right_censoring(self):
        observed = Ts[0]
    elif CensoringType.is_left_censoring(self):
        observed = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        observed = Ts[1] - Ts[0]

    logged = log(observed)
    location = logged.mean()
    log_scale = log(logged.std())
    return np.array([location, log_scale, 0.1])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the starting parameter vector from the median observed duration.

    ``Ts[0]`` is used under right censoring, ``Ts[1]`` under left censoring and
    the interval width ``Ts[1] - Ts[0]`` under interval censoring. ``E`` and
    ``*args`` are accepted but not used.

    Returns
    -------
    np.ndarray
        ``[median duration, 1.0]``.
    """
    if CensoringType.is_right_censoring(self):
        observed = Ts[0]
    elif CensoringType.is_left_censoring(self):
        observed = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        lower, upper = Ts[0], Ts[1]
        observed = upper - lower

    typical_duration = np.median(observed)
    return np.array([typical_duration, 1.0])
def _get_initial_values(self, Ts, E, *args):
    """
    Derive starting parameter values from the log of the observed durations.

    ``Ts[0]`` is used under right censoring, ``Ts[1]`` under left censoring and
    the interval width ``Ts[1] - Ts[0]`` under interval censoring. ``E`` and
    ``*args`` are accepted but not used.

    Returns
    -------
    np.ndarray
        ``[mean of the logs, log of their standard deviation, 1.0]``.
    """
    if CensoringType.is_right_censoring(self):
        raw = Ts[0]
    elif CensoringType.is_left_censoring(self):
        raw = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        raw = Ts[1] - Ts[0]

    logged = np.log(raw)
    return np.array([logged.mean(), np.log(logged.std()), 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the starting parameter vector from the median log-duration.

    ``Ts[0]`` is used under right censoring; ``Ts[1]`` is used under both left
    and interval censoring. ``E`` and ``*args`` are accepted but not used.

    Returns
    -------
    np.ndarray
        ``[median of the log durations, 1.0]``.
    """
    if CensoringType.is_right_censoring(self):
        observed = Ts[0]
    elif CensoringType.is_left_censoring(self) or CensoringType.is_interval_censoring(self):
        # Both of these censoring types start from the second array.
        observed = Ts[1]

    median_log_duration = np.median(np.log(observed))
    return np.array([median_log_duration, 1.0])
def _create_initial_point(self, Ts, E, *args):
    """
    Build the starting parameter vector from the log of the observed durations.

    ``Ts[0]`` is used under right censoring, ``Ts[1]`` under left censoring and
    the padded interval width ``Ts[1] - Ts[0] + 0.01`` under interval censoring.
    ``E`` and ``*args`` are accepted but not used.

    Returns
    -------
    np.ndarray
        ``[mean of the logs, log(std of the logs + 0.01), 0.1]``.
    """
    if CensoringType.is_right_censoring(self):
        observed = Ts[0]
    elif CensoringType.is_left_censoring(self):
        observed = Ts[1]
    elif CensoringType.is_interval_censoring(self):
        # A zero-width interval (Ts[1] == Ts[0]) would give log(0), so pad the
        # width with a small fudge factor.
        observed = Ts[1] - Ts[0] + 0.01

    logged = log(observed)
    location = logged.mean()
    # The same fudge factor keeps the log finite when the spread is zero.
    log_scale = log(logged.std() + 0.01)
    return np.array([location, log_scale, 0.1])
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Compare the empirical CDF (derived by Kaplan-Meier) with the model CDF.

    Parameters
    ------------
    model: lifelines univariate model
        a fitted parametric model; its censoring type decides which
        Kaplan-Meier fitting routine is used for the empirical curve.
    timeline: iterable, optional
        times at which both curves are evaluated; falls back to ``model.timeline``.
    ax: matplotlib axis, optional
        axis to draw on; falls back to the current pyplot axis.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    -------
    ax:
        the axis that was drawn on.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib import pyplot as plt

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Choose the fitting routine and its positional data per censoring type;
    # the keyword options are the same for all three.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_interval_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_interval_censoring
        observations = (model.lower_bound, model.upper_bound)

    km_estimate = fit_empirical(
        *observations,
        label=empirical_label,
        timeline=timeline,
        weights=model.weights,
        entry=model.entry
    )
    km_estimate.plot_cumulative_density(ax=ax, **plot_kwargs)

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_model = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, scipy_model.cdf(timeline), label="fitted %s" % distribution_name, **plot_kwargs)
    ax.legend()
    return ax
def _create_initial_point(self, Ts, E, *args):
    """
    Build the starting parameter vector from the median observed duration.

    Parameters
    ----------
    Ts: sequence of arrays
        ``Ts[0]`` is used under right censoring; ``Ts[1]`` is used under left
        and interval censoring.
    E: array
        event-observed indicators; only consulted under interval censoring.
    *args:
        accepted but not used.

    Returns
    -------
    np.ndarray
        ``[median duration, 1.0]``.
    """
    if CensoringType.is_right_censoring(self):
        T = Ts[0]
    elif CensoringType.is_left_censoring(self):
        # np.clip takes (a, a_min, a_max). The previous call passed
        # (0.0001, np.inf, Ts[1]), which returns Ts[1] unchanged, so the
        # intended lower bound of 0.0001 was never applied.
        T = np.clip(Ts[1], 0.0001, np.inf)
    elif CensoringType.is_interval_censoring(self):
        if E.sum() > 0:
            # Ts[1] can contain infs, so ignore this data
            upper = Ts[1]
            T = upper[upper < 1e10]
        else:
            T = np.array([1.0])
    return np.array([np.median(T), 1.0])
def _ll_null(self):
    """
    Log-likelihood of the intercept-only ("null") counterpart of this model.

    A default-constructed instance of the same class is refit, with warnings
    silenced, using a single ``intercept`` regressor for every fitted
    parameter. The result is cached on ``self._ll_null_`` so the refit happens
    at most once.

    Returns
    -------
    The ``_log_likelihood`` of the refitted intercept-only model.

    Raises
    ------
    NotImplementedError
        if this model was fit with left censoring.
    """
    if not hasattr(self, "_ll_null_"):
        start = np.zeros(len(self._fitted_parameter_names))
        intercept_only = {param: ["intercept"] for param in self._fitted_parameter_names}
        null_model = self.__class__()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            if CensoringType.is_right_censoring(self):
                frame = pd.DataFrame(
                    {
                        "T": self.durations,
                        "E": self.event_observed,
                        "entry": self.entry,
                        "intercept": 1,
                    }
                )
                null_model.fit_right_censoring(
                    frame, "T", "E", initial_point=start, entry_col="entry", regressors=intercept_only
                )
            elif CensoringType.is_interval_censoring(self):
                frame = pd.DataFrame(
                    {
                        "lb": self.lower_bound,
                        "ub": self.upper_bound,
                        "E": self.event_observed,
                        "entry": self.entry,
                        "intercept": 1,
                    }
                )
                null_model.fit_interval_censoring(
                    frame, "lb", "ub", "E", initial_point=start, entry_col="entry", regressors=intercept_only
                )

            # Left censoring has no null-model fit here.
            if CensoringType.is_left_censoring(self):
                raise NotImplementedError()

        self._ll_null_ = null_model._log_likelihood

    return self._ll_null_
def plot_survival_function(self, **kwargs):
    """
    Plot the estimated survival function (alias of ``plot``).

    Parameters
    ----------
    kwargs:
        plotting options. Outside interval censoring they are forwarded to
        ``_plot_estimate``. Under interval censoring ``c`` / ``color`` select
        the line colour (default ``"k"``) and the rest is forwarded to the
        ``plot`` call on ``self.survival_function_``.

    Returns
    -------
    Whatever the underlying plotting call returns.
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # Pop (rather than get) the colour aliases: leaving them in kwargs would
    # pass ``color`` twice to ``plot`` and raise a TypeError.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Return the result so both branches hand something back to the caller.
    return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Compare the empirical CDF (derived by Kaplan-Meier) with the model CDF.

    The previous body fitted the Kaplan-Meier estimator and then discarded it
    without drawing or returning anything; the function now finishes the plot.

    Parameters
    ----------
    model:
        a fitted lifelines univariate parametric model; its censoring type
        decides which Kaplan-Meier fitting routine is used.
    timeline: iterable, optional
        times at which both curves are evaluated; falls back to ``model.timeline``.
    ax: matplotlib axis, optional
        axis to draw on; falls back to ``plt.gca()``.
    plot_kwargs:
        forwarded to both plotting calls.

    Returns
    -------
    ax:
        the axis that was drawn on.
    """
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    if timeline is None:
        timeline = model.timeline

    CDL_EMP = "empirical CDF"

    # One branch per censoring type, so only one estimator is ever fitted.
    if CensoringType.is_left_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_right_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    elif CensoringType.is_interval_censoring(model):
        emp_kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=CDL_EMP,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )

    emp_kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline, dist_object.cdf(timeline), label="fitted %s" % dist, **plot_kwargs)
    ax.legend()
    return ax
def plot_cumulative_density(self, **kwargs):
    """
    Plots a pretty figure of the cumulative density function.

    Matplotlib plot arguments can be passed in inside the kwargs.

    Parameters
    -----------
    show_censors: bool
        place markers at censorship events. Default: False
    censor_styles: dict
        If show_censors, this dictionary will be passed into the plot call.
    ci_alpha: float
        the transparency level of the confidence interval. Default: 0.3
    ci_force_lines: bool
        force the confidence intervals to be line plots (versus default shaded areas). Default: False
    ci_show: bool
        show confidence intervals. Default: True
    ci_legend: bool
        if ci_force_lines is True, this is a boolean flag to add the lines' labels to the legend. Default: False
    at_risk_counts: bool
        show group sizes at time points. See function ``add_at_risk_counts`` for details. Default: False
    loc: slice
        specify a time-based subsection of the curves to plot, ex:

        >>> model.plot(loc=slice(0.,10.))

        will plot the time values between t=0. and t=10.
    iloc: slice
        specify a location-based subsection of the curves to plot, ex:

        >>> model.plot(iloc=slice(0,10))

        will plot the first 10 time points.

    Returns
    -------
    ax:
        a pyplot axis object
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="cumulative_density_", **kwargs)

    # hack for now.
    # Pop (rather than get) the colour aliases: leaving them in kwargs would
    # pass ``color`` twice to ``plot`` and raise a TypeError.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Return the result so this branch honours the documented return value.
    return self.cumulative_density_.plot(drawstyle="steps", color=color, **kwargs)
def plot_survival_function(self, **kwargs):
    """
    Plot the estimated survival function (alias of ``plot``).

    Outside interval censoring the kwargs are forwarded to ``_plot_estimate``
    and its result is returned. Under interval censoring ``c`` / ``color`` are
    removed from the kwargs to choose the line colour (default ``"k"``), the
    curve is drawn with the ``plot`` call on ``self.survival_function_``, and
    nothing is returned.
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # Both aliases are always removed so neither is forwarded a second time.
    short_alias = kwargs.pop("c", None)
    long_alias = kwargs.pop("color", None)
    color = coalesce(short_alias, long_alias, "k")
    self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def _fit(self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Fit the estimator to the given durations and store the estimates on ``self``.

    Parameters
    ----------
      durations: an array, list, pd.DataFrame or pd.Series
        length n -- duration subject was observed for
      event_observed: an array, list, pd.DataFrame, or pd.Series, optional
         True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
      timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
      entry: an array, list, pd.DataFrame, or pd.Series, optional
         relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
         entered study when they were "born".
      label: string, optional
        a string to name the column of the estimate. Falls back to the
        instance's existing ``_label`` and then to "KM_estimate".
      alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      ci_labels: tuple, optional
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
      weights: an array, list, pd.DataFrame, or pd.Series, optional
          if providing a weighted dataset. For example, instead
          of providing every subject as a single element of `durations` and `event_observed`, one could
          weigh subject differently. Defaults to all ones.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

    Raises
    ------
    StatError
        when ``entry`` is given and the running net population over the first
        half of the event table reaches zero.
    """
    durations = np.asarray(durations)
    self._check_values(durations)

    if event_observed is not None:
        event_observed = np.asarray(event_observed)
        self._check_values(event_observed)

    self._label = coalesce(label, self._label, "KM_estimate")

    if weights is not None:
        weights = np.asarray(weights)
        # Non-integer weights only trigger a warning; the fit still proceeds.
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data." """,
                StatisticalWarning,
            )
    else:
        weights = np.ones_like(durations, dtype=float)

    # if the user is interested in left-censorship, the primary estimate is the cumulative_density_ rather than the survival_function_
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    (self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights) = _preprocess_inputs(durations, event_observed, timeline, entry, weights)

    # A falsy alpha (None or 0) falls back to the instance-level alpha.
    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
        # we do not adjust for this here; we raise and recommend the Breslow-Fleming-Harrington estimator instead.
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                % ix)

    # estimation: the secondary estimate is one minus the primary estimate
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(
        cumulative_sq_.values[:, None], alpha, ci_labels)
    self._median = median_survival_times(self.survival_function_)
    self._cumulative_sq_ = cumulative_sq_

    # the confidence interval is also exposed under names tied to each estimate
    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name
    return self
def print_summary(self, decimals=2, **kwargs):
    """
    Print summary statistics describing the fit, the coefficients, and the error bounds.

    Parameters
    -----------
    decimals: int, optional (default=2)
        specify the number of decimal places to show
    kwargs:
        additional metadata to print, one ``name = value`` row per entry
        (useful to provide model names, dataset names, etc.) when comparing
        multiple outputs.
    """
    justify = string_justify(18)
    plain_row = "{} = {}"
    quoted_row = "{} = '{}'"

    # Header: the model itself, then one row per fact about the data and fit.
    print(self)
    if self.event_col:
        print(quoted_row.format(justify("event col"), self.event_col))
    if self.weights_col:
        print(quoted_row.format(justify("weights col"), self.weights_col))
    if self.penalizer > 0:
        print(plain_row.format(justify("penalizer"), self.penalizer))
    if self.robust:
        print(plain_row.format(justify("robust variance"), True))

    print(plain_row.format(justify("number of subjects"), self._n_examples))
    print(plain_row.format(justify("number of events"), self.event_observed.sum()))
    print("{} = {:.{prec}f}".format(justify("log-likelihood"), self._log_likelihood, prec=decimals))
    print(plain_row.format(justify("time fit was run"), self._time_fit_was_called))

    # Caller-supplied metadata; each row is followed by a blank line.
    for extra_name, extra_value in kwargs.items():
        print("{} = {}\n".format(justify(extra_name), extra_value))

    print(end="\n")
    print("---")

    # Coefficient table.
    summary_table = self.summary
    float_formatter = format_floats(decimals)
    column_formatters = {
        "p": format_p_value(decimals),
        "exp(coef)": format_exp_floats(decimals),
    }
    print(summary_table.to_string(float_format=float_formatter, formatters=column_formatters))

    # Footer: goodness-of-fit figures.
    print("---")
    if CensoringType.is_right_censoring(self):
        print("Concordance = {:.{prec}f}".format(self.score_, prec=decimals))

    with np.errstate(invalid="ignore", divide="ignore"):
        ratio_test = self.log_likelihood_ratio_test()
        footer = "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}"
        print(
            footer.format(
                ratio_test.test_statistic,
                ratio_test.degrees_freedom,
                -np.log2(ratio_test.p_value),
                prec=decimals,
            )
        )
def survival_probability_calibration(model: RegressionFitter, training_df: pd.DataFrame, t0: float, ax=None):
    r"""
    Smoothed calibration curves for time-to-event models. This is analogous to
    calibration curves for classification models, extended to handle survival probabilities
    and censoring. Produces a matplotlib figure and some metrics.

    We want to calibrate our model's prediction of :math:`P(T < \text{t0})` against the observed frequencies.

    Parameters
    -------------

    model:
        a fitted lifelines regression model to be evaluated
    training_df: DataFrame
        the DataFrame used to train the model
    t0: float
        the time before which the probability of the event occurring is evaluated.
    ax: matplotlib axis, optional
        axis to draw on; defaults to ``plt.gca()``.

    Returns
    ----------
    ax:
        mpl axes
    ICI:
        mean absolute difference between predicted and observed
    E50:
        median absolute difference between predicted and observed

    Notes
    -----
    ICI and E50 are also printed to stdout.

    https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8570

    """

    def ccl(p):
        # complementary log-log transform: log(-log(1 - p))
        return np.log(-np.log(1 - p))

    if ax is None:
        ax = plt.gca()

    T = model.duration_col
    E = model.event_col

    # predicted probability of the event by t0, clipped away from 0 and 1 so ccl stays finite
    predictions_at_t0 = np.clip(
        1 - model.predict_survival_function(training_df, times=[t0]).T.squeeze(),
        1e-10, 1 - 1e-10)

    # create new dataset with the predictions
    # NOTE(review): "%d" truncates a non-integer t0 in the column name and in the plot labels below.
    prediction_df = pd.DataFrame({
        "ccl_at_%d" % t0: ccl(predictions_at_t0),
        "constant": 1,
        T: model.durations,
        E: model.event_observed
    })

    # fit new dataset to flexible spline model
    # this new model connects prediction probabilities and actual survival. It should be very flexible, almost to the point of overfitting. Its goal is just to smooth out the data!
    knots = 3
    regressors = {
        "beta_": ["ccl_at_%d" % t0],
        "gamma0_": ["constant"],
        "gamma1_": ["constant"],
        "gamma2_": ["constant"]
    }

    # this model is from examples/royson_crowther_clements_splines.py
    crc = CRCSplineFitter(knots, penalizer=0)
    if CensoringType.is_right_censoring(model):
        crc.fit_right_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_left_censoring(model):
        crc.fit_left_censoring(prediction_df, T, E, regressors=regressors)
    elif CensoringType.is_interval_censoring(model):
        # NOTE(review): this passes the duration and event columns, the same as the
        # other branches; confirm that is what fit_interval_censoring expects.
        crc.fit_interval_censoring(prediction_df, T, E, regressors=regressors)

    # predict new model at values 0 to 1, but remember to ccl it!
    # NOTE(review): the endpoints are clipped to [0, 1]; ccl(0) is -inf and ccl(1) is +inf.
    x = np.linspace(np.clip(predictions_at_t0.min() - 0.01, 0, 1),
                    np.clip(predictions_at_t0.max() + 0.01, 0, 1), 100)
    y = 1 - crc.predict_survival_function(pd.DataFrame({
        "ccl_at_%d" % t0: ccl(x),
        "constant": 1
    }),
                                          times=[t0]).T.squeeze()

    # plot our results
    ax.set_title(
        "Smoothed calibration curve of \npredicted vs observed probabilities of t ≤ %d mortality"
        % t0)

    color = "tab:red"
    ax.plot(x, y, label="smoothed calibration curve", color=color)
    ax.set_xlabel("Predicted probability of \nt ≤ %d mortality" % t0)
    ax.set_ylabel("Observed probability of \nt ≤ %d mortality" % t0,
                  color=color)
    ax.tick_params(axis="y", labelcolor=color)

    # plot x=y line
    ax.plot(x, x, c="k", ls="--")
    ax.legend()

    # plot histogram of our original predictions
    color = "tab:blue"
    twin_ax = ax.twinx()
    twin_ax.set_ylabel("Count of \npredicted probabilities",
                       color=color)  # the x-label was already set on ax
    twin_ax.tick_params(axis="y", labelcolor=color)
    twin_ax.hist(predictions_at_t0, alpha=0.3, bins="sqrt", color=color)

    plt.tight_layout()

    # absolute gap between the smoothed model's probabilities and the original predictions
    deltas = ((1 - crc.predict_survival_function(
        prediction_df, times=[t0])).T.squeeze() - predictions_at_t0).abs()
    ICI = deltas.mean()
    E50 = np.percentile(deltas, 50)
    print("ICI = ", ICI)
    print("E50 = ", E50)

    return ax, ICI, E50
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot of the empirical CDF against the fitted parametric CDF.

    Large deviations from the line y=x can invalidate a model (though some
    natural deviation is expected in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axis, optional
        axis to draw on; falls back to ``plt.gca()``.
    plot_kwargs:
        accepted for the plot; not used by the current body.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    ------
    NotImplementedError
        if the model was fit with interval censoring.

    Examples
    ---------
    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # Pick the Kaplan-Meier fitting routine that matches the model's censoring.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    kmf = fit_empirical(model.durations, model.event_observed, label=COL_EMP)

    probabilities = np.unique(kmf.cumulative_density_.values[:, 0])
    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - probabilities, kmf.survival_function_)
    quantiles[COL_THEO] = dist_object.ppf(probabilities)
    # Rows holding a zero or infinite quantile are dropped before plotting.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    upper = quantiles[COL_EMP].max()
    lower = quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # Reference line y=x across the empirical range.
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot of the empirical CDF against the fitted parametric CDF.

    Large deviations from the line y=x can invalidate a model (though some
    natural deviation is expected in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axis, optional
        axis to draw on; falls back to ``plt.gca()``.
    plot_kwargs:
        accepted for the plot; not used by the current body.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # Choose the fitting routine and its positional data per censoring type;
    # the keyword options are the same for all three.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
        observations = (model.durations, model.event_observed)
        interval_fit = False
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
        observations = (model.durations, model.event_observed)
        interval_fit = False
    elif CensoringType.is_interval_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_interval_censoring
        observations = (model.lower_bound, model.upper_bound)
        interval_fit = True

    kmf = fit_empirical(*observations, label=COL_EMP, weights=model.weights, entry=model.entry)

    if interval_fit:
        # The interval fit yields several columns: average the survival columns
        # and read probabilities from the lower cumulative density column.
        sf = kmf.survival_function_.mean(1)
        cdf = kmf.cumulative_density_[COL_EMP + "_lower"]
    else:
        sf = kmf.survival_function_[COL_EMP]
        cdf = kmf.cumulative_density_[COL_EMP]

    probabilities = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - probabilities, sf)
    quantiles[COL_THEO] = dist_object.ppf(probabilities)
    # Rows holding a zero or infinite quantile are dropped before plotting.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    upper = quantiles[COL_EMP].max()
    lower = quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # Reference line y=x across the empirical range.
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def _fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="KM_estimate",
    alpha=None,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Fit the estimator to the given durations and store the estimates on ``self``.

    Parameters
    ----------
      durations: an array, list, pd.DataFrame or pd.Series
        length n -- duration subject was observed for
      event_observed: an array, list, pd.DataFrame, or pd.Series, optional
         True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
      timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
      entry: an array, list, pd.DataFrame, or pd.Series, optional
         relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
         entered study when they were "born".
      label: string, optional
        a string to name the column of the estimate.
      alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      ci_labels: tuple, optional
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
      weights: an array, list, pd.DataFrame, or pd.Series, optional
          if providing a weighted dataset. For example, instead
          of providing every subject as a single element of `durations` and `event_observed`, one could
          weigh subject differently.

    Notes
    -----
    Left censoring is not a parameter of this method; it is read from the
    instance through ``CensoringType.is_left_censoring(self)``.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median_``

    Raises
    ------
    StatError
        when ``entry`` is given and the running net population over the first
        half of the event table reaches zero.
    """
    self._check_values(durations)
    if event_observed is not None:
        self._check_values(event_observed)

    self._label = label

    if weights is not None:
        weights = np.asarray(weights)
        # Non-integer weights only trigger a warning; the fit still proceeds.
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data." """,
                StatisticalWarning,
            )

    # if the user is interested in left-censorship, the primary estimate is the cumulative_density_ rather than the survival_function_
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = _preprocess_inputs(
        durations, event_observed, timeline, entry, weights
    )

    # A falsy alpha (None or 0) falls back to the instance-level alpha.
    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
    )

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
        # we do not adjust for this here; we raise and recommend the Breslow-Fleming-Harrington estimator instead.
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[: int(n / 2)].min() == 0:
            ix = net_population.iloc[: int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                % ix
            )

    # estimation: the secondary estimate is one minus the primary estimate
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=is_left_censoring)
    self._cumulative_sq_ = cumulative_sq_

    # the confidence interval is also exposed under names tied to each estimate
    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name
    self._update_docstrings()

    return self