def __init__(self, cls, estimate, loc, iloc, show_censors, censor_styles, bandwidth, **kwargs):
    """
    Collect everything needed to draw one estimate of a fitted model.

    Parameters
    ----------
    cls:
        object the estimate and confidence-interval attributes are read from.
    estimate: str or None
        attribute name of the estimate; falls back to ``cls._estimate_name``.
    loc, iloc:
        label-based / position-based selections; at most one may be given.
    show_censors: bool
        stored as-is for the plotting code.
    censor_styles: dict or None
        stored for the plotting code; ``None`` becomes an empty dict.
    bandwidth:
        smoothing bandwidth, required when the estimate is ``"hazard_"``.
    **kwargs:
        plotting keyword arguments; completed in place by the ``set_kwargs_*``
        helpers and expected to contain ``"ax"`` and ``"c"`` afterwards.

    Raises
    ------
    ValueError
        if both ``loc`` and ``iloc`` are given, or if the hazard is requested
        without a bandwidth.
    """
    self.censor_styles = coalesce(censor_styles, {})

    # The helpers fill in defaults directly on the kwargs dict.
    set_kwargs_ax(kwargs)
    set_kwargs_color(kwargs)
    set_kwargs_drawstyle(kwargs)

    self.estimate = coalesce(estimate, cls._estimate_name)
    self.loc, self.iloc = loc, iloc
    self.show_censors = show_censors  # plot censors
    self.ax = kwargs["ax"]
    self.colour = kwargs["c"]
    self.kwargs = kwargs

    both_selectors_given = not (self.loc is None or self.iloc is None)
    if both_selectors_given:
        raise ValueError("Cannot set both loc and iloc in call to .plot().")

    if self.estimate != "hazard_":
        # Plain estimates are stored on the fitted object under their own name.
        self.estimate_ = getattr(cls, self.estimate)
        self.confidence_interval_ = cls.confidence_interval_
        return

    # The hazard has to be smoothed on demand, which needs a bandwidth.
    if bandwidth is None:
        raise ValueError("Must specify a bandwidth parameter in the call to plot_hazard.")
    self.estimate_ = cls.smoothed_hazard_(bandwidth)
    first_column = self.estimate_.values[:, 0]
    self.confidence_interval_ = cls.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=first_column)
def plot(self, **kwargs):
    """
    Plot ``self.sample_survival_functions_`` with house-style defaults.

    ``alpha`` defaults to 0.05, the legend is always switched off, and the
    colour is taken from ``c``, then ``color``, then ``'#348ABD'``. All other
    keyword arguments are forwarded untouched to the underlying ``.plot`` call.

    Returns
    -------
    whatever ``self.sample_survival_functions_.plot`` returns.
    """
    transparency = coalesce(kwargs.pop('alpha', None), 0.05)
    short_colour = kwargs.pop('c', None)
    long_colour = kwargs.pop('color', None)

    kwargs['alpha'] = transparency
    kwargs['legend'] = False
    kwargs['c'] = coalesce(short_colour, long_colour, '#348ABD')
    return self.sample_survival_functions_.plot(**kwargs)
def plot(ix=None, iloc=None, flat=False, show_censors=False, censor_styles={}, ci_legend=False, ci_force_lines=False, ci_alpha=0.25, ci_show=True, bandwidth=None, **kwargs):
    """
    Plot the estimate together with its confidence interval.

    This function relies on ``self`` and ``estimate`` from the enclosing scope.

    Parameters
    ----------
    ix: slice, optional
        label-based (time) selection of the rows to plot.
    iloc: slice, optional
        position-based selection of the rows to plot. Cannot be combined with ``ix``.
    flat: bool
        shortcut that turns on both ``ci_force_lines`` and ``show_censors``.
    show_censors: bool
        draw a marker at every censored time.
    censor_styles: dict
        marker style overrides, merged over ``{'marker': '+', 'ms': 12, 'mew': 1}``.
        The default dict is only read, never mutated.
    ci_legend: bool
        accepted for API compatibility; not read by this function.
    ci_force_lines: bool
        draw the confidence interval as lines instead of a shaded band.
    ci_alpha: float
        transparency of the shaded band.
    ci_show: bool
        whether to draw the confidence interval at all.
    bandwidth:
        required when ``estimate == "hazard_"``.
    **kwargs:
        forwarded to the pandas ``.plot`` call; ``ax``, ``color`` and ``drawstyle``
        are filled in when missing.

    Returns
    -------
    ax: the matplotlib axis that was drawn on.
    """
    assert (ix is None or iloc is None), 'Cannot set both ix and iloc in call to .plot().'

    if "ax" not in kwargs:
        kwargs["ax"] = plt.figure().add_subplot(111)
    # NOTE(review): ``_get_lines.color_cycle`` is a private matplotlib attribute;
    # confirm it exists on the matplotlib version in use.
    kwargs['color'] = coalesce(kwargs.get('c'), kwargs.get('color'), next(kwargs["ax"]._get_lines.color_cycle))
    kwargs['drawstyle'] = coalesce(kwargs.get('drawstyle'), 'steps-post')

    # R-style graphics
    if flat:
        ci_force_lines = True
        show_censors = True

    if estimate == "hazard_":
        assert bandwidth is not None, 'Must specify a bandwidth parameter in the call to plot_hazard.'
        estimate_ = self.smoothed_hazard_(bandwidth)
        confidence_interval_ = self.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
    else:
        confidence_interval_ = getattr(self, 'confidence_interval_')
        estimate_ = getattr(self, estimate)

    # did user specify certain indexes or locations?
    if ix is None and iloc is None:
        user_submitted_ix = slice(0, None)
    else:
        user_submitted_ix = ix if ix is not None else iloc
    # ``.ix`` was removed from pandas; ``.loc`` is the label-based replacement.
    get_method = "loc" if ix is not None else "iloc"
    get_loc = lambda df: getattr(df, get_method)[user_submitted_ix]

    # plot censors
    if show_censors and self.event_table['censored'].sum() > 0:
        cs = {'marker': '+', 'ms': 12, 'mew': 1}
        cs.update(censor_styles)
        censored_rows = self.event_table.loc[(self.event_table['censored'] > 0)]
        times = get_loc(censored_rows).index.values.astype(float)
        v = self.predict(times)
        kwargs['ax'].plot(times, v, linestyle='None', color=kwargs['color'], **cs)

    # plot estimate
    get_loc(estimate_).plot(**kwargs)

    # plot confidence intervals
    if ci_show:
        if ci_force_lines:
            get_loc(confidence_interval_).plot(
                linestyle="-",
                linewidth=1,
                c=kwargs['color'],
                legend=True,
                drawstyle=kwargs.get('drawstyle', 'default'),
                ax=kwargs['ax'],
                alpha=0.6,
            )
        else:
            x = get_loc(confidence_interval_).index.values.astype(float)
            lower = get_loc(confidence_interval_.filter(like='lower')).values[:, 0]
            upper = get_loc(confidence_interval_.filter(like='upper')).values[:, 0]
            fill_between_steps(x, lower, y2=upper, ax=kwargs['ax'], alpha=ci_alpha, color=kwargs['color'], linewidth=1.0)

    return kwargs['ax']
def fit(
    self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None
):  # pylint: disable=too-many-arguments
    """
    Fit the estimator by exponentiating a Nelson-Aalen cumulative hazard.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        duration subject was observed for
    timeline:
        return the best estimate at the values in timelines (positively increasing)
    event_observed: an array, or pd.Series, of length n
        True if the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    entry: an array, or pd.Series, of length n
        relative time when a subject entered the study. This is useful for
        left-truncated observations, i.e the birth event was not observed.
        If None, defaults to all 0 (all birth events observed.)
    label: string
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing
        alpha for this call to fit only.
    ci_labels: iterable
        add custom column names to the generated confidence intervals
        as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    weights:
        accepted for signature compatibility; it is not forwarded to the
        underlying ``NelsonAalenFitter.fit`` call. ``self.weights`` is copied
        from the fitted Nelson-Aalen object.

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = coalesce(label, self._label, "BFH_estimate")
    alpha = coalesce(alpha, self.alpha)

    naf = NelsonAalenFitter(alpha=alpha)
    naf.fit(durations, event_observed=event_observed, timeline=timeline, label=self._label, entry=entry, ci_labels=ci_labels)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights = (
        naf.durations,
        naf.event_observed,
        naf.timeline,
        naf.entry,
        naf.event_table,
        naf.weights,
    )

    # estimation: S(t) = exp(-H(t)), applied to the estimate and to its bounds
    self.survival_function_ = np.exp(-naf.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-naf.confidence_interval_)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    # Plotting code looks attributes up as "confidence_interval_" + estimate name,
    # so the name must end with an underscore like the survival variant does.
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    # Historical spelling without the trailing underscore, kept for existing callers.
    self.confidence_interval_cumulative_density = self.confidence_interval_cumulative_density_

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    # plotting functions
    self.plot_survival_function = self.plot
    return self
def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
    """
    Return the cumulative hazard of the subjects in X at the given times.

    Parameters
    ----------
    X: numpy array or DataFrame
        covariates, handed to ``_prep_inputs_for_prediction_and_return_scores``.
    times: iterable, optional
        times to evaluate at. When omitted, ``coalesce`` picks between
        ``self.timeline`` and the unique values of ``self.durations``.
    ancillary_X: numpy array or DataFrame, optional
        ancillary covariates, handed to the same preparation helper.

    Returns
    -------
    cumulative_hazard_ : DataFrame
        indexed by the evaluation times, with columns given by ``_get_index(X)``;
        values are ``log1p((t / alpha_) ** beta_)``.
    """
    eval_times = coalesce(times, self.timeline, np.unique(self.durations))
    alpha_, beta_ = self._prep_inputs_for_prediction_and_return_scores(X, ancillary_X)

    # One row per time, one column per subject.
    scaled_times = np.outer(eval_times, 1 / alpha_)
    cumulative_hazard = np.log1p(scaled_times ** beta_)
    return pd.DataFrame(cumulative_hazard, columns=_get_index(X), index=eval_times)
def predict_cumulative_hazard(self, df, times=None):
    """
    Return the cumulative hazard of the subjects in ``df`` at the given times.

    Parameters
    ----------
    df: numpy array or DataFrame
        covariates, handed to ``_prep_inputs_for_prediction_and_return_parameters``.
    times: iterable, optional
        times to evaluate at. When omitted, ``coalesce`` picks between
        ``self.timeline`` and the unique values of ``self.durations``.

    Returns
    -------
    cumulative_hazard_ : DataFrame
        indexed by the evaluation times, with columns given by ``_get_index(df)``.
    """
    eval_times = np.asarray(coalesce(times, self.timeline, np.unique(self.durations)))
    n_times = eval_times.shape[0]
    time_column = eval_times.reshape((n_times, 1))

    lambdas_ = self._prep_inputs_for_prediction_and_return_parameters(df)

    # Entry (i, j) is min(breakpoint_j, time_i) ...
    capped = np.minimum(np.tile(self.breakpoints, (n_times, 1)), time_column)
    # ... and differencing along the columns turns that into per-interval exposure.
    first_interval = capped[:, [0]]
    later_intervals = np.diff(capped, axis=1)
    exposure = np.hstack([first_interval, later_intervals])

    cumulative_hazard = np.dot(exposure, (1 / lambdas_))
    return pd.DataFrame(cumulative_hazard, columns=_get_index(df), index=time_column[:, 0])
def __init__(self, cls, estimate, loc, iloc, show_censors, censor_styles, ax, **kwargs):
    """
    Collect everything needed to draw one estimate of a fitted model.

    Parameters
    ----------
    cls:
        object the estimate attributes are read from when ``estimate`` is a string.
    estimate: str or object
        either the attribute name of the estimate, or the estimate itself. In
        the latter case ``kwargs`` must contain ``"confidence_intervals"``.
    loc, iloc:
        stored as-is for the plotting code.
    show_censors: bool
        stored as-is for the plotting code.
    censor_styles: dict or None
        stored for the plotting code; ``None`` becomes an empty dict.
    ax:
        axis to draw on; ``plt.gca()`` is used when ``None``.
    **kwargs:
        plotting keyword arguments, completed in place by the ``set_kwargs_*`` helpers.
    """
    self.censor_styles = coalesce(censor_styles, {})

    ax = plt.gca() if ax is None else ax
    kwargs["ax"] = ax

    # The helpers fill in defaults directly on the kwargs dict.
    set_kwargs_color(kwargs)
    set_kwargs_drawstyle(kwargs)
    set_kwargs_label(kwargs, cls)

    self.loc, self.iloc = loc, iloc
    self.show_censors = show_censors  # plot censors
    self.ax = ax
    self.colour = kwargs["color"]
    self.kwargs = kwargs

    if not isinstance(estimate, str):
        # A ready-made estimate was supplied; its intervals travel in kwargs.
        # self.kwargs is the same dict, so the entry is removed there as well.
        self.estimate_ = estimate
        self.confidence_interval_ = kwargs.pop("confidence_intervals")
        return

    self.estimate_ = getattr(cls, estimate)
    self.confidence_interval_ = getattr(cls, "confidence_interval_" + estimate)
    self.predict_at_times = getattr(cls, estimate + "at_times")
def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
    """
    Return the cumulative hazard of the subjects in X at the given times.

    Parameters
    ----------
    X: numpy array or DataFrame
        covariates, handed to ``_prep_inputs_for_prediction_and_return_scores``.
    times: iterable, optional
        times to evaluate at. When omitted, ``coalesce`` picks between
        ``self.timeline`` and the unique values of ``self.durations``.
    ancillary_X: numpy array or DataFrame, optional
        ancillary covariates, handed to the same preparation helper.

    Returns
    -------
    cumulative_hazard_ : DataFrame
        indexed by the evaluation times, with columns given by ``_get_index(X)``;
        values are ``-logsf((log(t) - mu_) / sigma_)``.
    """
    import numpy as np

    eval_times = coalesce(times, self.timeline, np.unique(self.durations))
    exp_mu_, sigma_ = self._prep_inputs_for_prediction_and_return_scores(X, ancillary_X)
    mu_ = np.log(exp_mu_)

    # Standardised log-times: one row per time, one column per subject.
    log_times = np.log(eval_times)
    standardised = np.subtract.outer(log_times, mu_) / sigma_
    cumulative_hazard = -logsf(standardised)
    return pd.DataFrame(cumulative_hazard, columns=_get_index(X), index=eval_times)
def create_dataframe_slicer(iloc, loc):
    """
    Build a function that applies the requested row selection to a DataFrame.

    Parameters
    ----------
    iloc:
        position-based selection, applied through ``df.iloc``.
    loc:
        label-based selection, applied through ``df.loc``.

    Returns
    -------
    callable
        takes a DataFrame (or Series) and returns the selected part. With
        neither argument given, everything is selected.
    """
    if iloc is None and loc is None:
        selection = slice(None)
    else:
        selection = coalesce(loc, iloc)

    # Positional indexing is the fallback whenever no label selection was given.
    accessor_name = "iloc" if loc is None else "loc"

    def slicer(df):
        return getattr(df, accessor_name)[selection]

    return slicer
def plot_survival_function(self, **kwargs):
    """
    Plot the survival function. Alias of ``plot``.

    Parameters
    ----------
    **kwargs:
        forwarded to ``_plot_estimate``, or to ``survival_function_.plot`` for
        interval-censored fits. For interval-censored fits the colour is taken
        from ``c``, then ``color``, then ``"k"``.

    Returns
    -------
    the result of the underlying plotting call (presumably a matplotlib axis).
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # Remove both colour spellings from kwargs so that the explicit ``color=``
    # below is not passed a second time through ``**kwargs``.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
    """
    Return the cumulative hazard of the subjects in X at the given times.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. A DataFrame is reordered to
        the ``lambda_`` parameter index; a numpy array must already have one
        column per ``lambda_`` parameter plus one.
    times: iterable, optional
        times to evaluate at. When omitted, ``coalesce`` picks between
        ``self.timeline`` and the unique values of ``self.durations``.
    ancillary_X: numpy array or DataFrame, optional
        ancillary covariates for ``rho_``. When omitted, a single ``_intercept``
        column of ones is used.

    Returns
    -------
    cumulative_hazard_ : DataFrame
        indexed by the evaluation times, with columns given by ``_get_index``;
        values are ``(t / lambda_) ** rho_``.
    """
    covariates = X.copy()
    eval_times = coalesce(times, self.timeline, np.unique(self.durations))

    if ancillary_X is None:
        n_rows = covariates.shape[0]
        ancillary = pd.DataFrame(np.ones((n_rows, 1)), columns=["_intercept"])
    elif isinstance(ancillary_X, pd.DataFrame):
        ancillary = ancillary_X.copy()
        if self.fit_intercept:
            ancillary["_intercept"] = 1.0
        ancillary = ancillary[self.params_.loc["rho_"].index]
    else:
        ancillary = ancillary_X
        assert ancillary.shape[1] == (self.params_.loc["rho_"].shape[0] + 1)  # 1 for _intercept

    if isinstance(covariates, pd.DataFrame):
        if self.fit_intercept:
            covariates["_intercept"] = 1.0
        covariates = covariates[self.params_.loc["lambda_"].index]
    else:
        assert covariates.shape[1] == (self.params_.loc["lambda_"].shape[0] + 1)  # 1 for _intercept

    # Both parameters are modelled on the log scale.
    lambda_coefs = self.params_[self._LOOKUP_SLICE["lambda_"]]
    lambda_ = np.exp(np.dot(covariates, lambda_coefs))

    rho_coefs = self.params_[self._LOOKUP_SLICE["rho_"]]
    rho_ = np.exp(np.dot(ancillary, rho_coefs))

    column_labels = _get_index(covariates)
    cumulative_hazard = np.outer(eval_times, 1 / lambda_) ** rho_
    return pd.DataFrame(cumulative_hazard, columns=column_labels, index=eval_times)
def create_dataframe_slicer(iloc, loc, timeline):
    """
    Build a function that applies the requested row selection to a DataFrame.

    Parameters
    ----------
    iloc:
        position-based selection, applied through ``df.iloc``.
    loc:
        label-based selection, applied through ``df.loc``.
    timeline:
        object with ``min()`` and ``max()``; when neither selection is given,
        the label range ``timeline.min()`` to ``timeline.max()`` is used.

    Returns
    -------
    callable
        takes a DataFrame (or Series) and returns the selected part.

    Raises
    ------
    ValueError
        if both ``loc`` and ``iloc`` are given.
    """
    if not (loc is None or iloc is None):
        raise ValueError("Cannot set both loc and iloc in call to .plot().")

    if iloc is None and loc is None:
        selection = slice(timeline.min(), timeline.max())
    else:
        selection = coalesce(loc, iloc)

    # Label-based indexing is the fallback whenever no positional selection was given.
    accessor_name = "loc" if iloc is None else "iloc"

    def slicer(df):
        return getattr(df, accessor_name)[selection]

    return slicer
def survival_function_at_times(self, times, label=None) -> pd.Series:
    """
    Return a Pandas series of the predicted survival value at specific times.

    Parameters
    -----------
    times: iterable or float
        passed to ``self.predict``; also used (via ``_to_1d_array``) as the index.
    label: str, optional
        name of the returned series; ``self._label`` is the fallback.

    Returns
    --------
    pd.Series
    """
    series_name = coalesce(label, self._label)
    predicted = self.predict(times)
    time_index = _to_1d_array(times)
    return pd.Series(predicted, index=time_index, name=series_name)
def plot_cumulative_density(self, **kwargs):
    """
    Plots a pretty figure of the cumulative density function.

    Matplotlib plot arguments can be passed in inside the kwargs. Unless the
    model was fitted with interval censoring, all keyword arguments are
    forwarded to ``_plot_estimate``; the options below are the ones this
    docstring has historically listed for it.

    Parameters
    -----------
    show_censors: bool
        place markers at censorship events. Default: False
    censor_styles: dict
        If show_censors, this dictionary will be passed into the plot call.
    ci_alpha: float
        the transparency level of the confidence interval. Default: 0.3
    ci_force_lines: bool
        force the confidence intervals to be line plots (versus default shaded areas). Default: False
    ci_show: bool
        show confidence intervals. Default: True
    ci_legend: bool
        if ci_force_lines is True, this is a boolean flag to add the lines' labels to the legend. Default: False
    at_risk_counts: bool
        show group sizes at time points. See function ``add_at_risk_counts`` for details. Default: False
    loc: slice
        specify a time-based subsection of the curves to plot, ex:

        >>> model.plot(loc=slice(0.,10.))

        will plot the time values between t=0. and t=10.
    iloc: slice
        specify a location-based subsection of the curves to plot, ex:

        >>> model.plot(iloc=slice(0,10))

        will plot the first 10 time points.

    Returns
    -------
    ax:
        a pyplot axis object
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="cumulative_density_", **kwargs)

    # hack for now.
    # Remove both colour spellings from kwargs so that the explicit ``color=``
    # below is not passed a second time through ``**kwargs``.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    return self.cumulative_density_.plot(drawstyle="steps", color=color, **kwargs)
def plot_survival_function(self, **kwargs):
    """
    Plot the survival function. Alias of ``plot``.

    Parameters
    ----------
    **kwargs:
        forwarded to ``_plot_estimate``, or to ``survival_function_.plot`` for
        interval-censored fits. For interval-censored fits the colour is taken
        from ``c``, then ``color``, then ``"k"``, and both keys are removed
        from the forwarded arguments.

    Returns
    -------
    the result of the underlying plotting call (presumably a matplotlib axis).
    """
    if not CensoringType.is_interval_censoring(self):
        return _plot_estimate(self, estimate="survival_function_", **kwargs)

    # hack for now.
    # dict.pop with a default does what the former hand-written helper did.
    color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
    # Hand the plot result back so both branches return something usable.
    return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
def survival_function_at_times(self, times, label=None):
    """
    Return a Pandas series of the predicted survival value at specific times.

    Parameters
    -----------
    times: iterable or float
        passed to ``self.predict``; also used (via ``_to_array``) as the index.
    label: str, optional
        name of the returned series; ``self._label`` is the fallback.

    Returns
    --------
    pd.Series
    """
    series_name = coalesce(label, self._label)
    predicted = self.predict(times)
    time_index = _to_array(times)
    return pd.Series(predicted, index=time_index, name=series_name)
def cumulative_density_at_times(self, times, label=None):
    """
    Return a Pandas series of the predicted cumulative density at specific times.

    The value is computed as ``1 - self.predict(times)``.

    Parameters
    -----------
    times: iterable or float
        passed to ``self.predict``; also used (via ``_to_array``) as the index.
    label: str, optional
        name of the returned series; ``self._label`` is the fallback.

    Returns
    --------
    pd.Series
    """
    series_name = coalesce(label, self._label)
    complement = 1 - self.predict(times)
    time_index = _to_array(times)
    return pd.Series(complement, index=time_index, name=series_name)
def cumulative_density_at_times(self, times, label=None) -> pd.Series:
    """
    Return a Pandas series of the predicted cumulative density at specific times.

    The value is computed as ``1 - self.predict(times)``.

    Parameters
    -----------
    times: iterable or float
        passed to ``self.predict``; also used (via ``_to_1d_array``) as the index.
    label: str, optional
        name of the returned series; ``self._label`` is the fallback.

    Returns
    --------
    pd.Series
    """
    series_name = coalesce(label, self._label)
    complement = 1 - self.predict(times)
    time_index = _to_1d_array(times)
    return pd.Series(complement, index=time_index, name=series_name)
def plot(ix=None, iloc=None, columns=[], legend=True, **kwargs):
    """
    A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

    ix: specify a time-based subsection of the curves to plot, ex:
             .plot(ix=slice(0.,10.)) will plot the time values between t=0. and t=10.
    iloc: specify a location-based subsection of the curves to plot, ex:
             .plot(iloc=slice(0,10)) will plot the first 10 time points.
    columns: If not empty, plot a subset of columns from the cumulative_hazards_. Default all.
             The default list is only read, never mutated.
    legend: show legend in figure.

    Relies on ``self`` from the enclosing scope. Returns the axis drawn on.
    """
    from matplotlib import pyplot as plt

    assert (ix is None or iloc is None), 'Cannot set both ix and iloc in call to .plot'

    # ``.ix`` was removed from pandas; ``.loc`` is the label-based replacement.
    get_method = "loc" if ix is not None else "iloc"
    if ix is None and iloc is None:
        user_submitted_ix = slice(0, None)
    else:
        user_submitted_ix = ix if ix is not None else iloc

    get_loc = lambda df: getattr(df, get_method)[user_submitted_ix]

    if len(columns) == 0:
        columns = self.cumulative_hazards_.columns

    if "ax" not in kwargs:
        kwargs["ax"] = plt.figure().add_subplot(111)

    x = get_loc(self.cumulative_hazards_).index.values.astype(float)
    for column in columns:
        y = get_loc(self.cumulative_hazards_[column]).values
        # The interval table is keyed by 'upper' / 'lower' on its outer index level.
        y_upper = get_loc(self.confidence_intervals_[column].loc['upper']).values
        y_lower = get_loc(self.confidence_intervals_[column].loc['lower']).values
        shaded_plot(x, y, y_upper, y_lower, ax=kwargs["ax"], label=coalesce(kwargs.get('label'), column))

    if legend:
        kwargs["ax"].legend()

    return kwargs["ax"]
def predict_cumulative_hazard(self, df, times=None):
    """
    Return the cumulative hazard of the subjects in ``df`` at the given times.

    Parameters
    ----------
    df: DataFrame
        covariates; its ``index`` becomes the columns of the result.
    times: iterable, optional
        times to evaluate at. When omitted, ``coalesce`` picks between
        ``self.timeline`` and the unique values of ``self.durations``.

    Returns
    -------
    DataFrame
        indexed by the evaluation times, one column per row of ``df``.
    """
    eval_times = coalesce(times, self.timeline, np.unique(self.durations))
    n_subjects = df.shape[0]
    Xs = self._create_Xs_dict(df)

    # Slice the flat parameter vector into one array per named parameter.
    flat_params = self.params_.values
    params_dict = {}
    for parameter_name in self._fitted_parameter_names:
        params_dict[parameter_name] = flat_params[self._LOOKUP_SLICE[parameter_name]]

    # One row per time, one column per subject.
    time_grid = np.tile(eval_times, (n_subjects, 1)).T
    cumulative_hazard = self._cumulative_hazard(params_dict, time_grid, Xs)
    return pd.DataFrame(cumulative_hazard, index=eval_times, columns=df.index)
def hazard_at_times(self, times, label=None):
    """
    Return a Pandas series of the predicted hazard at specific times.

    Parameters
    -----------
    times: iterable or float
        values to return the hazard at; also used (via ``_to_array``) as the index.
    label: string, optional
        Rename the series returned. Useful for plotting. ``self._label`` is the fallback.

    Returns
    --------
    pd.Series
    """
    series_name = coalesce(label, self._label)
    hazard_values = self._hazard(self._fitted_parameters_, times)
    time_index = _to_array(times)
    return pd.Series(hazard_values, index=time_index, name=series_name)
def predict_cumulative_hazard(self, df, times=None, conditional_after=None) -> pd.DataFrame:
    """
    Return the cumulative hazard of the subjects in ``df`` at the given times.

    Parameters
    ----------
    df: numpy array, DataFrame or Series
        covariates, handed to ``_prep_inputs_for_prediction_and_return_parameters``.
        A Series is treated as a single subject and converted to a one-row DataFrame.
    times: iterable, optional
        times to evaluate at; ``self.timeline`` is the fallback.
    conditional_after:
        not supported; any value other than ``None`` raises.

    Returns
    -------
    cumulative_hazard_ : DataFrame
        indexed by the evaluation times, with columns given by ``_get_index(df)``.

    Raises
    ------
    NotImplementedError
        if ``conditional_after`` is given.
    """
    if isinstance(df, pd.Series):
        # Forward the caller's arguments; otherwise a requested ``times`` would
        # be silently replaced by the default timeline.
        return self.predict_cumulative_hazard(df.to_frame().T, times=times, conditional_after=conditional_after)

    if conditional_after is not None:
        raise NotImplementedError()

    times = np.atleast_1d(coalesce(times, self.timeline)).astype(float)
    n = times.shape[0]
    times = times.reshape((n, 1))

    lambdas_ = self._prep_inputs_for_prediction_and_return_parameters(df)

    # The final interval is open-ended, hence the trailing infinity.
    bp = np.append(self.breakpoints, [np.inf])
    # Entry (i, j) is min(breakpoint_j, time_i) ...
    M = np.minimum(np.tile(bp, (n, 1)), times)
    # ... and differencing along the columns turns that into per-interval exposure.
    M = np.hstack([M[:, tuple([0])], np.diff(M, axis=1)])

    return pd.DataFrame(np.dot(M, (1 / lambdas_)), columns=_get_index(df), index=times[:, 0])
def plot(ix=None, iloc=None, columns=[], legend=True, **kwargs):
    """
    A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

    ix: specify a time-based subsection of the curves to plot, ex:
             .plot(ix=slice(0.,10.)) will plot the time values between t=0. and t=10.
    iloc: specify a location-based subsection of the curves to plot, ex:
             .plot(iloc=slice(0,10)) will plot the first 10 time points.
    columns: If not empty, plot a subset of columns from the cumulative_hazards_. Default all.
             The default list is only read, never mutated.
    legend: show legend in figure.

    Relies on ``self`` from the enclosing scope. Returns the axis drawn on.
    """
    from matplotlib import pyplot as plt

    assert ix is None or iloc is None, "Cannot set both ix and iloc in call to .plot"

    # ``.ix`` was removed from pandas; ``.loc`` is the label-based replacement.
    get_method = "loc" if ix is not None else "iloc"
    if ix is None and iloc is None:
        user_submitted_ix = slice(0, None)
    else:
        user_submitted_ix = ix if ix is not None else iloc

    get_loc = lambda df: getattr(df, get_method)[user_submitted_ix]

    if len(columns) == 0:
        columns = self.cumulative_hazards_.columns

    if "ax" not in kwargs:
        kwargs["ax"] = plt.figure().add_subplot(111)

    x = get_loc(self.cumulative_hazards_).index.values.astype(float)
    for column in columns:
        y = get_loc(self.cumulative_hazards_[column]).values
        # The interval table is keyed by "upper" / "lower" on its outer index level.
        y_upper = get_loc(self.confidence_intervals_[column].loc["upper"]).values
        y_lower = get_loc(self.confidence_intervals_[column].loc["lower"]).values
        shaded_plot(x, y, y_upper, y_lower, ax=kwargs["ax"], label=coalesce(kwargs.get("label"), column))

    if legend:
        kwargs["ax"].legend()

    return kwargs["ax"]
def __init__(self, cls, estimate, confidence_intervals, loc, iloc, show_censors, censor_styles, **kwargs):
    """
    Collect everything needed to draw a ready-made estimate.

    Parameters
    ----------
    cls:
        fitted object, passed to ``set_kwargs_label``.
    estimate:
        the estimate to plot, stored as ``estimate_``.
    confidence_intervals:
        matching confidence intervals, stored as ``confidence_interval_``.
    loc, iloc:
        label-based / position-based selections; at most one may be given.
    show_censors: bool
        stored as-is for the plotting code.
    censor_styles: dict or None
        stored for the plotting code; ``None`` becomes an empty dict.
    **kwargs:
        plotting keyword arguments; completed in place by the ``set_kwargs_*``
        helpers and expected to contain ``"ax"`` and ``"c"`` afterwards.

    Raises
    ------
    ValueError
        if both ``loc`` and ``iloc`` are given.
    """
    self.censor_styles = coalesce(censor_styles, {})

    # The helpers fill in defaults directly on the kwargs dict.
    set_kwargs_ax(kwargs)
    set_kwargs_color(kwargs)
    set_kwargs_drawstyle(kwargs)
    set_kwargs_label(kwargs, cls)

    self.loc, self.iloc = loc, iloc
    self.show_censors = show_censors  # plot censors
    self.ax = kwargs["ax"]
    self.colour = kwargs["c"]
    self.kwargs = kwargs

    both_selectors_given = not (self.loc is None or self.iloc is None)
    if both_selectors_given:
        raise ValueError("Cannot set both loc and iloc in call to .plot().")

    self.estimate_ = estimate
    self.confidence_interval_ = confidence_intervals
def predict(self, x):
    """
    Predict two-class probabilities for ``x`` from the fitted model.

    When ``self.prediction_time_frame_in_hours`` is a tuple, the prediction is
    the difference between ``predict_at_time`` at its second and first entry.
    Otherwise ``predict_at_time`` is evaluated at that value, with the index of
    ``self.model.baseline_cumulative_hazard_`` as the fallback.

    The result ``p`` is expanded to ``[1 - p, p]`` along a new last axis, and
    the first entry along the leading axis is returned.

    Note that NumPy's global random seed is reset to 909 on every call.

    Raises
    ------
    AssertionError
        if the model has not been fit (``self.model`` is ``None``).
    """
    if self.model is None:
        raise AssertionError("Model must be fit before calling predict.")

    np.random.seed(909)

    time_frame = self.prediction_time_frame_in_hours
    if isinstance(time_frame, tuple):
        # Probability mass falling between the two ends of the window.
        at_window_start = self.predict_at_time(x, self.prediction_time_frame_in_hours[0])
        at_window_end = self.predict_at_time(x, self.prediction_time_frame_in_hours[1])
        positive = at_window_end - at_window_start
    else:
        times_to_evaluate_at = coalesce(
            self.prediction_time_frame_in_hours,
            self.model.baseline_cumulative_hazard_.index,
        )
        positive = self.predict_at_time(x, times_to_evaluate_at)

    positive_column = positive[..., np.newaxis]
    negative_column = 1 - positive[..., np.newaxis]
    stacked = np.concatenate([negative_column, positive_column], axis=-1)
    return stacked[0]
def _fit(self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Compute the estimate and its confidence interval and store them on ``self``.

    Depending on ``CensoringType.is_left_censoring(self)``, the primary estimate
    is either ``survival_function_`` or ``cumulative_density_``; the other one
    is stored as its complement.

    Parameters
    ----------
    durations: an array, list, pd.DataFrame or pd.Series
        length n -- duration subject was observed for
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for
        left-truncated (not left-censored) observations. If None, all members
        of the population entered study when they were "born".
    label: string, optional
        a string to name the column of the estimate. Falls back to
        ``self._label`` and then ``"KM_estimate"``.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing
        alpha for this call to fit only. A falsy value (``None`` or ``0``)
        means "use ``self.alpha``".
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals
        as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead
        of providing every subject as a single element of `durations` and `event_observed`, one could
        weigh subject differently. Defaults to all ones. Non-integer weights
        trigger a ``StatisticalWarning``.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

    Raises
    ------
    StatError
        if ``entry`` is given and the net population (entrances minus removals)
        reaches zero within the first half of the event table.
    """
    durations = np.asarray(durations)
    self._check_values(durations)

    if event_observed is not None:
        event_observed = np.asarray(event_observed)
        self._check_values(event_observed)

    self._label = coalesce(label, self._label, "KM_estimate")

    if weights is not None:
        weights = np.asarray(weights)
        # Weights that change when truncated to int are not plain counts.
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data." """,
                StatisticalWarning,
            )
    else:
        weights = np.ones_like(durations, dtype=float)

    # if the user is interested in left-censorship, the primary estimate is the
    # cumulative_density_ rather than the survival_function_
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    (self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights) = _preprocess_inputs(durations, event_observed, timeline, entry, weights)

    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths
        # is the same. This is detected here and reported, recommending the
        # Breslow-Fleming-Harrington estimator instead.
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        # Only the first half of the event table is inspected.
        if net_population.iloc[:int(n / 2)].min() == 0:
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation: the estimate is computed on the log scale, the secondary one is its complement
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_.values[:, None], alpha, ci_labels)
    self._median = median_survival_times(self.survival_function_)
    self._cumulative_sq_ = cumulative_sq_

    # Expose the intervals under "confidence_interval_<estimate name>" for both estimates.
    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name

    return self
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    entry=None,
    weights=None,
    tol: float = 1e-5,
    show_progress: bool = False,
    **kwargs,
) -> "KaplanMeierFitter":
    """
    Fit the model to a interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Currently, only closed interval are supported. However, it's easy to create open intervals by adding (or subtracting) a very small
    value from the lower-bound (or upper bound). For example, the following turns closed intervals into open intervals.

    >>> left, right = df['left'], df['right']
    >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

    Note
    ------
    This is new and experimental, and many features are missing. In particular,
    no confidence intervals are computed, and ``alpha`` and ``ci_labels`` are
    accepted but not read by this method.

    Parameters
    ----------
    lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
    upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be computed from
        the lower_bound and upper_bound, and can be left blank. When given, it is only validated
        against the bounds; ``self.event_observed`` is always derived from ``lower_bound == upper_bound``.
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing).
        Defaults to the sorted unique values of both bounds.
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        not supported yet; any value other than ``None`` raises.
    label: string, optional
        a string to name the column of the estimate. Falls back to
        ``self._label`` and then ``"NPMLE_estimate"``.
    alpha: float, optional
        currently unused.
    ci_labels: tuple, optional
        currently unused.
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead
        of providing every subject as a single element of `durations` and `event_observed`, one could
        weigh subject differently. Defaults to ones shaped like ``upper_bound``.
    tol: float, optional
        minimum difference in log likelihood changes for iterative algorithm.
    show_progress: bool, optional
        display information during fitting.
    **kwargs:
        forwarded to ``npmle``.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

    Raises
    ------
    NotImplementedError
        if ``entry`` is given.
    ValueError
        if any upper bound is below its lower bound, or if ``event_observed``
        disagrees with ``lower_bound == upper_bound``.
    """
    if entry is not None:
        raise NotImplementedError("entry is not supported yet")

    if weights is None:
        weights = np.ones_like(upper_bound)

    self.weights = np.asarray(weights)

    self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
    # Only the lower bound is passed through this check.
    check_nans_or_infs(self.lower_bound)

    # An observation counts as an observed event exactly when its interval is a single point.
    self.event_observed = self.lower_bound == self.upper_bound

    self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    # A user-supplied event_observed must agree with what the bounds imply.
    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError("For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)")

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    # Note: the weights passed on here are the ``weights`` local, not ``self.weights``.
    results = npmle(self.lower_bound, self.upper_bound, verbose=show_progress, tol=tol, weights=weights, **kwargs)
    self.survival_function_ = reconstruct_survival_function(*results, self.timeline, label=self._label).loc[self.timeline]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)

    # The string literal below is disabled code for confidence intervals; it is
    # a bare expression and has no effect at runtime.
    """ self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha) self.confidence_interval_survival_function_ = self.confidence_interval_ self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_ """

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    return self
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame containing the columns named by `id_col`, `event_col`,
        `start_col` and `stop_col`, plus other covariates. A KeyError is raised
        if any of those four columns is missing.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death observation:
        1 if observed, 0 else (censored). The column is cast to bool.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period row.
        All values must be strictly positive. When omitted every row gets weight 1.0.
    show_progress: since the fitter is iterative, show convergence
        diagnostics.
    robust: boolean, optional (default: False)
        Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate.
        Not implemented here: passing a truthy value raises NotImplementedError.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    strata: list or string, optional
        specify a column or list of columns n to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. It is forwarded unchanged to the Newton-Raphson routine.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``params_``, ``hazard_ratios_``
        and ``baseline_survival_``

    Raises
    ------
    NotImplementedError
        if `robust` is truthy.
    KeyError
        if a required column is missing from `df`.
    ValueError
        if `weights_col` contains a non-positive value.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    # work on a copy so the caller's DataFrame is never mutated
    df = df.copy()

    if not (id_col in df and event_col in df and start_col in df and stop_col in df):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    # normalise the user-chosen column names to fixed internal names
    df = df.rename(
        columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
    )

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    # pop the bookkeeping columns so that only covariates remain in df
    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    params_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # the optimiser saw normalized covariates; dividing by the per-column std
    # presumably maps the coefficients back to the original scale
    self.params_ = pd.Series(params_, index=df.columns, name="coef") / self._norm_std
    self.hazard_ratios_ = pd.Series(np.exp(self.params_), index=df.columns, name="exp(coef)")
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights
    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def plot(
    ix=None,
    iloc=None,
    flat=False,
    show_censors=False,
    censor_styles=None,
    ci_legend=False,
    ci_force_lines=False,
    ci_alpha=0.25,
    ci_show=True,
    at_risk_counts=False,
    bandwidth=None,
    **kwargs
):
    """
    Plot the fitted estimate, optionally with censor marks and confidence bands.

    ``cls`` (the fitted estimator) and ``estimate`` (the name of the attribute
    to draw) are free variables resolved from the enclosing scope.

    Parameters
    ----------
    ix: optional
        label-based selection applied through ``.loc`` before plotting.
    iloc: optional
        position-based selection applied through ``.iloc`` before plotting.
        ``ix`` and ``iloc`` are mutually exclusive.
    flat: bool
        shorthand that switches on both ``ci_force_lines`` and ``show_censors``.
    show_censors: bool
        draw a marker at every time that has at least one censored observation.
    censor_styles: dict, optional
        overrides for the default censor marker style
        ``{"marker": "+", "ms": 12, "mew": 1}``.
    ci_legend: bool
        accepted for interface compatibility; not consulted by this implementation.
    ci_force_lines: bool
        draw the confidence interval as lines instead of a shaded band.
    ci_alpha: float
        transparency of the shaded confidence band.
    ci_show: bool
        whether to draw the confidence interval at all.
    at_risk_counts: bool
        call ``add_at_risk_counts`` on the axes after plotting.
    bandwidth: float, optional
        required when ``estimate == "hazard_"``; forwarded to the smoothed
        hazard routines.
    **kwargs
        forwarded to the DataFrame ``plot`` call. ``ax``, ``c``/``color`` and
        ``drawstyle`` are given defaults when absent.

    Returns
    -------
    the matplotlib axes that were drawn on.

    Raises
    ------
    ValueError
        if both ``ix`` and ``iloc`` are given, or if a hazard plot is requested
        without ``bandwidth``.
    """
    from matplotlib import pyplot as plt

    if censor_styles is None:
        censor_styles = {}

    if ix is not None and iloc is not None:
        raise ValueError("Cannot set both ix and iloc in call to .plot().")

    if "ax" not in kwargs:
        kwargs["ax"] = plt.figure().add_subplot(111)

    # Always advance the axes' color cycle, as before. ``color_cycle`` is gone
    # from current matplotlib, so prefer ``get_next_color`` and only fall back
    # to the legacy iterator when it is absent.
    line_factory = kwargs["ax"]._get_lines
    if hasattr(line_factory, "get_next_color"):
        cycle_color = line_factory.get_next_color()
    else:
        cycle_color = next(line_factory.color_cycle)
    # Pop the "c" alias so that "c" and "color" are never both forwarded.
    kwargs["color"] = coalesce(kwargs.pop("c", None), kwargs.get("color"), cycle_color)
    kwargs["drawstyle"] = coalesce(kwargs.get("drawstyle"), "steps-post")

    # R-style graphics
    if flat:
        ci_force_lines = True
        show_censors = True

    if estimate == "hazard_":
        if bandwidth is None:
            raise ValueError("Must specify a bandwidth parameter in the " + "call to plot_hazard.")
        estimate_ = cls.smoothed_hazard_(bandwidth)
        confidence_interval_ = cls.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
    else:
        confidence_interval_ = getattr(cls, "confidence_interval_")
        estimate_ = getattr(cls, estimate)

    # did user specify certain indexes or locations?
    # Explicit identity checks: comparing an array-valued selector with ``==``
    # would yield an array whose truth value is ambiguous.
    if ix is None and iloc is None:
        user_submitted_ix = slice(None)
    else:
        user_submitted_ix = ix if ix is not None else iloc
    # ``.ix`` no longer exists in pandas; ``.loc`` is the label-based indexer.
    get_method = "loc" if ix is not None else "iloc"

    def get_loc(df):
        return getattr(df, get_method)[user_submitted_ix]

    # plot censors
    if show_censors and cls.event_table["censored"].sum() > 0:
        cs = {"marker": "+", "ms": 12, "mew": 1}
        cs.update(censor_styles)
        censored_rows = cls.event_table.loc[cls.event_table["censored"] > 0]
        times = get_loc(censored_rows).index.values.astype(float)
        v = cls.predict(times)
        kwargs["ax"].plot(times, v, linestyle="None", color=kwargs["color"], **cs)

    # plot estimate
    get_loc(estimate_).plot(**kwargs)

    # plot confidence intervals
    if ci_show:
        if ci_force_lines:
            get_loc(confidence_interval_).plot(
                linestyle="-",
                linewidth=1,
                color=[kwargs["color"]],
                legend=True,
                drawstyle=kwargs.get("drawstyle", "default"),
                ax=kwargs["ax"],
                alpha=0.6,
            )
        else:
            x = get_loc(confidence_interval_).index.values.astype(float)
            lower = get_loc(confidence_interval_.filter(like="lower")).values[:, 0]
            upper = get_loc(confidence_interval_.filter(like="upper")).values[:, 0]
            fill_between_steps(
                x, lower, y2=upper, ax=kwargs["ax"], alpha=ci_alpha, color=kwargs["color"], linewidth=1.0
            )

    if at_risk_counts:
        add_at_risk_counts(cls, ax=kwargs["ax"])

    return kwargs["ax"]
def set_kwargs_color(kwargs):
    """
    Store the line color to use under ``kwargs["c"]``, mutating ``kwargs`` in place.

    Preference order: an explicit ``"c"`` entry, then an explicit ``"color"``
    entry, then the next color of the color cycle of ``kwargs["ax"]``. The
    cycle is advanced on every call, whether or not its color ends up used.
    """
    requested_c = kwargs.get("c")
    requested_color = kwargs.get("color")
    cycle_color = kwargs["ax"]._get_lines.get_next_color()
    kwargs["c"] = coalesce(requested_c, requested_color, cycle_color)
def fit(self, df, duration_col, event_col=None,
        show_progress=False, initial_beta=None,
        strata=None, step_size=None, weights_col=None,
        cluster_col=None, robust=False):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters:
      df: a Pandas dataframe with necessary columns `duration_col` and
         `event_col`, plus other covariates. `duration_col` refers to
         the lifetimes of the subjects. `event_col` refers to whether
         the 'death' events was observed: 1 if observed, 0 else (censored).
      duration_col: the column in dataframe that contains the subjects'
         lifetimes.
      event_col: the column in dataframe that contains the subjects' death
         observation. If left as None, assume all individuals are non-censored.
      weights_col: an optional column in the dataframe that denotes the weight per subject.
         This column is expelled and not used as a covariate, but as a weight in the
         final regression. Default weight is 1. Values must be strictly positive.
         This can be used for case-weights. For example, a weight of 2 means there were two subjects with
         identical observations.
         This can be used for sampling weights. In that case, use `robust=True` to get more accurate standard errors.
      show_progress: since the fitter is iterative, show convergence
         diagnostics.
      initial_beta: initialize the starting point of the iterative
         algorithm. Default is the zero vector.
      strata: specify a list of columns to use in stratification. This is useful if a
         categorical covariate does not obey the proportional hazard assumption. This
         is used similar to the `strata` expression in R.
         See http://courses.washington.edu/b515/l17.pdf.
      step_size: set an initial step size for the fitting algorithm.
      robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
        ties, so if there are high number of ties, results may significantly differ. See
        "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078
      cluster_col: specifies what column has unique identifiers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
        be used.

    Returns:
        self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.

    Raises:
        ValueError: if `weights_col` contains a non-positive value.
    """

    # work on a copy so the caller's dataframe is never mutated
    df = df.copy()

    # Sort on time
    df = df.sort_values(by=duration_col)

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + ' UTC'
    self.duration_col = duration_col
    self.event_col = event_col
    self.robust = robust
    self.cluster_col = cluster_col
    self.weights_col = weights_col

    self._n_examples = df.shape[0]
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # remember the row index before it is replaced by the strata columns
        original_index = df.index.copy()
        df = df.set_index(self.strata)

    # Extract time and event
    T = df[duration_col]
    del df[duration_col]
    if event_col is None:
        E = pd.Series(np.ones(df.shape[0]), index=df.index)
    else:
        E = df[event_col]
        del df[event_col]

    if weights_col:
        weights = df.pop(weights_col)
        # non-integer weights only trigger a warning when robust errors are off
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn("""It appears your weights are not integers, possibly propensity or sampling scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. 
See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """, RuntimeWarning)
        if (weights <= 0).any():
            raise ValueError("values in weights_col must be positive.")
    else:
        weights = pd.Series(np.ones((self._n_examples, )), index=df.index)

    if self.cluster_col:
        self._clusters = df.pop(self.cluster_col)

    self._check_values(df, T, E)
    df = df.astype(float)

    # save fitting data for later
    self.durations = T.copy()
    self.event_observed = E.copy()
    if self.strata is not None:
        # the saved copies get the pre-stratification index back
        self.durations.index = original_index
        self.event_observed.index = original_index
    self.event_observed = self.event_observed.astype(bool)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E,
                                     weights=weights,
                                     initial_beta=initial_beta,
                                     show_progress=show_progress,
                                     step_size=step_size)

    # the optimiser saw normalized covariates; dividing by the per-column std
    # presumably maps the coefficients back to the original scale
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std

    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E, weights)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()
    self._predicted_partial_hazards_ = self.predict_partial_hazard(df).values

    self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
    return self
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label=None, alpha=None, ci_labels=None, weights=None):  # pylint: disable=too-many-arguments
    """
    Estimate the cumulative hazard and its confidence interval.

    Parameters
    -----------
    durations: an array, or pd.Series, of length n
        how long each subject was observed for.
    event_observed: an array, or pd.Series, of length n, optional
        True where the death was observed, False where the subject was
        right-censored. Checked for NaNs/infs only when given.
    timeline: iterable, optional
        times at which the estimate should be reported.
    entry: an array, or pd.Series, of length n, optional
        relative time at which each subject entered the study
        (for left-truncated observations).
    label: string, optional
        column name for the estimate. Falls back to the fitter's existing
        ``_label`` and then to ``"NA_estimate"``.
    alpha: float, optional
        alpha for the confidence intervals of this call only. A falsy
        value means ``self.alpha`` is used.
    ci_labels: iterable, optional
        custom [<lower-bound name>, <upper-bound name>] column names for
        the confidence interval.
    weights: an array, or pd.Series, of length n, optional
        per-subject weights. Non-integer weights trigger a
        ``StatisticalWarning``.

    Returns
    -------
    self, with new properties like ``cumulative_hazard_``.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    if weights is not None and (weights.astype(int) != weights).any():
        warnings.warn(
            """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. 
See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data." """,
            StatisticalWarning,
        )

    preprocessed = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    (
        self.durations,
        self.event_observed,
        self.timeline,
        self.entry,
        self.event_table,
        self.weights,
    ) = preprocessed

    hazard_values, variance_terms = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._variance_f, False
    )

    # estimates
    self._label = coalesce(label, self._label, "NA_estimate")
    self.cumulative_hazard_ = pd.DataFrame(hazard_values, columns=[self._label])
    interval = self._bounds(variance_terms[:, None], alpha or self.alpha, ci_labels)
    self.confidence_interval_ = interval
    self.confidence_interval_cumulative_hazard_ = self.confidence_interval_
    self._cumulative_sq = variance_terms

    # estimation methods
    self._estimation_method = "cumulative_hazard_"
    self._estimate_name = "cumulative_hazard_"
    self._update_docstrings()

    # plotting
    self.plot_cumulative_hazard = self.plot

    return self
def _create_initial_point(self, Ts, E, entry, weights):
    """
    Build the two-element starting vector for the optimiser.

    The first entry is the mean of whatever ``utils.coalesce(*Ts)`` selects
    from ``Ts``; the second is the constant 1.0. ``E``, ``entry`` and
    ``weights`` are accepted but not used.
    """
    chosen_times = utils.coalesce(*Ts)
    first_guess = chosen_times.mean()
    return np.array([first_guess, 1.0])
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame containing the columns named by `id_col`, `event_col`,
        `start_col` and `stop_col`, plus other covariates. A KeyError is raised
        if any of those four columns is missing.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death observation:
        1 if observed, 0 else (censored). The column is cast to bool.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period row.
        All values must be strictly positive. When omitted every row gets weight 1.0.
    show_progress: since the fitter is iterative, show convergence
        diagnostics.
    robust: boolean, optional (default: False)
        Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate.
        Not implemented here: passing a truthy value raises NotImplementedError.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    strata: list or string, optional
        specify a column or list of columns n to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. It is forwarded unchanged to the Newton-Raphson routine.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``hazards_`` and ``baseline_survival_``

    Raises
    ------
    NotImplementedError
        if `robust` is truthy.
    KeyError
        if a required column is missing from `df`.
    ValueError
        if `weights_col` contains a non-positive value.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    # work on a copy so the caller's DataFrame is never mutated
    df = df.copy()

    if not (id_col in df and event_col in df and start_col in df and stop_col in df):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    # normalise the user-chosen column names to fixed internal names
    df = df.rename(
        columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
    )

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    # pop the bookkeeping columns so that only covariates remain in df
    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # the optimiser saw normalized covariates; dividing by the per-column std
    # presumably maps the coefficients back to the original scale
    self.hazards_ = pd.Series(hazards_, index=df.columns, name="coef") / self._norm_std
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights
    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def fit(
    self,
    durations,
    event_observed,
    event_of_interest,
    timeline=None,
    entry=None,
    label=None,
    alpha=None,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Estimate the cumulative incidence function for one event type in the
    presence of competing events.

    Parameters
    ----------
    durations: an array or pd.Series of length n -- duration of subject was observed for
    event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
      only positive integers, where 0 indicates censoring.
    event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
      Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
      is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer.
      The value is passed through ``int()`` before use.
    timeline: return the best estimate at the values in timelines (positively increasing)
    entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
      useful for left-truncated (not left-censored) observations. If None, all members of the population
      were born at time 0.
    label: a string to name the column of the estimate. Falls back to the fitter's
      existing ``_label`` and then to "AJ_estimate".
    alpha: the alpha value in the confidence intervals. Overrides the initializing
       alpha for this call to fit only. A falsy value means ``self.alpha`` is used.
    ci_labels: add custom column names to the generated confidence intervals
          as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
        of providing every subject as a single element of `durations` and `event_observed`, one could
        weigh subject differently.

    Returns
    -------
    self : AalenJohansenFitter
      self, with new properties like ``cumulative_density_``, ``event_table``
      and ``confidence_interval_``.
    """
    # Checking for tied event times
    ties = self._check_for_duplicates(durations=durations, events=event_observed)

    if ties:
        warnings.warn(
            dedent(
                """Tied event times were detected. 
The Aalen-Johansen estimator cannot handle tied event times. To resolve ties, data is randomly jittered."""
            ),
            Warning,
        )
        # from here on, the jittered durations replace the caller's values
        durations = self._jitter(
            durations=pd.Series(durations),
            event=pd.Series(event_observed),
            jitter_level=self._jitter_level,
            seed=self._seed,
        )

    alpha = alpha if alpha else self.alpha

    # Creating label for event of interest & indicator for that event
    event_of_interest = int(event_of_interest)
    cmprisk_label = "CIF_" + str(event_of_interest)
    self.label_cmprisk = "observed_" + str(event_of_interest)

    # Fitting Kaplan-Meier for either event of interest OR competing risk
    km = KaplanMeierFitter().fit(durations, event_observed=event_observed, timeline=timeline, entry=entry, weights=weights)
    aj = km.event_table
    aj["overall_survival"] = km.survival_function_
    # survival just before each time point (previous row's value)
    aj["lagged_overall_survival"] = aj["overall_survival"].shift()

    # Setting up table for calculations and to return to user
    event_spec = pd.Series(event_observed) == event_of_interest
    self.durations, self.event_observed, *_, event_table, weights = _preprocess_inputs(
        durations=durations, event_observed=event_spec, timeline=timeline, entry=entry, weights=weights
    )
    event_spec_times = event_table["observed"]
    event_spec_times = event_spec_times.rename(self.label_cmprisk)
    aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

    # Estimator of Cumulative Incidence (Density) Function
    aj[cmprisk_label] = (aj[self.label_cmprisk] / aj["at_risk"] * aj["lagged_overall_survival"]).cumsum()
    aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
    aj = aj.set_index("event_at")

    # Setting attributes
    self._estimation_method = "cumulative_density_"
    self._estimate_name = "cumulative_density_"
    self.timeline = km.timeline
    self._update_docstrings()

    self._label = coalesce(label, self._label, "AJ_estimate")
    self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])

    # Technically, cumulative incidence, but consistent with KaplanMeierFitter
    self.event_table = aj[["removed", "observed", self.label_cmprisk, "censored", "entrance", "at_risk"]]  # Event table

    # variance and confidence interval are only computed when requested via _calc_var
    if self._calc_var:
        self.variance_, self.confidence_interval_ = self._bounds(
            aj["lagged_overall_survival"], alpha=alpha, ci_labels=ci_labels
        )
    else:
        self.variance_, self.confidence_interval_ = None, None

    self.confidence_interval_cumulative_density_ = self.confidence_interval_
    return self
def fit(self, df, duration_col, event_col=None,
        show_progress=False, initial_beta=None,
        strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters:
      df: a Pandas dataframe with necessary columns `duration_col` and
         `event_col`, plus other covariates. `duration_col` refers to
         the lifetimes of the subjects. `event_col` refers to whether
         the 'death' events was observed: 1 if observed, 0 else (censored).
      duration_col: the column in dataframe that contains the subjects'
         lifetimes.
      event_col: the column in dataframe that contains the subjects' death
         observation. If left as None, assume all individuals are non-censored.
      weights_col: an optional column in the dataframe that denotes the weight per subject.
         This column is expelled and not used as a covariate, but as a weight in the
         final regression. Default weight is 1. Non-integer weights trigger a
         RuntimeWarning.
      show_progress: since the fitter is iterative, show convergence
         diagnostics.
      initial_beta: initialize the starting point of the iterative
         algorithm. Default is the zero vector.
      strata: specify a list of columns to use in stratification. This is useful if a
         categorical covariate does not obey the proportional hazard assumption. This
         is used similar to the `strata` expression in R.
         See http://courses.washington.edu/b515/l17.pdf.
      step_size: set an initial step size for the fitting algorithm.

    Returns:
        self, with additional properties: hazards_
    """

    # work on a copy so the caller's dataframe is never mutated
    df = df.copy()

    # Sort on time
    df = df.sort_values(by=duration_col)

    self._n_examples = df.shape[0]
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # remember the row index before it is replaced by the strata columns
        original_index = df.index.copy()
        df = df.set_index(self.strata)

    # Extract time and event
    T = df[duration_col]
    del df[duration_col]
    if event_col is None:
        E = pd.Series(np.ones(df.shape[0]), index=df.index)
    else:
        E = df[event_col]
        del df[event_col]

    if weights_col:
        weights = df.pop(weights_col)
        if (weights.astype(int) != weights).any():
            warnings.warn("""It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """, RuntimeWarning)
    else:
        # NOTE(review): default weights are a one-column DataFrame, whereas a
        # user-supplied weights column is a Series - confirm downstream code accepts both
        weights = pd.DataFrame(np.ones((self._n_examples, 1)), index=df.index)

    self._check_values(df, T, E)
    df = df.astype(float)

    # save fitting data for later
    self.durations = T.copy()
    self.event_observed = E.copy()
    if self.strata is not None:
        # the saved copies get the pre-stratification index back
        self.durations.index = original_index
        self.event_observed.index = original_index
    self.event_observed = self.event_observed.astype(bool)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E,
                                     weights=weights,
                                     initial_beta=initial_beta,
                                     show_progress=show_progress,
                                     step_size=step_size)

    # the optimiser saw normalized covariates; dividing by the per-column std
    # presumably maps the coefficients back to the original scale
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()
    # the partial hazard is negated before being passed to concordance_index
    self.score_ = concordance_index(self.durations,
                                    -self.predict_partial_hazard(df).values.ravel(),
                                    self.event_observed)
    self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
    return self
def plot(self, **kwargs):
    """
    Plot every sampled survival function as a faint line on one axes.

    Parameters
    ----------
    **kwargs
        forwarded to ``self.sample_survival_functions_.plot``. Three entries
        are adjusted first:

        - ``alpha`` defaults to ``15. / self.samples`` when not supplied.
        - ``legend`` is always forced to ``False``.
        - ``c`` / ``color``: the first one supplied wins (``c`` before
          ``color``), falling back to ``'#348ABD'``. Only ``c`` is forwarded;
          ``color`` is removed so the two aliases never reach the plotting
          backend together.

    Returns
    -------
    whatever ``self.sample_survival_functions_.plot`` returns.
    """
    kwargs['alpha'] = coalesce(kwargs.get('alpha'), 15. / self.samples)
    kwargs['legend'] = False
    # Pop both spellings: leaving 'color' in place next to 'c' hands the
    # plotting call two aliases of the same property.
    kwargs['c'] = coalesce(kwargs.pop('c', None), kwargs.pop('color', None), '#348ABD')
    ax = self.sample_survival_functions_.plot(**kwargs)
    return ax
def fit(self, df, duration_col, event_col=None,
        show_progress=False, initial_beta=None,
        strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters:
      df: a Pandas dataframe with necessary columns `duration_col` and
         `event_col`, plus other covariates. `duration_col` refers to
         the lifetimes of the subjects. `event_col` refers to whether
         the 'death' events was observed: 1 if observed, 0 else (censored).
      duration_col: the column in dataframe that contains the subjects'
         lifetimes.
      event_col: the column in dataframe that contains the subjects' death
         observation. If left as None, assume all individuals are non-censored.
      weights_col: an optional column in the dataframe that denotes the weight per subject.
         This column is expelled and not used as a covariate, but as a weight in the
         final regression. Default weight is 1.
      show_progress: since the fitter is iterative, show convergence
         diagnostics.
      initial_beta: initialize the starting point of the iterative
         algorithm. Default is the zero vector.
      strata: specify a list of columns to use in stratification. This is useful if a
         categorical covariate does not obey the proportional hazard assumption. This
         is used similar to the `strata` expression in R.
         See http://courses.washington.edu/b515/l17.pdf.
      step_size: set an initial step size for the fitting algorithm.

    Returns:
        self, with additional properties: hazards_
    """

    # work on a copy so the caller's dataframe is never mutated
    df = df.copy()

    # Sort on time
    df = df.sort_values(by=duration_col)

    self._n_examples = df.shape[0]
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # remember the row index before it is replaced by the strata columns
        original_index = df.index.copy()
        df = df.set_index(self.strata)

    # Extract time and event
    T = df[duration_col]
    del df[duration_col]
    if event_col is None:
        E = pd.Series(np.ones(df.shape[0]), index=df.index)
    else:
        E = df[event_col]
        del df[event_col]

    # weights are handled as a plain ndarray in this method
    if weights_col:
        weights = df.pop(weights_col).values
    else:
        weights = np.ones(self._n_examples)

    # NOTE(review): only df and E are validated here; T is not passed - confirm
    # this matches the signature of _check_values
    self._check_values(df, E)
    df = df.astype(float)

    # save fitting data for later
    self.durations = T.copy()
    self.event_observed = E.copy()
    if self.strata is not None:
        # the saved copies get the pre-stratification index back
        self.durations.index = original_index
        self.event_observed.index = original_index
    self.event_observed = self.event_observed.astype(bool)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E,
                                     weights=weights,
                                     initial_beta=initial_beta,
                                     show_progress=show_progress,
                                     step_size=step_size)

    # the optimiser saw normalized covariates; dividing by the per-column std
    # presumably maps the coefficients back to the original scale
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()
    # the partial hazard is negated before being passed to concordance_index
    self.score_ = concordance_index(self.durations,
                                    -self.predict_partial_hazard(df).values.ravel(),
                                    self.event_observed)
    self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
    return self
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="BFH_estimate",
    alpha=None,
    ci_labels=None,
):  # pylint: disable=too-many-arguments
    """
    Estimate the survival function as ``exp(-H)``, where ``H`` is the
    cumulative hazard produced by an internal ``NelsonAalenFitter``.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        how long each subject was observed for.
    event_observed: an array, or pd.Series, of length n, optional
        True where the death was observed, False where the subject was
        right-censored.
    timeline: optional
        times at which the estimate should be reported.
    entry: an array, or pd.Series, of length n, optional
        relative time at which each subject entered the study
        (for left-truncated observations).
    label: string
        column name for the estimate; stored on ``self._label``.
    alpha: float, optional
        alpha for the confidence intervals of this call only. ``None``
        means ``self.alpha`` is used.
    ci_labels: iterable, optional
        custom [<lower-bound name>, <upper-bound name>] column names for
        the confidence interval.

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = label

    # All of the estimation is delegated to a Nelson-Aalen fit.
    effective_alpha = coalesce(alpha, self.alpha)
    naf = NelsonAalenFitter(alpha=effective_alpha)
    naf.fit(
        durations,
        event_observed=event_observed,
        timeline=timeline,
        label=label,
        entry=entry,
        ci_labels=ci_labels,
    )

    # Mirror the fitted inputs and event table onto this fitter.
    self.durations = naf.durations
    self.event_observed = naf.event_observed
    self.timeline = naf.timeline
    self.entry = naf.entry
    self.event_table = naf.event_table

    # estimation
    self.survival_function_ = np.exp(-naf.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-naf.confidence_interval_)

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    self._update_docstrings()

    # plotting functions
    self.plot_survival_function = self.plot
    return self
def plot(ix=None, iloc=None, flat=False, show_censors=False,
         censor_styles=None, ci_legend=False, ci_force_lines=False,
         ci_alpha=0.25, ci_show=True, at_risk_counts=False,
         bandwidth=None, **kwargs):
    """
    Plot an estimate and, optionally, its confidence interval and censor marks.

    ``cls`` (the fitted estimator) and ``estimate`` (the name of the
    attribute to plot) are free variables here; they are expected to be
    supplied by the enclosing scope, which is not part of this block.

    Parameters
    ----------
    ix:
        label-based selection applied to the plotted frames through ``.loc``.
        Mutually exclusive with ``iloc``.
    iloc:
        position-based selection applied to the plotted frames through
        ``.iloc``. Mutually exclusive with ``ix``.
    flat: bool
        shorthand that turns on both ``ci_force_lines`` and ``show_censors``.
    show_censors: bool
        draw a marker at every time whose ``event_table['censored']`` count
        is positive.
    censor_styles: dict, optional
        overrides merged on top of the default censor marker style
        ``{'marker': '+', 'ms': 12, 'mew': 1}``.
    ci_legend: bool
        accepted for interface compatibility; not read by this function.
    ci_force_lines: bool
        draw the confidence interval as lines instead of a shaded band.
    ci_alpha: float
        transparency of the shaded confidence band.
    ci_show: bool
        whether to draw the confidence interval at all.
    at_risk_counts: bool
        when True, ``add_at_risk_counts`` is called on the axes.
    bandwidth:
        required when ``estimate == "hazard_"``; forwarded to
        ``smoothed_hazard_`` and ``smoothed_hazard_confidence_intervals_``.
    **kwargs:
        forwarded to the pandas ``.plot`` call for the estimate. ``ax``,
        ``color`` (or ``c``) and ``drawstyle`` are filled in when missing.

    Returns
    -------
    The matplotlib axes the estimate was drawn on (``kwargs['ax']``).

    Raises
    ------
    ValueError
        if both ``ix`` and ``iloc`` are given, or if a hazard plot is
        requested without a ``bandwidth``.
    """
    from matplotlib import pyplot as plt

    if censor_styles is None:
        censor_styles = {}

    if ix is not None and iloc is not None:
        raise ValueError('Cannot set both ix and iloc in call to .plot().')

    if "ax" not in kwargs:
        kwargs["ax"] = plt.figure().add_subplot(111)
    kwargs['color'] = coalesce(kwargs.get('c'), kwargs.get('color'),
                               next(kwargs["ax"]._get_lines.color_cycle))
    kwargs['drawstyle'] = coalesce(kwargs.get('drawstyle'), 'steps-post')

    # R-style graphics
    if flat:
        ci_force_lines = True
        show_censors = True

    if estimate == "hazard_":
        if bandwidth is None:
            raise ValueError('Must specify a bandwidth parameter in the '
                             'call to plot_hazard.')
        estimate_ = cls.smoothed_hazard_(bandwidth)
        confidence_interval_ = cls.smoothed_hazard_confidence_intervals_(
            bandwidth, hazard_=estimate_.values[:, 0])
    else:
        confidence_interval_ = getattr(cls, 'confidence_interval_')
        estimate_ = getattr(cls, estimate)

    # did user specify certain indexes or locations?
    # Identity checks on purpose: comparing with `==` would broadcast over an
    # array-like `ix` and make the condition ambiguous.
    if ix is None and iloc is None:
        user_submitted_ix = slice(None)
    else:
        user_submitted_ix = ix if ix is not None else iloc
    # `.loc` is the label-based indexer; the older `.ix` accessor is no longer
    # available in current pandas releases.
    get_method = "loc" if ix is not None else "iloc"

    def get_loc(df):
        """Apply the user's row selection to ``df``."""
        return getattr(df, get_method)[user_submitted_ix]

    # plot censors
    if show_censors and cls.event_table['censored'].sum() > 0:
        cs = {'marker': '+', 'ms': 12, 'mew': 1}
        cs.update(censor_styles)
        censored_rows = cls.event_table.loc[cls.event_table['censored'] > 0]
        times = get_loc(censored_rows).index.values.astype(float)
        v = cls.predict(times)
        kwargs['ax'].plot(times, v, linestyle='None',
                          color=kwargs['color'], **cs)

    # plot estimate
    get_loc(estimate_).plot(**kwargs)

    # plot confidence intervals
    if ci_show:
        if ci_force_lines:
            get_loc(confidence_interval_).plot(
                linestyle="-", linewidth=1,
                color=[kwargs['color']], legend=True,
                drawstyle=kwargs.get('drawstyle', 'default'),
                ax=kwargs['ax'], alpha=0.6)
        else:
            x = get_loc(confidence_interval_).index.values.astype(float)
            lower = get_loc(
                confidence_interval_.filter(like='lower')).values[:, 0]
            upper = get_loc(
                confidence_interval_.filter(like='upper')).values[:, 0]
            fill_between_steps(x, lower, y2=upper, ax=kwargs['ax'],
                               alpha=ci_alpha, color=kwargs['color'],
                               linewidth=1.0)

    if at_risk_counts:
        add_at_risk_counts(cls, ax=kwargs['ax'])

    return kwargs['ax']