def KM_median(array, upper_lim_flags, left_censor=True, return_type='percentile'):
    """Fit a Kaplan-Meier estimator and summarize its median survival time.

    Parameters
    ----------
    array : array-like
        Durations (or measured values) for each subject.
    upper_lim_flags : array-like or None
        Censoring/event flags forwarded to the fitter. If None, every
        observation is treated as fully observed.
    left_censor : bool, optional
        When flags are provided, fit with left censoring if True,
        otherwise with right censoring. Default True.
    return_type : {'percentile', 'ci', 'median'}, optional
        'percentile' -> (median, S=0.25 percentile, S=0.75 percentile),
        'ci'         -> (median, lower CI bound, upper CI bound),
        'median'     -> median only.

    Returns
    -------
    float or tuple
        Depends on `return_type`; see above.

    Raises
    ------
    ValueError
        If `return_type` is not a supported option (previously the
        function silently returned None in that case).
    """
    kmf = KaplanMeierFitter()
    if upper_lim_flags is not None:
        if left_censor:
            kmf.fit_left_censoring(array, upper_lim_flags)
        else:
            # right censoring
            kmf.fit(array, event_observed=upper_lim_flags)
    else:
        # no flags: all observations treated as observed events
        kmf.fit(array)

    median = median_survival_times(kmf.survival_function_)

    if return_type == 'percentile':
        # S(t) = 0.25 is reached *later* than S(t) = 0.75, hence the naming.
        upper_perc = kmf.percentile(0.25)
        lower_perc = kmf.percentile(0.75)
        print(f'median and 1st/3rd quartiles: {median}, {lower_perc}, {upper_perc}')
        return median, upper_perc, lower_perc
    elif return_type == 'ci':
        median_ci = median_survival_times(kmf.confidence_interval_).values
        print(f'median and CI: {median}, {median_ci}')
        return median, median_ci[0][0], median_ci[0][1]
    elif return_type == 'median':
        return median
    # Fail loudly instead of silently returning None on a typo'd option.
    raise ValueError(f"unknown return_type: {return_type!r}")
def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate', alpha=None, left_censorship=False, ci_labels=None):
    """
    Fit the Kaplan-Meier estimate for the survival function.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timelines (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
         was lost (right-censored). Defaults all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated (not left-censored) observations. If None, all members of the population
         were born at time 0.
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      left_censorship: True if durations and event_observed refer to left censorship events. Default False
      ci_labels: add custom column names to the generated confidence intervals
            as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_'.
    """
    # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_
    estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'

    v = _preprocess_inputs(durations, event_observed, timeline, entry)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
    self._label = label
    alpha = alpha if alpha else self.alpha
    log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                               self._additive_f, self._additive_var,
                                                               left_censorship)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            # BUGFIX: idxmin() returns the timeline *label* (an actual time), whereas
            # argmin() returns a positional index -- the %.1f in the message below is
            # meant to be a time value. Matches the newer version of this method.
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate)

    # estimation methods
    self.predict = self._predict(estimate_name, label)
    self.subtract = self._subtract(estimate_name)
    self.divide = self._divide(estimate_name)

    # plotting functions
    self.plot = self._plot_estimate(estimate_name)
    setattr(self, "plot_" + estimate_name, self.plot)
    self.plot_loglogs = plot_loglogs(self)
    return self
def __init__(self, time, events, label=None, color=None):
    """Build the plot items for one Kaplan-Meier curve.

    Fits a KaplanMeierFitter on (time, events), derives step-curve
    coordinates for the estimate and its confidence band, and constructs
    the pyqtgraph items (curve, CI band, selection overlay, median line,
    censored-point markers) used by the enclosing widget.

    Parameters are assumed to be numpy-convertible arrays of equal
    length; `label` names the curve and `color` styles it.
    """
    self._kmf = KaplanMeierFitter().fit(time.astype(np.float64), events.astype(np.float64))
    self._label: str = label
    self.color: List[int] = color

    # refactor this
    # Survival function as two parallel lists: event times and S(t) values.
    time, survival = self._kmf.survival_function_.reset_index().values.T.tolist()
    lower, upper = self._kmf.confidence_interval_.values.T.tolist()
    self.x, self.y = self.generate_curve_coordinates(time, survival)
    _, self.lower_bound = self.generate_curve_coordinates(time, lower)
    _, self.upper_bound = self.generate_curve_coordinates(time, upper)

    # Estimated function curve
    self.estimated_fun = pg.PlotDataItem(self.x, self.y, pen=self.get_pen())

    # Lower and upper confidence intervals
    pen = self.get_pen(width=1, alpha=70)
    self.lower_conf_limit = pg.PlotDataItem(self.x, self.lower_bound, pen=pen)
    self.upper_conf_limit = pg.PlotDataItem(self.x, self.upper_bound, pen=pen)
    # Shaded band between the two CI curves.
    self.confidence_interval = pg.FillBetweenItem(
        self.upper_conf_limit, self.lower_conf_limit,
        brush=self.get_color(alpha=50))

    # Hidden by default; shown when the user highlights this curve.
    self.selection = pg.PlotDataItem(
        pen=mkPen(color=QColor(Qt.yellow), width=4))
    self.selection.hide()

    # Median survival time, rounded to one decimal, drawn as a vertical
    # segment from y=0 up to y=0.5 (where S(t) crosses the median).
    # NOTE(review): `mkPen` above vs `pg.mkPen` here -- presumably the same
    # function imported two ways; confirm against the module's imports.
    self.median_survival = median = np.round(
        median_survival_times(
            self._kmf.survival_function_.astype(np.float32)), 1)
    self.median_vertical = pg.PlotDataItem(
        x=(median, median), y=(0, 0.5), pen=pg.mkPen(**MEDIAN_LINE_PEN_STYLE))

    # Censored observations rendered as tick marks on top of the curve.
    censored_data = self.get_censored_data()
    self.censored_data = pg.ScatterPlotItem(
        x=censored_data[:, 0],
        y=censored_data[:, 1],
        brush=QBrush(Qt.black),
        pen=self.get_pen(width=1, alpha=255),
        symbol=create_line_symbol(),
        size=15,
    )
    self.censored_data.setZValue(10)

    self.num_of_samples = len(events)
    self.num_of_censored_samples = len(censored_data)
def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate', alpha=None, left_censorship=False, ci_labels=None):
    """
    Fit the Kaplan-Meier estimate for the survival function.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timelines (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
         was lost (right-censored). Defaults all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated (not left-censored) observations. If None, all members of the population
         were born at time 0.
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      left_censorship: True if durations and event_observed refer to left censorship events. Default False
      ci_labels: add custom column names to the generated confidence intervals
            as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_'.
    """
    # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_
    estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'

    v = _preprocess_inputs(durations, event_observed, timeline, entry)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
    self._label = label
    alpha = alpha if alpha else self.alpha
    log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                               self._additive_f, self._additive_var,
                                                               left_censorship)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            # BUGFIX: idxmin() yields the timeline label (a time value) that the
            # %.1f in the message expects; argmin() would give a positional index.
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate)

    # estimation methods
    self.predict = self._predict(estimate_name, label)
    self.subtract = self._subtract(estimate_name)
    self.divide = self._divide(estimate_name)

    # plotting functions
    self.plot = self._plot_estimate(estimate_name)
    setattr(self, "plot_" + estimate_name, self.plot)
    return self
def fit(self, durations, event_observed=None, timeline=None, entry=None, label='BFH_estimate', alpha=None, ci_labels=None):
    """
    Compute the Breslow-Fleming-Harrington estimate of the survival function.

    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timelines (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if
         the event was lost (right-censored). Defaults all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study.
         Useful for left-truncated observations, i.e. the birth event was not observed.
         If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing alpha for
         this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals as a length-2
         list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_'.
    """
    self._label = label
    alpha = self.alpha if alpha is None else alpha

    # Delegate to a Nelson-Aalen fit of the cumulative hazard H(t).
    na_fitter = NelsonAalenFitter(alpha)
    na_fitter.fit(durations, event_observed=event_observed, timeline=timeline,
                  label=label, entry=entry, ci_labels=ci_labels)
    self.durations = na_fitter.durations
    self.event_observed = na_fitter.event_observed
    self.timeline = na_fitter.timeline
    self.entry = na_fitter.entry
    self.event_table = na_fitter.event_table

    # estimation: S(t) = exp(-H(t)), applied to estimate and CI alike
    self.survival_function_ = np.exp(-na_fitter.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-na_fitter.confidence_interval_)
    self.median_ = median_survival_times(self.survival_function_)

    # estimation methods
    self.predict = self._predict("survival_function_", label)
    self.subtract = self._subtract("survival_function_")
    self.divide = self._divide("survival_function_")

    # plotting functions
    self.plot = self._plot_estimate("survival_function_")
    self.plot_survival_function = self.plot
    return self
def predict_50(conditioned_sf):
    """Show and return the month at which each customer's survival chance hits 50%.

    NOTE(review): relies on a module-level `customer` variable for the column
    selection in `st.write` -- confirm it is set before this is called.
    The bare triple-quoted string below may be rendered by Streamlit's
    "magic" output, so it is left exactly as written.
    """
    # Predict the month number where the survival chance of customer is 50%
    # This can also be modified as predictions_50 = qth_survival_times(.50, conditioned_sf),
    # where the percentile can be modified depending on our requirement
    predictions_50 = median_survival_times(conditioned_sf)
    '''
    ### predictions_50
    Predicting the month at which the survival chance of the customer is 50%
    '''
    st.write(predictions_50[[customer]])
    return predictions_50
def predict_50(conditioned_sf):
    """Return the month at which the survival chance drops to 50%.

    Equivalent to ``qth_survival_times(.50, conditioned_sf)``; switch to
    that call with a different quantile if another survival threshold is
    required.
    """
    return median_survival_times(conditioned_sf)
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="KM_estimate",
    alpha=None,
    left_censorship=False,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Fit the Kaplan-Meier estimate for the survival function.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n -- duration subject was observed for
    timeline: return the best estimate at the values in timelines (positively increasing)
    event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
       was lost (right-censored). Defaults all True if event_observed==None
    entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
       useful for left-truncated (not left-censored) observations. If None, all members of the population
       were born at time 0.
    label: a string to name the column of the estimate.
    alpha: the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    left_censorship: True if durations and event_observed refer to left censorship events. Default False
    ci_labels: add custom column names to the generated confidence intervals
          as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
        of providing every subject as a single element of `durations` and `event_observed`, one could
        weigh subject differently.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like 'survival_function_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    if weights is not None:
        # BUGFIX: coerce to ndarray first so plain-list inputs don't crash on
        # .astype below (consistent with the other fitters in this package).
        weights = np.asarray(weights)
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
""",
                StatisticalWarning,
            )

    # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_
    estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
    self._label = label
    alpha = alpha if alpha else self.alpha
    log_survival_function, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, left_censorship)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths is the same.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f.
Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation
    setattr(
        self, estimate_name,
        pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)

    # estimation methods
    self._estimation_method = estimate_name
    self._estimate_name = estimate_name
    self._predict_label = label
    self._update_docstrings()

    # plotting functions
    setattr(self, "plot_" + estimate_name, self.plot)
    return self
# print(df) print(df.head(),'\n') print(df['T'].min(), df['T'].max(),'\n') print(df['E'].value_counts(),'\n') print(df['group'].value_counts(),'\n') kmf = KaplanMeierFitter() kmf.fit(df['T'], event_observed=df['E']) # kmf.plot_survival_function() ax=kmf.survival_function_.plot() #共享一个画布 ax=kmf.plot(ax=ax) median_ = kmf.median_survival_time_ median_confidence_interval_ = median_survival_times(kmf.confidence_interval_) print(median_confidence_interval_) groups = df['group'] ix = (groups == 'largeAmount') kmf.fit(df['T'][ix], df['E'][ix], label='largeAmount') ax=kmf.survival_function_.plot(ax=ax) ax = kmf.plot(ax=ax) # plt.show() treatment_median_confidence_interval_ = median_survival_times(kmf.confidence_interval_) print("使用量基数较大的音乐存活50%对应的存活时间95%置信区间:'\n'", treatment_median_confidence_interval_, '\n') kmf.fit(df['T'][~ix], df['E'][~ix], label='smallAmount') ax=kmf.survival_function_.plot(ax=ax) ax = kmf.plot(ax=ax) # plt.show()
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    entry=None,
    weights=None,
    tol: float = 1e-5,
    show_progress: bool = False,
    **kwargs,
) -> "KaplanMeierFitter":
    """
    Fit the model to a interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Currently, only closed interval are supported. However, it's easy to create open intervals by adding (or subtracting) a very small
    value from the lower-bound (or upper bound). For example, the following turns closed intervals into open intervals.

    >>> left, right = df['left'], df['right']
    >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
      lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
      upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
      event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be computed from
        the lower_bound and upper_bound, and can be left blank.
      timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
      entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
        entered study when they were "born".
      label: string, optional
        a string to name the column of the estimate.
      alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
      weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subject differently.
      tol: float, optional
        minimum difference in log likelihood changes for iterative algorithm.
      show_progress: bool, optional
        display information during fitting.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    if entry is not None:
        raise NotImplementedError("entry is not supported yet")

    # Default to unit weights (one subject per row).
    if weights is None:
        weights = np.ones_like(upper_bound)

    self.weights = np.asarray(weights)

    self.upper_bound = np.atleast_1d(
        pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(
        pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    # A degenerate interval (lower == upper) is an exactly-observed event.
    self.event_observed = self.lower_bound == self.upper_bound

    # Default timeline: every distinct interval endpoint.
    self.timeline = coalesce(
        timeline,
        np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError(
            "All upper_bound times must be greater than or equal to lower_bound times."
        )

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    # Any user-supplied flags must agree with the degenerate-interval rule.
    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). \n"
            "Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    # Turnbull NPMLE: interval probabilities, reconstructed onto the timeline.
    results = npmle(self.lower_bound,
                    self.upper_bound,
                    verbose=show_progress,
                    tol=tol,
                    weights=weights,
                    **kwargs)
    self.survival_function_ = reconstruct_survival_function(
        *results, self.timeline, label=self._label).loc[self.timeline]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)
    # Disabled pending a working CI computation for the NPMLE estimate:
    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """
    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    return self
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    show_progress=False,
    entry=None,
    weights=None,
    tol=1e-7,
) -> "KaplanMeierFitter":
    """
    Fit the model to a interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
      lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
      upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
      event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be computed from
        the lower_bound and upper_bound, and can be left blank.
      timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
      entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
        entered study when they were "born".
      label: string, optional
        a string to name the column of the estimate.
      alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
      ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
      weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subject differently.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    warnings.warn(
        "This is new and experimental, many feature are missing and accuracy is not reliable",
        UserWarning)

    if entry is not None or weights is not None:
        raise NotImplementedError("entry / weights is not supported yet")

    # Weights are fixed at 1 until weighted fitting is implemented.
    self.weights = np.ones_like(upper_bound)

    self.upper_bound = np.atleast_1d(
        pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(
        pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    # A degenerate interval (lower == upper) marks an exactly-observed event.
    self.event_observed = self.lower_bound == self.upper_bound

    # Default timeline: every distinct interval endpoint.
    self.timeline = coalesce(
        timeline,
        np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError(
            "All upper_bound times must be greater than or equal to lower_bound times."
        )

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    # Any user-supplied flags must agree with the degenerate-interval rule.
    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). \n"
            "Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    # Turnbull NPMLE: interval probabilities, reconstructed onto the timeline.
    probs, t_intervals = npmle(self.lower_bound,
                               self.upper_bound,
                               verbose=show_progress)
    self.survival_function_ = reconstruct_survival_function(
        probs, t_intervals, self.timeline,
        label=self._label).loc[self.timeline]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)
    # Pre-bind the survival function so callers can ask for any percentile.
    self.percentile = functools.partial(
        qth_survival_time,
        model_or_survival_function=self.survival_function_)
    # Disabled pending a working CI computation for the NPMLE estimate:
    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """
    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    self._update_docstrings()

    return self
def test_median_accepts_series():
    """median_survival_times should also accept a plain pd.Series."""
    # Linearly falling survival curve over 1000 points; it crosses 0.5
    # at index 500, which is therefore the median survival time.
    survival_curve = pd.Series(1 - np.linspace(0, 1, 1000))
    assert utils.median_survival_times(survival_curve) == 500
def _fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="KM_estimate",
    alpha=None,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Shared fitting routine for right- and left-censored data; the censoring
    type is taken from this instance via ``CensoringType``, not a parameter.

    Parameters
    ----------
    durations: an array, list, pd.DataFrame or pd.Series
      length n -- duration subject was observed for
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
      True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
      return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
      relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
      entered study when they were "born".
    label: string, optional
      a string to name the column of the estimate.
    alpha: float, optional
      the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
      add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
      if providing a weighted dataset. For example, instead of providing every subject
      as a single element of `durations` and `event_observed`, one could weigh subject differently.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median``
    """
    self._check_values(durations)
    if event_observed is not None:
        self._check_values(event_observed)
    self._label = label

    if weights is not None:
        weights = np.asarray(weights)
        # Non-integer weights (e.g. propensity scores) bias the naive
        # variance estimates, so warn the user.
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
""",
                StatisticalWarning,
            )

    # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = _preprocess_inputs(
        durations, event_observed, timeline, entry, weights
    )

    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
    )

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[: int(n / 2)].min() == 0:
            ix = net_population.iloc[: int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter.""" % ix
            )

    # estimation
    # Primary estimate is exp(log_estimate); the secondary is its complement,
    # so survival_function_ and cumulative_density_ are always both available.
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=is_left_censoring)
    self._cumulative_sq_ = cumulative_sq_

    # CI of the secondary estimate is the complement of the primary's CI.
    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name
    self._update_docstrings()
    return self
# Tidy the axes and annotate the overall KM plot with legend + at-risk counts.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# NOTE(review): add_at_risk_counts normally takes fitter objects (*fitters);
# passing the int 12 as the first argument looks suspicious -- confirm intent.
add_at_risk_counts(12, kmf_overall, labels=None)
# Proxy artists for the legend (dashed grey = due date, blue tick = censored).
leg_1 = Line2D([0], [0], color='grey', ls='--')
leg_2 = Line2D([0], [0], color='blue', marker='|', ms=10)
leg = [leg_1, leg_2]
leg_lab = ['Due Date', 'Censored']
plt.legend(leg, leg_lab, fontsize=20)
#plt.savefig(parent + '/Figures/survival_with_censor.png')
# -

#This provides the values for the 95% CI at the median
from lifelines.utils import median_survival_times
median_survival_times(kmf_overall.confidence_interval_)

# +
#The proportion reported any day can be checked by running this code and editing the value in loc[]:
print(kmf_overall.survival_function_.loc[796]['KM_estimate'])
print(kmf_overall.survival_function_.loc[815]['KM_estimate'])

#If you are interested in the full data produced by the survival function you can view it here:
surv_func = kmf_overall.survival_function_
surv_func.head()
# -

#Similarly, the upper and lower confidence interval values can be checked by editing the value in loc[]:
kmf_overall.confidence_interval_survival_function_.loc[412]
# +
# Minimal Kaplan-Meier demo on the Waltons dataset: inspect the data,
# fit the estimator, plot it, and report the median survival time with CI.
from lifelines.datasets import load_waltons
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

df = load_waltons()
print(df.head(),'\n')
print(df['T'].min(), df['T'].max(),'\n')
print(df['E'].value_counts(),'\n')
print(df['group'].value_counts(),'\n')

# T = durations, E = event-observed flags (1 = death observed, 0 = censored).
kmf = KaplanMeierFitter()
kmf.fit(df['T'], event_observed=df['E'])
kmf.plot_survival_function()

# Median survival time plus its 95% confidence interval.
median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_confidence_interval_)
def test_median():
    """A survival curve falling linearly to zero has its median at the midpoint."""
    # 1000 points from S=1.0 down to S=0.0; S crosses 0.5 at index 500.
    survival_df = pd.DataFrame(1 - np.linspace(0, 1, 1000))
    assert utils.median_survival_times(survival_df) == 500
def median_(self):
    """
    Return the unique time point, t, such that S(t) = 0.5.

    This "half-life" of the population is a robust summary statistic
    for the population, when it exists.
    """
    survival_curve = self.survival_function_
    return median_survival_times(survival_curve)
def _fit(self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """
    Fit the Kaplan-Meier estimate to the given censored durations.

    Parameters
    ----------
    durations: an array, list, pd.DataFrame or pd.Series
      length n -- duration subject was observed for
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
      True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
      return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
      relative time when a subject entered the study. This is useful for left-truncated (not left-censored)
      observations. If None, all members of the population entered study when they were "born".
    label: string, optional
      a string to name the column of the estimate.
    alpha: float, optional
      the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
      add custom column names to the generated confidence intervals as a length-2 list:
      [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
      if providing a weighted dataset. For example, instead of providing every subject as a single
      element of `durations` and `event_observed`, one could weigh subject differently.

    Returns
    -------
    self: KaplanMeierFitter
      self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    # validate and normalize inputs to numpy arrays
    durations = np.asarray(durations)
    self._check_values(durations)
    if event_observed is not None:
        event_observed = np.asarray(event_observed)
        self._check_values(event_observed)
    self._label = coalesce(label, self._label, "KM_estimate")

    if weights is not None:
        weights = np.asarray(weights)
        # non-integer weights (e.g. propensity scores) bias the naive variance
        # estimates -- warn the user rather than fail
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data." """,
                StatisticalWarning,
            )
    else:
        weights = np.ones_like(durations, dtype=float)

    # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
    is_left_censoring = CensoringType.is_left_censoring(self)
    primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
    secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

    (self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights) = _preprocess_inputs(durations, event_observed, timeline, entry, weights)

    alpha = alpha if alpha else self.alpha
    log_estimate, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring)

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
        # we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[:int(n / 2)].min() == 0:
            # the at-risk population empties in the first half of the timeline;
            # the estimate would be 0 past that point, so refuse to fit
            ix = net_population.iloc[:int(n / 2)].idxmin()
            raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter.""" % ix)

    # estimation
    # both estimates come from the same log-estimate; the secondary is its complement
    setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
    setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))
    self.__estimate = getattr(self, primary_estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_.values[:, None], alpha, ci_labels)
    self._median = median_survival_times(self.survival_function_)
    self._cumulative_sq_ = cumulative_sq_
    # expose the CI under both estimate names; the secondary CI is the complement
    setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
    setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

    # estimation methods
    self._estimation_method = primary_estimate_name
    self._estimate_name = primary_estimate_name

    return self
# Overlay at-risk counts for both cohorts beneath the plot.
add_at_risk_counts(kmf_train, kmf_test, ax=ax, fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12, fontweight='bold')
ax.set_ylabel('Survival probability', fontsize=12, fontweight='bold')
ax.legend(fontsize=12)
# NOTE(review): p-value annotation is hard-coded -- confirm it matches the log-rank result computed below.
ax.text(10, 0.75, 'p=0.85', fontsize=12, fontweight='bold')

# In[27]:

# Point estimates of median survival time per cohort.
print(kmf_train.median_survival_time_)
print(kmf_test.median_survival_time_)

# In[28]:

# Confidence intervals around each median.
print(median_survival_times(kmf_train.confidence_interval_))
print(median_survival_times(kmf_test.confidence_interval_))

# In[29]:

# Raw event tables (at-risk / observed / censored counts over time).
print(kmf_train.event_table)
print(kmf_test.event_table)

# In[30]:

print('Survival probability for t=60 for train set: ', kmf_train.predict(60))
print('Survival probability for t=60 for test set: ', kmf_test.predict(60))

# In[31]:

# NOTE(review): this statement is truncated in this chunk; the remaining
# arguments to logrank_test continue beyond the visible source.
results = logrank_test(train['PFS'],
# Assemble the synthetic subject rows into a frame and extend the censored set.
dfn = pd.DataFrame(d, columns=["ID", "KM", "DEAD", "ENGINE", "MOUNTAIN", "CITY", "MONDAY"])
print(dfn)
censored_subjects = censored_subjects.append(dfn, ignore_index=True)
print(censored_subjects)

# Predicted survival curves for each subject (unconditioned on time survived so far).
unconditioned_sf = cph.predict_survival_function(censored_subjects)
print(unconditioned_sf)

from lifelines.utils import median_survival_times, qth_survival_times

# Quartile and median crossing times of each predicted curve.
predictions_75 = qth_survival_times(0.75, unconditioned_sf)
predictions_25 = qth_survival_times(0.25, unconditioned_sf)
predictions_50 = median_survival_times(unconditioned_sf)
print(predictions_50)

# First figure: every curve drawn semi-transparent.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for subject in unconditioned_sf:
    ax.plot(unconditioned_sf[subject], alpha=0.5, label=subject)
#ax.legend()

# Second figure: highlight the first num_d columns when walking in reverse,
# and grey out everything else.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for idx, subject in enumerate(reversed(unconditioned_sf.columns)):
    if idx >= num_d:
        ax.plot(unconditioned_sf[subject], alpha=0.1, label=subject, c='grey')
    else:
        print(idx, subject)
        ax.plot(unconditioned_sf[subject], alpha=1, label=subject)