def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='NA_estimate', alpha=None, ci_labels=None, weights=None):
    """
    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timeline (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
         was lost (right-censored). Defaults all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated observations, i.e. the birth event was not observed.
         If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals
         as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
      weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
         of providing every subject as a single element of `durations` and `event_observed`, one could
         weigh subjects differently.

    Returns:
      self, with new properties like 'cumulative_hazard_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    if weights is not None:
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
""",
                RuntimeWarning,
            )

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

    cumulative_hazard_, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._variance_f, False
    )

    # estimates
    self._label = label
    self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
    self._cumulative_sq = cumulative_sq_

    # estimation methods
    self._estimation_method = "cumulative_hazard_"
    self._estimate_name = "cumulative_hazard_"
    self._predict_label = label
    self._update_docstrings()

    # plotting
    self.plot_cumulative_hazard = self.plot

    return self
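# Usage sketch for the fit() above (not from the source): assumes the enclosing
# class is lifelines' NelsonAalenFitter, with synthetic, illustrative data.
import numpy as np
from lifelines import NelsonAalenFitter

durations = np.array([5.0, 6.0, 6.0, 2.5, 4.0, 4.0])
event_observed = np.array([1, 0, 0, 1, 1, 1])

naf = NelsonAalenFitter()
naf.fit(durations, event_observed=event_observed, label="NA_estimate")
print(naf.cumulative_hazard_.head())  # one column, named "NA_estimate"
print(naf.confidence_interval_.head())  # lower/upper bands at the fitter's alpha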
def _check_values(self, df, events, start, stop):
    # check_for_overlapping_intervals(df)  # this is currently too slow for production.
    check_nans_or_infs(df)
    check_low_var(df)
    check_complete_separation_low_variance(df, events, self.event_col)
    check_for_numeric_dtypes_or_raise(df)
    check_for_immediate_deaths(events, start, stop)
    check_for_instantaneous_events(start, stop)
def _check_values(df, stop_times_events):
    # check_for_overlapping_intervals(df)  # this is currently too slow for production.
    check_nans_or_infs(df)
    check_low_var(df)
    check_complete_separation_low_variance(df, stop_times_events['event'])
    pass_for_numeric_dtypes_or_raise(df)
    check_for_immediate_deaths(stop_times_events)
    check_for_instantaneous_events(stop_times_events)
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='Exponential_estimate', alpha=None, ci_labels=None):
    """
    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      timeline: return the best estimate at the values in timeline (positively increasing)
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
         was lost (right-censored). Defaults all True if event_observed==None
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated observations, i.e. the birth event was not observed.
         If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals
         as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns:
      self, with new properties like 'survival_function_' and 'lambda_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)
    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )
    self.timeline = (
        np.sort(np.asarray(timeline))
        if timeline is not None
        else np.arange(int(self.durations.min()), int(self.durations.max()) + 1)
    )
    self._label = label

    # estimation
    D = self.event_observed.sum()
    T = self.durations.sum()

    self.lambda_ = D / T
    self._lambda_variance_ = self.lambda_ / T
    self._log_likelihood = np.log(self.lambda_) * D - self.lambda_ * T
    self.survival_function_ = pd.DataFrame(
        np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline
    )
    self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
    self.median_ = 1.0 / self.lambda_ * np.log(2)

    # estimation methods
    self._estimate_name = "survival_function_"
    self._predict_label = label
    self._update_docstrings()

    # plotting
    self.plot_survival_function_ = self.plot

    return self
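# A worked check of the closed-form estimates computed above (synthetic
# numbers, a sketch only): the MLE is lambda_ = D / T, where D is the number of
# observed events and T the total time at risk, and the exponential median is
# log(2) / lambda.
import numpy as np

durations = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
event_observed = np.array([1, 1, 0, 1, 0])

D = event_observed.sum()         # 3 observed events
T = durations.sum()              # 15.0 total time at risk
lambda_ = D / T                  # 0.2, the MLE of the hazard rate
median = np.log(2) / lambda_     # ~3.47
assert np.isclose(median, 1.0 / lambda_ * np.log(2))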
def _preprocess_dataframe(self, df):
    n, _ = df.shape

    df = df.sort_values(by=self.duration_col)

    # Extract time and event
    T = df.pop(self.duration_col)
    E = df.pop(self.event_col) if (self.event_col is not None) else pd.Series(np.ones(n), index=df.index, name="E")
    W = (
        df.pop(self.weights_col)
        if (self.weights_col is not None)
        else pd.Series(np.ones((n,)), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (W.astype(int) != W).any():
            warnings.warn(
                """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased.
""",
                StatisticalWarning,
            )
        if (W <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    X = df.astype(float)
    T = T.astype(float)

    check_nans_or_infs(E)
    E = E.astype(bool)

    self._check_values(df, T, E)

    if self.fit_intercept:
        assert (
            "_intercept" not in df.columns
        ), "_intercept is an internal lifelines column, please rename your column first."
        X["_intercept"] = 1.0

    return X, T, E, W
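# Sketch of the weight checks performed above, on a hypothetical DataFrame
# (column names 'T', 'E', 'w' are illustrative, not from the source).
import numpy as np
import pandas as pd

df = pd.DataFrame({"T": [5.0, 3.0, 9.0], "E": [1, 0, 1], "w": [0.7, 1.2, 1.0]})
W = df["w"]
print((W.astype(int) != W).any())  # True: non-integer weights trigger the StatisticalWarning
print((W <= 0).any())              # False: strictly positive weights pass the hard check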
def _check_values(df, T, E):
    pass_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_nans_or_infs(df)
    check_low_var(df)
    check_complete_separation(df, E, T)
def _check_values(self, df, T, E, weights, entries):
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_positivity(T)
    check_complete_separation(df, E, T, self.event_col)

    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
                    It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or
                    b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
                    """
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    if self.entry_col:
        count_invalid_rows = (entries > T).sum()
        if count_invalid_rows:
            warnings.warn("There exist %d rows where entry > duration." % count_invalid_rows)
def _preprocess_dataframe(self, df):
    n, _ = df.shape

    df = df.sort_values(by=self.duration_col)

    # Extract time and event
    T = df.pop(self.duration_col)
    E = df.pop(self.event_col) if (self.event_col is not None) else pd.Series(np.ones(n), index=df.index, name="E")
    W = (
        df.pop(self.weights_col)
        if (self.weights_col is not None)
        else pd.Series(np.ones((n,)), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (W.astype(int) != W).any():
            warnings.warn(
                """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased.
""",
                StatisticalWarning,
            )
        if (W <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    self.regressors = utils.CovariateParameterMappings({"beta_": self.formula}, df, force_intercept=self.fit_intercept)
    X = self.regressors.transform_df(df)["beta_"]

    T = T.astype(float)

    check_nans_or_infs(E)
    E = E.astype(bool)

    self._check_values(df, T, E)

    return X, T, E, W
def _check_values(self, df, T, E, event_col):
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_nans_or_infs(df)
    check_complete_separation(df, E, T, event_col)

    if self.fit_intercept:
        check_low_var(df)
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    label="Weibull_estimate",
    alpha=None,
    ci_labels=None,
    show_progress=False,
):  # pylint: disable=too-many-arguments
    """
    Parameters
    ----------
    durations: an array, or pd.Series
        length n, duration subject was observed for
    event_observed: numpy array or pd.Series, optional
        length n, True if the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    timeline: list, optional
        return the estimate at the values in timeline (positively increasing)
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: list, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    show_progress: boolean, optional
        since this is an iterative fitting algorithm, switching this to True will display some iteration details.

    Returns
    -------
    self : WeibullFitter
        self with new properties like ``cumulative_hazard_``, ``survival_function_``, ``lambda_``, and ``rho_``.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)

    # check for negative or 0 durations - these are not allowed in a Weibull model.
    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label
    alpha = alpha if alpha is not None else self.alpha

    # estimation
    (self.lambda_, self.rho_), self._hessian_ = self._newton_rhaphson(
        self.durations, self.event_observed, show_progress=show_progress
    )
    self._log_likelihood = -_negative_log_likelihood((self.lambda_, self.rho_), self.durations, self.event_observed)
    self.variance_matrix_ = inv(self._hessian_)
    self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame(name=self._label)
    self.hazard_ = self.hazard_at_times(self.timeline).to_frame(self._label)
    self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame(self._label)
    self.confidence_interval_ = self._bounds(alpha, ci_labels)
    self.median_ = 1.0 / self.lambda_ * (np.log(2)) ** (1.0 / self.rho_)

    # estimation methods
    self._estimate_name = "cumulative_hazard_"
    self._predict_label = label
    self._update_docstrings()

    # plotting - Cumulative hazard takes priority.
    self.plot_cumulative_hazard = self.plot

    return self
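# Usage sketch for the fit() above (synthetic data); assumes the enclosing
# class is lifelines' WeibullFitter. Attribute names follow the version shown
# here; the parametrization of lambda_ differs across lifelines releases.
import numpy as np
from lifelines import WeibullFitter

durations = np.array([2.0, 4.0, 4.0, 5.5, 7.0, 9.0])
event_observed = np.array([1, 1, 0, 1, 1, 0])

wf = WeibullFitter()
wf.fit(durations, event_observed=event_observed)
print(wf.lambda_, wf.rho_)          # the two fitted Weibull parameters
print(wf.cumulative_hazard_.head())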
def fit(self, durations, event_observed=None, timeline=None, label="LogNormal_estimate", alpha=0.95, ci_labels=None):
    # pylint: disable=too-many-arguments
    """
    Parameters
    ----------
    durations: iterable
        an array, or pd.Series, of length n -- duration subject was observed for
    event_observed: iterable, optional
        an array, list, or pd.Series, of length n -- True if the death was observed, False if the event
        was lost (right-censored). Defaults all True if event_observed==None
    timeline: iterable, optional
        return the best estimate at the values in timelines (positively increasing)
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: list, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns
    -------
    self : LogNormalFitter
        self, with new properties like 'survival_function_', 'sigma_' and 'mu_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)
    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )

    # check for negative or 0 durations - these are not allowed in a log-normal model.
    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label

    (self.mu_, self.sigma_), self._log_likelihood, self.variance_matrix_ = self._fit_model(
        self.durations, self.event_observed
    )

    self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame(name=self._label)
    self.hazard_ = self.hazard_at_times(self.timeline).to_frame(name=self._label)
    self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame(name=self._label)
    self.median_ = np.exp(self.mu_)

    self.confidence_interval_ = self._bounds(alpha, ci_labels)

    # estimation methods
    self._estimate_name = "cumulative_hazard_"
    self._predict_label = label
    self._update_docstrings()

    # plotting - Cumulative hazard takes priority.
    self.plot_cumulative_hazard = self.plot

    return self
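# Usage sketch (synthetic data); assumes the enclosing class is lifelines'
# LogNormalFitter. The median of a log-normal is exp(mu_), as set above; the
# median_ attribute follows the version shown here (recent lifelines releases
# expose median_survival_time_ instead).
import numpy as np
from lifelines import LogNormalFitter

durations = np.array([1.5, 2.0, 3.0, 4.5, 8.0, 10.0])
event_observed = np.array([1, 1, 1, 0, 1, 0])

lnf = LogNormalFitter()
lnf.fit(durations, event_observed=event_observed)
print(lnf.mu_, lnf.sigma_)
print(np.exp(lnf.mu_))  # the model's median survival time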
def _check_values(self, array):
    check_nans_or_infs(array)
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    entry=None,
    weights=None,
    tol: float = 1e-5,
    show_progress: bool = False,
    **kwargs,
) -> "KaplanMeierFitter":
    """
    Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Currently, only closed intervals are supported. However, it's easy to create open intervals by
    adding (or subtracting) a very small value from the lower-bound (or upper bound). For example,
    the following turns closed intervals into open intervals.

    >>> left, right = df['left'], df['right']
    >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
    lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
    upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be
        computed from the lower_bound and upper_bound, and can be left blank.
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored)
        observations. If None, all members of the population entered study when they were "born".
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subjects differently.
    tol: float, optional
        minimum difference in log likelihood changes for iterative algorithm.
    show_progress: bool, optional
        display information during fitting.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    if entry is not None:
        raise NotImplementedError("entry is not supported yet")

    if weights is None:
        weights = np.ones_like(upper_bound)

    self.weights = np.asarray(weights)

    self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    self.event_observed = self.lower_bound == self.upper_bound

    self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            """For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored).
Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"""
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    results = npmle(self.lower_bound, self.upper_bound, verbose=show_progress, tol=tol, weights=weights, **kwargs)
    self.survival_function_ = reconstruct_survival_function(*results, self.timeline, label=self._label).loc[
        self.timeline
    ]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)

    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    return self
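# Usage sketch for fit_interval_censoring() above (synthetic data); assumes the
# enclosing class is lifelines' KaplanMeierFitter. Rows with left == right are
# exactly observed events; left < right encodes interval censoring.
import numpy as np
from lifelines import KaplanMeierFitter

left = np.array([1.0, 4.0, 7.0, 3.0])
right = np.array([2.0, 4.0, 10.0, 6.0])

kmf = KaplanMeierFitter()
kmf.fit_interval_censoring(left, right)
print(kmf.survival_function_.head())  # the Turnbull / NPMLE estimate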
def _check_values(self, X, T, E):
    check_for_numeric_dtypes_or_raise(X)
    check_nans_or_infs(T)
    check_nans_or_infs(X)
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    show_progress=False,
    entry=None,
    weights=None,
    tol=1e-7,
) -> "KaplanMeierFitter":
    """
    Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
    lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
    upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be
        computed from the lower_bound and upper_bound, and can be left blank.
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored)
        observations. If None, all members of the population entered study when they were "born".
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subjects differently.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
    """
    warnings.warn("This is new and experimental, many features are missing and accuracy is not reliable", UserWarning)

    if entry is not None or weights is not None:
        raise NotImplementedError("entry / weights is not supported yet")

    self.weights = np.ones_like(upper_bound)

    self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    self.event_observed = self.lower_bound == self.upper_bound

    self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            """For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored).
Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"""
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    probs, t_intervals = npmle(self.lower_bound, self.upper_bound, verbose=show_progress)
    self.survival_function_ = reconstruct_survival_function(probs, t_intervals, self.timeline, label=self._label).loc[
        self.timeline
    ]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)
    self.percentile = functools.partial(qth_survival_time, model_or_survival_function=self.survival_function_)

    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    self._update_docstrings()

    return self
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="KM_estimate",
    alpha=None,
    left_censorship=False,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Parameters
    ----------
    durations: an array, or pd.Series, of length n -- duration subject was observed for
    timeline: return the best estimate at the values in timeline (positively increasing)
    event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
       was lost (right-censored). Defaults all True if event_observed==None
    entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
       useful for left-truncated (not left-censored) observations. If None, all members of the population
       were born at time 0.
    label: a string to name the column of the estimate.
    alpha: the alpha value in the confidence intervals. Overrides the initializing
       alpha for this call to fit only.
    left_censorship: True if durations and event_observed refer to left censorship events. Default False
    ci_labels: add custom column names to the generated confidence intervals
       as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
       of providing every subject as a single element of `durations` and `event_observed`, one could
       weigh subjects differently.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like 'survival_function_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    if weights is not None:
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
""",
                StatisticalWarning,
            )

    # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
    estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"

    v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
    self._label = label
    alpha = alpha if alpha else self.alpha

    log_survival_function, cumulative_sq_ = _additive_estimate(
        self.event_table, self.timeline, self._additive_f, self._additive_var, left_censorship
    )

    if entry is not None:
        # a serious problem with KM is that when the sample size is small and there are too few early
        # truncation times, it may happen that the number of patients at risk and the number of deaths
        # are the same. we adjust for this using the Breslow-Fleming-Harrington estimator
        n = self.event_table.shape[0]
        net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
        if net_population.iloc[: int(n / 2)].min() == 0:
            ix = net_population.iloc[: int(n / 2)].idxmin()
            raise StatError(
                """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f.
Recommend BreslowFlemingHarringtonFitter.""" % ix
            )

    # estimation
    setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
    self.__estimate = getattr(self, estimate_name)
    self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
    self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)

    # estimation methods
    self._estimation_method = estimate_name
    self._estimate_name = estimate_name
    self._predict_label = label
    self._update_docstrings()

    # plotting functions
    setattr(self, "plot_" + estimate_name, self.plot)

    return self
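# Usage sketch for the fit() above (synthetic data); assumes the enclosing
# class is lifelines' KaplanMeierFitter. Attribute names follow the version
# shown here; recent lifelines releases expose median_survival_time_ instead
# of median_, and no longer accept the left_censorship flag.
import numpy as np
from lifelines import KaplanMeierFitter

durations = np.array([5.0, 6.0, 6.0, 2.5, 4.0, 4.0])
event_observed = np.array([1, 0, 0, 1, 1, 1])

kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed=event_observed, label="KM_estimate")
print(kmf.survival_function_.head())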
def fit(self, durations, event_observed=None, timeline=None, entry=None,
        label='Weibull_estimate', alpha=None, ci_labels=None, show_progress=False):
    """
    Parameters:
      durations: an array, or pd.Series, of length n -- duration subject was observed for
      event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
         was lost (right-censored). Defaults all True if event_observed==None
      timeline: return the estimate at the values in timeline (positively increasing)
      entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
         useful for left-truncated observations, i.e. the birth event was not observed.
         If None, defaults to all 0 (all birth events observed.)
      label: a string to name the column of the estimate.
      alpha: the alpha value in the confidence intervals. Overrides the initializing
         alpha for this call to fit only.
      ci_labels: add custom column names to the generated confidence intervals
         as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
      show_progress: since this is an iterative fitting algorithm, switching this to True will display some iteration details.

    Returns:
      self, with new properties like 'cumulative_hazard_', 'survival_function_', 'lambda_' and 'rho_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)

    # check for negative or 0 durations - these are not allowed in a Weibull model.
    if np.any(self.durations <= 0):
        raise ValueError(
            'This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements.'
        )

    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label
    alpha = alpha if alpha is not None else self.alpha

    # estimation
    (self.lambda_, self.rho_), self._hessian_ = self._newton_rhaphson(
        self.durations, self.event_observed, show_progress=show_progress
    )
    self._log_likelihood = -_negative_log_likelihood((self.lambda_, self.rho_), self.durations, self.event_observed)
    self.variance_matrix_ = -inv(self._hessian_)
    self.survival_function_ = pd.DataFrame(
        self.survival_function_at_times(self.timeline), columns=[self._label], index=self.timeline
    )
    self.hazard_ = pd.DataFrame(self.hazard_at_times(self.timeline), columns=[self._label], index=self.timeline)
    self.cumulative_hazard_ = pd.DataFrame(
        self.cumulative_hazard_at_times(self.timeline), columns=[self._label], index=self.timeline
    )
    self.confidence_interval_ = self._bounds(alpha, ci_labels)
    self.median_ = 1. / self.lambda_ * (np.log(2)) ** (1. / self.rho_)

    # estimation methods
    self._estimate_name = "cumulative_hazard_"
    self._predict_label = label
    self._update_docstrings()

    # plotting - Cumulative hazard takes priority.
    self.plot_cumulative_hazard = self.plot

    return self
def _check_values(self, X, T, E):
    pass_for_numeric_dtypes_or_raise(X)
    check_nans_or_infs(T)
    check_nans_or_infs(E)
    check_nans_or_infs(X)
def fit(
    self, durations, event_observed=None, timeline=None, label="Exponential_estimate", alpha=None, ci_labels=None
):  # pylint: disable=too-many-arguments
    """
    Parameters
    ----------
    durations: iterable
        an array, or pd.Series, of length n -- duration subject was observed for
    event_observed: iterable, optional
        an array, list, or pd.Series, of length n -- True if the death was observed, False if the event
        was lost (right-censored). Defaults all True if event_observed==None
    timeline: iterable, optional
        return the best estimate at the values in timelines (positively increasing)
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: list, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

    Returns
    -------
    self : ExponentialFitter
        self, with new properties like 'survival_function_', 'cumulative_hazard_', and 'lambda_'.
    """
    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)
    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label

    # estimation
    D = self.event_observed.sum()
    T = self.durations.sum()

    self.lambda_ = D / T
    self._lambda_variance_ = self.lambda_ / T
    self._log_likelihood = np.log(self.lambda_) * D - self.lambda_ * T
    self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame(self._label)
    self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame(self._label)
    self.hazard_ = self.hazard_at_times(self.timeline).to_frame(self._label)
    self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
    self.median_ = 1.0 / self.lambda_ * np.log(2)

    # estimation methods
    self._estimate_name = "cumulative_hazard_"
    self._predict_label = label
    self._update_docstrings()

    # plotting
    self.plot_cumulative_hazard = self.plot

    return self
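# Usage sketch (synthetic data); assumes the enclosing class is lifelines'
# ExponentialFitter. For this model the cumulative hazard is H(t) = lambda * t,
# so cumulative_hazard_ is linear in the timeline. Note that newer lifelines
# releases reparametrize lambda_ as a scale rather than a rate.
import numpy as np
from lifelines import ExponentialFitter

durations = np.array([2.0, 4.0, 4.0, 7.0, 9.0])
event_observed = np.array([1, 1, 0, 1, 1])

ef = ExponentialFitter()
ef.fit(durations, event_observed=event_observed)
print(ef.cumulative_hazard_.head())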
def _fit(
    self,
    log_likelihood_function,
    df,
    Ts,
    regressors,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
    entry_col=None,
):
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.weights_col = weights_col
    self.entry_col = entry_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust
    self.regressors = regressors  # TODO name

    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )
    entries = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(entry_col)).astype(float)
        if (entry_col is not None)
        else pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry")
    )

    check_nans_or_infs(E)
    E = E.astype(bool)
    self.event_observed = E.copy()
    self.entry = entries.copy()
    self.weights = weights.copy()

    df = df.astype(float)
    self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)

    _norm_std = df.std(0)
    _norm_std[_norm_std < 1e-8] = 1.0
    df_normalized = normalize(df, 0, _norm_std)

    Xs = self._create_Xs_dict(df_normalized)

    self._LOOKUP_SLICE = self._create_slicer(Xs)

    _index = pd.MultiIndex.from_tuples(
        sum(([(name, col) for col in columns] for name, columns in regressors.items()), [])
    )

    self._norm_std = pd.Series([_norm_std.loc[variable_name] for _, variable_name in _index], index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        log_likelihood_function,
        Ts,
        Xs,
        E.values,
        weights.values,
        entries.values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(Ts, E.values, weights.values, entries.values, Xs)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df)
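# Why the low-variance guard above matters (a minimal sketch with hypothetical
# data): a (near-)constant column would otherwise make the normalization step
# divide by roughly zero.
import pandas as pd

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [5.0, 5.0, 5.0]})
_norm_std = df.std(0)
_norm_std[_norm_std < 1e-8] = 1.0   # the constant column x2 keeps its scale
print(df / _norm_std)               # x1 is standardized; x2 passes through unchanged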
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    show_progress=False,
    entry=None,
):  # pylint: disable=too-many-arguments
    """
    Parameters
    ----------
    durations: an array, or pd.Series
        length n, duration subject was observed for
    event_observed: numpy array or pd.Series, optional
        length n, True if the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    timeline: list, optional
        return the estimate at the values in timeline (positively increasing)
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: list, optional
        add custom column names to the generated confidence intervals as a length-2 list:
        [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    show_progress: boolean, optional
        since this is an iterative fitting algorithm, switching this to True will display some iteration details.
    entry: an array, or pd.Series, of length n
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored)
        observations. If None, all members of the population entered study when they were "born": time zero.

    Returns
    -------
    self
        self with new properties like ``cumulative_hazard_``, ``survival_function_``
    """
    label = coalesce(label, self.__class__.__name__.replace("Fitter", "") + "_estimate")

    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(durations, dtype=float)

    # check for negative or 0 durations - these are not allowed in this model.
    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if not self._KNOWN_MODEL:
        self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._initial_values)

    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )
    self.entry = np.asarray(entry) if entry is not None else np.zeros_like(self.durations)

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label
    self._ci_labels = ci_labels
    self.alpha = coalesce(alpha, self.alpha)

    # estimation
    self._fitted_parameters_, self._log_likelihood, self._hessian_ = self._fit_model(
        self.durations, self.event_observed.astype(bool), self.entry, show_progress=show_progress
    )

    if not self._KNOWN_MODEL:
        self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._fitted_parameters_)

    for param_name, fitted_value in zip(self._fitted_parameter_names, self._fitted_parameters_):
        setattr(self, param_name, fitted_value)

    try:
        self.variance_matrix_ = inv(self._hessian_)
    except np.linalg.LinAlgError:
        self.variance_matrix_ = pinv(self._hessian_)
        warning_text = dedent(
            """\
            The Hessian was not invertible. This could be a model problem:

            1. Are two parameters in the model colinear / exchangeable?
            2. Is the cumulative hazard always non-negative and always non-decreasing?
            3. Are there cusps in the cumulative hazard?

            We will instead approximate it using the pseudo-inverse.

            It's advisable to not trust the variances reported, and to be suspicious of the fitted parameters too.
            Perform plots of the cumulative hazard to help understand the latter's bias.
            """
        )
        warnings.warn(warning_text, StatisticalWarning)

    self._predict_label = label
    self._update_docstrings()

    self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame()
    self.hazard_ = self.hazard_at_times(self.timeline).to_frame()
    self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame()

    return self
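# Sketch of how this generic fit() is reached in practice: lifelines documents
# subclassing ParametricUnivariateFitter with _fitted_parameter_names and a
# _cumulative_hazard definition. The toy model below (H(t) = lambda * t) is a
# re-derivation of the exponential model, purely illustrative and not from the
# source; the monotonicity checks above are what validate such custom models.
import numpy as np
from lifelines.fitters import ParametricUnivariateFitter

class LinearCumulativeHazardFitter(ParametricUnivariateFitter):

    _fitted_parameter_names = ["lambda_"]

    def _cumulative_hazard(self, params, times):
        # must be non-negative and non-decreasing, per the checks above
        lambda_ = params[0]
        return lambda_ * times

durations = np.array([2.0, 4.0, 4.0, 7.0, 9.0])
event_observed = np.array([1, 1, 0, 1, 1])

fitter = LinearCumulativeHazardFitter()
fitter.fit(durations, event_observed=event_observed)
print(fitter.lambda_)  # should approach D / sum(T), the exponential MLE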