def preprocess_df(self, df, event_col, start_col, stop_col, weights_col, id_col):
    df = df.copy()

    if not (event_col in df and start_col in df and stop_col in df):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    df = df.rename(columns={event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"})

    if self.strata is not None and self.id_col is not None:
        df = df.set_index(_to_list(self.strata) + [id_col])
        df = df.sort_index()
    elif self.strata is not None and self.id_col is None:
        df = df.set_index(_to_list(self.strata))
    elif self.strata is None and self.id_col is not None:
        df = df.set_index([id_col])

    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )

    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    return df, events, start, stop, weights
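
# --- Illustrative sketch (not part of the library source) ---
# A minimal sketch of the long-format ("start/stop") DataFrame this
# preprocessing expects: one row per subject-period, with `event` flagging
# the period in which the death occurred. Column names are illustrative.
import pandas as pd

long_df = pd.DataFrame({
    "id":    [1, 1, 2, 2, 2],               # subject identifier (multiple rows per subject)
    "start": [0, 3, 0, 2, 5],               # period start
    "stop":  [3, 7, 2, 5, 9],               # period end
    "event": [0, 1, 0, 0, 0],               # 1 only on the row where the event occurred
    "var1":  [0.1, 0.4, 0.2, 0.2, 0.5],     # a (possibly time-varying) covariate
})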
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col`, plus other covariates. `duration_col` refers to
        the lifetimes of the subjects. `event_col` refers to whether
        the 'death' event was observed: 1 if observed, 0 else (censored).
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are non-censored.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period row.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
        ties, so if there is a high number of ties, results may significantly differ. See
        "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``hazards_`` and ``print_summary``

    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    df = df.copy()

    if not (id_col in df and event_col in df and start_col in df and stop_col in df):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    df = df.rename(columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"})

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )

    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    params_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    self.params_ = pd.Series(params_, index=df.columns, name="coef") / self._norm_std
    self.hazard_ratios_ = pd.Series(np.exp(self.params_), index=df.columns, name="exp(coef)")
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
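
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of calling this `fit` on a long-format dataset, using
# lifelines' small built-in time-varying dataset.
from lifelines import CoxTimeVaryingFitter
from lifelines.datasets import load_dfcv

df = load_dfcv()
ctv = CoxTimeVaryingFitter()
ctv.fit(df, id_col="id", event_col="event", start_col="start", stop_col="stop")
ctv.print_summary()  # coefficients, exp(coef), standard errors, CIs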
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    entry=None,
    weights=None,
    tol: float = 1e-5,
    show_progress: bool = False,
    **kwargs,
) -> "KaplanMeierFitter":
    """
    Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Currently, only closed intervals are supported. However, it's easy to create open intervals by adding (or subtracting) a very small
    value from the lower-bound (or upper bound). For example, the following turns closed intervals into open intervals.

    >>> left, right = df['left'], df['right']
    >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
    lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
    upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be computed from
        the lower_bound and upper_bound, and can be left blank.
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
        entered study when they were "born".
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subject differently.
    tol: float, optional
        minimum difference in log likelihood changes for iterative algorithm.
    show_progress: bool, optional
        display information during fitting.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

    """
    if entry is not None:
        raise NotImplementedError("entry is not supported yet")

    if weights is None:
        weights = np.ones_like(upper_bound)

    self.weights = np.asarray(weights)

    self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    self.event_observed = self.lower_bound == self.upper_bound

    self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    results = npmle(self.lower_bound, self.upper_bound, verbose=show_progress, tol=tol, weights=weights, **kwargs)
    self.survival_function_ = reconstruct_survival_function(*results, self.timeline, label=self._label).loc[self.timeline]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)

    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    return self
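
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of interval-censored input: each subject's event time is
# only known to lie in [left, right]. Equal bounds mean an exactly observed
# event; an upper bound of np.inf encodes right censoring.
import numpy as np
from lifelines import KaplanMeierFitter

left  = np.array([1.0, 4.0, 2.0, 6.0])
right = np.array([3.0, 4.0, 5.0, np.inf])  # subject 2 exactly observed; subject 4 right-censored

kmf = KaplanMeierFitter()
kmf.fit_interval_censoring(left, right)
kmf.survival_function_  # the Turnbull / NPMLE estimate on the observed timeline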
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col`, plus other covariates. `duration_col` refers to
        the lifetimes of the subjects. `event_col` refers to whether
        the 'death' event was observed: 1 if observed, 0 else (censored).
    id_col: string
        A subject could have multiple rows in the DataFrame. This column contains
        the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are non-censored.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each subject-period row.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
        ties, so if there is a high number of ties, results may significantly differ. See
        "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This is useful if a
        categorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``hazards_`` and ``print_summary``

    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    df = df.copy()

    if not (id_col in df and event_col in df and start_col in df and stop_col in df):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    df = df.rename(columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"})

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )

    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    self.hazards_ = pd.Series(hazards_, index=df.columns, name="coef") / self._norm_std
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()
    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
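
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of building the long format this `fit` expects from a
# standard one-row-per-subject frame, using lifelines' documented helpers.
# Column names here are illustrative.
import pandas as pd
from lifelines.utils import to_long_format, add_covariate_to_timeline

# static baseline data: one row per subject
base = pd.DataFrame({"id": [1, 2], "duration": [10, 12], "event": [1, 0], "var1": [0.1, 0.5]})
base = to_long_format(base, duration_col="duration")

# time-varying covariate measurements: the value of `z` from time `time` onward, per subject
cv = pd.DataFrame({"id": [1, 1, 2], "time": [0, 4, 0], "z": [0.0, 1.0, 0.0]})
df = add_covariate_to_timeline(base, cv, id_col="id", duration_col="time", event_col="event")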
def fit(self, df, duration_col=None, event_col=None, ancillary_df=None, show_progress=False, timeline=None):
    """
    Fit the Weibull accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights, strata).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0 else (censored).
    duration_col: string
        the name of the column in dataframe that contains the subjects' lifetimes.
    event_col: string, optional
        the name of the column in dataframe that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
    ancillary_df: None, boolean, or DataFrame, optional (default=None)
        Choose to model the ancillary parameters.
        If None or False, explicitly do not fit the ancillary parameters using any covariates.
        If True, model the ancillary parameters with the same covariates as ``df``.
        If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``.
    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    Returns
    -------
    self: WeibullAFTFitter
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Examples
    --------
    >>> from lifelines import WeibullAFTFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E')
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E', ancillary_df=df)
    >>> aft.print_summary()
    >>> aft.predict_median(df)

    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples), index=df.index, name="E")
    )
    self.durations = T.copy()
    self.event_observed = E.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    self._check_values(df, T, E, self.event_col)

    if isinstance(ancillary_df, pd.DataFrame):
        assert ancillary_df.shape[0] == df.shape[0], "ancillary_df must be the same shape[0] as df"

        ancillary_df = ancillary_df.copy().drop([duration_col, event_col], axis=1, errors="ignore")
        self._check_values(ancillary_df, T, E, self.event_col)

    elif (ancillary_df is None) or (ancillary_df is False):
        ancillary_df = pd.DataFrame(np.ones((df.shape[0],)), index=df.index, columns=["_intercept"])
    elif ancillary_df is True:
        ancillary_df = df.copy()

    if self.fit_intercept:
        assert "_intercept" not in df
        ancillary_df["_intercept"] = 1.0
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns), len(ancillary_df.columns))

    _norm_std, _norm_std_ancillary = df.std(0), ancillary_df.std(0)
    self._norm_mean, self._norm_mean_ancillary = df.mean(0), ancillary_df.mean(0)

    # if we included an intercept, we need to avoid dividing by zero.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
        _norm_std_ancillary["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0
        _norm_std_ancillary[_norm_std_ancillary < 1e-8] = 1.0

    _index = pd.MultiIndex.from_tuples(
        [("lambda_", c) for c in df.columns] + [("rho_", c) for c in ancillary_df.columns]
    )

    self._norm_std = pd.Series(np.append(_norm_std, _norm_std_ancillary), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        normalize(df, 0, _norm_std).values,
        normalize(ancillary_df, 0, _norm_std_ancillary).values,
        show_progress=show_progress,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors()
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df, ancillary_df)

    return self
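
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of inspecting the fitted parameters: `params_` is indexed
# by a MultiIndex over the Weibull parameters ("lambda_", "rho_") and the
# covariates, matching the `_index` constructed above.
from lifelines import WeibullAFTFitter
from lifelines.datasets import load_rossi

aft = WeibullAFTFitter().fit(load_rossi(), duration_col="week", event_col="arrest")
aft.params_.loc["lambda_"]   # covariate effects on the scale parameter
aft.params_.loc["rho_"]      # intercept-only unless ancillary_df was supplied
aft.confidence_intervals_    # matching MultiIndexed confidence intervals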
def fit_interval_censoring(
    self,
    lower_bound,
    upper_bound,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    show_progress=False,
    entry=None,
    weights=None,
    tol=1e-7,
) -> "KaplanMeierFitter":
    """
    Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
    also called the Turnbull Estimator.

    Note
    ------
    This is new and experimental, and many features are missing.

    Parameters
    ----------
    lower_bound: an array, list, pd.DataFrame or pd.Series
        length n -- lower bound of observations
    upper_bound: an array, list, pd.DataFrame or pd.Series
        length n -- upper bound of observations
    event_observed: an array, list, pd.DataFrame, or pd.Series, optional
        True if the death was observed, False if the event was lost (right-censored). This can be computed from
        the lower_bound and upper_bound, and can be left blank.
    timeline: an array, list, pd.DataFrame, or pd.Series, optional
        return the best estimate at the values in timelines (positively increasing)
    entry: an array, list, pd.DataFrame, or pd.Series, optional
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
        entered study when they were "born".
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: tuple, optional
        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: an array, list, pd.DataFrame, or pd.Series, optional
        if providing a weighted dataset. For example, instead of providing every subject
        as a single element of `durations` and `event_observed`, one could weigh subject differently.

    Returns
    -------
    self: KaplanMeierFitter
        self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

    """
    warnings.warn("This is new and experimental, many features are missing and accuracy is not reliable", UserWarning)

    if entry is not None or weights is not None:
        raise NotImplementedError("entry / weights is not supported yet")

    self.weights = np.ones_like(upper_bound)

    self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
    self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
    check_nans_or_infs(self.lower_bound)

    self.event_observed = self.lower_bound == self.upper_bound

    self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

    if (self.upper_bound < self.lower_bound).any():
        raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

    if event_observed is None:
        event_observed = self.upper_bound == self.lower_bound

    if ((self.lower_bound == self.upper_bound) != event_observed).any():
        raise ValueError(
            "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
        )

    self._label = coalesce(label, self._label, "NPMLE_estimate")

    probs, t_intervals = npmle(self.lower_bound, self.upper_bound, verbose=show_progress)
    self.survival_function_ = reconstruct_survival_function(probs, t_intervals, self.timeline, label=self._label).loc[self.timeline]
    self.cumulative_density_ = 1 - self.survival_function_

    self._median = median_survival_times(self.survival_function_)
    self.percentile = functools.partial(qth_survival_time, model_or_survival_function=self.survival_function_)

    """
    self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
    """

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    self._update_docstrings()
    return self
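
# --- Illustrative sketch (not part of the library source) ---
# A hedged example of recasting right-censored data as interval-censored
# input: observed events get lower == upper; censored subjects get an
# open-ended interval with an infinite upper bound.
import numpy as np

T = np.array([3.0, 5.0, 7.0, 9.0])        # observed or censoring times
E = np.array([True, False, True, False])  # True = event observed

lower = T.copy()
upper = np.where(E, T, np.inf)  # censored subjects: event occurs somewhere after T
# KaplanMeierFitter().fit_interval_censoring(lower, upper) should then recover
# (up to ties) the usual right-censored NPMLE / Kaplan-Meier estimate.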
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0 else (censored).
    duration_col: string
        the name of the column in DataFrame that contains the subjects' lifetimes.
    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction
    weights_col: string
        the column in df that specifies weights per observation.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default is the zero vector.

    Returns
    -------
    self:
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Examples
    --------
    >>> N, d = 80000, 2
    >>> # some numbers taken from http://statwonk.com/parametric-survival.html
    >>> breakpoints = (1, 31, 34, 62, 65)
    >>> betas = np.array(
    >>>     [
    >>>         [1.0, -0.2, np.log(15)],
    >>>         [5.0, -0.4, np.log(333)],
    >>>         [9.0, -0.6, np.log(18)],
    >>>         [5.0, -0.8, np.log(500)],
    >>>         [2.0, -1.0, np.log(20)],
    >>>         [1.0, -1.2, np.log(500)],
    >>>     ]
    >>> )
    >>> X = 0.1 * np.random.exponential(size=(N, d))
    >>> X = np.c_[X, np.ones(N)]
    >>> T = np.empty(N)
    >>> for i in range(N):
    >>>     lambdas = np.exp(-betas.dot(X[i, :]))
    >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
    >>> T_censor = np.minimum(
    >>>     T.mean() * np.random.exponential(size=N), 110
    >>> )  # 110 is the end of observation, eg. current time.
    >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
    >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
    >>> df["E"] = T <= T_censor
    >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
    >>> pew.print_summary()
    >>> pew.plot()

    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
                    It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                    estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    df = df.astype(float)
    self._check_values(df, T, E, self.event_col)

    E = E.astype(bool)
    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to avoid dividing by zero.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
    )

    self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

    return self
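
# --- Illustrative sketch (not part of the library source) ---
# A hedged numeric sketch of the piecewise-exponential model this fitter
# assumes: the hazard is constant within each interval defined by the
# breakpoints, so the cumulative hazard is piecewise linear. Values below
# are illustrative, not fitted.
import numpy as np

breakpoints = np.array([1.0, 31.0, 34.0, 62.0, 65.0])     # interval edges (as in the docstring example)
lambdas = np.array([0.02, 0.05, 0.01, 0.04, 0.03, 0.02])  # one hazard rate per interval

def cumulative_hazard(t, breakpoints, lambdas):
    """H(t) = sum over intervals of lambda_i * (time spent in interval i before t)."""
    edges = np.concatenate(([0.0], breakpoints, [np.inf]))
    widths = np.clip(np.minimum(t, edges[1:]) - edges[:-1], 0, None)
    return float(np.sum(lambdas * widths))

H = cumulative_hazard(40.0, breakpoints, lambdas)  # H(40) = 1.79 here
S = np.exp(-H)                                     # the implied survival probability S(40)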
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0 else (censored).
    duration_col: string
        the name of the column in DataFrame that contains the subjects' lifetimes.
    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction
    weights_col: string
        the column in df that specifies weights per observation.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default is the zero vector.

    Returns
    -------
    self:
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Examples
    --------
    TODO

    >>> from lifelines import WeibullAFTFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E')
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E', ancillary_df=df)
    >>> aft.print_summary()
    >>> aft.predict_median(df)

    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
                    It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                    estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    df = df.astype(float)
    self._check_values(df, T, E, self.event_col)

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))  # TODO

    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to avoid dividing by zero.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
    )

    self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

    return self
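
# --- Illustrative sketch (not part of the library source) ---
# A hedged sketch of the weighted-fit path guarded above: non-integer weights
# (e.g. IPTW propensity weights) trigger the StatisticalWarning unless robust
# standard errors are requested. Frame and column names are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "T": [5.0, 3.0, 9.0, 8.0],
    "E": [1, 1, 0, 1],
    "w": [0.8, 1.3, 0.9, 1.1],   # non-integer sampling / propensity weights
    "x": [0.1, 0.5, 0.2, 0.7],
})
# fitter.fit(df, duration_col="T", event_col="E", weights_col="w", robust=True)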
def _fit(
    self,
    log_likelihood_function,
    df,
    Ts,
    regressors,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
    entry_col=None,
):
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.weights_col = weights_col
    self.entry_col = entry_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust
    self.regressors = regressors  # TODO name

    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )
    entries = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(entry_col)).astype(float)
        if (entry_col is not None)
        else pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry")
    )

    check_nans_or_infs(E)
    E = E.astype(bool)
    self.event_observed = E.copy()
    self.entry = entries.copy()
    self.weights = weights.copy()

    df = df.astype(float)
    self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)

    _norm_std = df.std(0)
    _norm_std[_norm_std < 1e-8] = 1.0
    df_normalized = normalize(df, 0, _norm_std)

    Xs = self._create_Xs_dict(df_normalized)

    self._LOOKUP_SLICE = self._create_slicer(Xs)

    _index = pd.MultiIndex.from_tuples(
        sum(([(name, col) for col in columns] for name, columns in regressors.items()), [])
    )

    self._norm_std = pd.Series([_norm_std.loc[variable_name] for _, variable_name in _index], index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        log_likelihood_function,
        Ts,
        Xs,
        E.values,
        weights.values,
        entries.values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(Ts, E.values, weights.values, entries.values, Xs)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df)
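
# --- Illustrative sketch (not part of the library source) ---
# A hedged sketch of the normalize-then-rescale trick used in `_fit` above:
# fitting against covariates divided by their standard deviation improves
# conditioning, and dividing the fitted coefficients by the same standard
# deviations recovers parameters on the original scale, since
# (x / s) * (b * s) == x * b.
import numpy as np

x = np.array([100.0, 250.0, 175.0])   # a covariate on a large scale
s = x.std()
beta_normalized = 0.8                 # a coefficient fit against x / s (illustrative)
beta_original = beta_normalized / s   # the same effect on the original scale

assert np.allclose((x / s) * beta_normalized, x * beta_original)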
def fit(
    self,
    df,
    duration_col,
    event_col=None,
    regressors=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
    entry_col=None,
):
    """
    Fit the accelerated failure time model to a right-censored dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0 else (censored).
    duration_col: string
        the name of the column in DataFrame that contains the subjects' lifetimes.
    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
    regressors: TODO
    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction
    weights_col: string
        the column in DataFrame that specifies weights per observation.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default is the zero vector.
    entry_col: specify a column in the DataFrame that denotes any late-entries (left truncation) that occurred. See
        the docs on `left truncation <https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html#left-truncated-late-entry-data>`__

    Returns
    -------
    self:
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    """
    self.duration_col = duration_col
    self._time_cols = [duration_col]

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    self.durations = T.copy()

    self._fit(
        self._log_likelihood_right_censoring,
        df,
        (T.values, None),
        event_col=event_col,
        regressors=regressors,
        show_progress=show_progress,
        timeline=timeline,
        weights_col=weights_col,
        robust=robust,
        initial_point=initial_point,
        entry_col=entry_col,
    )

    return self
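
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of this right-censoring entry point, including
# left-truncated (late-entry) subjects via `entry_col`. WeibullAFTFitter is
# used here as a concrete regression subclass; the `entry` column is
# illustrative (all zeros, i.e. no late entry).
from lifelines import WeibullAFTFitter
from lifelines.datasets import load_rossi

df = load_rossi()
df["entry"] = 0.0

aft = WeibullAFTFitter()
aft.fit(df, duration_col="week", event_col="arrest", entry_col="entry")
aft.print_summary()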
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    label=None,
    alpha=None,
    ci_labels=None,
    show_progress=False,
    entry=None,
):  # pylint: disable=too-many-arguments
    """
    Parameters
    ----------
    durations: an array, or pd.Series
        length n, duration subject was observed for
    event_observed: numpy array or pd.Series, optional
        length n, True if the death was observed, False if the event was lost (right-censored).
        Defaults to all True if event_observed is None.
    timeline: list, optional
        return the estimate at the values in timeline (positively increasing)
    label: string, optional
        a string to name the column of the estimate.
    alpha: float, optional
        the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
    ci_labels: list, optional
        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
    show_progress: boolean, optional
        since this is an iterative fitting algorithm, switching this to True will display some iteration details.
    entry: an array, or pd.Series, of length n
        relative time when a subject entered the study. This is useful for left-truncated (not left-censored)
        observations. If None, all members of the population entered study when they were "born": time zero.

    Returns
    -------
    self
        self with new properties like ``cumulative_hazard_``, ``survival_function_``

    """
    label = coalesce(label, self.__class__.__name__.replace("Fitter", "") + "_estimate")

    check_nans_or_infs(durations)
    if event_observed is not None:
        check_nans_or_infs(event_observed)

    self.durations = np.asarray(pass_for_numeric_dtypes_or_raise_array(durations))

    # check for negative or 0 durations - these are not allowed in a weibull model.
    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if not self._KNOWN_MODEL:
        self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._initial_values)

    self.event_observed = (
        np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
    )

    self.entry = np.asarray(entry) if entry is not None else np.zeros_like(self.durations)

    if timeline is not None:
        self.timeline = np.sort(np.asarray(timeline).astype(float))
    else:
        self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

    self._label = label
    self._ci_labels = ci_labels
    self.alpha = coalesce(alpha, self.alpha)

    # estimation
    self._fitted_parameters_, self._log_likelihood, self._hessian_ = self._fit_model(
        self.durations, self.event_observed.astype(bool), self.entry, show_progress=show_progress
    )

    if not self._KNOWN_MODEL:
        self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._fitted_parameters_)

    for param_name, fitted_value in zip(self._fitted_parameter_names, self._fitted_parameters_):
        setattr(self, param_name, fitted_value)

    try:
        self.variance_matrix_ = inv(self._hessian_)
    except np.linalg.LinAlgError:
        self.variance_matrix_ = pinv(self._hessian_)
        warning_text = dedent(
            """\
            The Hessian was not invertible. This could be a model problem:

            1. Are two parameters in the model collinear / exchangeable?
            2. Is the cumulative hazard always non-negative and always non-decreasing?
            3. Are there cusps in the cumulative hazard?

            We will instead approximate it using the pseudo-inverse.

            It's advisable to not trust the variances reported, and to be suspicious of the fitted parameters too.
            Perform plots of the cumulative hazard to help understand the latter's bias.
            """
        )
        warnings.warn(warning_text, StatisticalWarning)

    self._predict_label = label
    self._update_docstrings()

    self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame()
    self.hazard_ = self.hazard_at_times(self.timeline).to_frame()
    self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame()
    return self
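
# --- Illustrative usage sketch (not part of the library source) ---
# A hedged example of this univariate parametric `fit`, using WeibullFitter
# as a concrete subclass.
import numpy as np
from lifelines import WeibullFitter

T = np.array([5.0, 3.0, 9.0, 8.0, 7.0, 4.0])
E = np.array([1, 1, 1, 0, 1, 0])

wf = WeibullFitter()
wf.fit(T, event_observed=E)
wf.lambda_, wf.rho_    # the fitted Weibull parameters, set via setattr above
wf.survival_function_  # evaluated on the default timeline
wf.print_summary()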