def predict_partial_hazard(self, X):
    r"""
    Return the partial hazard for the individuals in ``X``.

    "Partial" because the baseline hazard is not included. Equal to
    :math:`\exp{\beta X}`.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
        can be in any order: they are re-ordered to ``self.hazards_.columns``.
        If a numpy array, columns must be in the same order as the
        training data.

    Returns
    -------
    DataFrame
        ``exp(X . hazards_.T)``, indexed by ``_get_index(X)``.

    Notes
    -----
    If ``self.normalize`` is set, ``X`` is normalized with the stored
    ``self._norm_mean`` and ``self._norm_std`` before the dot product.
    """
    # The docstring must be a raw string: in a normal string "\b" of
    # "\beta" is a backspace character and "\e" is an invalid escape.
    index = _get_index(X)

    if isinstance(X, pd.DataFrame):
        order = self.hazards_.columns
        X = X[order]

    if self.normalize:
        # Assuming correct ordering and number of columns
        X = normalize(X, self._norm_mean.values, self._norm_std.values)

    return pd.DataFrame(exp(np.dot(X, self.hazards_.T)), index=index)
def predict_log_partial_hazard(self, X):
    r"""
    This is equivalent to R's linear.predictors.

    Returns the log of the partial hazard for the individuals, partial since
    the baseline hazard is not included. The value is the output of
    ``self.net`` applied to ``X`` after subtracting ``self._norm_mean``.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. This method does not
        re-order columns, so in both cases the columns must be in the same
        order as the training data.

    Returns
    -------
    Series
        one value per row of ``X``, with a default integer index (the index
        of ``X`` is not carried over).
    """
    if isinstance(X, pd.DataFrame):
        check_for_numeric_dtypes_or_raise(X)

    X = X.astype(float)
    X = normalize(X, self._norm_mean.values, 1)

    # torch.tensor cannot infer a shape from a DataFrame, so hand it the
    # underlying ndarray. Array input is passed through untouched.
    if isinstance(X, pd.DataFrame):
        X = X.values

    X_pt = torch.tensor(X, dtype=self.type_pt)
    return pd.Series(self.net(X_pt).detach().numpy().ravel())
def predict_log_partial_hazard(self, X):
    r"""
    Compute the log partial hazard (R's ``linear.predictors``).

    "Partial" because the baseline hazard is left out. The result equals
    :math:`(x - \bar{x})'\beta`.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. DataFrame columns may be
        in any order (they are selected by ``self.params_.index``); numpy
        array columns must follow the training-data order.

    Returns
    -------
    DataFrame
        one row per individual, indexed by ``_get_index`` of the input.
    """
    if isinstance(X, pd.DataFrame):
        # Align the columns with the fitted coefficients before validating.
        X = X[self.params_.index]
        check_for_numeric_dtypes_or_raise(X)

    covariates = X.astype(float)
    row_labels = _get_index(covariates)

    # Only centre the covariates; a scale of 1 leaves their spread untouched.
    centered = normalize(covariates, self._norm_mean.values, 1)
    log_partial_hazard = np.dot(centered, self.params_)
    return pd.DataFrame(log_partial_hazard, index=row_labels)
def predict_log_partial_hazard(self, X):
    r"""
    Log of the partial hazard per individual (R's ``linear.predictors``).

    The baseline hazard is not included, hence "partial". Equal to
    :math:`(x - \bar{x})'\beta`.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. For a DataFrame the
        column order is irrelevant, because columns are picked using
        ``self.hazards_.index``. For a numpy array the columns must be
        ordered as in the training dataset.

    Returns
    -------
    DataFrame
        indexed by ``_get_index`` of the (float-cast) input.
    """
    is_frame = isinstance(X, pd.DataFrame)
    if is_frame:
        coefficient_names = self.hazards_.index
        X = X[coefficient_names]
        check_for_numeric_dtypes_or_raise(X)

    X = X.astype(float)
    labels = _get_index(X)

    # Subtract the training means only (the scale argument is 1).
    shifted = normalize(X, self._norm_mean.values, 1)
    return pd.DataFrame(np.dot(shifted, self.hazards_), index=labels)
def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
    """
    Fit the model to a dataset with start/stop rows per subject.

    Parameters
    ----------
    df: DataFrame
        must contain ``id_col``, ``event_col``, ``start_col`` and
        ``stop_col``; every remaining column is treated as a covariate.
        The caller's frame is not modified (a copy is used).
    id_col: string
        column that becomes the index of the working frame.
    event_col: string
        column cast to bool and used as the event indicator.
    start_col: string
        column holding the start of each row's period.
    stop_col: string
        column holding the end of each row's period.
    show_progress: boolean, optional (default=False)
        forwarded to ``self._newton_rhaphson``.
    step_size: optional
        forwarded to ``self._newton_rhaphson``.

    Returns
    -------
    self
        with ``hazards_``, ``confidence_intervals_``, ``_norm_mean``,
        ``_norm_std`` and ``_n_examples`` set.

    Raises
    ------
    KeyError
        if any of the four named columns is missing from ``df``.
    """
    df = df.copy()

    required = (id_col, event_col, start_col, stop_col)
    if any(col not in df for col in required):
        raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

    # Work with fixed internal names from here on.
    internal_names = {id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'}
    df = df.rename(columns=internal_names)
    df['event'] = df['event'].astype(bool)
    df = df.set_index(['id'])

    self._check_values(df.drop(["event", "stop", "start"], axis=1), df['event'])

    stop_times_events = df[["event", "stop", "start"]]
    df = df.drop(["event", "stop", "start"], axis=1)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    normalized_covariates = normalize(df, self._norm_mean, self._norm_std)
    hazards_ = self._newton_rhaphson(
        normalized_covariates,
        stop_times_events,
        show_progress=show_progress,
        step_size=step_size,
    )

    # Coefficients were estimated on standardized covariates; dividing by
    # the standard deviations expresses them on the original scale.
    standardized_coefs = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef'])
    self.hazards_ = standardized_coefs / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._n_examples = df.shape[0]
    return self
def test_unnormalize():
    """Unnormalizing normalized data with the frame's own mean/std recovers the data."""
    df = load_larynx()
    column_means = df.mean(0)
    column_stds = df.std(0)

    normalized = utils.normalize(df)
    recovered = utils.unnormalize(normalized, column_means, column_stds)

    npt.assert_almost_equal(df.values, recovered.values)
def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied
    survival times are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas dataframe containing ``id_col``, ``event_col``,
        ``start_col`` and ``stop_col``, plus other covariates. The caller's
        frame is not modified (a copy is used).
    id_col: string
        A subject could have multiple rows in the dataframe. This column
        contains the unique identifier per subject.
    event_col: string
        the column in dataframe that contains the subjects' death
        observation: 1 if observed, 0 else (censored). Cast to bool.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    step_size: optional
        set an initial step size for the fitting algorithm.

    Returns
    -------
    self
        with additional properties, among them ``hazards_``,
        ``confidence_intervals_``, ``baseline_cumulative_hazard_`` and
        ``baseline_survival_``.

    Raises
    ------
    KeyError
        if any of the four named columns is missing from ``df``.
    """
    df = df.copy()

    needed_columns = (id_col, event_col, start_col, stop_col)
    if any(name not in df for name in needed_columns):
        raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

    df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'})
    df = df.set_index('id')

    # Split the bookkeeping columns from the covariates.
    bookkeeping = ["event", "stop", "start"]
    stop_times_events = df[bookkeeping].copy()
    df = df.drop(bookkeeping, axis=1)
    stop_times_events['event'] = stop_times_events['event'].astype(bool)

    self._check_values(df, stop_times_events)
    df = df.astype(float)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        stop_times_events,
        show_progress=show_progress,
        step_size=step_size,
    )

    # Undo the standardization so coefficients refer to the raw covariates.
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events)
    self.baseline_survival_ = self._compute_baseline_survival()

    self.event_observed = stop_times_events['event']
    self.start_stop_and_events = stop_times_events

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def predict_log_partial_hazard(self, X):
    r"""
    This is equivalent to R's linear.predictors.

    Returns the log of the partial hazard for the individuals, partial since
    the baseline hazard is not included. Equal to
    :math:`\beta (X - mean(X_{train}))`

    Parameters
    ----------
    X: numpy array, DataFrame or Series
        a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
        can be in any order. If a numpy array, columns must be in the same
        order as the training data. A Series is also accepted:

        * if its length equals the number of covariates, or that number
          plus 2, it is treated as a single individual (one row) and the
          covariates are selected by label;
        * otherwise it is treated as a single covariate column, which
          requires the model to have exactly one covariate.

    Returns
    -------
    log_partial_hazard: DataFrame

    Raises
    ------
    AssertionError
        if a Series is given that cannot be interpreted in either way above.

    Notes
    -----
    If X is a dataframe, the order of the columns do not matter. But
    if X is an array, then the column ordering is assumed to be the
    same as the training dataset.
    """
    hazard_names = self.hazards_.columns
    n_covariates = len(hazard_names)

    if isinstance(X, pd.DataFrame):
        X = X[hazard_names]
        pass_for_numeric_dtypes_or_raise(X)
    elif isinstance(X, pd.Series):
        # NOTE(review): the "+ 2" presumably allows a row that still carries
        # two non-covariate entries (e.g. duration and event) - confirm.
        if X.shape[0] in (n_covariates, n_covariates + 2):
            X = X.to_frame().T
            X = X[hazard_names]
        else:
            # Raised explicitly (instead of `assert`) so the check survives
            # `python -O`; the exception type is unchanged for callers.
            if n_covariates != 1:
                raise AssertionError("Series not the correct argument")
            X = pd.DataFrame(X)
        pass_for_numeric_dtypes_or_raise(X)

    X = X.astype(float)
    index = _get_index(X)

    # Centre on the training means only (scale of 1).
    X = normalize(X, self._norm_mean.values, 1)
    return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
    """
    Fit the model to a dataset made of (start, stop] rows per subject.

    Parameters
    ----------
    df: DataFrame
        must contain ``id_col``, ``event_col``, ``start_col`` and
        ``stop_col``; all other columns are used as covariates. A copy is
        taken, so the caller's frame is left untouched.
    id_col: string
        column used as the index of the working frame.
    event_col: string
        column cast to bool and used as the event indicator.
    start_col: string
        column with the start of each row's period.
    stop_col: string
        column with the end of each row's period.
    show_progress: boolean, optional (default=False)
        passed through to ``self._newton_rhaphson``.
    step_size: optional
        passed through to ``self._newton_rhaphson``.

    Returns
    -------
    self
        with ``hazards_``, ``confidence_intervals_``, ``_norm_mean``,
        ``_norm_std`` and ``_n_examples`` populated.

    Raises
    ------
    KeyError
        if one of the four named columns is not in ``df``.
    """
    df = df.copy()

    for required_column in (id_col, event_col, start_col, stop_col):
        if required_column not in df:
            raise KeyError(
                "A column specified in the call to `fit` does not exist in the dataframe provided."
            )

    df = df.rename(columns={
        id_col: 'id',
        event_col: 'event',
        start_col: 'start',
        stop_col: 'stop',
    })
    df['event'] = df['event'].astype(bool)
    df = df.set_index(['id'])

    # Validate the covariates against the event indicator before splitting.
    self._check_values(df.drop(["event", "stop", "start"], axis=1), df['event'])

    stop_times_events = df[["event", "stop", "start"]]
    df = df.drop(["event", "stop", "start"], axis=1)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    scaled = normalize(df, self._norm_mean, self._norm_std)
    hazards_ = self._newton_rhaphson(scaled,
                                     stop_times_events,
                                     show_progress=show_progress,
                                     step_size=step_size)

    # Rescale the coefficients back to the units of the raw covariates.
    coef_frame = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef'])
    self.hazards_ = coef_frame / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self._n_examples = df.shape[0]
    return self
def predict_log_partial_hazard(self, X):
    r"""
    Return the log of the partial hazard for the individuals in ``X``.

    "Partial" because the baseline hazard is not included. Equal to
    :math:`\beta X`, computed on ``X`` after subtracting the stored
    ``self._norm_mean``.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
        can be in any order: they are re-ordered to ``self.hazards_.columns``.
        If a numpy array, columns must be in the same order as the
        training data.

    Returns
    -------
    DataFrame
        indexed by ``_get_index(X)``.
    """
    # The docstring must be a raw string: in a normal string the "\b" of
    # "\beta" is interpreted as a backspace character.
    if isinstance(X, pd.DataFrame):
        order = self.hazards_.columns
        X = X[order]

    index = _get_index(X)

    # Centre on the training means only (scale of 1).
    X = normalize(X, self._norm_mean.values, 1)
    return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' events was observed: 1 if observed,
        0 else (censored). The caller's frame is not modified (a copy is used).

    duration_col: string
        the name of the column in DataFrame that contains the subjects'
        lifetimes. Required despite the ``None`` default.

    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence
        diagnostics. Useful if convergence is failing.

    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    weights_col: string
        the column in df that specifies weights per observation.

    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.

    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    -------
    self:
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Raises
    ------
    TypeError
        if ``duration_col`` is None.
    ValueError
        if any weight is not strictly positive, or any duration is not
        strictly positive.

    Examples
    --------
    >>> from lifelines import WeibullAFTFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E')
    >>> aft.print_summary()
    >>> aft.predict_median(df)

    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime(
        "%Y-%m-%d %H:%M:%S") + " UTC"

    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    # Work on a copy: the pops below remove columns from the frame.
    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(
        df.pop(duration_col)).astype(float)
    # With no event column every observation is treated as observed.
    E = (pass_for_numeric_dtypes_or_raise_array(df.pop(
        self.event_col)).astype(bool) if (self.event_col is not None) else
         pd.Series(np.ones(self._n_examples, dtype=bool),
                   index=df.index,
                   name="E"))
    # With no weights column every observation gets a weight of 1.0.
    weights = (pass_for_numeric_dtypes_or_raise_array(
        df.pop(self.weights_col)).astype(float) if
               (self.weights_col is not None) else pd.Series(
                   np.ones(self._n_examples, dtype=float),
                   index=df.index,
                   name="weights"))

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError(
                "values in weight column %s must be positive." %
                self.weights_col)

    # Keep copies of the fitting data on the instance.
    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    # Only covariate columns remain in df at this point.
    df = df.astype(float)
    self._check_values(df, T, E, self.event_col)

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to fix not divide by zero.
    # NOTE(review): with an intercept, other near-constant columns are not
    # guarded here; presumably _check_values handles them - confirm.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    # One (parameter name, column) entry per fitted parameter and covariate.
    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns]
             for name in self._fitted_parameter_names], []))

    # NOTE(review): the docstring example uses WeibullAFTFitter, but this
    # body relies on self.n_breakpoints - confirm which fitter owns it.
    self._norm_std = pd.Series(np.concatenate([_norm_std.values] *
                                              self.n_breakpoints),
                               index=_index)

    # The covariates are only scaled (mean argument 0), not centred.
    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    # Put the coefficients back on the scale of the raw covariates.
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(
        T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    # Cumulative hazard of the training rows at the 75th percentile of T.
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(
        df, times=[np.percentile(T, 75)]).T

    return self
def fit(self, df, duration_col, event_col=None, weights_col=None, show_progress=False):
    """
    Fit the Aalen Additive model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas dataframe with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' events was observed: 1 if observed,
        0 else (censored).

    duration_col: string
        the name of the column in dataframe that contains the subjects'
        lifetimes.

    event_col: string, optional
        the name of the column in dataframe that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    weights_col: string, optional
        an optional column in the dataframe, df, that denotes the weight per subject.
        This column is expelled and not used as a covariate, but as a weight in the
        final regression. Default weight is 1.
        This can be used for case-weights. For example, a weight of 2 means there were two subjects with
        identical observations.
        This can be used for sampling weights.

    show_progress: boolean, optional (default=False)
        Since the fitter is iterative, show iteration number.

    Returns
    -------
    self: AalenAdditiveFitter
        self with additional new properties: ``cumulative_hazards_``, etc.

    Examples
    --------
    >>> from lifelines import AalenAdditiveFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aaf = AalenAdditiveFitter()
    >>> aaf.fit(df, 'T', 'E')
    >>> aaf.predict_median(df)
    >>> aaf.print_summary()

    """
    fit_started_at = datetime.utcnow()
    self._time_fit_was_called = fit_started_at.strftime("%Y-%m-%d %H:%M:%S") + " UTC"

    df = df.copy()

    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col

    self._n_examples = df.shape[0]

    X, T, E, weights = self._preprocess_dataframe(df)

    # Keep copies of the fitting data on the instance.
    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    column_std = X.std(0)
    self._norm_std = column_std

    # Avoid dividing by zero: with a fitted intercept only the "baseline"
    # column is reset, otherwise any near-constant column is.
    if self.fit_intercept:
        column_std["baseline"] = 1.0
    else:
        column_std[column_std < 1e-8] = 1.0

    # The covariates are scaled but not centred (mean argument is 0).
    scaled_X = normalize(X, 0, self._norm_std)
    fit_results = self._fit_model(scaled_X, T, E, weights, show_progress)
    self.hazards_, self.cumulative_hazards_, self.cumulative_variance_ = fit_results

    # Bring the estimates back to the scale of the raw covariates.
    self.hazards_ /= self._norm_std
    self.cumulative_hazards_ /= self._norm_std
    self.cumulative_variance_ /= self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self._index = self.hazards_.index

    last_row = self.predict_cumulative_hazard(X).iloc[-1]
    self._predicted_hazards_ = last_row.values.ravel()
    return self
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' events was observed: 1 if observed,
        0 else (censored). The caller's frame is not modified (a copy is used).

    duration_col: string
        the name of the column in DataFrame that contains the subjects'
        lifetimes. Required despite the ``None`` default.

    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence
        diagnostics. Useful if convergence is failing.

    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    weights_col: string
        the column in df that specifies weights per observation.

    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.

    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    -------
    self:
        self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Raises
    ------
    TypeError
        if ``duration_col`` is None.
    ValueError
        if any weight is not strictly positive, or any duration is not
        strictly positive.

    Examples
    --------
    >>> N, d = 80000, 2
    >>> # some numbers take from http://statwonk.com/parametric-survival.html
    >>> breakpoints = (1, 31, 34, 62, 65)
    >>> betas = np.array(
    >>>     [
    >>>         [1.0, -0.2, np.log(15)],
    >>>         [5.0, -0.4, np.log(333)],
    >>>         [9.0, -0.6, np.log(18)],
    >>>         [5.0, -0.8, np.log(500)],
    >>>         [2.0, -1.0, np.log(20)],
    >>>         [1.0, -1.2, np.log(500)],
    >>>     ]
    >>> )
    >>> X = 0.1 * np.random.exponential(size=(N, d))
    >>> X = np.c_[X, np.ones(N)]
    >>> T = np.empty(N)
    >>> for i in range(N):
    >>>     lambdas = np.exp(-betas.dot(X[i, :]))
    >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
    >>> T_censor = np.minimum(
    >>>     T.mean() * np.random.exponential(size=N), 110
    >>> )  # 110 is the end of observation, eg. current time.
    >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
    >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
    >>> df["E"] = T <= T_censor
    >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
    >>> pew.print_summary()
    >>> pew.plot()

    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"

    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    # Work on a copy: the pops below remove columns from the frame.
    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    # With no event column every observation is treated as observed.
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    # With no weights column every observation gets a weight of 1.0.
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    # Only covariate columns remain in df at this point.
    df = df.astype(float)
    # E is validated before being cast to bool.
    self._check_values(df, T, E, self.event_col)

    E = E.astype(bool)
    # Keep copies of the fitting data on the instance.
    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to fix not divide by zero.
    # NOTE(review): with an intercept, other near-constant columns are not
    # guarded here; presumably _check_values handles them - confirm.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    # One (parameter name, column) entry per fitted parameter and covariate.
    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
    )

    # The per-column scales are repeated once per breakpoint so they line
    # up with the flattened parameter vector.
    self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

    # The covariates are only scaled (mean argument 0), not centred.
    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    # Put the coefficients back on the scale of the raw covariates.
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    # Cumulative hazard of the training rows at the 75th percentile of T.
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

    return self
def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None, include_likelihood=False, strata=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas dataframe with necessary columns `duration_col` and
        `event_col`, plus other covariates. `duration_col` refers to
        the lifetimes of the subjects. `event_col` refers to whether
        the 'death' events was observed: 1 if observed, 0 else (censored).
        The caller's frame is not modified (a copy is used).
    duration_col: string
        the column in dataframe that contains the subjects' lifetimes.
    event_col: string, optional
        the column in dataframe that contains the subjects' death
        observation. If left as None, assume all individuals are non-censored.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    initial_beta: optional
        initialize the starting point of the iterative algorithm.
        Default is the zero vector.
    include_likelihood: boolean, optional (default=False)
        saves the final log-likelihood to the CoxPHFitter under the
        property _log_likelihood.
    strata: list, optional
        specify a list of columns to use in stratification. This is useful if a
        catagorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.

    Returns
    -------
    self
        with additional properties: ``hazards_``, ``confidence_intervals_``,
        ``baseline_hazard_``, ``baseline_cumulative_hazard_`` and
        ``baseline_survival_``.
    """
    # Sorted copy of the input, ordered by duration.
    df = df.copy()
    df = df.sort_values(by=duration_col)

    # Strata columns move into the index so they are not fitted as covariates.
    self.strata = strata
    if strata is not None:
        df = df.set_index(strata)

    # Pull the duration (and event, when given) out of the covariates.
    T = df.pop(duration_col)
    if event_col is not None:
        E = df.pop(event_col)
    else:
        E = pd.Series(np.ones(df.shape[0]), index=df.index)

    # Keep the covariates as they were before any normalization.
    if self.strata is None:
        self.data = df
    else:
        self.data = df.reset_index()

    self._check_values(df)

    if self.normalize:
        # Remember the scaling so later inputs can be normalized the same way.
        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)
        df = normalize(df)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(
        df,
        T,
        E,
        initial_beta=initial_beta,
        show_progress=show_progress,
        include_likelihood=include_likelihood,
    )

    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef'])
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.durations = T
    self.event_observed = E

    baseline_hazard = self._compute_baseline_hazards(df, T, E)
    self.baseline_hazard_ = baseline_hazard
    self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
    self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
    return self
def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights_col=None, show_progress=False, step_size=None, robust=False):
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied
    survival times are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas dataframe containing ``id_col``, ``event_col``,
        ``start_col`` and ``stop_col``, plus other covariates. The caller's
        frame is not modified (a copy is used).
    id_col: string
        A subject could have multiple rows in the dataframe. This column
        contains the unique identifier per subject.
    event_col: string
        the column in dataframe that contains the subjects' death
        observation: 1 if observed, 0 else (censored). Cast to bool.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each
        subject-period row. When None, every row gets a weight of 1.0.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    step_size: optional
        set an initial step size for the fitting algorithm.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator, aka
        Wei-Lin estimate. This does not handle ties, so if there are high
        number of ties, results may significantly differ. See "The Robust
        Inference for the Cox Proportional Hazards Model", Journal of the
        American Statistical Association, Vol. 84, No. 408 (Dec., 1989),
        pp. 1074-1078. Not implemented yet: passing a truthy value raises.

    Returns
    -------
    self
        with additional properties, among them ``hazards_``,
        ``variance_matrix_``, ``standard_errors_`` and
        ``confidence_intervals_``.

    Raises
    ------
    NotImplementedError
        if ``robust`` is truthy.
    KeyError
        if any of ``id_col``, ``event_col``, ``start_col``, ``stop_col`` is
        missing from ``df``.
    ValueError
        if ``weights_col`` is given and contains a non-positive value.
    """
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    fit_called_at = datetime.utcnow()
    self._time_fit_was_called = fit_called_at.strftime("%Y-%m-%d %H:%M:%S")

    df = df.copy()

    mandatory_columns = (id_col, event_col, start_col, stop_col)
    if any(name not in df for name in mandatory_columns):
        raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

    if weights_col is None:
        # No weights supplied: add an internal column of unit weights.
        assert '__weights' not in df.columns, '__weights is an internal lifelines column, please rename your column first.'
        df['__weights'] = 1.0
    elif (df[weights_col] <= 0).any():
        raise ValueError("values in weights_col must be positive.")

    internal_names = {id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'}
    df = df.rename(columns=internal_names)
    df = df.set_index('id')

    # Split bookkeeping columns and weights from the covariates.
    stop_times_events = df[["event", "stop", "start"]].copy()
    weights = df[['__weights']].copy().astype(float)
    df = df.drop(["event", "stop", "start", "__weights"], axis=1)
    stop_times_events['event'] = stop_times_events['event'].astype(bool)

    self._check_values(df, stop_times_events)
    df = df.astype(float)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        stop_times_events,
        weights,
        show_progress=show_progress,
        step_size=step_size,
    )

    # Rescale the estimates from standardized to raw covariate units.
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    std_outer_product = np.outer(self._norm_std, self._norm_std)
    self.variance_matrix_ = -inv(self._hessian_) / std_outer_product
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std),
        stop_times_events,
        weights,
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events, weights)
    self.baseline_survival_ = self._compute_baseline_survival()

    self.event_observed = stop_times_events['event']
    self.start_stop_and_events = stop_times_events

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def test_normalize():
    """Normalized larynx data has zero mean and unit std in every column."""
    df = load_larynx()
    n_columns = df.shape[1]

    normalized_means = utils.normalize(df).mean(0).values
    npt.assert_almost_equal(normalized_means, np.zeros(n_columns))

    normalized_stds = utils.normalize(df).std(0).values
    npt.assert_almost_equal(normalized_stds, np.ones(n_columns))
def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None, strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas dataframe with necessary columns `duration_col` and
        `event_col`, plus other covariates. `duration_col` refers to
        the lifetimes of the subjects. `event_col` refers to whether
        the 'death' events was observed: 1 if observed, 0 else (censored).
        The caller's frame is not modified (a copy is used).
    duration_col: string
        the column in dataframe that contains the subjects' lifetimes.
    event_col: string, optional
        the column in dataframe that contains the subjects' death
        observation. If left as None, assume all individuals are non-censored.
    weights_col: string, optional
        an optional column in the dataframe that denotes the weight per subject.
        This column is expelled and not used as a covariate, but as a weight in the
        final regression. Default weight is 1.
    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics.
    initial_beta: optional
        initialize the starting point of the iterative algorithm.
        Default is the zero vector.
    strata: list, optional
        specify a list of columns to use in stratification. This is useful if a
        catagorical covariate does not obey the proportional hazard assumption. This
        is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
        Combined with any existing ``self.strata`` through ``coalesce``.
    step_size: optional
        forwarded to ``self._newton_rhaphson``.

    Returns
    -------
    self
        with additional properties, among them ``hazards_``,
        ``confidence_intervals_``, ``baseline_hazard_``,
        ``baseline_cumulative_hazard_``, ``baseline_survival_`` and ``score_``.
    """
    # Sorted copy of the input, ordered by duration.
    df = df.copy().sort_values(by=duration_col)

    self._n_examples = df.shape[0]
    self.strata = coalesce(strata, self.strata)
    stratified = self.strata is not None
    if stratified:
        # Remember the row labels; the strata columns replace the index.
        original_index = df.index.copy()
        df = df.set_index(self.strata)

    # Pull duration, event and weights out of the covariates.
    T = df.pop(duration_col)

    if event_col is not None:
        E = df.pop(event_col)
    else:
        E = pd.Series(np.ones(df.shape[0]), index=df.index)

    if weights_col:
        weights = df.pop(weights_col).values
    else:
        weights = np.ones(self._n_examples)

    self._check_values(df, E)
    df = df.astype(float)

    # Save the fitting data for later, labelled with the original index
    # when the frame was re-indexed by strata.
    durations = T.copy()
    observed = E.copy()
    if stratified:
        durations.index = original_index
        observed.index = original_index
    self.durations = durations
    self.event_observed = observed.astype(bool)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        T,
        E,
        weights=weights,
        initial_beta=initial_beta,
        show_progress=show_progress,
        step_size=step_size,
    )

    # Rescale the coefficients from standardized to raw covariate units.
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()

    # The negated partial hazard is used as the predicted score.
    predicted_scores = -self.predict_partial_hazard(df).values.ravel()
    self.score_ = concordance_index(self.durations, predicted_scores, self.event_observed)

    # Log partial hazard evaluated at the training means.
    mean_row = self._norm_mean.to_frame().T
    self._train_log_partial_hazard = self.predict_log_partial_hazard(mean_row)
    return self
def fit(self, df, duration_col, event_col=None, show_progress=False,
        initial_beta=None, strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset.

    Tied survival times are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame holding `duration_col`, optionally `event_col`
        and `weights_col`, and the covariates. A sorted copy is used
        internally, so the caller's frame is left alone.
    duration_col: string
        the column in `df` that contains the subjects' lifetimes.
    event_col: string, optional
        the column in `df` that contains the death observation: 1 if
        observed, 0 else (censored). If None, all subjects are treated as
        observed.
    show_progress: boolean, optional
        forwarded to the iterative solver to show convergence diagnostics.
    initial_beta: optional
        starting point forwarded to the iterative solver.
    strata: optional
        columns to stratify on; falls back to ``self.strata`` when None.
    step_size: optional
        forwarded to the iterative solver.
    weights_col: string, optional
        column holding per-subject weights. It is removed from the
        covariates. A ``RuntimeWarning`` is issued when any weight is not
        integer-valued. When not given, every subject has weight 1.

    Returns
    -------
    self
        with additional properties such as ``hazards_``,
        ``confidence_intervals_``, ``baseline_hazard_``,
        ``baseline_cumulative_hazard_``, ``baseline_survival_`` and
        ``score_``.
    """
    # Private, time-ordered working copy.
    data = df.copy()
    data = data.sort_values(by=duration_col)

    self._n_examples = data.shape[0]
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # Kept so stored durations/events can be re-labelled below.
        unstratified_index = data.index.copy()
        data = data.set_index(self.strata)

    # Peel the non-covariate columns off the frame.
    durations = data.pop(duration_col)
    if event_col is not None:
        events = data.pop(event_col)
    else:
        events = pd.Series(np.ones(data.shape[0]), index=data.index)

    if not weights_col:
        # Unit weights, held as a single-column frame aligned with the data.
        weights = pd.DataFrame(np.ones((self._n_examples, 1)), index=data.index)
    else:
        weights = data.pop(weights_col)
        has_fractional_weights = (weights.astype(int) != weights).any()
        if has_fractional_weights:
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """,
                RuntimeWarning)

    self._check_values(data, durations, events)
    data = data.astype(float)

    # Keep the fitting data around for later use.
    self.durations = durations.copy()
    self.event_observed = events.copy()
    if self.strata is not None:
        self.durations.index = unstratified_index
        self.event_observed.index = unstratified_index
    self.event_observed = self.event_observed.astype(bool)

    self._norm_mean = data.mean(0)
    self._norm_std = data.std(0)

    events = events.astype(bool)
    standardized = normalize(data, self._norm_mean, self._norm_std)
    beta = self._newton_rhaphson(
        standardized,
        durations,
        events,
        weights=weights,
        initial_beta=initial_beta,
        show_progress=show_progress,
        step_size=step_size,
    )

    # Rescale coefficients from standardized units to covariate units.
    coefficients = pd.DataFrame(beta.T, columns=data.columns, index=['coef'])
    self.hazards_ = coefficients / self._norm_std

    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_hazard_ = self._compute_baseline_hazards(data, durations, events)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()

    negated_partial_hazards = -self.predict_partial_hazard(data).values.ravel()
    self.score_ = concordance_index(
        self.durations, negated_partial_hazards, self.event_observed)

    mean_row = self._norm_mean.to_frame().T
    self._train_log_partial_hazard = self.predict_log_partial_hazard(mean_row)
    return self
def fit(self, df, duration_col=None, event_col=None, ancillary_df=None,
        show_progress=False, timeline=None):
    """
    Fit the Weibull accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below) and the covariate columns.
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed,
        0 else (censored). The caller's frame is not modified.

    duration_col: string
        the name of the column in `df` that contains the subjects'
        lifetimes. Required, despite the ``None`` default.

    event_col: string, optional
        the name of the column in `df` that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    ancillary_df: None, boolean, or DataFrame, optional (default=None)
        Choose to model the ancillary parameters.
        If None or False, explicitly do not fit the ancillary parameters
        using any covariates.
        If True, model the ancillary parameters with the same covariates as
        ``df``.
        If DataFrame, provide covariates to model the ancillary parameters.
        Must be the same row count as ``df``.

    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence diagnostics. Useful
        if convergence is failing.

    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    Returns
    -------
    self: WeibullAFTFitter
        self with additional new properties: ``print_summary``,
        ``params_``, ``confidence_intervals_`` and more

    Raises
    ------
    TypeError
        if `duration_col` is None.
    ValueError
        if any duration is non-positive.

    Examples
    --------
    >>> from lifelines import WeibullAFTFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E')
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E', ancillary_df=df)
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    if self.event_col is not None:
        E = pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
    else:
        # Everyone is observed. Built as bool so both branches hand the same
        # dtype to the likelihood and to ``event_observed``.
        E = pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")

    self.durations = T.copy()
    self.event_observed = E.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    self._check_values(df, T, E, self.event_col)

    if isinstance(ancillary_df, pd.DataFrame):
        assert ancillary_df.shape[0] == df.shape[0], "ancillary_df must be the same shape[0] as df"
        # The duration/event columns are not covariates; drop them if present.
        ancillary_df = ancillary_df.copy().drop([duration_col, event_col], axis=1, errors="ignore")
        self._check_values(ancillary_df, T, E, self.event_col)
    elif (ancillary_df is None) or (ancillary_df is False):
        # Intercept-only design for the ancillary parameter.
        ancillary_df = pd.DataFrame(np.ones((df.shape[0],)), index=df.index, columns=["_intercept"])
    elif ancillary_df is True:
        ancillary_df = df.copy()

    if self.fit_intercept:
        assert "_intercept" not in df
        ancillary_df["_intercept"] = 1.0
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns), len(ancillary_df.columns))

    _norm_std, _norm_std_ancillary = df.std(0), ancillary_df.std(0)
    self._norm_mean, self._norm_mean_ancillary = df.mean(0), ancillary_df.mean(0)

    # Never scale by a (near-)zero standard deviation: constant columns are
    # left unscaled. This used to be skipped when ``fit_intercept`` was set,
    # so a constant covariate led to a division by zero in ``normalize``.
    _norm_std[_norm_std < 1e-8] = 1.0
    _norm_std_ancillary[_norm_std_ancillary < 1e-8] = 1.0
    if self.fit_intercept:
        # Explicit for the added intercept columns (also covers a NaN std).
        _norm_std["_intercept"] = 1.0
        _norm_std_ancillary["_intercept"] = 1.0

    _index = pd.MultiIndex.from_tuples(
        [("lambda_", c) for c in df.columns] + [("rho_", c) for c in ancillary_df.columns]
    )
    self._norm_std = pd.Series(np.append(_norm_std, _norm_std_ancillary), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        normalize(df, 0, _norm_std).values,
        normalize(ancillary_df, 0, _norm_std_ancillary).values,
        show_progress=show_progress,
    )
    # Undo the scaling so the parameters refer to the original covariates.
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors()
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df, ancillary_df)

    return self
def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None, strata=None, step_size=None, weights_col=None, cluster_col=None, robust=False):
    """
    Fit the Cox Proportional Hazard model to a dataset.

    Tied survival times are handled using Efron's tie-method.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col`, plus other covariates. `duration_col` refers to the
        lifetimes of the subjects. `event_col` refers to whether the 'death'
        event was observed: 1 if observed, 0 else (censored). A sorted copy
        is used internally.
    duration_col: string
        the column in `df` that contains the subjects' lifetimes.
    event_col: string, optional
        the column in `df` that contains the subjects' death observation.
        If left as None, assume all individuals are non-censored.
    show_progress: boolean, optional
        since the fitter is iterative, show convergence diagnostics.
    initial_beta: optional
        initialize the starting point of the iterative algorithm.
        Default is the zero vector.
    strata: optional
        specify a list of columns to use in stratification. This is useful
        if a categorical covariate does not obey the proportional hazard
        assumption. This is used similar to the `strata` expression in R.
        See http://courses.washington.edu/b515/l17.pdf.
        Falls back to ``self.strata`` when None.
    step_size: optional
        set an initial step size for the fitting algorithm.
    weights_col: string, optional
        column in `df` that denotes the weight per subject. This column is
        expelled and not used as a covariate, but as a weight in the final
        regression. Default weight is 1.
        This can be used for case-weights. For example, a weight of 2 means
        there were two subjects with identical observations.
        This can be used for sampling weights. In that case, use
        `robust=True` to get more accurate standard errors.
    cluster_col: string, optional
        specifies what column has unique identifiers for clustering
        covariances. The column is removed from the covariates and stored
        on ``self._clusters``. The original documentation states that using
        this forces the sandwich (robust variance) estimator; that logic is
        not in this method.
    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator, aka
        Wei-Lin estimate. This does not handle ties, so if there are high
        number of ties, results may significantly differ. See "The Robust
        Inference for the Cox Proportional Hazards Model", Journal of the
        American Statistical Association, Vol. 84, No. 408 (Dec., 1989),
        pp. 1074-1078. In this method the flag is stored on ``self`` and
        suppresses the non-integer weights warning.

    Returns
    -------
    self
        with additional properties: hazards_, confidence_intervals_,
        baseline_survival_, etc.

    Raises
    ------
    ValueError
        if any value in `weights_col` is not strictly positive.
    """
    df = df.copy()

    # Sort on time
    df = df.sort_values(by=duration_col)

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + ' UTC'
    self.duration_col = duration_col
    self.event_col = event_col
    self.robust = robust
    self.cluster_col = cluster_col
    self.weights_col = weights_col

    self._n_examples = df.shape[0]
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # Keep the pre-stratification index: the stored durations/events are
        # re-labelled with it below, after the strata become the index.
        original_index = df.index.copy()
        df = df.set_index(self.strata)

    # Extract time and event
    T = df[duration_col]
    del df[duration_col]
    if event_col is None:
        # No event column: every subject is treated as observed.
        E = pd.Series(np.ones(df.shape[0]), index=df.index)
    else:
        E = df[event_col]
        del df[event_col]

    if weights_col:
        weights = df.pop(weights_col)
        # Non-integer weights are only flagged when robust errors were not requested.
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                """It appears your weights are not integers, possibly propensity or sampling scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. 
See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """, RuntimeWarning)
        if (weights <= 0).any():
            raise ValueError("values in weights_col must be positive.")
    else:
        # Unit weights aligned with the (possibly stratified) index.
        weights = pd.Series(np.ones((self._n_examples, )), index=df.index)

    if self.cluster_col:
        # Cluster labels are not covariates; pull them out of the frame.
        self._clusters = df.pop(self.cluster_col)

    self._check_values(df, T, E)
    df = df.astype(float)

    # save fitting data for later
    self.durations = T.copy()
    self.event_observed = E.copy()
    if self.strata is not None:
        self.durations.index = original_index
        self.event_observed.index = original_index
    self.event_observed = self.event_observed.astype(bool)

    # Column-wise statistics used to standardize covariates for the solver.
    # NOTE(review): no guard against zero std here — presumably
    # `_check_values` rejects or flags constant columns; confirm.
    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    E = E.astype(bool)

    hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E, weights=weights, initial_beta=initial_beta, show_progress=show_progress, step_size=step_size)

    # The solver ran on standardized covariates; dividing by the std puts the
    # coefficients (and, via the outer product, the variance matrix) back on
    # the scale of the original covariates.
    self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    # `self._hessian_` is not assigned in this method; presumably set by
    # `_newton_rhaphson` — verify.
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E, weights)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()
    self._predicted_partial_hazards_ = self.predict_partial_hazard(df).values

    # Log partial hazard evaluated at the covariate means of the training data.
    self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
    return self
def fit(self, df, duration_col, event_col=None, weights_col=None, show_progress=False):
    """
    Fit the Aalen Additive model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns
        (weights). `duration_col` refers to the lifetimes of the subjects.
        `event_col` refers to whether the 'death' event was observed: 1 if
        observed, 0 else (censored). A copy is made before preprocessing.

    duration_col: string
        the name of the column in DataFrame that contains the subjects'
        lifetimes.

    event_col: string, optional
        the name of the column in DataFrame that contains the subjects'
        death observation. If left as None, assume all individuals are
        uncensored.

    weights_col: string, optional
        an optional column in the DataFrame, df, that denotes the weight
        per subject. This column is expelled and not used as a covariate,
        but as a weight in the final regression. Default weight is 1.
        This can be used for case-weights. For example, a weight of 2 means
        there were two subjects with identical observations.
        This can be used for sampling weights.

    show_progress: boolean, optional (default=False)
        Since the fitter is iterative, show iteration number.

    Returns
    -------
    self: AalenAdditiveFitter
        self with additional new properties: ``cumulative_hazards_``, etc.

    Examples
    --------
    >>> from lifelines import AalenAdditiveFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aaf = AalenAdditiveFitter()
    >>> aaf.fit(df, 'T', 'E')
    >>> aaf.predict_median(df)
    >>> aaf.print_summary()
    """
    timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    self._time_fit_was_called = timestamp + " UTC"

    data = df.copy()

    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = data.shape[0]

    covariates, durations, events, weights = self._preprocess_dataframe(data)

    self.durations = durations.copy()
    self.event_observed = events.copy()
    self.weights = weights.copy()

    self._norm_std = covariates.std(0)
    if not self.fit_intercept:
        # The caller may have supplied their own constant column; leave any
        # (near-)constant column unscaled rather than dividing by ~0.
        self._norm_std[self._norm_std < 1e-8] = 1.0
    else:
        # The added intercept column must not be rescaled.
        self._norm_std["_intercept"] = 1.0

    scaled_covariates = normalize(covariates, 0, self._norm_std)
    fitted = self._fit_model(scaled_covariates, durations, events, weights, show_progress)
    self.hazards_, self.cumulative_hazards_, self.cumulative_variance_ = fitted

    # Map the estimates back from scaled covariates to the original units.
    self.hazards_ /= self._norm_std
    self.cumulative_hazards_ /= self._norm_std
    self.cumulative_variance_ /= self._norm_std

    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._index = self.hazards_.index

    # Last row of the predicted cumulative hazard, flattened to 1-D.
    final_cumulative_hazard = self.predict_cumulative_hazard(covariates).iloc[-1]
    self._predicted_hazards_ = final_cumulative_hazard.values.ravel()
    return self
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied
    survival times are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame with the columns named by `id_col`, `event_col`,
        `start_col` and `stop_col`, plus other covariates. Each row is one
        subject-period. The caller's frame is not modified.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column
        contains the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death
        observation: 1 if observed, 0 else (censored). Required.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each
        subject-period row. Every row gets weight 1 when omitted.
    show_progress: boolean, optional
        since the fitter is iterative, show convergence diagnostics.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    robust: boolean, optional (default: False)
        Reserved for robust (Huber sandwich / Wei-Lin) errors. Not
        implemented: passing a truthy value raises ``NotImplementedError``.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This
        is useful if a categorical covariate does not obey the proportional
        hazard assumption. This is used similar to the `strata` expression
        in R. See http://courses.washington.edu/b515/l17.pdf.
        Falls back to ``self.strata`` when None.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default
        is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``params_`` and
        ``hazard_ratios_``

    Raises
    ------
    NotImplementedError
        if `robust` is truthy.
    KeyError
        if any of the named columns is missing from `df`.
    ValueError
        if any value in `weights_col` is not strictly positive.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    # Suffix the timestamp with its timezone, as the other fitters do.
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"

    df = df.copy()

    # A missing weights column gets the same descriptive error as the others.
    required_cols = [id_col, event_col, start_col, stop_col]
    if weights_col is not None:
        required_cols.append(weights_col)
    if not all(col in df for col in required_cols):
        raise KeyError(
            "A column specified in the call to `fit` does not exist in the DataFrame provided."
        )

    if weights_col is None:
        self.weights_col = None
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    # Canonical internal names. When `weights_col` is None its entry matches
    # no column, and the "__weights" column added above is used instead.
    df = df.rename(
        columns={
            id_col: "id",
            event_col: "event",
            start_col: "start",
            stop_col: "stop",
            weights_col: "__weights",
        }
    )

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    params_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # The solver ran on standardized covariates; rescale the coefficients and
    # the variance matrix to the original covariate units.
    self.params_ = pd.Series(params_, index=df.columns, name="coef") / self._norm_std
    self.hazard_ratios_ = pd.Series(np.exp(self.params_), index=df.columns, name="exp(coef)")
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(
        df, events, start, stop, weights
    )
    self.baseline_survival_ = self._compute_baseline_survival()

    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def _fit(
    self,
    log_likelihood_function,
    df,
    Ts,
    regressors,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
    entry_col=None,
):
    """
    Shared fitting routine: record the call settings on ``self``, split the
    event / weight / entry columns off ``df``, scale the covariates, run
    ``self._fit_model`` and store the estimates.

    Parameters
    ----------
    log_likelihood_function:
        forwarded unchanged to ``self._fit_model``.
    df: DataFrame
        covariates plus the optional event, weights and entry columns.
        Those special columns are popped from this frame in place.
    Ts: sequence
        forwarded to ``self._fit_model``; ``coalesce(Ts[1], Ts[0])`` is what
        gets passed to ``self._check_values``.
    regressors: dict
        maps a parameter name to the covariate columns used for it; it
        defines the (parameter, column) MultiIndex of the estimates.
    event_col: string, optional
        event indicator column. All subjects are observed when None.
    show_progress: boolean, optional
        forwarded to ``self._fit_model``.
    timeline: optional
        stored on ``self.timeline``.
    weights_col: string, optional
        weights column. All weights are 1 when None.
    robust: boolean, optional
        stored on ``self.robust``.
    initial_point: optional
        forwarded to ``self._fit_model``.
    entry_col: string, optional
        entry-time column. All entries are 0 when None.

    Returns
    -------
    None
        results are stored on ``self`` (``params_``, ``variance_matrix_``,
        ``standard_errors_``, ``confidence_intervals_``, ...).
    """
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.weights_col = weights_col
    self.entry_col = entry_col
    self.event_col = event_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust
    self.regressors = regressors  # TODO name

    # Split off the special columns, substituting defaults when not named.
    if self.event_col is not None:
        events = pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
    else:
        events = pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")

    if self.weights_col is not None:
        weights = pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
    else:
        weights = pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")

    if entry_col is not None:
        entries = pass_for_numeric_dtypes_or_raise_array(df.pop(entry_col)).astype(float)
    else:
        entries = pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry")

    # Validate before the bool cast.
    check_nans_or_infs(events)
    events = events.astype(bool)

    self.event_observed = events.copy()
    self.entry = entries.copy()
    self.weights = weights.copy()

    df = df.astype(float)
    self._check_values(df, coalesce(Ts[1], Ts[0]), events, weights, entries)
    check_for_numeric_dtypes_or_raise(df)
    check_nans_or_infs(df)

    # Scale only; (near-)constant columns are left untouched.
    column_std = df.std(0)
    column_std[column_std < 1e-8] = 1.0
    scaled = normalize(df, 0, column_std)

    Xs = self._create_Xs_dict(scaled)
    self._LOOKUP_SLICE = self._create_slicer(Xs)

    # One (parameter, covariate) pair per estimated coefficient.
    pairs = [(name, col) for name, columns in regressors.items() for col in columns]
    param_index = pd.MultiIndex.from_tuples(pairs)
    self._norm_std = pd.Series(
        [column_std.loc[variable_name] for _, variable_name in param_index],
        index=param_index,
    )

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        log_likelihood_function,
        Ts,
        Xs,
        events.values,
        weights.values,
        entries.values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    # Undo the scaling so parameters refer to the original covariates.
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(
        Ts, events.values, weights.values, entries.values, Xs)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_median = self.predict_median(df)
def fit(self, df, event_col, start_col="start", stop_col="stop", weights_col=None, id_col=None, show_progress=False, robust=False, strata=None, initial_point=None, val_df=None):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Nonlinear Hazard model to a time varying dataset. Tied
    survival times are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame with the columns named by `event_col`,
        `start_col` and `stop_col`, plus other covariates. It is handed to
        ``self.preprocess_df`` together with the column names.
    event_col: string
        the column in DataFrame that contains the subjects' death
        observation.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each
        subject-period row.
    id_col: string, optional
        A subject could have multiple rows in the DataFrame. This column
        contains the unique identifier per subject. If not provided, it's
        up to the user to make sure that there are no violations.
    show_progress: boolean, optional
        forwarded to ``self._neural_cox``.
    robust: bool, optional (default: False)
        Reserved for robust (Huber sandwich / Wei-Lin) errors. Not
        implemented: a truthy value raises ``NotImplementedError``.
    strata: list or string, optional
        specify a column or list of columns to use in stratification.
        Falls back to ``self.strata`` when None.
    initial_point: (d,) numpy array, optional
        accepted for signature compatibility; not referenced in this
        method body.
    val_df: DataFrame
        second dataset, preprocessed with the same column names as `df`,
        normalized with the mean/std of `df` and passed to
        ``self._neural_cox``. Presumably a validation set — confirm against
        ``_neural_cox``.
        NOTE(review): the default ``None`` is passed straight to
        ``preprocess_df`` and ``normalize``; it looks like a frame is
        effectively required — confirm.

    Notes
    -----
    The training step size comes from ``self.learning_rate``; there is no
    ``step_size`` argument on this method.

    Returns
    --------
    self: CoxNonLinearTimeVaryingFitter
        self, with additional properties like ``net``, ``beta_params_``,
        ``baseline_cumulative_hazard_`` and ``baseline_survival_``

    Raises
    ------
    NotImplementedError
        if `robust` is truthy.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"

    # Both frames go through the same preprocessing with the same column names.
    df, events, start, stop, weights = self.preprocess_df(df, event_col, start_col, stop_col, weights_col, id_col)
    val_df, val_events, val_start, val_stop, val_weights = \
        self.preprocess_df(val_df, event_col, start_col, stop_col, weights_col, id_col)

    # Normalization statistics come from the training frame only; they are
    # reused for `val_df` below.
    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)
    self._norm_std[self._norm_std == 0] = 1.0  # Avoid div by zero.

    # Network architecture
    in_features = df.values.shape[-1]  # one input per covariate column
    out_features = 1
    self.type_pt = torch.float
    self.net = Net(in_features, self.num_units, out_features, self.num_layers, self.p_dropout, self.type_pt)
    # `_neural_cox` receives the network and its return value replaces `self.net`.
    self.net = self._neural_cox(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        normalize(val_df, self._norm_mean, self._norm_std),
        val_events,
        val_start,
        val_stop,
        val_weights,
        net=self.net,
        show_progress=show_progress,
        training_epochs=self.num_epochs,
        batch_size=self.batch_size,
        step_size=self.learning_rate,
    )

    # First parameter tensor of the network's `beta` sub-module, flattened.
    self.beta_params_ = pd.Series(list(self.net.beta.parameters())[0].detach().numpy().ravel(), name="coef")

    # Baseline quantities are computed from the un-normalized training frame.
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()

    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self
def fit(
    self,
    df,
    id_col,
    event_col,
    start_col="start",
    stop_col="stop",
    weights_col=None,
    show_progress=False,
    step_size=None,
    robust=False,
    strata=None,
    initial_point=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the Cox Proportional Hazard model to a time varying dataset. Tied
    survival times are handled using Efron's tie-method.

    Parameters
    -----------
    df: DataFrame
        a Pandas DataFrame with the columns named by `id_col`, `event_col`,
        `start_col` and `stop_col`, plus other covariates. Each row is one
        subject-period. The caller's frame is not modified.
    id_col: string
        A subject could have multiple rows in the DataFrame. This column
        contains the unique identifier per subject.
    event_col: string
        the column in DataFrame that contains the subjects' death
        observation: 1 if observed, 0 else (censored). Required.
    start_col: string
        the column that contains the start of a subject's time period.
    stop_col: string
        the column that contains the end of a subject's time period.
    weights_col: string, optional
        the column that contains (possibly time-varying) weight of each
        subject-period row. Every row gets weight 1 when omitted.
    show_progress: boolean, optional
        since the fitter is iterative, show convergence diagnostics.
    step_size: float, optional
        set an initial step size for the fitting algorithm.
    robust: boolean, optional (default: False)
        Reserved for robust (Huber sandwich / Wei-Lin) errors. Not
        implemented: passing a truthy value raises ``NotImplementedError``.
    strata: list or string, optional
        specify a column or list of columns to use in stratification. This
        is useful if a categorical covariate does not obey the proportional
        hazard assumption. This is used similar to the `strata` expression
        in R. See http://courses.washington.edu/b515/l17.pdf.
        Falls back to ``self.strata`` when None.
    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative algorithm. Default
        is the zero vector.

    Returns
    --------
    self: CoxTimeVaryingFitter
        self, with additional properties like ``hazards_`` and
        ``confidence_intervals_``

    Raises
    ------
    NotImplementedError
        if `robust` is truthy.
    KeyError
        if any of the named columns is missing from `df`.
    ValueError
        if any value in `weights_col` is not strictly positive.
    """
    self.strata = coalesce(strata, self.strata)
    self.robust = robust
    if self.robust:
        raise NotImplementedError("Not available yet.")

    self.event_col = event_col
    self.id_col = id_col
    self.stop_col = stop_col
    self.start_col = start_col
    # Suffix the timestamp with its timezone, as the other fitters do.
    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"

    df = df.copy()

    # A missing weights column gets the same descriptive error as the others.
    required_cols = [id_col, event_col, start_col, stop_col]
    if weights_col is not None:
        required_cols.append(weights_col)
    if not all(col in df for col in required_cols):
        raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

    if weights_col is None:
        self.weights_col = None
        assert (
            "__weights" not in df.columns
        ), "__weights is an internal lifelines column, please rename your column first."
        df["__weights"] = 1.0
    else:
        self.weights_col = weights_col
        if (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

    # Canonical internal names. When `weights_col` is None its entry matches
    # no column, and the "__weights" column added above is used instead.
    df = df.rename(
        columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
    )

    if self.strata is None:
        df = df.set_index("id")
    else:
        df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
        df = df.sort_index()

    events, start, stop = (
        pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
        df.pop("start"),
        df.pop("stop"),
    )
    weights = df.pop("__weights").astype(float)

    df = df.astype(float)
    self._check_values(df, events, start, stop)

    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)

    hazards_ = self._newton_rhaphson(
        normalize(df, self._norm_mean, self._norm_std),
        events,
        start,
        stop,
        weights,
        initial_point=initial_point,
        show_progress=show_progress,
        step_size=step_size,
    )

    # The solver ran on standardized covariates; rescale the coefficients and
    # the variance matrix to the original covariate units.
    self.hazards_ = pd.Series(hazards_, index=df.columns, name="coef") / self._norm_std
    self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
    self.standard_errors_ = self._compute_standard_errors(
        normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
    )
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
    self.baseline_survival_ = self._compute_baseline_survival()

    self.event_observed = events
    self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
    self.weights = weights

    self._n_examples = df.shape[0]
    self._n_unique = df.index.unique().shape[0]
    return self