コード例 #1
0
    def predict_partial_hazard(self, X):
        r"""
        Return the partial hazard :math:`\exp(X \beta)` for each row of X.

        "Partial" because the baseline hazard is not included.

        X: a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the column order of ``self.hazards_``, so its
            columns can be in any order. A numpy array is used as-is, so
            its columns must be in the same order as the training data.

        When ``self.normalize`` is truthy, X is passed through ``normalize``
        with the stored ``_norm_mean`` and ``_norm_std`` values before the
        coefficients are applied.

        Returns a DataFrame indexed by ``_get_index(X)``.
        """
        # Row labels are taken from the input as it was handed in.
        row_labels = _get_index(X)

        covariates = X[self.hazards_.columns] if isinstance(X, pd.DataFrame) else X

        if self.normalize:
            # Relies on the column order and count matching the stored statistics.
            covariates = normalize(covariates, self._norm_mean.values, self._norm_std.values)

        linear_predictor = np.dot(covariates, self.hazards_.T)
        return pd.DataFrame(exp(linear_predictor), index=row_labels)
コード例 #2
0
    def predict_log_partial_hazard(self, X):
        r"""
        Return the network's log partial hazard for each row of X.

        "Partial" because the baseline hazard is not included. X is cast to
        float, passed through ``normalize`` with ``self._norm_mean`` and a
        unit scale, converted to a tensor of dtype ``self.type_pt`` and fed
        to ``self.net``.

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. A DataFrame is
            checked for numeric dtypes first.
            NOTE(review): no column re-ordering is performed here, so a
            DataFrame's columns must already be in training order.

        Returns
        -------
        Series
            the flattened network output, with a default integer index.
        """
        if isinstance(X, pd.DataFrame):
            check_for_numeric_dtypes_or_raise(X)

        X = X.astype(float)
        X = normalize(X, self._norm_mean.values, 1)
        # torch.tensor cannot infer a shape from a DataFrame, so hand it the
        # underlying ndarray; plain arrays have no ``.values`` and pass through.
        X_pt = torch.tensor(getattr(X, "values", X), dtype=self.type_pt)
        return pd.Series(self.net(X_pt).detach().numpy().ravel())
コード例 #3
0
ファイル: coxph_fitter.py プロジェクト: rosejn/lifelines
    def predict_partial_hazard(self, X):
        r"""
        Return the partial hazard :math:`\exp(X \beta)` for each row of X.

        "Partial" because the baseline hazard is not included.

        X: a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the column order of ``self.hazards_``, so its
            columns can be in any order. A numpy array is used as-is, so
            its columns must be in the same order as the training data.

        When ``self.normalize`` is truthy, X is passed through ``normalize``
        with the stored ``_norm_mean`` and ``_norm_std`` values before the
        coefficients are applied.

        Returns a DataFrame indexed by ``_get_index(X)``.
        """
        # Row labels are taken from the input as it was handed in.
        row_labels = _get_index(X)

        covariates = X[self.hazards_.columns] if isinstance(X, pd.DataFrame) else X

        if self.normalize:
            # Relies on the column order and count matching the stored statistics.
            covariates = normalize(covariates, self._norm_mean.values, self._norm_std.values)

        linear_predictor = np.dot(covariates, self.hazards_.T)
        return pd.DataFrame(exp(linear_predictor), index=row_labels)
コード例 #4
0
    def predict_log_partial_hazard(self, X):
        r"""
        Return the log partial hazard for each row of X.

        This is equivalent to R's linear.predictors. "Partial" because the
        baseline hazard is not included. Equal to :math:`(x - \bar{x})'\beta`.

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the order of ``self.params_.index`` and checked
            for numeric dtypes, so its columns can be in any order. A numpy
            array must have its columns in the same order as the training
            data.

        Returns
        -------
        DataFrame
            indexed by ``_get_index`` of the (re-ordered, float) input.
        """
        covariates = X
        if isinstance(covariates, pd.DataFrame):
            covariates = covariates[self.params_.index]
            check_for_numeric_dtypes_or_raise(covariates)

        covariates = covariates.astype(float)
        row_labels = _get_index(covariates)
        # Scale argument is 1: only the stored training means are applied.
        centered = normalize(covariates, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.params_), index=row_labels)
コード例 #5
0
    def predict_log_partial_hazard(self, X):
        r"""
        Return the log partial hazard for each row of X.

        This is equivalent to R's linear.predictors. "Partial" because the
        baseline hazard is not included. Equal to :math:`(x - \bar{x})'\beta`.

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the order of ``self.hazards_.index`` and checked
            for numeric dtypes, so its columns can be in any order. A numpy
            array must have its columns in the same order as the training
            data.

        Returns
        -------
        DataFrame
            indexed by ``_get_index`` of the (re-ordered, float) input.
        """
        covariates = X
        if isinstance(covariates, pd.DataFrame):
            covariates = covariates[self.hazards_.index]
            check_for_numeric_dtypes_or_raise(covariates)

        covariates = covariates.astype(float)
        row_labels = _get_index(covariates)
        # Scale argument is 1: only the stored training means are applied.
        centered = normalize(covariates, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.hazards_), index=row_labels)
コード例 #6
0
    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
        """
        Fit the model to a dataset with start/stop rows per subject.

        Parameters:
          df: a Pandas dataframe containing the columns named by `id_col`,
             `event_col`, `start_col` and `stop_col`; every remaining
             column is treated as a covariate.
          id_col: the column holding the subject identifier; it becomes
             the index of the working frame.
          event_col: the column holding the event indicator (cast to bool).
          start_col: the column that contains the start of a time period.
          stop_col: the column that contains the end of a time period.
          show_progress: forwarded to ``self._newton_rhaphson``.
          step_size: forwarded to ``self._newton_rhaphson``.

        Returns:
            self, with additional properties: hazards_,
            confidence_intervals_

        Raises:
            KeyError: if any of the four named columns is missing from df.
        """
        df = df.copy()

        required_columns = (id_col, event_col, start_col, stop_col)
        if not all(column in df for column in required_columns):
            raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

        canonical_names = {id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'}
        df = df.rename(columns=canonical_names)
        df['event'] = df['event'].astype(bool)
        df = df.set_index(['id'])

        bookkeeping = ["event", "stop", "start"]
        # The validator receives its own covariate-only frame.
        self._check_values(df.drop(bookkeeping, axis=1), df['event'])

        stop_times_events = df[bookkeeping]
        covariates = df.drop(bookkeeping, axis=1)

        self._norm_mean = covariates.mean(0)
        self._norm_std = covariates.std(0)

        standardized = normalize(covariates, self._norm_mean, self._norm_std)
        fitted = self._newton_rhaphson(
            standardized,
            stop_times_events,
            show_progress=show_progress,
            step_size=step_size,
        )

        # Coefficients were fitted on standardized covariates; divide by the
        # per-column std to express them on the original scale.
        coefficients = pd.DataFrame(fitted.T, columns=covariates.columns, index=['coef'])
        self.hazards_ = coefficients / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._n_examples = covariates.shape[0]

        return self
コード例 #7
0
ファイル: test_utils.py プロジェクト: DGaffney/lifelines
def test_unnormalize():
    """Check that ``utils.unnormalize`` inverts ``utils.normalize`` on the larynx data."""
    original = load_larynx()
    column_means = original.mean(0)
    column_stds = original.std(0)

    standardized = utils.normalize(original)
    restored = utils.unnormalize(standardized, column_means, column_stds)

    npt.assert_almost_equal(original.values, restored.values)
コード例 #8
0
def test_unnormalize():
    """Check that ``utils.unnormalize`` inverts ``utils.normalize`` on the larynx data."""
    original = load_larynx()
    column_means = original.mean(0)
    column_stds = original.std(0)

    standardized = utils.normalize(original)
    restored = utils.unnormalize(standardized, column_means, column_stds)

    npt.assert_almost_equal(original.values, restored.values)
コード例 #9
0
    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
        """
        Fit the Cox Proportional Hazard model to a time varying dataset.
        Tie handling is delegated to ``self._newton_rhaphson`` (documented
        as Efron's tie-method).

        Parameters:
          df: a Pandas dataframe containing the columns named by `id_col`,
             `event_col`, `start_col` and `stop_col`, plus other
             covariates. `event_col` refers to whether the 'death' event
             was observed: 1 if observed, 0 else (censored).
          id_col:  A subject could have multiple rows in the dataframe. This
             column contains the unique identifier per subject.
          event_col: the column in dataframe that contains the subjects'
             death observation.
          start_col: the column that contains the start of a subject's time period.
          stop_col: the column that contains the end of a subject's time period.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          step_size: set an initial step size for the fitting algorithm.

        Returns:
            self, with additional properties: hazards_,
            confidence_intervals_, baseline_cumulative_hazard_,
            baseline_survival_, event_observed, start_stop_and_events

        Raises:
            KeyError: if any of the four named columns is missing from df.
        """
        df = df.copy()

        required_columns = (id_col, event_col, start_col, stop_col)
        if not all(column in df for column in required_columns):
            raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

        canonical_names = {id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'}
        df = df.rename(columns=canonical_names).set_index('id')

        bookkeeping = ["event", "stop", "start"]
        # Copy so that casting the event column does not touch ``df``.
        stop_times_events = df[bookkeeping].copy()
        covariates = df.drop(bookkeeping, axis=1)
        stop_times_events['event'] = stop_times_events['event'].astype(bool)

        self._check_values(covariates, stop_times_events)
        covariates = covariates.astype(float)

        self._norm_mean = covariates.mean(0)
        self._norm_std = covariates.std(0)

        standardized = normalize(covariates, self._norm_mean, self._norm_std)
        fitted = self._newton_rhaphson(
            standardized,
            stop_times_events,
            show_progress=show_progress,
            step_size=step_size,
        )

        # Coefficients were fitted on standardized covariates; divide by the
        # per-column std to express them on the original scale.
        coefficients = pd.DataFrame(fitted.T, columns=covariates.columns, index=['coef'])
        self.hazards_ = coefficients / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(covariates, stop_times_events)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = stop_times_events['event']
        self.start_stop_and_events = stop_times_events

        self._n_examples = covariates.shape[0]
        self._n_unique = covariates.index.unique().shape[0]

        return self
コード例 #10
0
    def predict_log_partial_hazard(self, X):
        r"""
        Return the log partial hazard for each row of X.

        This is equivalent to R's linear.predictors. "Partial" because the
        baseline hazard is not included. Equal to
        :math:`\beta (X - mean(X_{train}))`

        Parameters
        ----------
        X:  numpy array, DataFrame or Series
            a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the columns of ``self.hazards_``, so its columns
            can be in any order. A numpy array must have its columns in the
            same order as the training data. A Series whose length equals
            the number of covariates (or that number plus two) is treated
            as a single row; any other Series is accepted only when the
            model has exactly one covariate and is treated as a column.

        Returns
        -------
        log_partial_hazard: DataFrame
        """
        hazard_names = self.hazards_.columns
        n_covariates = len(hazard_names)

        if isinstance(X, pd.DataFrame):
            X = X[hazard_names]
            pass_for_numeric_dtypes_or_raise(X)
        elif isinstance(X, pd.Series):
            if X.shape[0] in (n_covariates + 2, n_covariates):
                # One subject given as a Series: turn it into a single-row frame.
                X = X.to_frame().T[hazard_names]
            else:
                assert len(hazard_names) == 1, "Series not the correct arugment"
                X = pd.DataFrame(X)
            pass_for_numeric_dtypes_or_raise(X)

        X = X.astype(float)
        row_labels = _get_index(X)

        # Scale argument is 1: only the stored training means are applied.
        centered = normalize(X, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.hazards_.T), index=row_labels)
コード例 #11
0
    def fit(self,
            df,
            id_col,
            event_col,
            start_col='start',
            stop_col='stop',
            show_progress=False,
            step_size=None):
        """
        Fit the model to a dataset with start/stop rows per subject.

        Parameters:
          df: a Pandas dataframe containing the columns named by `id_col`,
             `event_col`, `start_col` and `stop_col`; every remaining
             column is treated as a covariate.
          id_col: the column holding the subject identifier; it becomes
             the index of the working frame.
          event_col: the column holding the event indicator (cast to bool).
          start_col: the column that contains the start of a time period.
          stop_col: the column that contains the end of a time period.
          show_progress: forwarded to ``self._newton_rhaphson``.
          step_size: forwarded to ``self._newton_rhaphson``.

        Returns:
            self, with additional properties: hazards_,
            confidence_intervals_

        Raises:
            KeyError: if any of the four named columns is missing from df.
        """
        df = df.copy()

        required_columns = (id_col, event_col, start_col, stop_col)
        if not all(column in df for column in required_columns):
            raise KeyError(
                "A column specified in the call to `fit` does not exist in the dataframe provided."
            )

        canonical_names = {
            id_col: 'id',
            event_col: 'event',
            start_col: 'start',
            stop_col: 'stop',
        }
        df = df.rename(columns=canonical_names)
        df['event'] = df['event'].astype(bool)
        df = df.set_index(['id'])

        bookkeeping = ["event", "stop", "start"]
        # The validator receives its own covariate-only frame.
        self._check_values(df.drop(bookkeeping, axis=1), df['event'])

        stop_times_events = df[bookkeeping]
        covariates = df.drop(bookkeeping, axis=1)

        self._norm_mean = covariates.mean(0)
        self._norm_std = covariates.std(0)

        standardized = normalize(covariates, self._norm_mean, self._norm_std)
        fitted = self._newton_rhaphson(standardized,
                                       stop_times_events,
                                       show_progress=show_progress,
                                       step_size=step_size)

        # Coefficients were fitted on standardized covariates; divide by the
        # per-column std to express them on the original scale.
        coefficients = pd.DataFrame(
            fitted.T, columns=covariates.columns, index=['coef'])
        self.hazards_ = coefficients / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._n_examples = covariates.shape[0]

        return self
コード例 #12
0
ファイル: coxph_fitter.py プロジェクト: ticmrk/lifelines
    def predict_log_partial_hazard(self, X):
        r"""
        Return the log partial hazard for each row of X.

        "Partial" because the baseline hazard is not included. Computed as
        the dot product of the normalized covariates with ``self.hazards_``,
        i.e. :math:`\beta X`.

        X: a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the columns of ``self.hazards_``, so its columns
            can be in any order. A numpy array is used as-is, so its
            columns must be in the same order as the training data.

        Returns a DataFrame indexed by ``_get_index`` of the (re-ordered)
        input.
        """
        covariates = X[self.hazards_.columns] if isinstance(X, pd.DataFrame) else X

        row_labels = _get_index(covariates)
        # Scale argument is 1: only the stored training means are applied.
        centered = normalize(covariates, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.hazards_.T), index=row_labels)
コード例 #13
0
ファイル: coxph_fitter.py プロジェクト: springcoil/lifelines
    def predict_log_partial_hazard(self, X):
        r"""
        Return the log partial hazard for each row of X.

        "Partial" because the baseline hazard is not included. Computed as
        the dot product of the normalized covariates with ``self.hazards_``,
        i.e. :math:`\beta X`.

        X: a (n,d) covariate numpy array or DataFrame. A DataFrame is
            re-indexed to the columns of ``self.hazards_``, so its columns
            can be in any order. A numpy array is used as-is, so its
            columns must be in the same order as the training data.

        Returns a DataFrame indexed by ``_get_index`` of the (re-ordered)
        input.
        """
        covariates = X[self.hazards_.columns] if isinstance(X, pd.DataFrame) else X

        row_labels = _get_index(covariates)
        # Scale argument is 1: only the stored training means are applied.
        centered = normalize(covariates, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.hazards_.T), index=row_labels)
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the regression model to a dataset.

        NOTE(review): the original summary called this "the accelerated
        failure time model", but the body relies on ``self.n_breakpoints``;
        confirm which fitter this method belongs to.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' events was observed: 1 if observed, 0 else (censored).
            The frame is copied before any column is removed.

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes. Required even though it defaults to None.

        event_col: string, optional
            the  name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

        Raises
        ------
        TypeError
            if `duration_col` is None.
        ValueError
            if any weight is not positive, or any duration is not positive.

        Examples
        --------
        NOTE(review): this example uses ``WeibullAFTFitter`` and appears to
        come from a different fitter; confirm it applies to this class.

        >>> from lifelines import WeibullAFTFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E')
        >>> aft.print_summary()
        >>> aft.predict_median(df)

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        # Work on a copy: the pops below remove columns from the frame.
        df = df.copy()

        # Pull the duration, event and weight columns out of df; whatever
        # remains afterwards is treated as the covariates.
        T = pass_for_numeric_dtypes_or_raise_array(
            df.pop(duration_col)).astype(float)
        # No event column: every subject is marked as observed.
        E = (pass_for_numeric_dtypes_or_raise_array(df.pop(
            self.event_col)).astype(bool) if (self.event_col is not None) else
             pd.Series(np.ones(self._n_examples, dtype=bool),
                       index=df.index,
                       name="E"))
        # No weights column: every subject gets weight 1.0.
        weights = (pass_for_numeric_dtypes_or_raise_array(
            df.pop(self.weights_col)).astype(float) if
                   (self.weights_col is not None) else pd.Series(
                       np.ones(self._n_examples, dtype=float),
                       index=df.index,
                       name="weights"))
        # check to make sure their weights are okay
        if self.weights_col:
            # Non-integer weights without robust errors only warn.
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        df = df.astype(float)
        self._check_values(df, T, E, self.event_col)

        # The intercept is modelled as a constant column named "_intercept".
        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))  # TODO

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # The constant intercept column has zero std; pin it (or, without an
        # intercept, any near-constant column) to 1.0 to avoid dividing by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        # One (parameter name, column) label per fitted coefficient.
        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns]
                 for name in self._fitted_parameter_names], []))

        # The per-column std is repeated self.n_breakpoints times and
        # labelled with the MultiIndex built above.
        self._norm_std = pd.Series(np.concatenate([_norm_std.values] *
                                                  self.n_breakpoints),
                                   index=_index)

        # The model is fitted on covariates scaled by their std only (the
        # mean argument passed to normalize is 0).
        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        # Divide by the std so the parameters refer to the unscaled covariates.
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        # Cumulative hazard of the training data at the 75th percentile of T.
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(
            df, times=[np.percentile(T, 75)]).T

        return self
コード例 #15
0
    def fit(self,
            df,
            duration_col,
            event_col=None,
            weights_col=None,
            show_progress=False):
        """
        Fit the Aalen Additive model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas dataframe with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' events was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in dataframe that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in dataframe that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        weights_col: string, optional
            an optional column in the dataframe, df, that denotes the weight per subject.
            This column is expelled and not used as a covariate, but as a weight in the
            final regression. Default weight is 1.
            This can be used for case-weights. For example, a weight of 2 means there were two subjects with
            identical observations.
            This can be used for sampling weights.

        show_progress: boolean, optional (default=False)
            Since the fitter is iterative, show iteration number.


        Returns
        -------
        self: AalenAdditiveFitter
            self with additional new properties: ``cumulative_hazards_``, etc.

        Examples
        --------
        >>> from lifelines import AalenAdditiveFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aaf = AalenAdditiveFitter()
        >>> aaf.fit(df, 'T', 'E')
        >>> aaf.predict_median(df)
        >>> aaf.print_summary()

        """
        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"

        df = df.copy()

        # Stored on self before preprocessing.
        # NOTE(review): presumably _preprocess_dataframe reads these
        # attributes to find the columns -- confirm.
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col

        self._n_examples = df.shape[0]

        X, T, E, weights = self._preprocess_dataframe(df)

        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        self._norm_std = X.std(0)

        # A constant column has zero std; pin it to 1.0 so that scaling
        # does not divide by zero.
        if self.fit_intercept:
            self._norm_std["baseline"] = 1.0
        else:
            # No intercept was added by the fitter: guard every
            # near-constant column instead.
            self._norm_std[self._norm_std < 1e-8] = 1.0

        # The model is fitted on covariates scaled by their std only (the
        # mean argument passed to normalize is 0).
        self.hazards_, self.cumulative_hazards_, self.cumulative_variance_ = self._fit_model(
            normalize(X, 0, self._norm_std), T, E, weights, show_progress)
        # Divide the fitted quantities by the std so they refer to the
        # unscaled covariates.
        self.hazards_ /= self._norm_std
        self.cumulative_hazards_ /= self._norm_std
        self.cumulative_variance_ /= self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self._index = self.hazards_.index

        # Last row of the predicted cumulative hazard for the training data.
        self._predicted_hazards_ = self.predict_cumulative_hazard(
            X).iloc[-1].values.ravel()
        return self
コード例 #16
0
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the regression model to a dataset.

        NOTE(review): the original summary called this "the accelerated
        failure time model", while the example below uses
        ``PiecewiseExponentialRegressionFitter``; confirm the wording.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' events was observed: 1 if observed, 0 else (censored).
            The frame is copied before any column is removed.

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes. Required even though it defaults to None.

        event_col: string, optional
            the  name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

        Raises
        ------
        TypeError
            if `duration_col` is None.
        ValueError
            if any weight is not positive, or any duration is not positive.

        Examples
        --------

        >>> N, d = 80000, 2
        >>> # some numbers taken from http://statwonk.com/parametric-survival.html
        >>> breakpoints = (1, 31, 34, 62, 65)
        >>> betas = np.array(
        >>>     [
        >>>         [1.0, -0.2, np.log(15)],
        >>>         [5.0, -0.4, np.log(333)],
        >>>         [9.0, -0.6, np.log(18)],
        >>>         [5.0, -0.8, np.log(500)],
        >>>         [2.0, -1.0, np.log(20)],
        >>>         [1.0, -1.2, np.log(500)],
        >>>     ]
        >>> )

        >>> X = 0.1 * np.random.exponential(size=(N, d))
        >>> X = np.c_[X, np.ones(N)]
        >>> T = np.empty(N)
        >>> for i in range(N):
        >>>     lambdas = np.exp(-betas.dot(X[i, :]))
        >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
        >>> T_censor = np.minimum(
        >>>     T.mean() * np.random.exponential(size=N), 110
        >>> )  # 110 is the end of observation, eg. current time.
        >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
        >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
        >>> df["E"] = T <= T_censor

        >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
        >>> pew.print_summary()
        >>> pew.plot()

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        # Work on a copy: the pops below remove columns from the frame.
        df = df.copy()

        # Pull the duration, event and weight columns out of df; whatever
        # remains afterwards is treated as the covariates.
        T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
        # No event column: every subject is marked as observed.
        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
        )
        # No weights column: every subject gets weight 1.0.
        weights = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
            if (self.weights_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
        )
        # check to make sure their weights are okay
        if self.weights_col:
            # Non-integer weights without robust errors only warn.
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError("values in weight column %s must be positive." % self.weights_col)

        df = df.astype(float)
        # E is validated with its original dtype and only cast to bool afterwards.
        self._check_values(df, T, E, self.event_col)

        E = E.astype(bool)
        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        # The intercept is modelled as a constant column named "_intercept".
        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # The constant intercept column has zero std; pin it (or, without an
        # intercept, any near-constant column) to 1.0 to avoid dividing by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        # One (parameter name, column) label per fitted coefficient.
        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
        )

        # The per-column std is repeated self.n_breakpoints times and
        # labelled with the MultiIndex built above.
        self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

        # The model is fitted on covariates scaled by their std only (the
        # mean argument passed to normalize is 0).
        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        # Divide by the std so the parameters refer to the unscaled covariates.
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        # Cumulative hazard of the training data at the 75th percentile of T.
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

        return self
コード例 #17
0
    def fit(self, df, duration_col, event_col=None,
            show_progress=False, initial_beta=None, include_likelihood=False,
            strata=None):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe holding `duration_col`, optionally
             `event_col`, and the covariates. The caller's frame is not
             modified; a time-sorted copy is used internally.
          duration_col: the column in the dataframe that contains the
             subjects' lifetimes.
          event_col: the column in the dataframe that flags whether the
             'death' event was observed: 1 if observed, 0 else (censored).
             If left as None, all individuals are treated as non-censored.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          include_likelihood: saves the final log-likelihood to the
             CoxPHFitter under the property _log_likelihood.
          strata: a list of columns to use in stratification. This is useful
             if a categorical covariate does not obey the proportional hazard
             assumption. This is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_, confidence_intervals_,
            durations, event_observed, baseline_hazard_,
            baseline_cumulative_hazard_ and baseline_survival_.

        """
        # sort_values returns a new frame ordered by time, leaving the input as is.
        covariates = df.sort_values(by=duration_col)

        # Strata columns become the index so they are not fitted as coefficients.
        self.strata = strata
        stratified = strata is not None
        if stratified:
            covariates = covariates.set_index(strata)

        # Pull the time and event columns out of the covariate frame.
        durations = covariates.pop(duration_col)
        if event_col is not None:
            events = covariates.pop(event_col)
        else:
            events = pd.Series(np.ones(covariates.shape[0]), index=covariates.index)

        # Keep the original, non-normalized covariates around.
        self.data = covariates.reset_index() if stratified else covariates
        self._check_values(covariates)

        if self.normalize:
            # Remember the scaling so future inputs can be normalized the same way.
            self._norm_mean = covariates.mean(0)
            self._norm_std = covariates.std(0)
            covariates = normalize(covariates)

        events = events.astype(bool)

        hazards_ = self._newton_rhaphson(
            covariates,
            durations,
            events,
            initial_beta=initial_beta,
            show_progress=show_progress,
            include_likelihood=include_likelihood,
        )

        self.hazards_ = pd.DataFrame(hazards_.T, columns=covariates.columns, index=['coef'])
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.durations = durations
        self.event_observed = events

        self.baseline_hazard_ = self._compute_baseline_hazards(covariates, durations, events)
        self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
        self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
        return self
コード例 #18
0
    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights_col=None, show_progress=False, step_size=None, robust=False):
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with one row per subject-period. It must contain
             `id_col`, `event_col`, `start_col` and `stop_col`; every remaining
             column (other than `weights_col`) is used as a covariate.
          id_col:  A subject could have multiple rows in the dataframe. This column contains
             the unique identifier per subject.
          event_col: the column in dataframe that contains the subjects' death
             observation: 1 if observed, 0 else (censored).
          start_col: the column that contains the start of a subject's time period.
          stop_col: the column that contains the end of a subject's time period.
          weights_col: the column that contains (possibly time-varying) weight of each subject-period row.
             When None, every row gets a weight of 1.0.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          step_size: set an initial step size for the fitting algorithm.
          robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate.
             Not implemented here: passing a truthy value raises NotImplementedError.

        Returns:
            self, with additional properties: hazards_, variance_matrix_, standard_errors_,
            confidence_intervals_, baseline_cumulative_hazard_, baseline_survival_, etc.

        Raises:
            NotImplementedError: if `robust` is truthy.
            KeyError: if any of `id_col`, `event_col`, `start_col`, `stop_col` is not in `df`.
            ValueError: if `weights_col` is given and holds a non-positive value.

        """

        self.robust = robust
        if robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

        frame = df.copy()

        required_columns = (id_col, event_col, start_col, stop_col)
        if any(column not in frame for column in required_columns):
            raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")

        if weights_col is None:
            assert '__weights' not in frame.columns, '__weights is an internal lifelines column, please rename your column first.'
            frame['__weights'] = 1.0
        elif (frame[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

        # Map the user's column names onto the fixed internal names. With
        # weights_col=None the None key is expected to match no column, and
        # the '__weights' column added above is used instead.
        internal_names = {id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'}
        frame = frame.rename(columns=internal_names).set_index('id')

        timing_columns = ["event", "stop", "start"]
        stop_times_events = frame[timing_columns].copy()
        weights = frame[['__weights']].copy().astype(float)
        covariates = frame.drop(timing_columns + ["__weights"], axis=1)
        stop_times_events['event'] = stop_times_events['event'].astype(bool)

        self._check_values(covariates, stop_times_events)
        covariates = covariates.astype(float)

        self._norm_mean = covariates.mean(0)
        self._norm_std = covariates.std(0)

        hazards_ = self._newton_rhaphson(
            normalize(covariates, self._norm_mean, self._norm_std),
            stop_times_events,
            weights,
            show_progress=show_progress,
            step_size=step_size,
        )

        # The model was fitted on normalized covariates; rescale back to the original units.
        normalized_coefs = pd.DataFrame(hazards_.T, columns=covariates.columns, index=['coef'])
        self.hazards_ = normalized_coefs / self._norm_std
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(covariates, self._norm_mean, self._norm_std),
            stop_times_events,
            weights,
        )
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(covariates, stop_times_events, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = stop_times_events['event']
        self.start_stop_and_events = stop_times_events

        self._n_examples = covariates.shape[0]
        self._n_unique = covariates.index.unique().shape[0]
        return self
コード例 #19
0
def test_normalize():
    """Check that utils.normalize gives each larynx column zero mean and unit std."""
    larynx = load_larynx()
    n_columns = larynx.shape[1]
    column_means = utils.normalize(larynx).mean(0).values
    npt.assert_almost_equal(column_means, np.zeros(n_columns))
    column_stds = utils.normalize(larynx).std(0).values
    npt.assert_almost_equal(column_stds, np.ones(n_columns))
コード例 #20
0
ファイル: test_utils.py プロジェクト: DGaffney/lifelines
def test_normalize():
    """Normalizing the larynx dataset must yield column means of 0 and stds of 1."""
    frame = load_larynx()
    _, width = frame.shape
    # Mean is checked first, then standard deviation.
    for statistic, make_expected in (("mean", np.zeros), ("std", np.ones)):
        observed = getattr(utils.normalize(frame), statistic)(0).values
        npt.assert_almost_equal(observed, make_expected(width))
コード例 #21
0
ファイル: coxph_fitter.py プロジェクト: springcoil/lifelines
    def fit(self, df, duration_col, event_col=None,
            show_progress=False, initial_beta=None,
            strata=None, step_size=None, weights_col=None):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe holding `duration_col`, optionally `event_col`
             and `weights_col`, plus the covariates. The caller's frame is not
             modified; a time-sorted copy is used internally.
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation: 1 if observed, 0 else (censored). If left as None,
             assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R. When None, the value
             already stored on `self.strata` is used.
             See http://courses.washington.edu/b515/l17.pdf.
          step_size: passed through to the Newton-Raphson routine (`_newton_rhaphson`).

        Returns:
            self, with additional properties: hazards_, confidence_intervals_,
            baseline_hazard_, baseline_cumulative_hazard_, baseline_survival_, score_

        """
        # sort_values returns a new frame ordered by time, leaving the input as is.
        frame = df.sort_values(by=duration_col)

        self._n_examples = frame.shape[0]
        self.strata = coalesce(strata, self.strata)
        stratified = self.strata is not None
        if stratified:
            # Remember the row labels: set_index replaces them with the strata values.
            original_index = frame.index.copy()
            frame = frame.set_index(self.strata)

        # Pull the time, event and weight columns out of the covariate frame.
        durations = frame.pop(duration_col)
        if event_col is not None:
            events = frame.pop(event_col)
        else:
            events = pd.Series(np.ones(frame.shape[0]), index=frame.index)

        weights = frame.pop(weights_col).values if weights_col else np.ones(self._n_examples)

        self._check_values(frame, events)
        frame = frame.astype(float)

        # Keep copies of the fitting data, labelled with the original row index.
        stored_durations = durations.copy()
        stored_events = events.copy()
        if stratified:
            stored_durations.index = original_index
            stored_events.index = original_index
        self.durations = stored_durations
        self.event_observed = stored_events.astype(bool)

        self._norm_mean = frame.mean(0)
        self._norm_std = frame.std(0)

        events = events.astype(bool)

        hazards_ = self._newton_rhaphson(
            normalize(frame, self._norm_mean, self._norm_std),
            durations,
            events,
            weights=weights,
            initial_beta=initial_beta,
            show_progress=show_progress,
            step_size=step_size,
        )

        # The model was fitted on normalized covariates; rescale back to the original units.
        normalized_coefs = pd.DataFrame(hazards_.T, columns=frame.columns, index=['coef'])
        self.hazards_ = normalized_coefs / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(frame, durations, events)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
        self.baseline_survival_ = self._compute_baseline_survival()

        # Negated partial hazards are passed as the predicted scores.
        self.score_ = concordance_index(
            self.durations,
            -self.predict_partial_hazard(frame).values.ravel(),
            self.event_observed,
        )
        mean_covariates = self._norm_mean.to_frame().T
        self._train_log_partial_hazard = self.predict_log_partial_hazard(mean_covariates)
        return self
コード例 #22
0
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            strata=None,
            step_size=None,
            weights_col=None):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe holding `duration_col`, optionally `event_col`
             and `weights_col`, plus the covariates. The caller's frame is not
             modified; a time-sorted copy is used internally.
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation: 1 if observed, 0 else (censored). If left as None,
             assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1. A RuntimeWarning is issued when
             any weight is not integer-valued.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R. When None, the value
             already stored on `self.strata` is used.
             See http://courses.washington.edu/b515/l17.pdf.
          step_size: passed through to the Newton-Raphson routine (`_newton_rhaphson`).

        Returns:
            self, with additional properties: hazards_, confidence_intervals_,
            baseline_hazard_, baseline_cumulative_hazard_, baseline_survival_, score_

        """

        # sort_values returns a new frame ordered by time, leaving the input as is.
        frame = df.sort_values(by=duration_col)

        self._n_examples = frame.shape[0]
        self.strata = coalesce(strata, self.strata)
        stratified = self.strata is not None
        if stratified:
            # Remember the row labels: set_index replaces them with the strata values.
            original_index = frame.index.copy()
            frame = frame.set_index(self.strata)

        # Pull the time and event columns out of the covariate frame.
        durations = frame.pop(duration_col)
        if event_col is not None:
            events = frame.pop(event_col)
        else:
            events = pd.Series(np.ones(frame.shape[0]), index=frame.index)

        if weights_col:
            weights = frame.pop(weights_col)
            has_fractional_weights = (weights.astype(int) != weights).any()
            if has_fractional_weights:
                non_integer_message = (
                    "It looks like your weights are not integers, possibly propensity scores then?\n"
                    "It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to\n"
                    "estimate the variances. See paper \"Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis\"\n"
                    "                    "
                )
                warnings.warn(non_integer_message, RuntimeWarning)
        else:
            # Unit weights, stored as a single-column frame aligned with the rows.
            unit_weights = np.ones((self._n_examples, 1))
            weights = pd.DataFrame(unit_weights, index=frame.index)

        self._check_values(frame, durations, events)
        frame = frame.astype(float)

        # Keep copies of the fitting data, labelled with the original row index.
        stored_durations = durations.copy()
        stored_events = events.copy()
        if stratified:
            stored_durations.index = original_index
            stored_events.index = original_index
        self.durations = stored_durations
        self.event_observed = stored_events.astype(bool)

        self._norm_mean = frame.mean(0)
        self._norm_std = frame.std(0)

        events = events.astype(bool)

        hazards_ = self._newton_rhaphson(
            normalize(frame, self._norm_mean, self._norm_std),
            durations,
            events,
            weights=weights,
            initial_beta=initial_beta,
            show_progress=show_progress,
            step_size=step_size,
        )

        # The model was fitted on normalized covariates; rescale back to the original units.
        normalized_coefs = pd.DataFrame(hazards_.T, columns=frame.columns, index=['coef'])
        self.hazards_ = normalized_coefs / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(frame, durations, events)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
        self.baseline_survival_ = self._compute_baseline_survival()

        # Negated partial hazards are passed as the predicted scores.
        self.score_ = concordance_index(
            self.durations,
            -self.predict_partial_hazard(frame).values.ravel(),
            self.event_observed,
        )
        mean_covariates = self._norm_mean.to_frame().T
        self._train_log_partial_hazard = self.predict_log_partial_hazard(mean_covariates)
        return self
コード例 #23
0
    def fit(self,
            df,
            duration_col=None,
            event_col=None,
            ancillary_df=None,
            show_progress=False,
            timeline=None):
        """
        Fit the Weibull accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights, strata).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' events was observed: 1 if observed, 0 else (censored).
            The frame is copied before any column is removed or added.

        duration_col: string
            the name of the column in dataframe that contains the subjects'
            lifetimes. Required, despite the ``None`` default.

        event_col: string, optional
            the name of the column in dataframe that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        ancillary_df: None, boolean, or DataFrame, optional (default=None)
            Choose to model the ancillary parameters.
            If None or False, explicitly do not fit the ancillary parameters using any covariates
            (only an ``_intercept`` column is used).
            If True, model the ancillary parameters with the same covariates as ``df``.
            If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        Returns
        -------
        self: WeibullAFTFitter
            self with additional new properties: ``params_``, ``variance_matrix_``,
            ``standard_errors_``, ``confidence_intervals_`` and more

        Raises
        ------
        TypeError
            if ``duration_col`` is None.
        ValueError
            if any duration is zero or negative.
        AssertionError
            if ``ancillary_df`` is a DataFrame whose row count differs from ``df``,
            or if ``fit_intercept`` is set and ``df`` already has an ``_intercept`` column.

        Examples
        --------
        >>> from lifelines import WeibullAFTFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E')
        >>> aft.print_summary()
        >>> aft.predict_median(df)
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E', ancillary_df=df)
        >>> aft.print_summary()
        >>> aft.predict_median(df)

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline

        # Work on a copy: columns are popped from and added to this frame below.
        df = df.copy()

        # Remove the duration/event columns from df so only covariates remain.
        T = pass_for_numeric_dtypes_or_raise_array(
            df.pop(duration_col)).astype(float)
        # Without an event column, every subject is marked as observed (all ones).
        E = (pass_for_numeric_dtypes_or_raise_array(df.pop(
            self.event_col)).astype(bool) if (self.event_col is not None) else
             pd.Series(np.ones(self._n_examples), index=df.index, name="E"))
        self.durations = T.copy()
        self.event_observed = E.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        self._check_values(df, T, E, self.event_col)

        if isinstance(ancillary_df, pd.DataFrame):
            assert ancillary_df.shape[0] == df.shape[
                0], "ancillary_df must be the same shape[0] as df"

            # Copy the user's frame and strip the duration/event columns if present.
            ancillary_df = ancillary_df.copy().drop([duration_col, event_col],
                                                    axis=1,
                                                    errors="ignore")
            self._check_values(ancillary_df, T, E, self.event_col)

        elif (ancillary_df is None) or (ancillary_df is False):
            # No ancillary covariates: use a single constant column.
            ancillary_df = pd.DataFrame(np.ones((df.shape[0], )),
                                        index=df.index,
                                        columns=["_intercept"])
        elif ancillary_df is True:
            # Reuse the primary covariates for the ancillary parameter.
            ancillary_df = df.copy()

        if self.fit_intercept:
            assert "_intercept" not in df
            ancillary_df["_intercept"] = 1.0
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns),
                                                 len(ancillary_df.columns))

        _norm_std, _norm_std_ancillary = df.std(0), ancillary_df.std(0)
        self._norm_mean, self._norm_mean_ancillary = df.mean(
            0), ancillary_df.mean(0)
        # if we included an intercept, we need to fix not divide by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
            _norm_std_ancillary["_intercept"] = 1.0
        else:
            # Near-constant columns get a unit scale so the division is safe.
            _norm_std[_norm_std < 1e-8] = 1.0
            _norm_std_ancillary[_norm_std_ancillary < 1e-8] = 1.0

        # Two-level parameter index: ("lambda_", column) for df covariates,
        # ("rho_", column) for ancillary covariates.
        _index = pd.MultiIndex.from_tuples([("lambda_", c)
                                            for c in df.columns] +
                                           [("rho_", c)
                                            for c in ancillary_df.columns])

        self._norm_std = pd.Series(np.append(_norm_std, _norm_std_ancillary),
                                   index=_index)

        # Fit on covariates divided by their std (the subtracted mean is 0).
        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            normalize(df, 0, _norm_std).values,
            normalize(ancillary_df, 0, _norm_std_ancillary).values,
            show_progress=show_progress,
        )
        # Undo the scaling so the parameters refer to the original covariate units.
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors()
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df, ancillary_df)

        return self
コード例 #24
0
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            strata=None,
            step_size=None,
            weights_col=None,
            cluster_col=None,
            robust=False):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' events was observed: 1 if observed, 0 else (censored).
             The frame is copied and sorted by duration; the input is not modified.
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
             This can be used for case-weights. For example, a weight of 2 means there were two subjects with
             identical observations.
             This can be used for sampling weights. In that case, use `robust=True` to get more accurate standard errors.
             A RuntimeWarning is issued for non-integer weights unless `robust` is set;
             non-positive weights raise ValueError.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R. When None, the value
             already stored on `self.strata` is used.
             See http://courses.washington.edu/b515/l17.pdf.
          step_size: set an initial step size for the fitting algorithm.
          robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
            ties, so if there are high number of ties, results may significantly differ. See
            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078
          cluster_col: specifies what column has unique identifiers for clustering covariances. Documented as forcing
            the sandwich estimator (robust variance estimator) to be used; this method only removes the column
            from the covariates and stores it on `self._clusters`.
        Returns:
            self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.

        Raises:
            ValueError: if `weights_col` is given and contains a non-positive value.

        """

        df = df.copy()

        # Sort on time
        df = df.sort_values(by=duration_col)

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + ' UTC'
        self.duration_col = duration_col
        self.event_col = event_col
        self.robust = robust
        self.cluster_col = cluster_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            # Keep the row labels; set_index below replaces them with the strata values.
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Extract time and event
        T = df[duration_col]
        del df[duration_col]
        if event_col is None:
            # No event column: treat every subject as observed.
            E = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            E = df[event_col]
            del df[event_col]

        if weights_col:
            weights = df.pop(weights_col)
            # Non-integer weights only trigger the warning when robust errors were not requested.
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
""", RuntimeWarning)
            if (weights <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        else:
            # Default: unit weight for every row.
            weights = pd.Series(np.ones((self._n_examples, )), index=df.index)

        if self.cluster_col:
            # Remove the cluster ids from the covariates and keep them for later use.
            self._clusters = df.pop(self.cluster_col)

        self._check_values(df, T, E)
        df = df.astype(float)

        # save fitting data for later
        self.durations = T.copy()
        self.event_observed = E.copy()
        if self.strata is not None:
            # Stored copies are labelled with the original row index, not the strata.
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        E = E.astype(bool)

        # Fit on normalized covariates.
        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean,
                                                   self._norm_std),
                                         T,
                                         E,
                                         weights=weights,
                                         initial_beta=initial_beta,
                                         show_progress=show_progress,
                                         step_size=step_size)

        # Rescale the coefficients back to the original covariate units.
        self.hazards_ = pd.DataFrame(
            hazards_.T, columns=df.columns, index=['coef']) / self._norm_std

        # NOTE(review): self._hessian_ is not assigned in this method; presumably
        # it is set by _newton_rhaphson above -- confirm.
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(
            self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), T, E, weights)
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(
            df, T, E, weights)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard(
        )
        self.baseline_survival_ = self._compute_baseline_survival()
        self._predicted_partial_hazards_ = self.predict_partial_hazard(
            df).values

        # Log partial hazard evaluated at the mean covariate vector of the training data.
        self._train_log_partial_hazard = self.predict_log_partial_hazard(
            self._norm_mean.to_frame().T)
        return self
コード例 #25
0
    def fit(self, df, duration_col, event_col=None, weights_col=None, show_progress=False):
        """
        Fit the Aalen Additive model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' events was observed: 1 if observed, 0 else (censored).
            A copy of the frame is handed to the preprocessing step.

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        weights_col: string, optional
            an optional column in the DataFrame, df, that denotes the weight per subject.
            This column is expelled and not used as a covariate, but as a weight in the
            final regression. Default weight is 1.
            This can be used for case-weights. For example, a weight of 2 means there were two subjects with
            identical observations.
            This can be used for sampling weights.

        show_progress: boolean, optional (default=False)
            Since the fitter is iterative, show iteration number.


        Returns
        -------
        self: AalenAdditiveFitter
            self with additional new properties: ``hazards_``, ``cumulative_hazards_``,
            ``cumulative_variance_``, ``confidence_intervals_``, etc.

        Examples
        --------
        >>> from lifelines import AalenAdditiveFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aaf = AalenAdditiveFitter()
        >>> aaf.fit(df, 'T', 'E')
        >>> aaf.predict_median(df)
        >>> aaf.print_summary()

        """
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"

        frame = df.copy()

        # Column names are stored on self before the frame is preprocessed.
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col

        self._n_examples = frame.shape[0]

        covariates, durations, events, weights = self._preprocess_dataframe(frame)

        self.durations = durations.copy()
        self.event_observed = events.copy()
        self.weights = weights.copy()

        # Per-column scale used to normalize the covariates. The intercept
        # (or any near-constant column) gets a unit scale to avoid dividing by zero.
        scale = covariates.std(0)
        if self.fit_intercept:
            scale["_intercept"] = 1.0
        else:
            # a _intercept was provided
            scale[scale < 1e-8] = 1.0
        self._norm_std = scale

        fitted = self._fit_model(
            normalize(covariates, 0, scale), durations, events, weights, show_progress
        )
        self.hazards_, self.cumulative_hazards_, self.cumulative_variance_ = fitted

        # Undo the scaling, in place, on each fitted quantity.
        self.hazards_ /= scale
        self.cumulative_hazards_ /= scale
        # NOTE(review): a variance would normally rescale by scale ** 2 -- confirm
        # that dividing by the scale once is intended here.
        self.cumulative_variance_ /= scale
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self._index = self.hazards_.index

        final_cumulative_hazard = self.predict_cumulative_hazard(covariates).iloc[-1]
        self._predicted_hazards_ = final_cumulative_hazard.values.ravel()
        return self
コード例 #26
0
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `id_col`, `event_col`,
            `start_col` and `stop_col`, plus other covariates. `event_col` refers
            to whether the 'death' event was observed: 1 if observed, 0 else
            (censored). A copy is taken; the caller's frame is not modified.
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
            the unique identifier per subject.
        event_col: string
            the column in DataFrame that contains the subjects' death
            observation.
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
            If None, every row gets a weight of 1.0.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        robust: boolean, optional (default: False)
            Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate.
            Not implemented for this fitter: a truthy value raises ``NotImplementedError``.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns n to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similar to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``params_``, ``hazard_ratios_``
            and ``baseline_cumulative_hazard_``

        Raises
        ------
        NotImplementedError
            if ``robust`` is truthy.
        KeyError
            if `id_col`, `event_col`, `start_col` or `stop_col` is not in ``df``.
        AssertionError
            if ``weights_col`` is None and ``df`` already has a ``__weights`` column.
        ValueError
            if any value in `weights_col` is not strictly positive.

        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self.id_col = id_col
        self.stop_col = stop_col
        self.start_col = start_col
        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")

        df = df.copy()

        if not (id_col in df and event_col in df and start_col in df
                and stop_col in df):
            raise KeyError(
                "A column specified in the call to `fit` does not exist in the DataFrame provided."
            )

        if weights_col is None:
            self.weights_col = None
            # Raised explicitly instead of via `assert`: assert statements are
            # removed under `python -O`, which would let the assignment below
            # silently overwrite a user column named "__weights".
            if "__weights" in df.columns:
                raise AssertionError(
                    "__weights is an internal lifelines column, please rename your column first."
                )
            df["__weights"] = 1.0
        else:
            self.weights_col = weights_col
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        # Move the user-named columns onto fixed internal names.
        df = df.rename(
            columns={
                id_col: "id",
                event_col: "event",
                start_col: "start",
                stop_col: "stop",
                weights_col: "__weights"
            })

        if self.strata is None:
            df = df.set_index("id")
        else:
            df = df.set_index(_to_list(self.strata) +
                              ["id"])  # TODO: needs to be a list
            df = df.sort_index()

        events, start, stop = (
            pass_for_numeric_dtypes_or_raise_array(
                df.pop("event")).astype(bool),
            df.pop("start"),
            df.pop("stop"),
        )
        weights = df.pop("__weights").astype(float)

        # Only covariate columns remain in df from here on.
        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        params_ = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        # The optimizer ran on normalized covariates; dividing by the
        # per-column std reports coefficients for the original columns
        # (assumes `normalize` scales by `_norm_std`).
        self.params_ = pd.Series(params_, index=df.columns,
                                 name="coef") / self._norm_std
        self.hazard_ratios_ = pd.Series(np.exp(self.params_),
                                        index=df.columns,
                                        name="exp(coef)")
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(
            self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), events, start,
            stop, weights)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(
            df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({
            "event": events,
            "start": start,
            "stop": stop
        })
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
コード例 #27
0
    def _fit(
        self,
        log_likelihood_function,
        df,
        Ts,
        regressors,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
        entry_col=None,
    ):
        """
        Shared fitting routine: extract event/weight/entry columns, scale the
        covariates, run ``_fit_model`` and store the results on ``self``.

        Parameters
        ----------
        log_likelihood_function:
            forwarded unchanged to ``self._fit_model``.
        df: DataFrame
            covariates plus the optional `event_col`, `weights_col` and
            `entry_col` columns. NOTE: those columns are removed from the
            passed-in frame with ``pop``, i.e. the caller's object is mutated.
        Ts: indexable
            forwarded to ``self._fit_model``; ``coalesce(Ts[1], Ts[0])`` is
            what gets passed to ``self._check_values``
            (presumably duration bounds -- confirm against callers).
        regressors: dict
            maps a parameter name to the covariate column names used for it;
            iteration order defines the order of the parameter index.
        event_col: string, optional
            event indicator column. If None, all rows are treated as observed.
        show_progress: boolean, optional
            forwarded to ``self._fit_model``.
        timeline: optional
            stored on ``self.timeline``; not otherwise used here.
        weights_col: string, optional
            weights column. If None, every row gets weight 1.0.
        robust: boolean, optional
            stored on ``self.robust``; not otherwise used here.
        initial_point: optional
            forwarded to ``self._fit_model``.
        entry_col: string, optional
            entry (late-entry) column. If None, every row gets entry 0.0.

        Returns
        -------
        None. Results are stored as attributes (``params_``,
        ``variance_matrix_``, ``standard_errors_``,
        ``confidence_intervals_``, ...).
        """

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.weights_col = weights_col
        self.entry_col = entry_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust
        self.regressors = regressors  # TODO name

        # Pull the special columns out of df (in this order) or fall back to
        # constant defaults aligned on df's index.
        if self.event_col is not None:
            E = pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        else:
            E = pd.Series(np.ones(self._n_examples, dtype=bool),
                          index=df.index,
                          name="E")

        if self.weights_col is not None:
            weights = pass_for_numeric_dtypes_or_raise_array(
                df.pop(self.weights_col)).astype(float)
        else:
            weights = pd.Series(np.ones(self._n_examples, dtype=float),
                                index=df.index,
                                name="weights")

        if entry_col is not None:
            entries = pass_for_numeric_dtypes_or_raise_array(
                df.pop(entry_col)).astype(float)
        else:
            entries = pd.Series(np.zeros(self._n_examples, dtype=float),
                                index=df.index,
                                name="entry")

        check_nans_or_infs(E)
        E = E.astype(bool)
        self.event_observed = E.copy()
        self.entry = entries.copy()
        self.weights = weights.copy()

        df = df.astype(float)
        self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)

        # Scale only (center is 0); near-constant columns get a unit scale so
        # the division is well defined.
        _norm_std = df.std(0)
        _norm_std[_norm_std < 1e-8] = 1.0
        df_normalized = normalize(df, 0, _norm_std)

        Xs = self._create_Xs_dict(df_normalized)

        self._LOOKUP_SLICE = self._create_slicer(Xs)

        # One (parameter name, column) pair per coefficient, in regressors
        # order. Built with a flat comprehension rather than sum(lists, []),
        # which re-copies the accumulated list on every addition.
        _index = pd.MultiIndex.from_tuples(
            [(name, col)
             for name, columns in regressors.items()
             for col in columns])

        self._norm_std = pd.Series(
            [_norm_std.loc[variable_name] for _, variable_name in _index],
            index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            log_likelihood_function,
            Ts,
            Xs,
            E.values,
            weights.values,
            entries.values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        # Fitted on scaled covariates; divide by the matching per-coefficient
        # scale.
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            Ts, E.values, weights.values, entries.values, Xs)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df)
コード例 #28
0
ファイル: coxph_fitter.py プロジェクト: rosejn/lifelines
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            include_likelihood=False,
            strata=None):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' event was observed: 1 if observed, 0 else (censored).
             A sorted copy is used; the caller's frame is not modified.
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          include_likelihood: saves the final log-likelihood to the CoxPHFitter under
             the property _log_likelihood.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_

        """
        # Work on a copy ordered by duration.
        df = df.copy().sort_values(by=duration_col)

        # Strata columns become the index, so they are not fitted as covariates.
        self.strata = strata
        if strata is not None:
            df = df.set_index(strata)

        # Split the duration and event columns off the covariates.
        lifetimes = df.pop(duration_col)
        if event_col is not None:
            observed = df.pop(event_col)
        else:
            observed = pd.Series(np.ones(df.shape[0]), index=df.index)

        # Keep the covariates as they were before any normalization.
        if self.strata is None:
            self.data = df
        else:
            self.data = df.reset_index()
        self._check_values(df)

        if self.normalize:
            # Remembered so that later inputs can be normalized the same way.
            self._norm_mean = df.mean(0)
            self._norm_std = df.std(0)
            df = normalize(df)

        observed = observed.astype(bool)

        coefficients = self._newton_rhaphson(
            df,
            lifetimes,
            observed,
            initial_beta=initial_beta,
            show_progress=show_progress,
            include_likelihood=include_likelihood,
        )

        self.hazards_ = pd.DataFrame(coefficients.T,
                                     columns=df.columns,
                                     index=["coef"])
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.durations = lifetimes
        self.event_observed = observed

        # Baseline quantities are derived from one another in this order.
        self.baseline_hazard_ = self._compute_baseline_hazards(
            df, lifetimes, observed)
        self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
        self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
        return self
コード例 #29
0
    def fit(self,
            df,
            event_col,
            start_col="start",
            stop_col="stop",
            weights_col=None,
            id_col=None,
            show_progress=False,
            robust=False,
            strata=None,
            initial_point=None,
            val_df=None):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Nonlinear Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.
        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame with the `event_col`, `start_col` and `stop_col`
            columns, plus other covariates. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).
        event_col: string
            the column in DataFrame that contains the subjects' death
            observation.
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
        id_col: string, optional
            A subject could have multiple rows in the DataFrame. This column contains
            the unique identifier per subject. If not provided, it's up to the
            user to make sure that there are no violations.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        robust: bool, optional (default: False)
            Robust errors are not implemented: a truthy value raises
            ``NotImplementedError``.
        strata: list or string, optional
            specify a column or list of columns n to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similar to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            accepted for signature compatibility; not used in this method.
        val_df: DataFrame, optional
            validation data. It goes through the same ``preprocess_df`` call as
            ``df``, is normalized with the training mean/std, and is handed to
            ``_neural_cox``.
            NOTE(review): the default ``None`` is forwarded as-is to
            ``preprocess_df`` and ``normalize`` -- confirm those accept None.
        Returns
        --------
        self: CoxNonLinearTimeVaryingFitter
            self, with additional properties like ``net``, ``beta_params_``
            and ``baseline_cumulative_hazard_``
        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self.id_col = id_col
        self.stop_col = stop_col
        self.start_col = start_col
        fit_timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        self._time_fit_was_called = fit_timestamp + " UTC"

        # Training and validation frames go through identical preprocessing.
        column_args = (event_col, start_col, stop_col, weights_col, id_col)
        df, events, start, stop, weights = self.preprocess_df(df, *column_args)
        val_df, val_events, val_start, val_stop, val_weights = self.preprocess_df(
            val_df, *column_args)

        # Normalization statistics come from the training covariates only.
        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)
        self._norm_std[self._norm_std == 0] = 1.0  # Avoid div by zero.

        # Network architecture: one input per covariate column, one output.
        n_inputs = df.values.shape[-1]
        n_outputs = 1
        self.type_pt = torch.float
        self.net = Net(n_inputs, self.num_units, n_outputs, self.num_layers,
                       self.p_dropout, self.type_pt)

        scaled_train = normalize(df, self._norm_mean, self._norm_std)
        scaled_val = normalize(val_df, self._norm_mean, self._norm_std)
        self.net = self._neural_cox(
            scaled_train,
            events,
            start,
            stop,
            weights,
            scaled_val,
            val_events,
            val_start,
            val_stop,
            val_weights,
            net=self.net,
            show_progress=show_progress,
            training_epochs=self.num_epochs,
            batch_size=self.batch_size,
            step_size=self.learning_rate,
        )

        # First parameter tensor of the network's `beta` module, flattened.
        first_beta_tensor = list(self.net.beta.parameters())[0]
        self.beta_params_ = pd.Series(
            first_beta_tensor.detach().numpy().ravel(), name="coef")
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(
            df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({
            "event": events,
            "start": start,
            "stop": stop
        })
        self.weights = weights
        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
コード例 #30
0
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `id_col`, `event_col`,
            `start_col` and `stop_col`, plus other covariates. `event_col` refers
            to whether the 'death' event was observed: 1 if observed, 0 else
            (censored). A copy is taken; the caller's frame is not modified.
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
            the unique identifier per subject.
        event_col: string
            the column in DataFrame that contains the subjects' death
            observation.
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
            If None, every row gets a weight of 1.0.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        robust: boolean, optional (default: False)
            Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate.
            Not implemented for this fitter: a truthy value raises ``NotImplementedError``.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns n to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similar to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``hazards_`` and
            ``baseline_cumulative_hazard_``

        Raises
        ------
        NotImplementedError
            if ``robust`` is truthy.
        KeyError
            if `id_col`, `event_col`, `start_col` or `stop_col` is not in ``df``.
        AssertionError
            if ``weights_col`` is None and ``df`` already has a ``__weights`` column.
        ValueError
            if any value in `weights_col` is not strictly positive.

        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self.id_col = id_col
        self.stop_col = stop_col
        self.start_col = start_col
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

        df = df.copy()

        if not (id_col in df and event_col in df and start_col in df and stop_col in df):
            raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

        if weights_col is None:
            self.weights_col = None
            # Raised explicitly instead of via `assert`: assert statements are
            # removed under `python -O`, which would let the assignment below
            # silently overwrite a user column named "__weights".
            if "__weights" in df.columns:
                raise AssertionError(
                    "__weights is an internal lifelines column, please rename your column first."
                )
            df["__weights"] = 1.0
        else:
            self.weights_col = weights_col
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        # Move the user-named columns onto fixed internal names.
        df = df.rename(
            columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
        )

        if self.strata is None:
            df = df.set_index("id")
        else:
            df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
            df = df.sort_index()

        events, start, stop = (
            pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
            df.pop("start"),
            df.pop("stop"),
        )
        weights = df.pop("__weights").astype(float)

        # Only covariate columns remain in df from here on.
        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        hazards_ = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        # The optimizer ran on normalized covariates; dividing by the
        # per-column std reports coefficients for the original columns
        # (assumes `normalize` scales by `_norm_std`).
        self.hazards_ = pd.Series(hazards_, index=df.columns, name="coef") / self._norm_std
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
        )
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self