def preprocess_df(self, df, event_col, start_col, stop_col, weights_col,
                      id_col):
        df = df.copy()

        if not (event_col in df and start_col in df and stop_col in df):
            raise KeyError(
                "A column specified in the call to `fit` does not exist in the DataFrame provided."
            )

        if weights_col is None:
            self.weights_col = None
            assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first."
            df["__weights"] = 1.0
        else:
            self.weights_col = weights_col
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        df = df.rename(
            columns={
                event_col: "event",
                start_col: "start",
                stop_col: "stop",
                weights_col: "__weights"
            })
        if self.strata is not None and self.id_col is not None:
            df = df.set_index(_to_list(self.strata) + [id_col])
            df = df.sort_index()
        elif self.strata is not None and self.id_col is None:
            df = df.set_index(_to_list(self.strata))
        elif self.strata is None and self.id_col is not None:
            df = df.set_index([id_col])

        events, start, stop = (
            pass_for_numeric_dtypes_or_raise_array(
                df.pop("event")).astype(bool),
            df.pop("start"),
            df.pop("stop"),
        )
        weights = df.pop("__weights").astype(float)

        df = df.astype(float)
        self._check_values(df, events, start, stop)
        return df, events, start, stop, weights
Example #2
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `id_col`, `event_col`,
            `start_col`, and `stop_col` (see below), plus other covariates.
            `event_col` refers to whether the 'death' event was observed:
            1 if observed, 0 else (censored).
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
           the unique identifier per subject.
        event_col: string
           the column in DataFrame that contains the subjects' death
           observation. If left as None, assume all individuals are non-censored.
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
        show_progress: boolean, optional (default: False)
            since the fitter is iterative, show convergence diagnostics.
        robust: boolean, optional (default: False)
            Compute the robust errors using the Huber sandwich estimator, aka the Wei-Lin estimate. This does not handle
            ties, so if there are a high number of ties, results may significantly differ. See
            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similarly to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``params_`` and ``print_summary``

        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self.id_col = id_col
        self.stop_col = stop_col
        self.start_col = start_col
        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")

        df = df.copy()

        if not (id_col in df and event_col in df and start_col in df
                and stop_col in df):
            raise KeyError(
                "A column specified in the call to `fit` does not exist in the DataFrame provided."
            )

        if weights_col is None:
            self.weights_col = None
            assert (
                "__weights" not in df.columns
            ), "__weights is an internal lifelines column, please rename your column first."
            df["__weights"] = 1.0
        else:
            self.weights_col = weights_col
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        df = df.rename(
            columns={
                id_col: "id",
                event_col: "event",
                start_col: "start",
                stop_col: "stop",
                weights_col: "__weights"
            })

        if self.strata is None:
            df = df.set_index("id")
        else:
            df = df.set_index(_to_list(self.strata) +
                              ["id"])  # TODO: needs to be a list
            df = df.sort_index()

        events, start, stop = (
            pass_for_numeric_dtypes_or_raise_array(
                df.pop("event")).astype(bool),
            df.pop("start"),
            df.pop("stop"),
        )
        weights = df.pop("__weights").astype(float)

        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        params_ = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        self.params_ = pd.Series(params_, index=df.columns,
                                 name="coef") / self._norm_std
        self.hazard_ratios_ = pd.Series(np.exp(self.params_),
                                        index=df.columns,
                                        name="exp(coef)")
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(
            self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), events, start,
            stop, weights)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(
            df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({
            "event": events,
            "start": start,
            "stop": stop
        })
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
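A minimal usage sketch for the time-varying fit above, on a hypothetical long-format DataFrame (one row per subject-period; column names and values are illustrative only):

# Hypothetical long-format data; the fit call matches the signature shown above.
import pandas as pd
from lifelines import CoxTimeVaryingFitter

df = pd.DataFrame({
    "id":    [1, 1, 2, 2, 3],
    "start": [0, 4, 0, 6, 0],
    "stop":  [4, 8, 6, 9, 3],
    "event": [0, 1, 0, 1, 0],             # at most the last row per subject is 1
    "z":     [0.1, 0.4, 1.2, 1.5, 0.3],   # a (possibly time-varying) covariate
})

ctv = CoxTimeVaryingFitter()
ctv.fit(df, id_col="id", event_col="event", start_col="start", stop_col="stop")
ctv.print_summary()  # coefficients, standard errors, confidence intervals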
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        entry=None,
        weights=None,
        tol: float = 1e-5,
        show_progress: bool = False,
        **kwargs,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
        also called the Turnbull Estimator.

        Currently, only closed intervals are supported. However, it's easy to create open intervals by adding a very small
        value to the lower bound (or subtracting one from the upper bound). For example, the following turns closed intervals into open intervals.

        >>> left, right = df['left'], df['right']
        >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

        Note
        ------
        This is new and experimental, and many features are missing.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). This can be computed from
             the lower_bound and upper_bound, and can be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timeline (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered the study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.
          tol: float, optional
            minimum difference in log likelihood changes for iterative algorithm.
          show_progress: bool, optional
            display information during fitting.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
        """
        if entry is not None:
            raise NotImplementedError("entry is not supported yet")

        if weights is None:
            weights = np.ones_like(upper_bound)

        self.weights = np.asarray(weights)

        self.upper_bound = np.atleast_1d(
            pass_for_numeric_dtypes_or_raise_array(upper_bound))
        self.lower_bound = np.atleast_1d(
            pass_for_numeric_dtypes_or_raise_array(lower_bound))
        check_nans_or_infs(self.lower_bound)

        self.event_observed = self.lower_bound == self.upper_bound

        self.timeline = coalesce(
            timeline,
            np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

        if (self.upper_bound < self.lower_bound).any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = self.upper_bound == self.lower_bound

        if ((self.lower_bound == self.upper_bound) != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        results = npmle(self.lower_bound,
                        self.upper_bound,
                        verbose=show_progress,
                        tol=tol,
                        weights=weights,
                        **kwargs)
        self.survival_function_ = reconstruct_survival_function(
            *results, self.timeline, label=self._label).loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        """
        self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        self.confidence_interval_survival_function_ = self.confidence_interval_
        self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
        """
        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"
        return self
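A short sketch of the interval-censored fit above; the bounds below are made up. Rows with lower_bound == upper_bound are treated as exactly observed events, rows with lower_bound < upper_bound as interval-censored:

import numpy as np
from lifelines import KaplanMeierFitter

lower = np.array([2.0, 1.0, 3.0, 6.0])  # subjects 0 and 3 observed exactly
upper = np.array([2.0, 4.0, 5.0, 6.0])  # subjects 1 and 2 interval-censored

kmf = KaplanMeierFitter()
kmf.fit_interval_censoring(lower, upper)
print(kmf.survival_function_)  # the Turnbull (NPMLE) estimate on the timeline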
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `id_col`, `event_col`,
            `start_col`, and `stop_col` (see below), plus other covariates.
            `event_col` refers to whether the 'death' event was observed:
            1 if observed, 0 else (censored).
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
           the unique identifier per subject.
        event_col: string
           the column in DataFrame that contains the subjects' death
           observation. If left as None, assume all individuals are non-censored.
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
        show_progress: boolean, optional (default: False)
            since the fitter is iterative, show convergence diagnostics.
        robust: boolean, optional (default: False)
            Compute the robust errors using the Huber sandwich estimator, aka the Wei-Lin estimate. This does not handle
            ties, so if there are a high number of ties, results may significantly differ. See
            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similarly to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``hazards_`` and ``print_summary``

        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.event_col = event_col
        self.id_col = id_col
        self.stop_col = stop_col
        self.start_col = start_col
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

        df = df.copy()

        if not (id_col in df and event_col in df and start_col in df and stop_col in df):
            raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

        if weights_col is None:
            self.weights_col = None
            assert (
                "__weights" not in df.columns
            ), "__weights is an internal lifelines column, please rename your column first."
            df["__weights"] = 1.0
        else:
            self.weights_col = weights_col
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        df = df.rename(
            columns={id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
        )

        if self.strata is None:
            df = df.set_index("id")
        else:
            df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
            df = df.sort_index()

        events, start, stop = (
            pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool),
            df.pop("start"),
            df.pop("stop"),
        )
        weights = df.pop("__weights").astype(float)

        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        hazards_ = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        self.hazards_ = pd.Series(hazards_, index=df.columns, name="coef") / self._norm_std
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
        )
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()
        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
Example #5
    def fit(self,
            df,
            duration_col=None,
            event_col=None,
            ancillary_df=None,
            show_progress=False,
            timeline=None):
        """
        Fit the Weibull accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights, strata).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in dataframe that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in the DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        ancillary_df: None, boolean, or DataFrame, optional (default=None)
            Choose to model the ancillary parameters.
            If None or False, explicitly do not fit the ancillary parameters using any covariates.
            If True, model the ancillary parameters with the same covariates as ``df``.
            If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        Returns
        -------
        self: WeibullAFTFitter
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        Examples
        --------
        >>> import pandas as pd
        >>> from lifelines import WeibullAFTFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E')
        >>> aft.print_summary()
        >>> aft.predict_median(df)
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E', ancillary_df=df)
        >>> aft.print_summary()
        >>> aft.predict_median(df)

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples), index=df.index, name="E")
        )
        self.durations = T.copy()
        self.event_observed = E.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        self._check_values(df, T, E, self.event_col)

        if isinstance(ancillary_df, pd.DataFrame):
            assert ancillary_df.shape[0] == df.shape[
                0], "ancillary_df must be the same shape[0] as df"

            ancillary_df = ancillary_df.copy().drop([duration_col, event_col],
                                                    axis=1,
                                                    errors="ignore")
            self._check_values(ancillary_df, T, E, self.event_col)

        elif (ancillary_df is None) or (ancillary_df is False):
            ancillary_df = pd.DataFrame(np.ones((df.shape[0], )),
                                        index=df.index,
                                        columns=["_intercept"])
        elif ancillary_df is True:
            ancillary_df = df.copy()

        if self.fit_intercept:
            assert "_intercept" not in df
            ancillary_df["_intercept"] = 1.0
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns),
                                                 len(ancillary_df.columns))

        _norm_std, _norm_std_ancillary = df.std(0), ancillary_df.std(0)
        self._norm_mean, self._norm_mean_ancillary = df.mean(
            0), ancillary_df.mean(0)
        # if we included an intercept, we need to make sure we don't divide by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
            _norm_std_ancillary["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0
            _norm_std_ancillary[_norm_std_ancillary < 1e-8] = 1.0

        _index = pd.MultiIndex.from_tuples([("lambda_", c)
                                            for c in df.columns] +
                                           [("rho_", c)
                                            for c in ancillary_df.columns])

        self._norm_std = pd.Series(np.append(_norm_std, _norm_std_ancillary),
                                   index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            normalize(df, 0, _norm_std).values,
            normalize(ancillary_df, 0, _norm_std_ancillary).values,
            show_progress=show_progress,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors()
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df, ancillary_df)

        return self
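Per the `_index` built in the body above, the fitted coefficients land on a (parameter, covariate) MultiIndex. A sketch of reading them back out, assuming the `aft` fitted in the docstring example above:

lambda_coefs = aft.params_.loc["lambda_"]   # regression coefficients for lambda_
rho_coefs = aft.params_.loc["rho_"]         # ancillary (rho_) coefficients
print(aft.params_.loc[("lambda_", "var")])  # one covariate's coefficient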
Example #6
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        show_progress=False,
        entry=None,
        weights=None,
        tol=1e-7,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
        also called the Turnbull Estimator.

        Note
        ------
        This is new and experimental, and many features are missing.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). This can be computed from
             the lower_bound and upper_bound, and can be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timeline (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered the study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
        """
        warnings.warn(
            "This is new and experimental, many feature are missing and accuracy is not reliable",
            UserWarning)

        if entry is not None or weights is not None:
            raise NotImplementedError("entry / weights is not supported yet")
        self.weights = np.ones_like(upper_bound)

        self.upper_bound = np.atleast_1d(
            pass_for_numeric_dtypes_or_raise_array(upper_bound))
        self.lower_bound = np.atleast_1d(
            pass_for_numeric_dtypes_or_raise_array(lower_bound))
        check_nans_or_infs(self.lower_bound)

        self.event_observed = self.lower_bound == self.upper_bound

        self.timeline = coalesce(
            timeline,
            np.unique(np.concatenate((self.upper_bound, self.lower_bound))))

        if (self.upper_bound < self.lower_bound).any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = self.upper_bound == self.lower_bound

        if ((self.lower_bound == self.upper_bound) != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        probs, t_intervals = npmle(self.lower_bound,
                                   self.upper_bound,
                                   verbose=show_progress)
        self.survival_function_ = reconstruct_survival_function(
            probs, t_intervals, self.timeline,
            label=self._label).loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        self.percentile = functools.partial(
            qth_survival_time,
            model_or_survival_function=self.survival_function_)
        """
        self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        self.confidence_interval_survival_function_ = self.confidence_interval_
        self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
        """
        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"
        self._update_docstrings()
        return self
Example #7
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        Examples
        --------

        >>> N, d = 80000, 2
        >>> # some numbers taken from http://statwonk.com/parametric-survival.html
        >>> breakpoints = (1, 31, 34, 62, 65)
        >>> betas = np.array(
        >>>     [
        >>>         [1.0, -0.2, np.log(15)],
        >>>         [5.0, -0.4, np.log(333)],
        >>>         [9.0, -0.6, np.log(18)],
        >>>         [5.0, -0.8, np.log(500)],
        >>>         [2.0, -1.0, np.log(20)],
        >>>         [1.0, -1.2, np.log(500)],
        >>>     ]
        >>> )

        >>> X = 0.1 * np.random.exponential(size=(N, d))
        >>> X = np.c_[X, np.ones(N)]
        >>> T = np.empty(N)
        >>> for i in range(N):
        >>>     lambdas = np.exp(-betas.dot(X[i, :]))
        >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
        >>> T_censor = np.minimum(
        >>>     T.mean() * np.random.exponential(size=N), 110
        >>> )  # 110 is the end of observation, e.g. the current time.
        >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
        >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
        >>> df["E"] = T <= T_censor

        >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
        >>> pew.print_summary()
        >>> pew.plot()

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
        )
        weights = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
            if (self.weights_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
        )
        # check to make sure their weights are okay
        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores?
                        It's important to know that the naive variance estimates of the coefficients are biased.
                        Instead, a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances.
                        See the paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"."""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError("values in weight column %s must be positive." % self.weights_col)

        df = df.astype(float)
        self._check_values(df, T, E, self.event_col)

        E = E.astype(bool)
        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # if we included an intercept, we need to make sure we don't divide by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
        )

        self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

        return self
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        Examples
        --------
        TODO
        >>> from lifelines import WeibullAFTFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E')
        >>> aft.print_summary()
        >>> aft.predict_median(df)
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E', ancillary_df=df)
        >>> aft.print_summary()
        >>> aft.predict_median(df)

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
        )
        weights = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
            if (self.weights_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
        )
        # check to make sure their weights are okay
        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores?
                        It's important to know that the naive variance estimates of the coefficients are biased.
                        Instead, a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances.
                        See the paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"."""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        df = df.astype(float)
        self._check_values(df, T, E, self.event_col)

        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))  # TODO

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # if we included an intercept, we need to make sure we don't divide by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns]
                 for name in self._fitted_parameter_names], []))

        self._norm_std = pd.Series(np.concatenate([_norm_std.values] *
                                                  self.n_breakpoints),
                                   index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(
            df, times=[np.percentile(T, 75)]).T

        return self
    def _fit(
        self,
        log_likelihood_function,
        df,
        Ts,
        regressors,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
        entry_col=None,
    ):

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.weights_col = weights_col
        self.entry_col = entry_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust
        self.regressors = regressors  # TODO name

        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
        )
        weights = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
            if (self.weights_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
        )

        entries = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(entry_col)).astype(float)
            if (entry_col is not None)
            else pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry")
        )

        check_nans_or_infs(E)
        E = E.astype(bool)
        self.event_observed = E.copy()
        self.entry = entries.copy()
        self.weights = weights.copy()

        df = df.astype(float)
        self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)

        _norm_std = df.std(0)
        _norm_std[_norm_std < 1e-8] = 1.0
        df_normalized = normalize(df, 0, _norm_std)

        Xs = self._create_Xs_dict(df_normalized)

        self._LOOKUP_SLICE = self._create_slicer(Xs)

        _index = pd.MultiIndex.from_tuples(
            sum(([(name, col) for col in columns]
                 for name, columns in regressors.items()), []))

        self._norm_std = pd.Series(
            [_norm_std.loc[variable_name] for _, variable_name in _index],
            index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            log_likelihood_function,
            Ts,
            Xs,
            E.values,
            weights.values,
            entries.values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            Ts, E.values, weights.values, entries.values, Xs)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df)
Example #10
    def fit(
        self,
        df,
        duration_col,
        event_col=None,
        regressors=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
        entry_col=None,
    ):
        """
        Fit the accelerated failure time model to a right-censored dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        regressors: TODO

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in DataFrame that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        entry_col: string, optional
            specify a column in the DataFrame that denotes any late-entries (left truncation) that occurred. See
            the docs on `left truncation <https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html#left-truncated-late-entry-data>`__

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        """
        self.duration_col = duration_col
        self._time_cols = [duration_col]

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(
            df.pop(duration_col)).astype(float)
        self.durations = T.copy()

        self._fit(
            self._log_likelihood_right_censoring,
            df,
            (T.values, None),
            event_col=event_col,
            regressors=regressors,
            show_progress=show_progress,
            timeline=timeline,
            weights_col=weights_col,
            robust=robust,
            initial_point=initial_point,
            entry_col=entry_col,
        )

        return self
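The `regressors` argument is left as TODO in the docstring above, but from `_fit`'s use of `regressors.items()` it is a mapping of fitted-parameter names to lists of covariate columns. A hedged sketch of that assumed structure, for a fitter with parameters named `lambda_` and `rho_` and a hypothetical DataFrame `df` with columns "T", "E", "var1", "var2":

# Hypothetical: model lambda_ on both covariates, rho_ on just one.
regressors = {
    "lambda_": ["var1", "var2"],  # covariate columns modeling lambda_
    "rho_": ["var1"],             # covariate columns modeling rho_
}
fitter.fit(df, duration_col="T", event_col="E", regressors=regressors)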
Example #11
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        show_progress=False,
        entry=None,
    ):  # pylint: disable=too-many-arguments
        """
        Parameters
        ----------
        durations: an array, or pd.Series
          length n, duration subject was observed for
        event_observed: numpy array or pd.Series, optional
          length n, True if the death was observed, False if the event was lost (right-censored). Defaults to all True if event_observed is None
        timeline: list, optional
            return the estimate at the values in timeline (positively increasing)
        label: string, optional
            a string to name the column of the estimate.
        alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing
           alpha for this call to fit only.
        ci_labels: list, optional
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
        show_progress: boolean, optional
            since this is an iterative fitting algorithm, switching this to True will display some iteration details.
        entry: an array, or pd.Series, of length n
            relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
            entered the study when they were "born": time zero.

        Returns
        -------
          self
            self with new properties like ``cumulative_hazard_``, ``survival_function_``

        """
        label = coalesce(label, self.__class__.__name__.replace("Fitter", "") + "_estimate")

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(pass_for_numeric_dtypes_or_raise_array(durations))
        # check for negative or 0 durations - these are not allowed in a weibull model.
        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if not self._KNOWN_MODEL:
            self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._initial_values)

        self.event_observed = (
            np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
        )

        self.entry = np.asarray(entry) if entry is not None else np.zeros_like(self.durations)

        if timeline is not None:
            self.timeline = np.sort(np.asarray(timeline).astype(float))
        else:
            self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

        self._label = label
        self._ci_labels = ci_labels
        self.alpha = coalesce(alpha, self.alpha)

        # estimation
        self._fitted_parameters_, self._log_likelihood, self._hessian_ = self._fit_model(
            self.durations, self.event_observed.astype(bool), self.entry, show_progress=show_progress
        )

        if not self._KNOWN_MODEL:
            self._check_cumulative_hazard_is_monotone_and_positive(self.durations, self._fitted_parameters_)

        for param_name, fitted_value in zip(self._fitted_parameter_names, self._fitted_parameters_):
            setattr(self, param_name, fitted_value)

        try:
            self.variance_matrix_ = inv(self._hessian_)
        except np.linalg.LinAlgError:
            self.variance_matrix_ = pinv(self._hessian_)
            warning_text = dedent(
                """\

                The Hessian was not invertible. This could be a model problem:

                1. Are two parameters in the model collinear / exchangeable?
                2. Is the cumulative hazard always non-negative and always non-decreasing?
                3. Are there cusps in the cumulative hazard?

                We will instead approximate it using the pseudo-inverse.

                It's advisable to not trust the variances reported, and to be suspicious of the
                fitted parameters too. Plot the cumulative hazard to help understand
                the latter's bias.
                """
            )
            warnings.warn(warning_text, StatisticalWarning)

        self._predict_label = label
        self._update_docstrings()

        self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame()
        self.hazard_ = self.hazard_at_times(self.timeline).to_frame()
        self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame()

        return self
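This `fit` sits on a parametric univariate fitter base class; a usage sketch through a concrete subclass (the data is made up):

import numpy as np
from lifelines import WeibullFitter

T = np.array([5.0, 3.0, 9.0, 8.0, 7.0, 4.0])  # durations, all positive
E = np.array([1, 1, 1, 1, 0, 1])              # 0 marks right-censoring

wf = WeibullFitter()
wf.fit(T, event_observed=E)
print(wf.lambda_, wf.rho_)    # fitted parameters, set via setattr above
print(wf.survival_function_)  # evaluated on the default timeline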