def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='NA_estimate', alpha=None, ci_labels=None, weights=None):
        """
        Fit the cumulative hazard estimate and store it on ``self``.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only. Only ``None`` falls back to ``self.alpha``.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently. Must expose ``.astype`` (numpy array / pd.Series).

        Returns:
          self, with new properties like 'cumulative_hazard_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        # Non-integer weights only trigger a warning; fitting still proceeds.
        if weights is not None and (weights.astype(int) != weights).any():
            warnings.warn("""It looks like your weights are not integers, possibly prospenity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """, RuntimeWarning)

        v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                self._additive_f, self._variance_f, False)

        # Compare against None (not truthiness) so an explicit falsy alpha such as 0
        # is honoured instead of being silently replaced by self.alpha.
        alpha = alpha if alpha is not None else self.alpha

        # estimates
        self._label = label
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self._cumulative_sq = cumulative_sq_

        # estimation methods
        self._estimation_method = "cumulative_hazard_"
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting
        self.plot_cumulative_hazard = self.plot

        return self
 def _check_values(self, df, events, start, stop):
     """Run the pre-fit validation helpers over the inputs.

     ``df``, ``events``, ``start`` and ``stop`` are forwarded unchanged to the
     ``check_*`` helpers (defined elsewhere). Nothing is returned; the helpers
     are invoked only for their effects -- presumably raising or warning on
     invalid data (TODO confirm against their definitions).
     """
     # check_for_overlapping_intervals(df) # this is currently too slow for production.
     check_nans_or_infs(df)
     check_low_var(df)
     check_complete_separation_low_variance(df, events, self.event_col)
     check_for_numeric_dtypes_or_raise(df)
     check_for_immediate_deaths(events, start, stop)
     check_for_instantaneous_events(start, stop)
 def _check_values(self, df, events, start, stop):
     """Validate the fitting inputs by calling each ``check_*`` helper in turn.

     The arguments are passed straight through to the helpers, which are
     defined outside this block. The method returns ``None``; any feedback
     comes from the helpers themselves (presumably exceptions or warnings --
     verify against their definitions).
     """
     # check_for_overlapping_intervals(df) # this is currently too slow for production.
     check_nans_or_infs(df)
     check_low_var(df)
     check_complete_separation_low_variance(df, events, self.event_col)
     check_for_numeric_dtypes_or_raise(df)
     check_for_immediate_deaths(events, start, stop)
     check_for_instantaneous_events(start, stop)
Exemple #4
0
 def _check_values(df, stop_times_events):
     """Run the validation helpers on ``df`` and ``stop_times_events``.

     ``stop_times_events`` is indexed with ``'event'`` for the separation
     check and otherwise passed whole to the helpers. Returns ``None``.

     NOTE(review): there is no ``self`` parameter -- presumably this is
     decorated as a staticmethod outside the visible span; confirm.
     """
     # check_for_overlapping_intervals(df) # this is currently too slow for production.
     check_nans_or_infs(df)
     check_low_var(df)
     check_complete_separation_low_variance(df, stop_times_events['event'])
     pass_for_numeric_dtypes_or_raise(df)
     check_for_immediate_deaths(stop_times_events)
     check_for_instantaneous_events(stop_times_events)
Exemple #5
0
    def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='Exponential_estimate', alpha=None, ci_labels=None):
        """
        Fit a constant-hazard model: ``lambda_`` is the number of observed events
        divided by the total observed duration, and the survival function is
        ``exp(-lambda_ * t)`` evaluated on the timeline.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing).
             When None, an integer grid from int(min(durations)) to int(max(durations)) is used.
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: accepted for signature compatibility; it is not used by this method.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only. Only ``None`` falls back to ``self.alpha``.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_' and 'lambda_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        if event_observed is not None:
            self.event_observed = np.asarray(event_observed, dtype=int)
        else:
            self.event_observed = np.ones_like(self.durations)

        if timeline is not None:
            self.timeline = np.sort(np.asarray(timeline))
        else:
            self.timeline = np.arange(int(self.durations.min()), int(self.durations.max()) + 1)
        self._label = label

        # Compare against None (not truthiness) so an explicit falsy alpha such as 0
        # is honoured instead of being silently replaced by self.alpha.
        alpha = alpha if alpha is not None else self.alpha

        # estimation
        D = self.event_observed.sum()
        T = self.durations.sum()

        self.lambda_ = D / T
        self._lambda_variance_ = self.lambda_ / T
        self._log_likelihood = np.log(self.lambda_) * D - self.lambda_ * T
        self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline)
        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = 1. / self.lambda_ * (np.log(2))

        # estimation methods
        self._estimate_name = "survival_function_"
        self._predict_label = label
        self._update_docstrings()

        # plotting
        self.plot_survival_function_ = self.plot

        return self
    def _preprocess_dataframe(self, df):
        n, _ = df.shape

        df = df.sort_values(by=self.duration_col)

        # Extract time and event
        T = df.pop(self.duration_col)
        E = df.pop(
            self.event_col) if (self.event_col is not None) else pd.Series(
                np.ones(n), index=df.index, name="E")
        W = (df.pop(self.weights_col) if (self.weights_col is not None) else
             pd.Series(np.ones((n, )), index=df.index, name="weights"))

        # check to make sure their weights are okay
        if self.weights_col:
            if (W.astype(int) != W).any():
                warnings.warn(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased."
""",
                    StatisticalWarning,
                )
            if (W <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        X = df.astype(float)
        T = T.astype(float)

        check_nans_or_infs(E)
        E = E.astype(bool)

        self._check_values(df, T, E)

        if self.fit_intercept:
            assert (
                "_intercept" not in df.columns
            ), "_intercept is an internal lifelines column, please rename your column first."
            X["_intercept"] = 1.0

        return X, T, E, W
    def _preprocess_dataframe(self, df):
        n, _ = df.shape

        df = df.sort_values(by=self.duration_col)

        # Extract time and event
        T = df.pop(self.duration_col)
        E = df.pop(self.event_col) if (self.event_col is not None) else pd.Series(np.ones(n), index=df.index, name="E")
        W = (
            df.pop(self.weights_col)
            if (self.weights_col is not None)
            else pd.Series(np.ones((n,)), index=df.index, name="weights")
        )

        # check to make sure their weights are okay
        if self.weights_col:
            if (W.astype(int) != W).any():
                warnings.warn(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased."
""",
                    StatisticalWarning,
                )
            if (W <= 0).any():
                raise ValueError("values in weight column %s must be positive." % self.weights_col)

        X = df.astype(float)
        T = T.astype(float)

        check_nans_or_infs(E)
        E = E.astype(bool)

        self._check_values(df, T, E)

        if self.fit_intercept:
            assert (
                "_intercept" not in df.columns
            ), "_intercept is an internal lifelines column, please rename your column first."
            X["_intercept"] = 1.0

        return X, T, E, W
Exemple #8
0
 def _check_values(df, T, E):
     """Run the validation helpers over the covariates, durations and events.

     ``df``, ``T`` and ``E`` are passed unchanged to helpers defined
     elsewhere; the function itself returns ``None``.

     NOTE(review): there is no ``self`` parameter -- presumably this is
     decorated as a staticmethod outside the visible span; confirm.
     """
     pass_for_numeric_dtypes_or_raise(df)
     check_nans_or_infs(T)
     check_nans_or_infs(E)
     check_nans_or_infs(df)
     check_low_var(df)
     check_complete_separation(df, E, T)
    def _check_values(self, df, T, E, weights, entries):
        """Validate covariates, durations, events, weights and entry times.

        Parameters
        ----------
        df : covariate data, passed to the ``check_*`` helpers.
        T : durations.
        E : event indicators.
        weights : per-row weights; only inspected when ``self.weights_col`` is set.
        entries : per-row entry times; only inspected when ``self.entry_col`` is set.

        Raises
        ------
        ValueError
            If any weight is not strictly positive.

        Warns
        -----
        StatisticalWarning
            If the weights are not all integer-valued and ``self.robust`` is falsy.
        UserWarning
            If some rows have ``entry > duration``; the message reports how many.
        """
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)
        check_nans_or_infs(T)
        check_nans_or_infs(E)
        check_positivity(T)
        check_complete_separation(df, E, T, self.event_col)

        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        if self.entry_col:
            count_invalid_rows = (entries > T).sum()
            if count_invalid_rows:
                # Interpolate the count: previously the template was emitted verbatim,
                # so users saw a literal "%d" instead of the number of offending rows.
                warnings.warn(
                    """There exist %d rows where entry > duration.""" % count_invalid_rows)
Exemple #10
0
    def _preprocess_dataframe(self, df):
        n, _ = df.shape

        df = df.sort_values(by=self.duration_col)

        # Extract time and event
        T = df.pop(self.duration_col)
        E = df.pop(
            self.event_col) if (self.event_col is not None) else pd.Series(
                np.ones(n), index=df.index, name="E")
        W = (df.pop(self.weights_col) if (self.weights_col is not None) else
             pd.Series(np.ones((n, )), index=df.index, name="weights"))

        # check to make sure their weights are okay
        if self.weights_col:
            if (W.astype(int) != W).any():
                warnings.warn(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased."
""",
                    StatisticalWarning,
                )
            if (W <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        self.regressors = utils.CovariateParameterMappings(
            {"beta_": self.formula}, df, force_intercept=self.fit_intercept)
        X = self.regressors.transform_df(df)["beta_"]

        T = T.astype(float)

        check_nans_or_infs(E)
        E = E.astype(bool)

        self._check_values(df, T, E)

        return X, T, E, W
Exemple #11
0
    def _check_values(self, df, T, E, event_col):
        """Run the validation helpers over covariates, durations and events.

        All arguments are forwarded unchanged to helpers defined elsewhere;
        the method returns ``None``.
        """
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(T)
        check_nans_or_infs(E)
        check_nans_or_infs(df)
        check_complete_separation(df, E, T, event_col)

        # the low-variance check is only run when an intercept is being fitted
        if self.fit_intercept:
            check_low_var(df)
Exemple #12
0
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        label="Weibull_estimate",
        alpha=None,
        ci_labels=None,
        show_progress=False,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the two-parameter model (``lambda_``, ``rho_``) to the observed durations.

        Parameters
        ----------
        durations: an array, or pd.Series
          length n, duration subject was observed for; every value must be
          strictly positive.
        event_observed: numpy array or pd.Series, optional
          length n, True if the death was observed, False if the event
          was lost (right-censored). Defaults to all True when None.
        timeline: list, optional
          return the estimate at the values in timeline (positively increasing).
          When None, an evenly spaced grid of n points between the smallest and
          largest duration is used.
        label: string, optional
          a string to name the column of the estimate.
        alpha: float, optional
          the alpha value in the confidence intervals. Overrides the initializing
          alpha for this call to fit only.
        ci_labels: list, optional
          add custom column names to the generated confidence intervals
          as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
        show_progress: boolean, optional
          since this is an iterative fitting algorithm, switching this to True will display some iteration details.

        Returns
        -------
          self : WeibullFitter
            self with new properties like ``cumulative_hazard_``, ``survival_function_``, ``lambda_``, and ``rho_``.

        Raises
        ------
        ValueError
            If any duration is zero or negative.
        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        observed_times = np.asarray(durations, dtype=float)
        self.durations = observed_times
        # zero or negative durations cannot be handled by this model
        if np.any(observed_times <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if event_observed is None:
            self.event_observed = np.ones_like(observed_times)
        else:
            self.event_observed = np.asarray(event_observed, dtype=int)

        if timeline is None:
            self.timeline = np.linspace(observed_times.min(), observed_times.max(), observed_times.shape[0])
        else:
            self.timeline = np.sort(np.asarray(timeline))

        self._label = label
        if alpha is None:
            alpha = self.alpha

        # estimation
        fitted_params, hessian = self._newton_rhaphson(
            self.durations, self.event_observed, show_progress=show_progress
        )
        self.lambda_, self.rho_ = fitted_params
        self._hessian_ = hessian
        neg_ll = _negative_log_likelihood((self.lambda_, self.rho_), self.durations, self.event_observed)
        self._log_likelihood = -neg_ll
        self.variance_matrix_ = inv(self._hessian_)

        # point estimates evaluated on the timeline, each as a single-column frame
        survival = self.survival_function_at_times(self.timeline)
        self.survival_function_ = survival.to_frame(name=self._label)
        hazard = self.hazard_at_times(self.timeline)
        self.hazard_ = hazard.to_frame(self._label)
        cumulative_hazard = self.cumulative_hazard_at_times(self.timeline)
        self.cumulative_hazard_ = cumulative_hazard.to_frame(self._label)

        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = 1.0 / self.lambda_ * (np.log(2)) ** (1.0 / self.rho_)

        # estimation methods
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting - Cumulative hazard takes priority.
        self.plot_cumulative_hazard = self.plot

        return self
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            label="LogNormal_estimate",
            alpha=0.95,
            ci_labels=None):  # pylint: disable=too-many-arguments
        """
        Fit the model parameters ``mu_`` and ``sigma_`` to the observed durations.

        Parameters
        ----------
          durations: iterable
            an array, or pd.Series, of length n -- duration subject was observed for;
            every value must be strictly positive.
          event_observed: iterable, optional
            an array, list, or pd.Series, of length n -- True if the death was observed, False if the event
            was lost (right-censored). Defaults to all True when None.
          timeline: iterable, optional
            return the best estimate at the values in timelines (positively increasing).
            When None, an evenly spaced grid of n points between the smallest and
            largest duration is used.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals; passed as given to
            ``self._bounds`` (default 0.95).
          ci_labels: list, optional
            add custom column names to the generated confidence intervals
            as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns
        -------
        self : LogNormalFitter
          self, with new properties like 'survival_function_', 'sigma_' and 'mu_'.

        Raises
        ------
        ValueError
            If any duration is zero or negative.
        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        observed_times = np.asarray(durations, dtype=float)
        self.durations = observed_times
        if event_observed is None:
            self.event_observed = np.ones_like(observed_times)
        else:
            self.event_observed = np.asarray(event_observed, dtype=int)

        # zero or negative durations are rejected by this model
        if np.any(observed_times <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if timeline is None:
            self.timeline = np.linspace(observed_times.min(), observed_times.max(), observed_times.shape[0])
        else:
            self.timeline = np.sort(np.asarray(timeline))

        self._label = label

        fitted_params, log_likelihood, variance_matrix = self._fit_model(self.durations, self.event_observed)
        self.mu_, self.sigma_ = fitted_params
        self._log_likelihood = log_likelihood
        self.variance_matrix_ = variance_matrix

        # point estimates evaluated on the timeline, each as a single-column frame
        survival = self.survival_function_at_times(self.timeline)
        self.survival_function_ = survival.to_frame(name=self._label)
        hazard = self.hazard_at_times(self.timeline)
        self.hazard_ = hazard.to_frame(name=self._label)
        cumulative_hazard = self.cumulative_hazard_at_times(self.timeline)
        self.cumulative_hazard_ = cumulative_hazard.to_frame(name=self._label)
        self.median_ = np.exp(self.mu_)
        self.confidence_interval_ = self._bounds(alpha, ci_labels)

        # estimation methods
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting - Cumulative hazard takes priority.
        self.plot_cumulative_hazard = self.plot

        return self
 def _check_values(self, array):
     """Pass ``array`` to ``check_nans_or_infs`` (defined elsewhere); returns ``None``."""
     check_nans_or_infs(array)
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        entry=None,
        weights=None,
        tol: float = 1e-5,
        show_progress: bool = False,
        **kwargs,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
        also called the Turnbull Estimator.

        Currently, only closed intervals are supported. However, it's easy to create open intervals by adding (or subtracting) a very small
        value from the lower-bound (or upper bound). For example, the following turns closed intervals into open intervals.

        >>> left, right = df['left'], df['right']
        >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

        Note
        ------
        This is new and experimental, and many features are missing.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
            True if the death was observed, False if the event was lost (right-censored). This can be computed from
            the lower_bound and upper_bound, and can be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
            relative time when a subject entered the study. Not supported yet: any
            value other than None raises ``NotImplementedError``.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Currently unused here, since
            the confidence-interval computation is disabled.
          ci_labels: tuple, optional
            custom column names for the confidence intervals. Currently unused here.
          weights: an array, list, pd.DataFrame, or pd.Series, optional
            if providing a weighted dataset. For example, instead
            of providing every subject as a single element of `durations` and `event_observed`, one could
            weigh subject differently. Defaults to all ones.
          tol: float, optional
            minimum difference in log likelihood changes for iterative algorithm.
          show_progress: bool, optional
            display information during fitting.
          **kwargs:
            forwarded to ``npmle``.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        Raises
        ------
        NotImplementedError
            If ``entry`` is given.
        ValueError
            If an upper bound is below its lower bound, or if ``event_observed``
            disagrees with ``lower_bound == upper_bound``.
        """
        if entry is not None:
            raise NotImplementedError("entry is not supported yet")

        if weights is None:
            weights = np.ones_like(upper_bound)
        self.weights = np.asarray(weights)

        checked_upper = pass_for_numeric_dtypes_or_raise_array(upper_bound)
        self.upper_bound = np.atleast_1d(checked_upper)
        checked_lower = pass_for_numeric_dtypes_or_raise_array(lower_bound)
        self.lower_bound = np.atleast_1d(checked_lower)
        check_nans_or_infs(self.lower_bound)

        # a row is an exactly-observed event when its interval collapses to a point
        self.event_observed = self.lower_bound == self.upper_bound

        default_timeline = np.unique(np.concatenate((self.upper_bound, self.lower_bound)))
        self.timeline = coalesce(timeline, default_timeline)

        inverted_intervals = self.upper_bound < self.lower_bound
        if inverted_intervals.any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = self.upper_bound == self.lower_bound

        point_intervals = self.lower_bound == self.upper_bound
        if (point_intervals != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        results = npmle(
            self.lower_bound, self.upper_bound, verbose=show_progress, tol=tol, weights=weights, **kwargs
        )
        full_survival = reconstruct_survival_function(*results, self.timeline, label=self._label)
        self.survival_function_ = full_survival.loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        # Confidence intervals are currently disabled:
        # self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        # self.confidence_interval_survival_function_ = self.confidence_interval_
        # self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_

        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"
        return self
 def _check_values(self, array):
     """Delegate validation of ``array`` to ``check_nans_or_infs``; returns ``None``."""
     check_nans_or_infs(array)
 def _check_values(self, X, T, E):
     """Run the validation helpers over the covariates ``X`` and durations ``T``.

     ``E`` is accepted but not inspected by this method. Returns ``None``.
     """
     check_for_numeric_dtypes_or_raise(X)
     check_nans_or_infs(T)
     check_nans_or_infs(X)
Exemple #18
0
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        show_progress=False,
        entry=None,
        weights=None,
        tol=1e-7,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset using non-parametric MLE. This estimator is
        also called the Turnbull Estimator.

        Note
        ------
        This is new and experimental, and many features are missing. A ``UserWarning``
        saying so is emitted on every call.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
            True if the death was observed, False if the event was lost (right-censored). This can be computed from
            the lower_bound and upper_bound, and can be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
            Not supported yet: any value other than None raises ``NotImplementedError``.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Currently unused here, since
            the confidence-interval computation is disabled.
          ci_labels: tuple, optional
            custom column names for the confidence intervals. Currently unused here.
          show_progress: bool, optional
            forwarded to ``npmle`` as ``verbose``.
          weights: an array, list, pd.DataFrame, or pd.Series, optional
            Not supported yet: any value other than None raises ``NotImplementedError``.
          tol: float, optional
            accepted but not used by this method.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        Raises
        ------
        NotImplementedError
            If ``entry`` or ``weights`` is given.
        ValueError
            If an upper bound is below its lower bound, or if ``event_observed``
            disagrees with ``lower_bound == upper_bound``.
        """
        warnings.warn(
            "This is new and experimental, many feature are missing and accuracy is not reliable", UserWarning
        )

        unsupported_args = entry is not None or weights is not None
        if unsupported_args:
            raise NotImplementedError("entry / weights is not supported yet")
        self.weights = np.ones_like(upper_bound)

        checked_upper = pass_for_numeric_dtypes_or_raise_array(upper_bound)
        self.upper_bound = np.atleast_1d(checked_upper)
        checked_lower = pass_for_numeric_dtypes_or_raise_array(lower_bound)
        self.lower_bound = np.atleast_1d(checked_lower)
        check_nans_or_infs(self.lower_bound)

        # a row is an exactly-observed event when its interval collapses to a point
        self.event_observed = self.lower_bound == self.upper_bound

        default_timeline = np.unique(np.concatenate((self.upper_bound, self.lower_bound)))
        self.timeline = coalesce(timeline, default_timeline)

        inverted_intervals = self.upper_bound < self.lower_bound
        if inverted_intervals.any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = self.upper_bound == self.lower_bound

        point_intervals = self.lower_bound == self.upper_bound
        if (point_intervals != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        # NOTE(review): `tol` is not forwarded to npmle here -- confirm whether that is intended.
        probs, t_intervals = npmle(self.lower_bound, self.upper_bound, verbose=show_progress)
        full_survival = reconstruct_survival_function(probs, t_intervals, self.timeline, label=self._label)
        self.survival_function_ = full_survival.loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        self.percentile = functools.partial(
            qth_survival_time, model_or_survival_function=self.survival_function_
        )
        # Confidence intervals are currently disabled:
        # self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        # self.confidence_interval_survival_function_ = self.confidence_interval_
        # self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_

        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"
        self._update_docstrings()
        return self
Exemple #19
0
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        left_censorship=False,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          duration: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (postively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like 'survival_function_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        if weights is not None:
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly prospenity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"
        v = _preprocess_inputs(durations, event_observed, timeline, entry,
                               weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(
            self, estimate_name,
            pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None],
                                                 alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate,
                                             left_censorship=left_censorship)

        # estimation methods
        self._estimation_method = estimate_name
        self._estimate_name = estimate_name
        self._predict_label = label
        self._update_docstrings()

        # plotting functions
        setattr(self, "plot_" + estimate_name, self.plot)
        return self
Exemple #20
0
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            entry=None,
            label='Weibull_estimate',
            alpha=None,
            ci_labels=None,
            show_progress=False):
        """
        Fit the model to strictly positive durations and store the estimates on ``self``.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          timeline: return the estimate at the values in timeline (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study.
             NOTE(review): this argument is accepted but not read anywhere in this method.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          show_progress: since this is an iterative fitting algorithm, switching this to True will display some iteration details.
        Returns:
          self, with new properties like 'cumulative_hazard_', 'survival_function_', 'lambda_' and 'rho_'.
        Raises:
          ValueError: if any duration is zero or negative.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        # zero or negative durations are rejected up front
        if (self.durations <= 0).any():
            raise ValueError(
                'This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements.'
            )

        if event_observed is None:
            # no censoring information: treat every subject as observed
            self.event_observed = np.ones_like(self.durations)
        else:
            self.event_observed = np.asarray(event_observed, dtype=int)

        if timeline is None:
            # default grid: as many evenly spaced points as there are durations
            self.timeline = np.linspace(self.durations.min(),
                                        self.durations.max(),
                                        self.durations.shape[0])
        else:
            self.timeline = np.sort(np.asarray(timeline))

        self._label = label
        if alpha is None:
            alpha = self.alpha

        # estimation
        fitted, hessian = self._newton_rhaphson(
            self.durations, self.event_observed, show_progress=show_progress)
        self.lambda_, self.rho_ = fitted
        self._hessian_ = hessian

        params = (self.lambda_, self.rho_)
        self._log_likelihood = -_negative_log_likelihood(
            params, self.durations, self.event_observed)
        self.variance_matrix_ = -inv(self._hessian_)

        def _as_frame(values):
            # single-column frame named after the label, indexed by the timeline
            return pd.DataFrame(values,
                                columns=[self._label],
                                index=self.timeline)

        self.survival_function_ = _as_frame(
            self.survival_function_at_times(self.timeline))
        self.hazard_ = _as_frame(self.hazard_at_times(self.timeline))
        self.cumulative_hazard_ = _as_frame(
            self.cumulative_hazard_at_times(self.timeline))
        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = 1. / self.lambda_ * (np.log(2))**(1. / self.rho_)

        # estimation methods
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting - Cumulative hazard takes priority.
        self.plot_cumulative_hazard = self.plot

        return self
 def _check_values(self, X, T, E):
     """Run the input validators: the numeric-dtype check on X, then the NaN/inf check on T, E and X in that order."""
     pass_for_numeric_dtypes_or_raise(X)
     for values in (T, E, X):
         check_nans_or_infs(values)
    def fit(
        self, durations, event_observed=None, timeline=None, label="Exponential_estimate", alpha=None, ci_labels=None
    ):  # pylint: disable=too-many-arguments
        """
        Fit the model in closed form and store the estimates on ``self``.

        The rate ``lambda_`` is the number of observed events divided by the total
        observed duration.

        Parameters
        ----------
          durations: iterable
            an array, or pd.Series, of length n -- duration subject was observed for
          event_observed: iterable, optional
            an array, list, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          timeline: iterable, optional
            return the best estimate at the values in timeline (positively increasing)
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only. Only ``None`` falls back to ``self.alpha``.
          ci_labels: list, optional
            add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns
        -------
        self : ExponentialFitter
          self, with new properties like 'survival_function_', 'cumulative_hazard_', and 'lambda_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        self.event_observed = (
            np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
        )

        if timeline is not None:
            self.timeline = np.sort(np.asarray(timeline))
        else:
            # default grid: as many evenly spaced points as there are durations
            self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

        self._label = label
        # Compare against None explicitly so that an explicit falsy alpha (e.g. 0) is not
        # silently replaced by the instance default.
        alpha = alpha if alpha is not None else self.alpha

        # estimation
        D = self.event_observed.sum()  # number of observed events
        T = self.durations.sum()  # total observed time

        self.lambda_ = D / T
        self._lambda_variance_ = self.lambda_ / T
        self._log_likelihood = np.log(self.lambda_) * D - self.lambda_ * T
        self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame(self._label)
        self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame(self._label)
        self.hazard_ = self.hazard_at_times(self.timeline).to_frame(self._label)

        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = 1.0 / self.lambda_ * (np.log(2))

        # estimation methods
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting
        self.plot_cumulative_hazards_ = self.plot

        return self
    def _fit(
        self,
        log_likelihood_function,
        df,
        Ts,
        regressors,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
        entry_col=None,
    ):
        """
        Shared fitting routine: pull the event/weight/entry columns out of ``df``,
        normalize the remaining columns, fit the model, and store the results on ``self``.

        Parameters
        ----------
        log_likelihood_function:
            forwarded unchanged to ``self._fit_model``.
        df: DataFrame
            the data; the event, weights and entry columns (when named) are popped from it
            before the remainder is cast to float.
        Ts:
            indexable with at least two elements; forwarded to ``self._fit_model`` and
            ``self._compute_standard_errors``.
        regressors: dict
            mapping of parameter name to an iterable of column names in ``df``.
        event_col, weights_col, entry_col: string, optional
            column names in ``df``. When None, defaults of all-True events, unit weights
            and zero entries are used respectively.
        show_progress, initial_point:
            forwarded to ``self._fit_model``.
        timeline, robust:
            stored on ``self`` as-is.

        Nothing is returned explicitly; results are set as attributes (``params_``,
        ``variance_matrix_``, ``standard_errors_``, ``confidence_intervals_``, ...).
        """

        timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        self._time_fit_was_called = timestamp + " UTC"
        self.weights_col = weights_col
        self.entry_col = entry_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust
        self.regressors = regressors  # TODO name

        # Pop order matters: event column first, then weights, then entries.
        if self.event_col is not None:
            E = pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        else:
            E = pd.Series(np.ones(self._n_examples, dtype=bool),
                          index=df.index,
                          name="E")

        if self.weights_col is not None:
            weights = pass_for_numeric_dtypes_or_raise_array(
                df.pop(self.weights_col)).astype(float)
        else:
            weights = pd.Series(np.ones(self._n_examples, dtype=float),
                                index=df.index,
                                name="weights")

        if entry_col is not None:
            entries = pass_for_numeric_dtypes_or_raise_array(
                df.pop(entry_col)).astype(float)
        else:
            entries = pd.Series(np.zeros(self._n_examples, dtype=float),
                                index=df.index,
                                name="entry")

        check_nans_or_infs(E)
        E = E.astype(bool)
        self.event_observed = E.copy()
        self.entry = entries.copy()
        self.weights = weights.copy()

        df = df.astype(float)
        self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)

        # Per-column scale; (near-)constant columns get a scale of 1 to avoid dividing by ~0.
        column_std = df.std(0)
        column_std[column_std < 1e-8] = 1.0
        df_normalized = normalize(df, 0, column_std)

        Xs = self._create_Xs_dict(df_normalized)

        self._LOOKUP_SLICE = self._create_slicer(Xs)

        # One (parameter name, column name) pair per regressor column.
        param_index = pd.MultiIndex.from_tuples(
            [(name, col)
             for name, columns in regressors.items()
             for col in columns])

        self._norm_std = pd.Series(
            [column_std.loc[variable_name] for _, variable_name in param_index],
            index=param_index)

        fitted_params, self._log_likelihood, self._hessian_ = self._fit_model(
            log_likelihood_function,
            Ts,
            Xs,
            E.values,
            weights.values,
            entries.values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        # Undo the normalization so the parameters are on the original column scale.
        self.params_ = fitted_params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            Ts, E.values, weights.values, entries.values, Xs)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df)
Exemple #24
0
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        show_progress=False,
        entry=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the model to strictly positive durations and store the estimates on ``self``.

        Parameters
        ----------
        durations: an array, or pd.Series
          length n, duration subject was observed for
        event_observed: numpy array or pd.Series, optional
          length n, True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
        timeline: list, optional
            return the estimate at the values in timeline (positively increasing)
        label: string, optional
            a string to name the column of the estimate. When None, the class name with
            "Fitter" removed, followed by "_estimate", is used.
        alpha: float, optional
            the alpha value in the confidence intervals. It is combined with the existing
            ``self.alpha`` through ``coalesce`` and the result is stored back on ``self.alpha``.
        ci_labels: list, optional
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
        show_progress: boolean, optional
            since this is an iterative fitting algorithm, switching this to True will display some iteration details.
        entry: an array, or pd.Series, of length n
            relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
            entered study when they were "born": time zero.

        Returns
        -------
          self
            self with new properties like ``cumulative_hazard_``, ``survival_function_``

        Raises
        ------
          ValueError
            if any duration is zero or negative.

        """
        default_label = self.__class__.__name__.replace("Fitter",
                                                        "") + "_estimate"
        label = coalesce(label, default_label)

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        # zero or negative durations are rejected up front
        if (self.durations <= 0).any():
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if not self._KNOWN_MODEL:
            # sanity-check the cumulative hazard at the starting values before fitting
            self._check_cumulative_hazard_is_monotone_and_positive(
                self.durations, self._initial_values)

        if event_observed is None:
            # no censoring information: treat every subject as observed
            self.event_observed = np.ones_like(self.durations)
        else:
            self.event_observed = np.asarray(event_observed, dtype=int)

        if entry is None:
            self.entry = np.zeros_like(self.durations)
        else:
            self.entry = np.asarray(entry)

        if timeline is None:
            # default grid: as many evenly spaced points as there are durations
            self.timeline = np.linspace(self.durations.min(),
                                        self.durations.max(),
                                        self.durations.shape[0])
        else:
            self.timeline = np.sort(np.asarray(timeline))

        self._label = label
        self._ci_labels = ci_labels
        self.alpha = coalesce(alpha, self.alpha)

        # estimation
        self._fitted_parameters_, self._log_likelihood, self._hessian_ = self._fit_model(
            self.durations,
            self.event_observed.astype(bool),
            self.entry,
            show_progress=show_progress)

        if not self._KNOWN_MODEL:
            # repeat the sanity check at the fitted values
            self._check_cumulative_hazard_is_monotone_and_positive(
                self.durations, self._fitted_parameters_)

        # expose each fitted parameter as an attribute under its own name
        named_parameters = zip(self._fitted_parameter_names,
                               self._fitted_parameters_)
        for attribute, value in named_parameters:
            setattr(self, attribute, value)

        try:
            self.variance_matrix_ = inv(self._hessian_)
        except np.linalg.LinAlgError:
            # singular hessian: fall back to the pseudo-inverse and warn the user
            self.variance_matrix_ = pinv(self._hessian_)
            warning_text = dedent("""\
                
                The hessian was not invertable. This could be a model problem: 

                1. Are two parameters in the model colinear / exchangeable? 
                2. Is the cumulative hazard always non-negative and always non-decreasing?
                3. Are there cusps/ in the cumulative hazard? 

                We will instead approximate it using the psuedo-inverse. 

                It's advisable to not trust the variances reported, and to be suspicious of the
                fitted parameters too. Perform plots of the cumulative hazard to help understand
                the latter's bias.
                """)
            warnings.warn(warning_text, StatisticalWarning)

        self._predict_label = label
        self._update_docstrings()

        at_timeline = self.timeline
        self.survival_function_ = self.survival_function_at_times(
            at_timeline).to_frame()
        self.hazard_ = self.hazard_at_times(at_timeline).to_frame()
        self.cumulative_hazard_ = self.cumulative_hazard_at_times(
            at_timeline).to_frame()

        return self