Example #1
    def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate',
            alpha=None, left_censorship=False, ci_labels=None):
        """
        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>


        Returns:
          self, with new properties like 'survival_function_'.

        """
        # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
        estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'
        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                   self._additive_f, self._additive_var,
                                                                   left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths are the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

        # estimation
        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate)

        # estimation methods
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        setattr(self, "plot_" + estimate_name, self.plot)
        self.plot_loglogs = plot_loglogs(self)
        return self
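
A minimal usage sketch for the Kaplan-Meier `fit` above, with made-up durations and event indicators (data and label are hypothetical):

from lifelines import KaplanMeierFitter

durations = [2.5, 4.0, 4.0, 5.0, 6.0, 6.5]   # hypothetical observation times
event_observed = [1, 1, 1, 1, 0, 0]          # 1 = death observed, 0 = right-censored

kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed, label='demo')
print(kmf.survival_function_.head())   # one column, named 'demo'
print(kmf.median_)                     # median survival time, set by this version of fit

Example #2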
    def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='NA_estimate', alpha=None, ci_labels=None, weights=None):
        """
        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns:
          self, with new properties like 'cumulative_hazard_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        if weights is not None:
            if (weights.astype(int) != weights).any():
                warnings.warn("""It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """, RuntimeWarning)

        v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                self._additive_f, self._variance_f, False)

        # estimates
        self._label = label
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
        self._cumulative_sq = cumulative_sq_

        # estimation methods
        self._estimation_method = "cumulative_hazard_"
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting
        self.plot_cumulative_hazard = self.plot

        return self
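
A sketch of the weighted path above; non-integer weights (e.g. inverse-probability-of-treatment weights) trigger the RuntimeWarning about biased naive variances (data is hypothetical):

import numpy as np
from lifelines import NelsonAalenFitter

durations = [2, 4, 6, 7]
event_observed = [1, 1, 0, 1]
weights = np.asarray([0.8, 1.2, 2.5, 1.0])   # non-integer weights -> warning above; an array is
                                             # required because fit calls weights.astype directly

naf = NelsonAalenFitter()
naf.fit(durations, event_observed, weights=weights)
print(naf.cumulative_hazard_)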
Example #3
    def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate',
            alpha=None, left_censorship=False, ci_labels=None):
        """
        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>


        Returns:
          self, with new properties like 'survival_function_'.

        """
        # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
        estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'
        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                   self._additive_f, self._additive_var,
                                                                   left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths are the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

        # estimation
        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate)

        # estimation methods
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        setattr(self, "plot_" + estimate_name, self.plot)
        return self
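
A sketch of the left-censorship branch above, with hypothetical data (e.g. measurement times where 0 marks a value below a detection limit):

from lifelines import KaplanMeierFitter

durations = [3, 5, 7, 8, 10]
event_observed = [1, 0, 1, 1, 0]   # 0 = left-censored in this mode

kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed, left_censorship=True)
print(kmf.cumulative_density_)     # survival_function_ is not set in this branch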
Example #4
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            entry=None,
            label='NA_estimate',
            alpha=None,
            ci_labels=None):
        """
        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'cumulative_hazard_'.

        """

        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._variance_f, False)

        # estimates
        self._label = label
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_,
                                               columns=[self._label])
        self.confidence_interval_ = self._bounds(
            cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
        self._cumulative_sq = cumulative_sq_

        # estimation functions
        self.predict = self._predict("cumulative_hazard_", self._label)
        self.subtract = self._subtract("cumulative_hazard_")
        self.divide = self._divide("cumulative_hazard_")

        # plotting
        self.plot = self._plot_estimate("cumulative_hazard_")
        self.plot_cumulative_hazard = self.plot
        self.plot_hazard = self._plot_estimate('hazard_')

        return self
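
A sketch of pinning the Nelson-Aalen estimate to a fixed grid via `timeline` (hypothetical data):

import numpy as np
from lifelines import NelsonAalenFitter

naf = NelsonAalenFitter()
naf.fit([3, 5, 8, 9], event_observed=[1, 1, 0, 1],
        timeline=np.arange(0, 10, 1.0))
print(naf.cumulative_hazard_.index)   # matches the supplied timeline

Example #5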
    def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='NA_estimate', alpha=None, ci_labels=None):
        """
        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'cumulative_hazard_'.

        """

        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                self._additive_f, self._variance_f, False)

        # estimates
        self._label = label
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
        self._cumulative_sq = cumulative_sq_

        # estimation functions
        self.predict = self._predict("cumulative_hazard_", self._label)
        self.subtract = self._subtract("cumulative_hazard_")
        self.divide = self._divide("cumulative_hazard_")

        # plotting
        self.plot = self._plot_estimate("cumulative_hazard_")
        self.plot_cumulative_hazard = self.plot
        self.plot_hazard = self._plot_estimate('hazard_')

        return self
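
A sketch of the `entry` argument for left-truncated data; subjects with entry > 0 joined the study late (values are hypothetical):

from lifelines import NelsonAalenFitter

durations = [5, 8, 9, 12]
event_observed = [1, 0, 1, 1]
entry = [0, 2, 2, 4]          # relative time each subject entered the study

naf = NelsonAalenFitter()
naf.fit(durations, event_observed, entry=entry)
print(naf.event_table)        # includes an 'entrance' column

Example #6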
    def fit(
        self,
        durations,
        event_observed,
        event_of_interest,
        timeline=None,
        entry=None,
        label="AJ_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array or pd.Series of length n -- duration the subject was observed for
          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
             only positive integers, where 0 indicates censoring.
          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
          timeline: return the best estimate at the values in timelines (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns
        -------
        self : AalenJohansenFitter
          self, with new properties like ``cumulative_incidence_``.
        """
        # Checking for tied event times
        ties = self._check_for_duplicates(durations=durations, events=event_observed)

        if ties:
            warnings.warn(
                dedent(
                    """Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times.
                To resolve ties, data is randomly jittered."""
                ),
                Warning,
            )
            durations = self._jitter(
                durations=pd.Series(durations),
                event=pd.Series(event_observed),
                jitter_level=self._jitter_level,
                seed=self._seed,
            )

        alpha = alpha if alpha else self.alpha

        # Creating label for event of interest & indicator for that event
        event_of_interest = int(event_of_interest)
        cmprisk_label = "CIF_" + str(event_of_interest)
        self.label_cmprisk = "observed_" + str(event_of_interest)

        # Fitting Kaplan-Meier for either event of interest OR competing risk
        km = KaplanMeierFitter().fit(
            durations, event_observed=event_observed, timeline=timeline, entry=entry, weights=weights
        )
        aj = km.event_table
        aj["overall_survival"] = km.survival_function_
        aj["lagged_overall_survival"] = aj["overall_survival"].shift()

        # Setting up table for calculations and to return to user
        event_spec = pd.Series(event_observed) == event_of_interest
        self.durations, self.event_observed, *_, event_table = _preprocess_inputs(
            durations=durations, event_observed=event_spec, timeline=timeline, entry=entry, weights=weights
        )
        event_spec_times = event_table["observed"]
        event_spec_times = event_spec_times.rename(self.label_cmprisk)
        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

        # Estimator of Cumulative Incidence (Density) Function
        aj[cmprisk_label] = (aj[self.label_cmprisk] / aj["at_risk"] * aj["lagged_overall_survival"]).cumsum()
        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
        aj = aj.set_index("event_at")

        # Setting attributes
        self._estimation_method = "cumulative_density_"
        self._estimate_name = "cumulative_density_"
        self._update_docstrings()

        self._label = label
        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])

        # Technically, cumulative incidence, but consistent with KaplanMeierFitter
        self.event_table = aj[
            ["removed", "observed", self.label_cmprisk, "censored", "entrance", "at_risk"]
        ]  # Event table

        if self._calc_var:
            self.variance_, self.confidence_interval_ = self._bounds(
                aj["lagged_overall_survival"], alpha=alpha, ci_labels=ci_labels
            )
        else:
            self.variance_, self.confidence_interval_ = None, None

        return self
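
A minimal sketch of the competing-risks interface above, with a hypothetical event coding (0 = censored, 1 = event of interest, 2 = competing event):

from lifelines import AalenJohansenFitter

durations = [1.1, 2.3, 2.9, 4.0, 5.5, 6.1]   # unique times, so no jitter warning
events = [1, 0, 2, 1, 2, 0]

ajf = AalenJohansenFitter()
ajf.fit(durations, events, event_of_interest=1)
print(ajf.cumulative_density_)               # CIF for event 1, column 'CIF_1'

Example #7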
    def fit(
        self,
        durations,
        event_observed,
        event_of_interest,
        timeline=None,
        entry=None,
        label="AJ_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array or pd.Series of length n -- duration the subject was observed for
          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
             only positive integers, where 0 indicates censoring.
          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
          timeline: return the best estimate at the values in timelines (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns
        -------
        self : AalenJohansenFitter
          self, with new properties like ``cumulative_incidence_``.
        """
        # Checking for tied event times
        ties = self._check_for_duplicates(durations=durations, events=event_observed)

        if ties:
            warnings.warn(
                """Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times.
                To resolve ties, data is randomly jittered.""",
                Warning,
            )
            durations = self._jitter(
                durations=pd.Series(durations),
                event=pd.Series(event_observed),
                jitter_level=self._jitter_level,
                seed=self._seed,
            )

        # Creating label for event of interest & indicator for that event
        cmprisk_label = "CIF_" + str(int(event_of_interest))
        self.label_cmprisk = "observed_" + str(int(event_of_interest))

        # Fitting Kaplan-Meier for either event of interest OR competing risk
        km = KaplanMeierFitter()
        km.fit(durations, event_observed=event_observed, timeline=timeline, entry=entry, weights=weights)
        aj = km.event_table
        aj["overall_survival"] = km.survival_function_
        aj["lagged_overall_survival"] = aj["overall_survival"].shift()

        # Setting up table for calculations and to return to user
        event_spec = np.where(pd.Series(event_observed) == event_of_interest, 1, 0)
        event_spec_proc = _preprocess_inputs(
            durations=durations, event_observed=event_spec, timeline=timeline, entry=entry, weights=weights
        )
        event_spec_times = event_spec_proc[-1]["observed"]
        event_spec_times = event_spec_times.rename(self.label_cmprisk)
        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

        # Estimator of Cumulative Incidence (Density) Function
        aj[cmprisk_label] = ((aj[self.label_cmprisk]) / (aj["at_risk"]) * aj["lagged_overall_survival"]).cumsum()
        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
        aj = aj.set_index("event_at")

        # Setting attributes
        self._estimation_method = "cumulative_density_"
        self._estimate_name = "cumulative_density_"
        self._predict_label = label
        self._update_docstrings()

        alpha = alpha if alpha else self.alpha
        self._label = label
        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])
        # Technically, cumulative incidence, but consistent with KaplanMeierFitter
        self.event_table = aj[
            ["removed", "observed", self.label_cmprisk, "censored", "entrance", "at_risk"]
        ]  # Event table
        if self._calc_var:
            self.variance, self.confidence_interval_ = self._bounds(
                aj["lagged_overall_survival"], alpha=alpha, ci_labels=ci_labels
            )
        return self
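
A sketch of customizing the confidence-interval column names via `ci_labels`; note the interval is only computed when variance calculation was enabled at construction (the `self._calc_var` check above). Data is hypothetical:

from lifelines import AalenJohansenFitter

durations = [1.5, 2.3, 3.1, 4.8, 6.0]
events = [2, 0, 1, 2, 1]   # 0 = censored

ajf = AalenJohansenFitter()
ajf.fit(durations, events, event_of_interest=2,
        ci_labels=['CIF_2_lower', 'CIF_2_upper'])
print(ajf.confidence_interval_.columns)

Example #8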
    def _fit(self,
             durations,
             event_observed=None,
             timeline=None,
             entry=None,
             label=None,
             alpha=None,
             ci_labels=None,
             weights=None):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        """
        durations = np.asarray(durations)
        self._check_values(durations)

        if event_observed is not None:
            event_observed = np.asarray(event_observed)
            self._check_values(event_observed)

        self._label = coalesce(label, self._label, "KM_estimate")

        if weights is not None:
            weights = np.asarray(weights)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )
        else:
            weights = np.ones_like(durations, dtype=float)

        # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        (self.durations, self.event_observed, self.timeline, self.entry,
         self.event_table,
         self.weights) = _preprocess_inputs(durations, event_observed,
                                            timeline, entry, weights)

        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, is_left_censoring)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths are the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(self, primary_estimate_name,
                pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name,
                pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(
            cumulative_sq_.values[:, None], alpha, ci_labels)
        self._median = median_survival_times(self.survival_function_)
        self._cumulative_sq_ = cumulative_sq_

        setattr(self, "confidence_interval_" + primary_estimate_name,
                self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name,
                1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name

        return self
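
This `_fit` is internal: the `CensoringType.is_left_censoring(self)` check above implies the censoring type is recorded by a public wrapper before `_fit` runs. A sketch, assuming the usual lifelines wrapper names:

from lifelines import KaplanMeierFitter

durations = [3, 5, 7, 8]
event_observed = [1, 0, 1, 1]

kmf = KaplanMeierFitter()
kmf.fit_left_censoring(durations, event_observed)   # marks left censoring, then delegates to _fit
print(kmf.cumulative_density_)    # primary estimate under left censoring
print(kmf.survival_function_)     # secondary estimate, 1 - exp(log_estimate)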
Example #9
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        left_censorship=False,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like 'survival_function_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        if weights is not None:
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly prospenity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
        estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"
        v = _preprocess_inputs(durations, event_observed, timeline, entry,
                               weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths are the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(
            self, estimate_name,
            pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None],
                                                 alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate,
                                             left_censorship=left_censorship)

        # estimation methods
        self._estimation_method = estimate_name
        self._estimate_name = estimate_name
        self._predict_label = label
        self._update_docstrings()

        # plotting functions
        setattr(self, "plot_" + estimate_name, self.plot)
        return self
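
A sketch of integer case weights with the Kaplan-Meier `fit` above (non-integer weights trigger the propensity-score warning); values are hypothetical:

import numpy as np
from lifelines import KaplanMeierFitter

durations = [3, 3, 7, 10]
event_observed = [1, 1, 0, 1]
weights = np.asarray([4, 2, 6, 1])   # each row stands for this many subjects

kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed, weights=weights)
print(kmf.event_table)               # counts reflect the weights

Example #10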
    def fit(self,
            durations,
            event_observed,
            event_of_interest,
            timeline=None,
            entry=None,
            label='AJ_estimate',
            alpha=None,
            ci_labels=None,
            weights=None):
        """
        Parameters:
          durations: an array or pd.Series of length n -- duration the subject was observed for
          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be 
             only positive integers, where 0 indicates censoring.
          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
          timeline: return the best estimate at the values in timelines (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subjects differently.

        Returns:
          self, with new properties like 'cumulative_incidence_'.
        """
        # Checking for tied event times
        if np.sum(pd.Series(durations).duplicated()) > 0:
            # Seeing if there is a large amount of ties in the data (>20%)
            if np.sum(
                    pd.Series(durations).duplicated()) / len(durations) > 0.2:
                warnings.warn(
                    '''It looks like there are many tied events in your data set. The Aalen-Johansen 
                              estimator should only be used when there are no/few tied events''',
                    Warning)
                # I am unaware of a recommended cut-off, but 20% would be suggestive of issues
            # Raise warning if duplicated times, then randomly jitter times
            warnings.warn(
                '''Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times. 
                To resolve ties, data is randomly jittered.''', Warning)
            durations = self._jitter(durations=pd.Series(durations),
                                     event=pd.Series(event_observed),
                                     jitter_level=self._jitter_level,
                                     seed=self._seed)

        # Creating label for event of interest & indicator for that event
        cmprisk_label = 'CIF_' + str(int(event_of_interest))
        self.label_cmprisk = 'observed_' + str(int(event_of_interest))

        # Fitting Kaplan-Meier for either event of interest OR competing risk
        km = KaplanMeierFitter()
        km.fit(durations,
               event_observed=event_observed,
               timeline=timeline,
               entry=entry,
               weights=weights)
        aj = km.event_table
        aj['overall_survival'] = km.survival_function_
        aj['lagged_overall_survival'] = aj['overall_survival'].shift()

        # Setting up table for calculations and to return to user
        event_spec = np.where(
            pd.Series(event_observed) == event_of_interest, 1, 0)
        event_spec_proc = _preprocess_inputs(durations=durations,
                                             event_observed=event_spec,
                                             timeline=timeline,
                                             entry=entry,
                                             weights=weights)
        event_spec_times = event_spec_proc[-1]['observed']
        event_spec_times = event_spec_times.rename(self.label_cmprisk)
        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

        # Estimator of Cumulative Incidence (Density) Function
        aj[cmprisk_label] = ((aj[self.label_cmprisk]) / (aj['at_risk']) *
                             aj['lagged_overall_survival']).cumsum()
        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
        aj = aj.set_index('event_at')

        # Setting attributes
        self._estimation_method = "cumulative_density_"
        self._estimate_name = "cumulative_density_"
        self._predict_label = label
        self._update_docstrings()

        alpha = alpha if alpha else self.alpha
        self._label = label
        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])
        # Technically, cumulative incidence, but consistent with KaplanMeierFitter
        self.event_table = aj[[
            'removed', 'observed', self.label_cmprisk, 'censored', 'entrance',
            'at_risk'
        ]]  # Event table
        self.variance, self.confidence_interval_ = self._bounds(
            aj['lagged_overall_survival'], alpha=alpha, ci_labels=ci_labels)
        return self
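
A sketch that exercises the tie handling above: a duplicated event time raises the jitter warning before fitting. The `seed` constructor argument is an assumption here, suggested by the use of `self._seed` in `_jitter`:

from lifelines import AalenJohansenFitter

durations = [2, 2, 3, 5, 6]   # time 2 is tied -> expect the jitter warning
events = [1, 2, 1, 0, 1]

ajf = AalenJohansenFitter(seed=42)   # seed assumed to be a constructor parameter
ajf.fit(durations, events, event_of_interest=1)
print(ajf.cumulative_density_)

Example #11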
    def _fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median``

        """
        self._check_values(durations)
        if event_observed is not None:
            self._check_values(event_observed)

        self._label = label

        if weights is not None:
            weights = np.asarray(weights)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # if the user is interested in left-censorship, we return the cumulative_density_, not the survival_function_
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = _preprocess_inputs(
            durations, event_observed, timeline, entry, weights
        )

        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
        )

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that the number of patients at risk and the number of deaths are the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
            if net_population.iloc[: int(n / 2)].min() == 0:
                ix = net_population.iloc[: int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix
                )

        # estimation
        setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate, left_censorship=is_left_censoring)
        self._cumulative_sq_ = cumulative_sq_

        setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name
        self._update_docstrings()

        return self
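
This last version also mirrors the confidence intervals onto both estimate names. A sketch of reading them back, assuming `_fit` is reached through the public `fit` wrapper (data is hypothetical):

from lifelines import KaplanMeierFitter

kmf = KaplanMeierFitter()
kmf.fit([4, 6, 6, 9, 11], [1, 1, 0, 1, 0])
print(kmf.confidence_interval_survival_function_)
print(kmf.confidence_interval_cumulative_density_)   # elementwise 1 - the interval above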