def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='NA_estimate', alpha=None, ci_labels=None, weights=None):
        """
        Fit the estimator to the data, producing the cumulative hazard estimate.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timeline (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns:
          self, with new properties like 'cumulative_hazard_'.

        """
        # Fail fast on NaNs -- they would silently corrupt the estimates downstream.
        check_nans(durations)
        if event_observed is not None:
            check_nans(event_observed)

        v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                self._additive_f, self._variance_f, False)

        # estimates
        self._label = label
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
        # BUGFIX: explicit None check (was `alpha if alpha`), so a falsy caller-supplied
        # alpha is not silently replaced by the initializing alpha; matches the other fitters.
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha is not None else self.alpha, ci_labels)
        self._cumulative_sq = cumulative_sq_

        # estimation functions
        self.predict = self._predict("cumulative_hazard_", self._label)
        self.subtract = self._subtract("cumulative_hazard_")
        self.divide = self._divide("cumulative_hazard_")

        # plotting
        self.plot = self._plot_estimate("cumulative_hazard_")
        self.plot_cumulative_hazard = self.plot
        self.plot_hazard = self._plot_estimate('hazard_')

        return self
Example #2
0
    def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='Exponential_estimate', alpha=None, ci_labels=None):
        """
        Parameters:
          duration: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (postively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_' and 'lambda_'.

        """

        check_nans(durations)
        if event_observed is not None:
            check_nans(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        self.event_observed = np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
        self.timeline = np.sort(np.asarray(timeline)) if timeline is not None else np.arange(int(self.durations.min()), int(self.durations.max()) + 1)
        self._label = label

        # estimation
        D = self.event_observed.sum()
        T = self.durations.sum()
        self.lambda_ = D / T
        self._lambda_variance_ = self.lambda_ / T
        self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline)
        self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
        self.median_ = 1. / self.lambda_ * (np.log(2))

        # estimation functions
        self.predict = self._predict(lambda t: np.exp(-self.lambda_ * t), self._label)
        self.subtract = self._subtract("survival_function_")
        self.divide = self._divide("survival_function_")

        # plotting
        self.plot = self._plot_estimate("survival_function_")
        self.plot_survival_function_ = self.plot

        return self
Example #3
0
 def _check_values(df, T, E):
     # Run the standard pre-fit validation suite; each helper raises on failure,
     # so the first violated check (in this order) aborts the fit.
     pass_for_numeric_dtypes_or_raise(df)   # covariates must all be numeric
     check_nans(T)                          # durations must be NaN-free
     check_nans(E)                          # event indicators must be NaN-free
     check_low_var(df)                      # warn/raise on near-constant columns
     check_complete_separation(df, E, T)    # guard against degenerate regression fits
Example #4
0
    def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate',
            alpha=None, left_censorship=False, ci_labels=None, weights=None):
        """
        Parameters:
          duration: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (postively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns:
          self, with new properties like 'survival_function_'.

        """

        check_nans(durations)
        if event_observed is not None:
            check_nans(event_observed)

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'
        v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                   self._additive_f, self._additive_var,
                                                                   left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

        # estimation
        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)

        # estimation methods
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        setattr(self, "plot_" + estimate_name, self.plot)
        self.plot_loglogs = plot_loglogs(self)
        return self
 def _check_values(self, df, T, E):
     # Minimal pre-fit validation; each helper raises on failure, so the first
     # violated check (in this order) aborts the fit.
     pass_for_numeric_dtypes_or_raise(df)   # covariates must all be numeric
     check_nans(T)                          # durations must be NaN-free
     check_nans(E)                          # event indicators must be NaN-free
Example #6
0
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            entry=None,
            label='Weibull_estimate',
            alpha=None,
            ci_labels=None):
        """
        Parameters:
          duration: an array, or pd.Series, of length n -- duration subject was observed for
          event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          timeline: return the estimate at the values in timeline (postively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated observations, i.e the birth event was not observed.
             If None, defaults to all 0 (all birth events observed.)
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like `cumulative_hazard_', 'survival_function_', 'lambda_' and 'rho_'.

        """

        check_nans(durations)
        if event_observed is not None:
            check_nans(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        # check for negative or 0 durations - these are not allowed in a weibull model.
        if np.any(self.durations <= 0):
            raise ValueError(
                'This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements.'
            )

        self.event_observed = np.asarray(
            event_observed,
            dtype=int) if event_observed is not None else np.ones_like(
                self.durations)
        self.timeline = np.sort(
            np.asarray(timeline)) if timeline is not None else np.arange(
                int(self.durations.min()),
                int(self.durations.max()) + 1)
        self._label = label
        alpha = alpha if alpha is not None else self.alpha

        # estimation
        self.lambda_, self.rho_ = self._newton_rhaphson(
            self.durations, self.event_observed)
        self.survival_function_ = pd.DataFrame(self.survival_function_at_times(
            self.timeline),
                                               columns=[self._label],
                                               index=self.timeline)
        self.hazard_ = pd.DataFrame(self.hazard_at_times(self.timeline),
                                    columns=[self._label],
                                    index=self.timeline)
        self.cumulative_hazard_ = pd.DataFrame(self.cumulative_hazard_at_times(
            self.timeline),
                                               columns=[self._label],
                                               index=self.timeline)
        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = 1. / self.lambda_ * (np.log(2))**(1. / self.rho_)

        # estimation functions - Cumulative hazard takes priority.
        self.predict = self._predict(
            lambda t: np.exp(-(self.lambda_ * t)**self.rho_), self._label)
        self.subtract = self._subtract("cumulative_hazard_")
        self.divide = self._divide("cumulative_hazard_")

        # plotting - Cumulative hazard takes priority.
        self.plot = self._plot_estimate("cumulative_hazard_")
        self.plot_cumulative_hazard = self.plot

        return self