Example #1
0
def KM_median(array,
              upper_lim_flags,
              left_censor=True,
              return_type='percentile'):
    """Estimate the Kaplan-Meier median of ``array``, optionally with a spread.

    Parameters:
      array: the measured values, handed to the fitter as durations.
      upper_lim_flags: per-value flags handed to the fitter as its
         event-observed argument, or None to fit without flags.
      left_censor: when truthy (and flags are given) the data are fitted with
         ``fit_left_censoring``; otherwise the regular ``fit`` is used.
      return_type: one of 'percentile', 'ci' or 'median'.

    Returns:
      'percentile': (median, kmf.percentile(0.25), kmf.percentile(0.75))
      'ci': (median, then the two entries of the first row of the median
         computed on the confidence-interval frame)
      'median': the median alone

    Raises:
      ValueError: if ``return_type`` is not one of the supported values.
         This is checked before any fitting is done.
    """
    valid_return_types = ('percentile', 'ci', 'median')
    if return_type not in valid_return_types:
        raise ValueError(
            f'return_type must be one of {valid_return_types}, got {return_type!r}'
        )

    kmf = KaplanMeierFitter()

    if upper_lim_flags is not None and left_censor:
        kmf.fit_left_censoring(array, upper_lim_flags)
    else:
        # right censoring; when the flags are None the fitter applies its default
        kmf.fit(array, event_observed=upper_lim_flags)

    median = median_survival_times(kmf.survival_function_)

    if return_type == 'median':
        return median

    if return_type == 'ci':
        median_ci = median_survival_times(kmf.confidence_interval_).values
        print(f'median and CI: {median}, {median_ci}')
        return median, median_ci[0][0], median_ci[0][1]

    # return_type == 'percentile'
    upper_perc = kmf.percentile(0.25)
    lower_perc = kmf.percentile(0.75)

    print(
        f'median and 1st/3rd quartiles: {median}, {lower_perc}, {upper_perc}'
    )
    return median, upper_perc, lower_perc
Example #2
0
    def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate',
            alpha=None, left_censorship=False, ci_labels=None):
        """
        Fit the estimator to the given durations and store the results on self.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_'.

        Raises:
          StatError: if `entry` is given and the running net population (entrances minus
             removals) reaches zero within the first half of the event table.

        """
        # With left-censorship the estimate is stored as cumulative_density_ instead of survival_function_.
        estimate_name = 'cumulative_density_' if left_censorship else 'survival_function_'
        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
        self._label = label
        # Only fall back to the instance default when no alpha was passed at all.
        alpha = alpha if alpha is not None else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                   self._additive_f, self._additive_var,
                                                                   left_censorship)

        if entry is not None:
            # A serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, the number of patients at risk and the number of deaths can be the same.
            # We refuse to fit in that case and point the user at the Breslow-Fleming-Harrington estimator.
            n = self.event_table.shape[0]
            net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
            early_population = net_population.iloc[:int(n / 2)]
            if early_population.min() == 0:
                # idxmin gives the index label of the event table row (the value reported in the
                # message); argmin would give a bare integer position on current pandas.
                ix = early_population.idxmin()
                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

        # estimation
        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate)

        # estimation methods
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        setattr(self, "plot_" + estimate_name, self.plot)
        self.plot_loglogs = plot_loglogs(self)
        return self
    def __init__(self, time, events, label=None, color=None):
        """Fit a Kaplan-Meier curve and build the plot items that draw it.

        Parameters:
          time: array whose values are cast to float64 and used as durations.
          events: array whose values are cast to float64 and used as the
             event-observed flags; its length is stored as the sample count.
          label: stored on the instance as ``_label``.
          color: stored on the instance as ``color``.
        """
        self._kmf = KaplanMeierFitter().fit(time.astype(np.float64),
                                            events.astype(np.float64))

        self._label: str = label
        self.color: List[int] = color

        # TODO: refactor this
        # Pull the time index, the survival estimate and both confidence
        # bounds out of the fitter as plain Python lists.
        time, survival = self._kmf.survival_function_.reset_index(
        ).values.T.tolist()
        lower, upper = self._kmf.confidence_interval_.values.T.tolist()
        # The x coordinates come from the survival curve; the x values returned
        # for the two bounds are discarded.
        self.x, self.y = self.generate_curve_coordinates(time, survival)
        _, self.lower_bound = self.generate_curve_coordinates(time, lower)
        _, self.upper_bound = self.generate_curve_coordinates(time, upper)

        # Estimated function curve
        self.estimated_fun = pg.PlotDataItem(self.x,
                                             self.y,
                                             pen=self.get_pen())

        # Lower and upper confidence intervals
        pen = self.get_pen(width=1, alpha=70)
        self.lower_conf_limit = pg.PlotDataItem(self.x,
                                                self.lower_bound,
                                                pen=pen)
        self.upper_conf_limit = pg.PlotDataItem(self.x,
                                                self.upper_bound,
                                                pen=pen)
        # Fill the area between the two confidence limits.
        self.confidence_interval = pg.FillBetweenItem(
            self.upper_conf_limit,
            self.lower_conf_limit,
            brush=self.get_color(alpha=50))

        # Selection item: created without data and hidden initially.
        self.selection = pg.PlotDataItem(
            pen=mkPen(color=QColor(Qt.yellow), width=4))
        self.selection.hide()

        # Median survival time, rounded to one decimal, drawn as a vertical
        # line from y=0 up to y=0.5.
        self.median_survival = median = np.round(
            median_survival_times(
                self._kmf.survival_function_.astype(np.float32)), 1)
        self.median_vertical = pg.PlotDataItem(
            x=(median, median),
            y=(0, 0.5),
            pen=pg.mkPen(**MEDIAN_LINE_PEN_STYLE))

        # NOTE(review): get_censored_data() is indexed below as a 2-D array with
        # x in column 0 and y in column 1 -- confirm against its definition.
        censored_data = self.get_censored_data()

        self.censored_data = pg.ScatterPlotItem(
            x=censored_data[:, 0],
            y=censored_data[:, 1],
            brush=QBrush(Qt.black),
            pen=self.get_pen(width=1, alpha=255),
            symbol=create_line_symbol(),
            size=15,
        )
        self.censored_data.setZValue(10)

        # Sample counts kept on the instance.
        self.num_of_samples = len(events)
        self.num_of_censored_samples = len(censored_data)
Example #4
0
    def fit(self, durations, event_observed=None, timeline=None, entry=None, label='KM_estimate',
            alpha=None, left_censorship=False, ci_labels=None):
        """
        Fit the estimator to the given durations and store the results on self.

        Parameters:
          durations: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_'.

        Raises:
          StatError: if `entry` is given and the running net population (entrances minus
             removals) reaches zero within the first half of the event table.

        """
        # With left-censorship the estimate is stored as cumulative_density_ instead of survival_function_.
        estimate_name = 'cumulative_density_' if left_censorship else 'survival_function_'
        v = _preprocess_inputs(durations, event_observed, timeline, entry)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
        self._label = label
        # Only fall back to the instance default when no alpha was passed at all.
        alpha = alpha if alpha is not None else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                   self._additive_f, self._additive_var,
                                                                   left_censorship)

        if entry is not None:
            # A serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, the number of patients at risk and the number of deaths can be the same.
            # We refuse to fit in that case and point the user at the Breslow-Fleming-Harrington estimator.
            n = self.event_table.shape[0]
            net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
            early_population = net_population.iloc[:int(n / 2)]
            if early_population.min() == 0:
                # idxmin gives the index label of the event table row (the value reported in the
                # message); argmin would give a bare integer position on current pandas.
                ix = early_population.idxmin()
                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)

        # estimation
        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate)

        # estimation methods
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        setattr(self, "plot_" + estimate_name, self.plot)
        return self
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            entry=None,
            label='BFH_estimate',
            alpha=None,
            ci_labels=None):
        """
        Fit a survival function as exp(-H(t)), where H(t) is the cumulative hazard
        of a Nelson-Aalen fit on the same data.

        Parameters:
          durations: an array, or pd.Series, of length n -- how long each subject was observed
          timeline: values at which the best estimate should be returned (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True where the death was observed,
             False where the event was lost (right-censored). All True when None.
          entry: an array, or pd.Series, of length n -- relative time at which each subject entered
             the study. Useful for left-truncated observations, i.e. the birth event was not
             observed. When None, defaults to all 0 (all birth events observed).
          label: name of the estimate's column.
          alpha: alpha value for the confidence intervals; overrides the instance alpha
             for this call only.
          ci_labels: custom column names for the confidence intervals, given as a length-2 list:
                [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_'.

        """
        self._label = label
        if alpha is None:
            alpha = self.alpha

        # The Nelson-Aalen fitter does the input handling and the hazard estimation.
        hazard_fitter = NelsonAalenFitter(alpha)
        hazard_fitter.fit(durations,
                          event_observed=event_observed,
                          timeline=timeline,
                          label=label,
                          entry=entry,
                          ci_labels=ci_labels)

        # Mirror the processed inputs of the inner fitter on this instance.
        for shared_attr in ("durations", "event_observed", "timeline", "entry", "event_table"):
            setattr(self, shared_attr, getattr(hazard_fitter, shared_attr))

        # estimation
        self.survival_function_ = np.exp(-hazard_fitter.cumulative_hazard_)
        self.confidence_interval_ = np.exp(-hazard_fitter.confidence_interval_)
        self.median_ = median_survival_times(self.survival_function_)

        # estimation methods
        estimate_name = "survival_function_"
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        self.plot_survival_function = self.plot
        return self
def predict_50(conditioned_sf):
    """Compute the median survival times of ``conditioned_sf`` and show one customer's value.

    The median is the time at which the survival chance reaches 50%. A different
    percentile could be obtained with ``qth_survival_times(q, conditioned_sf)``.
    The entry selected by the module-level ``customer`` is written out with
    ``st.write``; the full result is returned.
    """
    median_times = median_survival_times(conditioned_sf)
    # NOTE(review): this bare string is presumably rendered as markdown by
    # Streamlit "magic" when the script runs -- keep it as a statement.
    '''
    ### predictions_50
    Predicting the month at which the survival chance of the customer is 50%
    '''
    st.write(median_times[[customer]])
    return median_times
    def fit(self, durations, event_observed=None, timeline=None, entry=None,
            label='BFH_estimate', alpha=None, ci_labels=None):
        """
        Fit a survival function as exp(-H(t)), where H(t) is the cumulative hazard
        of a Nelson-Aalen fit on the same data.

        Parameters:
          durations: an array, or pd.Series, of length n -- how long each subject was observed
          timeline: values at which the best estimate should be returned (positively increasing)
          event_observed: an array, or pd.Series, of length n -- True where the death was observed,
             False where the event was lost (right-censored). All True when None.
          entry: an array, or pd.Series, of length n -- relative time at which each subject entered
             the study. Useful for left-truncated observations, i.e. the birth event was not
             observed. When None, defaults to all 0 (all birth events observed).
          label: name of the estimate's column.
          alpha: alpha value for the confidence intervals; overrides the instance alpha
             for this call only.
          ci_labels: custom column names for the confidence intervals, given as a length-2 list:
                [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>

        Returns:
          self, with new properties like 'survival_function_'.

        """
        self._label = label
        if alpha is None:
            alpha = self.alpha

        # The Nelson-Aalen fitter does the input handling and the hazard estimation.
        hazard_fitter = NelsonAalenFitter(alpha)
        hazard_fitter.fit(
            durations,
            event_observed=event_observed,
            timeline=timeline,
            label=label,
            entry=entry,
            ci_labels=ci_labels,
        )

        # Mirror the processed inputs of the inner fitter on this instance.
        for shared_attr in ("durations", "event_observed", "timeline", "entry", "event_table"):
            setattr(self, shared_attr, getattr(hazard_fitter, shared_attr))

        # estimation
        self.survival_function_ = np.exp(-hazard_fitter.cumulative_hazard_)
        self.confidence_interval_ = np.exp(-hazard_fitter.confidence_interval_)
        self.median_ = median_survival_times(self.survival_function_)

        # estimation methods
        estimate_name = "survival_function_"
        self.predict = self._predict(estimate_name, label)
        self.subtract = self._subtract(estimate_name)
        self.divide = self._divide(estimate_name)

        # plotting functions
        self.plot = self._plot_estimate(estimate_name)
        self.plot_survival_function = self.plot
        return self
def predict_50(conditioned_sf):
    """Return the median survival times of ``conditioned_sf``.

    The median is the time at which the survival chance reaches 50%. For a
    different percentile, ``qth_survival_times(q, conditioned_sf)`` could be
    used instead.
    """
    return median_survival_times(conditioned_sf)
Example #9
0
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        left_censorship=False,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          duration: an array, or pd.Series, of length n -- duration subject was observed for
          timeline: return the best estimate at the values in timelines (postively increasing)
          event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
             was lost (right-censored). Defaults all True if event_observed==None
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          left_censorship: True if durations and event_observed refer to left censorship events. Default False
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
          weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like 'survival_function_'.

        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        if weights is not None:
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly prospenity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        estimate_name = "survival_function_" if not left_censorship else "cumulative_density_"
        v = _preprocess_inputs(durations, event_observed, timeline, entry,
                               weights)
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v

        self._label = label
        alpha = alpha if alpha else self.alpha
        log_survival_function, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, left_censorship)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(
            self, estimate_name,
            pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
        self.__estimate = getattr(self, estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None],
                                                 alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate,
                                             left_censorship=left_censorship)

        # estimation methods
        self._estimation_method = estimate_name
        self._estimate_name = estimate_name
        self._predict_label = label
        self._update_docstrings()

        # plotting functions
        setattr(self, "plot_" + estimate_name, self.plot)
        return self
    # print(df)
    # Quick look at the data: first rows, range of T, and value counts of E and group.
    print(df.head(),'\n')
    print(df['T'].min(), df['T'].max(),'\n')
    print(df['E'].value_counts(),'\n')
    print(df['group'].value_counts(),'\n')
    
    # Fit on the whole data set: T holds the durations, E the event-observed flags.
    kmf = KaplanMeierFitter()
    kmf.fit(df['T'], event_observed=df['E'])

    # kmf.plot_survival_function()
    ax=kmf.survival_function_.plot()
    # share one canvas (axes) between the plots
    ax=kmf.plot(ax=ax)

    # Median survival time and the medians of the confidence-interval curves.
    median_ = kmf.median_survival_time_
    median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
    print(median_confidence_interval_)
    # Boolean mask selecting the 'largeAmount' group.
    groups = df['group']
    ix = (groups == 'largeAmount')
    
    # Refit the same fitter on the 'largeAmount' rows and draw on the shared axes.
    kmf.fit(df['T'][ix], df['E'][ix], label='largeAmount')
    ax=kmf.survival_function_.plot(ax=ax)
    ax = kmf.plot(ax=ax)
    # plt.show()
    treatment_median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
    print("使用量基数较大的音乐存活50%对应的存活时间95%置信区间:'\n'", treatment_median_confidence_interval_, '\n')
    
    # Refit on the remaining rows, labelled 'smallAmount', and draw on the shared axes.
    kmf.fit(df['T'][~ix], df['E'][~ix], label='smallAmount')
    ax=kmf.survival_function_.plot(ax=ax)
    ax = kmf.plot(ax=ax)
    # plt.show()
Example #11
0
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        entry=None,
        weights=None,
        tol: float = 1e-5,
        show_progress: bool = False,
        **kwargs,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset with the non-parametric MLE
        (also known as the Turnbull Estimator).

        Only closed intervals are supported at the moment. Open intervals can be emulated by
        shifting the bounds by a very small amount, for example:

        >>> left, right = df['left'], df['right']
        >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

        Note
        ------
        This is new and experimental, and many features are missing.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). It can be
             derived from lower_bound and upper_bound, so it may be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. Not supported yet: any value other
             than None raises NotImplementedError.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently. Defaults to ones.
          tol: float, optional
            minimum difference in log likelihood changes for iterative algorithm.
          show_progress: bool, optional
            display information during fitting.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
        """
        if entry is not None:
            raise NotImplementedError("entry is not supported yet")

        # Unweighted data: every observation gets weight one.
        weights = np.ones_like(upper_bound) if weights is None else weights
        self.weights = np.asarray(weights)

        self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
        self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
        lower, upper = self.lower_bound, self.upper_bound
        check_nans_or_infs(lower)

        # An observation is uncensored exactly when its interval collapses to a point.
        point_observed = lower == upper
        self.event_observed = point_observed

        default_timeline = np.unique(np.concatenate((upper, lower)))
        self.timeline = coalesce(timeline, default_timeline)

        if (upper < lower).any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = point_observed

        # Flags supplied by the caller must agree with what the bounds imply.
        if (point_observed != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        npmle_output = npmle(
            lower,
            upper,
            verbose=show_progress,
            tol=tol,
            weights=weights,
            **kwargs,
        )
        reconstructed = reconstruct_survival_function(*npmle_output, self.timeline, label=self._label)
        self.survival_function_ = reconstructed.loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        # The string below is disabled confidence-interval code kept as an inert literal.
        """
        self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        self.confidence_interval_survival_function_ = self.confidence_interval_
        self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
        """
        # estimation methods
        self._estimation_method = self._estimate_name = "survival_function_"
        return self
Example #12
0
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        show_progress=False,
        entry=None,
        weights=None,
        tol=1e-7,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset with the non-parametric MLE
        (also known as the Turnbull Estimator). A UserWarning is always emitted because
        the feature is experimental.

        Note
        ------
        This is new and experimental, and many features are missing.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the death was observed, False if the event was lost (right-censored). It can be
             derived from lower_bound and upper_bound, so it may be left blank.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. Not supported yet: any value other
             than None raises NotImplementedError.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              Not supported yet: any value other than None raises NotImplementedError.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``
        """
        warnings.warn(
            "This is new and experimental, many feature are missing and accuracy is not reliable",
            UserWarning,
        )

        if not (entry is None and weights is None):
            raise NotImplementedError("entry / weights is not supported yet")
        # Every observation gets weight one.
        self.weights = np.ones_like(upper_bound)

        self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
        self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
        lower, upper = self.lower_bound, self.upper_bound
        check_nans_or_infs(lower)

        # An observation is uncensored exactly when its interval collapses to a point.
        point_observed = lower == upper
        self.event_observed = point_observed

        default_timeline = np.unique(np.concatenate((upper, lower)))
        self.timeline = coalesce(timeline, default_timeline)

        if (upper < lower).any():
            raise ValueError(
                "All upper_bound times must be greater than or equal to lower_bound times."
            )

        if event_observed is None:
            event_observed = point_observed

        # Flags supplied by the caller must agree with what the bounds imply.
        if (point_observed != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        probs, t_intervals = npmle(lower, upper, verbose=show_progress)
        reconstructed = reconstruct_survival_function(
            probs, t_intervals, self.timeline, label=self._label)
        self.survival_function_ = reconstructed.loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)
        self.percentile = functools.partial(
            qth_survival_time,
            model_or_survival_function=self.survival_function_,
        )
        # The string below is disabled confidence-interval code kept as an inert literal.
        """
        self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        self.confidence_interval_survival_function_ = self.confidence_interval_
        self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
        """
        # estimation methods
        self._estimation_method = self._estimate_name = "survival_function_"
        self._update_docstrings()
        return self
Example #13
0
def test_median_accepts_series():
    """``median_survival_times`` accepts a plain ``pd.Series`` survival curve."""
    # Linearly decreasing curve from 1 down to 0 over 1000 points,
    # indexed 0..999 by the default RangeIndex.
    cumulative = np.linspace(0, 1, 1000)
    survival_curve = pd.Series(1 - cumulative)
    median = utils.median_survival_times(survival_curve)
    assert median == 500
    def _fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="KM_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (postively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          left_censorship: bool, optional (default=False)
            True if durations and event_observed refer to left censorship events. Default False
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median``

        """
        self._check_values(durations)
        if event_observed is not None:
            self._check_values(event_observed)

        self._label = label

        if weights is not None:
            weights = np.asarray(weights)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        self.durations, self.event_observed, self.timeline, self.entry, self.event_table = _preprocess_inputs(
            durations, event_observed, timeline, entry, weights
        )

        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f, self._additive_var, is_left_censoring
        )

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] - self.event_table["removed"]).cumsum()
            if net_population.iloc[: int(n / 2)].min() == 0:
                ix = net_population.iloc[: int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix
                )

        # estimation
        setattr(self, primary_estimate_name, pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name, pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha, ci_labels)
        self.median_ = median_survival_times(self.__estimate, left_censorship=is_left_censoring)
        self._cumulative_sq_ = cumulative_sq_

        setattr(self, "confidence_interval_" + primary_estimate_name, self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name, 1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name
        self._update_docstrings()

        return self
# Hide the right and top axis spines of the plot.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# NOTE(review): the first positional argument is the literal 12, not a fitter --
# confirm this matches the signature of the add_at_risk_counts in use.
add_at_risk_counts(12, kmf_overall, labels=None)

# Hand-built legend handles: a grey dashed line and a blue '|' marker.
leg_1 = Line2D([0], [0], color='grey', ls='--')
leg_2 = Line2D([0], [0], color='blue', marker='|', ms=10)
leg = [leg_1, leg_2]
leg_lab = ['Due Date', 'Censored']

plt.legend(leg, leg_lab, fontsize=20)
#plt.savefig(parent + '/Figures/survival_with_censor.png')
# -

# This provides the values of the confidence interval at the median
# (described as the 95% CI by the original author; the alpha actually used
# is set where kmf_overall is fitted, which is not shown here).
from lifelines.utils import median_survival_times
median_survival_times(kmf_overall.confidence_interval_)

# +
# The proportion reported on any given day can be checked by running this code
# and editing the value in loc[]:
print(kmf_overall.survival_function_.loc[796]['KM_estimate'])
print(kmf_overall.survival_function_.loc[815]['KM_estimate'])

# If you are interested in the full data produced by the survival function,
# you can view it here:
surv_func = kmf_overall.survival_function_
surv_func.head()
# -

# Similarly, the upper and lower confidence interval values can be checked by
# editing the value in loc[]:
kmf_overall.confidence_interval_survival_function_.loc[412]

# +
Example #16
0
from lifelines.datasets import load_waltons
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

# Load the Waltons dataset and print a quick overview: the first rows, the
# range of the duration column, and the value counts of the event and group
# columns.
df = load_waltons()
durations = df['T']
observed = df['E']

print(df.head(), '\n')
print(durations.min(), durations.max(), '\n')
for column in ('E', 'group'):
    print(df[column].value_counts(), '\n')

# Fit a Kaplan-Meier estimator on the whole dataset and plot its survival curve.
kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed=observed)
kmf.plot_survival_function()

# Median survival time, and the median times read off the confidence-interval
# curves of the fitted estimator.
median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_confidence_interval_)
Example #17
0
def test_median():
    """``median_survival_times`` handles a single-column ``pd.DataFrame`` curve."""
    # Linearly decreasing curve from 1 down to 0 over 1000 points,
    # indexed 0..999 by the default RangeIndex.
    cumulative = np.linspace(0, 1, 1000)
    survival_curve = pd.DataFrame(1 - cumulative)
    median = utils.median_survival_times(survival_curve)
    assert median == 500
Example #18
0
 def median_(self):
     """
     Median survival time of the fitted population.

     Delegates to ``median_survival_times`` applied to
     ``self.survival_function_``. This is the "half-life" of the population
     -- the time at which the survival function reaches 0.5 -- and serves as
     a robust summary statistic for the population, if it exists.
     """
     fitted_curve = self.survival_function_
     return median_survival_times(fitted_curve)
Example #19
0
    def _fit(self,
             durations,
             event_observed=None,
             timeline=None,
             entry=None,
             label=None,
             alpha=None,
             ci_labels=None,
             weights=None):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        """
        durations = np.asarray(durations)
        self._check_values(durations)

        if event_observed is not None:
            event_observed = np.asarray(event_observed)
            self._check_values(event_observed)

        self._label = coalesce(label, self._label, "KM_estimate")

        if weights is not None:
            weights = np.asarray(weights)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )
        else:
            weights = np.ones_like(durations, dtype=float)

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        (self.durations, self.event_observed, self.timeline, self.entry,
         self.event_table,
         self.weights) = _preprocess_inputs(durations, event_observed,
                                            timeline, entry, weights)

        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, is_left_censoring)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(self, primary_estimate_name,
                pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name,
                pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(
            cumulative_sq_.values[:, None], alpha, ci_labels)
        self._median = median_survival_times(self.survival_function_)
        self._cumulative_sq_ = cumulative_sq_

        setattr(self, "confidence_interval_" + primary_estimate_name,
                self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name,
                1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name

        return self
Example #20
0
# Add the at-risk counts for the train and test fitters to the axes.
add_at_risk_counts(kmf_train, kmf_test, ax=ax, fontsize=12)

# Axis labels and legend styling.
ax.set_xlabel('Time (months)', fontsize=12, fontweight='bold')
ax.set_ylabel('Survival probability', fontsize=12, fontweight='bold')
ax.legend(fontsize=12)

# NOTE(review): the annotation text is a hard-coded literal; presumably the
# p-value of a train-vs-test comparison -- verify it against the computed value.
ax.text(10, 0.75, 'p=0.85', fontsize=12, fontweight='bold')

# In[27]:

# Median survival time of each fitted estimator.
print(kmf_train.median_survival_time_)
print(kmf_test.median_survival_time_)

# In[28]:

# Median times read off the confidence-interval curves of each estimator.
print(median_survival_times(kmf_train.confidence_interval_))
print(median_survival_times(kmf_test.confidence_interval_))

# In[29]:

# Event tables of each fitted estimator.
print(kmf_train.event_table)
print(kmf_test.event_table)

# In[30]:

# Predicted survival probability at t=60 for each estimator.
print('Survival probability for t=60 for train set: ', kmf_train.predict(60))
print('Survival probability for t=60 for test set: ', kmf_test.predict(60))

# In[31]:

results = logrank_test(train['PFS'],
Example #21
0
def test_median():
    """``median_survival_times`` handles a single-column ``pd.DataFrame`` curve."""
    # Linearly decreasing curve from 1 down to 0 over 1000 points,
    # indexed 0..999 by the default RangeIndex.
    cumulative = np.linspace(0, 1, 1000)
    survival_curve = pd.DataFrame(1 - cumulative)
    median = utils.median_survival_times(survival_curve)
    assert median == 500
Example #22
0
# Build a frame of extra subjects from ``d`` and add it to the censored subjects.
dfn = pd.DataFrame(
    d, columns=["ID", "KM", "DEAD", "ENGINE", "MOUNTAIN", "CITY", "MONDAY"])
print(dfn)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True is the equivalent that works on every version.
censored_subjects = pd.concat([censored_subjects, dfn], ignore_index=True)

print(censored_subjects)

# Survival curves predicted by the fitted model ``cph``, one column per subject.
unconditioned_sf = cph.predict_survival_function(censored_subjects)
print(unconditioned_sf)

from lifelines.utils import median_survival_times, qth_survival_times

# Per-subject times at which the predicted survival reaches the 0.75, 0.25
# and 0.5 (median) levels.
predictions_75 = qth_survival_times(0.75, unconditioned_sf)
predictions_25 = qth_survival_times(0.25, unconditioned_sf)
predictions_50 = median_survival_times(unconditioned_sf)
print(predictions_50)

# Plot every predicted survival curve, semi-transparent, on a single axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for f in unconditioned_sf:
    ax.plot(unconditioned_sf[f], alpha=.5, label=f)
#ax.legend()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for i, f in enumerate(reversed(unconditioned_sf.columns)):
    #print( i )
    if i < num_d:
        print(i, f)
        ax.plot(unconditioned_sf[f], alpha=1, label=f)
    else:
        ax.plot(unconditioned_sf[f], alpha=0.1, label=f, c='grey')