Beispiel #1
0
    def __init__(self, cls, estimate, loc, iloc, show_censors, censor_styles,
                 bandwidth, **kwargs):
        """
        Gather the data and styling needed to draw one estimate of ``cls``.

        Parameters
        ----------
        cls:
            object whose attributes hold the estimate and confidence interval.
        estimate: str or None
            attribute name of the estimate on ``cls``; ``cls._estimate_name``
            is the fallback handed to ``coalesce``.
        loc, iloc:
            subsection selectors; at most one of them may be non-None.
        show_censors:
            stored as-is on the instance.
        censor_styles: dict or None
            stored on the instance (``{}`` is the fallback handed to ``coalesce``).
        bandwidth:
            required when the estimate is ``"hazard_"``.
        **kwargs:
            plotting options; ``kwargs["ax"]`` and ``kwargs["c"]`` are read
            after the ``set_kwargs_*`` helpers ran.

        Raises
        ------
        ValueError
            if both ``loc`` and ``iloc`` are given, or if the estimate is
            ``"hazard_"`` and no bandwidth was supplied.
        """
        self.censor_styles = coalesce(censor_styles, {})

        # presumably these helpers fill missing entries of ``kwargs`` in place
        set_kwargs_ax(kwargs)
        set_kwargs_color(kwargs)
        set_kwargs_drawstyle(kwargs)

        self.estimate = coalesce(estimate, cls._estimate_name)
        self.loc, self.iloc = loc, iloc
        self.show_censors = show_censors
        self.ax = kwargs["ax"]
        self.colour = kwargs["c"]
        self.kwargs = kwargs

        if not (self.loc is None or self.iloc is None):
            raise ValueError("Cannot set both loc and iloc in call to .plot().")

        if self.estimate != "hazard_":
            # ordinary estimates are read straight off the fitted object
            self.estimate_ = getattr(cls, self.estimate)
            self.confidence_interval_ = cls.confidence_interval_
            return

        # the hazard is smoothed on demand, which needs a bandwidth
        if bandwidth is None:
            raise ValueError("Must specify a bandwidth parameter in the call to plot_hazard.")
        self.estimate_ = cls.smoothed_hazard_(bandwidth)
        first_column = self.estimate_.values[:, 0]
        self.confidence_interval_ = cls.smoothed_hazard_confidence_intervals_(
            bandwidth, hazard_=first_column)
Beispiel #2
0
 def plot(self, **kwargs):
     """
     Plot ``self.sample_survival_functions_`` and return what its ``plot`` returns.

     ``alpha`` falls back to 0.05 and the colour (``c`` or ``color``) to
     '#348ABD'; the legend is always switched off. All remaining keyword
     arguments are forwarded unchanged.
     """
     requested_alpha = kwargs.pop('alpha', None)
     kwargs['alpha'] = coalesce(requested_alpha, 0.05)
     kwargs['legend'] = False

     # both colour spellings are removed so that only ``c`` is forwarded
     short_colour = kwargs.pop('c', None)
     long_colour = kwargs.pop('color', None)
     kwargs['c'] = coalesce(short_colour, long_colour, '#348ABD')

     return self.sample_survival_functions_.plot(**kwargs)
Beispiel #3
0
    def plot(ix=None, iloc=None, flat=False, show_censors=False, censor_styles={},
             ci_legend=False, ci_force_lines=False, ci_alpha=0.25, ci_show=True,
             bandwidth=None, **kwargs):
        """
        Plot an estimate, optionally with censor markers and confidence intervals.

        NOTE(review): ``self`` and ``estimate`` are not parameters of this function;
        they are read from an enclosing scope that is not visible here.

        Parameters
        ----------
        ix:
            selection applied through the ``.ix`` accessor of the plotted frames.
        iloc:
            selection applied through the ``.iloc`` accessor; used when ``ix`` is None.
        flat: bool
            when True, ``ci_force_lines`` and ``show_censors`` are forced to True.
        show_censors: bool
            draw markers at the rows of ``self.event_table`` with ``censored > 0``.
        censor_styles: dict
            entries override the default marker style ``{'marker': '+', 'ms': 12, 'mew': 1}``.
        ci_legend: bool
            accepted but not referenced in this body.
        ci_force_lines: bool
            draw the confidence interval as lines instead of a filled band.
        ci_alpha: float
            ``alpha`` passed to ``fill_between_steps`` for the filled band.
        ci_show: bool
            whether any confidence interval is drawn.
        bandwidth:
            required when ``estimate`` is ``"hazard_"``.
        **kwargs:
            forwarded to the estimate's ``.plot``; ``ax``, ``color`` and
            ``drawstyle`` are filled in when absent.

        Returns
        -------
        the axis stored in ``kwargs['ax']``.

        Raises
        ------
        AssertionError
            if both ``ix`` and ``iloc`` are given, or if ``estimate`` is
            ``"hazard_"`` and ``bandwidth`` is None.
        """

        assert (ix is None or iloc is None), 'Cannot set both ix and iloc in call to .plot().'

        # fill in plotting defaults; a new figure is created when no axis was passed
        if "ax" not in kwargs:
            kwargs["ax"] = plt.figure().add_subplot(111)
        # NOTE(review): the fallback colour is drawn from the axis' private colour cycle
        kwargs['color'] = coalesce(kwargs.get('c'), kwargs.get('color'), next(kwargs["ax"]._get_lines.color_cycle))
        kwargs['drawstyle'] = coalesce(kwargs.get('drawstyle'), 'steps-post')

        # R-style graphics
        if flat:
            ci_force_lines = True
            show_censors = True

        if estimate == "hazard_":
            assert bandwidth is not None, 'Must specify a bandwidth parameter in the call to plot_hazard.'
            estimate_ = self.smoothed_hazard_(bandwidth)
            confidence_interval_ = self.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
        else:
            confidence_interval_ = getattr(self, 'confidence_interval_')
            estimate_ = getattr(self, estimate)

        # did user specify certain indexes or locations?
        # (chained comparison: true when ``iloc == ix`` and ``ix is None``)
        if iloc == ix is None:
            user_submitted_ix = slice(0, None)
        else:
            user_submitted_ix = ix if ix is not None else iloc

        # ``get_loc`` applies the chosen accessor and selection to any frame
        get_method = "ix" if ix is not None else "iloc"
        get_loc = lambda df: getattr(df, get_method)[user_submitted_ix]

        # plot censors
        if show_censors and self.event_table['censored'].sum() > 0:
            cs = {'marker': '+', 'ms': 12, 'mew': 1}
            cs.update(censor_styles)
            times = get_loc(self.event_table.ix[(self.event_table['censored'] > 0)]).index.values.astype(float)
            v = self.predict(times)
            kwargs['ax'].plot(times, v, linestyle='None', color=kwargs['color'], **cs)

        # plot estimate
        get_loc(estimate_).plot(**kwargs)

        # plot confidence intervals
        if ci_show:
            if ci_force_lines:
                get_loc(confidence_interval_).plot(linestyle="-", linewidth=1,
                                                   c=kwargs['color'], legend=True,
                                                   drawstyle=kwargs.get('drawstyle', 'default'),
                                                   ax=kwargs['ax'], alpha=0.6)
            else:
                # filled band between the 'lower' and 'upper' columns
                x = get_loc(confidence_interval_).index.values.astype(float)
                lower = get_loc(confidence_interval_.filter(like='lower')).values[:, 0]
                upper = get_loc(confidence_interval_.filter(like='upper')).values[:, 0]
                fill_between_steps(x, lower, y2=upper, ax=kwargs['ax'], alpha=ci_alpha, color=kwargs['color'], linewidth=1.0)

        return kwargs['ax']
    def fit(
        self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None
    ):  # pylint: disable=too-many-arguments
        """
        Fit the estimator by exponentiating the negated Nelson-Aalen cumulative hazard.

        Parameters
        ----------
        durations: an array, or pd.Series, of length n
            duration subject was observed for
        timeline:
            return the best estimate at the values in timelines (positively increasing)
        event_observed: an array, or pd.Series, of length n
            True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
        entry: an array, or pd.Series, of length n
           relative time when a subject entered the study. This is
           useful for left-truncated observations, i.e the birth event was not observed.
           If None, defaults to all 0 (all birth events observed.)
        label: string
            a string to name the column of the estimate.
        alpha: float, optional (default=0.05)
            the alpha value in the confidence intervals. Overrides the initializing
           alpha for this call to fit only.
        ci_labels: iterable
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
        weights: an array, or pd.Series, of length n, optional
            per-subject weights, forwarded to the underlying ``NelsonAalenFitter.fit``.

        Returns
        -------
          self, with new properties like ``survival_function_``.

        """
        self._label = coalesce(label, self._label, "BFH_estimate")
        alpha = coalesce(alpha, self.alpha)

        naf = NelsonAalenFitter(alpha=alpha)
        # ``weights`` must be forwarded, otherwise a weighted fit silently
        # degrades to an unweighted one.
        naf.fit(
            durations,
            event_observed=event_observed,
            timeline=timeline,
            label=self._label,
            entry=entry,
            ci_labels=ci_labels,
            weights=weights,
        )
        self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights = (
            naf.durations,
            naf.event_observed,
            naf.timeline,
            naf.entry,
            naf.event_table,
            naf.weights,
        )

        # estimation
        self.survival_function_ = np.exp(-naf.cumulative_hazard_)
        self.confidence_interval_ = np.exp(-naf.confidence_interval_)
        self.confidence_interval_survival_function_ = self.confidence_interval_
        # Name follows the ``confidence_interval_<estimate>`` pattern (with the
        # trailing underscore) so lookups built from the estimate name succeed.
        self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_
        # The historical spelling without the trailing underscore is kept as an alias.
        self.confidence_interval_cumulative_density = self.confidence_interval_cumulative_density_

        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"

        # plotting functions
        self.plot_survival_function = self.plot
        return self
Beispiel #5
0
    def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
        """
        Return the cumulative hazard of the subjects in X at the given times.

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.
        times: iterable, optional
            times to evaluate the cumulative hazard at. When None,
            ``self.timeline`` and then the unique values of ``self.durations``
            are the fallbacks handed to ``coalesce``.
        ancillary_X: numpy array or DataFrame, optional
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns
        -------
        cumulative_hazard_ : DataFrame
            one row per time, one column per entry of ``_get_index(X)``.
        """
        times = coalesce(times, self.timeline, np.unique(self.durations))
        alpha_, beta_ = self._prep_inputs_for_prediction_and_return_scores(X, ancillary_X)

        # H(t) = log(1 + (t / alpha) ** beta), evaluated for every (time, subject) pair
        scaled_times = np.outer(times, 1 / alpha_)
        cumulative_hazard = np.log1p(scaled_times ** beta_)
        return pd.DataFrame(cumulative_hazard, columns=_get_index(X), index=times)
    def predict_cumulative_hazard(self, df, times=None):
        """
        Return the cumulative hazard of the subjects in df at the given times.

        Parameters
        ----------
        df: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.
        times: iterable, optional
            times to evaluate the cumulative hazard at. When None,
            ``self.timeline`` and then the unique values of ``self.durations``
            are the fallbacks handed to ``coalesce``.

        Returns
        -------
        cumulative_hazard_ : DataFrame
            one row per time, one column per entry of ``_get_index(df)``.
        """
        time_points = np.asarray(coalesce(times, self.timeline, np.unique(self.durations)))
        n_times = time_points.shape[0]
        time_column = time_points.reshape((n_times, 1))

        lambdas_ = self._prep_inputs_for_prediction_and_return_parameters(df)

        # cap every breakpoint at each time, then difference along the breakpoints
        # to get the time spent in each interval
        capped = np.minimum(np.tile(self.breakpoints, (n_times, 1)), time_column)
        time_in_interval = np.hstack([capped[:, :1], np.diff(capped, axis=1)])

        return pd.DataFrame(
            np.dot(time_in_interval, (1 / lambdas_)),
            columns=_get_index(df),
            index=time_column[:, 0],
        )
Beispiel #7
0
    def __init__(self, cls, estimate, loc, iloc, show_censors, censor_styles, ax, **kwargs):
        """
        Gather the data and styling needed to draw one estimate of ``cls``.

        Parameters
        ----------
        cls:
            object whose attributes are looked up when ``estimate`` is a string.
        estimate: str or object
            a string names the estimate attribute on ``cls``; anything else is
            stored directly as the estimate, in which case
            ``kwargs["confidence_intervals"]`` must be present.
        loc, iloc, show_censors:
            stored as-is on the instance.
        censor_styles: dict or None
            stored on the instance (``{}`` is the fallback handed to ``coalesce``).
        ax:
            axis to draw on; ``plt.gca()`` is used when None.
        **kwargs:
            plotting options; ``kwargs["color"]`` is read after the
            ``set_kwargs_*`` helpers ran.
        """
        self.censor_styles = coalesce(censor_styles, {})

        ax = plt.gca() if ax is None else ax
        kwargs["ax"] = ax

        # presumably these helpers fill missing entries of ``kwargs`` in place
        set_kwargs_color(kwargs)
        set_kwargs_drawstyle(kwargs)
        set_kwargs_label(kwargs, cls)

        self.loc, self.iloc = loc, iloc
        self.show_censors = show_censors
        self.ax = ax
        self.colour = kwargs["color"]
        self.kwargs = kwargs

        if not isinstance(estimate, str):
            # a ready-made estimate comes with its intervals inside kwargs
            self.estimate_ = estimate
            self.confidence_interval_ = kwargs.pop("confidence_intervals")
            return

        self.estimate_ = getattr(cls, estimate)
        self.confidence_interval_ = getattr(cls, "confidence_interval_" + estimate)
        self.predict_at_times = getattr(cls, estimate + "at_times")
Beispiel #8
0
    def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
        """
        Return the cumulative hazard of the subjects in X at the given times.

        Parameters
        ----------

        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.
        times: iterable, optional
            times to evaluate the cumulative hazard at. When None,
            ``self.timeline`` and then the unique values of ``self.durations``
            are the fallbacks handed to ``coalesce``.
        ancillary_X: numpy array or DataFrame, optional
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns
        -------
        cumulative_hazard_ : DataFrame
            one row per time, one column per entry of ``_get_index(X)``.
        """
        import numpy as np

        times = coalesce(times, self.timeline, np.unique(self.durations))
        exp_mu_, sigma_ = self._prep_inputs_for_prediction_and_return_scores(X, ancillary_X)
        mu_ = np.log(exp_mu_)

        # standardise log-time for every (time, subject) pair
        standardized = np.subtract.outer(np.log(times), mu_) / sigma_
        cumulative_hazard = -logsf(standardized)
        return pd.DataFrame(cumulative_hazard, columns=_get_index(X), index=times)
Beispiel #9
0
def create_dataframe_slicer(iloc, loc):
    """
    Build a function that applies the requested row selection to a frame.

    When neither ``iloc`` nor ``loc`` is given the whole frame is selected.
    The ``.loc`` accessor is used whenever ``loc`` is not None, ``.iloc``
    otherwise.
    """
    if iloc is None and loc is None:
        selection = slice(None)
    else:
        selection = coalesce(loc, iloc)

    accessor = "iloc" if loc is None else "loc"

    def slicer(df):
        return getattr(df, accessor)[selection]

    return slicer
Beispiel #10
0
 def plot_survival_function(self, **kwargs):
     """
     Alias of ``plot``.

     Without interval censoring the call is delegated to ``_plot_estimate``.
     With interval censoring ``self.survival_function_`` is plotted directly
     with a ``steps-pre`` drawstyle; the colour is taken from ``c`` or
     ``color`` and falls back to ``"k"``.

     Returns
     -------
     whatever the underlying plotting call returns (the axis).
     """
     if not CensoringType.is_interval_censoring(self):
         return _plot_estimate(self, estimate="survival_function_", **kwargs)

     # hack for now.
     # The colour aliases are popped rather than read: leaving them in
     # ``kwargs`` would hand ``color`` to ``.plot`` twice and raise a TypeError.
     color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
     return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
Beispiel #11
0
    def predict_cumulative_hazard(self, X, times=None, ancillary_X=None):
        """
        Return the cumulative hazard of the subjects in X at the given times.

        Parameters
        ----------

        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.
        times: iterable, optional
            times to evaluate the cumulative hazard at. When None,
            ``self.timeline`` and then the unique values of ``self.durations``
            are the fallbacks handed to ``coalesce``.
        ancillary_X: numpy array or DataFrame, optional
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data. When None, a single
            ``_intercept`` column of ones is used.

        Returns
        -------
        cumulative_hazard_ : DataFrame
            one row per time, one column per entry of ``_get_index(X)``.
        """
        # work on a copy: an intercept column may be added below
        X = X.copy()

        times = coalesce(times, self.timeline, np.unique(self.durations))

        # --- covariates for the rho_ parameters ---
        if ancillary_X is None:
            intercept_only = np.ones((X.shape[0], 1))
            ancillary_X = pd.DataFrame(intercept_only, columns=["_intercept"])
        elif not isinstance(ancillary_X, pd.DataFrame):
            assert ancillary_X.shape[1] == (self.params_.loc["rho_"].shape[0] + 1)  # 1 for _intercept
        else:
            ancillary_X = ancillary_X.copy()
            if self.fit_intercept:
                ancillary_X["_intercept"] = 1.0
            # align the columns with the fitted parameter order
            ancillary_X = ancillary_X[self.params_.loc["rho_"].index]

        # --- covariates for the lambda_ parameters ---
        if not isinstance(X, pd.DataFrame):
            assert X.shape[1] == (self.params_.loc["lambda_"].shape[0] + 1)  # 1 for _intercept
        else:
            if self.fit_intercept:
                X["_intercept"] = 1.0
            X = X[self.params_.loc["lambda_"].index]

        lambda_ = np.exp(np.dot(X, self.params_[self._LOOKUP_SLICE["lambda_"]]))
        rho_ = np.exp(np.dot(ancillary_X, self.params_[self._LOOKUP_SLICE["rho_"]]))

        subject_labels = _get_index(X)
        cumulative_hazard = np.outer(times, 1 / lambda_) ** rho_
        return pd.DataFrame(cumulative_hazard, columns=subject_labels, index=times)
Beispiel #12
0
def create_dataframe_slicer(iloc, loc, timeline):
    """
    Build a function that applies the requested row selection to a frame.

    When neither ``iloc`` nor ``loc`` is given, the selection is the label
    slice from ``timeline.min()`` to ``timeline.max()``. The ``.iloc``
    accessor is used only when ``iloc`` is given; ``.loc`` otherwise.

    Raises
    ------
    ValueError
        if both ``loc`` and ``iloc`` are given.
    """
    if not (loc is None or iloc is None):
        raise ValueError("Cannot set both loc and iloc in call to .plot().")

    if iloc is None and loc is None:
        selection = slice(timeline.min(), timeline.max())
    else:
        selection = coalesce(loc, iloc)

    accessor = "loc" if iloc is None else "iloc"

    def slicer(df):
        return getattr(df, accessor)[selection]

    return slicer
    def survival_function_at_times(self, times, label=None) -> pd.Series:
        """
        Evaluate ``self.predict`` at ``times`` and return the result as a Series.

        Parameters
        -----------
        times: iterable or float
            passed to ``self.predict``; also converted with ``_to_1d_array``
            to form the index.
        label: str
            name of the returned Series; defaults to ``self._label`` (via ``coalesce``).

        """
        series_name = coalesce(label, self._label)
        predicted = self.predict(times)
        return pd.Series(predicted, index=_to_1d_array(times), name=series_name)
    def plot_cumulative_density(self, **kwargs):
        """
        Plots a pretty figure of the cumulative density function.

        Matplotlib plot arguments can be passed in inside the kwargs.

        With interval censoring the options below are not interpreted here:
        ``self.cumulative_density_`` is plotted directly with a ``steps``
        drawstyle, the colour is taken from ``c`` or ``color`` (falling back
        to ``"k"``) and the remaining kwargs are forwarded as-is.

        Parameters
        -----------
        show_censors: bool
            place markers at censorship events. Default: False
        censor_styles: dict
            If show_censors, this dictionary will be passed into the plot call.
        ci_alpha: float
            the transparency level of the confidence interval. Default: 0.3
        ci_force_lines: bool
            force the confidence intervals to be line plots (versus default shaded areas). Default: False
        ci_show: bool
            show confidence intervals. Default: True
        ci_legend: bool
            if ci_force_lines is True, this is a boolean flag to add the lines' labels to the legend. Default: False
        at_risk_counts: bool
            show group sizes at time points. See function ``add_at_risk_counts`` for details. Default: False
        loc: slice
            specify a time-based subsection of the curves to plot, ex:

            >>> model.plot(loc=slice(0.,10.))

            will plot the time values between t=0. and t=10.
        iloc: slice
            specify a location-based subsection of the curves to plot, ex:

            >>> model.plot(iloc=slice(0,10))

            will plot the first 10 time points.

        Returns
        -------
        ax:
            a pyplot axis object
        """
        if not CensoringType.is_interval_censoring(self):
            return _plot_estimate(self, estimate="cumulative_density_", **kwargs)

        # hack for now.
        # The colour aliases are popped rather than read: leaving them in
        # ``kwargs`` would hand ``color`` to ``.plot`` twice and raise a TypeError.
        color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
        return self.cumulative_density_.plot(drawstyle="steps", color=color, **kwargs)
Beispiel #15
0
    def plot_survival_function(self, **kwargs):
        """
        Alias of ``plot``.

        Without interval censoring the call is delegated to ``_plot_estimate``.
        With interval censoring ``self.survival_function_`` is plotted directly
        with a ``steps-pre`` drawstyle; the colour is taken from ``c`` or
        ``color`` and falls back to ``"k"``.

        Returns
        -------
        whatever the underlying plotting call returns (the axis), in both branches.
        """
        if not CensoringType.is_interval_censoring(self):
            return _plot_estimate(self, estimate="survival_function_", **kwargs)

        # hack for now.
        # Both colour aliases are removed from ``kwargs`` so that ``color`` is
        # handed to ``.plot`` exactly once.
        color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), "k")
        return self.survival_function_.plot(drawstyle="steps-pre", color=color, **kwargs)
    def survival_function_at_times(self, times, label=None):
        """
        Evaluate ``self.predict`` at ``times`` and return the result as a Series.

        Parameters
        -----------
        times: iterable or float
            passed to ``self.predict``; also converted with ``_to_array`` to
            form the index.
        label: str, optional
            name of the returned Series; defaults to ``self._label`` (via ``coalesce``).

        Returns
        --------
        pd.Series

        """
        series_name = coalesce(label, self._label)
        predicted = self.predict(times)
        return pd.Series(predicted, index=_to_array(times), name=series_name)
    def cumulative_density_at_times(self, times, label=None):
        """
        Return ``1 - self.predict(times)`` as a Series indexed by the times.

        Parameters
        -----------
        times: iterable or float
            passed to ``self.predict``; also converted with ``_to_array`` to
            form the index.
        label: str, optional
            name of the returned Series; defaults to ``self._label`` (via ``coalesce``).

        Returns
        --------
        pd.Series

        """
        series_name = coalesce(label, self._label)
        complement = 1 - self.predict(times)
        return pd.Series(complement, index=_to_array(times), name=series_name)
Beispiel #18
0
    def cumulative_density_at_times(self, times, label=None) -> pd.Series:
        """
        Return ``1 - self.predict(times)`` as a Series indexed by the times.

        Parameters
        -----------
        times: iterable or float
            passed to ``self.predict``; also converted with ``_to_1d_array``
            to form the index.
        label: str, optional
            name of the returned Series; defaults to ``self._label`` (via ``coalesce``).

        Returns
        --------
        pd.Series

        """
        series_name = coalesce(label, self._label)
        complement = 1 - self.predict(times)
        return pd.Series(complement, index=_to_1d_array(times), name=series_name)
Beispiel #19
0
    def plot(ix=None, iloc=None, columns=[], legend=True, **kwargs):
        """
        A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

          ix: specify a time-based subsection of the curves to plot, ex:
                   .plot(ix=slice(0.,10.)) will plot the time values between t=0. and t=10.
          iloc: specify a location-based subsection of the curves to plot, ex:
                   .plot(iloc=slice(0,10)) will plot the first 10 time points.
          columns: If not empty, plot a subset of columns from the cumulative_hazards_. Default all.
          legend: show legend in figure.

        NOTE(review): ``self`` is read from an enclosing scope that is not visible here.
        Returns the axis stored in ``kwargs["ax"]`` (a new one is created when absent).
        """
        from matplotlib import pyplot as plt

        assert (ix is None or
                iloc is None), 'Cannot set both ix and iloc in call to .plot'

        if ix is not None:
            get_method = "ix"
        else:
            get_method = "iloc"

        # chained comparison: true when ``iloc == ix`` and ``ix is None``
        if iloc == ix is None:
            user_submitted_ix = slice(0, None)
        elif ix is not None:
            user_submitted_ix = ix
        else:
            user_submitted_ix = iloc

        def get_loc(df):
            return getattr(df, get_method)[user_submitted_ix]

        if len(columns) == 0:
            columns = self.cumulative_hazards_.columns

        if "ax" not in kwargs:
            kwargs["ax"] = plt.figure().add_subplot(111)

        x = get_loc(self.cumulative_hazards_).index.values.astype(float)
        for column in columns:
            y = get_loc(self.cumulative_hazards_[column]).values
            column_intervals = self.confidence_intervals_[column]
            y_upper = get_loc(column_intervals.ix['upper']).values
            y_lower = get_loc(self.confidence_intervals_[column].ix['lower']).values
            curve_label = coalesce(kwargs.get('label'), column)
            shaded_plot(x, y, y_upper, y_lower, ax=kwargs["ax"], label=curve_label)

        if legend:
            kwargs["ax"].legend()

        return kwargs["ax"]
Beispiel #20
0
    def predict_cumulative_hazard(self, df, times=None):
        """
        Return the cumulative hazard of the rows of df at the given times.

        ``times`` falls back (via ``coalesce``) to ``self.timeline`` and then
        to the unique values of ``self.durations``. The result is a DataFrame
        indexed by the times with one column per entry of ``df.index``.
        """
        times = coalesce(times, self.timeline, np.unique(self.durations))
        n = df.shape[0]
        Xs = self._create_Xs_dict(df)

        # slice the flat parameter vector into one array per named parameter
        params_dict = {}
        for parameter_name in self._fitted_parameter_names:
            params_dict[parameter_name] = self.params_.values[self._LOOKUP_SLICE[parameter_name]]

        # one column of times per row of df
        times_grid = np.tile(times, (n, 1)).T
        cumulative_hazard = self._cumulative_hazard(params_dict, times_grid, Xs)
        return pd.DataFrame(cumulative_hazard, index=times, columns=df.index)
Beispiel #21
0
    def hazard_at_times(self, times, label=None):
        """
        Evaluate ``self._hazard`` with the fitted parameters at ``times``.

        Parameters
        -----------
        times: iterable or float
          values to return the hazard at.
        label: string, optional
          Rename the series returned. Useful for plotting.

        Returns
        --------
        pd.Series

        """
        series_name = coalesce(label, self._label)
        hazard_values = self._hazard(self._fitted_parameters_, times)
        return pd.Series(hazard_values, index=_to_array(times), name=series_name)
    def predict_cumulative_hazard(self,
                                  df,
                                  times=None,
                                  conditional_after=None) -> pd.DataFrame:
        """
        Return the cumulative hazard rate of subjects in df at time points.

        Parameters
        ----------
        df: numpy array, DataFrame or Series
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data. A Series is treated as a single
            subject and converted to a one-row DataFrame.
        times: iterable, optional
            times to predict the cumulative hazard at. Defaults to
            ``self.timeline`` (via ``coalesce``).
        conditional_after: optional
            not supported; anything other than None raises.

        Returns
        -------
        cumulative_hazard_ : DataFrame
            the cumulative hazard of individuals over the timeline

        Raises
        ------
        NotImplementedError
            if ``conditional_after`` is not None.
        """

        if isinstance(df, pd.Series):
            # Forward the remaining arguments too; otherwise user-supplied
            # ``times`` would be silently replaced by the default timeline.
            return self.predict_cumulative_hazard(
                df.to_frame().T, times=times, conditional_after=conditional_after)

        if conditional_after is not None:
            raise NotImplementedError()

        times = np.atleast_1d(coalesce(times, self.timeline)).astype(float)
        n = times.shape[0]
        times = times.reshape((n, 1))

        lambdas_ = self._prep_inputs_for_prediction_and_return_parameters(df)

        # append +inf so the last interval is open-ended, cap every breakpoint
        # at each time, then difference to get the time spent in each interval
        bp = np.append(self.breakpoints, [np.inf])
        M = np.minimum(np.tile(bp, (n, 1)), times)
        M = np.hstack([M[:, tuple([0])], np.diff(M, axis=1)])

        return pd.DataFrame(np.dot(M, (1 / lambdas_)),
                            columns=_get_index(df),
                            index=times[:, 0])
Beispiel #23
0
    def plot(ix=None, iloc=None, columns=[], legend=True, **kwargs):
        """
        A wrapper around plotting. Matplotlib plot arguments can be passed in, plus:

          ix: specify a time-based subsection of the curves to plot, ex:
                   .plot(ix=slice(0.,10.)) will plot the time values between t=0. and t=10.
          iloc: specify a location-based subsection of the curves to plot, ex:
                   .plot(iloc=slice(0,10)) will plot the first 10 time points.
          columns: If not empty, plot a subset of columns from the cumulative_hazards_. Default all.
          legend: show legend in figure.

        NOTE(review): ``self`` is read from an enclosing scope that is not visible here.
        Returns the axis stored in ``kwargs["ax"]`` (a new one is created when absent).
        """
        from matplotlib import pyplot as plt

        assert ix is None or iloc is None, "Cannot set both ix and iloc in call to .plot"

        get_method = "iloc" if ix is None else "ix"

        # chained comparison: true when ``iloc == ix`` and ``ix is None``
        if iloc == ix is None:
            user_submitted_ix = slice(0, None)
        else:
            user_submitted_ix = iloc if ix is None else ix

        def get_loc(df):
            accessor = getattr(df, get_method)
            return accessor[user_submitted_ix]

        if len(columns) == 0:
            columns = self.cumulative_hazards_.columns

        if "ax" not in kwargs:
            kwargs["ax"] = plt.figure().add_subplot(111)

        x = get_loc(self.cumulative_hazards_).index.values.astype(float)
        for column in columns:
            y = get_loc(self.cumulative_hazards_[column]).values
            y_upper = get_loc(self.confidence_intervals_[column].ix["upper"]).values
            y_lower = get_loc(self.confidence_intervals_[column].ix["lower"]).values
            shaded_plot(
                x,
                y,
                y_upper,
                y_lower,
                ax=kwargs["ax"],
                label=coalesce(kwargs.get("label"), column),
            )

        if legend:
            kwargs["ax"].legend()

        return kwargs["ax"]
Beispiel #24
0
    def __init__(self, cls, estimate, confidence_intervals, loc, iloc, show_censors, censor_styles, **kwargs):
        """
        Gather the data and styling needed to draw a ready-made estimate.

        Parameters
        ----------
        cls:
            passed to ``set_kwargs_label`` together with ``kwargs``.
        estimate, confidence_intervals:
            stored as ``estimate_`` and ``confidence_interval_``.
        loc, iloc:
            subsection selectors; at most one of them may be non-None.
        show_censors:
            stored as-is on the instance.
        censor_styles: dict or None
            stored on the instance (``{}`` is the fallback handed to ``coalesce``).
        **kwargs:
            plotting options; ``kwargs["ax"]`` and ``kwargs["c"]`` are read
            after the ``set_kwargs_*`` helpers ran.

        Raises
        ------
        ValueError
            if both ``loc`` and ``iloc`` are given.
        """
        self.censor_styles = coalesce(censor_styles, {})

        # presumably these helpers fill missing entries of ``kwargs`` in place
        set_kwargs_ax(kwargs)
        set_kwargs_color(kwargs)
        set_kwargs_drawstyle(kwargs)
        set_kwargs_label(kwargs, cls)

        self.loc, self.iloc = loc, iloc
        self.show_censors = show_censors
        self.ax = kwargs["ax"]
        self.colour = kwargs["c"]
        self.kwargs = kwargs

        if not (self.loc is None or self.iloc is None):
            raise ValueError("Cannot set both loc and iloc in call to .plot().")

        self.estimate_ = estimate
        self.confidence_interval_ = confidence_intervals
Beispiel #25
0
    def predict(self, x):
        """
        Return a two-column prediction ``[1 - p, p]`` for ``x``.

        When ``self.prediction_time_frame_in_hours`` is a tuple, ``p`` is the
        difference between ``predict_at_time`` at its second and first entry.
        Otherwise ``p`` is ``predict_at_time`` at the time frame itself, with
        the index of ``self.model.baseline_cumulative_hazard_`` as the
        fallback handed to ``coalesce``. Only the first element along the
        leading axis of the stacked result is returned.

        Raises
        ------
        AssertionError
            if ``self.model`` is None.
        """
        if self.model is None:
            raise AssertionError("Model must be fit before calling predict.")

        # fixed seed on every call
        np.random.seed(909)

        if not isinstance(self.prediction_time_frame_in_hours, tuple):
            times_to_evaluate_at = coalesce(
                self.prediction_time_frame_in_hours,
                self.model.baseline_cumulative_hazard_.index)
            y_pred = self.predict_at_time(x, times_to_evaluate_at)
        else:
            earlier = self.predict_at_time(x, self.prediction_time_frame_in_hours[0])
            later = self.predict_at_time(x, self.prediction_time_frame_in_hours[1])
            y_pred = later - earlier

        # stack the complement and the prediction along a new trailing axis
        with_new_axis = y_pred[..., np.newaxis]
        stacked = np.concatenate([1 - with_new_axis, with_new_axis], axis=-1)
        return stacked[0]
    def _fit(self,
             durations,
             event_observed=None,
             timeline=None,
             entry=None,
             label=None,
             alpha=None,
             ci_labels=None,
             weights=None):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Parameters
        ----------
          durations: an array, list, pd.DataFrame or pd.Series
            length n -- duration subject was observed for
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
             True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing)
          entry: an array, list, pd.DataFrame, or pd.Series, optional
             relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population
             entered study when they were "born".
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing alpha for this call to fit only.
          ci_labels: tuple, optional
                add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, list, pd.DataFrame, or pd.Series, optional
              if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_``

        """
        durations = np.asarray(durations)
        self._check_values(durations)

        if event_observed is not None:
            event_observed = np.asarray(event_observed)
            self._check_values(event_observed)

        self._label = coalesce(label, self._label, "KM_estimate")

        if weights is not None:
            weights = np.asarray(weights)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )
        else:
            weights = np.ones_like(durations, dtype=float)

        # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
        is_left_censoring = CensoringType.is_left_censoring(self)
        primary_estimate_name = "survival_function_" if not is_left_censoring else "cumulative_density_"
        secondary_estimate_name = "cumulative_density_" if not is_left_censoring else "survival_function_"

        (self.durations, self.event_observed, self.timeline, self.entry,
         self.event_table,
         self.weights) = _preprocess_inputs(durations, event_observed,
                                            timeline, entry, weights)

        alpha = alpha if alpha else self.alpha
        log_estimate, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._additive_var, is_left_censoring)

        if entry is not None:
            # a serious problem with KM is that when the sample size is small and there are too few early
            # truncation times, it may happen that is the number of patients at risk and the number of deaths is the same.
            # we adjust for this using the Breslow-Fleming-Harrington estimator
            n = self.event_table.shape[0]
            net_population = (self.event_table["entrance"] -
                              self.event_table["removed"]).cumsum()
            if net_population.iloc[:int(n / 2)].min() == 0:
                ix = net_population.iloc[:int(n / 2)].idxmin()
                raise StatError(
                    """There are too few early truncation times and too many events. S(t)==0 for all t>%g. Recommend BreslowFlemingHarringtonFitter."""
                    % ix)

        # estimation
        setattr(self, primary_estimate_name,
                pd.DataFrame(np.exp(log_estimate), columns=[self._label]))
        setattr(self, secondary_estimate_name,
                pd.DataFrame(1 - np.exp(log_estimate), columns=[self._label]))

        self.__estimate = getattr(self, primary_estimate_name)
        self.confidence_interval_ = self._bounds(
            cumulative_sq_.values[:, None], alpha, ci_labels)
        self._median = median_survival_times(self.survival_function_)
        self._cumulative_sq_ = cumulative_sq_

        setattr(self, "confidence_interval_" + primary_estimate_name,
                self.confidence_interval_)
        setattr(self, "confidence_interval_" + secondary_estimate_name,
                1 - self.confidence_interval_)

        # estimation methods
        self._estimation_method = primary_estimate_name
        self._estimate_name = primary_estimate_name

        return self
    def fit_interval_censoring(
        self,
        lower_bound,
        upper_bound,
        event_observed=None,
        timeline=None,
        label=None,
        alpha=None,
        ci_labels=None,
        entry=None,
        weights=None,
        tol: float = 1e-5,
        show_progress: bool = False,
        **kwargs,
    ) -> "KaplanMeierFitter":
        """
        Fit the model to an interval-censored dataset using the non-parametric MLE
        (also called the Turnbull estimator).

        Only closed intervals are supported. An open interval can be emulated by nudging
        the bounds by a very small amount, for example:

        >>> left, right = df['left'], df['right']
        >>> KaplanMeierFitter().fit_interval_censoring(left + 0.00001, right - 0.00001)

        Note
        ------
        This is new and experimental, and many features are missing. In particular no
        confidence intervals are produced here, so ``alpha`` and ``ci_labels`` are accepted
        but not used by this method.

        Parameters
        ----------
          lower_bound: an array, list, pd.DataFrame or pd.Series
            length n -- lower bound of observations
          upper_bound: an array, list, pd.DataFrame or pd.Series
            length n -- upper bound of observations
          event_observed: an array, list, pd.DataFrame, or pd.Series, optional
            True if the death was observed exactly, False if it was censored. Can be left
            blank, in which case it is derived as ``lower_bound == upper_bound``. When
            given, it must agree with that rule for every row.
          timeline: an array, list, pd.DataFrame, or pd.Series, optional
            return the best estimate at the values in timelines (positively increasing).
            Defaults to the sorted unique values of both bounds.
          entry: an array, list, pd.DataFrame, or pd.Series, optional
            not supported yet; anything other than None raises.
          label: string, optional
            a string to name the column of the estimate.
          alpha: float, optional
            currently unused by this method.
          ci_labels: tuple, optional
            currently unused by this method.
          weights: an array, list, pd.DataFrame, or pd.Series, optional
            per-observation weights. Defaults to ones shaped like ``upper_bound``.
          tol: float, optional
            minimum difference in log likelihood changes for iterative algorithm.
          show_progress: bool, optional
            display information during fitting.
          **kwargs:
            forwarded unchanged to ``npmle``.

        Returns
        -------
        self: KaplanMeierFitter
          self with new properties like ``survival_function_`` and ``cumulative_density_``

        Raises
        ------
        NotImplementedError
            if ``entry`` is provided.
        ValueError
            if any upper bound is below its lower bound, or if ``event_observed``
            disagrees with ``lower_bound == upper_bound``.
        """
        if entry is not None:
            raise NotImplementedError("entry is not supported yet")

        if weights is None:
            weights = np.ones_like(upper_bound)
        self.weights = np.asarray(weights)

        self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound))
        self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound))
        # only the lower bound is screened here
        check_nans_or_infs(self.lower_bound)

        # an observation is an exact event precisely when its interval is degenerate
        is_exact = self.lower_bound == self.upper_bound
        self.event_observed = is_exact

        all_bounds = np.concatenate((self.upper_bound, self.lower_bound))
        self.timeline = coalesce(timeline, np.unique(all_bounds))

        if (self.upper_bound < self.lower_bound).any():
            raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.")

        if event_observed is None:
            event_observed = is_exact

        if (is_exact != event_observed).any():
            raise ValueError(
                "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)"
            )

        self._label = coalesce(label, self._label, "NPMLE_estimate")

        npmle_results = npmle(
            self.lower_bound,
            self.upper_bound,
            verbose=show_progress,
            tol=tol,
            weights=weights,
            **kwargs
        )
        survival = reconstruct_survival_function(*npmle_results, self.timeline, label=self._label)
        self.survival_function_ = survival.loc[self.timeline]
        self.cumulative_density_ = 1 - self.survival_function_

        self._median = median_survival_times(self.survival_function_)

        # NOTE: confidence intervals are intentionally not computed yet. The disabled code was:
        #   self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha)
        #   self.confidence_interval_survival_function_ = self.confidence_interval_
        #   self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_

        # estimation methods
        self._estimation_method = "survival_function_"
        self._estimate_name = "survival_function_"
        return self
Beispiel #28
0
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame in long format (one row per subject-period). It must contain
            ``id_col``, ``event_col``, ``start_col`` and ``stop_col``; every column left over
            after those (and the weights / strata columns) is cast to float and used as a covariate.
            The frame is copied, so the caller's object is not modified.
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
            the unique identifier per subject.
        event_col: string
            the column in DataFrame that contains the subjects' death
            observation: 1 if observed, 0 else (censored).
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
            All values must be strictly positive. When omitted every row gets weight 1.0.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        robust: boolean, optional (default: False)
            Robust (Huber sandwich) errors. Not available yet: a truthy value raises
            ``NotImplementedError``.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns n to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similar to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``params_``, ``hazard_ratios_`` and
            ``baseline_survival_``

        Raises
        ------
        NotImplementedError
            if ``robust`` is truthy.
        KeyError
            if one of the id / event / start / stop columns is missing from ``df``.
        ValueError
            if ``weights_col`` contains a non-positive value.
        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.id_col = id_col
        self.event_col = event_col
        self.start_col = start_col
        self.stop_col = stop_col
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

        df = df.copy()

        required_columns = (id_col, event_col, start_col, stop_col)
        if not all(column in df for column in required_columns):
            raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

        self.weights_col = weights_col
        if weights_col is not None:
            if (df[weights_col] <= 0).any():
                raise ValueError("values in weights_col must be positive.")
        else:
            assert (
                "__weights" not in df.columns
            ), "__weights is an internal lifelines column, please rename your column first."
            df["__weights"] = 1.0

        # normalise the user's column names to the internal ones used below
        internal_names = {
            id_col: "id",
            event_col: "event",
            start_col: "start",
            stop_col: "stop",
            weights_col: "__weights",
        }
        df = df.rename(columns=internal_names)

        if self.strata is not None:
            # strata columns join the index, so they are not treated as covariates
            df = df.set_index(_to_list(self.strata) + ["id"])  # TODO: needs to be a list
            df = df.sort_index()
        else:
            df = df.set_index("id")

        events = pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool)
        start = df.pop("start")
        stop = df.pop("stop")
        weights = df.pop("__weights").astype(float)

        # whatever is left in df is the covariate matrix
        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        fitted_on_normalized_scale = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        # undo the normalisation so coefficients refer to the original covariate scale
        coefficients = pd.Series(fitted_on_normalized_scale, index=df.columns, name="coef")
        self.params_ = coefficients / self._norm_std
        self.hazard_ratios_ = pd.Series(np.exp(self.params_), index=df.columns, name="exp(coef)")
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights
        )
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, events, start, stop, weights)
        self.baseline_survival_ = self._compute_baseline_survival()

        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
Beispiel #29
0
    def plot(
        ix=None,
        iloc=None,
        flat=False,
        show_censors=False,
        censor_styles=None,
        ci_legend=False,
        ci_force_lines=False,
        ci_alpha=0.25,
        ci_show=True,
        at_risk_counts=False,
        bandwidth=None,
        **kwargs
    ):
        """
        Plot the fitted estimate, optionally with censor marks, confidence
        intervals and at-risk counts.

        This is a closure: ``cls`` (the fitted estimator) and ``estimate`` (the
        name of the attribute to plot) come from the enclosing scope.

        Parameters
        ----------
        ix: label-based selection, optional
            restrict the plot to these index labels (applied through ``.loc``).
        iloc: position-based selection, optional
            restrict the plot to these positions (applied through ``.iloc``).
            Cannot be combined with ``ix``.
        flat: bool
            "R-style" graphics: forces ``ci_force_lines`` and ``show_censors`` on.
        show_censors: bool
            mark censored observations on the curve, if the event table has any.
        censor_styles: dict, optional
            overrides for the default censor marker style
            ``{"marker": "+", "ms": 12, "mew": 1}``.
        ci_legend: bool
            accepted for backwards compatibility; not used by this function.
        ci_force_lines: bool
            draw the confidence interval as lines instead of a shaded band.
        ci_alpha: float
            transparency of the shaded confidence band.
        ci_show: bool
            whether to draw the confidence interval at all.
        at_risk_counts: bool
            add at-risk counts below the axes.
        bandwidth: float, optional
            required when ``estimate == "hazard_"``; passed to the smoothed-hazard helpers.
        **kwargs:
            forwarded to the pandas ``.plot`` call. ``ax``, ``c``/``color`` and
            ``drawstyle`` are also read here and given defaults.

        Returns
        -------
        the matplotlib axes that were drawn on.

        Raises
        ------
        ValueError
            if both ``ix`` and ``iloc`` are given, or if a hazard plot is requested
            without ``bandwidth``.
        """
        from matplotlib import pyplot as plt

        if censor_styles is None:
            censor_styles = {}

        if ix is not None and iloc is not None:
            raise ValueError("Cannot set both ix and iloc in call to .plot().")

        if "ax" not in kwargs:
            kwargs["ax"] = plt.figure().add_subplot(111)

        # Resolve the colour lazily: only draw from the axes' colour cycle when the
        # caller did not pick a colour, so an explicit colour does not burn a cycle entry.
        color = kwargs.get("c")
        if color is None:
            color = kwargs.get("color")
        if color is None:
            color = kwargs["ax"]._get_lines.get_next_color()
        kwargs["color"] = color
        kwargs["drawstyle"] = coalesce(kwargs.get("drawstyle"), "steps-post")

        # R-style graphics
        if flat:
            ci_force_lines = True
            show_censors = True

        if estimate == "hazard_":
            if bandwidth is None:
                raise ValueError("Must specify a bandwidth parameter in the " + "call to plot_hazard.")
            estimate_ = cls.smoothed_hazard_(bandwidth)
            confidence_interval_ = cls.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
        else:
            confidence_interval_ = getattr(cls, "confidence_interval_")
            estimate_ = getattr(cls, estimate)

        # did user specify certain indexes or locations?
        # (explicit identity checks: `iloc == ix is None` breaks on array-like selections)
        if ix is None and iloc is None:
            user_submitted_ix = slice(None)
        else:
            user_submitted_ix = ix if ix is not None else iloc

        # `.ix` no longer exists in pandas; label-based selection is `.loc`
        get_method = "loc" if ix is not None else "iloc"

        def get_loc(df):
            return getattr(df, get_method)[user_submitted_ix]

        # plot censors
        if show_censors and cls.event_table["censored"].sum() > 0:
            cs = {"marker": "+", "ms": 12, "mew": 1}
            cs.update(censor_styles)
            censored_rows = cls.event_table.loc[cls.event_table["censored"] > 0]
            times = get_loc(censored_rows).index.values.astype(float)
            v = cls.predict(times)
            kwargs["ax"].plot(times, v, linestyle="None", color=kwargs["color"], **cs)

        # plot estimate
        get_loc(estimate_).plot(**kwargs)

        # plot confidence intervals
        if ci_show:
            if ci_force_lines:
                get_loc(confidence_interval_).plot(
                    linestyle="-",
                    linewidth=1,
                    color=[kwargs["color"]],
                    legend=True,
                    drawstyle=kwargs.get("drawstyle", "default"),
                    ax=kwargs["ax"],
                    alpha=0.6,
                )
            else:
                x = get_loc(confidence_interval_).index.values.astype(float)
                lower = get_loc(confidence_interval_.filter(like="lower")).values[:, 0]
                upper = get_loc(confidence_interval_.filter(like="upper")).values[:, 0]
                fill_between_steps(
                    x, lower, y2=upper, ax=kwargs["ax"], alpha=ci_alpha, color=kwargs["color"], linewidth=1.0
                )

        if at_risk_counts:
            add_at_risk_counts(cls, ax=kwargs["ax"])

        return kwargs["ax"]
Beispiel #30
0
def set_kwargs_color(kwargs):
    """
    Resolve the plotting colour and store it under ``kwargs["c"]``.

    Precedence: an explicit ``kwargs["c"]``, then ``kwargs["color"]``, and only
    if neither is set (both missing or None) the next colour from the colour
    cycle of ``kwargs["ax"]``.

    The colour cycle is consulted lazily, so supplying an explicit colour does
    not consume an entry from the axes' cycle. ``kwargs`` is modified in place;
    nothing is returned.
    """
    colour = kwargs.get("c")
    if colour is None:
        colour = kwargs.get("color")
    if colour is None:
        # private matplotlib hook; advances the axes' colour cycle as a side effect
        colour = kwargs["ax"]._get_lines.get_next_color()
    kwargs["c"] = colour
Beispiel #31
0
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            strata=None,
            step_size=None,
            weights_col=None,
            cluster_col=None,
            robust=False):
        """
        Fit the Cox Propertional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' events was observed: 1 if observed, 0 else (censored).
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
             This can be used for case-weights. For example, a weight of 2 means there were two subjects with
             identical observations.
             This can be used for sampling weights. In that case, use `robust=True` to get more accurate standard errors.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             catagorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.
          step_size: set an initial step size for the fitting algorithm.
          robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
            ties, so if there are high number of ties, results may significantly differ. See
            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078
          cluster_col: specifies what column has unique identifers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
            be used.
        Returns:
            self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.

        """

        df = df.copy()

        # Sort on time
        df = df.sort_values(by=duration_col)

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + ' UTC'
        self.duration_col = duration_col
        self.event_col = event_col
        self.robust = robust
        self.cluster_col = cluster_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Extract time and event
        T = df[duration_col]
        del df[duration_col]
        if event_col is None:
            E = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            E = df[event_col]
            del df[event_col]

        if weights_col:
            weights = df.pop(weights_col)
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
""", RuntimeWarning)
            if (weights <= 0).any():
                raise ValueError("values in weights_col must be positive.")

        else:
            weights = pd.Series(np.ones((self._n_examples, )), index=df.index)

        if self.cluster_col:
            self._clusters = df.pop(self.cluster_col)

        self._check_values(df, T, E)
        df = df.astype(float)

        # save fitting data for later
        self.durations = T.copy()
        self.event_observed = E.copy()
        if self.strata is not None:
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        E = E.astype(bool)

        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean,
                                                   self._norm_std),
                                         T,
                                         E,
                                         weights=weights,
                                         initial_beta=initial_beta,
                                         show_progress=show_progress,
                                         step_size=step_size)

        self.hazards_ = pd.DataFrame(
            hazards_.T, columns=df.columns, index=['coef']) / self._norm_std

        self.variance_matrix_ = -inv(self._hessian_) / np.outer(
            self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std), T, E, weights)
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(
            df, T, E, weights)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard(
        )
        self.baseline_survival_ = self._compute_baseline_survival()
        self._predicted_partial_hazards_ = self.predict_partial_hazard(
            df).values

        self._train_log_partial_hazard = self.predict_log_partial_hazard(
            self._norm_mean.to_frame().T)
        return self
    def fit(self,
            durations,
            event_observed=None,
            timeline=None,
            entry=None,
            label=None,
            alpha=None,
            ci_labels=None,
            weights=None):  # pylint: disable=too-many-arguments
        """
        Parameters
        -----------
        durations: an array, or pd.Series, of length n
          duration subject was observed for
        timeline: iterable
            return the best estimate at the values in timelines (positively increasing)
        event_observed: an array, or pd.Series, of length n
            True if the the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
        entry: an array, or pd.Series, of length n
           relative time when a subject entered the study. This is
           useful for left-truncated observations, i.e the birth event was not observed.
           If None, defaults to all 0 (all birth events observed.)
        label: string
            a string to name the column of the estimate.
        alpha: float
            the alpha value in the confidence intervals. Overrides the initializing
           alpha for this call to fit only.
        ci_labels: iterable
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
        weights: n array, or pd.Series, of length n
            if providing a weighted dataset. For example, instead
            of providing every subject as a single element of `durations` and `event_observed`, one could
            weigh subject differently.

        Returns
        -------
          self, with new properties like ``cumulative_hazard_``.

        """
        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        if weights is not None:
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  """,
                    StatisticalWarning,
                )

        (self.durations, self.event_observed, self.timeline, self.entry,
         self.event_table,
         self.weights) = _preprocess_inputs(durations, event_observed,
                                            timeline, entry, weights)

        cumulative_hazard_, cumulative_sq_ = _additive_estimate(
            self.event_table, self.timeline, self._additive_f,
            self._variance_f, False)

        # estimates
        self._label = coalesce(label, self._label, "NA_estimate")
        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_,
                                               columns=[self._label])
        self.confidence_interval_ = self._bounds(
            cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
        self.confidence_interval_cumulative_hazard_ = self.confidence_interval_
        self._cumulative_sq = cumulative_sq_

        # estimation methods
        self._estimation_method = "cumulative_hazard_"
        self._estimate_name = "cumulative_hazard_"
        self._update_docstrings()

        # plotting
        self.plot_cumulative_hazard = self.plot

        return self
Beispiel #33
0
 def _create_initial_point(self, Ts, E, entry, weights):
     """
     Build the two-element starting point for the optimiser.

     The first entry is the mean of whatever ``utils.coalesce(*Ts)`` selects
     from ``Ts``; the second is fixed at 1.0. ``E``, ``entry`` and ``weights``
     are accepted but not used here.
     """
     selected_durations = utils.coalesce(*Ts)
     first_guess = selected_durations.mean()
     return np.array([first_guess, 1.0])
    def fit(
        self,
        df,
        id_col,
        event_col,
        start_col="start",
        stop_col="stop",
        weights_col=None,
        show_progress=False,
        step_size=None,
        robust=False,
        strata=None,
        initial_point=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters
        -----------
        df: DataFrame
            a Pandas DataFrame in long format (one row per subject-period). It must contain
            ``id_col``, ``event_col``, ``start_col`` and ``stop_col``; the columns that remain
            once those (and the weights / strata columns) are removed are cast to float and
            used as covariates. A copy is taken, so the caller's frame is left as is.
        id_col: string
            A subject could have multiple rows in the DataFrame. This column contains
            the unique identifier per subject.
        event_col: string
            the column in DataFrame that contains the subjects' death
            observation: 1 if observed, 0 else (censored).
        start_col: string
            the column that contains the start of a subject's time period.
        stop_col: string
            the column that contains the end of a subject's time period.
        weights_col: string, optional
            the column that contains (possibly time-varying) weight of each subject-period row.
            Values must be strictly positive. If omitted, each row is weighted 1.0.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        robust: boolean, optional (default: False)
            Robust (Huber sandwich) errors. Not available yet: passing a truthy value raises
            ``NotImplementedError``.
        step_size: float, optional
            set an initial step size for the fitting algorithm.
        strata: list or string, optional
            specify a column or list of columns n to use in stratification. This is useful if a
            categorical covariate does not obey the proportional hazard assumption. This
            is used similar to the `strata` expression in R.
            See http://courses.washington.edu/b515/l17.pdf.
        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        --------
        self: CoxTimeVaryingFitter
            self, with additional properties like ``hazards_`` and ``baseline_survival_``

        Raises
        ------
        NotImplementedError
            if ``robust`` is truthy.
        KeyError
            if the id, event, start or stop column is not present in ``df``.
        ValueError
            if ``weights_col`` holds a value that is zero or negative.
        """
        self.strata = coalesce(strata, self.strata)
        self.robust = robust
        if self.robust:
            raise NotImplementedError("Not available yet.")

        self.id_col, self.event_col = id_col, event_col
        self.start_col, self.stop_col = start_col, stop_col
        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

        df = df.copy()

        for needed in (id_col, event_col, start_col, stop_col):
            if needed not in df:
                raise KeyError("A column specified in the call to `fit` does not exist in the DataFrame provided.")

        self.weights_col = weights_col
        if weights_col is None:
            assert (
                "__weights" not in df.columns
            ), "__weights is an internal lifelines column, please rename your column first."
            df["__weights"] = 1.0
        elif (df[weights_col] <= 0).any():
            raise ValueError("values in weights_col must be positive.")

        # map the caller's column names onto the fixed internal names
        column_aliases = {id_col: "id", event_col: "event", start_col: "start", stop_col: "stop", weights_col: "__weights"}
        df = df.rename(columns=column_aliases)

        if self.strata is None:
            df = df.set_index("id")
        else:
            # stratification columns are moved into the index alongside the subject id
            index_columns = _to_list(self.strata) + ["id"]  # TODO: needs to be a list
            df = df.set_index(index_columns)
            df = df.sort_index()

        events = pass_for_numeric_dtypes_or_raise_array(df.pop("event")).astype(bool)
        start = df.pop("start")
        stop = df.pop("stop")
        weights = df.pop("__weights").astype(float)

        # everything still in df is a covariate
        df = df.astype(float)
        self._check_values(df, events, start, stop)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        normalized_scale_coefs = self._newton_rhaphson(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
            initial_point=initial_point,
            show_progress=show_progress,
            step_size=step_size,
        )

        # rescale back to the units of the original covariates
        coef_series = pd.Series(normalized_scale_coefs, index=df.columns, name="coef")
        self.hazards_ = coef_series / self._norm_std
        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
        self.standard_errors_ = self._compute_standard_errors(
            normalize(df, self._norm_mean, self._norm_std),
            events,
            start,
            stop,
            weights,
        )
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(
            df, events, start, stop, weights
        )
        self.baseline_survival_ = self._compute_baseline_survival()

        self.event_observed = events
        self.start_stop_and_events = pd.DataFrame({"event": events, "start": start, "stop": stop})
        self.weights = weights

        self._n_examples = df.shape[0]
        self._n_unique = df.index.unique().shape[0]
        return self
    def fit(
        self,
        durations,
        event_observed,
        event_of_interest,
        timeline=None,
        entry=None,
        label=None,
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Fit the cumulative incidence function for ``event_of_interest`` in the
        presence of competing events.

        Parameters
        ----------
          durations: an array or pd.Series of length n -- duration of subject was observed for
          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
             only positive integers, where 0 indicates censoring.
          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
          timeline: return the best estimate at the values in timelines (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string to name the column of the estimate.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self : AalenJohansenFitter
          self, with new properties like ``cumulative_incidence_``.
        """
        # Checking for tied event times
        ties = self._check_for_duplicates(durations=durations,
                                          events=event_observed)

        if ties:
            # The message is built from plain string pieces: the previous
            # triple-quoted literal started at column 0 on its first line, so
            # dedent() had no common margin to strip and the second line was
            # emitted with its source indentation.
            warnings.warn(
                "Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times.\n"
                "To resolve ties, data is randomly jittered.",
                Warning,
            )
            durations = self._jitter(
                durations=pd.Series(durations),
                event=pd.Series(event_observed),
                jitter_level=self._jitter_level,
                seed=self._seed,
            )

        # A falsy alpha (None or 0) falls back to the fitter-level alpha.
        alpha = alpha if alpha else self.alpha

        # Creating label for event of interest & indicator for that event
        event_of_interest = int(event_of_interest)
        cmprisk_label = "CIF_" + str(event_of_interest)
        self.label_cmprisk = "observed_" + str(event_of_interest)

        # Fitting Kaplan-Meier for either event of interest OR competing risk
        km = KaplanMeierFitter().fit(durations,
                                     event_observed=event_observed,
                                     timeline=timeline,
                                     entry=entry,
                                     weights=weights)
        aj = km.event_table
        aj["overall_survival"] = km.survival_function_
        # Survival just before each time point; multiplies the cause-specific
        # hazard increment in the estimator below.
        aj["lagged_overall_survival"] = aj["overall_survival"].shift()

        # Setting up table for calculations and to return to user
        event_spec = pd.Series(event_observed) == event_of_interest
        self.durations, self.event_observed, *_, event_table, weights = _preprocess_inputs(
            durations=durations,
            event_observed=event_spec,
            timeline=timeline,
            entry=entry,
            weights=weights)
        event_spec_times = event_table["observed"]
        event_spec_times = event_spec_times.rename(self.label_cmprisk)
        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

        # Estimator of Cumulative Incidence (Density) Function
        aj[cmprisk_label] = (aj[self.label_cmprisk] / aj["at_risk"] *
                             aj["lagged_overall_survival"]).cumsum()
        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
        aj = aj.set_index("event_at")

        # Setting attributes
        self._estimation_method = "cumulative_density_"
        self._estimate_name = "cumulative_density_"
        self.timeline = km.timeline
        self._update_docstrings()

        self._label = coalesce(label, self._label, "AJ_estimate")
        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])

        # Technically, cumulative incidence, but consistent with KaplanMeierFitter
        self.event_table = aj[[
            "removed", "observed", self.label_cmprisk, "censored", "entrance",
            "at_risk"
        ]]  # Event table

        # Variance and confidence bounds are only computed when requested
        # through self._calc_var; otherwise both attributes are None.
        if self._calc_var:
            self.variance_, self.confidence_interval_ = self._bounds(
                aj["lagged_overall_survival"],
                alpha=alpha,
                ci_labels=ci_labels)
        else:
            self.variance_, self.confidence_interval_ = None, None

        self.confidence_interval_cumulative_density_ = self.confidence_interval_
        return self
Beispiel #36
0
 def plot(self, **kwargs):
     """
     Draw the sampled survival functions and return the resulting axes.

     ``alpha`` defaults to 0.05, the legend is always disabled, and the line
     colour is taken from ``c``, then ``color``, then '#348ABD'. All other
     keyword arguments are forwarded to the underlying ``.plot`` call.
     """
     # Pull out every override first so that neither colour alias is
     # forwarded in its original form.
     requested_alpha = kwargs.pop('alpha', None)
     short_colour = kwargs.pop('c', None)
     long_colour = kwargs.pop('color', None)

     kwargs.update(
         alpha=coalesce(requested_alpha, 0.05),
         legend=False,
         c=coalesce(short_colour, long_colour, '#348ABD'),
     )
     return self.sample_survival_functions_.plot(**kwargs)
Beispiel #37
0
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            strata=None,
            step_size=None,
            weights_col=None):
        """
        Fit the Cox Propertional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' events was observed: 1 if observed, 0 else (censored).
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             catagorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_

        """

        df = df.copy()

        # Sort on time
        df = df.sort_values(by=duration_col)

        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Extract time and event
        T = df[duration_col]
        del df[duration_col]
        if event_col is None:
            E = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            E = df[event_col]
            del df[event_col]

        if weights_col:
            weights = df.pop(weights_col)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
                    """, RuntimeWarning)

        else:
            weights = pd.DataFrame(np.ones((self._n_examples, 1)),
                                   index=df.index)

        self._check_values(df, T, E)
        df = df.astype(float)

        # save fitting data for later
        self.durations = T.copy()
        self.event_observed = E.copy()
        if self.strata is not None:
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        E = E.astype(bool)

        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean,
                                                   self._norm_std),
                                         T,
                                         E,
                                         weights=weights,
                                         initial_beta=initial_beta,
                                         show_progress=show_progress,
                                         step_size=step_size)

        self.hazards_ = pd.DataFrame(
            hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard(
        )
        self.baseline_survival_ = self._compute_baseline_survival()
        self.score_ = concordance_index(
            self.durations, -self.predict_partial_hazard(df).values.ravel(),
            self.event_observed)
        self._train_log_partial_hazard = self.predict_log_partial_hazard(
            self._norm_mean.to_frame().T)
        return self
Beispiel #38
0
 def plot(self, **kwargs):
     """
     Draw the sampled survival functions and return the resulting axes.

     ``alpha`` defaults to ``15. / self.samples``, the legend is always
     disabled, and the line colour is taken from ``c``, then ``color``, then
     '#348ABD'. All other keyword arguments are forwarded to the underlying
     ``.plot`` call.
     """
     kwargs['alpha'] = coalesce(kwargs.pop('alpha', None), 15./self.samples)
     kwargs['legend'] = False
     # Pop both colour aliases: leaving 'color' in kwargs next to the resolved
     # 'c' would forward two conflicting aliases to the plotting backend.
     kwargs['c'] = coalesce(kwargs.pop('c', None), kwargs.pop('color', None), '#348ABD')
     ax = self.sample_survival_functions_.plot(**kwargs)
     return ax
Beispiel #39
0
    def fit(self, df, duration_col, event_col=None,
            show_progress=False, initial_beta=None,
            strata=None, step_size=None, weights_col=None):
        """
        Fit the Cox Proportional Hazard model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' event was observed: 1 if observed, 0 else (censored).
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_

        """
        # Copy, then order rows by duration.
        df = df.copy().sort_values(by=duration_col)

        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Pull the durations, event flags and weights out of the covariates.
        time_to_event = df.pop(duration_col)
        if event_col is None:
            event_flags = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            event_flags = df.pop(event_col)

        weights = (df.pop(weights_col).values
                   if weights_col
                   else np.ones(self._n_examples))

        self._check_values(df, event_flags)
        df = df.astype(float)

        # Store the fitting data under the index the caller supplied.
        self.durations = time_to_event.copy()
        self.event_observed = event_flags.copy()
        if self.strata is not None:
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        event_flags = event_flags.astype(bool)

        # Fit on standardised covariates, then rescale the coefficients.
        scaled_df = normalize(df, self._norm_mean, self._norm_std)
        hazards_ = self._newton_rhaphson(
            scaled_df,
            time_to_event,
            event_flags,
            weights=weights,
            initial_beta=initial_beta,
            show_progress=show_progress,
            step_size=step_size,
        )

        raw_coefficients = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef'])
        self.hazards_ = raw_coefficients / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(df, time_to_event, event_flags)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
        self.baseline_survival_ = self._compute_baseline_survival()

        risk_scores = -self.predict_partial_hazard(df).values.ravel()
        self.score_ = concordance_index(self.durations, risk_scores, self.event_observed)

        self._train_log_partial_hazard = self.predict_log_partial_hazard(
            self._norm_mean.to_frame().T
        )
        return self
    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        entry=None,
        label="BFH_estimate",
        alpha=None,
        ci_labels=None,
    ):  # pylint: disable=too-many-arguments
        """
        Fit a survival function as ``exp(-H(t))``, where ``H`` is the
        cumulative hazard estimated by a ``NelsonAalenFitter``.

        Parameters
        ----------
        durations: an array, or pd.Series, of length n
            duration subject was observed for
        timeline:
            return the best estimate at the values in timelines (positively increasing)
        event_observed: an array, or pd.Series, of length n
            True if the death was observed, False if the event was lost (right-censored). Defaults all True if event_observed==None
        entry: an array, or pd.Series, of length n
           relative time when a subject entered the study. This is
           useful for left-truncated observations, i.e the birth event was not observed.
           If None, defaults to all 0 (all birth events observed.)
        label: string
            a string to name the column of the estimate.
        alpha: float, optional (default=0.05)
            the alpha value in the confidence intervals. Overrides the initializing
           alpha for this call to fit only.
        ci_labels: iterable
            add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>


        Returns
        -------
          self, with new properties like ``survival_function_``.

        """
        self._label = label
        effective_alpha = coalesce(alpha, self.alpha)

        # The cumulative hazard, and all input bookkeeping, come from a
        # Nelson-Aalen fit on the same data.
        naf = NelsonAalenFitter(alpha=effective_alpha)
        naf.fit(
            durations,
            event_observed=event_observed,
            timeline=timeline,
            label=label,
            entry=entry,
            ci_labels=ci_labels,
        )

        # Mirror the fitted inputs of the Nelson-Aalen fitter on this object.
        for shared_attr in ("durations", "event_observed", "timeline", "entry", "event_table"):
            setattr(self, shared_attr, getattr(naf, shared_attr))

        # estimation
        cumulative_hazard = naf.cumulative_hazard_
        hazard_bounds = naf.confidence_interval_
        self.survival_function_ = np.exp(-cumulative_hazard)
        self.confidence_interval_ = np.exp(-hazard_bounds)

        # estimation methods
        self._estimate_name = "survival_function_"
        self._estimation_method = "survival_function_"
        self._update_docstrings()

        # plotting functions
        self.plot_survival_function = self.plot
        return self
Beispiel #41
0
    def plot(ix=None,
             iloc=None,
             flat=False,
             show_censors=False,
             censor_styles=None,
             ci_legend=False,
             ci_force_lines=False,
             ci_alpha=0.25,
             ci_show=True,
             at_risk_counts=False,
             bandwidth=None,
             **kwargs):
        """
        Plot the estimate of the enclosing fitter (``cls``) with optional
        censoring markers, confidence intervals and at-risk counts.

        Parameters:
          ix: selector applied through ``.ix[...]`` to the estimate, the
             confidence intervals and the censored rows of the event table.
             Cannot be combined with ``iloc``.
          iloc: selector applied through ``.iloc[...]`` in the same way. When
             neither ``ix`` nor ``iloc`` is given, everything is plotted.
          flat: if True, forces ``ci_force_lines`` and ``show_censors`` on.
          show_censors: draw a marker at each censored time, provided the event
             table contains at least one censored observation.
          censor_styles: dict of marker options merged over the defaults
             ``{'marker': '+', 'ms': 12, 'mew': 1}``.
          ci_legend: accepted for interface compatibility; not read by this
             function.
          ci_force_lines: draw the confidence bounds as lines instead of a
             shaded band.
          ci_alpha: transparency of the shaded confidence band.
          ci_show: whether to draw confidence intervals at all.
          at_risk_counts: if True, call ``add_at_risk_counts`` on the axes.
          bandwidth: required when the plotted estimate is ``"hazard_"``;
             passed to the smoothed-hazard methods of the fitter.
          kwargs: forwarded to the estimate's ``.plot`` call. ``ax`` is
             created when absent; ``c``/``color`` choose the line colour;
             ``drawstyle`` defaults to 'steps-post'.

        Returns:
          the axes the plot was drawn on (``kwargs['ax']``).

        Raises:
          ValueError: if both ``ix`` and ``iloc`` are given, or if the
             estimate is ``"hazard_"`` and no ``bandwidth`` is supplied.
        """

        from matplotlib import pyplot as plt

        if censor_styles is None:
            censor_styles = {}

        if (ix is not None and iloc is not None):
            raise ValueError('Cannot set both ix and iloc in call to .plot().')

        if "ax" not in kwargs:
            kwargs["ax"] = plt.figure().add_subplot(111)
        # Pop the short alias so that 'c' and 'color' are never both forwarded
        # to the plotting call below. The colour cycle is still advanced
        # unconditionally, exactly as before.
        kwargs['color'] = coalesce(kwargs.pop('c', None), kwargs.get('color'),
                                   next(kwargs["ax"]._get_lines.color_cycle))
        kwargs['drawstyle'] = coalesce(kwargs.get('drawstyle'), 'steps-post')

        # R-style graphics
        if flat:
            ci_force_lines = True
            show_censors = True

        if estimate == "hazard_":
            if bandwidth is None:
                raise ValueError('Must specify a bandwidth parameter in the ' +
                                 'call to plot_hazard.')
            estimate_ = cls.smoothed_hazard_(bandwidth)
            confidence_interval_ = \
                cls.smoothed_hazard_confidence_intervals_(bandwidth,
                                                          hazard_=estimate_.values[:, 0])
        else:
            confidence_interval_ = getattr(cls, 'confidence_interval_')
            estimate_ = getattr(cls, estimate)

        # did user specify certain indexes or locations?
        # Explicit identity checks: the former chained comparison
        # `iloc == ix is None` evaluated `iloc == ix` first, which is
        # ambiguous (and raises) for array-like selectors.
        if ix is None and iloc is None:
            user_submitted_ix = slice(None)
        else:
            user_submitted_ix = ix if ix is not None else iloc

        get_method = "ix" if ix is not None else "iloc"
        get_loc = lambda df: getattr(df, get_method)[user_submitted_ix]

        # plot censors
        if show_censors and cls.event_table['censored'].sum() > 0:
            cs = {'marker': '+', 'ms': 12, 'mew': 1}
            cs.update(censor_styles)
            times = get_loc(cls.event_table.ix[(cls.event_table['censored'] >
                                                0)]).index.values.astype(float)
            v = cls.predict(times)
            kwargs['ax'].plot(times,
                              v,
                              linestyle='None',
                              color=kwargs['color'],
                              **cs)

        # plot estimate
        get_loc(estimate_).plot(**kwargs)

        # plot confidence intervals
        if ci_show:
            if ci_force_lines:
                get_loc(confidence_interval_).plot(linestyle="-",
                                                   linewidth=1,
                                                   color=[kwargs['color']],
                                                   legend=True,
                                                   drawstyle=kwargs.get(
                                                       'drawstyle', 'default'),
                                                   ax=kwargs['ax'],
                                                   alpha=0.6)
            else:
                x = get_loc(confidence_interval_).index.values.astype(float)
                lower = get_loc(
                    confidence_interval_.filter(like='lower')).values[:, 0]
                upper = get_loc(
                    confidence_interval_.filter(like='upper')).values[:, 0]
                fill_between_steps(x,
                                   lower,
                                   y2=upper,
                                   ax=kwargs['ax'],
                                   alpha=ci_alpha,
                                   color=kwargs['color'],
                                   linewidth=1.0)

        if at_risk_counts:
            add_at_risk_counts(cls, ax=kwargs['ax'])

        return kwargs['ax']