Esempio n. 1
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Overlay the Kaplan-Meier empirical CDF and the fitted model's CDF on one axes.

    Parameters
    ------------
    model:
        fitted lifelines univariate model; its ``durations``, ``event_observed``
        and (when ``timeline`` is not given) ``timeline`` attributes are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        forwarded to both the empirical curve's plot call and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that was drawn on.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Fit the non-parametric estimator with the same censoring scheme as the model.
    if CensoringType.is_left_censoring(model):
        empirical_kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=empirical_label,
            timeline=timeline,
        )
    elif CensoringType.is_right_censoring(model):
        empirical_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=empirical_label,
            timeline=timeline,
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    empirical_kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    # Parametric curve: evaluate the equivalent scipy.stats distribution on the timeline.
    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
    def _ll_null(self):
        """
        Return the log-likelihood of an intercept-only refit of this model.

        A fresh instance of the same class is fit to this model's data with
        every fitted parameter regressed on a constant ``_intercept`` column
        only. The resulting ``log_likelihood_`` is cached on ``self._ll_null_``
        and returned directly on later calls.

        Raises
        -------
        NotImplementedError
            if this model was fit with left censoring.
        """
        try:
            return self._ll_null_
        except AttributeError:
            pass  # not cached yet; compute below

        parameter_names = self._fitted_parameter_names
        initial_point = np.zeros(len(parameter_names))

        # NOTE(review): the last breakpoint is dropped when cloning - presumably
        # the constructor appends one itself; confirm against the class.
        model = self.__class__(breakpoints=self.breakpoints[:-1], penalizer=self.penalizer)
        regressors = {name: ["_intercept"] for name in parameter_names}

        if CensoringType.is_right_censoring(self):
            null_df = pd.DataFrame(
                {
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "_intercept": 1.0,
                }
            )
            model.fit_right_censoring(
                null_df,
                "T",
                "E",
                initial_point=initial_point,
                entry_col="entry",
                regressors=regressors,
            )
        elif CensoringType.is_interval_censoring(self):
            null_df = pd.DataFrame(
                {
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "_intercept": 1.0,
                }
            )
            model.fit_interval_censoring(
                null_df,
                "lb",
                "ub",
                "E",
                initial_point=initial_point,
                entry_col="entry",
                regressors=regressors,
            )
        if CensoringType.is_left_censoring(self):
            raise NotImplementedError()

        null_log_likelihood = model.log_likelihood_
        self._ll_null_ = null_log_likelihood
        return null_log_likelihood
Esempio n. 3
0
def cdf_plot(model, timeline=None, **plot_kwargs):
    """
    Plot the empirical CDF (from a Kaplan-Meier fit) against the fitted model's CDF.

    Parameters
    ------------
    model:
        fitted lifelines univariate model; its ``durations``, ``event_observed``
        and (when ``timeline`` is not given) ``timeline`` attributes are read.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    plot_kwargs:
        forwarded to both plot calls. The ``"ax"`` entry is removed from it and
        used as the target axes.

    Returns
    --------
    ax:
        the axes that was drawn on.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.
    """
    from lifelines import KaplanMeierFitter

    # NOTE(review): set_kwargs_ax presumably guarantees plot_kwargs["ax"] exists
    # (the pop below has no default) - confirm against its definition.
    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    if timeline is None:
        timeline = model.timeline

    # Legend label for the Kaplan-Meier curve. This is a CDF plot, so the curve
    # must not be labelled as "quantiles".
    COL_EMP = "empirical CDF"

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(model.durations,
                                                     model.event_observed,
                                                     label=COL_EMP,
                                                     timeline=timeline)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(model.durations,
                                                      model.event_observed,
                                                      label=COL_EMP,
                                                      timeline=timeline)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    # Parametric curve: evaluate the equivalent scipy.stats distribution on the timeline.
    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)
    ax.plot(timeline,
            dist_object.cdf(timeline),
            label="fitted %s" % dist,
            **plot_kwargs)
    ax.legend()
    return ax
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the initial point as a 3-element array:
     ``[mean(log data), log(std(log data)), 0.1]``.

     The data that is logged depends on the censoring type: ``Ts[0]`` for
     right censoring, ``Ts[1]`` for left censoring, and ``Ts[1] - Ts[0]``
     for interval censoring. ``E`` and ``*args`` are not used.
     """
     if CensoringType.is_right_censoring(self):
         raw = Ts[0]
     elif CensoringType.is_left_censoring(self):
         raw = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         raw = Ts[1] - Ts[0]

     log_data = log(raw)
     location = log_data.mean()
     log_spread = log(log_data.std())
     return np.array([location, log_spread, 0.1])
Esempio n. 5
0
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the initial point as a 2-element array: ``[median(T), 1.0]``.

     ``T`` is ``Ts[0]`` for right censoring, ``Ts[1]`` for left censoring and
     the width ``Ts[1] - Ts[0]`` for interval censoring. ``E`` and ``*args``
     are not used.
     """
     if CensoringType.is_right_censoring(self):
         observed = Ts[0]
     elif CensoringType.is_left_censoring(self):
         observed = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         observed = Ts[1] - Ts[0]

     center = np.median(observed)
     return np.array([center, 1.0])
Esempio n. 6
0
 def _get_initial_values(self, Ts, E, *args):
     """
     Return the initial values as a 3-element array:
     ``[mean(log data), log(std(log data)), 1.0]``.

     The data that is logged is ``Ts[0]`` for right censoring, ``Ts[1]`` for
     left censoring and ``Ts[1] - Ts[0]`` for interval censoring. ``E`` and
     ``*args`` are not used.
     """
     if CensoringType.is_right_censoring(self):
         raw = Ts[0]
     elif CensoringType.is_left_censoring(self):
         raw = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         raw = Ts[1] - Ts[0]

     log_data = np.log(raw)
     location = log_data.mean()
     log_spread = np.log(log_data.std())
     return np.array([location, log_spread, 1.0])
Esempio n. 7
0
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the initial point as a 2-element array: ``[median(log T), 1.0]``.

     ``T`` is ``Ts[0]`` for right censoring and ``Ts[1]`` for both left and
     interval censoring. ``E`` and ``*args`` are not used.
     """
     if CensoringType.is_right_censoring(self):
         source = Ts[0]
     elif CensoringType.is_left_censoring(self) or CensoringType.is_interval_censoring(self):
         # left and interval censoring both start from the second element of Ts
         source = Ts[1]

     log_T = np.log(source)
     return np.array([np.median(log_T), 1.0])
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the initial point as a 3-element array:
     ``[mean(log data), log(std(log data) + 0.01), 0.1]``.

     The data that is logged is ``Ts[0]`` for right censoring, ``Ts[1]`` for
     left censoring and ``Ts[1] - Ts[0] + 0.01`` for interval censoring.
     ``E`` and ``*args`` are not used.
     """
     if CensoringType.is_right_censoring(self):
         raw = Ts[0]
     elif CensoringType.is_left_censoring(self):
         raw = Ts[1]
     elif CensoringType.is_interval_censoring(self):
         # log(0) would occur when Ts[1] == Ts[0], so a small fudge factor is added.
         raw = Ts[1] - Ts[0] + 0.01

     log_data = log(raw)
     location = log_data.mean()
     # the same fudge factor guards against taking the log of a zero std
     log_spread = log(log_data.std() + 0.01)
     return np.array([location, log_spread, 0.1])
Esempio n. 9
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Compare the empirical CDF (derived from a Kaplan-Meier fit) with the model CDF.

    Parameters
    ------------
    model: lifelines univariate model
        fitted model; its data attributes (``durations``/``event_observed`` or
        ``lower_bound``/``upper_bound``), ``weights`` and ``entry`` are reused
        for the empirical fit.
    timeline: iterable, optional
        points at which both curves are evaluated. Defaults to ``model.timeline``.
    ax: matplotlib axis, optional
        axes to draw on. Defaults to the current axes.
    plot_kwargs:
        forwarded to both the empirical curve's plot call and ``ax.plot``.

    Returns
    --------
    ax:
        the axes that was drawn on.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib import pyplot as plt

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Pick the fitting method and its positional data according to the
    # censoring scheme the model was fit with.
    estimator = KaplanMeierFitter()
    if CensoringType.is_left_censoring(model):
        fit_method = estimator.fit_left_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_right_censoring(model):
        fit_method = estimator.fit_right_censoring
        observations = (model.durations, model.event_observed)
    elif CensoringType.is_interval_censoring(model):
        fit_method = estimator.fit_interval_censoring
        observations = (model.lower_bound, model.upper_bound)

    empirical_kmf = fit_method(
        *observations,
        label=empirical_label,
        timeline=timeline,
        weights=model.weights,
        entry=model.entry,
    )
    empirical_kmf.plot_cumulative_density(ax=ax, **plot_kwargs)

    # Parametric curve: evaluate the equivalent scipy.stats distribution on the timeline.
    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)
    fitted_cdf = scipy_distribution.cdf(timeline)
    ax.plot(timeline, fitted_cdf, label="fitted %s" % distribution_name, **plot_kwargs)

    ax.legend()
    return ax
Esempio n. 10
0
 def _create_initial_point(self, Ts, E, *args):
     """
     Build the initial point as a 2-element array: ``[median(T), 1.0]``.

     ``T`` depends on the censoring type:

     - right censoring: ``Ts[0]``;
     - left censoring: ``Ts[1]`` floored at ``0.0001``;
     - interval censoring: the entries of ``Ts[1]`` below ``1e10`` when
       ``E.sum() > 0``, otherwise the single value ``1.0``.

     ``*args`` is not used.
     """
     if CensoringType.is_right_censoring(self):
         T = Ts[0]
     elif CensoringType.is_left_censoring(self):
         # np.clip's signature is (a, a_min, a_max). The data must come first,
         # otherwise the 0.0001 lower bound is never applied.
         T = np.clip(Ts[1], 0.0001, np.inf)
     elif CensoringType.is_interval_censoring(self):
         if E.sum() > 0:
             # Ts[1] can contain infs, so ignore this data
             okay_data = Ts[1] < 1e10
             T = Ts[1][okay_data]
         else:
             T = np.array([1.0])
     return np.array([np.median(T), 1.0])
Esempio n. 11
0
    def _ll_null(self):
        """
        Return the log-likelihood of an intercept-only refit of this model.

        A default-constructed instance of the same class is fit to this
        model's data with every fitted parameter regressed on a constant
        ``intercept`` column only; warnings raised during that fit are
        suppressed. The refit's ``_log_likelihood`` is cached on
        ``self._ll_null_`` and returned directly on later calls.

        Raises
        -------
        NotImplementedError
            if this model was fit with left censoring.
        """
        try:
            return self._ll_null_
        except AttributeError:
            pass  # not cached yet; compute below

        parameter_names = self._fitted_parameter_names
        initial_point = np.zeros(len(parameter_names))
        regressors = {param: ["intercept"] for param in parameter_names}

        model = self.__class__()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            if CensoringType.is_right_censoring(self):
                null_df = pd.DataFrame({
                    "T": self.durations,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                })
                model.fit_right_censoring(
                    null_df,
                    "T",
                    "E",
                    initial_point=initial_point,
                    entry_col="entry",
                    regressors=regressors,
                )
            elif CensoringType.is_interval_censoring(self):
                null_df = pd.DataFrame({
                    "lb": self.lower_bound,
                    "ub": self.upper_bound,
                    "E": self.event_observed,
                    "entry": self.entry,
                    "intercept": 1,
                })
                model.fit_interval_censoring(
                    null_df,
                    "lb",
                    "ub",
                    "E",
                    initial_point=initial_point,
                    entry_col="entry",
                    regressors=regressors,
                )
            if CensoringType.is_left_censoring(self):
                raise NotImplementedError()

        null_log_likelihood = model._log_likelihood
        self._ll_null_ = null_log_likelihood
        return null_log_likelihood
Esempio n. 12
0
def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
    """
    Fit the Kaplan-Meier estimator used for the empirical Cumulative
    Distribution Function of ``model``'s data.

    The estimator is fit with the same censoring scheme, ``weights`` and
    ``entry`` as ``model``. As written, the fitted estimator is neither
    plotted nor returned, and ``plot_kwargs`` is not used.

    Parameters
    ------------
    model:
        fitted lifelines univariate model.
    timeline: iterable, optional
        defaults to ``model.timeline``.
    ax: matplotlib axes, optional
        defaults to ``plt.gca()``.
    """
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax
    timeline = model.timeline if timeline is None else timeline

    empirical_label = "empirical CDF"

    # Each censoring type is tested independently (plain ``if``, not ``elif``).
    if CensoringType.is_left_censoring(model):
        empirical_kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=empirical_label,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    if CensoringType.is_right_censoring(model):
        empirical_kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=empirical_label,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
    if CensoringType.is_interval_censoring(model):
        empirical_kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=empirical_label,
            timeline=timeline,
            weights=model.weights,
            entry=model.entry,
        )
Esempio n. 13
0
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot of the empirical CDF against the fitted
    parametric CDF. Large deviations from the line y=x can invalidate a model
    (though some natural deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the plot. Accepted, but not used by the current body.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    -------
    NotImplementedError
        if the model was fit with interval censoring.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    theoretical_col = "fitted %s quantiles" % distribution_name

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=empirical_col
        )
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=empirical_col
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    # Distinct probability levels reached by the empirical CDF.
    probabilities = np.unique(kmf.cumulative_density_.values[:, 0])

    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - probabilities, kmf.survival_function_)
    quantiles[theoretical_col] = scipy_distribution.ppf(probabilities)

    # Rows containing an infinite or zero quantile are discarded.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan)
    quantiles = quantiles.dropna()

    empirical_values = quantiles[empirical_col]
    upper = empirical_values.max()
    lower = empirical_values.min()

    quantiles.plot.scatter(theoretical_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax)

    # y = x reference line, with both axes limited to the empirical range
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)

    return ax
Esempio n. 14
0
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot of the empirical CDF against the fitted
    parametric CDF. Large deviations from the line y=x can invalidate a model
    (though some natural deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax: matplotlib axes, optional
        axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        kwargs for the plot. Accepted, but not used by the current body.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    In the interval censoring case the survival function is the row-wise mean
    of the estimator's survival function columns, and the probability levels
    come from the ``"<label>_lower"`` cumulative density column.

    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    distribution_name = get_distribution_name_of_lifelines_model(model)
    scipy_distribution = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    theoretical_col = "fitted %s quantiles" % distribution_name

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=COL_EMP,
            weights=model.weights,
            entry=model.entry,
        )
        sf = kmf.survival_function_[COL_EMP]
        cdf = kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=COL_EMP,
            weights=model.weights,
            entry=model.entry,
        )
        sf = kmf.survival_function_[COL_EMP]
        cdf = kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_interval_censoring(model):
        kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=COL_EMP,
            weights=model.weights,
            entry=model.entry,
        )
        sf = kmf.survival_function_.mean(1)
        cdf = kmf.cumulative_density_[COL_EMP + "_lower"]

    # Distinct probability levels reached by the empirical CDF.
    probabilities = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - probabilities, sf)
    quantiles[theoretical_col] = scipy_distribution.ppf(probabilities)

    # Rows containing an infinite or zero quantile are discarded.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan)
    quantiles = quantiles.dropna()

    empirical_values = quantiles[COL_EMP]
    upper = empirical_values.max()
    lower = empirical_values.min()

    quantiles.plot.scatter(theoretical_col, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)

    # y = x reference line, with both axes limited to the empirical range
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)

    return ax
Esempio n. 15
0
def survival_probability_calibration(model: RegressionFitter,
                                     training_df: pd.DataFrame,
                                     t0: float,
                                     ax=None):
    r"""
    Smoothed calibration curve for a time-to-event model. This is the analogue
    of a classification calibration curve, extended to survival probabilities
    and censoring. Draws a matplotlib figure and prints/returns two metrics.

    The model's prediction of :math:`P(T < \text{t0})` is calibrated against the observed frequencies.

    Parameters
    -------------

    model:
        a fitted lifelines regression model to be evaluated
    training_df: DataFrame
        the DataFrame used to train the model
    t0: float
        the time at which the probability of a prior event is evaluated. It is
        formatted with ``%d`` in the internal column name and in the labels.
    ax:
        matplotlib axes to draw on; defaults to ``plt.gca()``.

    Returns
    ----------
    ax:
        mpl axes
    ICI:
        mean absolute difference between predicted and observed
    E50:
        median absolute difference between predicted and observed

    https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8570

    """
    def _cloglog(prob):
        # complementary log-log transform
        return np.log(-np.log(1 - prob))

    if ax is None:
        ax = plt.gca()

    duration_col = model.duration_col
    event_col = model.event_col
    ccl_col = "ccl_at_%d" % t0

    # Predicted event probabilities at t0, clipped away from 0 and 1 so the
    # transform above stays finite.
    survival_at_t0 = model.predict_survival_function(training_df, times=[t0])
    predictions_at_t0 = np.clip(1 - survival_at_t0.T.squeeze(), 1e-10, 1 - 1e-10)

    # create new dataset with the predictions
    prediction_df = pd.DataFrame({
        ccl_col: _cloglog(predictions_at_t0),
        "constant": 1,
        duration_col: model.durations,
        event_col: model.event_observed,
    })

    # Fit the new dataset with a flexible spline model. It links the predicted
    # probabilities to actual survival and should be very flexible, almost to
    # the point of overfitting: its only job is to smooth the data.
    knots = 3
    regressors = {
        "beta_": [ccl_col],
        "gamma0_": ["constant"],
        "gamma1_": ["constant"],
        "gamma2_": ["constant"],
    }

    # this model is from examples/royson_crowther_clements_splines.py
    crc = CRCSplineFitter(knots, penalizer=0)
    if CensoringType.is_right_censoring(model):
        crc.fit_right_censoring(prediction_df, duration_col, event_col, regressors=regressors)
    elif CensoringType.is_left_censoring(model):
        crc.fit_left_censoring(prediction_df, duration_col, event_col, regressors=regressors)
    elif CensoringType.is_interval_censoring(model):
        crc.fit_interval_censoring(prediction_df, duration_col, event_col, regressors=regressors)

    # Evaluate the smoother on a grid spanning the predicted probabilities
    # (padded by 0.01 and kept inside [0, 1]); the grid is transformed too.
    grid_low = np.clip(predictions_at_t0.min() - 0.01, 0, 1)
    grid_high = np.clip(predictions_at_t0.max() + 0.01, 0, 1)
    x = np.linspace(grid_low, grid_high, 100)
    grid_df = pd.DataFrame({
        ccl_col: _cloglog(x),
        "constant": 1,
    })
    y = 1 - crc.predict_survival_function(grid_df, times=[t0]).T.squeeze()

    # plot our results
    ax.set_title(
        "Smoothed calibration curve of \npredicted vs observed probabilities of t ≤ %d mortality"
        % t0)

    curve_color = "tab:red"
    ax.plot(x, y, label="smoothed calibration curve", color=curve_color)
    ax.set_xlabel("Predicted probability of \nt ≤ %d mortality" % t0)
    ax.set_ylabel("Observed probability of \nt ≤ %d mortality" % t0,
                  color=curve_color)
    ax.tick_params(axis="y", labelcolor=curve_color)

    # plot x=y line
    ax.plot(x, x, c="k", ls="--")
    ax.legend()

    # histogram of the original predictions on a twin y-axis
    # (the x-label was already set on ``ax``)
    hist_color = "tab:blue"
    twin_ax = ax.twinx()
    twin_ax.set_ylabel("Count of \npredicted probabilities", color=hist_color)
    twin_ax.tick_params(axis="y", labelcolor=hist_color)
    twin_ax.hist(predictions_at_t0, alpha=0.3, bins="sqrt", color=hist_color)

    plt.tight_layout()

    # Absolute gap between smoothed and original predictions, per row.
    smoothed_at_t0 = (1 - crc.predict_survival_function(prediction_df, times=[t0])).T.squeeze()
    deltas = (smoothed_at_t0 - predictions_at_t0).abs()
    ICI = deltas.mean()
    E50 = np.percentile(deltas, 50)
    print("ICI = ", ICI)
    print("E50 = ", E50)

    return ax, ICI, E50
Esempio n. 16
0
    def print_summary(self, decimals=2, **kwargs):
        """
        Print summary statistics describing the fit, the coefficients, and the error bounds.

        Output is written with ``print``: a header of fit metadata, the
        ``self.summary`` table, and then the concordance (right censoring only)
        and the log-likelihood ratio test.

        Parameters
        -----------
        decimals: int, optional (default=2)
            specify the number of decimal places to show
        kwargs:
            print additional metadata in the output (useful to provide model names, dataset names, etc.) when comparing
            multiple outputs. Each key/value pair is printed as its own header line.
            NOTE: there is no dedicated ``alpha`` parameter in this signature; an ``alpha`` keyword
            would be treated like any other metadata entry.

        """

        # Print information about data first
        justify = string_justify(18)
        print(self)
        # optional header lines are printed only when the attribute is truthy / positive
        if self.event_col:
            print("{} = '{}'".format(justify("event col"), self.event_col))
        if self.weights_col:
            print("{} = '{}'".format(justify("weights col"), self.weights_col))
        if self.penalizer > 0:
            print("{} = {}".format(justify("penalizer"), self.penalizer))

        if self.robust:
            print("{} = {}".format(justify("robust variance"), True))

        print("{} = {}".format(justify("number of subjects"),
                               self._n_examples))
        print("{} = {}".format(justify("number of events"),
                               self.event_observed.sum()))
        print("{} = {:.{prec}f}".format(justify("log-likelihood"),
                                        self._log_likelihood,
                                        prec=decimals))
        print("{} = {}".format(justify("time fit was run"),
                               self._time_fit_was_called))

        # caller-supplied metadata, one entry per line (each followed by a blank line)
        for k, v in kwargs.items():
            print("{} = {}\n".format(justify(k), v))

        print(end="\n")
        print("---")

        # coefficient table; "p" and "exp(coef)" columns get their own formatters
        df = self.summary
        print(
            df.to_string(
                float_format=format_floats(decimals),
                formatters={
                    "p": format_p_value(decimals),
                    "exp(coef)": format_exp_floats(decimals)
                },
            ))

        print("---")
        # the concordance line is only printed for right-censored fits
        if CensoringType.is_right_censoring(self):
            print("Concordance = {:.{prec}f}".format(self.score_,
                                                     prec=decimals))

        # silence numpy invalid/divide warnings while running and formatting the
        # test (e.g. -log2 of a zero p-value)
        with np.errstate(invalid="ignore", divide="ignore"):
            sr = self.log_likelihood_ratio_test()
            print(
                "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}"
                .format(sr.test_statistic,
                        sr.degrees_freedom,
                        -np.log2(sr.p_value),
                        prec=decimals))