Esempio n. 1
0
def test_proportional_hazard_test_with_strata_weights_and_strata():
    """
    Check the proportional hazard test statistic of a weighted, stratified Cox
    fit for the "identity" and "km" time transforms.

    Reference R session (presumably the source of the expected statistics
    asserted below):

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    cz_km = cox.zph(c, transform='km')

    """
    frame = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "w": [1, 0.5, 2, 1, 1],
            "s": [1, 1, 0, 0, 0],
        }
    )
    # Every row is an observed event.
    frame["E"] = True

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E", weights_col="w", strata="s")

    # (time transform, expected statistic, relative tolerance); the "km"
    # check is deliberately run with the looser tolerance.
    expectations = (
        ("identity", 0.0283, 1e-3),
        ("km", 0.0434, 1e-1),
    )
    for transform, expected, tolerance in expectations:
        outcome = stats.proportional_hazard_test(fitter, frame, time_transform=transform)
        observed = outcome.summary.loc["var1"]["test_statistic"]
        npt.assert_allclose(observed, expected, rtol=tolerance)
def test_proportional_hazard_test_with_kmf_with_some_censorship_and_weights():
    """
    Check the proportional hazard test statistic of a weighted Cox fit on data
    with one censored row, and that fitting warns about non-integer weights.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='km')

    NOTE(review): the R snippet uses a third weight of 2 while the Python
    frame below uses 5 -- confirm which one produced the expected 0.916.
    """
    frame = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "E": [1, 1, 1, 0, 1],
            "w": [1, 0.5, 5, 1, 1],
        }
    )

    fitter = CoxPHFitter()
    # The fit, the test and the assertion all run inside the warning capture.
    with pytest.warns(StatisticalWarning, match="weights are not integers"):
        fitter.fit(frame, "T", "E", weights_col="w")
        outcome = stats.proportional_hazard_test(fitter, frame)
        observed = outcome.summary.loc["var1"]["test_statistic"]
        npt.assert_allclose(observed, 0.916, rtol=1e-2)
def test_proportional_hazard_test_with_kmf_with_some_censorship():
    """
    Check the proportional hazard test statistic of an unweighted Cox fit on
    data where one of the five rows is censored.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df)
    cox.zph(c, transform='km')
    """
    frame = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "E": [1, 1, 1, 0, 1],
        }
    )

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E")

    # No time_transform is passed, so the library default is exercised.
    outcome = stats.proportional_hazard_test(fitter, frame)
    observed = outcome.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 1.013802, rtol=1e-3)
Esempio n. 4
0
def test_proportional_hazard_test_with_weights():
    """
    Run the proportional hazard test with four time transforms on a weighted
    Cox fit and check the "rank" statistic for ``var1``.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    frame = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "w": [1, 0.5, 2, 1, 1],
        }
    )
    # Every row is an observed event.
    frame["E"] = True

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E", weights_col="w")

    transforms = ["km", "rank", "log", "identity"]
    outcome = stats.proportional_hazard_test(fitter, frame, time_transform=transforms)
    outcome.print_summary(5)

    # With several transforms the summary is looked up by (variable, transform).
    rank_row = outcome.summary.loc["var1", "rank"]
    npt.assert_allclose(rank_row["test_statistic"], 0.108, rtol=1e-2)
def test_proportional_hazard_test_with_log_transform():
    """
    Check the "log"-transform test statistics for var1..var3, and the p-value
    of var3, on the regression dataset.
    """
    frame = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E")

    outcome = stats.proportional_hazard_test(fitter, frame, time_transform="log")

    expected_statistics = (
        ("var1", 2.227627),
        ("var2", 0.714427),
        ("var3", 1.466321),
    )
    for covariate, expected in expected_statistics:
        observed = outcome.summary.loc[covariate]["test_statistic"]
        npt.assert_allclose(observed, expected, rtol=1e-3)

    npt.assert_allclose(outcome.summary.loc["var3"]["p"], 0.225927, rtol=1e-3)
Esempio n. 6
0
def test_proportional_hazard_test_with_all():
    """
    Check that ``time_transform="all"`` yields four summary rows for a model
    with a single covariate (presumably one row per available transform --
    TODO confirm against ``proportional_hazard_test``).
    """
    frame = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "E": [1, 1, 1, 0, 1],
        }
    )

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E")

    outcome = stats.proportional_hazard_test(fitter, frame, time_transform="all")

    n_covariates = 1
    rows_per_covariate = 4
    n_rows = outcome.summary.shape[0]
    assert n_rows == n_covariates * rows_per_covariate
def test_proportional_hazard_test():
    """
    Check the default-transform test statistics for var1..var3, and the
    p-value of var3, on the regression dataset.

    Reference R session:

    c = coxph(formula=Surv(T, E) ~ var1 + var2 + var3, data=df)
    cz = cox.zph(c, transform='rank')
    cz
    """
    frame = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E")

    outcome = stats.proportional_hazard_test(fitter, frame)

    expected_statistics = (
        ("var1", 1.4938293),
        ("var2", 0.8792998),
        ("var3", 2.2686088),
    )
    for covariate, expected in expected_statistics:
        observed = outcome.summary.loc[covariate]["test_statistic"]
        npt.assert_allclose(observed, expected, rtol=1e-3)

    npt.assert_allclose(outcome.summary.loc["var3"]["p"], 0.1320184, rtol=1e-3)
Esempio n. 8
0
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                'PaperlessBilling', 'PaymentMethod', 'Churn'
            ]

            le = LabelEncoder()

            df[cat_col] = df[cat_col].apply(le.fit_transform)

            ## Instantiate object
            cph = CoxPHFitter()

            ## Fit to data
            cph.fit(df, 'Tenure', 'Churn', show_progress=False)

            ## Test assumption
            results = proportional_hazard_test(cph, df, time_transform='all')

            y = results.summary

            y = y.drop('test_statistic', axis=1)

            x = y.unstack()

            x.columns = x.columns.droplevel()

            x['row_mean'] = x.mean(axis=1)

            x = x.sort_values(by='row_mean')

            st.markdown('''
            When testing the proportional hazard assumption, the `lifelines` package offers several transform methods for the "time" parameter, 
Esempio n. 9
0
    def check_assumptions(
        self,
        training_df: DataFrame,
        advice: bool = True,
        show_plots: bool = False,
        p_value_threshold: float = 0.01,
        plot_n_bootstraps: int = 10,
        columns: Optional[List[str]] = None,
    ) -> None:
        """
        Use this function to test the proportional hazards assumption. See usage example at
        https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html

        Scaled Schoenfeld residuals are computed from ``training_df`` and handed to
        ``proportional_hazard_test`` with the "rank" and "km" time transforms. For each tested
        variable, the smaller of its two p-values is rounded to two decimals and compared with
        ``p_value_threshold``; variables that are not above the threshold are reported. All output
        is printed (and optionally plotted); the method returns ``None``.


        Parameters
        -----------

        training_df: DataFrame
            the original DataFrame used in the call to ``fit(...)`` or a sub-sampled version.
            Its index must be unique.
        advice: bool, optional
            display advice as output to the user's screen. The test summary table and the
            per-variable failure messages are printed regardless of this flag.
        show_plots: bool, optional
            display plots of the scaled schoenfeld residuals and loess curves. This is an eyeball test for violations.
            This will slow down the function significantly. Plots are only produced for variables
            that are flagged.
        p_value_threshold: float, optional
            the threshold to use to alert the user of violations. See note below.
        plot_n_bootstraps: int, optional
            in the plots displayed, also display plot_n_bootstraps bootstrapped loess curves. This will slow down
            the function significantly.
        columns: list, optional
            specify a subset of columns to test. Names that are not in ``self.params_.index`` are
            ignored; ``None`` or an empty list tests every variable.


        Raises
        -------
        IndexError
            if the index of ``training_df`` is not unique.


        Examples
        ----------

        >>> from lifelines.datasets import load_rossi
        >>> from lifelines import CoxPHFitter
        >>>
        >>> rossi = load_rossi()
        >>> cph = CoxPHFitter().fit(rossi, 'week', 'arrest')
        >>>
        >>> cph.check_assumptions(rossi)


        Notes
        -------
        The ``p_value_threshold`` is arbitrarily set at 0.01. Under the null, some covariates
        will be below the threshold (i.e. by chance). This is compounded when there are many covariates.

        Similarly, when there are lots of observations, even minor deviances from the proportional hazard
        assumption will be flagged.

        With that in mind, it's best to use a combination of statistical tests and eyeball tests to
        determine the most serious violations.

        Because the p-value is rounded to two decimals before the comparison, a p-value slightly
        above the threshold (e.g. 0.014 with the default threshold of 0.01) is still flagged.


        References
        -----------
        section 5 in https://socialsciences.mcmaster.ca/jfox/Books/Companion/appendices/Appendix-Cox-Regression.pdf,
        http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf,
        http://eprints.lse.ac.uk/84988/1/06_ParkHendry2015-ReassessingSchoenfeldTests_Final.pdf
        """

        # NOTE(review): the unique index is presumably required by the index-based
        # join of residuals and durations below -- confirm.
        if not training_df.index.is_unique:
            raise IndexError(
                "`training_df` index should be unique for this exercise. Please make it unique or use `.reset_index(drop=True)` to force a unique index"
            )

        # Compute the residuals once and pass them to the test so they are also
        # available for plotting further down.
        residuals = self.compute_residuals(training_df,
                                           kind="scaled_schoenfeld")
        test_results = proportional_hazard_test(
            self,
            training_df,
            time_transform=["rank", "km"],
            precomputed_residuals=residuals)

        residuals_and_duration = residuals.join(training_df[self.duration_col])

        # counter: number of variables flagged so far; n: number of residual rows,
        # used as the bootstrap sample size when plotting.
        counter = 0
        n = residuals_and_duration.shape[0]

        # Only fitted variables are considered; `columns or ...` falls back to all
        # of them when `columns` is None or empty.
        for variable in self.params_.index.intersection(columns
                                                        or self.params_.index):
            # smallest p-value over the "rank" and "km" transforms
            minumum_observed_p_value = test_results.summary.loc[variable,
                                                                "p"].min()
            # The p-value is rounded to two decimals before being compared.
            if np.round(minumum_observed_p_value, 2) > p_value_threshold:
                continue

            counter += 1

            # On the first flagged variable, print the general caveats (if advice
            # is requested) followed by the full test summary.
            if counter == 1:
                if advice:
                    print(
                        fill(
                            """The ``p_value_threshold`` is set at %g. Even under the null hypothesis of no violations, some covariates will be below the threshold by chance. This is compounded when there are many covariates. Similarly, when there are lots of observations, even minor deviances from the proportional hazard assumption will be flagged."""
                            % p_value_threshold,
                            width=100,
                        ))
                    print()
                    print(
                        fill(
                            """With that in mind, it's best to use a combination of statistical tests and visual tests to determine the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)`` and looking for non-constant lines. See link [A] below for a full example.""",
                            width=100,
                        ))
                    print()
                test_results.print_summary()
                print()

            print()
            print(
                "%d. Variable '%s' failed the non-proportional test: p-value is %s."
                % (counter, variable,
                   format_p_value(4)(minumum_observed_p_value)),
                end="\n\n",
            )

            if advice:
                values = training_df[variable]
                value_counts = values.value_counts()
                n_uniques = value_counts.shape[0]

                # Arbitrarily chosen cut-offs to decide whether the variable can be used
                # as a strata column: at most 10 distinct values, each occurring at least
                # 5 times. This should capture dichotomous / low cardinality values.
                if n_uniques <= 10 and value_counts.min() >= 5:
                    print(
                        fill(
                            "   Advice: with so few unique values (only {0}), you can include `strata=['{1}', ...]` in the call in `.fit`. See documentation in link [E] below."
                            .format(n_uniques, variable),
                            width=100,
                        ))
                else:
                    print(
                        fill(
                            """   Advice 1: the functional form of the variable '{var}' might be incorrect. That is, there may be non-linear terms missing. The proportional hazard test used is very sensitive to incorrect functional forms. See documentation in link [D] below on how to specify a functional form."""
                            .format(var=variable),
                            width=100,
                        ),
                        end="\n\n",
                    )
                    print(
                        fill(
                            """   Advice 2: try binning the variable '{var}' using pd.cut, and then specify it in `strata=['{var}', ...]` in the call in `.fit`. See documentation in link [B] below."""
                            .format(var=variable),
                            width=100,
                        ),
                        end="\n\n",
                    )
                    print(
                        fill(
                            """   Advice 3: try adding an interaction term with your time variable. See documentation in link [C] below.""",
                            width=100,
                        ),
                        end="\n\n",
                    )

            if show_plots:

                # Imported here, so matplotlib is only loaded when plots are requested.
                from matplotlib import pyplot as plt

                # One figure per flagged variable, with one subplot per time transform.
                fig = plt.figure()

                # plot variable against all time transformations.
                for i, (transform_name, transformer) in enumerate(
                        TimeTransformers().iter(["rank", "km"]), start=1):
                    p_value = test_results.summary.loc[(variable,
                                                        transform_name), "p"]

                    ax = fig.add_subplot(1, 2, i)

                    y = residuals_and_duration[variable]
                    # transformed times, restricted to rows with an observed event
                    tt = transformer(self.durations, self.event_observed,
                                     self.weights)[self.event_observed.values]

                    ax.scatter(tt, y, alpha=0.75)

                    y_lowess = lowess(tt.values, y.values)
                    ax.plot(tt, y_lowess, color="k", alpha=1.0, linewidth=2)

                    # bootstrap some possible other lowess lines: resample n row positions
                    # (np.random.choice samples with replacement by default), sort them,
                    # re-fit the lowess curve and draw it semi-transparently.
                    for _ in range(plot_n_bootstraps):
                        ix = sorted(np.random.choice(n, n))
                        tt_ = tt.values[ix]
                        y_lowess = lowess(tt_, y.values[ix])
                        ax.plot(tt_, y_lowess, color="k", alpha=0.30)

                    # Remember the x-limits before drawing the zero reference line and
                    # restore them afterwards, so the line does not change the x-range.
                    best_xlim = ax.get_xlim()
                    ax.hlines(0,
                              0,
                              tt.max(),
                              linestyles="dashed",
                              linewidths=1)
                    ax.set_xlim(best_xlim)

                    ax.set_xlabel("%s-transformed time\n(p=%.4f)" %
                                  (transform_name, p_value),
                                  fontsize=10)

                fig.suptitle("Scaled Schoenfeld residuals of '%s'" % variable,
                             fontsize=14)
                plt.tight_layout()
                plt.subplots_adjust(top=0.90)

        # Legend for the [A]-[E] links referenced in the advice text above.
        if advice and counter > 0:
            print(
                dedent(r"""
                ---
                [A]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html
                [B]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it
                [C]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates
                [D]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form
                [E]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification
            """))

        # Printed whenever nothing was flagged, independent of `advice`.
        if counter == 0:
            print("Proportional hazard assumption looks okay.")