def test_proportional_hazard_test_with_strata_weights_and_strata():
    """
    Run the proportional hazard test on a weighted, stratified Cox fit.

    The reference statistics presumably come from the R session below
    (TODO confirm against R's ``cox.zph`` output):

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    cz_km = cox.zph(c, transform='km')
    """
    observations = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "w": [1, 0.5, 2, 1, 1],
            "s": [1, 1, 0, 0, 0],
        }
    ).assign(E=True)  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(observations, "T", "E", weights_col="w", strata="s")

    # (time transform, reference statistic, relative tolerance); the "km"
    # comparison is deliberately loose.
    reference_cases = (
        ("identity", 0.0283, 1e-3),
        ("km", 0.0434, 1e-1),
    )
    for transform, expected_statistic, tolerance in reference_cases:
        outcome = stats.proportional_hazard_test(fitter, observations, time_transform=transform)
        npt.assert_allclose(
            outcome.summary.loc["var1"]["test_statistic"], expected_statistic, rtol=tolerance
        )
def test_proportional_hazard_test_with_kmf_with_some_censorship_and_weights():
    """
    Run the proportional hazard test on a weighted Cox fit with one censored subject.

    The fixture below must mirror the R data frame exactly, since the expected
    statistic is presumably taken from this R session (TODO confirm against R):

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='km')

    NOTE(review): the R call uses transform='km' while the Python call relies on
    the default ``time_transform`` -- confirm the two correspond.
    """
    df = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "E": [1, 1, 1, 0, 1],
            # Third weight is 2 (not 5) so the data matches the R reference above.
            "w": [1, 0.5, 2, 1, 1],
        }
    )

    cph = CoxPHFitter()
    # The fractional weight (0.5) is expected to trigger the non-integer weights warning.
    # The test call stays inside the block so any warning it emits is captured as well.
    with pytest.warns(StatisticalWarning, match="weights are not integers"):
        cph.fit(df, "T", "E", weights_col="w")
        results = stats.proportional_hazard_test(cph, df)
        npt.assert_allclose(results.summary.loc["var1"]["test_statistic"], 0.916, rtol=1e-2)
def test_proportional_hazard_test_with_kmf_with_some_censorship():
    """
    Run the proportional hazard test on an unweighted Cox fit with one censored subject.

    The reference statistic presumably comes from the R session below
    (TODO confirm against R's ``cox.zph`` output):

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df)
    cox.zph(c, transform='km')

    NOTE(review): the R call uses transform='km' while the Python call relies on
    the default ``time_transform`` -- confirm the two correspond.
    """
    covariate_values = [0.209325, 0.693919, 0.443804, 0.065636, 0.386294]
    durations = [5.269797, 6.601666, 7.335846, 11.684092, 12.678458]
    event_flags = [1, 1, 1, 0, 1]  # the fourth subject is censored
    observations = pd.DataFrame({"var1": covariate_values, "T": durations, "E": event_flags})

    fitter = CoxPHFitter()
    fitter.fit(observations, "T", "E")

    summary = stats.proportional_hazard_test(fitter, observations).summary
    npt.assert_allclose(summary.loc["var1"]["test_statistic"], 1.013802, rtol=1e-3)
def test_proportional_hazard_test_with_weights():
    """
    Run the proportional hazard test with several time transforms on a weighted Cox fit.

    Only the "rank" row is compared to a reference value, which presumably comes
    from the R session below (TODO confirm against R's ``cox.zph`` output):

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    observations = pd.DataFrame(
        {
            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
            "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
            "w": [1, 0.5, 2, 1, 1],
        }
    ).assign(E=True)  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(observations, "T", "E", weights_col="w")

    requested_transforms = ["km", "rank", "log", "identity"]
    outcome = stats.proportional_hazard_test(
        fitter, observations, time_transform=requested_transforms
    )
    # Also exercises the summary printer with an explicit argument of 5.
    outcome.print_summary(5)

    rank_row = outcome.summary.loc["var1", "rank"]
    npt.assert_allclose(rank_row["test_statistic"], 0.108, rtol=1e-2)
def test_proportional_hazard_test_with_log_transform():
    """
    Run the proportional hazard test with ``time_transform="log"`` on the regression dataset.

    The summary is compared against stored reference values for the three
    covariates' test statistics and for the p-value of ``var3``.
    """
    regression_df = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_df, "T", "E")

    summary = stats.proportional_hazard_test(fitter, regression_df, time_transform="log").summary

    # (covariate, summary column, reference value)
    reference_values = (
        ("var1", "test_statistic", 2.227627),
        ("var2", "test_statistic", 0.714427),
        ("var3", "test_statistic", 1.466321),
        ("var3", "p", 0.225927),
    )
    for covariate, column, expected in reference_values:
        npt.assert_allclose(summary.loc[covariate][column], expected, rtol=1e-3)
def test_proportional_hazard_test_with_all():
    """
    Request ``time_transform="all"`` and check the number of rows in the summary.

    With a single covariate the summary is expected to contain four rows
    (presumably one per available time transform -- TODO confirm).
    """
    covariate_values = [0.209325, 0.693919, 0.443804, 0.065636, 0.386294]
    durations = [5.269797, 6.601666, 7.335846, 11.684092, 12.678458]
    event_flags = [1, 1, 1, 0, 1]
    observations = pd.DataFrame({"var1": covariate_values, "T": durations, "E": event_flags})

    fitter = CoxPHFitter()
    fitter.fit(observations, "T", "E")

    outcome = stats.proportional_hazard_test(fitter, observations, time_transform="all")

    n_covariates = 1
    rows_per_covariate = 4
    assert outcome.summary.shape[0] == n_covariates * rows_per_covariate
def test_proportional_hazard_test():
    """
    Run the proportional hazard test with its default arguments on the regression dataset.

    The reference values presumably come from the R session below
    (TODO confirm; ``df`` is not defined in the snippet):

    c = coxph(formula=Surv(T, E) ~ var1 + var2 + var3, data=df)
    cz = cox.zph(c, transform='rank')
    cz
    """
    regression_df = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_df, "T", "E")

    summary = stats.proportional_hazard_test(fitter, regression_df).summary

    # (covariate, summary column, reference value)
    reference_values = (
        ("var1", "test_statistic", 1.4938293),
        ("var2", "test_statistic", 0.8792998),
        ("var3", "test_statistic", 2.2686088),
        ("var3", "p", 0.1320184),
    )
    for covariate, column, expected in reference_values:
        npt.assert_allclose(summary.loc[covariate][column], expected, rtol=1e-3)
'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn' ] le = LabelEncoder() df[cat_col] = df[cat_col].apply(le.fit_transform) ## Instantiate object cph = CoxPHFitter() ## Fit to data cph.fit(df, 'Tenure', 'Churn', show_progress=False) ## Test assumption results = proportional_hazard_test(cph, df, time_transform='all') y = results.summary y = y.drop('test_statistic', axis=1) x = y.unstack() x.columns = x.columns.droplevel() x['row_mean'] = x.mean(axis=1) x = x.sort_values(by='row_mean') st.markdown(''' When testing the proportional hazard assumption, the `lifelines` package offers several transform methods for the "time" parameter,
def check_assumptions(
    self,
    training_df: DataFrame,
    advice: bool = True,
    show_plots: bool = False,
    p_value_threshold: float = 0.01,
    plot_n_bootstraps: int = 10,
    columns: Optional[List[str]] = None,
) -> None:
    """
    Test the proportional hazards assumption and report covariates that appear to violate it.

    A worked example is available at
    https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html

    Parameters
    -----------
    training_df: DataFrame
        the DataFrame that was passed to ``fit(...)``, or a sub-sampled version of it.
        Its index must be unique.
    advice: bool, optional
        print explanatory text and suggested remedies to the user's screen.
    show_plots: bool, optional
        for each flagged covariate, plot the scaled Schoenfeld residuals together with
        loess curves. This is an eyeball test for violations and slows the function
        down significantly.
    p_value_threshold: float, optional
        a covariate is reported unless its smallest p-value (over the "rank" and "km"
        time transforms), rounded to two decimals, exceeds this threshold. See the
        note below.
    plot_n_bootstraps: int, optional
        number of bootstrapped loess curves overlaid on each plot. This slows the
        function down significantly.
    columns: list, optional
        restrict the check to this subset of columns.

    Raises
    -------
    IndexError
        if the index of ``training_df`` is not unique.

    Examples
    ----------
    >>> from lifelines.datasets import load_rossi
    >>> from lifelines import CoxPHFitter
    >>>
    >>> rossi = load_rossi()
    >>> cph = CoxPHFitter().fit(rossi, 'week', 'arrest')
    >>>
    >>> cph.check_assumptions(rossi)

    Notes
    -------
    The default ``p_value_threshold`` of 0.01 is arbitrary. Under the null, some
    covariates will fall below the threshold purely by chance, and more so when there
    are many covariates. Likewise, with many observations even minor deviations from
    proportional hazards get flagged. It is therefore best to combine the statistical
    tests with eyeball tests to find the most serious violations.

    References
    -----------
    section 5 in https://socialsciences.mcmaster.ca/jfox/Books/Companion/appendices/Appendix-Cox-Regression.pdf,
    http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf,
    http://eprints.lse.ac.uk/84988/1/06_ParkHendry2015-ReassessingSchoenfeldTests_Final.pdf
    """
    if not training_df.index.is_unique:
        raise IndexError(
            "`training_df` index should be unique for this exercise. Please make it unique or use `.reset_index(drop=True)` to force a unique index"
        )

    # The residuals are computed once and shared with the statistical test.
    scaled_residuals = self.compute_residuals(training_df, kind="scaled_schoenfeld")
    test_results = proportional_hazard_test(
        self,
        training_df,
        time_transform=["rank", "km"],
        precomputed_residuals=scaled_residuals,
    )

    residuals_and_duration = scaled_residuals.join(training_df[self.duration_col])
    n_rows = residuals_and_duration.shape[0]
    n_violations = 0

    for covariate in self.params_.index.intersection(columns or self.params_.index):
        smallest_p_value = test_results.summary.loc[covariate, "p"].min()
        if np.round(smallest_p_value, 2) > p_value_threshold:
            # Not flagged: move on to the next covariate.
            continue

        n_violations += 1

        # The preamble and the full test summary are printed only once, before the first violation.
        if n_violations == 1:
            if advice:
                print(
                    fill(
                        """The ``p_value_threshold`` is set at %g. Even under the null hypothesis of no violations, some covariates will be below the threshold by chance. This is compounded when there are many covariates. Similarly, when there are lots of observations, even minor deviances from the proportional hazard assumption will be flagged."""
                        % p_value_threshold,
                        width=100,
                    )
                )
                print()
                print(
                    fill(
                        """With that in mind, it's best to use a combination of statistical tests and visual tests to determine the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)`` and looking for non-constant lines. See link [A] below for a full example.""",
                        width=100,
                    )
                )
                print()
            test_results.print_summary()
            print()

        print()
        print(
            "%d. Variable '%s' failed the non-proportional test: p-value is %s."
            % (n_violations, covariate, format_p_value(4)(smallest_p_value)),
            end="\n\n",
        )

        if advice:
            value_counts = training_df[covariate].value_counts()
            n_uniques = len(value_counts)

            # Arbitrary cut-offs: at most 10 distinct values, each seen at least 5 times.
            # Meant to pick out dichotomous / low-cardinality covariates that can be used as strata.
            if n_uniques <= 10 and value_counts.min() >= 5:
                print(
                    fill(
                        " Advice: with so few unique values (only {0}), you can include `strata=['{1}', ...]` in the call in `.fit`. See documentation in link [E] below.".format(
                            n_uniques, covariate
                        ),
                        width=100,
                    )
                )
            else:
                remedies = (
                    """ Advice 1: the functional form of the variable '{var}' might be incorrect. That is, there may be non-linear terms missing. The proportional hazard test used is very sensitive to incorrect functional forms. See documentation in link [D] below on how to specify a functional form.""".format(
                        var=covariate
                    ),
                    """ Advice 2: try binning the variable '{var}' using pd.cut, and then specify it in `strata=['{var}', ...]` in the call in `.fit`. See documentation in link [B] below.""".format(
                        var=covariate
                    ),
                    """ Advice 3: try adding an interaction term with your time variable. See documentation in link [C] below.""",
                )
                for remedy in remedies:
                    print(fill(remedy, width=100), end="\n\n")

        if show_plots:
            from matplotlib import pyplot as plt

            fig = plt.figure()

            # One panel per time transformation, side by side.
            transformations = TimeTransformers().iter(["rank", "km"])
            for panel, (transform_name, transformer) in enumerate(transformations, start=1):
                p_value = test_results.summary.loc[(covariate, transform_name), "p"]

                ax = fig.add_subplot(1, 2, panel)

                covariate_residuals = residuals_and_duration[covariate]
                # Keep the transformed times of the subjects with an observed event only.
                transformed_time = transformer(self.durations, self.event_observed, self.weights)[
                    self.event_observed.values
                ]

                ax.scatter(transformed_time, covariate_residuals, alpha=0.75)

                smoothed = lowess(transformed_time.values, covariate_residuals.values)
                ax.plot(transformed_time, smoothed, color="k", alpha=1.0, linewidth=2)

                # Overlay lowess fits on bootstrap resamples as a rough visual
                # stand-in for confidence bands.
                for _ in range(plot_n_bootstraps):
                    resample = sorted(np.random.choice(n_rows, n_rows))
                    resampled_time = transformed_time.values[resample]
                    resampled_fit = lowess(resampled_time, covariate_residuals.values[resample])
                    ax.plot(resampled_time, resampled_fit, color="k", alpha=0.30)

                # Draw the zero reference line, then restore the limits it would otherwise stretch.
                best_xlim = ax.get_xlim()
                ax.hlines(0, 0, transformed_time.max(), linestyles="dashed", linewidths=1)
                ax.set_xlim(best_xlim)

                ax.set_xlabel(
                    "%s-transformed time\n(p=%.4f)" % (transform_name, p_value), fontsize=10
                )

            fig.suptitle("Scaled Schoenfeld residuals of '%s'" % covariate, fontsize=14)
            plt.tight_layout()
            plt.subplots_adjust(top=0.90)

    if n_violations == 0:
        print("Proportional hazard assumption looks okay.")
    elif advice:
        # Footer with the links that the advice messages refer to.
        print(
            dedent(
                r""" --- [A] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html [B] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it [C] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates [D] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form [E] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification """
            )
        )