def test_compare_tmle_continuous(self, cf):
    """Compare StochasticTMLE with TMLE on the log-transformed continuous outcome.

    The difference between the stochastic marginal outcomes fitted with
    ``p=1.0`` and ``p=0.0`` must agree (``atol=1e-3``) with the TMLE average
    treatment effect estimated from the same models.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    # The outcome is analysed on the log scale; this overwrites the column
    # on the data frame supplied by the fixture.
    cf['cd4_wk45'] = np.log(cf['cd4_wk45'])

    # Stochastic estimator, fitted once with p=1.0 and once with p=0.0
    stochastic = StochasticTMLE(cf, exposure='art', outcome='cd4_wk45')
    stochastic.exposure_model(g_formula)
    stochastic.outcome_model(q_formula)
    stochastic.fit(p=1.0, samples=1)
    everyone_treated = stochastic.marginal_outcome
    stochastic.fit(p=0.0, samples=1)
    nobody_treated = stochastic.marginal_outcome

    # Deterministic estimator used as the reference
    reference = TMLE(cf, exposure='art', outcome='cd4_wk45')
    reference.exposure_model(g_formula, print_results=False)
    reference.outcome_model(q_formula, print_results=False)
    reference.fit()

    npt.assert_allclose(reference.average_treatment_effect,
                        everyone_treated - nobody_treated,
                        atol=1e-3)
def test_sklearn_in_tmle_missing(self, mf):
    """Check TMLE estimates when a scikit-learn model drives every nuisance model.

    One ``LogisticRegression(C=1.0)`` instance is passed as ``custom_model``
    to the exposure, missing and outcome models; the resulting risk
    difference, risk ratio and odds ratio (and their confidence intervals)
    are compared with stored reference values.
    """
    covariates = 'male + age0 + cd40 + dvl0'
    learner = LogisticRegression(C=1.0)

    estimator = TMLE(mf, exposure='art', outcome='dead')
    estimator.exposure_model(covariates, custom_model=learner, print_results=False)
    estimator.missing_model(covariates, custom_model=learner, print_results=False)
    estimator.outcome_model('art + male + age0 + cd40 + dvl0',
                            custom_model=learner, print_results=False)
    estimator.fit()

    # (attribute, reference value, relative tolerance), checked in order:
    # RD, RD CI, RR, RR CI, OR, OR CI
    checks = (
        ('risk_difference', -0.090086, 1e-5),
        ('risk_difference_ci', [-0.160371, -0.019801], 1e-4),
        ('risk_ratio', 0.507997, 1e-5),
        ('risk_ratio_ci', [0.256495, 1.006108], 1e-4),
        ('odds_ratio', 0.457541, 1e-5),
        ('odds_ratio_ci', [0.213980, 0.978331], 1e-4),
    )
    for attribute, expected, tolerance in checks:
        npt.assert_allclose(getattr(estimator, attribute), expected, rtol=tolerance)
def test_error_when_no_models_specified3(self, df):
    """``fit()`` must raise ``ValueError`` when only the outcome model was specified."""
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.outcome_model(q_formula, print_results=False)

    # No exposure model has been given at this point
    with pytest.raises(ValueError):
        estimator.fit()
def test_missing_binary_outcome(self, mf):
    """Check TMLE with a missing-data model against stored reference values.

    Point estimates are compared with the default tolerance of
    ``assert_allclose``; confidence intervals use ``rtol=1e-5``.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    # measure -> (reference point estimate, reference confidence interval)
    references = (
        ('risk_difference', -0.08168098, (-0.15163818, -0.01172378)),
        ('risk_ratio', 0.5495056, (0.2893677, 1.0435042)),
        ('odds_ratio', 0.4996546, (0.2435979, 1.0248642)),
    )

    estimator = TMLE(mf, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    # The missing model uses the same terms as the outcome model
    estimator.missing_model(q_formula, print_results=False)
    estimator.fit()

    for measure, point, interval in references:
        npt.assert_allclose(getattr(estimator, measure), point)
        npt.assert_allclose(getattr(estimator, measure + '_ci'), interval, rtol=1e-5)
def test_match_r_tmle_riskratio(self, df):
    """The TMLE risk ratio must equal the stored reference value.

    NOTE(review): the test name suggests the reference comes from R's tmle
    package; that is not visible here.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_rr = 0.5344266

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_ratio, expected_rr)
def test_match_r_tmle_rd_ci(self, df):
    """The risk-difference confidence interval must match the reference (``rtol=1e-5``)."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_interval = (-0.1541104, -0.01470202)

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_difference_ci, expected_interval, rtol=1e-5)
def test_match_r_epsilons(self, df):
    """The fitted ``_epsilon`` values must match the two reference values (``rtol=1e-5``)."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_epsilons = [-0.016214091, 0.003304079]

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    # Reads the estimator's private attribute directly
    npt.assert_allclose(estimator._epsilon, expected_epsilons, rtol=1e-5)
def test_match_r_tmle_riskdifference(self, df):
    """The TMLE risk difference must equal the stored reference value."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_rd = -0.08440622

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_difference, expected_rd)
def test_match_r_tmle_rr_ci(self, df):
    """The risk-ratio confidence interval must match the reference (``rtol=1e-5``)."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_interval = (0.2773936, 1.0296262)

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_ratio_ci, expected_interval, rtol=1e-5)
def test_match_r_epsilons_continuous(self, cf):
    """For the continuous outcome, ``_epsilon`` must match the reference values.

    Both a relative and an absolute tolerance of ``1e-4`` are applied.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_epsilons = [-0.0046411652, 0.0002270186]

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator._epsilon, expected_epsilons, rtol=1e-4, atol=1e-4)
def test_no_ate_with_binary(self, df):
    """With the binary outcome ``dead``, the ATE and its interval must stay ``None``."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False, bound=[0.025, 0.9])
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    assert estimator.average_treatment_effect is None
    assert estimator.average_treatment_effect_ci is None
def tmle(outcome, treatment, data):
    """Estimate the average treatment effect with ``TMLE``.

    Every column of ``data`` other than ``outcome`` and ``treatment`` enters
    both nuisance models as an additive term.

    Parameters
    ----------
    outcome :
        Name of the outcome column in ``data``.
    treatment :
        Name of the treatment (exposure) column in ``data``.
    data :
        Data set holding the outcome, the treatment and the covariates; it
        must support ``drop(columns=...)`` and expose ``.columns``.

    Returns
    -------
    The ``average_treatment_effect`` attribute of the fitted ``TMLE`` object.
    """
    covariates = data.drop(columns=[outcome, treatment]).columns
    covariate_terms = [str(column) for column in covariates]

    # Exposure model: treatment regressed on the covariates only
    exposure_formula = ' + '.join(covariate_terms)
    # Outcome model: it has to condition on the treatment as well as on the
    # covariates; the covariate-only formula used before left the treatment
    # out of the outcome regression.
    outcome_formula = ' + '.join([str(treatment)] + covariate_terms)

    tml = TMLE(data, exposure=treatment, outcome=outcome)
    tml.exposure_model(exposure_formula)
    tml.outcome_model(outcome_formula)
    tml.fit()
    return tml.average_treatment_effect
def test_sklearn_in_tmle2(self, cf):
    """Check the continuous-outcome ATE when scikit-learn models are plugged in.

    A ``LogisticRegression(C=1.0)`` fits the exposure model and a
    ``LinearRegression`` fits the outcome model; the estimate and its
    confidence interval are compared with stored values (``rtol=1e-5``).
    """
    classifier = LogisticRegression(C=1.0)
    regressor = LinearRegression()

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model('male + age0 + cd40 + dvl0', custom_model=classifier)
    estimator.outcome_model('art + male + age0 + cd40 + dvl0', custom_model=regressor)
    estimator.fit()

    expected_ate = 236.049719
    expected_interval = [135.999264, 336.100175]
    npt.assert_allclose(estimator.average_treatment_effect, expected_ate, rtol=1e-5)
    npt.assert_allclose(estimator.average_treatment_effect_ci, expected_interval, rtol=1e-5)
def test_symmetric_bounds_on_gW(self, df):
    """Check the risk difference when the exposure model uses the scalar ``bound=0.1``."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_rd = -0.08203143
    expected_interval = (-0.1498092, -0.01425363)

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False, bound=0.1)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_difference, expected_rd)
    npt.assert_allclose(estimator.risk_difference_ci, expected_interval, rtol=1e-5)
def test_match_r_continuous_poisson(self, cf):
    """Check the ATE when the outcome model uses ``continuous_distribution='poisson'``.

    The estimate and its confidence interval are compared with stored
    reference values at ``rtol=1e-3``.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_ate = 223.4648
    expected_interval = (118.6276, 328.3019)

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula,
                            continuous_distribution='poisson',
                            print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.average_treatment_effect, expected_ate, rtol=1e-3)
    npt.assert_allclose(estimator.average_treatment_effect_ci, expected_interval, rtol=1e-3)
def test_asymmetric_bounds_on_gW(self, df):
    """Check the risk difference when the exposure model uses ``bound=[0.025, 0.9]``."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_rd = -0.08433208
    expected_interval = (-0.1541296, -0.01453453)

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model(g_formula, print_results=False, bound=[0.025, 0.9])
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.risk_difference, expected_rd)
    npt.assert_allclose(estimator.risk_difference_ci, expected_interval, rtol=1e-5)
def test_match_r_continuous_outcome_gbounds(self, cf):
    """Check the continuous-outcome ATE with ``bound=[0.025, 0.9]`` on the exposure model.

    The estimate and its confidence interval are compared with stored
    reference values at ``rtol=1e-3``.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_ate = 223.3958
    expected_interval = (118.4178, 328.3737)

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False, bound=[0.025, 0.9])
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.average_treatment_effect, expected_ate, rtol=1e-3)
    npt.assert_allclose(estimator.average_treatment_effect_ci, expected_interval, rtol=1e-3)
def test_match_r_continuous_outcome(self, cf):
    """Check the continuous-outcome ATE and its interval against reference values (``rtol=1e-3``)."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_ate = 223.4022
    expected_interval = (118.6037, 328.2008)

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.average_treatment_effect, expected_ate, rtol=1e-3)
    npt.assert_allclose(estimator.average_treatment_effect_ci, expected_interval, rtol=1e-3)
def test_no_risk_with_continuous(self, cf):
    """With the continuous outcome, every risk-based measure must stay ``None``."""
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    estimator = TMLE(cf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False, bound=[0.025, 0.9])
    estimator.outcome_model(q_formula, print_results=False)
    estimator.fit()

    # Point estimates first, then the confidence intervals
    unset_attributes = (
        'risk_difference', 'risk_ratio', 'odds_ratio',
        'risk_difference_ci', 'risk_ratio_ci', 'odds_ratio_ci',
    )
    for attribute in unset_attributes:
        assert getattr(estimator, attribute) is None
def test_missing_continuous_outcome(self, mcf):
    """Check the continuous-outcome ATE when a missing-data model is specified.

    The estimate and its confidence interval are compared with stored
    reference values at ``rtol=1e-3``.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    expected_ate = 211.8295
    expected_interval = (107.7552, 315.9038)

    estimator = TMLE(mcf, exposure='art', outcome='cd4_wk45')
    estimator.exposure_model(g_formula, print_results=False)
    estimator.outcome_model(q_formula, print_results=False)
    # The missing model uses the same terms as the outcome model
    estimator.missing_model(q_formula, print_results=False)
    estimator.fit()

    npt.assert_allclose(estimator.average_treatment_effect, expected_ate, rtol=1e-3)
    npt.assert_allclose(estimator.average_treatment_effect_ci, expected_interval, rtol=1e-3)
def test_sklearn_in_tmle(self, df):
    """Check binary-outcome TMLE estimates with a scikit-learn logistic regression.

    The same ``LogisticRegression(C=1.0)`` instance is passed as
    ``custom_model`` to the exposure and the outcome model. Point estimates
    use the default tolerance of ``assert_allclose``; intervals use
    ``rtol=1e-5``.
    """
    learner = LogisticRegression(C=1.0)

    estimator = TMLE(df, exposure='art', outcome='dead')
    estimator.exposure_model('male + age0 + cd40 + dvl0', custom_model=learner)
    estimator.outcome_model('art + male + age0 + cd40 + dvl0', custom_model=learner)
    estimator.fit()

    # (measure, reference point estimate, reference interval): RD, RR, OR
    references = (
        ('risk_difference', -0.091372098, [-0.1595425678, -0.0232016282]),
        ('risk_ratio', 0.4998833415, [0.2561223823, 0.9756404452]),
        ('odds_ratio', 0.4496171689, [0.2139277755, 0.944971255]),
    )
    for measure, point, interval in references:
        npt.assert_allclose(getattr(estimator, measure), point)
        npt.assert_allclose(getattr(estimator, measure + '_ci'), interval, rtol=1e-5)
def test_compare_tmle_binary(self, df):
    """Compare StochasticTMLE with TMLE on the binary outcome.

    The difference between the stochastic marginal outcomes fitted with
    ``p=1.0`` and ``p=0.0`` must agree (``atol=1e-4``) with the TMLE risk
    difference estimated from the same models.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    # Stochastic estimator, fitted once with p=1.0 and once with p=0.0
    stochastic = StochasticTMLE(df, exposure='art', outcome='dead')
    stochastic.exposure_model(g_formula)
    stochastic.outcome_model(q_formula)
    stochastic.fit(p=1.0, samples=1)
    everyone_treated = stochastic.marginal_outcome
    stochastic.fit(p=0.0, samples=1)
    nobody_treated = stochastic.marginal_outcome

    # Deterministic estimator used as the reference
    reference = TMLE(df, exposure='art', outcome='dead')
    reference.exposure_model(g_formula, print_results=False)
    reference.outcome_model(q_formula, print_results=False)
    reference.fit()

    npt.assert_allclose(reference.risk_difference,
                        everyone_treated - nobody_treated,
                        atol=1e-4)
def causal_check():
    """Run the diagnostics and plots of several causal estimators for visual inspection.

    Loads the sample data, adds restricted spline terms for ``cd40`` and
    ``age0``, then exercises in turn TimeFixedGFormula, IPTW, AIPTW, TMLE and
    SurvivalGFormula. Each figure is displayed with ``plt.show()``; nothing
    is returned.
    """
    g_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    q_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    data = load_sample_data(False).drop(columns=['cd4_wk45'])
    data[['cd4_rs1', 'cd4_rs2']] = spline(data, 'cd40', n_knots=3, term=2, restricted=True)
    data[['age_rs1', 'age_rs2']] = spline(data, 'age0', n_knots=3, term=2, restricted=True)

    # Check TimeFixedGFormula diagnostics
    gformula = TimeFixedGFormula(data, exposure='art', outcome='dead')
    gformula.outcome_model(model=q_formula)
    gformula.run_diagnostics(decimal=3)

    # Check IPTW plots
    weights = IPTW(data, treatment='art', outcome='dead')
    weights.treatment_model(g_formula, stabilized=True)
    weights.marginal_structural_model('art')
    weights.fit()
    weights.plot_love()
    plt.tight_layout()
    plt.show()
    weights.plot_kde()
    plt.show()
    weights.plot_kde(measure='logit')
    plt.show()
    weights.plot_boxplot()
    plt.show()
    weights.plot_boxplot(measure='logit')
    plt.show()
    weights.run_diagnostics()

    # Check AIPTW diagnostics, then TMLE diagnostics: both estimators go
    # through exactly the same sequence of calls.
    for estimator_class in (AIPTW, TMLE):
        doubly_robust = estimator_class(data, exposure='art', outcome='dead')
        doubly_robust.exposure_model(g_formula)
        doubly_robust.outcome_model(q_formula)
        doubly_robust.fit()
        doubly_robust.run_diagnostics()
        for target in ('exposure', 'outcome'):
            doubly_robust.plot_kde(to_plot=target)
            plt.show()
        doubly_robust.plot_love()
        plt.show()

    # Check SurvivalGFormula plots
    df = load_sample_data(False).drop(columns=['cd4_wk45'])
    df['t'] = np.round(df['t']).astype(int)
    # Repeat every row df['t'] times, then number the repeats 1..t within each id
    expanded = np.repeat(df.values, df['t'], axis=0)
    df = pd.DataFrame(expanded, columns=df.columns)
    df['t'] = df.groupby('id')['t'].cumcount() + 1
    # Set d=1 only on the last row of an id whose 'dead' value is 1
    died = df['dead'] == 1
    last_row_of_id = df['id'] != df['id'].shift(-1)
    df.loc[(died & last_row_of_id), 'd'] = 1
    df['d'] = df['d'].fillna(0)
    df['t_sq'] = df['t'] ** 2
    df['t_cu'] = df['t'] ** 3

    survival = SurvivalGFormula(df, idvar='id', exposure='art', outcome='d', time='t')
    survival.outcome_model(model='art + male + age0 + cd40 + dvl0 + t + t_sq + t_cu')
    survival.fit(treatment='all')
    survival.plot()
    plt.show()
    survival.plot(c='r', linewidth=3, alpha=0.8)
    plt.show()
aipw.fit()
# Print the summary of the fitted AIPW estimator
aipw.summary()

#############################
# TMLE
from zepid.causal.doublyrobust import TMLE

tmle = TMLE(df, exposure='art', outcome='dead')
# Exposure model, fitted with bound=0.01
tmle.exposure_model(
    'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
    bound=0.01,
    print_results=False,
)
# Missing-data model
tmle.missing_model(
    'art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
    print_results=False,
)
# Outcome model
tmle.outcome_model(
    'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
    print_results=False,
)
tmle.fit()
tmle.summary()

#############################
# Cross-fitting
from sklearn.ensemble import RandomForestClassifier
from zepid.superlearner import GLMSL, StepwiseSL, SuperLearner
from zepid.causal.doublyrobust import SingleCrossfitTMLE

# SuperLearner set-up: one label per candidate estimator, in the same order
labels = ["LogR", "Step.int", "RandFor"]
candidates = [
    GLMSL(sm.families.family.Binomial()),
    StepwiseSL(sm.families.family.Binomial(), selection="forward", order_interaction=0),
    RandomForestClassifier(random_state=809512),
]

# Single cross-fit TMLE
def test_error_when_no_models_specified1(self, df):
    """``fit()`` must raise ``ValueError`` when no nuisance model was specified at all."""
    estimator = TMLE(df, exposure='art', outcome='dead')

    # Neither exposure_model() nor outcome_model() has been called
    with pytest.raises(ValueError):
        estimator.fit()