Exemple #1
0
 def test_long_to_wide_conversion(self, longdata):
     """Verify ``_long_to_wide`` against a hand-written wide-format row.

     The ``longdata`` fixture is converted with ``id='id'`` and ``t='t'`` and
     compared to a single expected record indexed by ``id`` whose columns carry
     the suffixes ``_0``, ``_1`` and ``_2``.
     """
     # Column order the converted frame is compared against
     column_order = [
         'A_0', 'A_1', 'A_2', 'Y_0', 'Y_1', 'Y_2',
         'W_0', 'W_1', 'W_2', 'L_0', 'L_1', 'L_2',
     ]
     wide_record = {
         'A_0': 0, 'A_1': 1, 'A_2': 1,
         'Y_0': 0, 'Y_1': 0, 'Y_2': 1,
         'W_0': 5, 'W_1': 5, 'W_2': 5,
         'L_0': 25, 'L_1': 20, 'L_2': 31,
         'id': 1,
     }

     estimator = TimeVaryGFormula(longdata, idvar='id', exposure='A', outcome='Y',
                                  time_out='t', method='SequentialRegression')
     observed = estimator._long_to_wide(longdata, id='id', t='t')

     expected = pd.DataFrame.from_records([wide_record]).set_index('id')
     # check_names=False: the index/columns ``names`` attributes are not compared
     pdt.assert_frame_equal(observed, expected[column_order], check_names=False)
Exemple #2
0
 def test_error_covariate_label(self, sim_t_fixed_data):
     """Expect ``ValueError`` from ``add_covariate_model`` when ``label='first'``.

     NOTE(review): presumably the error is caused by the non-numeric label;
     confirm against ``add_covariate_model``.
     """
     settings = {
         'idvar': 'id',
         'exposure': 'A',
         'outcome': 'Y',
         'time_out': 't',
         'time_in': 't0',
     }
     estimator = TimeVaryGFormula(sim_t_fixed_data, **settings)
     with pytest.raises(ValueError):
         estimator.add_covariate_model(label='first', covariate='W1', model='W2')
Exemple #3
0
 def test_error_monte_carlo1(self, sim_t_fixed_data):
     """Expect ``ValueError`` when constructing without ``time_in`` or ``method``."""
     # NOTE(review): outcome='W1' may itself be what triggers the ValueError;
     # confirm this construction isolates the intended Monte Carlo check.
     constructor_kwargs = {
         'idvar': 'id',
         'exposure': 'A',
         'outcome': 'W1',
         'time_out': 't',
     }
     with pytest.raises(ValueError):
         TimeVaryGFormula(sim_t_fixed_data, **constructor_kwargs)
Exemple #4
0
 def test_error_estimation_method(self, sim_t_fixed_data):
     """Expect ``ValueError`` when constructing with ``method='Fail'``."""
     # NOTE(review): outcome='W1' may itself be what triggers the ValueError;
     # confirm the error really comes from the unrecognised method.
     with pytest.raises(ValueError):
         TimeVaryGFormula(
             sim_t_fixed_data,
             idvar='id', exposure='A', outcome='W1',
             time_out='t', method='Fail',
         )
Exemple #5
0
 def test_error_continuous_outcome(self, sim_t_fixed_data):
     """Expect ``ValueError`` when ``W1`` is supplied as the outcome column.

     NOTE(review): assumes ``W1`` is a non-binary column of the fixture, as the
     test name suggests; confirm against the fixture definition.
     """
     constructor_kwargs = dict(idvar='id', exposure='A', outcome='W1',
                               time_out='t', time_in='t0')
     with pytest.raises(ValueError):
         TimeVaryGFormula(sim_t_fixed_data, **constructor_kwargs)
Exemple #6
0
 def test_error_sr_other_models(self, sim_t_fixed_data):
     """Expect ``fit`` to raise ``ValueError`` once an exposure model was specified.

     The estimator uses ``method='SequentialRegression'``; both the outcome and
     the exposure model are specified before ``fit`` is called, and only the
     ``fit`` call is expected to raise.
     """
     seq_reg = TimeVaryGFormula(
         sim_t_fixed_data,
         idvar='id', exposure='A', outcome='Y',
         time_out='t', method='SequentialRegression',
     )
     seq_reg.outcome_model('A + W1_sq + W2 + W3', print_results=False)
     seq_reg.exposure_model('W1_sq', print_results=False)

     with pytest.raises(ValueError):
         seq_reg.fit(treatment='all')
Exemple #7
0
 def test_sr_custom_time_point(self, data):
     """Sequential-regression estimates at ``t_max=2`` match hard-coded references."""
     seq_reg = TimeVaryGFormula(data, idvar='id', exposure='A', outcome='Y',
                                time_out='t', method='SequentialRegression')
     seq_reg.outcome_model('A + L', print_results=False)

     # values come from R's ltmle package
     reference_by_plan = (
         ("all", 0.33492),
         ("none", 0.51228),
     )
     for plan, reference in reference_by_plan:
         seq_reg.fit(treatment=plan, t_max=2)
         npt.assert_allclose(seq_reg.predicted_outcomes, reference, rtol=1e-5)
Exemple #8
0
 def test_sr_gap_time(self, data):
     """Sequential-regression estimates using ``time_out='t2'`` match references.

     Fits under the "all" and "none" treatment plans in turn and compares each
     estimate to its hard-coded expected value.
     """
     seq_reg = TimeVaryGFormula(data, idvar='id', exposure='A', outcome='Y',
                                time_out='t2', method='SequentialRegression')
     seq_reg.outcome_model('A + L', print_results=False)

     reference_by_plan = (
         ("all", 0.4051569),
         ("none", 0.661226),
     )
     for plan, reference in reference_by_plan:
         seq_reg.fit(treatment=plan)
         npt.assert_allclose(seq_reg.predicted_outcomes, reference, rtol=1e-5)
Exemple #9
0
 def test_error_treatment_type(self, sim_t_fixed_data):
     """Expect ``fit(treatment=1)`` to raise ``ValueError`` for both configurations.

     The first estimator is built with ``time_in='t0'`` and no explicit method;
     the second with ``method='SequentialRegression'``.
     """
     shared = {'idvar': 'id', 'exposure': 'A', 'outcome': 'Y', 'time_out': 't'}
     configurations = (
         {'time_in': 't0'},
         {'method': 'SequentialRegression'},
     )
     for extra in configurations:
         estimator = TimeVaryGFormula(sim_t_fixed_data, **shared, **extra)
         with pytest.raises(ValueError):
             estimator.fit(treatment=1)
Exemple #10
0
 def test_sr_warning_outside_time_point(self, data):
     """Expect a ``UserWarning`` from ``fit`` with ``t_max=6``, then check the estimate.

     NOTE(review): presumably ``t_max=6`` lies beyond the observed time points,
     as the test name suggests; confirm against the ``data`` fixture.
     """
     settings = dict(idvar='id', exposure='A', outcome='Y',
                     time_out='t2', method='SequentialRegression')
     seq_reg = TimeVaryGFormula(data, **settings)
     seq_reg.outcome_model('A + L', print_results=False)

     with pytest.warns(UserWarning):
         seq_reg.fit(treatment="all", t_max=6)

     # The estimate is still produced and compared after the warning
     npt.assert_allclose(seq_reg.predicted_outcomes, 0.33492, rtol=1e-5)
Exemple #11
0
 def test_sr_custom_treatment(self, data):
     """Sequential regression with a string treatment rule matches the reference value.

     The treatment plan is passed as the expression ``g['t'] != 2`` rather than
     the "all"/"none" keywords.
     """
     seq_reg = TimeVaryGFormula(
         data,
         idvar='id', exposure='A', outcome='Y',
         time_out='t2', method='SequentialRegression',
     )
     seq_reg.outcome_model('A + L', print_results=False)

     custom_plan = "g['t'] != 2"
     seq_reg.fit(treatment=custom_plan)
     npt.assert_allclose(seq_reg.predicted_outcomes, 0.48543, rtol=1e-5)
Exemple #12
0
    def test_monte_carlo_for_single_t(self, sim_t_fixed_data):
        """Time-varying estimate for a single time point agrees with ``TimeFixedGFormula``.

        The mean of the ``'Y'`` column of the time-varying predictions is compared
        to the time-fixed ``marginal_outcome`` with ``rtol=1e-3``.
        """
        outcome_formula = 'A + W1_sq + W2 + W3'

        # Time-varying estimator (built with time_in and no explicit method)
        time_vary = TimeVaryGFormula(sim_t_fixed_data, idvar='id', exposure='A',
                                     outcome='Y', time_out='t', time_in='t0')
        time_vary.outcome_model(outcome_formula, print_results=False)
        time_vary.exposure_model('W1_sq', print_results=False)
        # Keep the sample size high to reduce simulation error
        time_vary.fit(treatment="all", sample=1000000)

        # Time-fixed estimator on the same data and outcome model
        time_fixed = TimeFixedGFormula(sim_t_fixed_data, exposure='A', outcome='Y')
        time_fixed.outcome_model(model=outcome_formula, print_results=False)
        time_fixed.fit(treatment='all')

        # Both estimation methods are expected to give the same result
        simulated_mean = np.mean(time_vary.predicted_outcomes['Y'])
        npt.assert_allclose(time_fixed.marginal_outcome, simulated_mean, rtol=1e-3)
Exemple #13
0
 def test_error_sr_recurrent_outcomes(self):
     """Expect ``fit`` to raise ``ValueError`` when one id has ``Y == 1`` on two rows.

     A three-row frame for a single id is built inline, with the outcome equal
     to 1 at both ``t=1`` and ``t=2``.
     """
     records = pd.DataFrame({
         'id': [1, 1, 1],
         'Y': [0, 1, 1],
         'A': [0, 1, 1],
         't': [0, 1, 2],
     })
     seq_reg = TimeVaryGFormula(records, idvar='id', exposure='A', outcome='Y',
                                time_out='t', method='SequentialRegression')
     seq_reg.outcome_model('A', print_results=False)

     with pytest.raises(ValueError):
         seq_reg.fit(treatment='all')
Exemple #14
0
    def test_sequential_regression_for_single_t(self, sim_t_fixed_data):
        """Sequential regression for a single time point agrees with ``TimeFixedGFormula``.

        Both estimators use the same outcome model and the "all" treatment plan;
        their results are compared with ``assert_allclose`` default tolerances.
        """
        outcome_formula = 'A + W1_sq + W2 + W3'

        # Sequential-regression estimate from the time-varying estimator
        seq_reg = TimeVaryGFormula(sim_t_fixed_data, idvar='id', exposure='A',
                                   outcome='Y', time_out='t',
                                   method='SequentialRegression')
        seq_reg.outcome_model(outcome_formula, print_results=False)
        seq_reg.fit(treatment="all")

        # Time-fixed estimate on the same data
        time_fixed = TimeFixedGFormula(sim_t_fixed_data, exposure='A', outcome='Y')
        time_fixed.outcome_model(model=outcome_formula, print_results=False)
        time_fixed.fit(treatment='all')

        # Both estimation methods are expected to give the same result
        npt.assert_allclose(time_fixed.marginal_outcome, seq_reg.predicted_outcomes)
def mc_gformula_check():
    """Fit ``TimeVaryGFormula`` on the time-varying sample data and plot the result.

    The sample data is loaded, lagged and polynomial covariates are derived, the
    exposure, outcome and two covariate models (``dvl``, ``cd4``) are specified,
    and the estimator is fit with 10000 samples. Kaplan-Meier curves are then
    fit to the last row of each simulated unit and to the observed data, and
    ``1 - survival`` is drawn for both ('Natural' in green, 'True' in black).

    The function returns nothing; it opens a matplotlib window via ``plt.show()``.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates: previous row's value, except on the first row of each
    # id, which gets 0 (art) or the corresponding baseline column (cd40 / dvl0).
    first_row = df.groupby('id').cumcount() == 0
    df['lag_art'] = np.where(first_row, 0, df['art'].shift(1))
    df['lag_cd4'] = np.where(first_row, df['cd40'], df['cd4'].shift(1))
    df['lag_dvl'] = np.where(first_row, df['dvl0'], df['dvl'].shift(1))

    # Age spline terms
    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms for baseline cd4, current cd4 and entry time
    for base in ('cd40', 'cd4', 'enter'):
        df[base + '_sq'] = df[base] ** 2
        df[base + '_cu'] = df[base] ** 3

    estimator = TimeVaryGFormula(df, idvar='id', exposure='art', outcome='dead',
                                 time_in='enter', time_out='out')

    # Exposure model, restricted to rows where lag_art is 0
    exposure_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + 
            cd4_cu + dvl + enter + enter_sq + enter_cu'''
    estimator.exposure_model(exposure_formula, restriction="g['lag_art']==0")

    # Outcome model, restricted to rows where drop is 0
    outcome_formula = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + 
            cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    estimator.outcome_model(outcome_formula, restriction="g['drop']==0")

    # Time-varying covariate models; labels 1 and 2 identify them
    dvl_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    estimator.add_covariate_model(label=1, covariate='dvl', model=dvl_formula, var_type='binary')

    cd4_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    # Recode applied to cd4: floor at 1 and rebuild its polynomial terms
    cd4_recode = ("g['cd4'] = np.maximum(g['cd4'],1);"
                  "g['cd4_sq'] = g['cd4']**2;"
                  "g['cd4_cu'] = g['cd4']**3")
    estimator.add_covariate_model(label=2, covariate='cd4', model=cd4_formula,
                                  recode=cd4_recode, var_type='continuous')

    lag_columns = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
    # in_recode rebuilds the entry-time polynomial terms
    entry_recode = ("g['enter_sq'] = g['enter']**2;"
                    "g['enter_cu'] = g['enter']**3")
    estimator.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
                  lags=lag_columns,
                  sample=10000,
                  t_max=None,
                  in_recode=entry_recode)

    # Keep only the last row of each simulated unit (where uid_g_zepid changes)
    simulated = estimator.predicted_outcomes
    is_last_row = simulated['uid_g_zepid'] != simulated['uid_g_zepid'].shift(-1)
    simulated_last = simulated.loc[is_last_row].copy()

    km_simulated = KaplanMeierFitter()
    km_simulated.fit(durations=simulated_last['out'], event_observed=simulated_last['dead'])
    km_observed = KaplanMeierFitter()
    km_observed.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    # Plot 1 - survival for the simulated and observed curves
    plt.step(km_simulated.event_table.index, 1 - km_simulated.survival_function_,
             c='g', where='post', label='Natural')
    plt.step(km_observed.event_table.index, 1 - km_observed.survival_function_,
             c='k', where='post', label='True')
    plt.legend()
    plt.show()
Exemple #16
0
 def test_complete_mc_procedure_completes(self):
     """Run the full estimator workflow on the sample data and check the result type.

     Lagged and polynomial covariates are derived, the exposure, outcome and two
     covariate models are specified, the estimator is fit with 10000 samples,
     and ``predicted_outcomes`` is asserted to be a ``DataFrame``.
     """
     df = load_sample_data(timevary=True)

     # Lagged covariates: previous row's value, except on the first row of each
     # id, which gets 0 (art) or the corresponding baseline column (cd40 / dvl0).
     first_row = df.groupby('id').cumcount() == 0
     df['lag_art'] = np.where(first_row, 0, df['art'].shift(1))
     df['lag_cd4'] = np.where(first_row, df['cd40'], df['cd4'].shift(1))
     df['lag_dvl'] = np.where(first_row, df['dvl0'], df['dvl'].shift(1))

     # Age spline terms
     spline_columns = ['age_rs0', 'age_rs1', 'age_rs2']
     df[spline_columns] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

     # Squared and cubed terms for baseline cd4, current cd4 and entry time
     for base in ('cd40', 'cd4', 'enter'):
         df[base + '_sq'] = df[base]**2
         df[base + '_cu'] = df[base]**3

     estimator = TimeVaryGFormula(df, idvar='id', exposure='art', outcome='dead',
                                  time_in='enter', time_out='out')

     # Exposure model, restricted to rows where lag_art is 0
     exposure_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + 
             cd4_cu + dvl + enter + enter_sq + enter_cu'''
     estimator.exposure_model(exposure_formula, restriction="g['lag_art']==0")

     # Outcome model, restricted to rows where drop is 0
     outcome_formula = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + 
             cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
     estimator.outcome_model(outcome_formula, restriction="g['drop']==0")

     # Time-varying covariate models; labels 1 and 2 identify them
     dvl_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
             lag_dvl + lag_art + enter + enter_sq + enter_cu'''
     estimator.add_covariate_model(label=1, covariate='dvl', model=dvl_formula,
                                   var_type='binary')

     cd4_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
             lag_dvl + lag_art + enter + enter_sq + enter_cu'''
     # Recode applied to cd4: floor at 1 and rebuild its polynomial terms
     cd4_recode = ("g['cd4'] = np.maximum(g['cd4'],1);"
                   "g['cd4_sq'] = g['cd4']**2;"
                   "g['cd4_cu'] = g['cd4']**3")
     estimator.add_covariate_model(label=2, covariate='cd4', model=cd4_formula,
                                   recode=cd4_recode, var_type='continuous')

     lag_columns = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
     # in_recode rebuilds the entry-time polynomial terms
     entry_recode = ("g['enter_sq'] = g['enter']**2;"
                     "g['enter_cu'] = g['enter']**3")
     estimator.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
                   lags=lag_columns,
                   sample=10000,
                   t_max=None,
                   in_recode=entry_recode)

     # Only completion and the result type are checked, not the estimates
     assert isinstance(estimator.predicted_outcomes, pd.DataFrame)