Esempio n. 1
0
 def test_error_for_bad_nknots(self, spline_data):
     """Check that ``spline`` raises ValueError for each rejected ``n_knots`` value."""
     # Tried in order: a non-integer count, zero, a negative count, and 8.
     for bad_count in (1.5, 0, -1, 8):
         with pytest.raises(ValueError):
             spline_data['sp'] = spline(spline_data, 'v', n_knots=bad_count)
Esempio n. 2
0
 def test_error_for_unequal_numbers(self, spline_data):
     """Check that ``spline`` raises ValueError when ``n_knots`` differs from ``len(knots)``."""
     # Two knot locations are supplied each time; neither 1 nor 3 matches that.
     for declared_count in (1, 3):
         with pytest.raises(ValueError):
             spline_data['sp'] = spline(spline_data, 'v', n_knots=declared_count, knots=[1, 3])
Esempio n. 3
0
 def mcf(self):
     """Return the sample data with spline columns added and ``dead`` removed.

     Two spline columns each are created for ``cd40`` and ``age0``.
     """
     frame = ze.load_sample_data(False)
     spline_options = {'n_knots': 3, 'term': 2, 'restricted': True}
     frame[['cd4_rs1', 'cd4_rs2']] = ze.spline(frame, 'cd40', **spline_options)
     frame[['age_rs1', 'age_rs2']] = ze.spline(frame, 'age0', **spline_options)
     return frame.drop(columns=['dead'])
Esempio n. 4
0
def sdata():
    """Return the sample data with spline columns added and ``cd4_wk45`` removed.

    Two spline columns each are created for ``cd40`` and ``age0``.
    """
    frame = load_sample_data(False)
    spline_specs = (
        ('cd40', ['cd4_rs1', 'cd4_rs2']),
        ('age0', ['age_rs1', 'age_rs2']),
    )
    for source, targets in spline_specs:
        frame[targets] = spline(frame, source, n_knots=3, term=2, restricted=True)
    return frame.drop(columns=['cd4_wk45'])
Esempio n. 5
0
def causal_check():
    """Display the IPTW and SurvivalGFormula plots for visual inspection.

    Every figure is shown with ``plt.show()``; nothing is returned.
    """
    # ---- IPTW plots ----
    iptw_frame = load_sample_data(False)
    for source, targets in (('cd40', ['cd4_rs1', 'cd4_rs2']),
                            ('age0', ['age_rs1', 'age_rs2'])):
        iptw_frame[targets] = spline(iptw_frame, source, n_knots=3, term=2, restricted=True)

    weights = IPTW(iptw_frame, treatment='art', stabilized=True)
    weights.regression_models(
        'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
    weights.fit()

    weights.plot_love()
    plt.tight_layout()
    plt.show()
    # KDE then boxplot, each first with default options and then on the logit measure.
    for draw in (weights.plot_kde, weights.plot_boxplot):
        for options in ({}, {'measure': 'logit'}):
            draw(**options)
            plt.show()

    # ---- SurvivalGFormula plots ----
    wide = load_sample_data(False).drop(columns=['cd4_wk45'])
    wide['t'] = np.round(wide['t']).astype(int)
    # Repeat each row ``t`` times, then renumber ``t`` as 1..n within each id.
    long = pd.DataFrame(np.repeat(wide.values, wide['t'], axis=0), columns=wide.columns)
    long['t'] = long.groupby('id')['t'].cumcount() + 1
    # The event indicator is 1 only on the final row of an id with dead == 1.
    final_row_of_id = long['id'] != long['id'].shift(-1)
    long.loc[((long['dead'] == 1) & final_row_of_id), 'd'] = 1
    long['d'] = long['d'].fillna(0)
    long['t_sq'] = long['t']**2
    long['t_cu'] = long['t']**3

    survival = SurvivalGFormula(long, idvar='id', exposure='art', outcome='d', time='t')
    survival.outcome_model(
        model='art + male + age0 + cd40 + dvl0 + t + t_sq + t_cu')
    survival.fit(treatment='all')
    # Default plot first, then one with custom line styling.
    for line_style in ({}, {'c': 'r', 'linewidth': 3, 'alpha': 0.8}):
        survival.plot(**line_style)
        plt.show()
Esempio n. 6
0
 def test_higher_order_spline(self, spline_data):
     """Unrestricted two-knot spline (knots 10 and 16) raised to the non-integer power 3.7."""
     power = 3.7
     spline_data[['sp1', 'sp2']] = spline(spline_data, 'v', n_knots=2, knots=[10, 16],
                                          term=power, restricted=False)
     # Expected values listed per column, one entry per fixture row.
     wanted = pd.DataFrame({
         'sp1': [0.0, 0.0, 0.0, 5.0**power, 10.0**power],
         'sp2': [0.0, 0.0, 0.0, 0.0, 4.0**power],
     })
     pdt.assert_frame_equal(spline_data[['sp1', 'sp2']], wanted[['sp1', 'sp2']])
Esempio n. 7
0
 def test_auto_knots1(self, spline_data):
     """Unrestricted spline with one knot and no explicit knot location."""
     spline_data['sp'] = spline(spline_data, 'v', restricted=False, n_knots=1)
     # Expected values, one entry per fixture row.
     wanted = pd.DataFrame({'sp': [0.0, 0.0, 0.0, 5.0, 10.0]})
     pdt.assert_series_equal(spline_data['sp'], wanted['sp'])
Esempio n. 8
0
 def test_restricted_spline3(self, spline_data):
     """Restricted quadratic spline with explicit knots at 5 and 16."""
     spline_data['rsp'] = spline(spline_data, 'v', n_knots=2, knots=[5, 16],
                                 term=2, restricted=True)
     # Each non-zero entry is (v - 5)**2 minus the term for the knot at 16,
     # which is zero except in the last row.
     wanted_values = [
         0.0,
         0.0,
         (10.0 - 5.0)**2 - 0,
         (15.0 - 5.0)**2 - 0,
         (20.0 - 5.0)**2 - (20.0 - 16.0)**2,
     ]
     wanted = pd.DataFrame({'rsp': wanted_values})
     pdt.assert_series_equal(spline_data['rsp'], wanted['rsp'])
Esempio n. 9
0
 def test_restricted_spline1(self, spline_data):
     """Restricted spline with explicit knots at 10 and 16 and the default ``term``."""
     spline_data['rsp'] = spline(spline_data, 'v', restricted=True, n_knots=2, knots=[10, 16])
     # Expected values, one entry per fixture row.
     wanted = pd.DataFrame({'rsp': [0.0, 0.0, 0.0, 5.0, 6.0]})
     pdt.assert_series_equal(spline_data['rsp'], wanted['rsp'])
Esempio n. 10
0
 def test_cubic_spline1(self, spline_data):
     """Unrestricted cubic spline with a single explicit knot at 16."""
     spline_data['sp'] = spline(spline_data, 'v', n_knots=1, knots=[16],
                                term=3, restricted=False)
     # Only the last fixture row is expected to be non-zero.
     wanted = pd.DataFrame({'sp': [0.0, 0.0, 0.0, 0.0, 4.0**3]})
     pdt.assert_series_equal(spline_data['sp'], wanted['sp'])
Esempio n. 11
0
 def test_auto_knots2(self, spline_data):
     """Unrestricted spline with two knots and no explicit knot locations."""
     spline_data[['sp1', 'sp2']] = spline(spline_data, 'v', restricted=False, n_knots=2)
     # The expected values subtract 20/3 (first column) and 40/3 (second column).
     wanted = pd.DataFrame({
         'sp1': [0.0, 0.0, 10 - 20/3, 15 - 20/3, 20 - 20/3],
         'sp2': [0.0, 0.0, 0.0, 15 - 40/3, 20 - 40/3],
     })
     pdt.assert_frame_equal(spline_data[['sp1', 'sp2']], wanted[['sp1', 'sp2']])
Esempio n. 12
0
def causal_check():
    """Display the IPTW diagnostic plots for visual inspection.

    Every figure is shown with ``plt.show()``; nothing is returned.
    """
    # 9) Check IPTW plots
    frame = load_sample_data(False)
    for source, targets in (('cd40', ['cd4_rs1', 'cd4_rs2']),
                            ('age0', ['age_rs1', 'age_rs2'])):
        frame[targets] = spline(frame, source, n_knots=3, term=2, restricted=True)

    weights = IPTW(frame, treatment='art', stabilized=True)
    weights.regression_models('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
    weights.fit()

    weights.plot_love()
    plt.tight_layout()
    plt.show()
    # KDE then boxplot, each first with default options and then on the logit measure.
    for draw in (weights.plot_kde, weights.plot_boxplot):
        for options in ({}, {'measure': 'logit'}):
            draw(**options)
            plt.show()
Esempio n. 13
0
def mc_gformula_check():
    """Run the time-varying g-formula on the sample data and plot the result.

    Builds lagged and polynomial covariates, registers the exposure, outcome
    and covariate models, fits the g-formula, and finally draws Kaplan-Meier
    based curves (1 - survival) for the simulated ('Natural') and observed
    ('True') data. The figure is shown; nothing is returned.
    """
    records = load_sample_data(timevary=True)

    # Lagged covariates: the first record of each id takes a baseline value
    # (0 for art, cd40 for cd4, dvl0 for dvl); later records take the previous row.
    first_record = records.groupby('id').cumcount() == 0
    for current, baseline in (('art', 0), ('cd4', records['cd40']), ('dvl', records['dvl0'])):
        records['lag_' + current] = np.where(first_record, baseline, records[current].shift(1))

    # Age spline
    records[['age_rs0', 'age_rs1', 'age_rs2']] = spline(records, 'age0', n_knots=4, term=2, restricted=True)
    # Squared and cubed terms for baseline cd4, current cd4 and entry time
    for base in ('cd40', 'cd4', 'enter'):
        records[base + '_sq'] = records[base] ** 2
        records[base + '_cu'] = records[base] ** 3

    g = TimeVaryGFormula(records, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    exposure_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + 
            cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.exposure_model(exposure_formula, restriction="g['lag_art']==0")

    outcome_formula = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + 
            cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.outcome_model(outcome_formula, restriction="g['drop']==0")

    dvl_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_formula, var_type='binary')

    cd4_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    # Recode passed along with the cd4 model: floor cd4 at 1 and rebuild its powers.
    cd4_recode = ("g['cd4'] = np.maximum(g['cd4'],1);"
                  "g['cd4_sq'] = g['cd4']**2;"
                  "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_formula, recode=cd4_recode,
                          var_type='continuous')

    lag_map = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
    entry_recode = ("g['enter_sq'] = g['enter']**2;"
                    "g['enter_cu'] = g['enter']**3")
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags=lag_map,
          sample=10000, t_max=None,
          in_recode=entry_recode)

    # Keep only the last row of each simulated individual.
    simulated = g.predicted_outcomes
    is_last_row = simulated.uid_g_zepid != simulated.uid_g_zepid.shift(-1)
    final_rows = simulated.loc[is_last_row].copy()

    km_simulated = KaplanMeierFitter()
    km_simulated.fit(durations=final_rows['out'], event_observed=final_rows['dead'])
    km_observed = KaplanMeierFitter()
    km_observed.fit(durations=records['out'], event_observed=records['dead'], entry=records['enter'])

    for fitter, colour, legend_label in ((km_simulated, 'g', 'Natural'), (km_observed, 'k', 'True')):
        plt.step(fitter.event_table.index, 1 - fitter.survival_function_,
                 c=colour, where='post', label=legend_label)
    plt.legend()
    plt.show()
Esempio n. 14
0
 def test_complete_mc_procedure_completes(self):
     """Run the whole Monte Carlo g-formula workflow and check the output type.

     The only assertion is that ``predicted_outcomes`` is a pandas DataFrame
     after fitting.
     """
     records = load_sample_data(timevary=True)

     # Lagged covariates: the first record of each id takes a baseline value
     # (0 for art, cd40 for cd4, dvl0 for dvl); later records take the previous row.
     first_record = records.groupby('id').cumcount() == 0
     for current, baseline in (('art', 0), ('cd4', records['cd40']), ('dvl', records['dvl0'])):
         records['lag_' + current] = np.where(first_record, baseline, records[current].shift(1))

     # Age spline
     records[['age_rs0', 'age_rs1', 'age_rs2']] = spline(records, 'age0', n_knots=4, term=2, restricted=True)
     # Squared and cubed terms for baseline cd4, current cd4 and entry time
     for base in ('cd40', 'cd4', 'enter'):
         records[base + '_sq'] = records[base] ** 2
         records[base + '_cu'] = records[base] ** 3

     g = MonteCarloGFormula(records, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

     exposure_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + 
             cd4_cu + dvl + enter + enter_sq + enter_cu'''
     g.exposure_model(exposure_formula, restriction="g['lag_art']==0")

     outcome_formula = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + 
             cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
     g.outcome_model(outcome_formula, restriction="g['drop']==0")

     dvl_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
             lag_dvl + lag_art + enter + enter_sq + enter_cu'''
     g.add_covariate_model(label=1, covariate='dvl', model=dvl_formula, var_type='binary')

     cd4_formula = '''male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
             lag_dvl + lag_art + enter + enter_sq + enter_cu'''
     # Recode passed along with the cd4 model: floor cd4 at 1 and rebuild its powers.
     cd4_recode = ("g['cd4'] = np.maximum(g['cd4'],1);"
                   "g['cd4_sq'] = g['cd4']**2;"
                   "g['cd4_cu'] = g['cd4']**3")
     g.add_covariate_model(label=2, covariate='cd4', model=cd4_formula, recode=cd4_recode,
                           var_type='continuous')

     censoring_formula = """male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 +
              lag_dvl + lag_art + enter + enter_sq + enter_cu"""
     g.censoring_model(censoring_formula)

     lag_map = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
     entry_recode = ("g['enter_sq'] = g['enter']**2;"
                     "g['enter_cu'] = g['enter']**3")
     g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
           lags=lag_map,
           sample=5000, t_max=None,
           in_recode=entry_recode)

     assert isinstance(g.predicted_outcomes, pd.DataFrame)
Esempio n. 15
0
#########################################
# Causal Survival Analysis
from zepid import load_sample_data, spline
from zepid.causal.gformula import SurvivalGFormula

# Load the sample data and drop the 'cd4_wk45' column.
df = load_sample_data(False).drop(columns=['cd4_wk45'])
# Round follow-up time to whole units so it can be used as a repeat count.
df['t'] = np.round(df['t']).astype(int)
# Expand to long format: each row is repeated 't' times.
df = pd.DataFrame(np.repeat(df.values, df['t'], axis=0), columns=df.columns)
# Renumber 't' as 1, 2, ... within each id.
df['t'] = df.groupby('id')['t'].cumcount() + 1
# Event indicator 'd': 1 only on the last row of an id whose 'dead' is 1 ...
df.loc[((df['dead'] == 1) & (df['id'] != df['id'].shift(-1))), 'd'] = 1
# ... and 0 on every other row.
df['d'] = df['d'].fillna(0)

# Spline terms
# Restricted splines with term=2: four knots for time (stored in three columns),
# three knots each for baseline CD4 and age (stored in two columns each).
df[['t_rs1', 't_rs2', 't_rs3']] = spline(df,
                                         't',
                                         n_knots=4,
                                         term=2,
                                         restricted=True)
df[['cd4_rs1', 'cd4_rs2']] = spline(df,
                                    'cd40',
                                    n_knots=3,
                                    term=2,
                                    restricted=True)
df[['age_rs1', 'age_rs2']] = spline(df,
                                    'age0',
                                    n_knots=3,
                                    term=2,
                                    restricted=True)

sgf = SurvivalGFormula(df.drop(columns=['dead']),
                       idvar='id',
Esempio n. 16
0
def causal_check():
    """Run the diagnostics and plots of the causal estimators for visual inspection.

    Covers TimeFixedGFormula, IPTW, AIPTW, TMLE and SurvivalGFormula in that
    order. Figures are shown with ``plt.show()``; nothing is returned.
    """
    # Formula shared by the treatment/exposure models; the outcome models
    # use the same terms with 'art' in front.
    adjustment_terms = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    outcome_terms = 'art + ' + adjustment_terms

    frame = load_sample_data(False).drop(columns=['cd4_wk45'])
    for source, targets in (('cd40', ['cd4_rs1', 'cd4_rs2']),
                            ('age0', ['age_rs1', 'age_rs2'])):
        frame[targets] = spline(frame, source, n_knots=3, term=2, restricted=True)

    # ---- TimeFixedGFormula diagnostics ----
    gformula = TimeFixedGFormula(frame, exposure='art', outcome='dead')
    gformula.outcome_model(model=outcome_terms)
    gformula.run_diagnostics(decimal=3)

    # ---- IPTW plots ----
    weights = IPTW(frame, treatment='art', outcome='dead')
    weights.treatment_model(adjustment_terms, stabilized=True)
    weights.marginal_structural_model('art')
    weights.fit()
    weights.plot_love()
    plt.tight_layout()
    plt.show()
    # KDE then boxplot, each first with default options and then on the logit measure.
    for draw in (weights.plot_kde, weights.plot_boxplot):
        for options in ({}, {'measure': 'logit'}):
            draw(**options)
            plt.show()
    weights.run_diagnostics()

    # ---- AIPTW diagnostics, then TMLE diagnostics (same call sequence) ----
    for estimator_class in (AIPTW, TMLE):
        estimator = estimator_class(frame, exposure='art', outcome='dead')
        estimator.exposure_model(adjustment_terms)
        estimator.outcome_model(outcome_terms)
        estimator.fit()
        estimator.run_diagnostics()
        for which_model in ('exposure', 'outcome'):
            estimator.plot_kde(to_plot=which_model)
            plt.show()
        estimator.plot_love()
        plt.show()

    # ---- SurvivalGFormula plots ----
    wide = load_sample_data(False).drop(columns=['cd4_wk45'])
    wide['t'] = np.round(wide['t']).astype(int)
    # Repeat each row ``t`` times, then renumber ``t`` as 1..n within each id.
    long = pd.DataFrame(np.repeat(wide.values, wide['t'], axis=0), columns=wide.columns)
    long['t'] = long.groupby('id')['t'].cumcount() + 1
    # The event indicator is 1 only on the final row of an id with dead == 1.
    final_row_of_id = long['id'] != long['id'].shift(-1)
    long.loc[((long['dead'] == 1) & final_row_of_id), 'd'] = 1
    long['d'] = long['d'].fillna(0)
    long['t_sq'] = long['t']**2
    long['t_cu'] = long['t']**3

    survival = SurvivalGFormula(long, idvar='id', exposure='art', outcome='d', time='t')
    survival.outcome_model(
        model='art + male + age0 + cd40 + dvl0 + t + t_sq + t_cu')
    survival.fit(treatment='all')
    # Default plot first, then one with custom line styling.
    for line_style in ({}, {'c': 'r', 'linewidth': 3, 'alpha': 0.8}):
        survival.plot(**line_style)
        plt.show()
Esempio n. 17
0
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import family, links
import matplotlib.pyplot as plt

import zepid as ze
from zepid.causal.gformula import TimeFixedGFormula
from zepid.causal.doublyrobust import SimpleDoubleRobust

# Load the time-fixed sample data and add restricted spline terms (term=2)
# for baseline age and baseline CD4. n_knots is not passed, so the library
# default is used; the result is stored in two columns per variable.
df = ze.load_sample_data(timevary=False)
df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', term=2, restricted=True)
df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', term=2, restricted=True)

# Crude model: unadjusted measures for 'art' on 'dead'.
# NOTE(review): the return values are discarded — presumably these calls print
# their results; confirm against the installed zepid version.
ze.RiskRatio(df, exposure='art', outcome='dead')
ze.RiskDiff(df, exposure='art', outcome='dead')
# Adjusted model: binomial GLMs fitted with statsmodels.
model = 'art + male + age0 + cd40 + dvl0'
# Binomial family with identity link (linear risk model).
# NOTE(review): the link is passed as a class rather than an instance; newer
# statsmodels releases may expect an instance — confirm.
f = sm.families.family.Binomial(sm.families.links.identity)
linrisk = smf.glm('dead ~ ' + model, df, family=f).fit()
linrisk.summary()
# Binomial family with log link.
# NOTE(review): this model includes only 'art', not the adjustment set in
# `model`, although it sits under the adjusted-model heading — confirm intent.
f = sm.families.family.Binomial(sm.families.links.log)
log = smf.glm('dead ~ art', df, family=f).fit()
log.summary()
# g-formula: outcome model with the spline terms, fitted under treatment='all'.
g = TimeFixedGFormula(df, exposure='art', outcome='dead')
g.outcome_model(
    model=
    'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
g.fit(treatment='all')
Esempio n. 18
0
 def test_error_for_bad_order(self, spline_data):
     """Check that ``spline`` raises ValueError for the out-of-order knot list [3, 1, 2]."""
     unsorted_knots = [3, 1, 2]
     with pytest.raises(ValueError):
         spline_data['sp'] = spline(spline_data, 'v', n_knots=3, knots=unsorted_knots)
Esempio n. 19
0
import warnings
import numpy as np
import pandas as pd
import statsmodels.api as sm

from zepid import load_sample_data, spline


#######################################################################################################################
# Binary Outcome
#######################################################################################################################

# Load the time-fixed sample data and drop the 'cd4_wk45' column.
df = load_sample_data(timevary=False)
df = df.drop(columns=['cd4_wk45'])
# Restricted splines (three knots, term=2) for baseline CD4 and baseline age,
# stored in two columns each.
df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True)
df[['age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=3, term=2, restricted=True)

#############################
# Naive Risk Difference
from zepid import RiskDifference

# Unadjusted risk difference for 'art' on 'dead'.
rd = RiskDifference()
rd.fit(df, exposure='art', outcome='dead')
rd.summary()

#############################
# G-formula
from zepid.causal.gformula import TimeFixedGFormula

g = TimeFixedGFormula(df, exposure='art', outcome='dead')
g.outcome_model(model='art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',