def test_nominal(self):
    # Nominal GEE via the old gee_setup_nominal recoding helper.
    family = Multinomial(3)

    endog_orig, exog_orig, groups = load_data("gee_nominal_1.csv",
                                              icept=False)

    data = np.concatenate((endog_orig[:, None], exog_orig,
                           groups[:, None]), axis=1)

    # Recode the response as indicator variables.
    endog, exog, exog_ne, nlevel = gee_setup_nominal(data, 0, [3])
    groups = exog_ne[:, 0]

    # Fit under working independence.
    v = Independence()
    md = GEE(endog, exog, groups, None, family, v)
    mdf1 = md.fit()

    # From statsmodels.GEE (not an independent test)
    cf1 = np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728]
    se1 = np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553]
    assert_almost_equal(mdf1.params, cf1, decimal=5)
    assert_almost_equal(mdf1.standard_errors(), se1, decimal=5)

    # Refit with global odds ratio dependence, warm-started.
    v = GlobalOddsRatio(nlevel, "nominal")
    md = GEE(endog, exog, groups, None, family, v)
    mdf2 = md.fit(start_params=mdf1.params)

    # From statsmodels.GEE (not an independent test)
    cf2 = np.r_[0.45397549, 0.42278345, -0.91997131, -0.50115943]
    se2 = np.r_[0.09646057, 0.07405713, 0.1324629, 0.09025019]
    assert_almost_equal(mdf2.params, cf2, decimal=5)
    assert_almost_equal(mdf2.standard_errors(), se2, decimal=5)
def test_nested_linear(self):
    # Nested covariance structure vs. independence on linear data.
    family = Gaussian()

    endog, exog, group = load_data("gee_nested_linear_1.csv")

    # Build the nesting indicator: alternating sub-groups of 5.
    group_n = []
    for i in range(endog.shape[0] // 10):
        group_n.extend([0, ] * 5)
        group_n.extend([1, ] * 5)
    group_n = np.array(group_n)[:, None]

    dp = Independence()
    md = GEE(endog, exog, group, None, family, dp)
    mdf1 = md.fit()

    # From statsmodels.GEE (not an independent test)
    cf = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106]
    se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989]
    assert_almost_equal(mdf1.params, cf, decimal=6)
    assert_almost_equal(mdf1.standard_errors(), se, decimal=6)

    ne = Nested()
    md = GEE(endog, exog, group, None, family, ne,
             dep_data=group_n)
    mdf2 = md.fit(start_params=mdf1.params)

    # From statsmodels.GEE (not an independent test)
    cf = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969]
    se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991]
    assert_almost_equal(mdf2.params, cf, decimal=6)
    assert_almost_equal(mdf2.standard_errors(), se, decimal=6)
def test_missing_formula(self):
    # Missing-data handling for the formula interface: dropping NaNs
    # through `missing='drop'` must match manual dropna.
    endog = np.random.normal(size=100)
    exog1 = np.random.normal(size=100)
    exog2 = np.random.normal(size=100)
    exog3 = np.random.normal(size=100)
    groups = np.kron(lrange(20), np.ones(5))

    # Inject NaNs into both endog and one regressor.
    endog[0] = np.nan
    endog[5:7] = np.nan
    exog2[10:12] = np.nan

    data = pd.DataFrame({"endog": endog, "exog1": exog1,
                         "exog2": exog2, "exog3": exog3,
                         "groups": groups})

    mod1 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                            groups, data, missing='drop')
    rslt1 = mod1.fit()

    assert_almost_equal(len(mod1.endog), 95)
    assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 4])

    data = data.dropna()
    groups = groups[data.index.values]

    mod2 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                            groups, data, missing='none')
    rslt2 = mod2.fit()

    assert_almost_equal(rslt1.params.values, rslt2.params.values)
    assert_almost_equal(rslt1.bse.values, rslt2.bse.values)
def test_nominal(self):
    # Nominal GEE using the model's own setup_nominal() recoding.
    family = Multinomial(3)

    endog, exog, groups = load_data("gee_nominal_1.csv",
                                    icept=False)

    # Fit under working independence.
    v = Independence()
    md = GEE(endog, exog, groups, None, family, v)
    md.setup_nominal()
    mdf1 = md.fit()

    # From statsmodels.GEE (not an independent test)
    cf1 = np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728]
    se1 = np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553]
    assert_almost_equal(mdf1.params, cf1, decimal=5)
    assert_almost_equal(mdf1.standard_errors(), se1, decimal=5)

    # Refit with global odds ratio dependence, warm-started.
    v = GlobalOddsRatio("nominal")
    md = GEE(endog, exog, groups, None, family, v)
    md.setup_nominal()
    mdf2 = md.fit(start_params=mdf1.params)

    # From statsmodels.GEE (not an independent test)
    cf2 = np.r_[0.45397549, 0.42278345, -0.91997131, -0.50115943]
    se2 = np.r_[0.09646057, 0.07405713, 0.1324629, 0.09025019]
    assert_almost_equal(mdf2.params, cf2, decimal=5)
    assert_almost_equal(mdf2.standard_errors(), se2, decimal=5)
def test_ordinal(self):
    # Ordinal GEE via the old gee_setup_ordinal recoding helper.
    family = Binomial()

    endog_orig, exog_orig, groups = load_data("gee_ordinal_1.csv",
                                              icept=False)

    data = np.concatenate((endog_orig[:, None], exog_orig,
                           groups[:, None]), axis=1)

    # Recode the response as cumulative indicators.
    endog, exog, intercepts, nlevel = gee_setup_ordinal(data, 0)

    exog1 = np.concatenate((intercepts, exog), axis=1)
    # Last column carries the group labels; strip it off.
    groups = exog1[:, -1]
    exog1 = exog1[:, 0:-1]

    v = GlobalOddsRatio(nlevel, "ordinal")

    beta = gee_ordinal_starting_values(endog_orig,
                                       exog_orig.shape[1])

    md = GEE(endog, exog1, groups, None, family, v)
    mdf = md.fit(start_params = beta)

    cf = np.r_[1.09238131, 0.02148193, -0.39879146, -0.01855666,
               0.02983409, 1.18123172, 0.01845318, -1.10233886]
    se = np.r_[0.10878752, 0.10326078, 0.11171241, 0.05488705,
               0.05995019, 0.0916574, 0.05951445, 0.08539281]

    assert_almost_equal(mdf.params, cf, decimal=5)
    assert_almost_equal(mdf.bse, se, decimal=5)
def test_missing(self):
    # Missing data handling when calling from the array api.  Missing
    # data handling does not currently work for formulas.
    endog = np.random.normal(size=100)
    exog = np.random.normal(size=(100, 3))
    exog[:, 0] = 1
    groups = np.kron(lrange(20), np.ones(5))

    # Inject NaNs into both endog and one regressor.
    endog[0] = np.nan
    endog[5:7] = np.nan
    exog[10:12, 1] = np.nan

    mod1 = GEE(endog, exog, groups, missing='drop')
    rslt1 = mod1.fit()

    assert_almost_equal(len(mod1.endog), 95)
    assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 3])

    # Fit again on the manually-cleaned data; results must match.
    ii = np.isfinite(endog) & np.isfinite(exog).all(1)
    mod2 = GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
    rslt2 = mod2.fit()

    assert_almost_equal(rslt1.params, rslt2.params)
    assert_almost_equal(rslt1.bse, rslt2.bse)
def test_ordinal_pandas(self):
    # Same ordinal fit as test_ordinal, but driven from a DataFrame.
    family = Binomial()

    endog_orig, exog_orig, groups = load_data("gee_ordinal_1.csv",
                                              icept=False)

    data = np.concatenate(
        (endog_orig[:, None], exog_orig, groups[:, None]), axis=1)
    data = pd.DataFrame(data)
    data.columns = ["endog", "x1", "x2", "x3", "x4", "x5", "group"]

    # Recode the response as cumulative indicators.
    endog, exog, intercepts, nlevel = \
        gee_setup_ordinal(data, "endog")

    exog1 = np.concatenate((intercepts, exog), axis=1)
    # Last column carries the group labels; strip it off.
    groups = exog1[:, -1]
    exog1 = exog1[:, 0:-1]

    v = GlobalOddsRatio(nlevel, "ordinal")

    beta = gee_ordinal_starting_values(endog_orig,
                                       exog_orig.shape[1])

    md = GEE(endog, exog1, groups, None, family, v)
    mdf = md.fit(start_params=beta)

    cf = np.r_[1.09238131, 0.02148193, -0.39879146, -0.01855666,
               0.02983409, 1.18123172, 0.01845318, -1.10233886]
    se = np.r_[0.10878752, 0.10326078, 0.11171241, 0.05488705,
               0.05995019, 0.0916574, 0.05951445, 0.08539281]

    assert_almost_equal(mdf.params, cf, decimal=2)
    assert_almost_equal(mdf.bse, se, decimal=2)
def test_default_time(self):
    # Check that the time defaults work correctly.
    endog, exog, group = load_data("gee_logistic_1.csv")

    # Explicit within-group time index 0, 1, 2, ... for the AR model.
    T = np.zeros(len(endog))
    idx = set(group)
    for ii in idx:
        jj = np.flatnonzero(group == ii)
        T[jj] = lrange(len(jj))

    family = Binomial()
    va = Autoregressive()

    # Default time (omitted) must reproduce the explicit index.
    md1 = GEE(endog, exog, group, family=family, cov_struct=va)
    mdf1 = md1.fit()

    md2 = GEE(endog, exog, group, time=T, family=family,
              cov_struct=va)
    mdf2 = md2.fit()

    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.standard_errors(),
                        mdf2.standard_errors(), decimal=6)
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    '''
    Build score predictions via (linear) poisson regression.

    Fits one Poisson model per team score on `hist_data`, attaches
    score predictions, and — when `comp_data` is supplied — derives
    win/tie/loss probabilities from the Skellam distribution of the
    score difference.
    '''
    hist_data_1 = hist_data[["team_1_score"] + feature_list]
    hist_data_2 = hist_data[["team_2_score"] + feature_list]

    formula_1 = "team_1_score ~ " + " + ".join(feature_list)
    formula_2 = "team_2_score ~ " + " + ".join(feature_list)

    # Fit Poisson models with the GEE package under an independence
    # working-correlation assumption.
    fam = Poisson()
    ind = Independence()
    model_1 = GEE.from_formula(formula_1, "team_1_score", hist_data,
                               cov_struct=ind, family=fam)
    model_2 = GEE.from_formula(formula_2, "team_2_score", hist_data,
                               cov_struct=ind, family=fam)
    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # Return historical data only when comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # Attach predictions to the competition data.
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    # Skellam(score_1_pred, score_2_pred) gives the distribution of
    # the score difference: P(diff > 0), P(diff == 0), P(diff < 0).
    comp_data['team_1_prob'] = comp_data[[
        'team_1_score_pred', 'team_2_score_pred'
    ]].apply(
        lambda x: 1 - skellam.cdf(0, x['team_1_score_pred'], x[
            'team_2_score_pred']), 1)
    comp_data['team_tie_prob'] = comp_data[[
        'team_1_score_pred', 'team_2_score_pred'
    ]].apply(
        lambda x: skellam.pmf(0, x['team_1_score_pred'],
                              x['team_2_score_pred']), 1)
    comp_data['team_2_prob'] = comp_data[[
        'team_1_score_pred', 'team_2_score_pred'
    ]].apply(
        lambda x: skellam.cdf(-1, x['team_1_score_pred'],
                              x['team_2_score_pred']), 1)

    return hist_data, comp_data
def test_missing():
    # gh-1877: formula and array interfaces must agree after NaNs are
    # dropped.
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]
    df = pd.DataFrame(data[1:], columns=data[0])
    # Mark every fake == 1 row as missing.  Use .loc: DataFrame.ix was
    # deprecated in pandas 0.20 and removed in 1.0.
    df.loc[df.fake == 1, 'fake'] = np.nan

    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    df = df.dropna()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    # The formula model (missing rows auto-dropped) and the array model
    # (manually cleaned) must see identical data and results.
    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
def test_missing():
    # gh-1877: formula and array interfaces must agree after NaNs are
    # dropped.
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]
    df = pd.DataFrame(data[1:], columns=data[0])
    # Mark every fake == 1 row as missing.  Use .loc: DataFrame.ix was
    # deprecated in pandas 0.20 and removed in 1.0.
    df.loc[df.fake == 1, 'fake'] = np.nan

    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    df = df.dropna()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    # The formula model (missing rows auto-dropped) and the array model
    # (manually cleaned) must see identical data and results.
    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
def test_post_estimation(self):
    # fittedvalues and resid must be consistent with the linear
    # predictor under a Gaussian family.
    family = Gaussian()
    endog, exog, group = load_data("gee_linear_1.csv")

    ve = Exchangeable()

    md = GEE(endog, exog, group, None, family, ve)
    mdf = md.fit()

    assert_almost_equal(np.dot(exog, mdf.params),
                        mdf.fittedvalues)
    assert_almost_equal(endog - np.dot(exog, mdf.params),
                        mdf.resid)
def test_wrapper(self):
    # Results-wrapper attributes must survive pandas inputs.
    endog, exog, group_n = load_data("gee_poisson_1.csv",
                                     icept=False)
    endog = pd.Series(endog)
    exog = pd.DataFrame(exog)
    group_n = pd.Series(group_n)

    family = Poisson()
    vi = Independence()

    mod = GEE(endog, exog, group_n, None, family, vi)
    rslt2 = mod.fit()

    check_wrapper(rslt2)
def test_scoretest(self):
    # Regression tests for the constrained-fit score test, plus a
    # sanity comparison against Wald p-values.
    np.random.seed(6432)
    n = 200  # Must be divisible by 4
    exog = np.random.normal(size=(n, 4))
    endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
    endog += 3*np.random.normal(size=n)
    group = np.kron(np.arange(n/4), np.ones(4))

    # Test under the null.
    L = np.array([[1., -1, 0, 0]])
    R = np.array([0., ])
    family = Gaussian()
    va = Independence()
    mod1 = GEE(endog, exog, group, family=family,
               cov_struct=va, constraint=(L, R))
    rslt1 = mod1.fit()
    assert_almost_equal(mod1.score_test_results["statistic"],
                        1.08126334)
    assert_almost_equal(mod1.score_test_results["p-value"],
                        0.2984151086)

    # Test under the alternative.
    L = np.array([[1., -1, 0, 0]])
    R = np.array([1.0, ])
    family = Gaussian()
    va = Independence()
    mod2 = GEE(endog, exog, group, family=family,
               cov_struct=va, constraint=(L, R))
    rslt2 = mod2.fit()
    assert_almost_equal(mod2.score_test_results["statistic"],
                        3.491110965)
    assert_almost_equal(mod2.score_test_results["p-value"],
                        0.0616991659)

    # Compare to Wald tests over a range of effect sizes.
    exog = np.random.normal(size=(n, 2))
    L = np.array([[1, -1]])
    R = np.array([0.])
    f = np.r_[1, -1]
    for i in range(10):
        endog = exog[:, 0] + (0.5 + i/10.)*exog[:, 1] +\
            np.random.normal(size=n)
        family = Gaussian()
        va = Independence()
        mod0 = GEE(endog, exog, group, family=family,
                   cov_struct=va)
        rslt0 = mod0.fit()
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        rslt1 = mod1.fit()
        se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
        wald_z = np.dot(f, rslt0.params) / se
        wald_p = 2*norm.cdf(-np.abs(wald_z))
        score_p = mod1.score_test_results["p-value"]
        # Score and Wald p-values should be close but not identical.
        assert_array_less(np.abs(wald_p - score_p), 0.02)
def test_predict_exposure(self):
    # predict() must honor offset/exposure defaults and overrides.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.random.uniform(1, 2, size=n)
    Y = np.random.poisson(0.1*(X1 + X2) + offset +
                          np.log(exposure), size=n)
    data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2,
                         "groups": groups, "offset": offset,
                         "exposure": exposure})

    fml = "Y ~ X1 + X2"
    model = GEE.from_formula(fml, groups, data,
                             family=Poisson(),
                             offset="offset", exposure="exposure")
    result = model.fit()
    assert_equal(result.converged, True)

    # All of these should resolve to the fitted offset/exposure.
    pred1 = result.predict()
    pred2 = result.predict(offset=data["offset"])
    pred3 = result.predict(exposure=data["exposure"])
    pred4 = result.predict(offset=data["offset"],
                           exposure=data["exposure"])
    pred5 = result.predict(exog=data[-10:],
                           offset=data["offset"][-10:],
                           exposure=data["exposure"][-10:])
    # without patsy
    pred6 = result.predict(exog=result.model.exog[-10:],
                           offset=data["offset"][-10:],
                           exposure=data["exposure"][-10:],
                           transform=False)
    assert_allclose(pred1, pred2)
    assert_allclose(pred1, pred3)
    assert_allclose(pred1, pred4)
    assert_allclose(pred1[-10:], pred5)
    assert_allclose(pred1[-10:], pred6)
def test_sensitivity(self):
    # params_sensitivity should sweep the dependence parameter and
    # return one fit per value.
    va = Exchangeable()
    family = Gaussian()

    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(50), np.r_[1, 1])

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

    mod = GEE.from_formula("Y ~ X1 + X2", groups, D,
                           family=family, cov_struct=va)
    rslt = mod.fit()
    ps = rslt.params_sensitivity(0, 0.5, 2)
    assert_almost_equal(len(ps), 2)
    assert_almost_equal([x.cov_struct.dep_params for x in ps],
                        [0.0, 0.5])

    # Regression test
    assert_almost_equal([x.params[0] for x in ps],
                        np.r_[-0.1256575, -0.126747036])
def test_compare_OLS(self):
    """
    Gaussian GEE with independence correlation should agree
    exactly with OLS for parameter estimates and standard errors
    derived from the naive covariance estimate.
    """
    vs = Independence()
    family = Gaussian()

    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(range(20), np.ones(5))

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                          groups=groups, family=family,
                          covstruct=vs)
    mdf = md.fit()

    ols = sm.ols("Y ~ X1 + X2 + X3", data=D).fit()

    assert_almost_equal(ols.params.values, mdf.params, decimal=10)

    # Naive (model-based) t-values should match OLS exactly.
    naive_tvalues = mdf.params / np.sqrt(np.diag(
        mdf.naive_covariance))
    assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)
def test_compare_OLS(self):
    # Gaussian GEE with independence correlation should agree
    # exactly with OLS for parameter estimates and standard errors
    # derived from the naive covariance estimate.
    vs = Independence()
    family = Gaussian()

    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(lrange(20), np.ones(5))

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                          family=family, cov_struct=vs)
    mdf = md.fit()

    ols = smf.ols("Y ~ X1 + X2 + X3", data=D).fit()

    # don't use wrapper, asserts_xxx don't work
    ols = ols._results

    assert_almost_equal(ols.params, mdf.params, decimal=10)

    se = mdf.standard_errors(cov_type="naive")
    assert_almost_equal(ols.bse, se, decimal=10)

    # Naive (model-based) t-values should match OLS exactly.
    naive_tvalues = mdf.params / \
        np.sqrt(np.diag(mdf.cov_naive))
    assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)
def t_est_missing(self):
    # NOTE: intentionally named t_est_* so the runner skips it.
    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(lrange(20), np.ones(5))

    # Inject NaNs into both the response and one regressor.
    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    D = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "X3": X3,
        "groups": groups
    })

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                          groups=D["groups"], missing='drop')
    mdf = md.fit()

    assert (len(md.endog) == 95)
    assert (md.exog.shape) == (95, 4)
def setup_class(cls):
    # Shared fixture: Poisson GEE built from a formula.
    endog, exog, group_n = load_data("gee_poisson_1.csv")
    family = Poisson()
    vi = Independence()

    # Assemble a DataFrame with named columns for the formula API.
    D = np.concatenate((endog[:, None], group_n[:, None],
                        exog[:, 1:]), axis=1)
    D = pd.DataFrame(D)
    D.columns = [
        "Y",
        "Id",
    ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]

    cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                               "Id", D, family=family,
                               cov_struct=vi)

    cls.start_params = np.array([
        -0.03644504, -0.05432094, 0.01566427, 0.57628591,
        -0.0046566, -0.47709315
    ])
def test_poisson_epil(self):
    # Under independence, GEE-Poisson coefficients should agree with
    # GLM-Poisson on the epilepsy data.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(cur_dir, "results", "epil.csv")
    data = pd.read_csv(fname)

    fam = Poisson()
    ind = Independence()
    mod1 = GEE.from_formula("y ~ age + trt + base",
                            data["subject"], data,
                            cov_struct=ind, family=fam)
    rslt1 = mod1.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    mod2 = GLM.from_formula("y ~ age + trt + base", data,
                            family=families.Poisson())
    rslt2 = mod2.fit(scale="X2")

    # don't use wrapper, asserts_xxx don't work
    rslt1 = rslt1._results
    rslt2 = rslt2._results

    assert_almost_equal(rslt1.params, rslt2.params, decimal=6)
    assert_almost_equal(rslt1.scale, rslt2.scale, decimal=6)
def test_compare_OLS(self):
    """
    Gaussian GEE with independence correlation should agree
    exactly with OLS for parameter estimates and standard errors
    derived from the naive covariance estimate.
    """
    vs = Independence()
    family = Gaussian()

    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(range(20), np.ones(5))

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                          groups=groups, family=family,
                          covstruct=vs)
    mdf = md.fit()

    ols = sm.ols("Y ~ X1 + X2 + X3", data=D).fit()

    assert_almost_equal(ols.params.values, mdf.params, decimal=10)

    # Naive (model-based) t-values should match OLS exactly.
    naive_tvalues = mdf.params / \
        np.sqrt(np.diag(mdf.naive_covariance))
    assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)
def test_compare_poisson(self):
    # GEE-Poisson under independence should match ML Poisson.
    vs = Independence()
    family = Poisson()

    Y = np.ceil(-np.log(np.random.uniform(size=100)))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    rslt1 = mod1.fit()

    mod2 = sm.poisson("Y ~ X1 + X2 + X3", data=D)
    rslt2 = mod2.fit(disp=False)

    assert_almost_equal(rslt1.params.values, rslt2.params.values,
                        decimal=10)
def test_predict(self):
    # predict() must honor the offset default and overrides, and
    # accept out-of-sample exog.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    Y = np.random.normal(0.1*(X1 + X2) + offset, size=n)
    data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2,
                         "groups": groups, "offset": offset})

    fml = "Y ~ X1 + X2"
    model = GEE.from_formula(fml, groups, data,
                             family=Gaussian(), offset="offset")
    result = model.fit()
    assert_equal(result.converged, True)

    pred1 = result.predict()
    pred2 = result.predict(offset=data.offset)
    pred3 = result.predict(exog=data[["X1", "X2"]],
                           offset=data.offset)
    pred4 = result.predict(exog=data[["X1", "X2"]],
                           offset=0*data.offset)
    pred5 = result.predict(offset=0*data.offset)

    assert_allclose(pred1, pred2)
    assert_allclose(pred1, pred3)
    # Zero offset shifts predictions down by exactly the offset.
    assert_allclose(pred1, pred4 + data.offset)
    assert_allclose(pred1, pred5 + data.offset)

    # Out-of-sample prediction must equal the linear predictor.
    x1_new = np.random.normal(size=10)
    x2_new = np.random.normal(size=10)
    new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
    pred6 = result.predict(exog=new_exog)
    params = result.params
    pred6_correct = params[0] + params[1]*x1_new + params[2]*x2_new
    assert_allclose(pred6, pred6_correct)
def test_poisson_epil(self):
    # Under independence, GEE-Poisson should agree with GLM-Poisson
    # on the epilepsy data (keyword-groups variant).
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(cur_dir, "results", "epil.csv")
    data = pd.read_csv(fname)

    fam = Poisson()
    ind = Independence()
    md1 = GEE.from_formula("y ~ age + trt + base", data,
                           groups=data["subject"],
                           cov_struct=ind, family=fam)
    mdf1 = md1.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    md2 = GLM.from_formula("y ~ age + trt + base", data,
                           family=families.Poisson())
    mdf2 = md2.fit(scale="X2")

    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
def test_sensitivity(self):
    # params_sensitivity should sweep the dependence parameter and
    # return one fit per value (seeded variant).
    va = Exchangeable()
    family = Gaussian()

    np.random.seed(34234)
    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(50), np.r_[1, 1])

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

    mod = GEE.from_formula("Y ~ X1 + X2", groups, D,
                           family=family, cov_struct=va)
    rslt = mod.fit()
    ps = rslt.params_sensitivity(0, 0.5, 2)
    assert_almost_equal(len(ps), 2)
    assert_almost_equal([x.cov_struct.dep_params for x in ps],
                        [0.0, 0.5])

    # Regression test
    assert_almost_equal([x.params[0] for x in ps],
                        [0.1696214707458818, 0.17836097387799127])
def test_formulas(self):
    """
    Check formulas, especially passing groups and time as either
    variable names or arrays.
    """
    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
    Time = np.random.uniform(size=n)
    groups = np.kron(lrange(20), np.ones(5))

    data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time,
                         "groups": groups})

    va = Autoregressive()
    family = Gaussian()

    # Baseline: array interface.
    mod1 = GEE(Y, mat, groups, time=Time, family=family,
               cov_struct=va)
    rslt1 = mod1.fit()

    # Every combination of name/array for groups and time must agree
    # with the baseline fit.
    mod2 = GEE.from_formula("Y ~ X1", groups, data, time=Time,
                            family=family, cov_struct=va)
    rslt2 = mod2.fit()

    mod3 = GEE.from_formula("Y ~ X1", groups, data, time="Time",
                            family=family, cov_struct=va)
    rslt3 = mod3.fit()

    mod4 = GEE.from_formula("Y ~ X1", "groups", data, time=Time,
                            family=family, cov_struct=va)
    rslt4 = mod4.fit()

    mod5 = GEE.from_formula("Y ~ X1", "groups", data, time="Time",
                            family=family, cov_struct=va)
    rslt5 = mod5.fit()

    assert_almost_equal(rslt1.params, rslt2.params, decimal=8)
    assert_almost_equal(rslt1.params, rslt3.params, decimal=8)
    assert_almost_equal(rslt1.params, rslt4.params, decimal=8)
    assert_almost_equal(rslt1.params, rslt5.params, decimal=8)

    check_wrapper(rslt2)
def test_autoregressive(self):
    # Simulated AR(1) within-group errors; check the estimated
    # dependence parameter and coefficients for several group sizes.
    dep_params_true = [0, 0.589208623896, 0.559823804948]

    params_true = [[1.08043787, 1.12709319, 0.90133927],
                   [0.9613677, 1.05826987, 0.90832055],
                   [1.05370439, 0.96084864, 0.93923374]]

    np.random.seed(342837482)

    num_group = 100
    ar_param = 0.5
    k = 3

    ga = Gaussian()

    for gsize in 1, 2, 3:

        # AR(1) correlation matrix and its Cholesky factor.
        ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
        ix = np.abs(ix)
        cmat = ar_param**ix
        cmat_r = np.linalg.cholesky(cmat)

        endog = []
        exog = []
        groups = []
        for i in range(num_group):
            x = np.random.normal(size=(gsize, k))
            exog.append(x)
            expval = x.sum(1)
            errors = np.dot(cmat_r, np.random.normal(size=gsize))
            endog.append(expval + errors)
            groups.append(i * np.ones(gsize))

        endog = np.concatenate(endog)
        groups = np.concatenate(groups)
        exog = np.concatenate(exog, axis=0)

        ar = Autoregressive()
        md = GEE(endog, exog, groups, family=ga, cov_struct=ar)
        mdf = md.fit()
        assert_almost_equal(ar.dep_params,
                            dep_params_true[gsize - 1])
        assert_almost_equal(mdf.params, params_true[gsize - 1])
def test_autoregressive(self):
    # Simulated AR(1) within-group errors; check the estimated
    # dependence parameter and coefficients for several group sizes.
    dep_params_true = [0, 0.589208623896, 0.559823804948]

    params_true = [[1.08043787, 1.12709319, 0.90133927],
                   [0.9613677, 1.05826987, 0.90832055],
                   [1.05370439, 0.96084864, 0.93923374]]

    np.random.seed(342837482)

    num_group = 100
    ar_param = 0.5
    k = 3

    ga = Gaussian()

    for gsize in 1, 2, 3:

        # AR(1) correlation matrix and its Cholesky factor.
        ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
        ix = np.abs(ix)
        cmat = ar_param ** ix
        cmat_r = np.linalg.cholesky(cmat)

        endog = []
        exog = []
        groups = []
        for i in range(num_group):
            x = np.random.normal(size=(gsize, k))
            exog.append(x)
            expval = x.sum(1)
            errors = np.dot(cmat_r, np.random.normal(size=gsize))
            endog.append(expval + errors)
            groups.append(i*np.ones(gsize))

        endog = np.concatenate(endog)
        groups = np.concatenate(groups)
        exog = np.concatenate(exog, axis=0)

        ar = Autoregressive()
        md = GEE(endog, exog, groups, family=ga, cov_struct=ar)
        mdf = md.fit()
        assert_almost_equal(ar.dep_params,
                            dep_params_true[gsize-1])
        assert_almost_equal(mdf.params, params_true[gsize-1])
def test_ordinal(self):
    # Ordinal GEE using the model's own setup_ordinal() recoding.
    family = Binomial()

    endog, exog, groups = load_data("gee_ordinal_1.csv",
                                    icept=False)

    v = GlobalOddsRatio("ordinal")

    md = GEE(endog, exog, groups, None, family, v)
    md.setup_ordinal()
    mdf = md.fit()

    cf = np.r_[1.09238131, 0.02148193, -0.39879146, -0.01855666,
               0.02983409, 1.18123172, 0.01845318, -1.10233886]
    se = np.r_[0.10878752, 0.10326078, 0.11171241, 0.05488705,
               0.05995019, 0.0916574, 0.05951445, 0.08539281]

    assert_almost_equal(mdf.params, cf, decimal=5)
    assert_almost_equal(mdf.bse, se, decimal=5)
def test_linear_constrained(self):
    # A linear constraint forcing the last coefficient to zero must
    # be satisfied exactly under either covariance structure.
    family = Gaussian()

    exog = np.random.normal(size=(300, 4))
    exog[:, 0] = 1
    endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) + np.random.normal(size=300)
    group = np.kron(np.arange(100), np.r_[1, 1, 1])

    vi = Independence()
    ve = Exchangeable()

    # Constraint: beta[3] == 0.
    L = np.r_[[[0, 0, 0, 1]]]
    R = np.r_[0,]

    for j, v in enumerate((vi, ve)):
        md = GEE(endog, exog, group, None, family, v,
                 constraint=(L, R))
        mdf = md.fit()
        assert_almost_equal(mdf.params[3], 0, decimal=10)
def test_linear_constrained(self):
    # A linear constraint forcing the last coefficient to zero must
    # be satisfied exactly under either covariance structure.
    family = Gaussian()

    exog = np.random.normal(size=(300, 4))
    exog[:, 0] = 1
    endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\
        np.random.normal(size=300)
    group = np.kron(np.arange(100), np.r_[1, 1, 1])

    vi = Independence()
    ve = Exchangeable()

    # Constraint: beta[3] == 0.
    L = np.r_[[[0, 0, 0, 1]]]
    R = np.r_[0, ]

    for j, v in enumerate((vi, ve)):
        md = GEE(endog, exog, group, None, family, v,
                 constraint=(L, R))
        mdf = md.fit()
        assert_almost_equal(mdf.params[3], 0, decimal=10)
def setup_class(cls):
    # Shared fixture: Poisson GEE built from arrays.
    endog, exog, group_n = load_data("gee_poisson_1.csv")
    family = Poisson()
    vi = Independence()

    cls.mod = GEE(endog, exog, group_n, None, family, vi)

    cls.start_params = np.array([-0.03644504, -0.05432094,
                                 0.01566427, 0.57628591,
                                 -0.0046566, -0.47709315])
def test_default_time(self):
    """
    Check that the time defaults work correctly.
    """
    endog, exog, group = load_data("gee_logistic_1.csv")

    # Explicit within-group time index 0, 1, 2, ... for the AR model.
    T = np.zeros(len(endog))
    idx = set(group)
    for ii in idx:
        jj = np.flatnonzero(group == ii)
        T[jj] = lrange(len(jj))

    family = Binomial()
    va = Autoregressive()

    # Default time (omitted) must reproduce the explicit index.
    md1 = GEE(endog, exog, group, family=family, cov_struct=va)
    mdf1 = md1.fit()

    md2 = GEE(endog, exog, group, time=T, family=family,
              cov_struct=va)
    mdf2 = md2.fit()

    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.standard_errors(),
                        mdf2.standard_errors(), decimal=6)
def test_nested_linear(self):
    # Nested covariance structure vs. independence on linear data.
    family = Gaussian()

    endog, exog, group = load_data("gee_nested_linear_1.csv")

    # Build the nesting indicator: alternating sub-groups of 5.
    group_n = []
    for i in range(endog.shape[0] // 10):
        group_n.extend([
            0,
        ] * 5)
        group_n.extend([
            1,
        ] * 5)
    group_n = np.array(group_n)[:, None]

    dp = Independence()
    md = GEE(endog, exog, group, None, family, dp)
    mdf1 = md.fit()

    # From statsmodels.GEE (not an independent test)
    cf = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106]
    se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989]
    assert_almost_equal(mdf1.params, cf, decimal=6)
    assert_almost_equal(mdf1.standard_errors(), se, decimal=6)

    ne = Nested()
    md = GEE(endog, exog, group, None, family, ne,
             dep_data=group_n)
    mdf2 = md.fit(start_params=mdf1.params)

    # From statsmodels.GEE (not an independent test)
    cf = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969]
    se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991]
    assert_almost_equal(mdf2.params, cf, decimal=6)
    assert_almost_equal(mdf2.standard_errors(), se, decimal=6)
def test_missing(self):
    """
    Test missing data handling for calling from the api.  Missing
    data handling does not currently work for formulas.
    """
    endog = np.random.normal(size=100)
    exog = np.random.normal(size=(100, 3))
    exog[:, 0] = 1
    groups = np.kron(lrange(20), np.ones(5))

    # Inject NaNs into both endog and one regressor.
    endog[0] = np.nan
    endog[5:7] = np.nan
    exog[10:12, 1] = np.nan

    mod1 = GEE(endog, exog, groups, missing='drop')
    rslt1 = mod1.fit()

    assert_almost_equal(len(mod1.endog), 95)
    assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 3])

    # Fit again on the manually-cleaned data; results must match.
    ii = np.isfinite(endog) & np.isfinite(exog).all(1)
    mod2 = GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
    rslt2 = mod2.fit()

    assert_almost_equal(rslt1.params, rslt2.params)
    assert_almost_equal(rslt1.bse, rslt2.bse)
def test_weighted(self):
    # Simple check where the answer can be computed by hand.
    exog = np.ones(20)
    weights = np.ones(20)
    weights[0:10] = 2
    endog = np.zeros(20)
    endog[0:10] += 1
    groups = np.kron(np.arange(10), np.r_[1, 1])
    model = GEE(endog, exog, groups, weights=weights)
    result = model.fit()
    # Weighted mean of 10 ones (weight 2) and 10 zeros (weight 1).
    assert_allclose(result.params, np.r_[2/3.])

    # Comparison against stata using groups with different sizes.
    weights = np.ones(20)
    weights[10:] = 2
    endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                  7, 8, 7, 8]
    exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                  3, 3, 3, 3]
    groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6, 8,
                   8, 9, 9, 10, 10]
    exog = np.column_stack((np.ones(20), exog1))

    # Comparison using independence model
    model = GEE(endog, exog, groups, weights=weights,
                cov_struct=sm.cov_struct.Independence())
    g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
    fac = 20 / float(20 - g)
    result = model.fit(ddof_scale=0, scaling_factor=fac)

    assert_allclose(result.params, np.r_[1.247573, 1.436893],
                    atol=1e-6)
    assert_allclose(result.scale, 1.808576)

    # Stata multiples robust SE by sqrt(N / (N - g)), where N is
    # the total sample size and g is the average group size.
    assert_allclose(result.bse, np.r_[0.895366, 0.3425498],
                    atol=1e-5)

    # Comparison using exchangeable model
    # Smoke test for now
    model = GEE(endog, exog, groups, weights=weights,
                cov_struct=sm.cov_struct.Exchangeable())
    result = model.fit(ddof_scale=0)
def test_margins(self):
    """Smoke test: marginal effects summary runs for a logistic GEE."""
    nobs = 300

    # Design: intercept, an indicator derived from column 2, and noise.
    design = np.random.normal(size=(nobs, 4))
    design[:, 0] = 1
    design[:, 1] = 1 * (design[:, 2] < 0)

    clusters = np.kron(np.arange(nobs / 4), np.ones(4))
    times = np.zeros((nobs, 1))

    # Simulate binary outcomes from a logistic model.
    coefs = np.r_[0, 1, -1, 0.5]
    linpred = np.dot(design, coefs)
    pr = 1 / (1 + np.exp(-linpred))
    outcome = 1 * (np.random.uniform(size=nobs) < pr)

    model = GEE(outcome, design, clusters, times,
                Binomial(), Exchangeable())
    fit = model.fit()

    GEEMargins(fit, ()).summary()
def test_margins(self):
    """Smoke test: GEEMargins.summary runs on a fitted binomial GEE."""
    n = 300

    # Four columns: intercept, sign indicator of column 2, plus noise.
    x = np.random.normal(size=(n, 4))
    x[:, 0] = 1
    x[:, 1] = 1 * (x[:, 2] < 0)

    grp = np.kron(np.arange(n / 4), np.ones(4))
    tim = np.zeros((n, 1))

    # Draw Bernoulli outcomes with logistic success probabilities.
    true_beta = np.r_[0, 1, -1, 0.5]
    eta = np.dot(x, true_beta)
    p = 1 / (1 + np.exp(-eta))
    y = 1 * (np.random.uniform(size=n) < p)

    fam = Binomial()
    cov = Exchangeable()
    result = GEE(y, x, grp, tim, fam, cov).fit()

    margins = GEEMargins(result, ())
    margins.summary()
def setup_class(cls):
    """Build a formula-based Poisson GEE on the shared example data."""
    endog, exog, group_n = load_data("gee_poisson_1.csv")
    family = Poisson()
    vi = Independence()

    # Assemble a frame with outcome, group id, and the non-intercept
    # covariates so the model can be specified via a formula.
    # Test with formulas
    cols = np.concatenate((endog[:, None], group_n[:, None],
                           exog[:, 1:]), axis=1)
    frame = pd.DataFrame(cols)
    frame.columns = ["Y", "Id"] + ["X%d" % (k + 1)
                                   for k in range(exog.shape[1] - 1)]

    cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                               family=family, cov_struct=vi)

    cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427,
                                 0.57628591, -0.0046566, -0.47709315])
def setup_class(cls):
    """Fit matched Gaussian GEE and GLM models for later comparison."""
    working = Independence()
    fam = families.Gaussian()

    np.random.seed(987126)
    y = np.random.normal(size=100)
    x1 = np.random.normal(size=100)
    x2 = np.random.normal(size=100)
    x3 = np.random.normal(size=100)
    grp = np.kron(np.arange(20), np.ones(5))

    frame = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "X3": x3})

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", grp, frame,
                                 family=fam, cov_struct=working)
    cls.result1 = gee_model.fit()

    cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame).fit()
def setup_class(cls):
    """Fit matched Poisson GEE and GLM models on positive outcomes."""
    working = Independence()
    fam = families.Poisson()

    np.random.seed(987126)
    # exp of a normal gives strictly positive responses.
    y = np.exp(1 + np.random.normal(size=100))
    x1 = np.random.normal(size=100)
    x2 = np.random.normal(size=100)
    x3 = np.random.normal(size=100)
    grp = np.random.randint(0, 4, size=100)

    frame = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "X3": x3})

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", grp, frame,
                                 family=fam, cov_struct=working)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=fam)
    cls.result2 = glm_model.fit(disp=False)
def test_compare_poisson(self):
    """
    GEE with an independence working correlation should reproduce the
    Poisson MLE coefficients to high precision.
    """
    vs = Independence()
    family = Poisson()

    Y = np.ceil(-np.log(np.random.uniform(size=100)))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    # NOTE(review): the original passed the DataFrame in the `groups`
    # position ("Y ~ ...", D, None, groups=groups) and used the
    # misspelled keyword `covstruct`.  Use the (formula, groups, data)
    # order and `cov_struct`, matching the sibling tests in this file.
    md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                          family=family, cov_struct=vs).fit()

    # disp=False suppresses optimizer output, as in the other MLE fits.
    sml = sm.poisson("Y ~ X1 + X2 + X3", data=D).fit(disp=False)

    assert_almost_equal(sml.params.values, md.params, decimal=10)
def test_compare_logit(self):
    """
    GEE with an independence working correlation should agree exactly
    with ordinary logistic regression MLE on the same data.
    """
    vs = Independence()
    family = Binomial()

    Y = 1 * (np.random.normal(size=100) < 0)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    # NOTE(review): the original call passed D in the `groups` position
    # and None as the data; use the (formula, groups, data) order that
    # every working from_formula test in this file uses.
    md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                          family=family, cov_struct=vs).fit()

    sml = sm.logit("Y ~ X1 + X2 + X3", data=D).fit(disp=False)

    assert_almost_equal(sml.params.values, md.params, decimal=10)
def test_predict(self):
    """Exercise the exog/offset combinations accepted by predict."""
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2,
                          "groups": groups, "offset": offset})

    model = GEE.from_formula("Y ~ X1 + X2", groups, frame,
                             family=Gaussian(), offset="offset")
    fit = model.fit()
    assert_equal(fit.converged, True)

    # These should all reproduce the in-sample fitted values.
    base = fit.predict()
    assert_allclose(base, fit.predict(offset=frame.offset))
    assert_allclose(base, fit.predict(exog=frame[["X1", "X2"]],
                                      offset=frame.offset))

    # With a zero offset the prediction drops by exactly the offset.
    zero_off = 0 * frame.offset
    assert_allclose(base,
                    fit.predict(exog=frame[["X1", "X2"]],
                                offset=zero_off) + frame.offset)
    assert_allclose(base, fit.predict(offset=zero_off) + frame.offset)

    # Out-of-sample prediction agrees with the linear predictor.
    x1_new = np.random.normal(size=10)
    x2_new = np.random.normal(size=10)
    fresh = pd.DataFrame({"X1": x1_new, "X2": x2_new})
    pred_new = fit.predict(exog=fresh)
    b = fit.params
    assert_allclose(pred_new, b[0] + b[1] * x1_new + b[2] * x2_new)
def test_offset_formula(self):
    """
    Test various ways of passing offset and exposure to `from_formula`.
    """
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    # exposure enters as log(exposure), which equals `offset` here.
    exposure = np.exp(offset)
    Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)
    data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                         "offset": offset, "exposure": exposure})

    fml = "Y ~ X1 + X2"

    # Offset passed as a column name in `data`.
    model1 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset="offset")
    result1 = model1.fit()
    assert_equal(result1.converged, True)

    # Offset passed as an explicit array; must match the column-name fit.
    model2 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset=offset)
    result2 = model2.fit(start_params=result1.params)
    assert_allclose(result1.params, result2.params)
    assert_equal(result2.converged, True)

    # Exposure array: log(exposure) == offset, so same fit as above.
    model3 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure=exposure)
    result3 = model3.fit(start_params=result1.params)
    assert_allclose(result1.params, result3.params)
    assert_equal(result3.converged, True)

    # Exposure passed as a column name.
    model4 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure="exposure")
    result4 = model4.fit(start_params=result1.params)
    assert_allclose(result1.params, result4.params)
    assert_equal(result4.converged, True)

    # Offset and exposure together: effective offset is
    # offset + log(exposure) = 2*offset ...
    model5 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure="exposure", offset="offset")
    result5 = model5.fit()
    assert_equal(result5.converged, True)

    # ... which must match passing 2*offset directly.
    model6 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset=2*offset)
    result6 = model6.fit(start_params=result5.params)
    assert_allclose(result5.params, result6.params)
    assert_equal(result6.converged, True)
def setup_class(cls):
    # adjusted for Gamma, not in test_gee.py
    working = Independence()
    fam = families.Gamma(link=links.log)

    np.random.seed(987126)
    # Log-normal outcomes are strictly positive, as Gamma requires.
    y = np.exp(0.1 + np.random.normal(size=100))
    x1 = np.random.normal(size=100)
    x2 = np.random.normal(size=100)
    x3 = np.random.normal(size=100)
    grp = np.random.randint(0, 4, size=100)

    frame = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "X3": x3})

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", grp, frame,
                                 family=fam, cov_struct=working)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=fam)
    cls.result2 = glm_model.fit(disp=False)
def test_compare_poisson(self):
    """Independence-GEE and Poisson MLE must agree on these data."""
    working = Independence()
    fam = Poisson()

    # Counts via ceil of exponential draws.
    y = np.ceil(-np.log(np.random.uniform(size=100)))
    x1 = np.random.normal(size=100)
    x2 = np.random.normal(size=100)
    x3 = np.random.normal(size=100)
    grp = np.random.randint(0, 4, size=100)

    frame = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "X3": x3})

    gee_fit = GEE.from_formula("Y ~ X1 + X2 + X3", grp, frame,
                               family=fam, cov_struct=working).fit()
    mle_fit = sm.poisson("Y ~ X1 + X2 + X3", data=frame).fit(disp=False)

    assert_almost_equal(gee_fit.params, mle_fit.params, decimal=10)
def t_est_missing(self):
    # Disabled (name mangled from test_missing): missing-data handling
    # through from_formula is not currently supported.
    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(lrange(20), np.ones(5))

    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                      "groups": groups})

    # NOTE(review): the original passed D in the `groups` position and
    # None as the data; use the (formula, groups, data) order used by
    # the working from_formula tests in this file.
    md = GEE.from_formula("Y ~ X1 + X2 + X3", D["groups"], D,
                          missing='drop')
    mdf = md.fit()

    # 5 rows contain a NaN, so 95 complete cases remain.  The original
    # wrote `assert(md.exog.shape) == (95, 4)` in call style; plain
    # assert statements are clearer and equivalent.
    assert len(md.endog) == 95
    assert md.exog.shape == (95, 4)
def test_compare_logit(self):
    """
    GEE with an independence working correlation should reproduce the
    logistic regression MLE to high precision.
    """
    vs = Independence()
    family = Binomial()

    Y = 1*(np.random.normal(size=100) < 0)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    rslt1 = mod1.fit()

    mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
    # disp=False suppresses optimizer convergence output, consistent
    # with the other MLE fits in this file.
    rslt2 = mod2.fit(disp=False)

    assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
def test_offset_formula(self):
    # Test various ways of passing offset and exposure to `from_formula`.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    # exposure enters as log(exposure), which equals `offset` here.
    exposure = np.exp(offset)
    Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)
    data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                         "offset": offset, "exposure": exposure})

    fml = "Y ~ X1 + X2"

    # Offset given as a column name in `data`.
    model1 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset="offset")
    result1 = model1.fit()
    assert_equal(result1.converged, True)

    # Offset given as an explicit array; must match the column-name fit.
    model2 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset=offset)
    result2 = model2.fit(start_params=result1.params)
    assert_allclose(result1.params, result2.params)
    assert_equal(result2.converged, True)

    # Exposure array: log(exposure) == offset, so same fit as above.
    model3 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure=exposure)
    result3 = model3.fit(start_params=result1.params)
    assert_allclose(result1.params, result3.params)
    assert_equal(result3.converged, True)

    # Exposure given as a column name.
    model4 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure="exposure")
    result4 = model4.fit(start_params=result1.params)
    assert_allclose(result1.params, result4.params)
    assert_equal(result4.converged, True)

    # Offset and exposure together: effective offset is
    # offset + log(exposure) = 2*offset ...
    model5 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              exposure="exposure", offset="offset")
    result5 = model5.fit()
    assert_equal(result5.converged, True)

    # ... which must match passing 2*offset directly.
    model6 = GEE.from_formula(fml, groups, data, family=Poisson(),
                              offset=2*offset)
    result6 = model6.fit(start_params=result5.params)
    assert_allclose(result5.params, result6.params)
    assert_equal(result6.converged, True)