def test_scoretest(self):
    """Check GEE score tests against stored values and against Wald tests.

    Two constrained Gaussian/independence fits (constraint right-hand
    side 0 and 1) are compared with stored regression values.  Ten
    further simulated data sets then check that the score-test p-value
    of the constrained fit is within 0.02 of the Wald p-value computed
    from the unconstrained fit.
    """
    # Regression tests
    np.random.seed(6432)
    n = 200  # Must be divisible by 4
    exog = np.random.normal(size=(n, 4))
    endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
    endog += 3*np.random.normal(size=n)
    group = np.kron(np.arange(n/4), np.ones(4))

    regression_cases = (
        # Test under the null.
        (0., 1.08126334, 0.2984151086),
        # Test under the alternative.
        (1.0, 3.491110965, 0.0616991659),
    )
    for rhs_value, expected_stat, expected_pvalue in regression_cases:
        lhs = np.array([[1., -1, 0, 0]])
        rhs = np.array([rhs_value])
        model = GEE(endog, exog, group, family=Gaussian(),
                    cov_struct=Independence(), constraint=(lhs, rhs))
        # The score test results are read from the model after fitting.
        model.fit()
        score = model.score_test_results
        assert_almost_equal(score["statistic"], expected_stat)
        assert_almost_equal(score["p-value"], expected_pvalue)

    # Compare to Wald tests
    exog = np.random.normal(size=(n, 2))
    lhs = np.array([[1, -1]])
    rhs = np.array([0.])
    contrast = np.r_[1, -1]
    for i in range(10):
        slope = 0.5 + i/10.
        endog = exog[:, 0] + slope*exog[:, 1] + np.random.normal(size=n)

        # Unconstrained fit, used for the Wald statistic.
        free_fit = GEE(endog, exog, group, family=Gaussian(),
                       cov_struct=Independence()).fit()

        # Constrained fit, used for the score test.
        constrained = GEE(endog, exog, group, family=Gaussian(),
                          cov_struct=Independence(), constraint=(lhs, rhs))
        constrained.fit()

        contrast_var = np.dot(contrast,
                              np.dot(free_fit.cov_params(), contrast))
        wald_z = np.dot(contrast, free_fit.params) / np.sqrt(contrast_var)
        wald_p = 2*norm.cdf(-np.abs(wald_z))
        score_p = constrained.score_test_results["p-value"]
        assert_array_less(np.abs(wald_p - score_p), 0.02)
def setup_class(cls):
    """Build a Poisson/independence GEE from a formula and store start values.

    Sets ``cls.mod`` (the formula-built model) and ``cls.start_params``.
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    # Test with formulas
    # Columns: response, group id, then every exog column except the first.
    columns = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                       axis=1))
    frame.columns = columns

    cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                               family=Poisson(), cov_struct=Independence())

    cls.start_params = np.array([
        -0.03644504, -0.05432094, 0.01566427,
        0.57628591, -0.0046566, -0.47709315,
    ])
def test_nested_linear(self):
    """Regression-test Gaussian GEE with Independence and Nested structures.

    The expected values were produced by statsmodels GEE itself, so this
    is not an independent check.
    """
    family = Gaussian()
    endog, exog, group = load_data("gee_nested_linear_1.csv")

    # Nesting labels: blocks of five 0s followed by five 1s, repeated once
    # for every full ten observations.
    labels = ([0] * 5 + [1] * 5) * (endog.shape[0] // 10)
    group_n = np.array(labels)[:, None]

    indep_fit = GEE(endog, exog, group, None, family, Independence()).fit()

    # From statsmodels.GEE (not an independent test)
    expected_params = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106]
    expected_se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989]
    assert_almost_equal(indep_fit.params, expected_params, decimal=6)
    assert_almost_equal(indep_fit.standard_errors(), expected_se, decimal=6)

    nested_model = GEE(endog, exog, group, None, family, Nested(),
                       dep_data=group_n)
    # Start from the independence estimates.
    nested_fit = nested_model.fit(start_params=indep_fit.params)

    # From statsmodels.GEE (not an independent test)
    expected_params = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969]
    expected_se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991]
    assert_almost_equal(nested_fit.params, expected_params, decimal=6)
    assert_almost_equal(nested_fit.standard_errors(), expected_se, decimal=6)
def test_compare_poisson(self):
    """Compare Poisson/independence GEE parameters with ``sm.poisson``.

    Both models are fit by formula on the same simulated data and their
    parameter estimates must agree to 10 decimals.
    """
    nobs = 100
    formula = "Y ~ X1 + X2 + X3"

    # Draw order matters for reproducibility: response, covariates, groups.
    response = np.ceil(-np.log(np.random.uniform(size=nobs)))
    covariates = [np.random.normal(size=nobs) for _ in range(3)]
    groups = np.random.randint(0, 4, size=nobs)

    data = pd.DataFrame({
        "Y": response,
        "X1": covariates[0],
        "X2": covariates[1],
        "X3": covariates[2],
    })

    gee_fit = GEE.from_formula(formula, groups, data, family=Poisson(),
                               cov_struct=Independence()).fit()
    poisson_fit = sm.poisson(formula, data=data).fit(disp=False)

    assert_almost_equal(gee_fit.params.values, poisson_fit.params.values,
                        decimal=10)
def test_nominal(self):
    """Regression-test NominalGEE with two dependence structures.

    Fits with Independence first, then with GlobalOddsRatio("nominal")
    started from the independence estimates, and finally checks the
    result wrapper classes.
    """
    family = Multinomial(3)
    endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)

    # Test with independence correlation
    indep_fit = NominalGEE(endog, exog, groups, None, family,
                           Independence()).fit()

    # Regression test
    expected_params = np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728]
    expected_se = np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553]
    assert_almost_equal(indep_fit.params, expected_params, decimal=5)
    assert_almost_equal(indep_fit.standard_errors(), expected_se, decimal=5)

    # Test with global odds ratio dependence
    odds_model = NominalGEE(endog, exog, groups, None, family,
                            GlobalOddsRatio("nominal"))
    odds_fit = odds_model.fit(start_params=indep_fit.params)

    # Regression test
    expected_params = np.r_[0.45448248, 0.41945568, -0.92008924, -0.50485758]
    expected_se = np.r_[0.09632274, 0.07433944, 0.13264646, 0.0911768]
    assert_almost_equal(odds_fit.params, expected_params, decimal=5)
    assert_almost_equal(odds_fit.standard_errors(), expected_se, decimal=5)

    # Make sure we get the correct results type
    assert_equal(type(indep_fit), NominalGEEResultsWrapper)
    assert_equal(type(indep_fit._results), NominalGEEResults)
def test_compare_OLS(self):
    """Compare Gaussian/independence GEE with OLS.

    Gaussian GEE with independence correlation should agree exactly with
    OLS for parameter estimates and for standard errors derived from the
    naive covariance estimate.
    """
    nobs = 100
    formula = "Y ~ X1 + X2 + X3"

    # Draw order matters for reproducibility: response first, then covariates.
    response = np.random.normal(size=nobs)
    covariates = [np.random.normal(size=nobs) for _ in range(3)]
    groups = np.kron(lrange(20), np.ones(5))

    data = pd.DataFrame({
        "Y": response,
        "X1": covariates[0],
        "X2": covariates[1],
        "X3": covariates[2],
    })

    gee_fit = GEE.from_formula(formula, groups, data, family=Gaussian(),
                               cov_struct=Independence()).fit()

    # don't use wrapper, asserts_xxx don't work
    ols_fit = smf.ols(formula, data=data).fit()._results

    assert_almost_equal(ols_fit.params, gee_fit.params, decimal=10)

    naive_se = gee_fit.standard_errors(cov_type="naive")
    assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

    naive_tvalues = gee_fit.params / np.sqrt(np.diag(gee_fit.cov_naive))
    assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
def test_nominal(self):
    """Regression-test NominalGEE with two dependence structures.

    Fits with Independence first, then with GlobalOddsRatio("nominal")
    started from the independence estimates, and finally checks the
    result wrapper classes.
    """
    tolerances = dict(rtol=1e-5, atol=1e-5)
    endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)

    # Test with independence correlation
    indep_fit = NominalGEE(endog, exog, groups,
                           cov_struct=Independence()).fit()

    # Regression test
    expected_params = np.r_[0.450009, 0.451959, -0.918825, -0.468266]
    expected_se = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258]
    assert_allclose(indep_fit.params, expected_params, **tolerances)
    assert_allclose(indep_fit.standard_errors(), expected_se, **tolerances)

    # Test with global odds ratio dependence
    odds_model = NominalGEE(endog, exog, groups,
                            cov_struct=GlobalOddsRatio("nominal"))
    odds_fit = odds_model.fit(start_params=indep_fit.params)

    # Regression test
    expected_params = np.r_[0.455365, 0.415334, -0.916589, -0.502116]
    expected_se = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064]
    assert_allclose(odds_fit.params, expected_params, **tolerances)
    assert_allclose(odds_fit.standard_errors(), expected_se, **tolerances)

    # Make sure we get the correct results type
    assert_equal(type(indep_fit), NominalGEEResultsWrapper)
    assert_equal(type(indep_fit._results), NominalGEEResults)
def test_poisson_epil(self):
    """Compare Poisson GEE with a Poisson GLM on the ``epil.csv`` data.

    Parameters and scale of the two fits must agree to 6 decimals.
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    formula = "y ~ age + trt + base"

    # The data file sits in the "results" directory next to this module.
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))

    gee_model = GEE.from_formula(formula, data["subject"], data,
                                 cov_struct=Independence(), family=Poisson())
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # don't use wrapper, asserts_xxx don't work
    gee_res = gee_fit._results
    glm_res = glm_fit._results

    assert_almost_equal(gee_res.params, glm_res.params, decimal=6)
    assert_almost_equal(gee_res.scale, glm_res.scale, decimal=6)
def gendat_overdispersed():
    """Configure and run an ``Overdispersed_simulator``.

    Returns
    -------
    tuple
        The simulator (after ``simulate()`` has been called) and a new
        ``Independence`` covariance structure.
    """
    simulator = Overdispersed_simulator()
    simulator.params = np.r_[2., 0.2, 0.2, -0.1, -0.2]
    simulator.ngroups = 200
    simulator.scale_inv = 2.
    simulator.dparams = []
    simulator.simulate()

    cov_struct = Independence()
    return simulator, cov_struct
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    '''
    Build score predictions via (linear) Poisson regression.

    Two Poisson GEE models with an independence structure are fit on
    ``hist_data``, one for ``team_1_score`` and one for ``team_2_score``,
    each regressed on the columns named in ``feature_list``.

    Parameters
    ----------
    hist_data : DataFrame
        Historical data; must contain ``team_1_score``, ``team_2_score``
        and every column in ``feature_list``.  Modified in place: the
        columns ``team_1_score_pred`` and ``team_2_score_pred`` are added.
    feature_list : list of str
        Column names joined with `` + `` to form the formula right-hand side.
    comp_data : DataFrame, optional
        Data to predict for.  Modified in place: the two prediction columns
        plus ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob`` are added.

    Returns
    -------
    DataFrame or tuple of DataFrame
        ``hist_data`` alone when ``comp_data`` is None, otherwise
        ``(hist_data, comp_data)``.
    '''
    rhs = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + rhs
    formula_2 = "team_2_score ~ " + rhs

    # Fit the Poisson models with GEE under an independence assumption.
    # NOTE(review): GEE solves estimating equations rather than maximizing a
    # likelihood; with an independence structure the coefficients are
    # expected to match a Poisson GLM fit -- confirm.
    # NOTE(review): the groups argument is the response column itself, which
    # clusters rows by score -- confirm this is intended.
    fam = Poisson()
    ind = Independence()

    model_1 = GEE.from_formula(formula_1, "team_1_score", hist_data,
                               cov_struct=ind, family=fam)
    model_2 = GEE.from_formula(formula_2, "team_2_score", hist_data,
                               cov_struct=ind, family=fam)

    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()

    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # prepare comp data
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    # Outcome probabilities from the Skellam distribution evaluated at the
    # two predicted scores.  The scipy functions accept arrays, so every row
    # is evaluated in one call.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)

    return hist_data, comp_data
def setup_class(cls):
    """Build a NominalGEE with an Independence structure and start values.

    Sets ``cls.mod`` and ``cls.start_params``.
    """
    endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)

    # Test with independence correlation
    cls.mod = NominalGEE(endog, exog, groups, cov_struct=Independence())

    start_values = [0.44944752, 0.45569985, -0.92007064, -0.46766728]
    cls.start_params = np.array(start_values)
def test_wrapper(self):
    """Fit NominalGEE on pandas inputs and run ``check_wrapper`` on the fit."""
    raw_endog, raw_exog, raw_groups = load_data("gee_nominal_1.csv",
                                                icept=False)

    # Wrap the loaded data in pandas containers with explicit names.
    endog = pd.Series(raw_endog, name='yendog')
    exog = pd.DataFrame(raw_exog)
    groups = pd.Series(raw_groups, name='the_group')

    model = NominalGEE(endog, exog, groups, cov_struct=Independence())
    check_wrapper(model.fit())
def setup_class(cls):
    """Build a Poisson/independence GEE from arrays and store start values.

    Sets ``cls.mod`` and ``cls.start_params``.
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    cls.mod = GEE(endog, exog, group_n, None, Poisson(), Independence())

    cls.start_params = np.array([
        -0.03644504, -0.05432094, 0.01566427,
        0.57628591, -0.0046566, -0.47709315,
    ])
def test_wrapper(self):
    """Fit a Poisson GEE on pandas inputs and run ``check_wrapper`` on the fit."""
    raw_endog, raw_exog, raw_groups = load_data("gee_poisson_1.csv",
                                                icept=False)

    # Wrap the loaded data in pandas containers.
    endog = pd.Series(raw_endog)
    exog = pd.DataFrame(raw_exog)
    group_n = pd.Series(raw_groups)

    model = GEE(endog, exog, group_n, None, Poisson(), Independence())
    check_wrapper(model.fit())
def test_linear_constrained(self):
    """Check that a linear constraint forces the last coefficient to zero.

    Runs the constrained Gaussian fit once with an Independence and once
    with an Exchangeable structure.
    """
    nobs = 300
    family = Gaussian()

    # Draw order matters for reproducibility: design matrix, then noise.
    exog = np.random.normal(size=(nobs, 4))
    exog[:, 0] = 1
    linear_part = np.dot(exog, np.r_[1, 1, 0, 0.2])
    endog = linear_part + np.random.normal(size=nobs)

    # 100 groups of three consecutive observations.
    group = np.kron(np.arange(100), np.r_[1, 1, 1])

    # Constraint: the fourth parameter equals zero.
    L = np.r_[[[0, 0, 0, 1]]]
    R = np.r_[0, ]

    for cov_struct in (Independence(), Exchangeable()):
        model = GEE(endog, exog, group, None, family, cov_struct,
                    constraint=(L, R))
        fitted = model.fit()
        assert_almost_equal(fitted.params[3], 0, decimal=10)
def setup_class(cls):
    """Fit a Gaussian GEE and a default GLM on the same simulated data.

    Sets ``cls.result1`` (GEE fit) and ``cls.result2`` (GLM fit).
    """
    nobs = 100
    formula = "Y ~ X1 + X2 + X3"

    np.random.seed(987126)
    # Draw order matters for reproducibility: response first, then covariates.
    response = np.random.normal(size=nobs)
    covariates = [np.random.normal(size=nobs) for _ in range(3)]
    groups = np.kron(np.arange(20), np.ones(5))

    data = pd.DataFrame({
        "Y": response,
        "X1": covariates[0],
        "X2": covariates[1],
        "X3": covariates[2],
    })

    gee_model = GEE.from_formula(formula, groups, data,
                                 family=families.Gaussian(),
                                 cov_struct=Independence())
    cls.result1 = gee_model.fit()

    cls.result2 = GLM.from_formula(formula, data=data).fit()
def setup_class(cls):
    """Fit a Poisson GEE and a Poisson GLM on the same simulated data.

    Sets ``cls.result1`` (GEE fit) and ``cls.result2`` (GLM fit).  Both
    models share one family instance.
    """
    nobs = 100
    formula = "Y ~ X1 + X2 + X3"
    family = families.Poisson()

    np.random.seed(987126)
    # Draw order matters for reproducibility: response, covariates, groups.
    response = np.exp(1 + np.random.normal(size=nobs))
    covariates = [np.random.normal(size=nobs) for _ in range(3)]
    groups = np.random.randint(0, 4, size=nobs)

    data = pd.DataFrame({
        "Y": response,
        "X1": covariates[0],
        "X2": covariates[1],
        "X3": covariates[2],
    })

    gee_model = GEE.from_formula(formula, groups, data, family=family,
                                 cov_struct=Independence())
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=data, family=family)
    cls.result2 = glm_model.fit(disp=False)
def setup_class(cls):
    """Fit a Gamma (log link) GEE and GLM on the same simulated data.

    Sets ``cls.result1`` (GEE fit) and ``cls.result2`` (GLM fit).  Both
    models share one family instance.
    """
    # adjusted for Gamma, not in test_gee.py
    nobs = 100
    formula = "Y ~ X1 + X2 + X3"
    family = families.Gamma(link=links.log)

    np.random.seed(987126)
    # Draw order matters for reproducibility: response, covariates, groups.
    response = np.exp(0.1 + np.random.normal(size=nobs))  # log-normal
    covariates = [np.random.normal(size=nobs) for _ in range(3)]
    groups = np.random.randint(0, 4, size=nobs)

    data = pd.DataFrame({
        "Y": response,
        "X1": covariates[0],
        "X2": covariates[1],
        "X3": covariates[2],
    })

    gee_model = GEE.from_formula(formula, groups, data, family=family,
                                 cov_struct=Independence())
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=data, family=family)
    cls.result2 = glm_model.fit(disp=False)
def test_logistic(self):
    """
    Compare binomial GEE fits with reference values from R ``gee``.

    Independence and exchangeable fits are checked against the R values,
    both for the array interface and for the formula interface.  The
    autoregressive model is fit but not compared.

    R code for comparing results:

    library(gee)
    Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]

    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="independence")
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="exchangeable")
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="AR-M")
    sma = summary(ma)
    u = coefficients(sma)
    cfa = paste(u[,1], collapse=",")
    sea = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
    """
    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model: 0, 1, 2, ... within each
    # group, in row order.
    T = np.zeros(len(endog))
    for label in set(group):
        rows = np.flatnonzero(group == label)
        T[rows] = np.arange(len(rows))

    family = Binomial()
    vi = Independence()
    ve = Exchangeable()
    va = Autoregressive()

    # From R gee
    cf = [
        [0.0167272965285882, 1.13038654425893,
         -1.86896345082962, 1.09397608331333],
        [0.0178982283915449, 1.13118798191788,
         -1.86133518416017, 1.08944256230299],
        [0.0109621937947958, 1.13226505028438,
         -1.88278757333046, 1.09954623769449],
    ]
    se = [
        [0.127291720283049, 0.166725808326067,
         0.192430061340865, 0.173141068839597],
        [0.127045031730155, 0.165470678232842,
         0.192052750030501, 0.173174779369249],
        [0.127240302296444, 0.170554083928117,
         0.191045527104503, 0.169776150974586],
    ]

    for cov_struct, coefs, errors in zip((vi, ve, va), cf, se):
        fitted = GEE(endog, exog, group, T, family, cov_struct).fit()
        # The autoregressive fit is run but its values are not compared.
        if cov_struct is va:
            continue
        assert_almost_equal(fitted.params, coefs, decimal=6)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=6)

    # Test with formulas
    # Columns: response, group id, then every exog column except the first.
    columns = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1))
    frame.columns = columns

    for cov_struct, coefs, errors in zip((vi, ve), cf, se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                 family=family, cov_struct=cov_struct)
        fitted = model.fit()
        assert_almost_equal(fitted.params, coefs, decimal=6)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=6)
def _plot_permutation_panel(ax, permuted, observed, number_of_permutations,
                            label, ylabel=None):
    """Draw one histogram panel with the observed coefficient and its p-value.

    The p-value is the fraction of permuted coefficients greater than the
    observed one, folded to ``1 - p`` when it exceeds 0.5, and is appended
    to ``label`` with two decimals.
    """
    ax.hist(permuted, bins=100)
    ax.axvline(x=observed, color='#fc9272')
    p = (permuted > observed).sum() / float(number_of_permutations)
    if p > 0.5:
        p = 1 - p
    ax.set_xlabel(label + '{0:.2f}'.format(p))
    if ylabel is not None:
        ax.set_ylabel(ylabel)


def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    """Permutation test of ``dependent ~ Age + Sex`` using a Poisson GEE.

    Node attributes, degree and betweenness centrality of ``network`` are
    assembled into one table.  The model is fit once on the real data and
    then ``number_of_permutations`` times with the dependent column
    randomly permuted.  A three-panel histogram of the permuted
    coefficients is saved as ``<dependent>_Permutation_test.png`` under
    ``output_path`` and then shown.

    Parameters
    ----------
    dependent : str
        Name of the response column; it forms the formula left-hand side.
    network : graph
        Object providing ``nodes(data=True)`` and ``degree()`` and accepted
        by ``nx.betweenness_centrality``.
    number_of_permutations : int
        Number of permuted fits.
    output_path : str
        Directory prefix for the saved figure.
    """
    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import Independence
    from statsmodels.genmod.families import Poisson

    nodes = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                   orient='index')
    degree = pd.DataFrame.from_dict(dict(network.degree()), orient='index')
    centrality = pd.DataFrame.from_dict(
        dict(nx.betweenness_centrality(network)), orient='index')

    h1 = pd.concat([nodes, degree, centrality], axis=1).reset_index(0)
    # The assignment requires exactly eight columns after the concat.
    h1.columns = [
        'ID', 'Age', 'Species', 'type', 'Location', 'Sex', 'degree',
        'centrality'
    ]
    h1['degree_dist'] = h1.degree / float(h1.degree.max())

    equation = dependent + "~ Age + Sex"

    # Fit on the observed data.
    model = GEE.from_formula(equation, "Location", h1,
                             cov_struct=Independence(), family=Poisson())
    main_model_result = model.fit()
    main_result = pd.DataFrame(main_model_result.params).T

    # Refit with the dependent column shuffled.
    degree_random_coeff = []
    for _ in range(number_of_permutations):
        rand_h1 = h1.copy()
        rand_h1[dependent] = np.random.permutation(h1[dependent])
        model = GEE.from_formula(equation, "Location", rand_h1,
                                 cov_struct=Independence(), family=Poisson())
        degree_random_coeff.append(model.fit().params)

    d = pd.DataFrame.from_records(degree_random_coeff)

    # NOTE(review): the name is unused; the import is kept in case it is
    # relied on for plot styling side effects -- confirm before removing.
    import seaborn as sns  # noqa: F401

    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    panels = (
        (ax1, 'Age[T.HY]',
         'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= ',
         'Frequency'),
        (ax2, 'Age[T.UNK]',
         'Coefficient Age: Unknown\n(ref: After Hatch Year)\np= ',
         None),
        (ax3, 'Sex[T.M]',
         'Coefficient Sex: Male\n (ref: Female)\np= ',
         None),
    )
    for ax, term, label, ylabel in panels:
        _plot_permutation_panel(ax, d[term], main_result[term].values[0],
                                number_of_permutations, label, ylabel)

    title = 'permutation test for ' + dependent
    f.suptitle(title)
    plt.tight_layout()
    plt.savefig(output_path + '/' + dependent + '_Permutation_test.png',
                dpi=300)
    plt.show()
# Loop over data generating models
for gendat in gendats:

    # Per-generator accumulators, reset for every data generating model.
    pvalues = []
    params = []
    std_errors = []
    dparams = []

    for j in range(nrep):

        da, va = gendat()
        ga = Poisson()

        # Poisson seems to be more sensitive to starting values,
        # so we run the independence model first.
        md = GEE(da.endog, da.exog, da.group, da.time, ga,
                 Independence())
        mdf = md.fit()

        md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
        mdf = md.fit(start_params=mdf.params)
        if mdf is None or (not mdf.converged):
            print("Failed to converge")
            continue

        scale_inv = 1. / md.estimate_scale()
        dparams.append(np.r_[va.dparams, scale_inv])
        params.append(np.asarray(mdf.params))
        # standard_errors is called so that the values, not the bound
        # method, are stored.
        std_errors.append(np.asarray(mdf.standard_errors()))

        da, va = gendat()
        ga = Poisson()
for gendat in gendats:

    # Per-generator accumulators, reset for every data generating model.
    pvalues = []
    params = []
    std_errors = []
    dparams = []

    for j in range(nrep):

        da, va = gendat()
        ga = Poisson()

        # Poisson seems to be more sensitive to starting values,
        # so we run the independence model first.
        md = GEE(da.endog, da.exog, da.group, da.time, ga,
                 Independence())
        mdf = md.fit()

        md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
        mdf = md.fit(start_params=mdf.params)
        if mdf is None or (not mdf.converged):
            print("Failed to converge")
            continue

        scale_inv = 1. / md.estimate_scale()
        dparams.append(np.r_[va.dparams, scale_inv])
        params.append(np.asarray(mdf.params))
        # standard_errors is called so that the values, not the bound
        # method, are stored.
        std_errors.append(np.asarray(mdf.standard_errors()))

        da, va = gendat()
        ga = Poisson()
def test_poisson(self):
    """
    Compare Poisson GEE fits with reference values from R ``gee``.

    Independence and exchangeable fits are checked against the R values,
    both for the array interface and for the formula interface.

    R code for comparing results:

    library(gee)
    Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    X4 = Z[,6]
    X5 = Z[,7]

    mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
             corstr="independence", scale.fix=TRUE)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
             corstr="exchangeable", scale.fix=TRUE)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """
    family = Poisson()
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    vi = Independence()
    ve = Exchangeable()

    # From R gee
    cf = [
        [-0.0364450410793481, -0.0543209391301178,
         0.0156642711741052, 0.57628591338724,
         -0.00465659951186211, -0.477093153099256],
        [-0.0315615554826533, -0.0562589480840004,
         0.0178419412298561, 0.571512795340481,
         -0.00363255566297332, -0.475971696727736],
    ]
    se = [
        [0.0611309237214186, 0.0390680524493108,
         0.0334234174505518, 0.0366860768962715,
         0.0304758505008105, 0.0316348058881079],
        [0.0610840153582275, 0.0376887268649102,
         0.0325168379415177, 0.0369786751362213,
         0.0296141014225009, 0.0306115470200955],
    ]

    for cov_struct, coefs, errors in zip((vi, ve), cf, se):
        fitted = GEE(endog, exog, group_n, None, family, cov_struct).fit()
        assert_almost_equal(fitted.params, coefs, decimal=5)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=6)

    # Test with formulas
    # Columns: response, group id, then every exog column except the first.
    columns = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                       axis=1))
    frame.columns = columns

    for cov_struct, coefs, errors in zip((vi, ve), cf, se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                                 family=family, cov_struct=cov_struct)
        fitted = model.fit()
        assert_almost_equal(fitted.params, coefs, decimal=5)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=6)
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 12 11:36:51 2016

@author: emg

Fit a Poisson GEE with an independence structure to author counts and
print the fit summary.
"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

fam = Poisson()
ind = Independence()

# NOTE(review): `authors` is not defined in the visible part of this script;
# it must be provided before this point -- confirm where it comes from.
formula = "author_count ~ top + mod"
model1 = GEE.from_formula(formula, "author", authors,
                          cov_struct=ind, family=fam)

result1 = model1.fit()
print(result1.summary())
def test_linear(self):
    """
    Compare Gaussian GEE fits with reference values from R ``gee``.

    Independence and exchangeable fits are checked against the R values,
    both for the array interface and for the formula interface.

    R code for comparing results:

    library(gee)

    Z = read.csv("results/gee_linear_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]

    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
             corstr="independence", tol=1e-8, maxit=100)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
             corstr="exchangeable", tol=1e-8, maxit=100)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """
    family = Gaussian()
    endog, exog, group = load_data("gee_linear_1.csv")

    vi = Independence()
    ve = Exchangeable()

    # From R gee
    cf = [
        [-0.01850226507491, 0.81436304278962,
         -1.56167635393184, 0.794239361055003],
        [-0.0182920577154767, 0.814898414022467,
         -1.56194040106201, 0.793499517527478],
    ]
    se = [
        [0.0440733554189401, 0.0479993639119261,
         0.0496045952071308, 0.0479467597161284],
        [0.0440369906460754, 0.0480069787567662,
         0.049519758758187, 0.0479760443027526],
    ]

    for cov_struct, coefs, errors in zip((vi, ve), cf, se):
        fitted = GEE(endog, exog, group, None, family, cov_struct).fit()
        assert_almost_equal(fitted.params, coefs, decimal=10)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=10)

    # Test with formulas
    # Columns: response, group id, then every exog column except the first.
    columns = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1))
    frame.columns = columns

    for cov_struct, coefs, errors in zip((vi, ve), cf, se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                 family=family, cov_struct=cov_struct)
        fitted = model.fit()
        assert_almost_equal(fitted.params, coefs, decimal=10)
        assert_almost_equal(fitted.standard_errors(), errors, decimal=10)