def test_sensitivity(self):
    """Exercise ``params_sensitivity`` on a Gaussian/exchangeable GEE fit.

    Simulated standard-normal data (100 observations in 50 groups of
    size 2) are fit via the formula interface.  The sensitivity analysis
    is requested for two dependence values between 0 and 0.5; the test
    checks the number of returned fits, their dependence parameters and
    the first coefficient of each fit.
    """
    cov = Exchangeable()
    family = Gaussian()

    nobs = 100
    # The draws are made in the order Y, X1, X2 so the random stream is
    # consumed exactly as the expected values below require.
    response = np.random.normal(size=nobs)
    x1 = np.random.normal(size=nobs)
    x2 = np.random.normal(size=nobs)
    groups = np.kron(np.arange(50), np.r_[1, 1])
    frame = pd.DataFrame({"Y": response, "X1": x1, "X2": x2})

    model = GEE.from_formula("Y ~ X1 + X2", groups, frame,
                             family=family, cov_struct=cov)
    fitted = model.fit()

    ps = fitted.params_sensitivity(0, 0.5, 2)
    assert_almost_equal(len(ps), 2)

    dep_values = [res.cov_struct.dep_params for res in ps]
    assert_almost_equal(dep_values, [0.0, 0.5])

    # Regression test
    # NOTE(review): no random seed is set inside this block, so these
    # reference values only hold if the generator is seeded elsewhere --
    # TODO confirm.
    first_coefs = [res.params[0] for res in ps]
    assert_almost_equal(first_coefs, np.r_[-0.1256575, -0.126747036])
def gendat_exchangeable():
    """Simulate a data set and pair it with an exchangeable structure.

    Returns
    -------
    tuple
        ``(simulator, cov_struct)``: the configured ``Exchangeable_simulator``
        after ``simulate()`` has been called on it, and a new
        ``Exchangeable()`` instance.
    """
    simulator = Exchangeable_simulator()
    simulator.params = np.r_[2., 0.2, 0.2, -0.1, -0.2]
    simulator.ngroups = 200
    simulator.dparams = [0.3]
    simulator.simulate()

    cov_struct = Exchangeable()
    return simulator, cov_struct
def vcfassoc(formula, covariate_df, groups=None):
    """Fit a binomial association model and summarise the genotype term.

    The design matrices are built from `formula` with patsy.  Which model
    is fit depends on `groups`:

    * ``None`` -- a binomial GLM (rows with missing values dropped).
    * a ``pandas.DataFrame`` or ``numpy.ndarray`` -- GLS on the logit of
      the response, with `groups` supplying ``sigma``.
    * anything else -- used as a key into `covariate_df` to obtain group
      labels for a binomial GEE with an exchangeable structure.

    Parameters
    ----------
    formula : object
        Converted with ``str()`` and passed to ``patsy.dmatrices``.
    covariate_df : pandas.DataFrame
        Data referenced by `formula` (and by `groups` in the GEE case).
    groups : None, key, pandas.DataFrame or numpy.ndarray, optional
        See above.

    Returns
    -------
    dict
        Keys ``'OR'``, ``'pvalue'``, ``'z'`` and ``'OR_CI'`` for the term
        selected by ``get_genotype_ix``; ``'df_resid'`` is added when the
        fitted result exposes that attribute.
    """
    y, X = patsy.dmatrices(str(formula), covariate_df,
                           return_type='dataframe')
    # get the column containing genotype
    ix = get_genotype_ix(X)
    Binomial = sm.families.Binomial
    logit = sm.families.links.Logit()

    if groups is None:
        model = sm.GLM(y, X, missing='drop', family=Binomial())
    elif isinstance(groups, pd.DataFrame):
        # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the
        # label-based equivalent for aligning sigma with the rows kept
        # in X.
        sigma = groups.loc[X.index, X.index]
        model = sm.GLS(logit(y), X, sigma=sigma)
    elif isinstance(groups, np.ndarray):
        # A plain array carries no labels to align on (the old ``.ix``
        # call raised AttributeError here), so it is passed through.
        # NOTE(review): assumes the array already matches the rows of X
        # -- confirm with callers.
        model = sm.GLS(logit(y), X, sigma=groups)
    else:
        cov = Exchangeable()
        model = sm.GEE(y, X, groups=covariate_df[groups], cov_struct=cov,
                       family=Binomial())

    result = model.fit(maxiter=1000)

    # Replacement for the removed ``.ix`` accessor: integer selectors are
    # treated as positions, everything else as a row label.
    # NOTE(review): depends on what get_genotype_ix returns -- confirm.
    conf_int = result.conf_int()
    if isinstance(ix, (int, np.integer)):
        ci_row = conf_int.iloc[ix, :]
    else:
        ci_row = conf_int.loc[ix, :]

    res = {
        'OR': np.exp(result.params[ix]),
        'pvalue': result.pvalues[ix],
        'z': result.tvalues[ix],
        'OR_CI': tuple(np.exp(ci_row)),
    }
    # Not every result class provides residual degrees of freedom.
    try:
        res['df_resid'] = result.df_resid
    except AttributeError:
        pass
    return res
def test_post_estimation(self):
    """Check ``fittedvalues`` and ``resid`` of a Gaussian GEE fit.

    For the linear data set, the fitted values must equal ``exog @ params``
    and the residuals must equal ``endog`` minus that linear predictor.
    """
    family = Gaussian()
    endog, exog, group = load_data("gee_linear_1.csv")
    cov = Exchangeable()

    fit = GEE(endog, exog, group, None, family, cov).fit()

    linear_predictor = np.dot(exog, fit.params)
    assert_almost_equal(linear_predictor, fit.fittedvalues)
    assert_almost_equal(endog - linear_predictor, fit.resid)
def test_linear_constrained(self):
    """Check that a linear constraint is honoured by the GEE fit.

    A Gaussian model is fit to simulated data (100 groups of size 3)
    under the constraint that the last coefficient equals zero, once
    with an independence and once with an exchangeable structure.
    """
    family = Gaussian()

    exog = np.random.normal(size=(300, 4))
    exog[:, 0] = 1
    true_coefs = np.r_[1, 1, 0, 0.2]
    endog = np.dot(exog, true_coefs) + np.random.normal(size=300)
    group = np.kron(np.arange(100), np.r_[1, 1, 1])

    independence = Independence()
    exchangeable = Exchangeable()

    # Constraint L @ params == R selects the fourth coefficient.
    L = np.r_[[[0, 0, 0, 1]]]
    R = np.r_[0, ]

    for cov in (independence, exchangeable):
        model = GEE(endog, exog, group, None, family, cov,
                    constraint=(L, R))
        fit = model.fit()
        assert_almost_equal(fit.params[3], 0, decimal=10)
def test_margins(self):
    """Smoke test for ``GEEMargins`` on a binomial/exchangeable fit.

    No values are asserted; the test only requires that building the
    margins object and calling ``summary()`` complete without raising.
    """
    n = 300

    exog = np.random.normal(size=(n, 4))
    exog[:, 0] = 1
    # Second column is a 0/1 indicator derived from the third column.
    exog[:, 1] = 1 * (exog[:, 2] < 0)

    group = np.kron(np.arange(n / 4), np.ones(4))
    time = np.zeros((n, 1))

    beta = np.r_[0, 1, -1, 0.5]
    linear_predictor = np.dot(exog, beta)
    prob = 1 / (1 + np.exp(-linear_predictor))
    endog = 1 * (np.random.uniform(size=n) < prob)

    family = Binomial()
    cov = Exchangeable()

    fit = GEE(endog, exog, group, time, family, cov).fit()

    margins = GEEMargins(fit, ())
    margins.summary()
def test_poisson(self):
    """
    Compare Poisson GEE estimates and standard errors with R's ``gee``.

    Both the array interface and the formula interface are checked, each
    with an independence and an exchangeable structure.

    R code used to produce the reference values:

    library(gee)
    Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    X4 = Z[,6]
    X5 = Z[,7]

    mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
            corstr="independence", scale.fix=TRUE)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
            corstr="exchangeable", scale.fix=TRUE)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """
    family = Poisson()
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    independence = Independence()
    exchangeable = Exchangeable()
    cov_structs = (independence, exchangeable)

    # From R gee
    expected_params = [
        [
            -0.0364450410793481,
            -0.0543209391301178,
            0.0156642711741052,
            0.57628591338724,
            -0.00465659951186211,
            -0.477093153099256,
        ],
        [
            -0.0315615554826533,
            -0.0562589480840004,
            0.0178419412298561,
            0.571512795340481,
            -0.00363255566297332,
            -0.475971696727736,
        ],
    ]
    expected_se = [
        [
            0.0611309237214186,
            0.0390680524493108,
            0.0334234174505518,
            0.0366860768962715,
            0.0304758505008105,
            0.0316348058881079,
        ],
        [
            0.0610840153582275,
            0.0376887268649102,
            0.0325168379415177,
            0.0369786751362213,
            0.0296141014225009,
            0.0306115470200955,
        ],
    ]

    # Array interface
    for cov, coefs, ses in zip(cov_structs, expected_params, expected_se):
        fit = GEE(endog, exog, group_n, None, family, cov).fit()
        assert_almost_equal(fit.params, coefs, decimal=5)
        assert_almost_equal(fit.standard_errors(), ses, decimal=6)

    # Test with formulas
    # The first exog column is left out of the frame; the formula adds
    # its own intercept.
    n_covariates = exog.shape[1] - 1
    column_names = ["Y", "Id"]
    column_names += ["X%d" % (k + 1) for k in range(n_covariates)]
    values = np.concatenate(
        (endog[:, None], group_n[:, None], exog[:, 1:]), axis=1)
    frame = pd.DataFrame(values, columns=column_names)

    for cov, coefs, ses in zip(cov_structs, expected_params, expected_se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                                 family=family, cov_struct=cov)
        fit = model.fit()
        assert_almost_equal(fit.params, coefs, decimal=5)
        assert_almost_equal(fit.standard_errors(), ses, decimal=6)
def test_linear(self):
    """
    Compare Gaussian GEE estimates and standard errors with R's ``gee``.

    Both the array interface and the formula interface are checked, each
    with an independence and an exchangeable structure.

    R code used to produce the reference values:

    library(gee)

    Z = read.csv("results/gee_linear_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]

    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
            corstr="independence", tol=1e-8, maxit=100)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
            corstr="exchangeable", tol=1e-8, maxit=100)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """
    family = Gaussian()
    endog, exog, group = load_data("gee_linear_1.csv")

    independence = Independence()
    exchangeable = Exchangeable()
    cov_structs = (independence, exchangeable)

    # From R gee
    expected_params = [
        [
            -0.01850226507491,
            0.81436304278962,
            -1.56167635393184,
            0.794239361055003,
        ],
        [
            -0.0182920577154767,
            0.814898414022467,
            -1.56194040106201,
            0.793499517527478,
        ],
    ]
    expected_se = [
        [
            0.0440733554189401,
            0.0479993639119261,
            0.0496045952071308,
            0.0479467597161284,
        ],
        [
            0.0440369906460754,
            0.0480069787567662,
            0.049519758758187,
            0.0479760443027526,
        ],
    ]

    # Array interface
    for cov, coefs, ses in zip(cov_structs, expected_params, expected_se):
        fit = GEE(endog, exog, group, None, family, cov).fit()
        assert_almost_equal(fit.params, coefs, decimal=10)
        assert_almost_equal(fit.standard_errors(), ses, decimal=10)

    # Test with formulas
    # The first exog column is left out of the frame; the formula adds
    # its own intercept.
    n_covariates = exog.shape[1] - 1
    column_names = ["Y", "Id"]
    column_names += ["X%d" % (k + 1) for k in range(n_covariates)]
    values = np.concatenate(
        (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
    frame = pd.DataFrame(values, columns=column_names)

    for cov, coefs, ses in zip(cov_structs, expected_params, expected_se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                 family=family, cov_struct=cov)
        fit = model.fit()
        assert_almost_equal(fit.params, coefs, decimal=10)
        assert_almost_equal(fit.standard_errors(), ses, decimal=10)
def test_logistic(self):
    """
    Compare binomial GEE estimates and standard errors with R's ``gee``.

    The array interface is run with independence, exchangeable and
    autoregressive structures; the autoregressive fit is executed but
    not compared with the reference values.  The formula interface is
    checked for the independence and exchangeable structures.

    R code for comparing results:

    library(gee)
    Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]

    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
            corstr="independence")
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
            corstr="exchangeable")
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
            corstr="AR-M")
    sma = summary(ma)
    u = coefficients(sma)
    cfa = paste(u[,1], collapse=",")
    sea = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
    """
    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model: within each group the
    # observations are numbered 0, 1, 2, ... in row order.
    time = np.zeros(len(endog))
    for label in set(group):
        members = np.flatnonzero(group == label)
        time[members] = lrange(len(members))

    family = Binomial()
    exchangeable = Exchangeable()
    independence = Independence()
    autoregressive = Autoregressive()

    # From R gee
    expected_params = [
        [
            0.0167272965285882,
            1.13038654425893,
            -1.86896345082962,
            1.09397608331333,
        ],
        [
            0.0178982283915449,
            1.13118798191788,
            -1.86133518416017,
            1.08944256230299,
        ],
        [
            0.0109621937947958,
            1.13226505028438,
            -1.88278757333046,
            1.09954623769449,
        ],
    ]
    expected_se = [
        [
            0.127291720283049,
            0.166725808326067,
            0.192430061340865,
            0.173141068839597,
        ],
        [
            0.127045031730155,
            0.165470678232842,
            0.192052750030501,
            0.173174779369249,
        ],
        [
            0.127240302296444,
            0.170554083928117,
            0.191045527104503,
            0.169776150974586,
        ],
    ]

    # Array interface
    array_structs = (independence, exchangeable, autoregressive)
    for cov, coefs, ses in zip(array_structs, expected_params, expected_se):
        fit = GEE(endog, exog, group, time, family, cov).fit()
        if cov is autoregressive:
            # The autoregressive fit only has to run; it is not compared.
            continue
        assert_almost_equal(fit.params, coefs, decimal=6)
        assert_almost_equal(fit.standard_errors(), ses, decimal=6)

    # Test with formulas
    # The first exog column is left out of the frame; the formula adds
    # its own intercept.
    n_covariates = exog.shape[1] - 1
    column_names = ["Y", "Id"]
    column_names += ["X%d" % (k + 1) for k in range(n_covariates)]
    values = np.concatenate(
        (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
    frame = pd.DataFrame(values, columns=column_names)

    formula_structs = (independence, exchangeable)
    for cov, coefs, ses in zip(formula_structs, expected_params, expected_se):
        model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                 family=family, cov_struct=cov)
        fit = model.fit()
        assert_almost_equal(fit.params, coefs, decimal=6)
        assert_almost_equal(fit.standard_errors(), ses, decimal=6)
def gee_cluster(formula, cluster, covs, coef, cov_struct=None, family=None):
    """An example of a `model_fn`; any function with a similar signature
    can be used.

    Parameters
    ----------

    formula : str
        R (patsy) style formula. Must contain 'methylation': e.g.:
            methylation ~ age + gender + race

    cluster : list of Features
        cluster of features from clustering or a region.
        most functions will create a methylation matrix with:
        >> meth = np.array([f.values for f in features])

    covs : pandas.DataFrame
        Contains covariates from `formula`

    coef : str
        coefficient of interest, e.g. 'age'

    cov_struct : object, optional
        one of the covariance structures provided by statsmodels.
        Likely either Exchangeable() or Independence().  When omitted, a
        new Exchangeable() is created for this call.

    family : object, optional
        one of the families provided by statsmodels.  When omitted, a new
        Gaussian() is created for this call.  With Gaussian, the model is
        fit to each feature's `values`.  With NB or Poisson, it is fit to
        each feature's `methylated` with log(`counts`) as the offset
        (clusters of CountFeatures).

    Returns
    -------

    result : dict
        whatever `get_ptc` returns for the fitted model and `coef`;
        expected to hold at least the p-value ('p') and the coefficient
        estimate ('coef').

    Raises
    ------
    ValueError
        If `family` is not a Gaussian, NB or Poisson instance.
    """
    # Defaults are built per call: instances created in the signature are
    # shared by every call, and a covariance structure carries state from
    # one fit to the next.
    if cov_struct is None:
        cov_struct = Exchangeable()
    if family is None:
        family = Gaussian()

    if isinstance(family, Gaussian):
        cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
        res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
                               cov_struct=cov_struct, family=family).fit()
    elif isinstance(family, (NB, Poisson)):
        cov_rep = long_covs(covs,
                            np.array([f.methylated for f in cluster]),
                            counts=np.array([f.counts for f in cluster]))
        res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
                               cov_struct=cov_struct, family=family,
                               offset=np.log(cov_rep['counts'])).fit()
    else:
        # ValueError is still caught by callers handling Exception.
        raise ValueError(
            "Only Gaussian, NB and Poisson families are supported")

    return get_ptc(res, coef)
# Approach one to generalized linear models for panel data:
# Generalized Estimating Equations (GEE).

# Poisson model with an autoregressive dependence structure, grouped by
# state, with the log of the 2019 population count as the offset.
poi = Poisson()
ar = Autoregressive()
gee_model0 = GEE.from_formula(
    "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black",
    groups="state",
    data=US_cases_long_demogr_week,
    time='week_of_year',
    cov_struct=ar,
    family=poi,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
gee_model0_results = gee_model0.fit(maxiter=200)
print(gee_model0_results.summary())
print(ar.summary())
print("scale=%.2f" % (gee_model0_results.scale))

# The fit above emits "IterationLimitWarning: Iteration limit reached prior
# to convergence", even when maxiter is raised to 2000.  Specific starting
# values are therefore needed to get the estimating algorithm to converge.

# First fit the same model with an exchangeable dependence structure.
# From this model the within-state correlation is roughly 0.077.
fam = Poisson()
ex = Exchangeable()
ex_model = GEE.from_formula(
    "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black",
    groups="state",
    data=US_cases_long_demogr_week,
    cov_struct=ex,
    family=fam,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
ex_results = ex_model.fit()
print(ex_results.summary())
print(ex.summary())

# Use the exchangeable results as starting values for the model with the
# autoregressive dependence structure.  The warning is still raised.
poi = Poisson()
ar = Autoregressive()
ar.dep_params = 0.077
gee_model1 = GEE.from_formula(
    "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black",
    groups="state",
    data=US_cases_long_demogr_week,
    time='week_of_year',
    cov_struct=ar,
    family=poi,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
gee_model1_results = gee_model1.fit(
    maxiter=200,
    start_params=ex_results.params,
)
print(gee_model1_results.summary())
print(ar.summary())