def __init__(self, X, dist='OLS', alphas=[0.05, 0.01], log=True): self.X, self.xLen, self.dist, self.permute, self.zero_prob, self.alphas, self.dfd, self.dfn = X, len( X.names), dist, self.permute_REG, 0.0, alphas, len( X.names) - 1, len(X.array) - len(X.names) #F_KEY = {'TW': sfams.Tweedie(link=slinks.log), 'PO': sfams.Poisson(link=slinks.log), 'NB': sfams.NegativeBinomial(link=slinks.log), 'GA': sfams.Gamma(link=slinks.log), 'NO': sfams.Gaussian(link=slinks.log)} F_KEY = { 'TW': sfams.Tweedie(), 'PO': sfams.Poisson(), 'NB': sfams.NegativeBinomial(), 'GA': sfams.Gamma(), 'NO': sfams.Gaussian() } if self.dist.upper() == 'OLS': self.reg, self.execute = sm.OLS, self.execute_REG elif self.dist.upper()[0] == 'G': self.reg, self.execute, self.family = scm.ZeroInflatedNegativeBinomialP, self.execute_GIN, F_KEY[ 'NB'] elif self.dist.upper()[0] != 'Z': self.execute, self.permute, self.family = self.execute_GLM, self.permute_GLM, F_KEY[ self.dist.upper()[0:2]] else: if self.dist.upper()[0:3] in ['ZIP', 'ZPO']: self.reg, self.execute, self.family = CUSTOM_ZPO, self.execute_ZIN, F_KEY[ 'PO'] elif self.dist.upper()[0:3] in ['ZIN', 'ZNB']: self.reg, self.execute, self.family = CUSTOM_ZNB, self.execute_ZIN, F_KEY[ 'NB'] elif self.dist.upper()[0:3] in ['ZGP', 'ZGP']: self.reg, self.execute, self.family = CUSTOM_ZGP, self.execute_ZIN, F_KEY[ 'GP']
def test_fit_poisson():
    """Fit GPCA to simulated rank-d Poisson data and compare to the truth."""
    from numpy.testing import assert_allclose

    np.random.seed(23424)
    n, p = 1000, 5

    for d in range(1, 6):
        # Simulate a rank-d log-linear structure.
        intercept = np.linspace(3, 5, p)
        loadings = np.random.normal(size=(p, d))
        loadings, _, _ = np.linalg.svd(loadings, 0)
        latent = np.random.normal(size=(n, d))
        linpred = np.dot(latent, loadings.T) + intercept
        endog = np.random.poisson(np.exp(linpred), size=(n, p))

        pca = GPCA(endog, d, family=families.Poisson())
        r = pca.fit(maxiter=50)

        # Intercepts should recover the population values.
        assert_allclose(intercept, r.intercept, atol=1e-2, rtol=1e-1)

        # Compare projections onto the true/estimated factor spaces.
        proj_true = np.dot(loadings, loadings.T)
        proj_est = np.dot(r.factors, r.factors.T)
        assert_allclose(np.trace(np.dot(proj_true, proj_est)), d, atol=1e-2)

        # Scores should be approximately centered.
        scores = pca.scores(r.params)
        assert_allclose(scores.mean(), 0, atol=1e-3)

        assert r.score_norm < 0.005
def test_poisson_epil(self):
    """GEE with independence covariance should match GLM on the epil data."""
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "results", "epil.csv")
    data = pd.read_csv(data_path)

    gee_mod = GEE.from_formula("y ~ age + trt + base", data["subject"],
                               data, cov_struct=Independence(),
                               family=Poisson())
    # don't use the wrapper, assert_xxx helpers don't work on it
    gee_rslt = gee_mod.fit()._results

    glm_mod = GLM.from_formula("y ~ age + trt + base", data,
                               family=families.Poisson())
    glm_rslt = glm_mod.fit(scale="X2")._results

    # Coefficients and scale should agree with GLM.
    assert_almost_equal(gee_rslt.params, glm_rslt.params, decimal=6)
    assert_almost_equal(gee_rslt.scale, glm_rslt.scale, decimal=6)
def test_poisson_epil(self):
    """GEE (independence) estimates should agree with GLM on the epil data."""
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))

    md1 = GEE.from_formula("y ~ age + trt + base", data,
                           groups=data["subject"],
                           cov_struct=Independence(), family=Poisson())
    mdf1 = md1.fit()

    md2 = GLM.from_formula("y ~ age + trt + base", data,
                           family=families.Poisson())
    mdf2 = md2.fit(scale="X2")

    # Coefficients and scale should agree with GLM.
    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
def test_fit_poisson(miss_frac, d):
    """Fit GPCA to Poisson data with a fraction of missing observations.

    Parameters
    ----------
    miss_frac : float
        Fraction of observations randomly marked invalid (missing).
    d : int
        Number of latent factors.
    """
    np.random.seed(23424)
    n, p = 2000, 5

    # Simulate a rank-d log-linear structure.
    icept = np.linspace(3, 5, p)
    fac = np.random.normal(size=(p, d))
    fac, _, _ = np.linalg.svd(fac, 0)
    sc = np.random.normal(size=(n, d))
    lp = np.dot(sc, fac.T) + icept
    mu = np.exp(lp)
    endog = np.random.poisson(mu, size=(n, p))

    # BUG FIX: np.bool was removed in NumPy 1.24 (deprecated since 1.20);
    # the builtin bool is the documented replacement.
    valid = (np.random.uniform(size=(n, p)) > miss_frac).astype(bool)

    pca = GPCA(endog, d, valid=valid, family=families.Poisson())
    r = pca.fit()
    icept1, fac1 = r.intercept, r.factors

    # Check intercepts versus population values (warn only -- stochastic).
    if not np.allclose(icept, icept1, atol=1e-2, rtol=1e-1):
        warnings.warn("icept=%s icept1=%s" % (icept, icept1))

    # Check factors versus population values.
    p1 = np.dot(fac, fac.T)
    p2 = np.dot(fac1, fac1.T)
    if not np.allclose(np.trace(np.dot(p1, p2)), d,
                       rtol=[1e-2, 0.05][miss_frac > 0]):
        warnings.warn("d=%s, trace=%s" % (d, np.trace(np.dot(p1, p2))))

    # Scores should be approximately centered.
    scores = pca.scores(r.params)
    if not np.allclose(scores.mean(), 0, atol=[1e-2, 1e-1][miss_frac > 0]):
        warnings.warn(str(scores.mean(0)))

    assert (r.score_norm < 0.01)
def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2,
                 vcp_names=None, vc_names=None):
    """Construct a PoissonBayesMixedGLM from formulas.

    Delegates parsing to ``_BayesMixedGLM.from_formula`` with a Poisson
    family, then re-wraps the resulting design in the Poisson-specific
    class.
    """
    base = _BayesMixedGLM.from_formula(
        formula, vc_formulas, data, family=families.Poisson(),
        vcp_p=vcp_p, fe_p=fe_p, vcp_names=vcp_names, vc_names=vc_names)

    # Copy the parsed components over to the intended class.
    fields = ("endog", "exog_fe", "exog_vc", "ident", "vcp_p", "fe_p",
              "fep_names", "vcp_names", "vc_names")
    return PoissonBayesMixedGLM(**{name: getattr(base, name)
                                   for name in fields})
def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2,
                 vcp_names=None, vc_names=None):
    """Construct a PoissonBayesMixedGLM from formulas.

    Delegates parsing to ``_BayesMixedGLM.from_formula`` with a Poisson
    family, then copies the parsed design into the Poisson-specific class.
    """
    base = _BayesMixedGLM.from_formula(
        formula, vc_formulas, data, family=families.Poisson(),
        vcp_p=vcp_p, fe_p=fe_p)

    # Copy over to the intended class structure.
    fields = ("endog", "exog", "exog_vc", "ident", "vcp_p", "fe_p",
              "fep_names", "vcp_names", "vc_names")
    mod = PoissonBayesMixedGLM(**{name: getattr(base, name)
                                  for name in fields})
    mod.data = base.data
    return mod
def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2,
             fep_names=None, vcp_names=None):
    """Bayesian mixed GLM with the family fixed to Poisson."""
    fam = families.Poisson()
    super(PoissonBayesMixedGLM, self).__init__(
        endog=endog,
        exog_fe=exog_fe,
        exog_vc=exog_vc,
        ident=ident,
        family=fam,
        vcp_p=vcp_p,
        fe_p=fe_p,
        fep_names=fep_names,
        vcp_names=vcp_names)
def setup_class(cls):
    """Fit a Poisson GLM with HC1 robust covariance, for comparison to Stata."""
    cls.res2 = results_st.results_poisson_hc1
    glm_mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = glm_mod.fit(cov_type='HC1')
    cls.bse_rob = cls.res1.bse
    cls.corr_fact = cls.get_correction_factor(cls.res1, sub_kparams=False)
def setup_class(cls):
    """Simulate Poisson data (optionally overdispersed); build full/drop GLMs."""
    nobs, k_vars = 500, 5
    np.random.seed(786452)

    base = np.random.randn(nobs, k_vars)
    base[:, 0] = 1  # intercept column
    extra = np.random.randn(nobs, 2)
    full = np.column_stack((base, extra))

    if cls.dispersed:
        # Unobserved heterogeneity induces overdispersion.
        het = np.random.randn(nobs)
        y = np.random.poisson(np.exp(base.sum(1) * 0.5 + het))
    else:
        y = np.random.poisson(np.exp(base.sum(1) * 0.5))

    cls.exog_extra = extra
    cls.model_full = GLM(y, full, family=families.Poisson())
    cls.model_drop = GLM(y, base, family=families.Poisson())
def setup_class(cls):
    """Poisson GLM; HC1 covariance attached in place via get_robustcov_results."""
    cls.res2 = results_st.results_poisson_hc1
    res = GLM(endog, exog, family=families.Poisson()).fit()
    # Attach the robust covariance to the existing results object in place
    # (equivalent to res.get_robustcov_results('HC1')).
    get_robustcov_results(res._results, 'HC1', use_self=True)
    cls.res1 = res
    cls.bse_rob = res.bse
    cls.corr_fact = cls.get_correction_factor(res, sub_kparams=False)
def setup_class(cls):
    """Poisson GLM with HC1; small-sample correction factor computed by hand."""
    cls.res2 = results_st.results_poisson_hc1
    res = GLM(endog, exog, family=families.Poisson()).fit(cov_type='HC1')
    cls.res1 = res
    cls.bse_rob = res.bse
    nobs, k_vars = res.model.exog.shape
    ratio = (nobs) / float(nobs - 1.)
    # bse comparison needs the square root of the correction factor
    cls.corr_fact = np.sqrt(1. / ratio)
def regression():
    '''Poisson regression example chapter 4.4, p.69'''

    # get the data from the web
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)

    # do the fit
    # FIX: pass a link *instance* rather than the ``identity`` class --
    # passing the class is deprecated in statsmodels and rejected by
    # recent versions.
    p = smf.glm('y~x',
                family=sm_families.Poisson(sm_families.links.identity()),
                data=df)
    print(p.fit().summary())
def setup_class(cls):
    """Compare influence measures: GLM-Poisson vs. discrete Poisson."""
    from statsmodels.discrete.discrete_model import Poisson

    df = data_bin
    exog_cols = ['const', 'log_rate', 'log_volumne']  # sic: column name in data

    res_glm = GLM(df['constrict'], df[exog_cols],
                  family=families.Poisson()).fit(attach_wls=True, atol=1e-10)
    res_poi = Poisson(df['constrict'], df[exog_cols]).fit(tol=1e-10)

    cls.infl0 = res_glm.get_influence()
    cls.infl1 = res_poi.get_influence()
def setup_class(cls):
    """HC1 covariance attached in place; manual small-sample correction."""
    cls.res2 = results_st.results_poisson_hc1
    res = GLM(endog, exog, family=families.Poisson()).fit()
    # Attach the robust covariance to the existing results object in place
    # (equivalent to res.get_robustcov_results('HC1')).
    get_robustcov_results(res._results, 'HC1', use_self=True)
    cls.res1 = res
    cls.bse_rob = res.bse
    nobs, k_vars = res.model.exog.shape
    ratio = (nobs) / float(nobs - 1.)
    # bse comparison needs the square root of the correction factor
    cls.corr_fact = np.sqrt(1. / ratio)
def setup_class(cls):
    """GLM-Poisson vs. discrete Poisson, both with HC0 robust covariance."""
    np.random.seed(987125643)  # not intentional seed
    endog_count = np.random.poisson(endog)
    cls.cov_type = 'HC0'

    glm_mod = GLM(endog_count, exog, family=families.Poisson())
    cls.res1 = glm_mod.fit(cov_type='HC0')

    disc_mod = smd.Poisson(endog_count, exog)
    cls.res2 = disc_mod.fit(cov_type='HC0')

    cls.res1.rtol = 1e-11
def log_linear_models():
    '''Log-linear models chapter 9.7, p 180 & 182 '''

    def poisson_fit(formula, frame):
        # helper: fit a Poisson GLM for the given formula
        return smf.glm(formula, family=sm_families.Poisson(),
                       data=frame).fit()

    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)

    # Minimal model
    model_min = poisson_fit('frequency~1', df)
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = poisson_fit('frequency~site+type', df)
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = smf.glm('frequency~site*type', family = sm_families.Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 -------------------------------------
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    model1 = poisson_fit('freq~GD+CC+GD*CC', df)
    model2 = poisson_fit('freq~GD+CC+GD*CC + AP', df)
    model3 = poisson_fit('freq~GD+CC+GD*CC + AP + AP*CC', df)
    model4 = poisson_fit('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', df)

    print('Ulcer and aspirin')
    print(model4.fittedvalues)
def setup_class(cls):
    """Cluster-robust covariance attached in place via get_robustcov_results."""
    cls.res2 = results_st.results_poisson_clu
    res = GLM(endog, exog, family=families.Poisson()).fit()
    # Attach cluster-robust covariance to the existing results in place.
    get_robustcov_results(res._results, 'cluster',
                          groups=group,
                          use_correction=True,
                          df_correction=True,  # TODO: has no effect
                          use_t=False,  # True,
                          use_self=True)
    cls.res1 = res
    cls.bse_rob = res.bse
    cls.corr_fact = cls.get_correction_factor(res)
def setup_class(cls):
    """Constrained Poisson fit (no exposure): fit_constrained vs. model method."""
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_noexposure_constraint
    cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

    # example without offset
    formula = 'deaths ~ logpyears + smokes + C(agecat)'
    mod = GLM.from_formula(formula, data=data, family=families.Poisson())

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.constraints = lc
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
    cls.res1m = mod.fit_constrained(constr)
def exercise9d2():
    '''Poisson Regression
    chapter 9.2, p.170 & 171 '''

    inFile = r"GLM_data/Table 9.13 Car insurance.xls"
    df = get_data(inFile)
    print(df)

    # interaction terms
    df['carage'] = df['car'] * df['age']
    df['cardist'] = df['car'] * (df['district'])
    df['agedist'] = df['age'] * df['district']

    model = smf.glm('y~car+age+district+carage+cardist+agedist',
                    family=sm_families.Poisson(), data=df,
                    exposure=df['n']).fit()
    print(model.summary())
    # FIX: removed a leftover ``pdb.set_trace()`` debugger breakpoint that
    # would hang any non-interactive run.
def setup_class(cls):
    """Cluster-robust Poisson GLM; also exercises results without
    normalized_cov_params (see #2209)."""
    cls.res2 = results_st.results_poisson_clu
    cov_kwds = dict(groups=group,
                    use_correction=True,
                    df_correction=True)  # TODO: has no effect
    res = GLM(endog, exog, family=families.Poisson()).fit(
        cov_type='cluster', cov_kwds=cov_kwds,
        use_t=False)  # True,
    # The model results, t_test, ... should also work without
    # normalized_cov_params, see #2209
    # Note: we cannot set this on the wrapper, we need res._results
    res._results.normalized_cov_params = None
    cls.res1 = res
    cls.bse_rob = res.bse
    cls.corr_fact = cls.get_correction_factor(res)
def setup_class(cls):
    """Cluster-robust Poisson GLM with a hand-computed correction factor."""
    cls.res2 = results_st.results_poisson_clu
    cov_kwds = dict(groups=group,
                    use_correction=True,
                    df_correction=True)  # TODO: has no effect
    res = GLM(endog, exog, family=families.Poisson()).fit(
        cov_type='cluster', cov_kwds=cov_kwds,
        use_t=False)  # True,
    cls.res1 = res
    cls.bse_rob = res.bse

    nobs, k_vars = res.model.exog.shape
    k_params = len(res.params)
    # n_groups = len(np.unique(group))
    # bse comparison needs the square root of the correction factor
    cls.corr_fact = np.sqrt((nobs - 1.) / float(nobs - k_params))
def setup_class(cls):
    """Constrained Poisson fit with offset: fit_constrained vs. model method."""
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_exposure_constraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    # example with offset
    formula = 'deaths ~ smokes + C(agecat)'
    mod = GLM.from_formula(formula, data=data,
                           family=families.Poisson(),
                           offset=np.log(data['pyears'].values))

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.constraints = lc
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
    cls.res1m = mod.fit_constrained(constr)._results
def setup_class(cls):
    """GEE (independence) should reproduce GLM on simulated data."""
    np.random.seed(987126)
    Y = np.exp(1 + np.random.normal(size=100))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)
    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    family = families.Poisson()
    gee_mod = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                               family=family, cov_struct=Independence())
    cls.result1 = gee_mod.fit()

    glm_mod = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
    cls.result2 = glm_mod.fit(disp=False)
def test_score():
    """Compare the analytic GPCA score to a numeric derivative.

    Bug fix: the numeric-derivative loop reused the variable ``j``,
    clobbering the outer loop index that selects the Gaussian (j == 0)
    vs. Poisson (j == 1) case, so after the first repetition only the
    Poisson branch ever ran.  The inner index is now ``i``.  The base
    log-likelihood is also computed once instead of inside the loop.
    """
    from numpy.testing import assert_allclose
    np.random.seed(23424)

    n = 100
    p = 5

    for j in 0, 1:
        for d in 1, 2, 3:
            for k in range(10):
                if j == 0:
                    # Gaussian case
                    endog = np.random.normal(size=(n, p))
                    pca = GPCA(endog, d)
                    mn = np.random.normal(size=p)
                else:
                    # Poisson case
                    endog = np.random.poisson(100, size=(n, p))
                    pca = GPCA(endog, d, family=families.Poisson())
                    mn = np.log(100) + 0.5 * np.random.normal(size=p)

                qm = np.random.normal(size=(p, d))
                qm, _, _ = np.linalg.svd(qm, 0)
                params = np.concatenate((mn, qm.ravel()))

                ll0 = pca.loglike(params)
                score = pca.score(params)

                # Numeric derivative (forward differences)
                nscore = np.zeros(p + p * d)
                f = 1e-7
                for i in range(p + p * d):
                    params1 = params.copy()
                    params1[i] += f
                    nscore[i] = (pca.loglike(params1) - ll0) / f

                assert_allclose(nscore, score, atol=1e-3, rtol=1e-4)
def poisson_regression():
    '''Poisson Regression
    chapter 9.2, p.170 & 171 '''

    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables.
    # FIX: use .loc instead of chained assignment (df['col'][mask] = ...),
    # which raises SettingWithCopyWarning and can silently fail to write on
    # modern pandas.
    df['smoke'] = np.zeros(len(df))
    df.loc[df['smoking'] == 'smoker', 'smoke'] = 1

    df['agecat'] = np.array([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    df['agesq'] = df['agecat']**2

    df['smkage'] = df['agecat']
    df.loc[df['smoking'] == 'non-smoker', 'smkage'] = 0

    model = smf.glm('deaths~agecat+agesq+smoke+smkage',
                    family=sm_families.Poisson(), data=df,
                    exposure=df["person-years"]).fit()
    print(model.summary())
def test_score_poisson(miss_frac):
    """Check the analytic GPCA score against a numeric gradient with missing data.

    Parameters
    ----------
    miss_frac : float
        Fraction of observations randomly marked invalid (missing).
    """
    np.random.seed(23424)
    n, p = 100, 5

    for d in range(1, 5):
        # Generate rank-d Poisson data.
        icept = np.linspace(3, 5, p)
        fac = np.random.normal(size=(p, d))
        fac, _, _ = np.linalg.svd(fac, 0)
        sc = np.random.normal(size=(n, d))
        lp = np.dot(sc, fac.T) + icept
        mu = np.exp(lp)
        endog = np.random.poisson(mu, size=(n, p))

        # BUG FIX: np.bool was removed in NumPy 1.24 (deprecated since
        # 1.20); the builtin bool is the documented replacement.
        valid = (np.random.uniform(size=(n, p)) > miss_frac).astype(bool)

        pca = GPCA(endog, d, valid=valid, family=families.Poisson())
        par = np.concatenate((icept, fac.ravel()))
        grad = pca.score(par)
        ngrad = nd.Gradient(pca.loglike)(par)
        assert_allclose(grad, ngrad, rtol=1e-4, atol=1e-4)
def setup_class(cls):
    """Cluster-robust Poisson GLM without normalized_cov_params (#2209);
    correction factor computed by hand."""
    cls.res2 = results_st.results_poisson_clu
    cov_kwds = dict(groups=group,
                    use_correction=True,
                    df_correction=True)  # TODO: has no effect
    res = GLM(endog, exog, family=families.Poisson()).fit(
        cov_type='cluster', cov_kwds=cov_kwds,
        use_t=False)  # True,
    # The model results, t_test, ... should also work without
    # normalized_cov_params, see #2209
    # Note: we cannot set this on the wrapper, we need res._results
    res._results.normalized_cov_params = None
    cls.res1 = res
    cls.bse_rob = res.bse

    nobs, k_vars = res.model.exog.shape
    k_params = len(res.params)
    # n_groups = len(np.unique(group))
    # bse comparison needs the square root of the correction factor
    cls.corr_fact = np.sqrt((nobs - 1.) / float(nobs - k_params))
#b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)]) b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(z)]) b.shape = y.shape m = GAM(b, d, family=f) toc = time.time() m.fit(b) tic = time.time() print tic-toc #for plotting yp = f.link.inverse(y) p = b if example == 3: print "Poisson" f = families.Poisson() #y = y/y.max() * 3 yp = f.link.inverse(z) #p = np.asarray([scipy.stats.poisson.rvs(p) for p in f.link.inverse(y)], float) p = np.asarray([scipy.stats.poisson.rvs(p) for p in f.link.inverse(z)], float) p.shape = y.shape m = GAM(p, d, family=f) toc = time.time() m.fit(p) tic = time.time() print tic-toc if example > 1: y_pred = m.results.mu# + m.results.alpha#m.results.predict(d) plt.figure() plt.subplot(2,2,1)
def setup_class(cls):
    """Cluster-robust Poisson GLM compared against Stata results."""
    cls.res2 = results_st.results_poisson_clu
    # FIX: removed a dead ``mod = smd.Poisson(endog, exog)`` assignment
    # that was immediately overwritten by the GLM construction below.
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit()
    cls.get_robust_clu()