def __init__(self, X, dist='OLS', alphas=[0.05, 0.01], log=True):
    self.X = X
    self.xLen = len(X.names)
    self.dist = dist
    self.permute = self.permute_REG
    self.zero_prob = 0.0
    self.alphas = alphas
    # NOTE: by the usual F-test convention dfn (numerator) = k - 1 and
    # dfd (denominator) = n - k; these two assignments look swapped.
    self.dfd = len(X.names) - 1
    self.dfn = len(X.array) - len(X.names)

    # F_KEY = {'TW': sfams.Tweedie(link=slinks.log),
    #          'PO': sfams.Poisson(link=slinks.log),
    #          'NB': sfams.NegativeBinomial(link=slinks.log),
    #          'GA': sfams.Gamma(link=slinks.log),
    #          'NO': sfams.Gaussian(link=slinks.log)}
    F_KEY = {'TW': sfams.Tweedie(),
             'PO': sfams.Poisson(),
             'NB': sfams.NegativeBinomial(),
             'GA': sfams.Gamma(),
             'NO': sfams.Gaussian()}

    if self.dist.upper() == 'OLS':
        self.reg, self.execute = sm.OLS, self.execute_REG
    elif self.dist.upper()[0] == 'G':
        # NOTE: any dist starting with 'G' (including 'GA') takes this
        # branch, so F_KEY['GA'] is unreachable below.
        self.reg = scm.ZeroInflatedNegativeBinomialP
        self.execute = self.execute_GIN
        self.family = F_KEY['NB']
    elif self.dist.upper()[0] != 'Z':
        self.execute = self.execute_GLM
        self.permute = self.permute_GLM
        self.family = F_KEY[self.dist.upper()[0:2]]
    else:
        if self.dist.upper()[0:3] in ['ZIP', 'ZPO']:
            self.reg, self.execute, self.family = CUSTOM_ZPO, self.execute_ZIN, F_KEY['PO']
        elif self.dist.upper()[0:3] in ['ZIN', 'ZNB']:
            self.reg, self.execute, self.family = CUSTOM_ZNB, self.execute_ZIN, F_KEY['NB']
        elif self.dist.upper()[0:3] == 'ZGP':  # original listed 'ZGP' twice
            # NOTE: F_KEY defines no 'GP' entry; a generalized-Poisson
            # family must be added to F_KEY for this branch to work.
            self.reg, self.execute, self.family = CUSTOM_ZGP, self.execute_ZIN, F_KEY['GP']
def multiple_linear_regression():
    '''Multiple linear regression chapter 6.3, p. 98'''

    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = smf.ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(sm_stats.anova_lm(model))

    # as GLM
    glm = smf.glm('carbohydrate ~ age + weight + protein',
                  family=sm_families.Gaussian(), data=df).fit()
    print('Same model, calculated with GLM')
    '''The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith):
    OLS uses a method that gives exact results, but only works in the
    special case where all the usual OLS criteria apply - iid Gaussian
    noise etc. GLM instead uses an approximate method which is correct
    asymptotically but may be off for small samples; the tradeoff you get
    in return is that this method works the same way for all GLM models,
    including those with non-Gaussian error terms and non-trivial link
    functions. So that's why they're different.
    '''
    print(glm.summary())

    # ... and for model 1
    model1 = smf.ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(sm_stats.anova_lm(model1))
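# A minimal, self-contained sketch (not from the original source) showing the
# OLS-vs-GLM confidence-interval difference described above on simulated data;
# the variable names here are illustrative only.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 20  # a small sample, where the asymptotic GLM intervals differ most
sim = pd.DataFrame({'x': rng.normal(size=n)})
sim['y'] = 1.0 + 2.0 * sim['x'] + rng.normal(size=n)

ols_fit = smf.ols('y ~ x', data=sim).fit()
glm_fit = smf.glm('y ~ x', family=sm.families.Gaussian(), data=sim).fit()

# Same point estimates ...
print(ols_fit.params)
print(glm_fit.params)
# ... but OLS uses exact t-based intervals while the Gaussian GLM uses
# asymptotic normal-based ones, so the GLM intervals are typically narrower.
print(ols_fit.conf_int())
print(glm_fit.conf_int())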
def setup_class(cls):
    cls.cov_type = 'HC0'

    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HC0')

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HC0')
def setup_class(cls):
    cls.cov_type = 'cluster'

    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    # a probit model is a binomial GLM with a normal-CDF link, so the
    # comparison against discrete Probit needs the Binomial family
    # (the original snippet used Gaussian here, which would not match)
    mod1 = GLM(endog_bin, exog, family=families.Binomial(link=links.CDFLink()))
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod2 = smd.Probit(endog_bin, exog)
    cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
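# A sketch (not from the original source) of the GLM/Probit equivalence the
# test above relies on, on simulated data; point estimates agree up to
# optimizer tolerance.
import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.families import links

np.random.seed(3)
n = 500
x = sm.add_constant(np.random.normal(size=(n, 2)))
y = (x @ [0.5, 1.0, -1.0] + np.random.normal(size=n) > 0).astype(float)

# CDFLink defaults to the standard normal CDF, i.e. the probit link
glm_fit = sm.GLM(y, x, family=sm.families.Binomial(link=links.CDFLink())).fit()
probit_fit = sm.Probit(y, x).fit(disp=False)
print(glm_fit.params - probit_fit.params)  # ~0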
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)
def setup_class(cls):
    nobs, k_vars = 500, 5
    np.random.seed(786452)
    x = np.random.randn(nobs, k_vars)
    x[:, 0] = 1
    x2 = np.random.randn(nobs, 2)
    xx = np.column_stack((x, x2))

    if cls.dispersed:
        het = np.random.randn(nobs)
        y = np.random.randn(nobs) + x.sum(1) * 0.5 + het
        # y_mc = np.random.negative_binomial(np.exp(x.sum(1) * 0.5), 2)
    else:
        y = np.random.randn(nobs) + x.sum(1) * 0.5

    cls.exog_extra = x2
    cls.model_full = GLM(y, xx, family=families.Gaussian())
    cls.model_drop = GLM(y, x, family=families.Gaussian())
def __init__(self, endog, exog, smoothers=None, family=families.Gaussian()):
    # self.family = family
    # TODO: inconsistent super __init__
    AdditiveModel.__init__(self, exog, smoothers=smoothers, family=family)
    GLM.__init__(self, endog, exog, family=family)
    assert self.family is family  # make sure we got the right family
def setup_class(cls):
    cls.cov_type = 'HAC'

    # check kernel specified as string
    kwds = {'kernel': 'bartlett', 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    kwds2 = {'maxlags': 2}
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    # check kernel as string
    mod2 = OLS(endog, exog)
    kwds2 = {'kernel': 'uniform', 'maxlags': 2}
    # the original passed kwds here, leaving kwds2 unused and the
    # string-kernel path untested
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)

    # for debugging
    cls.res3 = mod2.fit(cov_type='HAC', cov_kwds={'maxlags': 2})
def setup_class(cls):
    cls.cov_type = 'hac-groupsum'

    # time index is just made up to have a test case
    time = np.tile(np.arange(7), 5)[:-1]

    mod1 = GLM(endog, exog, family=families.Gaussian())
    kwds = dict(time=time, maxlags=2, use_correction='hac',
                df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-groupsum', cov_kwds=kwds)
    cls.res1b = mod1.fit(cov_type='nw-groupsum', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-groupsum', cov_kwds=kwds)
def setup_class(cls):
    cls.cov_type = 'hac-panel'

    # group index is just made up to have a test case
    groups = np.repeat(np.arange(5), 7)[:-1]

    mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
    kwds = dict(groups=groups, maxlags=2, kernel=sw.weights_uniform,
                use_correction='hac', df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
def setup_class(cls):
    import statsmodels.stats.sandwich_covariance as sw

    cls.cov_type = 'hac-panel'

    # time index is just made up to have a test case
    time = np.tile(np.arange(7), 5)[:-1]

    mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
    kwds = dict(time=time, maxlags=2, kernel=sw.weights_uniform,
                use_correction='hac', df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)
    cls.res1b = mod1.fit(cov_type='nw-panel', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
def _check_inputs(self, family, offset, exposure, endog):

    # Default family is Gaussian
    if family is None:
        family = families.Gaussian()
    self.family = family

    if exposure is not None:
        if not isinstance(self.family.link, families.links.Log):
            raise ValueError("exposure can only be used with the log "
                             "link function")
        elif exposure.shape[0] != endog.shape[0]:
            raise ValueError("exposure is not the same length as endog")

    if offset is not None:
        if offset.shape[0] != endog.shape[0]:
            raise ValueError("offset is not the same length as endog")
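# A sketch (not from the original source) of why `exposure` requires the log
# link: with a log link, multiplying the mean by an exposure is the same as
# adding log(exposure) to the linear predictor, so the two fits below agree.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
n = 100
x = sm.add_constant(rng.normal(size=n))
exposure = rng.uniform(1.0, 5.0, size=n)
y = rng.poisson(exposure * np.exp(0.5 * x[:, 1]))

# Poisson uses the log link by default
fit_exp = sm.GLM(y, x, family=sm.families.Poisson(), exposure=exposure).fit()
fit_off = sm.GLM(y, x, family=sm.families.Poisson(),
                 offset=np.log(exposure)).fit()
print(fit_exp.params - fit_off.params)  # ~0: identical up to numerical noise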
def __init__(self, exog, smoothers=None, weights=None, family=None):
    self.exog = exog
    if weights is not None:
        self.weights = weights
    else:
        self.weights = np.ones(self.exog.shape[0])

    self.smoothers = smoothers or [default_smoother(exog[:, i])
                                   for i in range(exog.shape[1])]

    # TODO: why do we set df here, refactoring temporary?
    for i in range(exog.shape[1]):
        self.smoothers[i].df = 10

    if family is None:
        self.family = families.Gaussian()
    else:
        self.family = family
def __init__(self, endog, exog, groups, family=None, cov_struct=None,
             missing='none', **kwargs):

    # Handle the family argument
    if family is None:
        family = families.Gaussian()
    else:
        if not issubclass(family.__class__, families.Family):
            raise ValueError("QIF: `family` must be a genmod "
                             "family instance")
    self.family = family

    self._fit_history = defaultdict(list)

    # Handle the cov_struct argument
    if cov_struct is None:
        cov_struct = QIFIndependence()
    else:
        if not isinstance(cov_struct, QIFCovariance):
            raise ValueError(
                "QIF: `cov_struct` must be a QIFCovariance instance")
    self.cov_struct = cov_struct

    groups = np.asarray(groups)

    super(QIF, self).__init__(endog, exog, groups=groups,
                              missing=missing, **kwargs)

    self.group_names = list(set(groups))
    self.nobs = len(self.endog)

    groups_ix = defaultdict(list)
    for i, g in enumerate(groups):
        groups_ix[g].append(i)
    self.groups_ix = [groups_ix[na] for na in self.group_names]

    self._check_args(groups)
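# A usage sketch (not from the original source) for the constructor above:
# fitting QIF on simulated clustered data with an exchangeable working
# structure. Assumes the public statsmodels QIF API.
import numpy as np
from statsmodels.genmod import families
from statsmodels.genmod.qif import QIF, QIFExchangeable

np.random.seed(0)
n, q = 200, 4
x = np.random.normal(size=(n, 3))
y = x.sum(1) + np.random.normal(size=n)
groups = np.kron(np.arange(n // q), np.ones(q))  # q observations per cluster

model = QIF(y, x, groups=groups, family=families.Gaussian(),
            cov_struct=QIFExchangeable())
result = model.fit()
print(result.summary())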
def setup_class(cls):
    vs = Independence()
    family = families.Gaussian()

    np.random.seed(987126)
    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(np.arange(20), np.ones(5))

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                          family=family, cov_struct=vs)
    cls.result1 = md.fit()

    cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
def anova():
    '''ANOVA chapter 6.4, p. 108, and p. 113
    GLM does not work with anova_lm.
    '''

    # get the data from the web
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109)
    glm = smf.glm('weight ~ group', family=sm_families.Gaussian(), data=df)
    print(glm.fit().summary())

    print('-' * 65)
    print('OLS')
    model = smf.ols('weight ~ group', data=df)
    print(model.fit().summary())
    print(sm_stats.anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = smf.ols('weight ~ 1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    ols_int = smf.ols('data ~ A*B', data=df)
    sm_stats.anova_lm(ols_int.fit())

    # The python commands for the other four models are
    ols_add = smf.ols('data ~ A+B', data=df)
    ols_A = smf.ols('data ~ A', data=df)
    ols_B = smf.ols('data ~ B', data=df)
    ols_mean = smf.ols('data ~ 1', data=df)
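# A minimal sketch (not from the original source) making explicit the
# nested-model F-test that anova_lm performs for the "no treatment effect"
# hypothesis above; it assumes `df` still holds the plant-experiment data.
from scipy import stats
import statsmodels.formula.api as smf
from statsmodels.stats.api import anova_lm

fit1 = smf.ols('weight ~ group', data=df).fit()   # one-factor model
fit0 = smf.ols('weight ~ 1', data=df).fit()       # null model

df_num = fit0.df_resid - fit1.df_resid            # number of restrictions
f_stat = ((fit0.ssr - fit1.ssr) / df_num) / (fit1.ssr / fit1.df_resid)
p_val = stats.f.sf(f_stat, df_num, fit1.df_resid)
print(f_stat, p_val)

# anova_lm gives the same test when handed the two nested fits directly
print(anova_lm(fit0, fit1))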
import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
import pytest

from statsmodels.genmod.qif import (QIF, QIFIndependence, QIFExchangeable,
                                    QIFAutoregressive)
from statsmodels.tools.numdiff import approx_fprime
from statsmodels.genmod import families


@pytest.mark.parametrize("fam", [families.Gaussian(), families.Poisson(),
                                 families.Binomial()])
@pytest.mark.parametrize("cov_struct", [QIFIndependence(), QIFExchangeable(),
                                        QIFAutoregressive()])
def test_qif_numdiff(fam, cov_struct):
    # Test the analytic scores against numeric derivatives

    np.random.seed(234234)
    n = 200
    q = 4
    x = np.random.normal(size=(n, 3))

    if isinstance(fam, families.Gaussian):
        e = np.kron(np.random.normal(size=n // q), np.ones(q))
        e = np.sqrt(0.5) * e + np.sqrt(1 - 0.5**2) * np.random.normal(size=n)
        y = x.sum(1) + e
    elif isinstance(fam, families.Poisson):
        y = np.random.poisson(5, size=n)
    else:
        # Binomial case: a binary outcome (branch added; the remainder of
        # the test body is truncated in the source)
        y = (np.random.uniform(size=n) < 0.5).astype(float)
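# The body of the test above is truncated; as a self-contained illustration
# (not from the original source) of the check it performs, here is the same
# analytic-vs-numeric gradient comparison on a simple quadratic objective.
import numpy as np
from numpy.testing import assert_allclose
from statsmodels.tools.numdiff import approx_fprime

A = np.array([[2.0, 0.5], [0.5, 1.0]])

def objective(params):
    return 0.5 * params @ A @ params

def analytic_grad(params):
    return A @ params

params = np.array([0.3, -1.2])
ngrad = approx_fprime(params, objective, centered=True)
assert_allclose(analytic_grad(params), ngrad, rtol=1e-6, atol=1e-6)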
def __init__(self, endog, ndim, offset=None, family=None, penmat=None):
    """
    Fit a generalized principal component analysis.

    This analysis fits a generalized linear model (GLM) to a rectangular
    data array. The linear predictor, which in a GLM would be derived
    from covariates, is instead represented as a factor-structured
    matrix. If endog is n x p and we wish to extract d factors, then
    the linear predictor is represented as

        1*icept' + (s - 1*icept')*F*F',

    where 1 is a column vector of n 1's, s is a n x p matrix containing
    the 'saturated' linear predictor, and F is a p x d orthogonal matrix
    of loadings.

    Parameters
    ----------
    endog : array-like
        The data to which a reduced-rank structure is fit.
    ndim : integer
        The dimension of the low-rank structure.
    family : GLM family instance
        The GLM family to use in the analysis.
    offset : array-like
        An optional offset vector.

    Returns
    -------
    A GPCAResults instance.

    Notes
    -----
    Estimation uses the Grassmann optimization approach of Edelman,
    rather than the approaches from Landgraf and Lee.

    References
    ----------
    A. Landgraf, Y. Lee (2019). Generalized Principal Component Analysis:
    Projection of saturated model parameters. Technometrics.
    https://www.asc.ohio-state.edu/lee.2272/mss/tr890.pdf

    Edelman, Arias, Smith (1999). The geometry of algorithms with
    orthogonality constraints. https://arxiv.org/abs/physics/9806030
    """
    if family is None:
        # Default family
        family = families.Gaussian()
    self.family = family

    self.endog = np.asarray(endog)
    self.ndim = ndim

    if offset is not None:
        if offset.shape != endog.shape:
            msg = "endog and offset must have the same shape"
            raise ValueError(msg)
        self.offset = np.asarray(offset)

    if penmat is not None:
        pm = []
        if len(penmat) != 2:
            msg = "penmat must be a tuple of length 2"
            raise ValueError(msg)
        for j in range(2):
            if np.isscalar(penmat[j]):
                n, p = endog.shape
                pm.append(self._gen_penmat(penmat[j], n, p))
            else:
                pm.append(penmat[j])
        self.penmat = pm

    # Calculate the saturated parameter
    if isinstance(family, families.Poisson):
        satparam = np.where(endog != 0, np.log(endog), -3)
    elif isinstance(family, families.Binomial):
        satparam = np.where(endog == 1, 3, -3)
    elif isinstance(family, families.Gaussian):
        satparam = endog
    else:
        raise ValueError("Unknown family")

    self.satparam = satparam
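# A sketch (not from the original source) of the factor-structured linear
# predictor described in the docstring, 1*icept' + (s - 1*icept')*F*F',
# written out with NumPy for illustrative n, p, d.
import numpy as np

n, p, d = 50, 6, 2
rng = np.random.default_rng(2)

s = rng.normal(size=(n, p))                    # 'saturated' linear predictor
icept = s.mean(axis=0)                         # p-vector of column intercepts
F, _ = np.linalg.qr(rng.normal(size=(p, d)))   # p x d orthogonal loadings

ones = np.ones((n, 1))
linpred = ones @ icept[None, :] + (s - ones @ icept[None, :]) @ F @ F.T

# F'F = I_d, and the centered part of linpred has rank at most d
assert np.allclose(F.T @ F, np.eye(d))
assert np.linalg.matrix_rank(linpred - ones @ icept[None, :]) <= d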