Example #1
0
    def __init__(self, X, dist='OLS', alphas=[0.05, 0.01], log=True):
        """Record the design object and pick the model/executor for ``dist``.

        Parameters
        ----------
        X : object
            Design container; only ``len(X.names)`` and ``len(X.array)`` are
            read here.
        dist : str
            Distribution code, matched case-insensitively by prefix:
            ``'OLS'``; anything starting with ``'G'``; anything not starting
            with ``'Z'`` (first two letters looked up in ``F_KEY``); or one
            of the ``'Z..'`` three-letter codes handled below.
        alphas : list of float
            Stored unchanged on ``self.alphas``.
        log : bool
            Not referenced anywhere in the visible body.

        Notes
        -----
        NOTE(review): ``alphas`` uses a mutable list as its default, so every
        instance created without an explicit value shares the same list.
        """

        # Stored in one tuple assignment: xLen = len(X.names),
        # permute = self.permute_REG (default), zero_prob = 0.0,
        # dfd = len(X.names) - 1, dfn = len(X.array) - len(X.names).
        # NOTE(review): dfd/dfn look swapped relative to the usual
        # numerator/denominator naming -- confirm against the callers.
        self.X, self.xLen, self.dist, self.permute, self.zero_prob, self.alphas, self.dfd, self.dfn = X, len(
            X.names), dist, self.permute_REG, 0.0, alphas, len(
                X.names) - 1, len(X.array) - len(X.names)

        #F_KEY = {'TW': sfams.Tweedie(link=slinks.log), 'PO': sfams.Poisson(link=slinks.log), 'NB': sfams.NegativeBinomial(link=slinks.log), 'GA': sfams.Gamma(link=slinks.log), 'NO': sfams.Gaussian(link=slinks.log)}
        # Two-letter code -> family instance, each built with its default link.
        F_KEY = {
            'TW': sfams.Tweedie(),
            'PO': sfams.Poisson(),
            'NB': sfams.NegativeBinomial(),
            'GA': sfams.Gamma(),
            'NO': sfams.Gaussian()
        }

        if self.dist.upper() == 'OLS':
            self.reg, self.execute = sm.OLS, self.execute_REG
        # Any code starting with 'G' lands here.
        # NOTE(review): this also captures 'GA', so the Gamma entry of F_KEY
        # can never be selected by the generic branch below -- confirm intent.
        elif self.dist.upper()[0] == 'G':
            self.reg, self.execute, self.family = scm.ZeroInflatedNegativeBinomialP, self.execute_GIN, F_KEY[
                'NB']

        # Generic GLM path: the first two letters must be a key of F_KEY,
        # otherwise the lookup raises KeyError. self.reg is not set here.
        elif self.dist.upper()[0] != 'Z':
            self.execute, self.permute, self.family = self.execute_GLM, self.permute_GLM, F_KEY[
                self.dist.upper()[0:2]]
        else:

            # Codes starting with 'Z' are matched on their first three letters;
            # a 'Z..' code not listed below leaves reg/execute/family unset.
            if self.dist.upper()[0:3] in ['ZIP', 'ZPO']:
                self.reg, self.execute, self.family = CUSTOM_ZPO, self.execute_ZIN, F_KEY[
                    'PO']
            elif self.dist.upper()[0:3] in ['ZIN', 'ZNB']:
                self.reg, self.execute, self.family = CUSTOM_ZNB, self.execute_ZIN, F_KEY[
                    'NB']
            # NOTE(review): the list repeats 'ZGP' twice, and F_KEY has no 'GP'
            # entry, so this branch raises KeyError as written -- confirm
            # which family was intended.
            elif self.dist.upper()[0:3] in ['ZGP', 'ZGP']:
                self.reg, self.execute, self.family = CUSTOM_ZGP, self.execute_ZIN, F_KEY[
                    'GP']
Example #2
0
def multiple_linear_regression():
    """Fit the carbohydrate data by OLS and as a Gaussian GLM (chapter 6.3, p. 98).

    The three-predictor model is fitted with OLS and again with GLM, then
    the model without ``age`` is fitted with OLS. Summaries and ANOVA tables
    are printed; nothing is returned.
    """
    full_formula = 'carbohydrate ~ age + weight + protein'
    reduced_formula = 'carbohydrate ~ weight + protein'

    # Load the data table
    data = get_data(r'GLM_data/Table 6.3 Carbohydrate diet.xls')

    # Original model, ordinary least squares
    ols_full = smf.ols(full_formula, data=data).fit()
    print(ols_full.summary())
    print(sm_stats.anova_lm(ols_full))

    # The same model expressed as a GLM with a Gaussian family
    gaussian_glm = smf.glm(full_formula,
                           family=sm_families.Gaussian(),
                           data=data).fit()
    print('Same model, calculated with GLM')
    # The confidence intervals differ from the OLS ones. Explanation
    # (from Nathaniel Smith): OLS uses a method that gives exact results but
    # only applies when the usual OLS criteria hold (iid Gaussian noise etc.).
    # GLM uses an approximate method that is correct asymptotically but may
    # be off for small samples; in return it works the same way for all GLM
    # models, including non-Gaussian errors and non-trivial link functions.
    print(gaussian_glm.summary())

    # Model 1: the same regression without ``age``
    ols_reduced = smf.ols(reduced_formula, data=data).fit()
    print(ols_reduced.summary())
    print(sm_stats.anova_lm(ols_reduced))
    def setup_class(cls):
        """Fit a Gaussian GLM (``res1``) and OLS (``res2``) with HC0 covariance.

        Both models use the ``endog``/``exog`` data defined outside this method.
        """
        robust_type = 'HC0'
        cls.cov_type = robust_type

        # Gaussian GLM first, then the OLS reference fit
        cls.res1 = GLM(endog, exog,
                       family=families.Gaussian()).fit(cov_type=robust_type)
        cls.res2 = OLS(endog, exog).fit(cov_type=robust_type)
    def setup_class(cls):
        """Fit a Gaussian GLM (``res1``) and OLS (``res2``) with cluster covariance.

        Clusters come from the ``group`` variable defined outside this method.
        """
        robust_type = 'cluster'
        cls.cov_type = robust_type

        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type=robust_type,
                                 cov_kwds={'groups': group})

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type=robust_type,
                                 cov_kwds={'groups': group})
Example #5
0
    def setup_class(cls):
        """Fit a CDF-link GLM (``res1``) and a Probit model (``res2``) on binarised data.

        The response is 1 where ``endog`` exceeds its mean and 0 elsewhere;
        both fits use cluster-robust covariance with ``group`` as clusters.
        """
        cls.cov_type = 'cluster'
        above_mean = (endog > endog.mean()).astype(int)

        # NOTE(review): a Gaussian family with a CDF link is compared against
        # Probit here -- confirm that this pairing is intended.
        glm_family = families.Gaussian(link=links.CDFLink())
        glm_model = GLM(above_mean, exog, family=glm_family)
        cls.res1 = glm_model.fit(cov_type='cluster',
                                 cov_kwds={'groups': group})

        probit_model = smd.Probit(above_mean, exog)
        cls.res2 = probit_model.fit(cov_type='cluster',
                                    cov_kwds={'groups': group})
    def setup_class(cls):
        """Fit a Gaussian GLM (``res1``) and OLS (``res2``) with HAC covariance, maxlags=2."""
        robust_type = 'HAC'
        cls.cov_type = robust_type

        # One options dict is shared by both fits
        hac_options = dict(maxlags=2)

        cls.res1 = GLM(endog, exog, family=families.Gaussian()).fit(
            cov_type=robust_type, cov_kwds=hac_options)
        cls.res2 = OLS(endog, exog).fit(
            cov_type=robust_type, cov_kwds=hac_options)
    def setup_class(cls):
        """Simulate a Gaussian response and build a full and a reduced GLM.

        ``model_drop`` uses five regressors (the first set to a constant 1);
        ``model_full`` adds two further random columns, which are also kept
        on ``exog_extra``. When ``cls.dispersed`` is true an additional
        standard-normal term is added to the response.
        """
        n_obs, n_base = 500, 5

        np.random.seed(786452)
        base = np.random.randn(n_obs, n_base)
        base[:, 0] = 1
        extra = np.random.randn(n_obs, 2)
        combined = np.column_stack((base, extra))

        # The draw order is kept as is (extra term first, then the noise) so
        # that the seeded stream yields the same numbers.
        het = np.random.randn(n_obs) if cls.dispersed else None
        y = np.random.randn(n_obs) + base.sum(1) * 0.5
        if het is not None:
            y = y + het

        cls.exog_extra = extra
        cls.model_full = GLM(y, combined, family=families.Gaussian())
        cls.model_drop = GLM(y, base, family=families.Gaussian())
Example #8
0
 def __init__(self,
              endog,
              exog,
              smoothers=None,
              family=families.Gaussian()):
     """Initialise by calling ``AdditiveModel.__init__`` and then ``GLM.__init__``.

     Parameters
     ----------
     endog
         Passed to ``GLM.__init__``.
     exog
         Passed to both initialisers.
     smoothers : optional
         Passed to ``AdditiveModel.__init__``.
     family : optional
         Passed to both initialisers.

     Notes
     -----
     NOTE(review): the default ``families.Gaussian()`` is evaluated once,
     when the function is defined, so all calls relying on the default
     share one family instance.
     """
     #self.family = family
     #TODO: inconsistent super __init__
     # The two initialisers are invoked explicitly rather than through
     # super(); the GLM call runs last.
     AdditiveModel.__init__(self, exog, smoothers=smoothers, family=family)
     GLM.__init__(self, endog, exog, family=family)
     # Sanity check only: assert statements are removed when Python runs
     # with -O.
     assert self.family is family  #make sure we got the right family
    def setup_class(cls):
        """Fit GLM with the kernel named ``'bartlett'`` and OLS with no kernel given.

        Both fits use HAC covariance with ``maxlags=2``; the GLM fit is
        stored on ``res1`` and the OLS fit on ``res2``.
        """
        robust_type = 'HAC'
        cls.cov_type = robust_type

        # Kernel passed by name for the GLM fit
        glm_options = dict(kernel='bartlett', maxlags=2)
        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type=robust_type, cov_kwds=glm_options)

        # The OLS fit leaves the kernel unspecified
        ols_model = OLS(endog, exog)
        ols_options = dict(maxlags=2)
        cls.res2 = ols_model.fit(cov_type=robust_type, cov_kwds=ols_options)
    def setup_class(cls):
        """Fit GLM with the uniform kernel as a callable and OLS with it as a string.

        Both fits use HAC covariance with ``maxlags=2``. The GLM fit
        (``res1``) receives ``sw.weights_uniform``; the OLS fit (``res2``)
        receives the kernel name ``'uniform'``, so the string form of the
        option is the one being compared.
        """

        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        # check kernel as string
        mod2 = OLS(endog, exog)
        kwds2 = {'kernel': 'uniform', 'maxlags': 2}
        # Pass kwds2 here: it was previously built but never used, so the
        # string-kernel path was not exercised at all.
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
    def setup_class(cls):
        """Fit GLM and OLS with the uniform HAC kernel, plus a default-kernel OLS fit.

        ``res1`` (GLM) and ``res2`` (OLS) share the same options;
        ``res3`` is an extra OLS fit with only ``maxlags`` given.
        """
        robust_type = 'HAC'
        cls.cov_type = robust_type

        uniform_options = dict(kernel=sw.weights_uniform, maxlags=2)

        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type=robust_type,
                                 cov_kwds=uniform_options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type=robust_type,
                                 cov_kwds=uniform_options)

        # Kept for debugging: same OLS model, kernel left unspecified
        cls.res3 = ols_model.fit(cov_type=robust_type,
                                 cov_kwds=dict(maxlags=2))
Example #12
0
    def setup_class(cls):
        """Fit GLM and OLS with ``'hac-groupsum'`` covariance on a made-up time index.

        ``res1`` is the GLM fit, ``res1b`` the same GLM fitted under the
        name ``'nw-groupsum'``, and ``res2`` the OLS fit.
        """
        robust_type = 'hac-groupsum'
        cls.cov_type = robust_type

        # The time index is invented purely to have a test case:
        # 0..6 repeated five times, with the last entry dropped.
        time_index = np.tile(np.arange(7), 5)[:-1]
        options = {
            'time': time_index,
            'maxlags': 2,
            'use_correction': 'hac',
            'df_correction': False,
        }

        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type=robust_type, cov_kwds=options)
        cls.res1b = glm_model.fit(cov_type='nw-groupsum', cov_kwds=options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type=robust_type, cov_kwds=options)
Example #13
0
    def setup_class(cls):
        """Fit GLM and OLS with ``'hac-panel'`` covariance using made-up group labels.

        The GLM (``res1``) is built on copies of ``endog``/``exog``; the OLS
        fit (``res2``) uses the originals. Both share the same options.
        """
        robust_type = 'hac-panel'
        cls.cov_type = robust_type

        # The labels are invented purely to have a test case:
        # each of 0..4 repeated seven times, with the last entry dropped.
        panel_ids = np.repeat(np.arange(5), 7)[:-1]
        glm_model = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        options = {
            'groups': panel_ids,
            'maxlags': 2,
            'kernel': sw.weights_uniform,
            'use_correction': 'hac',
            'df_correction': False,
        }
        cls.res1 = glm_model.fit(cov_type=robust_type, cov_kwds=options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type=robust_type, cov_kwds=options)
Example #14
0
    def setup_class(cls):
        """Fit GLM and OLS with ``'hac-panel'`` covariance using a made-up time index.

        ``res1`` is the GLM fit (on copies of ``endog``/``exog``), ``res1b``
        the same GLM fitted under the name ``'nw-panel'``, and ``res2`` the
        OLS fit on the original data.
        """
        import statsmodels.stats.sandwich_covariance as sw

        robust_type = 'hac-panel'
        cls.cov_type = robust_type

        # The time index is invented purely to have a test case:
        # 0..6 repeated five times, with the last entry dropped.
        time_index = np.tile(np.arange(7), 5)[:-1]
        glm_model = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        options = {
            'time': time_index,
            'maxlags': 2,
            'kernel': sw.weights_uniform,
            'use_correction': 'hac',
            'df_correction': False,
        }
        cls.res1 = glm_model.fit(cov_type=robust_type, cov_kwds=options)
        cls.res1b = glm_model.fit(cov_type='nw-panel', cov_kwds=options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type=robust_type, cov_kwds=options)
Example #15
0
    def _check_inputs(self, family, offset, exposure, endog):
        """Set ``self.family`` and validate ``exposure`` and ``offset`` against ``endog``.

        Parameters
        ----------
        family
            Family instance, or None to use ``families.Gaussian()``.
        offset
            Optional; its first dimension must match that of ``endog``.
        exposure
            Optional; only accepted when the family's link is a
            ``families.links.Log`` instance, and its first dimension must
            match that of ``endog``.
        endog
            Only ``endog.shape[0]`` is read, and only when ``exposure`` or
            ``offset`` is given.

        Raises
        ------
        ValueError
            If any of the checks above fails.
        """
        # Gaussian is the fallback family
        self.family = families.Gaussian() if family is None else family

        if exposure is not None:
            if not isinstance(self.family.link, families.links.Log):
                raise ValueError("exposure can only be used with the log "
                                 "link function")
            if exposure.shape[0] != endog.shape[0]:
                raise ValueError("exposure is not the same length as endog")

        if offset is not None and offset.shape[0] != endog.shape[0]:
            raise ValueError("offset is not the same length as endog")
Example #16
0
    def __init__(self, exog, smoothers=None, weights=None, family=None):
        """Store the design matrix, weights, per-column smoothers and family.

        Parameters
        ----------
        exog
            Design matrix; ``exog.shape[0]`` and ``exog.shape[1]`` are used.
        smoothers : optional
            Indexable collection of smoothers. When falsy, one
            ``default_smoother`` is created per column of ``exog``.
        weights : optional
            Defaults to an array of ones of length ``exog.shape[0]``.
        family : optional
            Defaults to ``families.Gaussian()``.
        """
        self.exog = exog
        self.weights = (np.ones(self.exog.shape[0])
                        if weights is None else weights)

        if not smoothers:
            smoothers = [default_smoother(exog[:, col])
                         for col in range(exog.shape[1])]
        self.smoothers = smoothers

        #TODO: why do we set here df, refactoring temporary?
        # Every smoother, supplied or default, has its df overwritten with 10.
        for col in range(exog.shape[1]):
            self.smoothers[col].df = 10

        self.family = families.Gaussian() if family is None else family
Example #17
0
    def __init__(self,
                 endog,
                 exog,
                 groups,
                 family=None,
                 cov_struct=None,
                 missing='none',
                 **kwargs):
        """Validate ``family`` and ``cov_struct``, initialise the base model, and index the groups.

        Parameters
        ----------
        endog, exog
            Passed to the parent initialiser.
        groups
            Group label per observation; converted with ``np.asarray``.
        family : optional
            Must be a ``families.Family`` instance; defaults to
            ``families.Gaussian()``.
        cov_struct : optional
            Must be a ``QIFCovariance`` instance; defaults to
            ``QIFIndependence()``.
        missing : str
            Passed to the parent initialiser.
        **kwargs
            Passed to the parent initialiser.

        Raises
        ------
        ValueError
            If ``family`` or ``cov_struct`` has the wrong type.
        """
        # Family: default to Gaussian, otherwise insist on a Family instance
        if family is None:
            family = families.Gaussian()
        elif not issubclass(family.__class__, families.Family):
            raise ValueError("QIF: `family` must be a genmod "
                             "family instance")
        self.family = family

        self._fit_history = defaultdict(list)

        # Covariance structure: default to independence
        if cov_struct is None:
            cov_struct = QIFIndependence()
        elif not isinstance(cov_struct, QIFCovariance):
            raise ValueError(
                "QIF: `cov_struct` must be a QIFCovariance instance")
        self.cov_struct = cov_struct

        groups = np.asarray(groups)

        super(QIF, self).__init__(endog,
                                  exog,
                                  groups=groups,
                                  missing=missing,
                                  **kwargs)

        self.group_names = list(set(groups))
        self.nobs = len(self.endog)

        # Collect the row positions belonging to each group label, then
        # order the position lists to match ``group_names``.
        rows_by_label = defaultdict(list)
        for position, label in enumerate(groups):
            rows_by_label[label].append(position)
        self.groups_ix = [rows_by_label[label] for label in self.group_names]

        self._check_args(groups)
Example #18
0
    def setup_class(cls):
        """Fit the same formula with GEE (``result1``) and GLM (``result2``).

        The data are 100 seeded standard-normal draws for each of ``Y``,
        ``X1``, ``X2`` and ``X3``, in 20 groups of 5 consecutive rows. The
        GEE uses a Gaussian family with an ``Independence`` structure.
        """
        indep = Independence()
        gaussian = families.Gaussian()
        formula = "Y ~ X1 + X2 + X3"

        np.random.seed(987126)
        # Columns are drawn in this order so the seeded stream is unchanged
        frame = pd.DataFrame({
            name: np.random.normal(size=100)
            for name in ("Y", "X1", "X2", "X3")
        })
        group_ids = np.kron(np.arange(20), np.ones(5))

        gee_model = GEE.from_formula(formula,
                                     group_ids,
                                     frame,
                                     family=gaussian,
                                     cov_struct=indep)
        cls.result1 = gee_model.fit()

        cls.result2 = GLM.from_formula(formula, data=frame).fit()
Example #19
0
def anova():
    """Run the one-factor and two-factor ANOVA examples (chapter 6.4, p. 108 and p. 113).

    The plant-experiment data are fitted as a Gaussian GLM and by OLS, with
    ``anova_lm`` applied to the OLS fit only. The two-factor data are then
    loaded and the interaction, additive, single-factor and mean-only OLS
    models are constructed. Nothing is returned.
    """
    one_factor = 'weight~group'

    # Load the plant-experiment table
    plants = get_data(r'GLM_data/Table 6.6 Plant experiment.xls')

    # Fit the model (p 109) as a GLM
    plant_glm = smf.glm(one_factor, family=sm_families.Gaussian(), data=plants)
    print(plant_glm.fit().summary())

    print('-' * 65)
    print('OLS')
    plant_ols = smf.ols(one_factor, data=plants)
    print(plant_ols.fit().summary())
    print(sm_stats.anova_lm(plant_ols.fit()))

    # Model for the null hypothesis of no treatment effect
    null_model = smf.ols('weight~1', data=plants)

    # Load the data for the two-factor ANOVA (p 113)
    two_factor = get_data(r'GLM_data/Table 6.9 Two-factor data.xls')

    # Replace the header names that come from the Excel file
    two_factor.columns = ['A', 'B', 'data']

    # Two-factor ANOVA with interactions (the table is computed, not printed)
    interaction_model = smf.ols('data~A*B', data=two_factor)
    sm_stats.anova_lm(interaction_model.fit())

    # The other four models are only constructed, not fitted
    additive_model = smf.ols('data~A+B', data=two_factor)
    factor_a_model = smf.ols('data~A', data=two_factor)
    factor_b_model = smf.ols('data~B', data=two_factor)
    mean_model = smf.ols('data~1', data=two_factor)
Example #20
0
import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
import pytest
from statsmodels.genmod.qif import (QIF, QIFIndependence, QIFExchangeable,
                                    QIFAutoregressive)
from statsmodels.tools.numdiff import approx_fprime
from statsmodels.genmod import families


# Runs once for every combination of the three families and the three
# covariance structures listed below.
@pytest.mark.parametrize(
    "fam", [families.Gaussian(),
            families.Poisson(),
            families.Binomial()])
@pytest.mark.parametrize(
    "cov_struct", [QIFIndependence(),
                   QIFExchangeable(),
                   QIFAutoregressive()])
def test_qif_numdiff(fam, cov_struct):
    """Simulate a response suited to ``fam`` from seeded random covariates.

    NOTE(review): the body visible here ends after the Poisson branch; no
    Binomial branch, model fit or assertion is visible, so this block
    appears to be truncated.
    """
    # Test the analytic scores against numeric derivatives

    np.random.seed(234234)
    n = 200
    q = 4
    x = np.random.normal(size=(n, 3))
    if isinstance(fam, families.Gaussian):
        # One normal draw per block of q consecutive rows, repeated q times,
        # mixed with an independent draw per row.
        e = np.kron(np.random.normal(size=n // q), np.ones(q))
        # NOTE(review): the weights are sqrt(0.5) and sqrt(1 - 0.5**2) =
        # sqrt(0.75), so their squares do not sum to one -- confirm intended.
        e = np.sqrt(0.5) * e + np.sqrt(1 - 0.5**2) * np.random.normal(size=n)
        y = x.sum(1) + e
    elif isinstance(fam, families.Poisson):
        # Counts with mean 5; they do not depend on x.
        y = np.random.poisson(5, size=n)
Example #21
0
    def __init__(self, endog, ndim, offset=None, family=None, penmat=None):
        """
        Set up a generalized principal component analysis.

        This analysis fits a generalized linear model (GLM) to a
        rectangular data array.  The linear predictor, which in a GLM
        would be derived from covariates, is instead represented as a
        factor-structured matrix.  If endog is n x p and we wish to
        extract d factors, then the linear predictor is represented as
        1*icept' + (s - 1*icept')*F*F', where 1 is a column vector of
        n 1's, s is a n x p matrix containing the 'saturated' linear
        predictor, and F is a p x d orthogonal matrix of loadings.

        Parameters
        ----------
        endog : array-like
            The data to which a reduced-rank structure is fit.  It is
            converted with ``np.asarray`` and the array is used for all
            computations here.
        ndim : integer
            The dimension of the low-rank structure.
        offset : array-like
            An optional offset with the same shape as ``endog``.  When
            given it is stored, as an array, on ``self.offset``.
        family : GLM family instance
            The GLM family to use in the analysis.  Defaults to Gaussian;
            only Poisson, Binomial and Gaussian are supported.
        penmat : sequence of length 2
            Optional.  Each entry is either a scalar, which is passed to
            ``self._gen_penmat`` together with the two dimensions of
            ``endog``, or an object that is stored as given.  The result
            is stored on ``self.penmat``.

        Raises
        ------
        ValueError
            If ``offset`` and ``endog`` differ in shape, if ``penmat``
            does not have length 2, or if ``family`` is not one of the
            supported families.

        Notes
        -----
        Estimation uses the Grassmann optimization approach of Edelman,
        rather than the approaches from Landgraf and Lee.

        The saturated parameter stored on ``self.satparam`` is
        ``log(endog)`` for Poisson (-3 where ``endog`` is 0), +3/-3 for
        Binomial (``endog == 1`` or not), and ``endog`` itself for
        Gaussian.

        References
        ----------
        A. Landgraf, Y.Lee (2019). Generalized Principal Component Analysis:
        Projection of saturated model parameters.  Technometrics.
        https://www.asc.ohio-state.edu/lee.2272/mss/tr890.pdf

        Edelman,Arias, Smith (1999).  The geometry of algorithms with orthogonality
        constraints.
        https://arxiv.org/abs/physics/9806030
        """

        if family is None:
            # Default family
            family = families.Gaussian()

        self.family = family
        # Convert once and use the array everywhere below, so that plain
        # sequences (lists, tuples) behave like arrays in the shape checks
        # and the element-wise comparisons.
        endog = np.asarray(endog)
        self.endog = endog
        self.ndim = ndim

        if offset is not None:
            # Convert before validating: a list has no ``shape`` attribute
            offset = np.asarray(offset)
            if offset.shape != endog.shape:
                msg = "endog and offset must have the same shape"
                raise ValueError(msg)
            self.offset = offset

        if penmat is not None:
            if len(penmat) != 2:
                msg = "penmat must be a tuple of length 2"
                raise ValueError(msg)
            pm = []
            for entry in penmat:
                if np.isscalar(entry):
                    n, p = endog.shape
                    pm.append(self._gen_penmat(entry, n, p))
                else:
                    pm.append(entry)
            self.penmat = pm

        # Calculate the saturated parameter
        if isinstance(family, families.Poisson):
            # np.where evaluates log(endog) for every entry, including the
            # zeros it then discards; silence the divide-by-zero warning
            # those entries would otherwise trigger.
            with np.errstate(divide="ignore"):
                satparam = np.where(endog != 0, np.log(endog), -3)
        elif isinstance(family, families.Binomial):
            satparam = np.where(endog == 1, 3, -3)
        elif isinstance(family, families.Gaussian):
            satparam = endog
        else:
            raise ValueError("Unknown family")
        self.satparam = satparam