Example #1
File: glm.py Project: Synapski/pymc
from collections import defaultdict

import numpy as np
import patsy
import theano
import theano.tensor

from statsmodels.formula.api import glm as glm_sm

# `Normal` and `modelcontext` come from the surrounding package (the early
# PyMC3 codebase, still named `pymc` in this project).
from pymc import Normal, modelcontext
def linear_component(formula, data, priors=None,
                     intercept_prior=None,
                     regressor_prior=None,
                     init=True, init_vals=None, family=None,
                     model=None):
    """Create linear model according to patsy specification.

    Parameters
    ----------
    formula : str
        Patsy linear model descriptor.
    data : array
        Labeled array (e.g. pandas DataFrame, recarray).
    priors : dict
        Mapping prior name to prior distribution.
        E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
    intercept_prior : pymc distribution
        Prior to use for the intercept.
        Default: Normal.dist(mu=0, tau=1.0E-12)
    regressor_prior : pymc distribution
        Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
    init : bool
        Whether to set the starting values via statsmodels
        Default: True
    init_vals : dict
        Set starting values externally: parameter -> value
        Default: None
    family : statsmodels.family
        Link function to pass to statsmodels (init has to be True).
        See `statsmodels.api.families`.
        Default: identity

    Output
    ------
    (y_est, coeffs) : Theano expression for the linear predictor, and a
        list of the coefficient random variables.

    Example
    -------
    # Logistic regression
    import statsmodels.api as sm
    y_est, coeffs = linear_component('male ~ height + weight',
                                     htwt_data,
                                     family=sm.families.Binomial())
    y_data = Bernoulli('y', y_est, observed=htwt_data.male)
    """
    if intercept_prior is None:
        intercept_prior = Normal.dist(mu=0, tau=1.0E-12)
    if regressor_prior is None:
        regressor_prior = Normal.dist(mu=0, tau=1.0E-12)

    if priors is None:
        priors = {}

    # Build patsy design matrix and get regressor names.
    _, dmatrix = patsy.dmatrices(formula, data)
    reg_names = dmatrix.design_info.column_names

    if init_vals is None:
        if init:
            # Take starting values from a statsmodels fit of the same formula.
            init_vals = glm_sm(formula, data, family=family).fit().params
        else:
            init_vals = defaultdict(lambda: None)

    # Create individual coefficients
    model = modelcontext(model)
    coeffs = []

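    # Patsy puts the intercept column first when the formula includes one;
    # pop it off so the loop below sees only plain regressors.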
    if reg_names[0] == 'Intercept':
        prior = priors.get('Intercept', intercept_prior)
        coeff = model.Var(reg_names.pop(0), prior)
        coeff.tag.test_value = init_vals['Intercept']
        coeffs.append(coeff)

    for reg_name in reg_names:
        prior = priors.get(reg_name, regressor_prior)
        coeff = model.Var(reg_name, prior)
        coeff.tag.test_value = init_vals[reg_name]
        coeffs.append(coeff)

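    # Linear predictor: (n x k) design matrix dotted with the k stacked
    # coefficients, reshaped to a row vector.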
    y_est = theano.dot(np.asarray(dmatrix),
                       theano.tensor.stack(*coeffs)).reshape((1, -1))

    return y_est, coeffs
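
A minimal usage sketch (not part of the original file): calling linear_component inside a model context, assuming a hypothetical DataFrame htwt_data with 'male', 'height', and 'weight' columns, and the early-PyMC3 package importable as `pymc`:

# Hypothetical usage sketch for the function above: logistic regression.
import pymc as pm
import statsmodels.api as sm
import theano.tensor as tt

with pm.Model() as model:
    # Build the linear predictor X*beta from the patsy formula; starting
    # values come from a statsmodels fit (init=True by default).
    y_est, coeffs = linear_component('male ~ height + weight',
                                     htwt_data,
                                     family=sm.families.Binomial())
    # Apply the logistic link by hand and observe the binary outcome.
    pm.Bernoulli('y', p=tt.nnet.sigmoid(y_est),
                 observed=htwt_data['male'].values)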
Example #2
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Fare           0.0008      0.000      2.206      0.028    8.73e-05       0.001
Pclass_1       0.2398      0.046      5.228      0.000       0.150       0.330
Pclass_2       0.1943      0.038      5.110      0.000       0.120       0.269
Embarked_C     0.5044      0.048     10.580      0.000       0.411       0.598
Embarked_S     0.4092      0.037     11.136      0.000       0.337       0.481
Sex_male      -0.4022      0.030    -13.398      0.000      -0.461      -0.343
==============================================================================
Omnibus:                       49.900   Durbin-Watson:                   1.872
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               53.293
Skew:                           0.570   Prob(JB):                     2.68e-12
Kurtosis:                       2.632   Cond. No.                         255.
==============================================================================

"""
import statsmodels.api as sm
from statsmodels.formula.api import glm as glm_sm

glm_model = glm_sm('Y ~ X', train, family=sm.families.Binomial()).fit()
glm_model.summary()
"""
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      Y   No. Observations:                  891
Model:                            GLM   Df Residuals:                      883
Model Family:                Binomial   Df Model:                            7
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -399.16
Date:                Fri, 15 Jan 2021   Deviance:                       798.33
Time:                        18:49:28   Pearson chi2:                     936.
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]