def linear_component(formula, data, priors=None, intercept_prior=None,
                     regressor_prior=None, init=True, init_vals=None,
                     family=None, model=None):
    """Create linear model according to patsy specification.

    Parameters
    ----------
    formula : str
        Patsy linear model descriptor.
    data : array
        Labeled array (e.g. pandas DataFrame, recarray).
    priors : dict
        Mapping prior name to prior distribution.
        E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
    intercept_prior : pymc distribution
        Prior to use for the intercept.
        Default: Normal.dist(mu=0, tau=1.0E-12)
    regressor_prior : pymc distribution
        Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
    init : bool
        Whether to set the starting values via statsmodels
        Default: True
    init_vals : dict
        Set starting values externally: parameter -> value
        Default: None
    family : statsmodels.family
        Link function to pass to statsmodels (init has to be True).
        See `statsmodels.api.families`
        Default: identity

    Output
    ------
    (y_est, coeffs) : Estimate for y, list of coefficients

    Example
    -------
    # Logistic regression
    y_est, coeffs = glm('male ~ height + weight',
                        htwt_data,
                        family=glm.families.Binomial(links=glm.link.Logit))
    y_data = Bernoulli('y', y_est, observed=data.male)
    """
    # Wide (essentially flat) default priors for intercept and regressors.
    if intercept_prior is None:
        intercept_prior = Normal.dist(mu=0, tau=1.0E-12)
    if regressor_prior is None:
        regressor_prior = Normal.dist(mu=0, tau=1.0E-12)

    if priors is None:
        # A plain dict suffices: lookups below always go through .get()
        # with an explicit default, so no default_factory is needed.
        priors = {}

    # Build patsy design matrix and get regressor names.
    _, dmatrix = patsy.dmatrices(formula, data)
    reg_names = dmatrix.design_info.column_names

    # BUG FIX: externally supplied init_vals used to be discarded — the old
    # `if init_vals is None and init: ... else:` overwrote a caller-provided
    # mapping with defaultdict(lambda: None). Only fall back when the caller
    # did not provide starting values.
    if init_vals is None:
        if init:
            # Use statsmodels MLE fit as starting values.
            init_vals = glm_sm(formula, data, family=family).fit().params
        else:
            # No starting values: every lookup yields None.
            init_vals = defaultdict(lambda: None)

    # Create individual coefficients
    model = modelcontext(model)
    coeffs = []

    # Patsy puts the intercept column first (when present); give it its
    # dedicated prior and remove it from the regressor list.
    if reg_names[0] == 'Intercept':
        prior = priors.get('Intercept', intercept_prior)
        coeff = model.Var(reg_names.pop(0), prior)
        coeff.tag.test_value = init_vals['Intercept']
        coeffs.append(coeff)

    for reg_name in reg_names:
        prior = priors.get(reg_name, regressor_prior)
        coeff = model.Var(reg_name, prior)
        coeff.tag.test_value = init_vals[reg_name]
        coeffs.append(coeff)

    # Linear predictor: X @ beta, reshaped to a row vector.
    y_est = theano.dot(np.asarray(dmatrix),
                       theano.tensor.stack(*coeffs)).reshape((1, -1))

    return y_est, coeffs
def linear_component(formula, data, priors=None, intercept_prior=None,
                     regressor_prior=None, init=True, init_vals=None,
                     family=None, model=None):
    """Create linear model according to patsy specification.

    Parameters
    ----------
    formula : str
        Patsy linear model descriptor.
    data : array
        Labeled array (e.g. pandas DataFrame, recarray).
    priors : dict
        Mapping prior name to prior distribution.
        E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
    intercept_prior : pymc3 distribution
        Prior to use for the intercept.
        Default: Normal.dist(mu=0, tau=1.0E-12)
    regressor_prior : pymc3 distribution
        Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
    init : bool
        Whether to set the starting values via statsmodels
        Default: True
    init_vals : dict
        Set starting values externally: parameter -> value
        Default: None
    family : statsmodels.family
        Link function to pass to statsmodels (init has to be True).
        See `statsmodels.api.families`
        Default: identity

    Output
    ------
    (y_est, coeffs) : Estimate for y, list of coefficients

    Example
    -------
    # Logistic regression
    y_est, coeffs = glm('male ~ height + weight',
                        htwt_data,
                        family=glm.families.Binomial(links=glm.link.Logit))
    y_data = Bernoulli('y', y_est, observed=data.male)
    """
    # Wide (essentially flat) default priors for intercept and regressors.
    if intercept_prior is None:
        intercept_prior = Normal.dist(mu=0, tau=1.0E-12)
    if regressor_prior is None:
        regressor_prior = Normal.dist(mu=0, tau=1.0E-12)

    if priors is None:
        # A plain dict suffices: lookups below always go through .get()
        # with an explicit default, so no default_factory is needed.
        priors = {}

    # Build patsy design matrix and get regressor names.
    _, dmatrix = patsy.dmatrices(formula, data)
    reg_names = dmatrix.design_info.column_names

    # BUG FIX: externally supplied init_vals used to be discarded — the old
    # `if init_vals is None and init: ... else:` overwrote a caller-provided
    # mapping with defaultdict(lambda: None). Only fall back when the caller
    # did not provide starting values.
    if init_vals is None:
        if init:
            # Use statsmodels MLE fit as starting values.
            init_vals = glm_sm(formula, data, family=family).fit().params
        else:
            # No starting values: every lookup yields None.
            init_vals = defaultdict(lambda: None)

    # Create individual coefficients
    model = modelcontext(model)
    coeffs = []

    # Patsy puts the intercept column first (when present); give it its
    # dedicated prior and remove it from the regressor list.
    if reg_names[0] == 'Intercept':
        prior = priors.get('Intercept', intercept_prior)
        coeff = model.Var(reg_names.pop(0), prior)
        coeff.tag.test_value = init_vals['Intercept']
        coeffs.append(coeff)

    for reg_name in reg_names:
        prior = priors.get(reg_name, regressor_prior)
        coeff = model.Var(reg_name, prior)
        coeff.tag.test_value = init_vals[reg_name]
        coeffs.append(coeff)

    # Linear predictor: X @ beta, reshaped to a row vector.
    y_est = theano.dot(np.asarray(dmatrix),
                       theano.tensor.stack(*coeffs)).reshape((1, -1))

    return y_est, coeffs
Fare 0.0008 0.000 2.206 0.028 8.73e-05 0.001 Pclass_1 0.2398 0.046 5.228 0.000 0.150 0.330 Pclass_2 0.1943 0.038 5.110 0.000 0.120 0.269 Embarked_C 0.5044 0.048 10.580 0.000 0.411 0.598 Embarked_S 0.4092 0.037 11.136 0.000 0.337 0.481 Sex_male -0.4022 0.030 -13.398 0.000 -0.461 -0.343 ============================================================================== Omnibus: 49.900 Durbin-Watson: 1.872 Prob(Omnibus): 0.000 Jarque-Bera (JB): 53.293 Skew: 0.570 Prob(JB): 2.68e-12 Kurtosis: 2.632 Cond. No. 255. ============================================================================== """ from statsmodels.formula.api import glm as glm_sm glm_model = glm_sm('Y ~ X', train, family=sm.families.Binomial()).fit() glm_model.summary() """ Generalized Linear Model Regression Results ============================================================================== Dep. Variable: Y No. Observations: 891 Model: GLM Df Residuals: 883 Model Family: Binomial Df Model: 7 Link Function: logit Scale: 1.0000 Method: IRLS Log-Likelihood: -399.16 Date: Fri, 15 Jan 2021 Deviance: 798.33 Time: 18:49:28 Pearson chi2: 936. No. Iterations: 5 Covariance Type: nonrobust ============================================================================== coef std err z P>|z| [0.025 0.975]