def test_poisson_epil(self):
    """Check that an independence Poisson GEE reproduces a Poisson GLM.

    Reads ``results/epil.csv`` from the directory containing this file, fits
    both models with the formula ``y ~ age + trt + base`` and asserts that
    the coefficients and the scale estimates agree to six decimals.
    """
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(cur_dir, "results", "epil.csv")
    data = pd.read_csv(fname)

    fam = Poisson()
    ind = Independence()
    # Pass ``groups`` and ``data`` by keyword. With the
    # ``from_formula(formula, groups, data, ...)`` signature, a positional
    # ``data`` combined with ``groups=...`` binds ``groups`` twice and raises
    # TypeError; keywords work regardless of the positional order.
    md1 = GEE.from_formula("y ~ age + trt + base",
                           groups=data["subject"], data=data,
                           cov_struct=ind, family=fam)
    mdf1 = md1.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    md2 = GLM.from_formula("y ~ age + trt + base", data,
                           family=families.Poisson())
    # scale="X2" is requested so the GLM scale is comparable to the GEE scale.
    mdf2 = md2.fit(scale="X2")

    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
def test_poisson_epil(self):
    """Poisson GEE with an independence structure should match Poisson GLM.

    Both models are fit to ``results/epil.csv`` (located next to this file)
    and their parameters and scale are compared to six decimals.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    epil = pd.read_csv(os.path.join(here, "results", "epil.csv"))

    gee_family = Poisson()
    working_corr = Independence()
    gee_model = GEE.from_formula("y ~ age + trt + base", epil["subject"],
                                 epil, cov_struct=working_corr,
                                 family=gee_family)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula("y ~ age + trt + base", epil,
                                 family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # don't use wrapper, asserts_xxx don't work
    gee_raw = gee_fit._results
    glm_raw = glm_fit._results

    assert_almost_equal(gee_raw.params, glm_raw.params, decimal=6)
    assert_almost_equal(gee_raw.scale, glm_raw.scale, decimal=6)
def test_poisson_epil(self):
    """Poisson GEE with an independence structure should match Poisson GLM.

    Both models are fit to ``results/epil.csv`` (located next to this file)
    and their parameters and scale are compared to six decimals.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    epil = pd.read_csv(os.path.join(here, "results", "epil.csv"))

    gee_family = Poisson()
    working_corr = Independence()
    gee_model = GEE.from_formula("y ~ age + trt + base", epil["subject"],
                                 epil, cov_struct=working_corr,
                                 family=gee_family)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula("y ~ age + trt + base", epil,
                                 family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # don't use wrapper, asserts_xxx don't work
    gee_raw = gee_fit._results
    glm_raw = glm_fit._results

    assert_almost_equal(gee_raw.params, glm_raw.params, decimal=6)
    assert_almost_equal(gee_raw.scale, glm_raw.scale, decimal=6)
def setup_class(cls):
    """Fit GLM and OLS to the Duncan data and store their influence objects.

    ``cls.infl0`` comes from the GLM fit and ``cls.infl1`` from the OLS fit;
    both use the same no-intercept formula on the same frame.
    """
    from .test_diagnostic import get_duncan_data

    y, x, obs_labels = get_duncan_data()
    column_names = ['y', 'const', 'var1', 'var2']
    frame = pd.DataFrame(np.column_stack((y, x)),
                         columns=column_names,
                         index=obs_labels)

    # "- 1" drops patsy's implicit intercept; the frame carries its own
    # ``const`` column.
    formula = 'y ~ const + var1 + var2 - 1'
    glm_fit = GLM.from_formula(formula, frame).fit()
    ols_fit = OLS.from_formula(formula, frame).fit()

    cls.infl1 = ols_fit.get_influence()
    cls.infl0 = glm_fit.get_influence()
def setup_class(cls):
    """Fit GLM and OLS to the Duncan data and store their influence objects.

    ``cls.infl0`` comes from the GLM fit and ``cls.infl1`` from the OLS fit;
    both use the same no-intercept formula on the same frame.
    """
    from .test_diagnostic import get_duncan_data

    y, x, obs_labels = get_duncan_data()
    column_names = ['y', 'const', 'var1', 'var2']
    frame = pd.DataFrame(np.column_stack((y, x)),
                         columns=column_names,
                         index=obs_labels)

    # "- 1" drops patsy's implicit intercept; the frame carries its own
    # ``const`` column.
    formula = 'y ~ const + var1 + var2 - 1'
    glm_fit = GLM.from_formula(formula, frame).fit()
    ols_fit = OLS.from_formula(formula, frame).fit()

    cls.infl1 = ols_fit.get_influence()
    cls.infl0 = glm_fit.get_influence()
def setup_class(cls):
    """Build a seeded random data set and fit a two-regressor GLM on it.

    ``cls.data`` holds ``y`` plus three regressors (rounded to 4 decimals,
    labelled rows); ``cls.res`` is the GLM fit of ``y ~ var1 + var2``.
    """
    nobs = 30
    np.random.seed(987128)
    regressors = np.random.randn(nobs, 3)
    response = regressors.sum(1) + np.random.randn(nobs)
    row_labels = ['obs%02d' % i for i in range(nobs)]

    # add one extra column to check that it doesn't matter
    values = np.round(np.column_stack((response, regressors)), 4)
    cls.data = pd.DataFrame(values,
                            columns=['y', 'var1', 'var2', 'var3'],
                            index=row_labels)
    cls.res = GLM.from_formula('y ~ var1 + var2', data=cls.data).fit()
def setup_class(cls):
    """Build a seeded random data set and fit a two-regressor GLM on it.

    ``cls.data`` holds ``y`` plus three regressors (rounded to 4 decimals,
    labelled rows); ``cls.res`` is the GLM fit of ``y ~ var1 + var2``.
    """
    nobs = 30
    np.random.seed(987128)
    regressors = np.random.randn(nobs, 3)
    response = regressors.sum(1) + np.random.randn(nobs)
    row_labels = ['obs%02d' % i for i in range(nobs)]

    # add one extra column to check that it doesn't matter
    values = np.round(np.column_stack((response, regressors)), 4)
    cls.data = pd.DataFrame(values,
                            columns=['y', 'var1', 'var2', 'var3'],
                            index=row_labels)
    cls.res = GLM.from_formula('y ~ var1 + var2', data=cls.data).fit()
def _train(elements, model_cfg): """Construct one model per building block type""" models = {} target = model_cfg.target for model in model_cfg.sections: # Construct model formula from configuration terms = " + ".join(["1"] + [f"C({f})" for f in model.factors]) # Train model models[model.label] = GLM.from_formula( f"{target} ~ {terms}", family=Binomial(), data=filter_data(elements, {model_cfg.label_column: model.label}), ).fit(scale="X2") return models
def setup_class(cls):
    """Fit a constrained Poisson GLM (no offset) through two entry points.

    ``cls.res1`` is produced by the ``fit_constrained`` function and
    ``cls.res1m`` by the model's ``fit_constrained`` method; ``cls.res2``
    holds the stored reference results.
    """
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_noexposure_constraint
    cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

    # example without offset
    poisson_model = GLM.from_formula(
        'deaths ~ logpyears + smokes + C(agecat)',
        data=data,
        family=families.Poisson())

    equal_age_effects = 'C(agecat)[T.4] = C(agecat)[T.5]'
    design = patsy.DesignInfo(poisson_model.exog_names)
    constraint = design.linear_constraint(equal_age_effects)

    cls.res1 = fit_constrained(poisson_model, constraint.coefs,
                               constraint.constants)
    cls.constraints = constraint
    cls.res1m = poisson_model.fit_constrained(equal_age_effects)
def setup_class(cls):
    """Fit a constrained Poisson GLM (no offset) through two entry points.

    ``cls.res1`` is produced by the ``fit_constrained`` function and
    ``cls.res1m`` by the model's ``fit_constrained`` method; ``cls.res2``
    holds the stored reference results.
    """
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_noexposure_constraint
    cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

    # example without offset
    poisson_model = GLM.from_formula(
        'deaths ~ logpyears + smokes + C(agecat)',
        data=data,
        family=families.Poisson())

    equal_age_effects = 'C(agecat)[T.4] = C(agecat)[T.5]'
    design = patsy.DesignInfo(poisson_model.exog_names)
    constraint = design.linear_constraint(equal_age_effects)

    cls.res1 = fit_constrained(poisson_model, constraint.coefs,
                               constraint.constants)
    cls.constraints = constraint
    cls.res1m = poisson_model.fit_constrained(equal_age_effects)
def setup_class(cls):
    """Fit an independence Poisson GEE and a Poisson GLM on the same data.

    The data are seeded random draws; ``cls.result1`` is the GEE fit and
    ``cls.result2`` the GLM fit.
    """
    working_corr = Independence()
    poisson = families.Poisson()

    # Draw order (Y, X1, X2, X3, groups) is kept so the seeded values match.
    np.random.seed(987126)
    columns = {"Y": np.exp(1 + np.random.normal(size=100))}
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=100)
    cluster_ids = np.random.randint(0, 4, size=100)
    frame = pd.DataFrame(columns)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=poisson, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=poisson)
    cls.result2 = glm_model.fit(disp=False)
def setup_class(cls):
    """Fit an independence Gaussian GEE and a GLM on the same random data.

    ``cls.result1`` is the GEE fit; ``cls.result2`` is a GLM fit created
    without an explicit family argument.
    """
    working_corr = Independence()
    gaussian = families.Gaussian()

    # Draw order (Y, X1, X2, X3) is kept so the seeded values match.
    np.random.seed(987126)
    draws = {name: np.random.normal(size=100)
             for name in ("Y", "X1", "X2", "X3")}
    # 20 consecutive clusters of 5 observations each.
    cluster_ids = np.kron(np.arange(20), np.ones(5))
    frame = pd.DataFrame(draws)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=gaussian, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame).fit()
def setup_class(cls):
    """Fit a constrained Poisson GLM with a log(pyears) offset.

    ``cls.res1`` is produced by the ``fit_constrained`` function and
    ``cls.res1m`` by the model's ``fit_constrained`` method (unwrapped via
    ``_results``); ``cls.res2`` holds the stored reference results.
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_exposure_constraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    # example with offset
    log_exposure = np.log(data['pyears'].values)
    poisson_model = GLM.from_formula('deaths ~ smokes + C(agecat)',
                                     data=data,
                                     family=families.Poisson(),
                                     offset=log_exposure)

    equal_age_effects = 'C(agecat)[T.4] = C(agecat)[T.5]'
    design = patsy.DesignInfo(poisson_model.exog_names)
    constraint = design.linear_constraint(equal_age_effects)

    cls.res1 = fit_constrained(poisson_model, constraint.coefs,
                               constraint.constants)
    cls.constraints = constraint
    cls.res1m = poisson_model.fit_constrained(equal_age_effects)._results
def test_wtd_patsy_missing():
    """Frequency weights must be trimmed along with rows dropped for NaNs.

    NaNs are injected into row 0 of exog and rows 2, 4, 6, 8 of endog of
    the cpunish data; the formula-built GLM should then keep 12 rows and
    only the weights belonging to those rows.
    """
    from statsmodels.datasets.cpunish import load
    import pandas as pd

    dataset = load()
    dataset.exog[0, 0] = np.nan
    dataset.endog[[2, 4, 6, 8]] = np.nan
    dataset.pandas = pd.DataFrame(dataset.exog, columns=dataset.exog_name)
    dataset.pandas['EXECUTIONS'] = dataset.endog

    # Weights 1..n, so each surviving weight identifies its original row.
    row_weights = np.arange(1, len(dataset.endog) + 1)
    formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 + SOUTH + DEGREE"""
    model = GLM.from_formula(formula, data=dataset.pandas,
                             freq_weights=row_weights)

    assert_equal(model.freq_weights.shape[0], model.endog.shape[0])
    assert_equal(model.freq_weights.shape[0], model.exog.shape[0])
    assert_equal(model.freq_weights.shape[0], 12)

    expected_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17])
    assert_equal(model.freq_weights, expected_weights)
def compute_chi2_null_test(model_results, data, dep_var, max_iter, l2_weight):
    """
    Compare a fitted model against an intercept-only model.

    An intercept-only binomial (logit) GLM for ``dep_var`` is fit on ``data``
    with the same elastic-net settings (``alpha=l2_weight``, ``L1_wt=0.0``).
    The statistic is ``-2 * (loglike_null - loglike_model)`` and the p-value
    is the chi-square survival function at that statistic with the model's
    ``df_model`` degrees of freedom.

    :returns: tuple ``(llr, model_df, p_val)``
    """
    intercept_only = GLM.from_formula('%s ~ 1' % (dep_var), data,
                                      family=Binomial(link=logit()))
    null_fit = intercept_only.fit_regularized(maxiter=max_iter,
                                              method='elastic_net',
                                              alpha=l2_weight,
                                              L1_wt=0.0)

    # Log-likelihood of each model evaluated at its own fitted parameters.
    full_loglike = model_results.model.loglike(model_results.params)
    null_loglike = null_fit.model.loglike(null_fit.params)

    llr = -2 * (null_loglike - full_loglike)
    model_df = model_results.model.df_model
    return llr, model_df, chi2.sf(llr, model_df)
def setup_class(cls):
    """Fit an independence Gamma (log link) GEE and the matching GLM.

    The response is a seeded log-normal draw; ``cls.result1`` is the GEE
    fit and ``cls.result2`` the GLM fit.
    """
    # adjusted for Gamma, not in test_gee.py
    working_corr = Independence()
    gamma_log = families.Gamma(link=links.log)

    # Draw order (Y, X1, X2, X3, groups) is kept so the seeded values match.
    np.random.seed(987126)
    columns = {"Y": np.exp(0.1 + np.random.normal(size=100))}  # log-normal
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=100)
    cluster_ids = np.random.randint(0, 4, size=100)
    frame = pd.DataFrame(columns)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=gamma_log, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=gamma_log)
    cls.result2 = glm_model.fit(disp=False)
def setup_class(cls):
    """Fit a constrained Poisson GLM with a log(pyears) offset.

    ``cls.res1`` is produced by the ``fit_constrained`` function and
    ``cls.res1m`` by the model's ``fit_constrained`` method (unwrapped via
    ``_results``); ``cls.res2`` holds the stored reference results.
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_exposure_constraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    # example with offset
    log_exposure = np.log(data['pyears'].values)
    poisson_model = GLM.from_formula('deaths ~ smokes + C(agecat)',
                                     data=data,
                                     family=families.Poisson(),
                                     offset=log_exposure)

    equal_age_effects = 'C(agecat)[T.4] = C(agecat)[T.5]'
    design = patsy.DesignInfo(poisson_model.exog_names)
    constraint = design.linear_constraint(equal_age_effects)

    cls.res1 = fit_constrained(poisson_model, constraint.coefs,
                               constraint.constants)
    cls.constraints = constraint
    cls.res1m = poisson_model.fit_constrained(equal_age_effects)._results
def setup_class(cls):
    """Fit an independence Poisson GEE and a Poisson GLM on the same data.

    The data are seeded random draws; ``cls.result1`` is the GEE fit and
    ``cls.result2`` the GLM fit.
    """
    working_corr = Independence()
    poisson = families.Poisson()

    # Draw order (Y, X1, X2, X3, groups) is kept so the seeded values match.
    np.random.seed(987126)
    columns = {"Y": np.exp(1 + np.random.normal(size=100))}
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=100)
    cluster_ids = np.random.randint(0, 4, size=100)
    frame = pd.DataFrame(columns)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=poisson, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=poisson)
    cls.result2 = glm_model.fit(disp=False)
def setup_class(cls):
    """Fit an independence Gaussian GEE and a GLM on the same random data.

    ``cls.result1`` is the GEE fit; ``cls.result2`` is a GLM fit created
    without an explicit family argument.
    """
    working_corr = Independence()
    gaussian = families.Gaussian()

    # Draw order (Y, X1, X2, X3) is kept so the seeded values match.
    np.random.seed(987126)
    draws = {name: np.random.normal(size=100)
             for name in ("Y", "X1", "X2", "X3")}
    # 20 consecutive clusters of 5 observations each.
    cluster_ids = np.kron(np.arange(20), np.ones(5))
    frame = pd.DataFrame(draws)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=gaussian, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame).fit()
def setup_class(cls):
    """Fit an independence Gamma (log link) GEE and the matching GLM.

    The response is a seeded log-normal draw; ``cls.result1`` is the GEE
    fit and ``cls.result2`` the GLM fit.
    """
    # adjusted for Gamma, not in test_gee.py
    working_corr = Independence()
    gamma_log = families.Gamma(link=links.log)

    # Draw order (Y, X1, X2, X3, groups) is kept so the seeded values match.
    np.random.seed(987126)
    columns = {"Y": np.exp(0.1 + np.random.normal(size=100))}  # log-normal
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=100)
    cluster_ids = np.random.randint(0, 4, size=100)
    frame = pd.DataFrame(columns)

    gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", cluster_ids, frame,
                                 family=gamma_log, cov_struct=working_corr)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula("Y ~ X1 + X2 + X3", data=frame,
                                 family=gamma_log)
    cls.result2 = glm_model.fit(disp=False)
def test_poisson_epil(self):
    """Check that an independence Poisson GEE reproduces a Poisson GLM.

    Reads ``results/epil.csv`` from the directory containing this file, fits
    both models with the formula ``y ~ age + trt + base`` and asserts that
    the coefficients and the scale estimates agree to six decimals.
    """
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(cur_dir, "results", "epil.csv")
    data = pd.read_csv(fname)

    fam = Poisson()
    ind = Independence()
    # Pass ``groups`` and ``data`` by keyword. With the
    # ``from_formula(formula, groups, data, ...)`` signature, a positional
    # ``data`` combined with ``groups=...`` binds ``groups`` twice and raises
    # TypeError; keywords work regardless of the positional order.
    md1 = GEE.from_formula("y ~ age + trt + base",
                           groups=data["subject"], data=data,
                           cov_struct=ind, family=fam)
    mdf1 = md1.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    md2 = GLM.from_formula("y ~ age + trt + base", data,
                           family=families.Poisson())
    # scale="X2" is requested so the GLM scale is comparable to the GEE scale.
    mdf2 = md2.fit(scale="X2")

    assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
    assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
def update_peaks_fit_regression(data, NE_date_ranges, NE_var, data_name_var,
                                round_date_var, peak_date_var,
                                peak_date_buffer, scalar_vars, formula,
                                max_iter, l2_weight, regression_type):
    """
    Draw a random peak date per entry, rebuild peak features and refit.

    One date is sampled from each element of ``NE_date_ranges``, merged onto
    ``data`` by ``[NE_var, data_name_var]``, and used to recompute the
    ``pre_peak`` / ``during_peak`` / ``post_peak`` indicators and
    ``since_peak``. Every column in ``scalar_vars`` is then standardized and
    a binomial (logit) GLM is fit with ``formula``.

    :returns: tuple ``(params, err, sampled peak dates)``; ``err`` is the
        ``SE`` column of ``compute_err_data`` for the regularized fit and
        ``bse`` otherwise.
    """
    def _draw_one(candidates):
        # Pick a single candidate date at random.
        return np.random.choice(candidates, 1)[0]

    sampled_peaks = NE_date_ranges.apply(_draw_one)
    sampled_peaks_df = sampled_peaks.reset_index().rename(
        columns={0: peak_date_var})
    frame = pd.merge(data, sampled_peaks_df,
                     on=[NE_var, data_name_var], how='inner')

    # reassign peaks
    round_dates = frame.loc[:, round_date_var]
    peak_dates = frame.loc[:, peak_date_var]
    window_start = peak_dates - peak_date_buffer
    window_end = peak_dates + peak_date_buffer
    inside_window = (round_dates > window_start) & (round_dates < window_end)
    frame = frame.assign(**{
        'pre_peak': (round_dates <= window_start).astype(int),
        'post_peak': (round_dates >= window_end).astype(int),
        'during_peak': inside_window.astype(int),
    })

    # add days since post-peak (zero for rows that are not post-peak)
    elapsed = frame.loc[:, round_date_var] - frame.loc[:, peak_date_var]
    frame = frame.assign(
        **{'since_peak': frame.loc[:, 'post_peak'] * elapsed})

    # Z-norm all scalar vars
    scaler = StandardScaler()
    for column in scalar_vars:
        column_2d = frame.loc[:, column].values.reshape(-1, 1)
        frame = frame.assign(**{column: scaler.fit_transform(column_2d)})

    model = GLM.from_formula(formula, frame, family=Binomial(link=logit()))
    logging.debug(
        '%d/%d/%d pre/during/post data' %
        (frame.loc[:, 'pre_peak'].sum(), frame.loc[:, 'during_peak'].sum(),
         frame.loc[:, 'post_peak'].sum()))

    if regression_type == 'regularized_logit':
        fit = model.fit_regularized(maxiter=max_iter,
                                    method='elastic_net',
                                    alpha=l2_weight,
                                    L1_wt=0.0)
        err = compute_err_data(fit).loc[:, 'SE']
    else:
        fit = model.fit()
        err = fit.bse

    return fit.params, err, sampled_peaks
def run_regression(data,
                   formula,
                   regression_type,
                   dep_var='anchor',
                   out_dir='../../output',
                   split_var=None,
                   split_var_val=0):
    """
    Run logit regression on data with given formula and write to file.
    Option: use regularized logit (reduce variable inflation).

    The coefficient table is written to ``out_dir`` as TSV and as LaTeX.
    A chi-square test against the null model, a VIF per non-categorical
    parameter and the k-fold accuracy are logged at debug level. Nothing
    is returned.

    :param data: full data
    :param formula: regression formula
    :param regression_type: type of regression (logit|regularized_logit)
    :param dep_var: dependent variable
    :param out_dir: output directory
    :param split_var: optional variable to split data (e.g. only organization accounts)
    :param split_var_val: value of split value variable (if included)
    """
    l2_weight = 0.01
    max_iter = 100
    model_full = GLM.from_formula(formula,
                                  data,
                                  family=Binomial(link=logit()))
    if regression_type == 'regularized_logit':
        model_res_full = model_full.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    else:
        model_res_full = model_full.fit()

    ## summary stats
    model_res_full_err = compute_err_data(model_res_full)

    # write to file
    reg_out_str = 'anchor_%s_output_%s.tsv' % (regression_type,
                                               formula.replace(' ', ''))
    if split_var is not None:
        reg_out_str = 'anchor_%s_output_%s_split_%s=%s.tsv' % (
            regression_type, formula.replace(' ', ''), split_var,
            split_var_val)
    res_out_file = os.path.join(out_dir, reg_out_str)
    model_res_full_err.to_csv(res_out_file, sep='\t', index=True)

    ## save coeffs to file => pretty print as latex
    # need lots of decimal points! for multiple variable correction
    # NOTE: this assigns the pandas display option and does not restore it.
    pd.options.display.float_format = '{:,.5f}'.format
    tex_out_str = reg_out_str.replace('.tsv', '.tex')
    tex_res_out_file = os.path.join(out_dir, tex_out_str)
    model_res_full_err = model_res_full_err.assign(
        **{'coeff': model_res_full_err.index})
    tex_data_cols = ['coeff', 'mean', 'SE', 'p_val']
    model_res_full_err.to_latex(tex_res_out_file,
                                columns=tex_data_cols,
                                index=False)

    ## compute regression fit parameters => deviance, AIC, etc.
    # start with chi2 test against null model
    llr, model_df, p_val = compute_chi2_null_test(model_res_full, data,
                                                  dep_var, max_iter,
                                                  l2_weight)
    logging.debug('N=%d, LLR=%.5f, df=%d, p-val=%.3E' %
                  (data.shape[0], llr, model_df, p_val))

    # variance inflation factor: are some of the covariates highly collinear?
    # for sanity we only look at non-categorical vars
    # format="C(var_name)[T.var_val]" ("C(username)[T.barackobama]")
    # Raw string: the pattern is unchanged, but "\(" "\[" "\." are invalid
    # escapes in a normal string literal and warn on recent Python versions.
    cat_var_matcher = re.compile(r'C\(.+\)\[T\..+\]|Intercept')
    non_cat_params = [
        param for param in model_res_full.params.index
        if cat_var_matcher.search(param) is None
    ]
    for param in non_cat_params:
        VIF_i = compute_VIF(model_res_full, param)
        logging.debug('VIF test: param=%s, VIF=%.3f' % (param, VIF_i))

    ## compute accuracy on k-fold classification
    ## we would use R-squared but that doesn't work for logistic regression
    # first get data into usable format
    n_splits = 10
    accs = k_fold_acc(model_full.exog, model_full.endog, k=n_splits)
    mean_acc = np.mean(accs)
    se_acc = np.std(accs) / n_splits**.5
    logging.debug('%d-fold mean accuracy = %.3f +/- %.3f' %
                  (n_splits, mean_acc, se_acc))
def test_weights(data, dep_var, cat_vars, scalar_vars, l2_weights):
    """
    Score candidate L2 penalty weights by held-out log-likelihood.

    A formula ``dep_var ~ C(cat_1) + ... + scalar_1 + ...`` is built, the
    rows of ``data`` are shuffled, and for every weight in ``l2_weights`` a
    regularized binomial (logit) GLM is fit on each of 10 folds and scored
    with ``compute_log_likelihood`` on that fold's test rows.

    :param data: data frame with the dependent and independent variables
    :param dep_var: name of the dependent variable
    :param cat_vars: names of categorical variables (wrapped in ``C(...)``)
    :param scalar_vars: names of scalar variables
    :param l2_weights: candidate values for the ``alpha`` penalty
    :returns: ``mean(axis=0)`` of the (weights x folds) likelihood table
    """
    # Build the formula from the ``cat_vars`` parameter; the previous code
    # referenced a name that is not defined in this function.
    indep_formula = ' + '.join(
        ['C(%s)' % (cat_var) for cat_var in cat_vars] + list(scalar_vars))
    formula = '%s ~ %s' % (dep_var, indep_formula)

    # Shuffle rows through a positional permutation. Shuffling ``.values``
    # in place only affects the frame when ``.values`` is a view, which is
    # not the case for mixed-dtype frames.
    data_rand = data.iloc[np.random.permutation(data.shape[0])]

    # convert raw data to exogenous data
    # need to do this to force train/test
    # to have same features
    model_dummy = GLM.from_formula(formula,
                                   data_rand,
                                   family=Binomial(link=logit()))
    exog = model_dummy.exog
    exog_names = model_dummy.exog_names
    endog = model_dummy.endog

    # generate cross validation folds
    # NOTE(review): the "train" indices are the single fold chunk and the
    # "test" indices are everything else, and floor/ceil let them overlap
    # at chunk boundaries -- confirm this is intended.
    cross_val_folds = 10
    N = data_rand.shape[0]
    cross_val_chunk_size = float(N) / cross_val_folds
    cross_val_fold_train_idx = [
        list(
            range(int(floor(i * cross_val_chunk_size)),
                  int(ceil((i + 1) * cross_val_chunk_size))))
        for i in range(cross_val_folds)
    ]
    cross_val_fold_test_idx = [
        list(range(0, int(ceil(i * cross_val_chunk_size)))) +
        list(range(int(floor((i + 1) * cross_val_chunk_size)), N))
        for i in range(cross_val_folds)
    ]

    weight_likelihoods = []
    for l2_weight in l2_weights:
        print('testing weight = %.3f' % (l2_weight))
        likelihoods_l2 = []
        for i, (train_idx_i, test_idx_i) in enumerate(
                zip(cross_val_fold_train_idx, cross_val_fold_test_idx)):
            print('fold %d' % (i))
            train_XY = data_rand.iloc[train_idx_i, :]
            test_X = exog[test_idx_i, :]
            test_Y = endog[test_idx_i]

            # fit model
            # NOTE(review): ``max_iter`` is not defined in this function;
            # presumably a module-level setting -- confirm.
            model_i = GLM.from_formula(formula,
                                       train_XY,
                                       family=Binomial(link=logit()))
            model_res_i = model_i.fit_regularized(maxiter=max_iter,
                                                  method='elastic_net',
                                                  alpha=l2_weight,
                                                  L1_wt=0.)

            # add 0 params for missing coefficients
            # to match X shape; ``reindex`` inserts NaN for labels the fold
            # model lacks, whereas ``.loc`` raises KeyError for them.
            params_i = model_res_i.params.reindex(exog_names).fillna(0)

            # score test data
            likelihood_i = compute_log_likelihood(params_i, test_Y, test_X)
            likelihoods_l2.append(likelihood_i)
        weight_likelihoods.append(likelihoods_l2)

    weight_likelihoods = pd.DataFrame(np.array(weight_likelihoods),
                                      index=l2_weights)
    # NOTE(review): axis=0 averages over weights (one value per fold);
    # a per-weight mean would be axis=1 -- confirm which is wanted.
    mean_weight_likelihoods = weight_likelihoods.mean(axis=0)
    return mean_weight_likelihoods
delimiter=",", names=[ "male", "female", "infant", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings" ]) sample.describe() sample_validation.describe() N = len(sample) N_valid = len(sample_validation) print("{} samples to train and {} samples to validate".format(N, N_valid)) #Definició del model lineal com a combinació lineal de les entrades de les dades. model_glm = GLM.from_formula( 'rings ~ male + female + infant + length + diameter + height + whole_weight + shucked_weight + viscera_weight + shell_weight', sample) model = model_glm.fit() print(model.summary()) print(model.params, file=open('coeficients/linear_regression', 'w')) #Fem les prediccions de train prediccions = model.predict(sample.loc[:, "male":"shell_weight"]) #Trobem les mètriques per evaluar el model, sobre les dades de train MAE = np.sum(abs(sample.rings - prediccions)) / N print("MAE:", MAE) mean_square_error = np.sum((sample.rings - prediccions)**2) / N print("MSE:", mean_square_error) root_mse = np.sqrt(model.deviance / N)