def multiple_linear_regression():
    '''Multiple linear regression, chapter 6.3, p. 98

    Fits ``carbohydrate ~ age + weight + protein`` once with OLS and once as
    a Gaussian GLM, then fits the reduced model
    ``carbohydrate ~ weight + protein`` with OLS.  The summary of every fit
    and the ANOVA table of both OLS fits are printed; nothing is returned.
    '''

    # Load the data table through the get_data helper
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    glm_fit = glm('carbohydrate ~ age + weight + protein',
                  family=Gaussian(), data=df).fit()

    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the
    # special case where all the usual OLS criteria apply - iid Gaussian
    # noise etc. GLM instead uses an approximate method which is correct
    # asymptotically but may be off for small samples; the tradeoff you get
    # in return is that this method works the same way for all GLM models,
    # including those with non-Gaussian error terms and non-trivial link
    # functions. So that's why they're different.
    print(glm_fit.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
def multiple_linear_regression():
    '''Multiple linear regression, chapter 6.3, p. 98

    Fits ``carbohydrate ~ age + weight + protein`` once with OLS and once as
    a Gaussian GLM, then fits the reduced model
    ``carbohydrate ~ weight + protein`` with OLS.  The summary of every fit
    and the ANOVA table of both OLS fits are printed; nothing is returned.
    '''

    # Load the data table through the get_data helper
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    # The fit must not be bound to the name ``glm``: an assignment to ``glm``
    # anywhere in this function makes the name local, so the call glm(...)
    # would raise UnboundLocalError before the model is ever built.
    glm_fit = glm('carbohydrate ~ age + weight + protein',
                  family=Gaussian(), data=df).fit()

    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the
    # special case where all the usual OLS criteria apply - iid Gaussian
    # noise etc. GLM instead uses an approximate method which is correct
    # asymptotically but may be off for small samples; the tradeoff you get
    # in return is that this method works the same way for all GLM models,
    # including those with non-Gaussian error terms and non-trivial link
    # functions. So that's why they're different.
    print(glm_fit.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
def ancova():
    '''ANCOVA, chapter 6.5, p 117

    Fits the OLS model ``y~x+method`` to the achievement-score table and
    prints its ANOVA table followed by the fit summary.  Nothing is returned.
    '''

    # Load the data table through the get_data helper
    inFile = r'GLM_data/Table 6.12 Achievement scores.xls'
    df = get_data(inFile)

    # fit the model
    model = ols('y~x+method', data=df).fit()
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(anova_lm(model))
    print(model.summary())
def f_twoway(data, factorA_name, factorB_name, value_name, precision=4):
    """Run a two-way ANOVA with interaction and print the three p-values.

    The model ``value ~ C(A) + C(B) + C(A):C(B)`` is fitted with OLS and
    analysed with a Type II ``anova_lm`` table (``typ=2``).

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    factorA_name, factorB_name : column names of the two categorical factors
    value_name : column name of the response
    precision : number of decimals used when printing the p-values

    Returns
    -------
    results, aov_table, render_table, factorA_f_stat, factorA_p_value,
    factorB_f_stat, factorB_p_value, inter_f_stat, inter_p_value

    ``render_table`` is a relabelled copy of ``aov_table`` with an extra
    'Total' row whose 'F' and 'p-value' cells are NaN.
    """
    formula = (f'{value_name} ~ C({factorA_name}) + C({factorB_name}) + '
               f'C({factorA_name}):C({factorB_name})')
    results = smf.ols(formula, data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table is indexed by term name, so rows are picked by position with
    # .iloc; a plain integer key on a labelled Series is deprecated in pandas.
    f_column = aov_table['F']
    p_column = aov_table['PR(>F)']
    factorA_f_stat, factorA_p_value = f_column.iloc[0], p_column.iloc[0]
    factorB_f_stat, factorB_p_value = f_column.iloc[1], p_column.iloc[1]
    inter_f_stat, inter_p_value = f_column.iloc[2], p_column.iloc[2]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    render_table.index = ['Factor A', 'Factor B', 'Interaction', 'Error']
    render_table.loc['Total'] = render_table.sum()
    # A summed F statistic or p-value has no meaning, so blank those cells.
    render_table.loc['Total', ['F', 'p-value']] = np.nan

    print(
        f"Factor A's p-value: {factorA_p_value:.{precision}f}\n"
        f"Factor B's p-value: {factorB_p_value:.{precision}f}\n"
        f"Interaction p-value: {inter_p_value:.{precision}f}"
    )
    return (results, aov_table, render_table,
            factorA_f_stat, factorA_p_value,
            factorB_f_stat, factorB_p_value,
            inter_f_stat, inter_p_value)
def anova_test_formula(formula, data=None, typ=1):
    """Fit an OLS model from a formula and build its ANOVA table.

    Parameters
    ----------
    formula : model formula handed to ``smf.ols``
    data : data set handed to ``smf.ols`` (default ``None``)
    typ : ANOVA type forwarded to ``smt.anova_lm`` (default 1)

    Returns
    -------
    (anova_table, fitted_model)
    """
    fitted = smf.ols(formula, data=data).fit()
    return smt.anova_lm(fitted, typ=typ), fitted
def anova():
    '''ANOVA, chapter 6.4, p. 108, and p. 113

    GLM does not work with anova_lm.

    Prints the Gaussian-GLM and OLS summaries and the OLS ANOVA table for the
    model ``weight~group``, then prints the ANOVA table of the two-factor
    model with interaction ``data~A*B``.  Nothing is returned.
    '''

    # Load the data table through the get_data helper
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109)
    # The model must not be bound to the name ``glm``: an assignment to
    # ``glm`` anywhere in this function makes the name local, so the call
    # glm(...) would raise UnboundLocalError.
    glm_model = glm('weight~group', family=Gaussian(), data=df)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(glm_model.fit().summary())

    print('-' * 65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    ols_int = ols('data~A*B', data=df)
    # Print the table; computing it without using the result had no effect.
    print(anova_lm(ols_int.fit()))

    # The python commands for the other four models are
    # (kept for reference only; they are not fitted here)
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
def anova():
    '''ANOVA, chapter 6.4, p. 108, and p. 113

    GLM does not work with anova_lm.

    Prints the Gaussian-GLM and OLS summaries and the OLS ANOVA table for the
    model ``weight~group``, then prints the ANOVA table of the two-factor
    model with interaction ``data~A*B``.  Nothing is returned.
    '''

    # Load the data table through the get_data helper
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109)
    glm_model = glm('weight~group', family=Gaussian(), data=df)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(glm_model.fit().summary())

    print('-' * 65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    ols_int = ols('data~A*B', data=df)
    # Print the table; computing it without using the result had no effect.
    print(anova_lm(ols_int.fit()))

    # The python commands for the other four models are
    # (kept for reference only; they are not fitted here)
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
def f_oneway(data, treatment_name, value_name):
    """Run a one-way ANOVA and print the treatment p-value.

    The model ``value ~ C(treatment)`` is fitted with OLS and analysed with a
    Type II ``anova_lm`` table (``typ=2``).

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    treatment_name : column name of the categorical treatment
    value_name : column name of the response

    Returns
    -------
    results, aov_table, render_table, f_stat, p_value

    ``render_table`` is a copy of ``aov_table`` with renamed columns and an
    extra 'Total' row whose 'F' and 'p-value' cells are NaN.  The row labels
    are left as ``anova_lm`` produced them.
    """
    results = smf.ols(f'{value_name} ~ C({treatment_name})', data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table is indexed by term name, so the first row is picked by
    # position with .iloc; a plain integer key on a labelled Series is
    # deprecated in pandas.
    f_stat = aov_table['F'].iloc[0]
    p_value = aov_table['PR(>F)'].iloc[0]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    render_table.loc['Total'] = render_table.sum()
    # A summed F statistic or p-value has no meaning; left in place it would
    # just repeat the treatment row's values, so blank those cells.
    render_table.loc['Total', ['F', 'p-value']] = float('nan')

    print(f'p-value: {p_value}')
    return results, aov_table, render_table, f_stat, p_value
def f_random_block(data, treatment_name, block_name, value_name, precision=4):
    """Run a randomized-block ANOVA and print treatment and block p-values.

    The additive model ``value ~ C(treatment) + C(block)`` is fitted with OLS
    and analysed with a Type II ``anova_lm`` table (``typ=2``).

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    treatment_name : column name of the categorical treatment
    block_name : column name of the categorical block
    value_name : column name of the response
    precision : number of decimals used when printing the p-values

    Returns
    -------
    results, aov_table, render_table, treatment_f_stat, treatment_p_value,
    block_f_stat, block_p_value

    ``render_table`` is a relabelled copy of ``aov_table`` with an extra
    'Total' row whose 'F' and 'p-value' cells are NaN.
    """
    results = smf.ols(f'{value_name} ~ C({treatment_name}) + C({block_name})',
                      data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table is indexed by term name, so rows are picked by position with
    # .iloc; a plain integer key on a labelled Series is deprecated in pandas.
    f_column = aov_table['F']
    p_column = aov_table['PR(>F)']
    treatment_f_stat, treatment_p_value = f_column.iloc[0], p_column.iloc[0]
    block_f_stat, block_p_value = f_column.iloc[1], p_column.iloc[1]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    render_table.index = ['Treatment', 'Block', 'Error']
    render_table.loc['Total'] = render_table.sum()
    # A summed F statistic or p-value has no meaning, so blank those cells.
    render_table.loc['Total', ['F', 'p-value']] = np.nan

    print(
        f'Treatment p-value (main): {treatment_p_value:.{precision}f}\n'
        f'Block p-value: {block_p_value:.{precision}f}'
    )
    return (results, aov_table, render_table,
            treatment_f_stat, treatment_p_value,
            block_f_stat, block_p_value)
# In[24]:

# Constant-only (restricted) model: the response regressed on an intercept.
formula = 'crime ~ 1'
mod = smf.ols(formula, data = df)
reg02 = mod.fit()
print ( reg02.summary())

# ## <font color = blue>Compare your unrestricted model to the constant-only model (the restricted model).</font>

# In[63]:

# anova_lm compares the models in the order given and takes its scale from
# the LAST one, so the restricted model goes first and the larger model last.
# NOTE(review): reg01 is defined outside this excerpt; it is assumed to be the
# unrestricted fit -- confirm.
anova_lm( reg02, reg01 )

# ## <font color = blue> Calculate elasticities.</font>

# In[43]:

# params is labelled by term name, so the second coefficient is taken by
# position with .iloc (a plain integer key on a labelled Series is
# deprecated in pandas).
dYdX = reg03.params.iloc[1]
# Elasticity evaluated at the sample means of crime and amtunemp.
eta = dYdX * (df.crime.mean()/df.amtunemp.mean())
print ('eta = ', round(eta, 4))

# In[42]:
# Fit the OLS model described by my_formula and show its summary.
print(my_formula)
lm_ins1 = smf.ols(my_formula, boston_df)
lm_fit1 = lm_ins1.fit()
print(lm_fit1.summary())

# Interaction Terms
# I(LSTAT**2) adds the squared LSTAT column as a regressor.
lm_fit2 = smf.ols('MEDV ~ LSTAT + I(LSTAT**2)', boston_df).fit()
print(lm_fit2.summary())

# Non linear transformations
# import anova function
from statsmodels.stats.api import anova_lm

# Smaller model first, larger (quadratic) model second.
lm_fit = smf.ols('MEDV ~ LSTAT', boston_df).fit()
lm_fit2 = smf.ols('MEDV ~ LSTAT + I(LSTAT**2)', boston_df).fit()
# perform the hypothesis test (see https://en.wikipedia.org/wiki/F-test regression section)
print(anova_lm(lm_fit, lm_fit2))

fig, ax = plt.subplots(figsize=(8, 6))
# Plot the data
ax.scatter(boston_df.LSTAT, boston_df.MEDV, facecolors='none', edgecolors='b', label="data")
# plot the models fitted values
# NOTE(review): this call continues beyond the visible excerpt.
ax.plot(boston_df.LSTAT, lm_fit2.fittedvalues, 'g', marker='o', linestyle='none',
ax.set_title(x) ax.text(1,.75, ("%d zero values dropped\nout of %d observations" % (zero_count, total_count)),transform=ax.transAxes, horizontalalignment='right',verticalalignment='top') if thou: ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int((x*1e-3)), ",") if x>1000 else format(int(x), ","))) remove_border(ax) else: #if no data for a specific month ax.plot([]) remove_border(ax, top=False, left=False, right=False, bottom=False) ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) plt.savefig('./Output/' + filename + "_" + year + ".png", bbox_inches='tight') #now graph the small multiples years = dfT.year.unique() for year in years: small_mult_hist(dfT, 'delayHH', 'display_month', year, 'Distribution of Delay days per Household Employed among Blocks', filename= 'Dist_delay_HH', rows=4, log=False, bins=50, thou=False) small_mult_hist(dfT, 'delay', 'display_month', year, 'Distribution of Delay Days (thousands) among Blocks', log=False, bins=50, thou=True, filename= "Dist_delay") ### Variance analysis using ANOVA anova_lm(ols('delay ~ display_month', dfT.dropna(subset=['delay'])).fit()).to_csv('./Output/delay_ANOVA.csv') anova_lm(ols('delayHH ~ display_month', dfT.dropna(subset=['delayHH'])).fit()).to_csv('./Output/delayHH_ANOVA.csv')
# Add an interaction between salary and experience, allowing different intercepts for level of experience.
#
# $$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$

# <codecell>

interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
# print(...) with a single argument behaves the same on Python 2 and 3
print(interX_lm.summary())

# <markdowncell>

# Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm or we can use an F-test.

# <codecell>

print(anova_lm(lm, interX_lm))

# <codecell>

print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))

# <codecell>

print(interX_lm.f_test([[0, 0, 0, 0, 0, 1, -1], [0, 0, 0, 0, 0, 0, 1]]))

# <markdowncell>

# The contrasts are created here under the hood by patsy.

# <markdowncell>
def anova_one_drug_one_feature_custom(self, drug_id, feature_name, formula, odof=None):
    """Same as :meth:`anova_one_drug_one_feature` but allows any formula

    :param drug_id: drug identifier, used to build *odof* when it is not given
    :param feature_name: feature name, used to build *odof* when it is not
        given
    :param formula: regression formula handed to ``statsmodels`` ``ols``; it
        must contain ``Y``
    :param odof: optional pre-built data object; when None it is obtained
        from ``self._get_one_drug_one_feature_data(drug_id, feature_name)``
    :return: full ANOVA table but also populate internal attribute
        anova_pvalues that is a dictionary with pvalues for feature,
        media, msi and tissue

    Formula must be set in the settings attribute as follows::

        an = ANOVA(...)
        an.settings['regression_formula'] = "Y ~ C(tissue) + feature"

    .. note:: This function is convenient but 3-4 times slower than
        :meth:`anova_one_drug_one_feature`. So if your formula are one of::

            "Y ~ C(tissue) + C(media) + C(msi) + feature"
            "Y ~ C(tissue) + C(msi) + feature"
            "Y ~ C(msi) + feature"
            "Y ~ feature"

        you should rather use :meth:`anova_one_drug_one_feature` instead
        (keeping regression_formula set to 'auto').

    By default, in categories, the first treatment (e.g tissue) is used as
    a reference and is not shown in the results. You may set the reference
    as follows::

        "Y ~ C(tissue, Treatment(reference='breast'))"

    ANOVA pvalues returned are of type I

    .. versionadded:: 0.15.0
    """
    import statsmodels.formula.api as smf
    from statsmodels.stats.api import anova_lm

    if odof is None:
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    df = pd.DataFrame({'Y': odof.Y, 'feature': odof.masked_features})

    # Add other categorical explanatory variables if available.  This stays
    # best-effort, but only ordinary errors are ignored: a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    for covariate in ('tissue', 'msi', 'media'):
        try:
            df[covariate] = getattr(odof, 'masked_' + covariate).values
        except Exception:
            continue

    # "Y ~ C(tissue) + C(msi) + C(media) + feature"
    assert "Y" in formula, "Y must be the LHS of the formula"

    # This returns a Model instance
    model = smf.ols(formula, data=df)

    # Kept on the instance for debugging
    self._debug_custom_df = df
    self._debug_custom_model = model

    anova = anova_lm(model.fit(), typ=1)

    # Only rows whose label matches one of these terms exactly are reported.
    term_to_key = {
        'C(tissue)': 'tissue',
        'C(msi)': 'msi',
        'C(media)': 'media',
        'feature': 'feature',
    }
    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
    self.anova_pvalues = {
        term_to_key[term]: pvalue
        for term, pvalue in anova['PR(>F)'].items()
        if term in term_to_key
    }
    return anova
def get_p_fvalue(self, result_fit1, result_fit2):
    """Return the ``Pr(>F)`` value of the second row of the two-model table.

    The ``result_`` attribute of each argument is passed to ``anova_lm`` in
    the given order, and the entry labelled ``1`` of the table's ``Pr(>F)``
    column is returned.
    """
    first_fit = result_fit1.result_
    second_fit = result_fit2.result_
    comparison = anova_lm(first_fit, second_fit)
    p_column = comparison["Pr(>F)"]
    return p_column[1]
# Add an interaction between salary and experience, allowing different intercepts for level of experience.
#
# $$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$

# <codecell>

interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
# print(...) with a single argument behaves the same on Python 2 and 3
print(interX_lm.summary())

# <markdowncell>

# Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm or we can use an F-test.

# <codecell>

print(anova_lm(lm, interX_lm))

# <codecell>

print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))

# <codecell>

print(interX_lm.f_test([[0,0,0,0,0,1,-1],[0,0,0,0,0,0,1]]))

# <rawcell>

# The contrasts are created here under the hood by patsy.

# <markdowncell>
def anova_one_drug_one_feature_custom(self, drug_id, feature_name, formula, odof=None):
    """Same as anova_one_drug_one_feature but allows any formula

    :param drug_id: drug identifier, used to build *odof* when it is not given
    :param feature_name: feature name, used to build *odof* when it is not
        given
    :param formula: regression formula handed to ``statsmodels`` ``ols``; it
        must contain ``Y``
    :param odof: optional pre-built data object; when None it is obtained
        from ``self._get_one_drug_one_feature_data(drug_id, feature_name)``
    :return: full ANOVA table but also populate internal attribute
        anova_pvalues that is a dictionary with pvalues for feature,
        media, msi and tissue

    Formula must be set in the settings attribute as
    settings.regression_formula::

        an = ANOVA(...)
        an.settings.regression_formula = "Y ~ C(tissue) + feature"

    .. note:: This function is convenient but 3 times slower than
        :meth:`anova_one_drug_one_feature`, which is preferable if your
        formula is one of::

            "Y ~ C(tissue) + C(media) + C(msi) + feature"
            "Y ~ C(tissue) + C(msi) + feature"
            "Y ~ C(msi) + feature"
            "Y ~ feature"

    By default, in categories, the first treatment (e.g tissue) is used as
    a reference and is not shown in the results. You may set the reference
    as follows::

        "Y ~ C(tissue, Treatment(reference='breast'))"

    ANOVA pvalues returned are of type I

    .. versionadded:: 0.15.0
    """
    import statsmodels.formula.api as smf
    from statsmodels.stats.api import anova_lm

    if odof is None:
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    df = pd.DataFrame({"Y": odof.Y, "feature": odof.masked_features})

    # Add other categorical explanatory variables if available.  This stays
    # best-effort, but only ordinary errors are ignored: a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    for covariate in ("tissue", "msi", "media"):
        try:
            df[covariate] = getattr(odof, "masked_" + covariate).values
        except Exception:
            continue

    # "Y ~ C(tissue) + C(msi) + C(media) + feature"
    assert "Y" in formula, "Y must be the LHS of the formula"

    # This returns a Model instance
    model = smf.ols(formula, data=df)

    # Kept on the instance for debugging
    self._debug_custom_df = df
    self._debug_custom_model = model

    anova = anova_lm(model.fit(), typ=1)

    # Only rows whose label matches one of these terms exactly are reported.
    term_to_key = {
        "C(tissue)": "tissue",
        "C(msi)": "msi",
        "C(media)": "media",
        "feature": "feature",
    }
    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
    self.anova_pvalues = {
        term_to_key[term]: pvalue
        for term, pvalue in anova["PR(>F)"].items()
        if term in term_to_key
    }
    return anova
def anova_test(self):
    """Print the ``anova_lm`` comparison of ``self.fit`` and ``self.fit2``.

    ``stat_multi_regression_b`` and ``stat_multi_include_calculation`` are
    called first.  Nothing is returned.
    """
    # NOTE(review): presumably these two calls populate self.fit and
    # self.fit2 -- confirm against their definitions.
    self.stat_multi_regression_b()
    self.stat_multi_include_calculation()
    table1 = anova_lm(self.fit, self.fit2)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(table1)
m = smf.ols('College ~ ACT', data = gpa)
results = m.fit()
print(results.summary())

m = smf.ols('College ~ ACT + HS', data = gpa)
results = m.fit()
print(results.summary())
plt.plot(results.predict(),'bo')

######anova example
# ACT is listed twice in the larger formula; the repetition is left as written.
comp = smf.ols('College ~ ACT + HS + APHours + Height + ACT + KnownLanguages', data = gpa).fit()
simp = smf.ols('College ~ ACT + APHours + Height', data = gpa).fit()
from statsmodels.stats.api import anova_lm
# Smaller model first, larger model last.
anova_lm(simp, comp)

##############
# Load data
url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'
dat = pd.read_csv(url)
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
# Inspect the results
# The call form of print matches the rest of this script and also runs on
# Python 3, where the statement form is a SyntaxError.
print(results.summary())
plt.plot(results.predict(),"bo")
plt.xlabel("x")
plt.ylabel("y")
plt.plot(x[:,1], yfit, 'blue', label='model fit')
plt.plot(x[:,1], ytrue, 'red', label='population')
plt.legend(loc='upper left')

# (g)
# Append the square of column 1 of x as an extra regressor.
x2 = np.power(x,2)[:,1]
x2 = x2.reshape(100,1)
x = np.append(x, x2, 1)
model8 = sm.OLS(y,x)
# Fit once and reuse the result instead of refitting for every use.
model8_fit = model8.fit()
print(model8_fit.summary())
# Smaller model first, larger model last.
table = anova_lm(model7.fit(), model8_fit)
print(table)
# No evidence on polynomial being better.

# (h)-(j) skip


# -------------------------------------------
# Q14
# (a)
np.random.seed(1)
x1 = np.random.uniform(size=(100,))
x2 = 0.5*x1 + np.random.normal(size=(100,))/10.
y = 2 + 2*x1 + 0.3*x2 + np.random.normal(size=(100,))

# (b)
np.corrcoef(x1, x2)
#1-sample t-test stats.ttest_1samp(data['VIQ'], 0) #2-sample independent t-test female_viq = data[data['Gender'] == 'Female']['VIQ'] male_viq = data[data['Gender'] == 'Male']['VIQ'] stats.ttest_ind(female_viq, male_viq) #2-sample comparative t-test stats.ttest_rel(data['FSIQ'], data['PIQ']) stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0) #Exercise #Q1. Test the difference between weights in males and females. #Q2. Use non parametric statistics to test the difference between VIQ in males and females. #(Hint: using scipy.stats.mannwhitneyu()) from statsmodels.formula.api import ols model = ols("VIQ ~ Gender + 1", data) results = model.fit() print(results.summary()) model2 = ols('VIQ ~ Gender + Weight + Height + MRI_Count', data) results2 = model2.fit() print(results2.summary()) #Influence point checking infl = results2.get_influence() print(infl.summary_table()) from statsmodels.stats.api import anova_lm #ANOVA for Model Comparison table1 = anova_lm(results, results2) print(table1)