def test_plottype(self):
    """Valid plot types return a Figure; an unknown plot type raises."""
    # Each accepted plottype value must hand back a matplotlib Figure.
    for kind in ('line', 'scatter'):
        fig = interaction_plot(self.weight, self.duration, self.days,
                               plottype=kind)
        assert_equal(isinstance(fig, plt.Figure), True)
        plt.close(fig)

    # An unrecognised plottype is expected to be rejected with ValueError.
    assert_raises(ValueError, interaction_plot, self.weight, self.duration,
                  self.days, plottype='unknown')
    # Close whatever figures are still open after the failing call.
    plt.close('all')
def test_plot_both(self, close_figures):
    """Smoke test: default plot type with explicit colors and markers."""
    styling = {'colors': ['red', 'blue'], 'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days, **styling)
def test_formatting(self, close_figures):
    """Custom colors and linestyles must still yield a Figure."""
    fig = interaction_plot(
        self.weight,
        self.duration,
        self.days,
        colors=['r', 'g'],
        linestyles=['--', '-.'],
    )
    is_figure = isinstance(fig, plt.Figure)
    assert_equal(is_figure, True)
def test_plot_rainbow(self):
    """Smoke test: markers given but no explicit colors."""
    marker_style = {'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days,
                           **marker_style)
    plt.close(fig)
def test_plot_pandas(self, astype, close_figures):
    """Axis labels and legend title come from the Series names.

    The x factor is cast to ``astype`` before plotting.
    """
    x_factor = Series(self.weight, name='Weight').astype(astype)
    trace_factor = Series(self.duration, name='Duration')
    response = Series(self.days, name='Days')

    fig = interaction_plot(x_factor, trace_factor, response,
                           markers=['D', '^'], ms=10)

    axes = fig.axes[0]
    legend_title = axes.get_legend().get_title().get_text()
    assert_equal(legend_title, 'Duration')
    assert_equal(axes.get_ylabel(), 'mean of Days')
    assert_equal(axes.get_xlabel(), 'Weight')
def test_plot_string_data(self):
    """Plot with a string-typed x factor and check the labels."""
    x_factor = Series(self.weight, name='Weight').astype('str')
    trace_factor = Series(self.duration, name='Duration')
    response = Series(self.days, name='Days')

    fig = interaction_plot(x_factor, trace_factor, response,
                           markers=['D', '^'], ms=10)

    axes = fig.axes[0]
    legend_title = axes.get_legend().get_title().get_text()
    assert_equal(legend_title, 'Duration')
    assert_equal(axes.get_ylabel(), 'mean of Days')
    assert_equal(axes.get_xlabel(), 'Weight')
    plt.close(fig)
def test_plot_pandas(self):
    """Axis labels and legend title come from the Series names."""
    x_factor = Series(self.weight, name='Weight')
    trace_factor = Series(self.duration, name='Duration')
    response = Series(self.days, name='Days')

    fig = interaction_plot(x_factor, trace_factor, response,
                           markers=['D', '^'], ms=10)

    axes = fig.axes[0]
    legend_title = axes.get_legend().get_title().get_text()
    assert legend_title == 'Duration'
    assert axes.get_ylabel() == 'mean of Days'
    assert axes.get_xlabel() == 'Weight'
    plt.close(fig)
def test_plot_pandas(self, close_figures):
    """Axis labels and legend title come from the Series names."""
    x_factor = Series(self.weight, name='Weight')
    trace_factor = Series(self.duration, name='Duration')
    response = Series(self.days, name='Days')

    fig = interaction_plot(x_factor, trace_factor, response,
                           markers=['D', '^'], ms=10)

    axes = fig.axes[0]
    legend_title = axes.get_legend().get_title().get_text()
    assert_equal(legend_title, 'Duration')
    assert_equal(axes.get_ylabel(), 'mean of Days')
    assert_equal(axes.get_xlabel(), 'Weight')
def galtonRegressInter():
    """Fit height regressions on the Galton data and save two figures.

    Reads ``data/Galton.csv`` under ``mydir``, adds a ``Midparent`` column
    (row-wise mean of ``Father`` and ``Mother``), fits two OLS models and
    writes an interaction plot and a scatter plot with fitted lines to
    ``Figures/`` under ``mydir``. Returns nothing.
    """
    IN = pd.read_csv(mydir + 'data/Galton.csv', sep=',')
    IN['Midparent'] = IN[['Father', 'Mother']].mean(axis=1)
    # Additive model; its coefficients are used for the fitted lines below.
    mod1 = smf.ols(formula='Height ~ Midparent + C(Gender)', data=IN).fit()
    # Model with interaction term; fitted but not used further in this function.
    mod2 = smf.ols(formula='Height ~ Midparent * C(Gender)', data=IN).fit()
    fig, ax = plt.subplots(figsize=(6, 6))
    midparent = IN.Midparent.values
    gender = IN.Gender.values
    height = IN.Height.values
    # Draw onto the axes created above; ``fig`` is rebound to the returned figure.
    fig = interaction_plot(x=midparent, trace=gender, response=height, colors=['#FF6347', '#87CEEB'], markers=['D', '^'], ms=10, ax=ax)
    # NOTE(review): the title literal contains a backslash followed by a
    # space, which looks like a collapsed line continuation -- confirm the
    # intended text.
    plt.title('Interaction plot for the influence of mid-parent \n \ height and gender on offspring height', fontsize=20)
    plt.xlabel('Mid-parent height (inches)', fontsize=18)
    plt.ylabel('Mean of response', fontsize=18)
    fig_name = mydir + 'Figures/galtonRegressInterPlot.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
    # Split the observations by gender for the scatter plot.
    x_M = IN.loc[IN['Gender'] == 'M'].Midparent
    x_F = IN.loc[IN['Gender'] == 'F'].Midparent
    y_M = IN.loc[IN['Gender'] == 'M'].Height
    y_F = IN.loc[IN['Gender'] == 'F'].Height
    fig = plt.figure()
    plt.scatter(x_M, y_M, c='#87CEEB', marker='o', label='Men')
    plt.scatter(x_F, y_F, c='#FF6347', marker='o', label='Women')
    # NOTE(review): presumably params[0] is the intercept, params[1] the
    # gender dummy and params[2] the Midparent slope -- confirm the
    # coefficient order. Integer keys on a label-indexed Series are
    # positional access, which newer pandas deprecates.
    y_pred_F = mod1.params[0] + mod1.params[1] * 0 + mod1.params[2] * midparent
    y_pred_M = mod1.params[0] + mod1.params[1] * 1 + mod1.params[2] * midparent
    # Each fitted line is drawn twice: a thick black line with a thinner
    # coloured line on top.
    plt.plot(midparent, y_pred_F, 'k-', lw=5, c='black', label='_nolegend_')
    plt.plot(midparent, y_pred_F, 'k-', lw=2, c='#FF6347', label='_nolegend_')
    plt.plot(midparent, y_pred_M, 'k-', lw=5, c='black', label='_nolegend_')
    plt.plot(midparent, y_pred_M, 'k-', lw=2, c='#87CEEB', label='_nolegend_')
    #plt.plot(midparent, y_pred_F, c = '#FF6347')
    #plt.plot(midparent, y_pred_M, c = '#87CEEB')
    fig_name = mydir + 'Figures/galtonRegressInter.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def nova_2way(plt_i, interaction_figure=False, qq_figure=True):
    """Run a two-way ANOVA (station x season) on one pollutant.

    The log-transformed data are fetched via ``log_normal_stats``. Every
    (``sta_i``, ``ssn_i``) group is randomly down-sampled to the size of the
    smallest group before the model is fitted.

    Parameters
    ----------
    plt_i : index into ``pollutants`` selecting the response column.
    interaction_figure : bool
        If true, draw an interaction plot of season vs. station.
    qq_figure : bool
        If true, draw a QQ plot of the model residuals.

    Returns
    -------
    The type-2 ANOVA table, after ``eta_squared`` and ``omega_squared``
    have been applied to it.
    """
    return_dict = log_normal_stats(plt_i, figure=False)
    df_t = return_dict["log_data"]

    if interaction_figure:
        fig = interaction_plot(
            df_t["ssn_i"], df_t["sta_i"], df_t[pollutants[plt_i]], ms=10
        )
        ax = fig.axes[0]
        ax.set_xticks(range(len(seasons)))
        ax.set_xticklabels(seasons)
        ax.set_xlabel("季节")

    df_t_group = df_t.groupby(["sta_i", "ssn_i"])
    df_t_count_min = df_t_group.count().min().values[0]

    # Draw the same number of rows from every group, then stack them once
    # instead of growing a frame with one concat per group.
    sampled_units = []
    for key in df_t_group.groups.keys():
        unit = df_t_group.get_group(key)
        chosen_index = random.sample(list(unit.index), df_t_count_min)
        sampled_units.append(unit.loc[chosen_index, :])
    df_t_sample = pd.concat(sampled_units, axis=0, join="inner")

    formula = "{} ~ C(sta_i) + C(ssn_i) + C(sta_i):C(ssn_i)".format(pollutants[plt_i])
    model = ols(formula, df_t_sample).fit()
    aov_table = anova_lm(model, typ=2)
    # Called for their effect on ``aov_table``; return values are ignored.
    eta_squared(aov_table)
    omega_squared(aov_table)

    if qq_figure:
        sm.qqplot(model.resid, line="s")
        # plt.show()
    return aov_table
def test_plot_rainbow(self):
    """Smoke test: markers given but no explicit colors."""
    marker_style = {'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days,
                           **marker_style)
    plt.close(fig)
# In[14]:
# Bar plot of the response by drink.
plt.figure(figsize=(5, 4))
_ = sns.barplot(data=df_long, x='drink', y='value')

# In[15]:
# Bar plot of the response by the ``atd`` factor.
plt.figure(figsize=(5, 4))
_ = sns.barplot(data=df_long, x='atd', y='value')

# In[16]:
from statsmodels.graphics.factorplots import interaction_plot

fig = interaction_plot(
    x=df_long.drink,
    trace=df_long.atd,
    response=df_long.value,
    colors=['red', 'blue', 'green'],
    markers=['D', '^', '*'],
    ms=10,
)

# ## Posthoc Test

# In[18]:
from statsmodels.sandbox.stats.multicomp import MultiComparison

multicomp = MultiComparison(df_long['value'], df_long['variable'])
# testfunc

# In[19]:
# Bonferroni
com = multicomp.allpairtest(st.ttest_rel, method='bonf')
print(com[0])
def test_plot_rainbow(self, close_figures):
    """Smoke test: markers given but no explicit colors."""
    marker_style = {'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days,
                           **marker_style)
""" import pandas as pd import statsmodels.api as sm from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm from statsmodels.graphics.factorplots import interaction_plot import matplotlib.pyplot as plt from scipy import stats datafile = "ToothGrowth.csv" data = pd.read_csv(datafile) fig = interaction_plot(data.dose, data.supp, data.len, colors=['red', 'blue'], markers=['D', '^'], ms=10) # x, category, y N = len(data.len) df_a = len(data.supp.unique()) - 1 df_b = len(data.dose.unique()) - 1 df_axb = df_a * df_b df_w = N - (df_a + 1) * (df_b + 1) grand_mean = data['len'].mean() ssq_a = sum([(data[data.supp == l].len.mean() - grand_mean)**2 for l in data.supp]) ssq_b = sum([(data[data.dose == l].len.mean() - grand_mean)**2 for l in data.dose])
print(ttest_rel(dpc[:, 0].mean(-1), dpc[:, 1].mean(-1)))
print(ttest_rel(dpc[:, 1].mean(-1), dpc[:, 2].mean(-1)))

# %% Try to do an ANOVA
an_sub, an_angle, an_snr = np.meshgrid(np.arange(n_sub), [0, 90, 180],
                                       snr[::-1], indexing='ij')
data_dict = dict(subj=an_sub.ravel(), snr=an_snr.ravel(),
                 angle=an_angle.ravel(), dpc=dpc[..., ::-1].ravel())
data = DataFrame(data_dict)
from statsmodels.graphics.factorplots import interaction_plot
interaction_plot(data.snr, data.angle, data.dpc)


def eta_squared(aov):
    """Add an ``eta_sq`` column to the ANOVA table ``aov`` and return it.

    Each row except the last gets its ``sum_sq`` divided by the total of
    ``sum_sq``; the last row is left as NaN by index alignment.
    """
    aov['eta_sq'] = aov[:-1]['sum_sq'] / sum(aov['sum_sq'])
    return aov


def omega_squared(aov):
    """Add an ``omega_sq`` column to the ANOVA table ``aov`` and return it.

    The last row supplies the mean squared error (``sum_sq / df``); it is
    left as NaN in the new column by index alignment.
    """
    # Take the last row by position: integer keys on a label-indexed Series
    # are deprecated as positional access in pandas.
    mse = aov['sum_sq'].iloc[-1] / aov['df'].iloc[-1]
    effects = aov[:-1]
    aov['omega_sq'] = (effects['sum_sq'] - (effects['df'] * mse)) / \
        (sum(aov['sum_sq']) + mse)
    return aov
import pandas as pd d = pd.read_csv("therms.csv") d.columns from statsmodels.graphics.factorplots import interaction_plot from matplotlib import pyplot as plt fig = interaction_plot(d['number'], d['status'], d['time']) plt.xticks([]) plt.xlabel("") plt.savefig("congruent-incongruent.png") from statsmodels.formula.api import ols ols_d = ols(formula="time ~ number * status", data=d) myfits = ols_d.fit() plt.clf() f = plt.figure() a = f.gca() ip1 = interaction_plot( d['number'], d['status'], myfits.fittedvalues, plottype="line", ax=a, ) ip2 = interaction_plot( d['number'], d['status'], d['time'],
df = pd.DataFrame(data=d) df.head() model = ols("Avg_cal_day ~ BMI_Group + Year", data=df) results = model.fit() df_model2 = ols("Avg_cal_day ~ BMI_Group*Year", data=df).fit() print(sm.stats.anova_lm(results, df_model2)) print('-----------') print(results.summary()) fig, ax = plt.subplots(figsize=(6, 6)) fig = interaction_plot(x=df['Year'], trace=df['BMI_Group'], response=df['Avg_cal_day'], colors=['red', 'blue', 'green'], markers=['D', '^', 's'], ms=10, ax=ax) #fig = sm.graphics.plot_partregress_grid(df_model) #fig.tight_layout(pad=1.0) #Plot checker fig2, ax2 = plt.subplots(figsize=(6, 6)) #fig2 = def LinearRegModel(model, year=0, Overweight=0, Underweight=0): intercept = model.params[0] over_coef = model.params[1]
# Look at the dispersion of eggs for each factor, then for both together.
for by, stem in (
    ("DENSITY", "ex4_boxplot_eggs_density"),
    ("SEASON", "ex4_boxplot_eggs_season"),
    (["DENSITY", "SEASON"], "ex4_boxplot_eggs_density_season"),
):
    df.boxplot(column="EGGS", by=by)
    print("See graphs/" + stem + ".png")
    plt.savefig(Path.cwd() / ("Practical2/graphs/" + stem + ".png"))

# Perform two way ANOVA
print("Performing two way ANOVA")
mod = ols('EGGS ~ DENSITY + SEASON + DENSITY:SEASON', data=df).fit()
anova_table = sm.stats.anova.anova_lm(mod)
print(anova_table)
print(
    "Both the density and season affect the eggs and there IS an interaction between the two factors."
)

# Create interaction plot
print("Creating interaction plot")
interaction_plot(x=df['DENSITY'], trace=df['SEASON'], response=df['EGGS'])
print("See graphs/ex4_interaction_plot.png")
plt.savefig(Path.cwd() / "Practical2/graphs/ex4_interaction_plot.png")
print("More eggs are laid during spring")
print("Lines are not parallel so an interaction occurs.")
# Row-wise mean of the two replicate columns.
Df.loc[:, "Sbar"] = Df[["S1", "S2"]].apply(statistics.mean, axis=1)
# Log of the row-wise variance; a zero variance is replaced by log(0.1**20).
row_variance = Df[["S1", "S2"]].apply(statistics.variance, axis=1)
Df.loc[:, "S_lns2"] = row_variance.apply(
    lambda x: math.log(x) if x != 0 else math.log(0.1**20))

# One panel per factor A..F for the mean response.
f, axes = plt.subplots(2, 3, sharex=True, sharey=True)
for factor, panel in zip("ABCDEF", axes.flat):
    g = sns.factorplot(x=factor, y="Sbar", data=Df, ci=None, ax=panel)
plt.tight_layout()
f.savefig("MainEffPlt.png")

fig1 = interaction_plot(x=Df.A, trace=Df.C, response=Df.Sbar)
fig2 = interaction_plot(x=Df.A, trace=Df.E, response=Df.Sbar)

#frames=[DesMat,DesMat]
#Df1=pd.concat(frames)
#Df1.loc[:,"Y"]=Df.S1.tolist()+Df.S2.tolist()
#
#Df1.to_csv("Q9Dat.csv")

# Same panel layout for the log-variance response.
f2, axes1 = plt.subplots(2, 3, sharex=True, sharey=True)
for factor, panel in zip("ABCDEF", axes1.flat):
    g = sns.factorplot(x=factor, y="S_lns2", data=Df, ci=None, ax=panel)
## Plot Interaction of Categorical Factors

# In this example, we will visualize the interaction between categorical
# factors. First, some categorical data are created. They are then plotted
# using the interaction_plot function, which internally recodes the x-factor
# categories to integers.
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot
from pandas import Series

np.random.seed(12345)
weight = Series(np.repeat(['low', 'hi', 'low', 'hi'], 15), name='weight')
nutrition = Series(np.repeat(['lo_carb', 'hi_carb'], 30), name='nutrition')
days = np.log(np.random.randint(1, 30, size=60))

# Pass the axes of the 6x6 figure explicitly. Without ``ax`` the function
# draws on a figure of its own and the one created here stays empty.
fig, ax = plt.subplots(figsize=(6, 6))
interaction_plot(x=weight, trace=nutrition, response=days,
                 colors=['red', 'blue'], markers=['D', '^'], ms=10, ax=ax)
# <matplotlib.figure.Figure at 0x106dd2a10>
# image file:
# image file:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 15:56:53 2021

@author: mattias
"""
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot

pd.set_option('display.max_columns', None)
data = pd.read_excel(r"/home/mattias/Documents/class/hw8/hw8_q1.xlsx",
                     engine='openpyxl')

# Pass the axes of the 6x6 figure explicitly. Without ``ax`` the function
# draws on a figure of its own and the one created here stays empty.
fig, ax = plt.subplots(figsize=(6, 6))
fig = interaction_plot(x=data['connector_type'],
                       trace=data['battery_temp'],
                       response=data['discharge_time_mins'],
                       colors=['red', 'blue'], markers=['D', '^'], ms=10,
                       ax=ax)
plt.show()
134.9, 146.3, 145.2, 146.3, 125.9, 127.6, 108.9, 107.5, 148.6, 156.5, 148.6, 153.1, 135.5, 138.9, 132.1, 149.7, 152.0, 151.4, 149.7, 152.0, 142.9, 142.3, 141.7, 141.2] fac=np.array([1,2,3,4]) day=np.repeat(fac,8, axis=0) machine=np.concatenate((np.repeat(fac,2, axis=0), np.repeat(fac,2, axis=0),np.repeat(fac,2, axis=0),np.repeat(fac,2, axis=0))) trigly = {'y': y , 'day': pd.Categorical(day), 'machine': pd.Categorical(machine)} trigly = pd.DataFrame(data=trigly) trigly.info() print(pd.Categorical(day).categories) print(pd.Categorical(machine).categories) pd.crosstab(day, machine,rownames=['day'],colnames=['machine']) ## plot from statsmodels.graphics.factorplots import interaction_plot fig, ax = plt.subplots(figsize=(6, 6)) fig = interaction_plot(x=day, trace=machine, response=y,colors=['red', 'blue','brown','black'], markers=['.', '^','*','D'], ms=10, ax=ax) ## fitting model md2 = smf.mixedlm("y ~ (1-day)+(1-machine) + (1-day*machine) ", trigly, groups=machine) mdf2 = md2.fit() mdf2.summary() print(mdf2.tvalues) ## nesting ## pastes data Pastes=pd.read_csv('Pastes.csv',sep=" ") Pastes.head() Pastes.info() from pandas.api.types import CategoricalDtype cask=Pastes["cask"] batch=Pastes["batch"] strength=Pastes["strength"] ## ggplot
import matplotlib.pyplot as plt
from patsy.contrasts import Sum

# 24 observations: four method levels, each measured on six batches.
Daten = DataFrame({ "Batch": np.tile(["1", "2", "3", "4", "5", "6"], 4), "Methode": np.repeat(["8500", "8700", "8900", "9100"], 6), "Y": np.array([ 90.3, 89.2, 98.2, 93.9, 87.4, 97.9, 92.5, 89.5, 90.6, 94.7, 87, 95.8, 85.5, 90.8, 89.6, 86.2, 88, 93.4, 82.5, 89.5, 85.6, 87.4, 78.9, 90.7 ]) })
# Interaction plot of Y: batch on the x axis, one trace per method.
interaction_plot(x=Daten["Batch"], trace=Daten["Methode"], response=Daten["Y"])
plt.ylabel("Daten Y")
plt.show()
# =============================================================================
# Two-way analysis of variance with blocks
# =============================================================================
from patsy.contrasts import Sum
# Additive model with sum contrasts for both factors.
fit = ols("Y ~ C(Methode, Sum)+C(Batch,Sum)", data=Daten).fit()
# Bare expression: only displays a value when run interactively.
fit.params
# NOTE(review): same model as above (the formulas differ only in
# whitespace); the refit looks redundant -- confirm.
fit = ols("Y ~ C(Methode, Sum)+C(Batch, Sum)", data=Daten).fit()
anova_lm(fit)
# =============================================================================
# Aircraft colour
plt.show() # 2-3. RS Analysis: Do both "Era" and "League" affect team "RS"? # two-factor ANOVA F-test # factor 1: "Era" and factor 2: "League" model = ols("RS ~ C(Era) + C(League) + C(Era):C(League)", data=batting_df).fit() two_aov_table = sm.stats.anova_lm(model, typ=2) print("------- Two-factor ANOVA Table -------") print(two_aov_table.round(3)) # interaction plot fig, ax = plt.subplots(figsize=(9, 6)) interaction_plot(x=batting_df["League"], trace=batting_df["Era"], response=batting_df["RS"], colors=['#4c061d', '#d17a22', '#b4c292'], ax=ax) plt.title("Two-factor ANOVA Interaction Plot", fontsize=16) plt.ylabel("Mean RS") plt.show() # check ANOVA assumptions # normality fig = sm.qqplot(model.resid, line="s") plt.title("Two-factor ANOVA QQ Plot") plt.show() # equal-variance g = sns.FacetGrid(batting_df, col="Era", row="League", height=4, aspect=1) g.map_dataframe(sns.boxplot,
ins = pd.DataFrame([[col, row, x]], columns=["Occupation", "Location", "Salaries"]) plot_df = plot_df.append(ins) # transfer the data type of salary from object to numeic plot_df['Salaries'] = pd.to_numeric(plot_df['Salaries']) # reset the index to make it looks better, optional plot_df.reset_index(drop=True, inplace=True) # check the data frame before plotting plot_df fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 8)) interaction_plot(plot_df["Occupation"], plot_df["Location"], plot_df["Salaries"], colors=['red', 'blue'], func=np.mean, markers=['s', '^'], ms=5, ax=axes[0]) axes[0].legend(bbox_to_anchor=(1, .5), edgecolor='white', loc='center left') interaction_plot(plot_df["Location"], plot_df["Occupation"], plot_df["Salaries"], colors=['red', 'blue'], func=np.mean, markers=['s', '^'], ms=5, ax=axes[1]) axes[1].legend(bbox_to_anchor=(1, .5), edgecolor='white', loc='center left') ############################################################################
def test_plot_both(self):
    """Smoke test: default plot type with explicit colors and markers."""
    styling = {'colors': ['red', 'blue'], 'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days, **styling)
    plt.close(fig)
def TWANOVA(data, x1, x2, y):
    """Run a two-way ANOVA by hand, print the table and a Tukey HSD test.

    Input - csv with data, independent factor 1,2; dependent factor
    (as col names, string).

    Calculates the sums of squares (SS): Total (SSt), Between-Groups (SSb)
    for each factor, Within-Group (Error or SSw) and interaction (SSi)
    variability, with SSt = SSx1 + SSx2 + SSi + SSw.

    Adopted from
    https://www.marsja.se/three-ways-to-carry-out-2-way-anova-with-python/

    Parameters
    ----------
    data : path of a tab-separated file with a header row.
    x1, x2 : str
        Column names of the two factors. Their values are joined with
        ``'_'`` for the post-hoc test, so they must be strings.
    y : str
        Column name of the dependent variable.

    The printed table always labels its rows 'Genotype', 'Treatment',
    'GenotypexTreatment' and 'Residual', whatever ``x1`` and ``x2`` are.
    Nothing is returned.
    """
    import pandas as pd
    from statsmodels.graphics.factorplots import interaction_plot
    from scipy import stats
    # import the data
    data = pd.read_csv(data, sep='\t', header=(0))

    # Grand mean
    grand_mean = data[y].mean()
    # SS total
    SSt = sum((data[y] - grand_mean)**2)

    # SS for factors x1 and x2: every row contributes the squared deviation
    # of its own level mean from the grand mean. ``transform`` broadcasts
    # the level mean back onto the rows, replacing one boolean mask and
    # mean per row.
    SSx1 = sum((data.groupby(x1)[y].transform('mean') - grand_mean)**2)
    SSx2 = sum((data.groupby(x2)[y].transform('mean') - grand_mean)**2)

    # SS within (error/residual): squared deviation of every row from the
    # mean of its (x1, x2) cell.
    cell_mean = data.groupby([x1, x2])[y].transform('mean')
    SSw = sum((data[y] - cell_mean)**2)

    # SS interaction
    SSi = SSt - SSx1 - SSx2 - SSw

    # degrees of freedom
    N = len(data[y])
    levels_x1 = len(data[x1].unique())
    levels_x2 = len(data[x2].unique())
    df_x1 = levels_x1 - 1  # levels of factor -1
    df_x2 = levels_x2 - 1
    df_i = df_x1 * df_x2
    df_w = N - (levels_x1 * levels_x2)

    # mean squares
    MS_x1 = SSx1 / df_x1
    MS_x2 = SSx2 / df_x2
    MS_i = SSi / df_i
    MS_w = SSw / df_w

    # F-ratio
    f_x1 = MS_x1 / MS_w
    f_x2 = MS_x2 / MS_w
    f_i = MS_i / MS_w

    # p-values
    p_x1 = stats.f.sf(f_x1, df_x1, df_w)
    p_x2 = stats.f.sf(f_x2, df_x2, df_w)
    p_i = stats.f.sf(f_i, df_i, df_w)

    # printing results
    results = {
        'SS': [SSx1, SSx2, SSi, SSw],
        'df': [df_x1, df_x2, df_i, df_w],
        'F': [f_x1, f_x2, f_i, 'NaN'],
        'PR(>F)': [p_x1, p_x2, p_i, 'NaN']
    }
    columns = ['SS', 'df', 'F', 'PR(>F)']
    table = pd.DataFrame(
        results,
        columns=columns,
        index=['Genotype', 'Treatment', 'GenotypexTreatment', 'Residual'])
    print(table)

    # interaction plot
    fig = interaction_plot(data[x1], data[x2], data[y],
                           colors=['red', 'blue'], markers=['D', '^'], ms=10)

    # post-hoc Tukey's test on the combined "x1_x2" group label of each row
    x1_x2 = [a + '_' + b for a, b in zip(data[x1], data[x2])]
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    print(pairwise_tukeyhsd(data[y], x1_x2, alpha=0.05))
def test_formatting(self):
    """Custom colors and linestyles must still yield a Figure."""
    fig = interaction_plot(
        self.weight,
        self.duration,
        self.days,
        colors=['r', 'g'],
        linestyles=['--', '-.'],
    )
    is_figure = isinstance(fig, plt.Figure)
    assert_equal(is_figure, True)
    plt.close(fig)
plt.show()

# Box plots of achieved yield per product mix (pandas, then seaborn).
DOE_Plot8_1 = df_consol_final.boxplot(
    column=['Achieved_Yield_from_Mfg'],
    by='Product_mix',
    grid=False,
    fontsize=5,
)
plt.show()

DOE_Plot8_2 = sns.boxplot(
    data=df_consol_final,
    x='Product_mix',
    y='Achieved_Yield_from_Mfg',
    width=0.5,
    palette="colorblind",
)
plt.show()

# Pairwise interaction plots of the three factors against achieved yield.
DOE_Plot9 = interaction_plot(
    x=df_consol_final.Machine_Count,
    trace=df_consol_final.Operators_Count,
    response=df_consol_final.Achieved_Yield_from_Mfg,
    ms=10,
)
plt.show()

DOE_Plot10 = interaction_plot(
    x=df_consol_final.Machine_Count,
    trace=df_consol_final.Product_mix,
    response=df_consol_final.Achieved_Yield_from_Mfg,
    ms=10,
)
plt.show()

DOE_Plot11 = interaction_plot(
    x=df_consol_final.Operators_Count,
    trace=df_consol_final.Product_mix,
    response=df_consol_final.Achieved_Yield_from_Mfg,
    ms=10,
)
plt.show()
def btnTwoWayAnova_Click(self, m_widget):
    """Handle the two-way ANOVA button.

    Builds a DataFrame from the GUI table, asks the user (through a wizard)
    for the dependent and post-hoc columns, fits a two-way ANOVA without
    interaction, optionally adds Bonferroni-corrected multiple comparisons,
    and opens a results popup with the text output and the plots that could
    be drawn. Returns None; it returns early on an empty table, too few
    rows, a cancelled wizard, or a dependent column whose dtype is not float.
    """
    # Create a pandas DataFrame from the GUI table:
    dataframe = self.create_pandas_DataFrame(m_widget)
    if dataframe.empty:
        return
    # print(dataframe)

    # The table must have at least 3 rows:
    if len(dataframe) < 3:  # number of rows = len(dataframe)
        tkinter.messagebox.showinfo('Two-way ANOVA', 'You must have at least 3 values for each group.')
        return

    # I will use "dependent" for the dependent variable, "twoplus" for the group that has at least 2 variables
    # and "threeplus" for the group that has at least 3 variables
    # print(dataframe.ix[:, 0])  # First column
    column_names = list(dataframe)  # unsorted

    # ~~~ Each column name must start with a letter! ~~~
    # NOTE(review): the pattern only matches lowercase a-z although the
    # comment below mentions A-Z as well -- confirm which is intended.
    pattern = re.compile(r'^[a-z]')
    try:
        for x in column_names:
            m = re.search(pattern, x)
            # ~~~ If m doesn't exist, it's because a variable name doesn't begin with a-z or A-Z, thus the assertion
            # fails.
            assert m
    # NOTE(review): the bare except catches every exception raised in the
    # loop, not only the AssertionError; the fallback names assume exactly
    # three columns.
    except:
        dataframe.columns = ['column_1', 'column_2', 'column_3']
        column_names = list(dataframe)

    # ~~~ Launch the two-way ANOVA wizard ~~~
    wiz = ach_generic.TwoWayAnovaWizard(self, settings=tuple(x for x in column_names))
    if wiz.result is None:  # The user presses Cancel
        return
    dependent_var = wiz.result[0]  # just the column name
    posthoc_var = wiz.result[1]  # just the column name

    # ~~~ Get the other two variables from the column_names list ~~~
    temp_list = [str(x) for x in column_names if not str(x) == dependent_var]
    second_var = temp_list[0]
    third_var = temp_list[1]
    if not dataframe.dtypes[dependent_var] == float:
        tk.messagebox.showerror('Statistics', 'The dependent variable must be continuous')
        return

    # statmodels uses R-like model notation.
    # Two-way ANOVA with interactions: formula = 'len ~ C(supp) + C(dose) + C(supp):C(dose)'
    # Two-way ANOVA without interactions: formula = 'len ~ C(supp) + C(dose)'
    formula = '%s ~ C(%s) + C(%s)' % (dependent_var, second_var, third_var)
    # print(formula)
    model = ols(formula, dataframe).fit()
    aov_table1 = anova_lm(model, typ=2)
    # print('\n~~~ Two-way ANOVA without interactions ~~~')
    # print(aov_table1)
    # Text shown in the results popup.
    results_string = ''
    results_string += '\n~~~ Two-way ANOVA without interactions ~~~\n'
    results_string += formula + '\n'
    results_string += aov_table1.to_string()
    results_string += '\n'

    # ~~~ Bonferroni's correction ~~~
    if not posthoc_var == '':
        # ~~~ 1st variable: dependent_var, 2nd variable: c2_var, 3rd variable: posthoc_var ~~~
        c2_var = [x for x in column_names if not x in [dependent_var, posthoc_var]][0]
        second_var = c2_var
        third_var = posthoc_var
        c1 = dataframe[dependent_var]
        c2 = dataframe[second_var]
        c3 = dataframe[third_var]
        # assert column c3 had at least 3 unique values
        if len(c3.unique()) < 3:
            info = 'Post hoc test \'Bonferroni\' should have at least 3 values in column \'%s\'' % posthoc_var
            tk.messagebox.showerror('Statistics', info)
        else:
            # NOTE(review): row_names is assigned but not used in this method.
            row_names = [''] * len(c3.unique())
            p_v_cor, corresponding_groups = multiple_comparisons_with_bonferroni(c1, c3)
            dataframe_bon = self.create_bonferroni_dataframe(p_v_cor, corresponding_groups)
            # print('\n~~~ Post hoc test: Multiple comparisons with Bonferroni correction ~~~')
            # print(dataframe_bon)
            results_string += '\n~~~ Post hoc test: Multiple comparisons with Bonferroni correction ~~~\n'
            results_string += dataframe_bon.to_string()
    else:
        # No post-hoc column chosen: keep the column order derived above.
        c1 = dataframe[dependent_var]
        c2 = dataframe[second_var]
        c3 = dataframe[third_var]

    # Plots:
    plt.close('all')
    # fig1 = interaction_plot(threeplus, twoplus, dependent, colors=['red', 'blue'], markers=['D', '^'], ms=10)
    # fig2 = sm.qqplot(model.resid, line='s')
    # ~~~ plotting fails when posthoc_var is 'supp' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # fig, axx = plt.subplots(nrows=2)  # create two subplots, one in each row
    # interaction_plot(c3, c2, c1, colors=['red', 'blue'], markers=['D', '^'], ms=10, ax=axx[0])
    # sm.qqplot(model.resid, line='s', ax=axx[1])
    # Figures handed to the results popup; a plot that fails is skipped.
    plots = []
    try:
        plot1 = interaction_plot(c3, c2, c1, colors=['red', 'blue'], markers=['D', '^'], ms=10)
        plots.append(plot1)
    except Exception as e:
        print('Error in plot1:', str(e))
    # plt.show()

    # Scatter plot
    # plt.figure()
    # if c3.dtypes == float:
    #     plt.scatter(c3, c1, color='red')
    # else:
    #     # Convert categorical variables to numbers
    #     from sklearn.preprocessing import LabelEncoder
    #     labelencoder = LabelEncoder()
    #     c3_encoded = labelencoder.fit_transform(c3)
    #     plt.scatter(c3_encoded, c1, color='red')
    # plt.title('Outliers')

    # Boxplot
    try:
        fig_boxplot = plt.figure()
        # One group of dependent values per unique value of c3.
        temp = []
        for i in range(len(c3.unique())):
            temp2 = c1[c3 == c3.unique()[i]]
            temp.append(temp2)
        plt.boxplot(temp)
        plt.title('Outlier detection', figure=fig_boxplot)
        plt.xlabel(third_var, figure=fig_boxplot)
        plt.ylabel(dependent_var, figure=fig_boxplot)
        plots.append(fig_boxplot)
    except Exception as e:
        print('Error in Boxplot:', str(e))
    results_popup = ach_generic.ResultsPopup(self, settings=results_string, plots=plots)
def two_way_anova(dataframe_a, dataframe_b, parameter, parm_val_a, parm_val_b, bin_var):
    """Performs regular two-way ANOVA for a given feature measured over bins.

    Arguments
    ---
    dataframe_a,b: pandas DataFrame
        Spreadsheet input, in the FastStat case coming from a filtered
        data frame from DataSet object.
    parameter: str
        Name of the variable for the two-way measurement
    parm_val_a, b: str
        Value for the chosen parameter
    bin_var: str
        Name of the bin variable

    Returns
    ---
    tuple
        ``(table, figure)``: a pandas.DataFrame with the ANOVA information
        (rows for ``parameter``, ``'bin'``, their interaction and the
        residual) and the interaction plot as a URL-quoted, base64-encoded
        PNG.
    """
    # counts number of bins for given bin variable
    bin_num = dataframe_a.columns.str.contains(bin_var + ' bin ').sum()
    subdataset_a = DataSet(
        dataframe_a,
        dataframe_a.columns[dataframe_a.columns.get_loc(bin_var + ' bin 1') + bin_num])
    subdataset_b = DataSet(
        dataframe_b,
        dataframe_b.columns[dataframe_b.columns.get_loc(bin_var + ' bin 1') + bin_num])
    bin_dataset_a = bin_dataframe_generator(
        bins_subset(subdataset_a.data_frame, bin_var), bin_var)
    bin_dataset_b = bin_dataframe_generator(
        bins_subset(subdataset_b.data_frame, bin_var), bin_var)
    bin_dataset_a[parameter] = parm_val_a
    bin_dataset_b[parameter] = parm_val_b
    # DataFrame.append was removed in pandas 2.0; concat keeps the original
    # row labels, as append did.
    anova_dataset = pd.concat([bin_dataset_a, bin_dataset_b])

    fig = interaction_plot(anova_dataset['bin'], anova_dataset[parameter],
                           anova_dataset[bin_var], colors=['red', 'blue'],
                           markers=['D', '^'], ms=10)
    # Render the plot into memory, then close the figure so repeated calls
    # do not accumulate open figures.
    figfile = BytesIO()
    fig.savefig(figfile, format='png')
    plt.close(fig)
    figfile.seek(0)
    figdata_png = base64.b64encode(figfile.getvalue())

    # Degrees of freedom - df
    n = len(anova_dataset[bin_var])
    df_a = len(anova_dataset['bin'].unique()) - 1
    df_b = len(anova_dataset[parameter].unique()) - 1
    dfaxb = df_a * df_b
    df_within = n - (len(anova_dataset['bin'].unique()) *
                     len(anova_dataset[parameter].unique()))

    # Sum of squares - ssq (factors A, B and total).
    # Each row contributes the squared deviation of its own level mean from
    # the grand mean.
    grand_mean = anova_dataset[bin_var].mean()
    ssq_a = sum([
        (anova_dataset[anova_dataset[parameter] == l][bin_var].mean() - grand_mean)**2
        for l in anova_dataset[parameter]
    ])
    ssq_b = sum([
        (anova_dataset[anova_dataset['bin'] == l][bin_var].mean() - grand_mean)**2
        for l in anova_dataset['bin']
    ])
    ssq_total = sum((anova_dataset[bin_var] - grand_mean)**2)

    # Sum of Squares Within (error/residual): deviation of every row from
    # the mean of its bin, computed separately for each input dataset.
    bin_means_a = [
        bin_dataset_a[bin_dataset_a['bin'] == d][bin_var].mean()
        for d in bin_dataset_a['bin']
    ]
    bin_means_b = [
        bin_dataset_b[bin_dataset_b['bin'] == d][bin_var].mean()
        for d in bin_dataset_b['bin']
    ]
    ssq_within = sum((bin_dataset_b[bin_var] - bin_means_b)**2) + sum(
        (bin_dataset_a[bin_var] - bin_means_a)**2)

    # Sum of Squares Interaction
    ssqaxb = ssq_total - ssq_a - ssq_b - ssq_within

    # Mean Squares
    ms_a = ssq_a / df_a
    ms_b = ssq_b / df_b
    ms_ax_b = ssqaxb / dfaxb
    ms_within = ssq_within / df_within

    # F-ratio
    f_a = ms_a / ms_within
    f_b = ms_b / ms_within
    faxb = ms_ax_b / ms_within

    # Obtaining p-values
    p_a = stats.f.sf(f_a, df_a, df_within)
    p_b = stats.f.sf(f_b, df_b, df_within)
    paxb = stats.f.sf(faxb, dfaxb, df_within)

    # table with results from ANOVA
    results = {
        'SS': [ssq_a, ssq_b, ssqaxb, ssq_within],
        'DF': [df_a, df_b, dfaxb, df_within],
        'F': [f_a, f_b, faxb, ''],
        'PR(>F)': [p_a, p_b, paxb, '']
    }
    columns = ['SS', 'DF', 'F', 'PR(>F)']
    return pd.DataFrame(
        results,
        columns=columns,
        index=[parameter, 'bin', parameter + ':bin', 'Residual']), urllib.parse.quote(figdata_png)
df = pd.read_csv( r"C:\Users\freya\OneDrive\HSLU\6. Semester 2020FS\STAT\SW010\Übungen\Diet.csv" ) df["weight_loss"] = df["weight6weeks"] - df["pre.weight"] df.head() # Serie 10 # Aufgabe 10.1 # a) sns.boxplot(x="gender", y="weight_loss", data=df) sns.stripplot(x="gender", y="weight_loss", data=df) # b) from statsmodels.graphics.factorplots import interaction_plot interaction_plot(x=df["gender"], trace=df["Diet"], response=df["weight_loss"]) # c) interaction_plot(x=df["Diet"], trace=df["gender"], response=df["weight_loss"]) # d) fit = ols("weight_loss~gender+Diet", data=df).fit() anova_lm(fit) # e) fit = ols("weight_loss~Diet*gender", data=df).fit() anova_lm(fit) # Aufgabe 10.2 # a) df = pd.read_csv(
# In[6]:
# Integer-coded copies of the two factors.
gender_codes = {'Male': 1, 'Female': 2}
alcohol_codes = {'None': 1, '2 Pints': 2, '4 Pints': 3}
df['genderX'] = df['gender'].replace(gender_codes)
df['alcoholX'] = df['alcohol'].replace(alcohol_codes)

# In[10]:
df.groupby(['gender', 'alcohol']).describe()['attractiveness']

# In[12]:
from statsmodels.graphics.factorplots import interaction_plot

fig = interaction_plot(
    x=df.alcoholX,
    trace=df.gender,
    response=df.attractiveness,
    colors=['red', 'blue'],
    markers=['D', '^'],
    ms=10,
)

# In[26]:
_ = sns.lineplot(
    data=df,
    x='alcohol',
    y='attractiveness',
    hue='gender',
    style='gender',
    markers=['D', '^'],
    err_style="bars",
    sort=False,
)

# In[28]:
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot

# Reproducible random factors and response.
np.random.seed(12345)
weight = np.random.randint(1, 4, size=60)
duration = np.random.randint(1, 3, size=60)
days = np.log(np.random.randint(1, 30, size=60))

fig = interaction_plot(
    x=weight,
    trace=duration,
    response=days,
    colors=['red', 'blue'],
    markers=['D', '^'],
    ms=10,
)
#plt.show()
def test_plot_rainbow(self, close_figures):
    """Smoke test: markers given but no explicit colors."""
    marker_style = {'markers': ['D', '^'], 'ms': 10}
    fig = interaction_plot(self.weight, self.duration, self.days,
                           **marker_style)
# -*- coding: utf-8 -*-
"""Plot Interaction of Categorical Factors
"""

#In this example, we will visualize the interaction between
#categorical factors. First, categorical data are initialized
#and then plotted using the interaction_plot function.
#
#Author: Denis A. Engemann

# Call form of print: the statement form is a SyntaxError on Python 3,
# while this form prints the same text on Python 2 and 3.
print(__doc__)

import numpy as np
from statsmodels.graphics.factorplots import interaction_plot
from pandas import Series

np.random.seed(12345)
weight = Series(np.repeat(['low', 'hi', 'low', 'hi'], 15), name='weight')
nutrition = Series(np.repeat(['lo_carb', 'hi_carb'], 30), name='nutrition')
days = np.log(np.random.randint(1, 30, size=60))

fig = interaction_plot(weight, nutrition, days, colors=['red', 'blue'],
                       markers=['D', '^'], ms=10)

import matplotlib.pylab as plt
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot

# Reproducible random factors and response.
np.random.seed(12345)
weight = np.random.randint(1, 4, size=60)
duration = np.random.randint(1, 3, size=60)
days = np.log(np.random.randint(1, 30, size=60))

fig = interaction_plot(
    x=weight,
    trace=duration,
    response=days,
    colors=["red", "blue"],
    markers=["D", "^"],
    ms=10,
)
# plt.show()