Esempio n. 1
0
 def test_plottype(self):
     """'line' and 'scatter' plot types return a Figure; 'unknown' raises ValueError."""
     factors = (self.weight, self.duration, self.days)
     for kind in ('line', 'scatter'):
         fig = interaction_plot(*factors, plottype=kind)
         assert_equal(isinstance(fig, plt.Figure), True)
         plt.close(fig)
     assert_raises(ValueError, interaction_plot, *factors, plottype='unknown')
     # The failing call may have left a figure behind; clear everything.
     plt.close('all')
Esempio n. 2
0
 def test_plottype(self):
     """Check the 'line' and 'scatter' plot types and the rejection of 'unknown'."""
     args = (self.weight, self.duration, self.days)

     line_fig = interaction_plot(*args, plottype='line')
     assert_equal(isinstance(line_fig, plt.Figure), True)
     plt.close(line_fig)

     scatter_fig = interaction_plot(*args, plottype='scatter')
     assert_equal(isinstance(scatter_fig, plt.Figure), True)
     plt.close(scatter_fig)

     # An unsupported plot type must be reported as a ValueError.
     assert_raises(ValueError, interaction_plot, *args, plottype='unknown')
     plt.close('all')
Esempio n. 3
0
 def test_plot_both(self, close_figures):
     """Smoke test: passing colors and markers together must not raise."""
     # NOTE(review): close_figures is presumably a fixture that closes figures
     # after the test -- confirm against the test suite's conftest.
     styling = {'colors': ['red', 'blue'], 'markers': ['D', '^'], 'ms': 10}
     fig = interaction_plot(self.weight, self.duration, self.days, **styling)
Esempio n. 4
0
 def test_formatting(self, close_figures):
     """Custom colors and linestyles still produce a matplotlib Figure."""
     line_style = dict(colors=['r', 'g'], linestyles=['--', '-.'])
     result = interaction_plot(self.weight, self.duration, self.days,
                               **line_style)
     assert_equal(isinstance(result, plt.Figure), True)
 def test_plot_rainbow(self):
     """Smoke test: markers are given while colors are left unspecified."""
     marker_opts = dict(markers=['D', '^'], ms=10)
     figure = interaction_plot(self.weight, self.duration, self.days,
                               **marker_opts)
     plt.close(figure)
Esempio n. 6
0
 def test_plot_pandas(self, astype, close_figures):
     """Legend title and axis labels are taken from the names of the Series inputs.

     The x factor is cast to ``astype`` before plotting.
     """
     x_factor = Series(self.weight, name='Weight').astype(astype)
     trace_factor = Series(self.duration, name='Duration')
     response = Series(self.days, name='Days')

     fig = interaction_plot(x_factor, trace_factor, response,
                            markers=['D', '^'], ms=10)

     axis = fig.axes[0]
     legend_title = axis.get_legend().get_title().get_text()
     assert_equal(legend_title, 'Duration')
     assert_equal(axis.get_ylabel(), 'mean of Days')
     assert_equal(axis.get_xlabel(), 'Weight')
Esempio n. 7
0
 def test_plot_string_data(self):
     """Legend title and axis labels are correct when the x factor holds strings.

     The weight data is cast to ``str`` before plotting; the legend title must
     be the trace Series' name and the axis labels must come from the x and
     response Series' names.
     """
     weight = Series(self.weight, name='Weight').astype('str')
     duration = Series(self.duration, name='Duration')
     days = Series(self.days, name='Days')
     fig = interaction_plot(weight, duration, days,
                            markers=['D', '^'], ms=10)
     try:
         ax = fig.axes[0]
         trace = ax.get_legend().get_title().get_text()
         assert_equal(trace, 'Duration')
         assert_equal(ax.get_ylabel(), 'mean of Days')
         assert_equal(ax.get_xlabel(), 'Weight')
     finally:
         # Close even when an assertion fails so the figure does not stay
         # registered with pyplot and leak into later tests.
         plt.close(fig)
Esempio n. 8
0
 def test_plot_pandas(self):
     """Legend title and axis labels are taken from the names of the Series inputs."""
     weight = Series(self.weight, name='Weight')
     duration = Series(self.duration, name='Duration')
     days = Series(self.days, name='Days')
     fig = interaction_plot(weight, duration, days,
                            markers=['D', '^'], ms=10)
     try:
         ax = fig.axes[0]
         trace = ax.get_legend().get_title().get_text()
         assert trace == 'Duration'
         assert ax.get_ylabel() == 'mean of Days'
         assert ax.get_xlabel() == 'Weight'
     finally:
         # Close even when an assertion fails so the figure does not stay
         # registered with pyplot and leak into later tests.
         plt.close(fig)
Esempio n. 9
0
 def test_plot_pandas(self):
     """Check that labels on the plot come from the ``name`` of each Series.

     The trace Series' name becomes the legend title, the x Series' name the
     x-axis label, and the response Series' name appears in the y-axis label.
     """
     weight = Series(self.weight, name='Weight')
     duration = Series(self.duration, name='Duration')
     days = Series(self.days, name='Days')
     fig = interaction_plot(weight, duration, days,
                            markers=['D', '^'], ms=10)
     try:
         ax = fig.axes[0]
         trace = ax.get_legend().get_title().get_text()
         assert trace == 'Duration'
         assert ax.get_ylabel() == 'mean of Days'
         assert ax.get_xlabel() == 'Weight'
     finally:
         # plt.close must run on failure too, otherwise the open figure
         # accumulates in pyplot's global state for the rest of the session.
         plt.close(fig)
Esempio n. 10
0
 def test_plot_pandas(self, close_figures):
     """Series names must show up as legend title and axis labels."""
     named_inputs = [
         Series(self.weight, name='Weight'),
         Series(self.duration, name='Duration'),
         Series(self.days, name='Days'),
     ]
     fig = interaction_plot(*named_inputs, markers=['D', '^'], ms=10)

     axis = fig.axes[0]
     legend_title = axis.get_legend().get_title().get_text()
     assert_equal(legend_title, 'Duration')
     assert_equal(axis.get_ylabel(), 'mean of Days')
     assert_equal(axis.get_xlabel(), 'Weight')
Esempio n. 11
0
def galtonRegressInter():
    """Fit Galton height regressions and save two figures.

    Reads ``data/Galton.csv`` below ``mydir``, adds a ``Midparent`` column
    (row-wise mean of ``Father`` and ``Mother``) and fits two OLS models:
    ``Height ~ Midparent + C(Gender)`` and ``Height ~ Midparent * C(Gender)``.

    Two PNG files are written below ``mydir + 'Figures/'``:

    * ``galtonRegressInterPlot.png`` -- interaction plot of height against
      mid-parent height with one trace per gender;
    * ``galtonRegressInter.png`` -- scatter of the raw data per gender with
      the additive model's fitted line for each gender drawn on top.

    Returns
    -------
    None
    """
    IN = pd.read_csv(mydir + 'data/Galton.csv', sep=',')
    IN['Midparent'] = IN[['Father', 'Mother']].mean(axis=1)
    mod1 = smf.ols(formula='Height ~ Midparent + C(Gender)', data=IN).fit()
    # NOTE(review): the interaction model is fitted but never used below;
    # confirm whether the fitted lines were meant to come from it.
    mod2 = smf.ols(formula='Height ~ Midparent * C(Gender)', data=IN).fit()

    # ---- Figure 1: interaction plot --------------------------------------
    fig, ax = plt.subplots(figsize=(6, 6))
    midparent = IN.Midparent.values
    gender = IN.Gender.values
    height = IN.Height.values
    fig = interaction_plot(x=midparent,
                           trace=gender,
                           response=height,
                           colors=['#FF6347', '#87CEEB'],
                           markers=['D', '^'],
                           ms=10,
                           ax=ax)
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation embedded the next line's indentation in the title.
    plt.title('Interaction plot for the influence of mid-parent\n'
              'height and gender on offspring height',
              fontsize=20)
    plt.xlabel('Mid-parent height (inches)', fontsize=18)
    plt.ylabel('Mean of response', fontsize=18)
    fig_name = mydir + 'Figures/galtonRegressInterPlot.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close(fig)

    # ---- Figure 2: raw data with the additive model's fitted lines --------
    is_male = IN['Gender'] == 'M'
    is_female = IN['Gender'] == 'F'

    fig = plt.figure()
    plt.scatter(IN.loc[is_male, 'Midparent'], IN.loc[is_male, 'Height'],
                c='#87CEEB', marker='o', label='Men')
    plt.scatter(IN.loc[is_female, 'Midparent'], IN.loc[is_female, 'Height'],
                c='#FF6347', marker='o', label='Women')

    # params is a label-indexed Series, so positional access must go through
    # .iloc (plain params[0] relies on a deprecated integer fallback).
    # NOTE(review): assumes the coefficient order is intercept, gender dummy,
    # Midparent, as the original positional indexing did -- confirm.
    intercept = mod1.params.iloc[0]
    gender_coef = mod1.params.iloc[1]
    midparent_coef = mod1.params.iloc[2]
    y_pred_F = intercept + gender_coef * 0 + midparent_coef * midparent
    y_pred_M = intercept + gender_coef * 1 + midparent_coef * midparent

    # Each fitted line is drawn twice: a wide black underlay and a thinner
    # coloured line on top.  The colour is passed only via ``color`` (the
    # original also put 'k' in the format string, which conflicts with it).
    for y_pred, line_colour in ((y_pred_F, '#FF6347'), (y_pred_M, '#87CEEB')):
        plt.plot(midparent, y_pred, '-', lw=5, color='black',
                 label='_nolegend_')
        plt.plot(midparent, y_pred, '-', lw=2, color=line_colour,
                 label='_nolegend_')

    fig_name = mydir + 'Figures/galtonRegressInter.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close(fig)
Esempio n. 12
0
def nova_2way(plt_i, interaction_figure=False, qq_figure=True):
    """Run a two-way ANOVA with interaction on a balanced random subsample.

    The log-transformed data for pollutant ``pollutants[plt_i]`` is obtained
    from ``log_normal_stats``.  Rows are grouped by ``sta_i`` and ``ssn_i``
    (presumably station and season indices -- confirm with the data loader),
    every group is randomly down-sampled to the size of the smallest group,
    and an OLS model ``y ~ C(sta_i) + C(ssn_i) + C(sta_i):C(ssn_i)`` is
    fitted to the balanced sample.

    Parameters
    ----------
    plt_i : index into ``pollutants`` selecting the response column.
    interaction_figure : bool, draw an interaction plot of the full data.
    qq_figure : bool, draw a Q-Q plot of the model residuals.

    Returns
    -------
    The type-2 ANOVA table from ``anova_lm``, after being passed through
    ``eta_squared`` and ``omega_squared``.

    Notes
    -----
    Sampling uses the global ``random`` state, so results vary between calls
    unless the caller seeds it.
    """
    return_dict = log_normal_stats(plt_i, figure=False)
    df_t = return_dict["log_data"]
    pollutant = pollutants[plt_i]

    if interaction_figure:
        fig = interaction_plot(df_t["ssn_i"], df_t["sta_i"], df_t[pollutant], ms=10)
        ax = fig.axes[0]
        ax.set_xticks(range(len(seasons)))
        ax.set_xticklabels(seasons)
        ax.set_xlabel("季节")

    df_t_group = df_t.groupby(["sta_i", "ssn_i"])
    # Smallest per-group row count (taken from the first counted column);
    # cast to a plain int for random.sample.
    df_t_count_min = int(df_t_group.count().min().values[0])

    # Collect one equally sized random sample per group and concatenate once,
    # instead of growing a frame pairwise from a string placeholder.
    sampled_units = []
    for key in df_t_group.groups:
        unit = df_t_group.get_group(key)
        chosen_index = random.sample(list(unit.index), df_t_count_min)
        sampled_units.append(unit.loc[chosen_index, :])
    df_t_sample = pd.concat(sampled_units, axis=0, join="inner")

    formula = "{} ~ C(sta_i) + C(ssn_i) + C(sta_i):C(ssn_i)".format(pollutant)
    model = ols(formula, df_t_sample).fit()
    aov_table = anova_lm(model, typ=2)

    # Both helpers are called for their effect on aov_table; their return
    # values are not used here.
    eta_squared(aov_table)
    omega_squared(aov_table)

    if qq_figure:
        sm.qqplot(model.resid, line="s")

    return aov_table
Esempio n. 13
0
 def test_plot_rainbow(self):
     """Smoke test: plot with explicit markers and no explicit colors."""
     factors = (self.weight, self.duration, self.days)
     figure = interaction_plot(*factors, markers=['D', '^'], ms=10)
     plt.close(figure)
# In[14]:

# Bar plot of 'value' for each level of 'drink' in the long-format table.
plt.figure(figsize=(5, 4))
_ = sns.barplot(x='drink', y='value', data=df_long)

# In[15]:

# Same bar plot, grouped by 'atd' instead of 'drink'.
plt.figure(figsize=(5, 4))
_ = sns.barplot(x='atd', y='value', data=df_long)

# In[16]:

# Interaction plot: 'drink' on the x-axis, one trace per 'atd' level,
# 'value' as the response.  Three colors/markers are supplied, so 'atd'
# presumably has three levels -- confirm against the data.
from statsmodels.graphics.factorplots import interaction_plot
fig = interaction_plot(df_long.drink,
                       df_long.atd,
                       df_long.value,
                       colors=['red', 'blue', 'green'],
                       markers=['D', '^', '*'],
                       ms=10)

# ## Posthoc Test

# In[18]:

# Set up pairwise comparisons of 'value' between the groups in 'variable'.
from statsmodels.sandbox.stats.multicomp import MultiComparison
multicomp = MultiComparison(df_long['value'], df_long['variable'])  # testfunc

# In[19]:

# Bonferroni
# All pairwise paired t-tests (st.ttest_rel) with Bonferroni correction;
# the first element of the result is printed.
com = multicomp.allpairtest(st.ttest_rel, method='bonf')
print(com[0])
Esempio n. 15
0
 def test_plot_rainbow(self, close_figures):
     """Smoke test: markers are set explicitly, colors are not passed."""
     marker_kwargs = {'markers': ['D', '^'], 'ms': 10}
     fig = interaction_plot(self.weight, self.duration, self.days,
                            **marker_kwargs)
Esempio n. 16
0
"""

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.graphics.factorplots import interaction_plot
import matplotlib.pyplot as plt
from scipy import stats

datafile = "ToothGrowth.csv"
data = pd.read_csv(datafile)

fig = interaction_plot(data.dose,
                       data.supp,
                       data.len,
                       colors=['red', 'blue'],
                       markers=['D', '^'],
                       ms=10)
# x, category, y

N = len(data.len)
df_a = len(data.supp.unique()) - 1
df_b = len(data.dose.unique()) - 1
df_axb = df_a * df_b
df_w = N - (df_a + 1) * (df_b + 1)

grand_mean = data['len'].mean()
ssq_a = sum([(data[data.supp == l].len.mean() - grand_mean)**2
             for l in data.supp])
ssq_b = sum([(data[data.dose == l].len.mean() - grand_mean)**2
             for l in data.dose])
Esempio n. 17
0
print(ttest_rel(dpc[:, 0].mean(-1), dpc[:, 1].mean(-1)))
print(ttest_rel(dpc[:, 1].mean(-1), dpc[:, 2].mean(-1)))

# %% Try to do an ANOVA
# Build matching index grids for subject, angle and SNR.  With indexing='ij'
# the grid axes follow the argument order (subject, angle, snr); the SNR
# values are taken in reversed order.
an_sub, an_angle, an_snr = np.meshgrid(np.arange(n_sub), [0, 90, 180],
                                       snr[::-1],
                                       indexing='ij')

# Flatten everything into one long-format table with one row per grid cell.
# NOTE(review): assumes dpc has shape (n_sub, 3, len(snr)) so that reversing
# its last axis and ravelling lines up with the grids above -- confirm.
data_dict = dict(subj=an_sub.ravel(),
                 snr=an_snr.ravel(),
                 angle=an_angle.ravel(),
                 dpc=dpc[..., ::-1].ravel())
data = DataFrame(data_dict)

# Interaction plot: snr on the x-axis, one trace per angle, dpc as response.
from statsmodels.graphics.factorplots import interaction_plot
interaction_plot(data.snr, data.angle, data.dpc)


def eta_squared(aov):
    """Add an ``eta_sq`` column to an ANOVA table, in place.

    For every row except the last, ``eta_sq`` is that row's ``sum_sq``
    divided by the sum of the whole ``sum_sq`` column.  The last row is not
    part of the numerator and therefore ends up as NaN after index alignment.

    Returns the same (mutated) table.
    """
    total_sum_sq = sum(aov['sum_sq'])
    # Placeholder column; it is overwritten by the real values just below.
    aov['eta_sq'] = 'NaN'
    effect_rows = aov[:-1]
    aov['eta_sq'] = effect_rows['sum_sq'] / total_sum_sq
    return aov


def omega_squared(aov):
    """Add an ``omega_sq`` column to an ANOVA table, in place.

    The last row of the table is used as the error term:
    ``mse = sum_sq[last] / df[last]``.  For every other row::

        omega_sq = (sum_sq - df * mse) / (sum(sum_sq) + mse)

    The last row itself gets NaN (it is absent from the numerator and is
    filled in by index alignment).

    Parameters
    ----------
    aov : pandas.DataFrame with ``sum_sq`` and ``df`` columns.

    Returns
    -------
    The same (mutated) DataFrame.
    """
    # Use .iloc for the last row: plain [-1] on a label-indexed Series relies
    # on a deprecated positional fallback and fails on newer pandas.
    mse = aov['sum_sq'].iloc[-1] / aov['df'].iloc[-1]
    effect_rows = aov.iloc[:-1]
    aov['omega_sq'] = (
        (effect_rows['sum_sq'] - effect_rows['df'] * mse)
        / (sum(aov['sum_sq']) + mse)
    )
    return aov
Esempio n. 18
0
import pandas as pd
d = pd.read_csv("therms.csv")
d.columns

from statsmodels.graphics.factorplots import interaction_plot
from matplotlib import pyplot as plt

fig = interaction_plot(d['number'], d['status'], d['time'])
plt.xticks([])
plt.xlabel("")
plt.savefig("congruent-incongruent.png")

from statsmodels.formula.api import ols
ols_d = ols(formula="time ~ number * status", data=d)
myfits = ols_d.fit()
plt.clf()
f = plt.figure()
a = f.gca()

ip1 = interaction_plot(
    d['number'],
    d['status'],
    myfits.fittedvalues,
    plottype="line",
    ax=a,
)

ip2 = interaction_plot(
    d['number'],
    d['status'],
    d['time'],
df = pd.DataFrame(data=d)

df.head()
model = ols("Avg_cal_day ~ BMI_Group + Year", data=df)
results = model.fit()
df_model2 = ols("Avg_cal_day ~ BMI_Group*Year", data=df).fit()

print(sm.stats.anova_lm(results, df_model2))
print('-----------')
print(results.summary())

fig, ax = plt.subplots(figsize=(6, 6))
fig = interaction_plot(x=df['Year'],
                       trace=df['BMI_Group'],
                       response=df['Avg_cal_day'],
                       colors=['red', 'blue', 'green'],
                       markers=['D', '^', 's'],
                       ms=10,
                       ax=ax)

#fig = sm.graphics.plot_partregress_grid(df_model)
#fig.tight_layout(pad=1.0)

#Plot checker
fig2, ax2 = plt.subplots(figsize=(6, 6))


#fig2 =
def LinearRegModel(model, year=0, Overweight=0, Underweight=0):
    intercept = model.params[0]
    over_coef = model.params[1]
Esempio n. 20
0
# Look at the dispersion of eggs for each factor.
# Each boxplot call is followed by savefig, which writes the current figure.
df.boxplot(column="EGGS", by="DENSITY")
print("See graphs/ex4_boxplot_eggs_density.png")
plt.savefig(Path.cwd() / "Practical2/graphs/ex4_boxplot_eggs_density.png")
df.boxplot(column="EGGS", by="SEASON")
print("See graphs/ex4_boxplot_eggs_season.png")
plt.savefig(Path.cwd() / "Practical2/graphs/ex4_boxplot_eggs_season.png")

# And together
df.boxplot(column="EGGS", by=["DENSITY", "SEASON"])
print("See graphs/ex4_boxplot_eggs_density_season.png")
plt.savefig(Path.cwd() /
            "Practical2/graphs/ex4_boxplot_eggs_density_season.png")

# Perform two way ANOVA
# The model has both main effects plus the DENSITY:SEASON interaction term.
print("Performing two way ANOVA")
mod = ols('EGGS ~ DENSITY + SEASON + DENSITY:SEASON', data=df).fit()
print(sm.stats.anova.anova_lm(mod))
print(
    "Both the density and season affect the eggs and there IS an interaction between the two factors."
)

# Create interaction plot
# DENSITY on the x-axis, one trace per SEASON, EGGS as the response.
print("Creating interaction plot")
interaction_plot(df['DENSITY'], df['SEASON'], df['EGGS'])
print("See graphs/ex4_interaction_plot.png")
plt.savefig(Path.cwd() / "Practical2/graphs/ex4_interaction_plot.png")

print("More eggs are laid during spring")
print("Lines are not parallel so an interaction occurs.")
Df.loc[:, "Sbar"] = Df[["S1", "S2"]].apply(statistics.mean, axis=1)
Df.loc[:, "S_lns2"] = Df[["S1", "S2"]].apply(
    statistics.variance,
    axis=1).apply(lambda x: math.log(x) if x != 0 else math.log(0.1**20))

# Main-effects plots of the mean response (Sbar) for factors A..F on a
# shared 2x3 grid, saved as MainEffPlt.png.
f, axes = plt.subplots(2, 3, sharex=True, sharey=True)
# sns.factorplot is a figure-level function that manages its own figure, so
# it is not the right tool for drawing onto the axes created above.
# sns.pointplot is the axes-level equivalent of its default "point" kind and
# accepts ``ax``, so the panels land on the grid that is saved below.
for factor, panel in zip("ABCDEF", axes.ravel()):
    g = sns.pointplot(x=factor, y="Sbar", data=Df, ci=None, ax=panel)
plt.tight_layout()
f.savefig("MainEffPlt.png")

fig1 = interaction_plot(Df.A, Df.C, Df.Sbar)
fig2 = interaction_plot(Df.A, Df.E, Df.Sbar)

#frames=[DesMat,DesMat]
#Df1=pd.concat(frames)
#Df1.loc[:,"Y"]=Df.S1.tolist()+Df.S2.tolist()
#
#Df1.to_csv("Q9Dat.csv")

# Main-effects plots of the log-variance response (S_lns2) for factors A..F
# on a shared 2x3 grid.
f2, axes1 = plt.subplots(2, 3, sharex=True, sharey=True)
# sns.factorplot is a figure-level function that manages its own figure;
# sns.pointplot is the axes-level equivalent of its default "point" kind and
# accepts ``ax``, so each panel is drawn on the grid created above.
for factor, panel in zip("ABCDEF", axes1.ravel()):
    g = sns.pointplot(x=factor, y="S_lns2", data=Df, ci=None, ax=panel)
## Plot Interaction of Categorical Factors

# In this example, we will visualize the interaction between categorical factors. First, some categorical data are initialized. Then they are plotted using the interaction_plot function, which internally recodes the x-factor categories to integers.

import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot
from pandas import Series
# Reproducible synthetic data: two categorical factors and a numeric response.
np.random.seed(12345)
weight = Series(np.repeat(['low', 'hi', 'low', 'hi'], 15), name='weight')
nutrition = Series(np.repeat(['lo_carb', 'hi_carb'], 30), name='nutrition')
days = np.log(np.random.randint(1, 30, size=60))
# interaction_plot opens a figure of its own unless it is handed an Axes, so
# create the 6x6 figure together with its axes and pass the axes explicitly;
# a bare plt.figure(figsize=...) would just leave an empty figure behind.
fig, ax = plt.subplots(figsize=(6, 6))
interaction_plot(x=weight, trace=nutrition, response=days,
                 colors=['red', 'blue'], markers=['D', '^'], ms=10,
                 ax=ax)


#     <matplotlib.figure.Figure at 0x106dd2a10>

# image file:

# image file:
Esempio n. 23
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 15:56:53 2021

@author: mattias
"""

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot
pd.set_option('display.max_columns', None)

data = pd.read_excel(r"/home/mattias/Documents/class/hw8/hw8_q1.xlsx",
                     engine='openpyxl')

# Interaction plot: connector type on the x-axis, one trace per battery
# temperature, discharge time as the response.
fig, ax = plt.subplots(figsize=(6, 6))
# Pass the axes created above; without ``ax`` interaction_plot opens a second
# figure and the 6x6 one is shown empty.
fig = interaction_plot(x=data['connector_type'],
                       trace=data['battery_temp'],
                       response=data['discharge_time_mins'],
                       colors=['red', 'blue'],
                       markers=['D', '^'],
                       ms=10,
                       ax=ax)
plt.show()
Esempio n. 24
0
       134.9, 146.3, 145.2, 146.3, 125.9, 127.6, 108.9, 107.5,
       148.6, 156.5, 148.6, 153.1, 135.5, 138.9, 132.1, 149.7, 
       152.0, 151.4, 149.7, 152.0, 142.9, 142.3, 141.7, 141.2] 
fac=np.array([1,2,3,4])
day=np.repeat(fac,8, axis=0)
machine=np.concatenate((np.repeat(fac,2, axis=0), np.repeat(fac,2, axis=0),np.repeat(fac,2, axis=0),np.repeat(fac,2, axis=0)))
trigly = {'y': y , 'day': pd.Categorical(day), 'machine': pd.Categorical(machine)}
trigly = pd.DataFrame(data=trigly)
trigly.info()
print(pd.Categorical(day).categories)
print(pd.Categorical(machine).categories)
pd.crosstab(day, machine,rownames=['day'],colnames=['machine'])
## plot
from statsmodels.graphics.factorplots import interaction_plot
fig, ax = plt.subplots(figsize=(6, 6))
fig = interaction_plot(x=day, trace=machine, response=y,colors=['red', 'blue','brown','black'], markers=['.', '^','*','D'], ms=10, ax=ax)
## fitting model 
md2 = smf.mixedlm("y ~  (1-day)+(1-machine) + (1-day*machine) ", trigly, groups=machine)
mdf2 = md2.fit()
mdf2.summary()
print(mdf2.tvalues)
## nesting
## pastes data
Pastes=pd.read_csv('Pastes.csv',sep=" ") 
Pastes.head()
Pastes.info()
from pandas.api.types import CategoricalDtype
cask=Pastes["cask"]
batch=Pastes["batch"]
strength=Pastes["strength"]
## ggplot 
import matplotlib.pyplot as plt
from patsy.contrasts import Sum

Daten = DataFrame({
    "Batch":
    np.tile(["1", "2", "3", "4", "5", "6"], 4),
    "Methode":
    np.repeat(["8500", "8700", "8900", "9100"], 6),
    "Y":
    np.array([
        90.3, 89.2, 98.2, 93.9, 87.4, 97.9, 92.5, 89.5, 90.6, 94.7, 87, 95.8,
        85.5, 90.8, 89.6, 86.2, 88, 93.4, 82.5, 89.5, 85.6, 87.4, 78.9, 90.7
    ])
})

interaction_plot(x=Daten["Batch"], trace=Daten["Methode"], response=Daten["Y"])
plt.ylabel("Daten Y")
plt.show()

# =============================================================================
# Zweiweg-Varianzanalyse mit Blöcken
# =============================================================================
from patsy.contrasts import Sum
fit = ols("Y ~ C(Methode, Sum)+C(Batch,Sum)", data=Daten).fit()
fit.params

fit = ols("Y ~ C(Methode, Sum)+C(Batch, Sum)", data=Daten).fit()
anova_lm(fit)

# =============================================================================
# Flugzeugfarbe
plt.show()

# 2-3. RS Analysis: Do both "Era" and "League" affect team "RS"?
# two-factor ANOVA F-test
# factor 1: "Era" and factor 2: "League"
model = ols("RS ~ C(Era) + C(League) + C(Era):C(League)",
            data=batting_df).fit()
two_aov_table = sm.stats.anova_lm(model, typ=2)
print("------- Two-factor ANOVA Table -------")
print(two_aov_table.round(3))

# interaction plot
fig, ax = plt.subplots(figsize=(9, 6))
interaction_plot(x=batting_df["League"],
                 trace=batting_df["Era"],
                 response=batting_df["RS"],
                 colors=['#4c061d', '#d17a22', '#b4c292'],
                 ax=ax)
plt.title("Two-factor ANOVA Interaction Plot", fontsize=16)
plt.ylabel("Mean RS")
plt.show()

# check ANOVA assumptions
# normality
fig = sm.qqplot(model.resid, line="s")
plt.title("Two-factor ANOVA QQ Plot")
plt.show()

# equal-variance
g = sns.FacetGrid(batting_df, col="Era", row="League", height=4, aspect=1)
g.map_dataframe(sns.boxplot,
Esempio n. 27
0
            ins = pd.DataFrame([[col, row, x]],
                               columns=["Occupation", "Location", "Salaries"])
            plot_df = plot_df.append(ins)

# convert the Salaries column from object dtype to numeric
plot_df['Salaries'] = pd.to_numeric(plot_df['Salaries'])
# reset the index so that it looks tidier (optional)
plot_df.reset_index(drop=True, inplace=True)
# check the data frame before plotting
plot_df

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 8))
interaction_plot(plot_df["Occupation"],
                 plot_df["Location"],
                 plot_df["Salaries"],
                 colors=['red', 'blue'],
                 func=np.mean,
                 markers=['s', '^'],
                 ms=5,
                 ax=axes[0])
axes[0].legend(bbox_to_anchor=(1, .5), edgecolor='white', loc='center left')
interaction_plot(plot_df["Location"],
                 plot_df["Occupation"],
                 plot_df["Salaries"],
                 colors=['red', 'blue'],
                 func=np.mean,
                 markers=['s', '^'],
                 ms=5,
                 ax=axes[1])
axes[1].legend(bbox_to_anchor=(1, .5), edgecolor='white', loc='center left')

############################################################################
Esempio n. 28
0
 def test_plot_both(self):
     """Smoke test: colors and markers passed together must not raise."""
     styling = {'colors': ['red', 'blue'], 'markers': ['D', '^'], 'ms': 10}
     figure = interaction_plot(self.weight, self.duration, self.days,
                               **styling)
     plt.close(figure)
Esempio n. 29
0
def TWANOVA(data, x1, x2, y):
    """Two-way ANOVA with interaction, computed directly from sums of squares.

    Input - tab-separated file with data, independent factor 1,2; dependent
    factor (as col names, string).

    Calculating sum of squares (SS):
    Total (SSt), Between-Groups (SSb) for each factor,
    Within-Group (Error or SSw)
    and interaction SSi variability.
    SSt = SSx1+SSx2+SSi+SSw
    Adopted from https://www.marsja.se/three-ways-to-carry-out-2-way-anova-with-python/

    Parameters
    ----------
    data : path (or file-like) of a tab-separated table with a header row.
    x1, x2 : str, column names of the two factors.
    y : str, column name of the response.

    Side effects
    ------------
    Prints the ANOVA table, draws an interaction plot and prints a Tukey HSD
    comparison of the combined ``x1_x2`` groups.  Returns None.
    """
    import pandas as pd
    from statsmodels.graphics.factorplots import interaction_plot
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    from scipy import stats

    # import the data
    data = pd.read_csv(data, sep='\t', header=(0))
    response = data[y]

    # Grand mean
    grand_mean = response.mean()

    # Group means broadcast back to every row; this replaces recomputing the
    # group mean once per observation.  skipna=False keeps NaN propagation
    # like the built-in sum() the sums were originally computed with.
    mean_by_x1 = data.groupby(x1)[y].transform('mean')
    mean_by_x2 = data.groupby(x2)[y].transform('mean')
    mean_by_cell = data.groupby([x1, x2])[y].transform('mean')

    # SS total
    SSt = ((response - grand_mean)**2).sum(skipna=False)

    # SS for factors x1 and x2 (each observation contributes its group's
    # squared deviation, i.e. the group term weighted by group size)
    SSx1 = ((mean_by_x1 - grand_mean)**2).sum(skipna=False)
    SSx2 = ((mean_by_x2 - grand_mean)**2).sum(skipna=False)

    # SS within (error/residual): deviation from the (x1, x2) cell mean
    SSw = ((response - mean_by_cell)**2).sum(skipna=False)

    # SS interaction
    SSi = SSt - SSx1 - SSx2 - SSw

    # degrees of freedom
    N = len(response)
    levels_x1 = len(data[x1].unique())
    levels_x2 = len(data[x2].unique())
    df_x1 = levels_x1 - 1  # levels of factor -1
    df_x2 = levels_x2 - 1
    df_i = df_x1 * df_x2
    df_w = N - (levels_x1 * levels_x2)

    # mean squares
    MS_x1 = SSx1 / df_x1
    MS_x2 = SSx2 / df_x2
    MS_i = SSi / df_i
    MS_w = SSw / df_w

    # F-ratio
    f_x1 = MS_x1 / MS_w
    f_x2 = MS_x2 / MS_w
    f_i = MS_i / MS_w

    # p-values
    p_x1 = stats.f.sf(f_x1, df_x1, df_w)
    p_x2 = stats.f.sf(f_x2, df_x2, df_w)
    p_i = stats.f.sf(f_i, df_i, df_w)

    # printing results
    results = {
        'SS': [SSx1, SSx2, SSi, SSw],
        'df': [df_x1, df_x2, df_i, df_w],
        'F': [f_x1, f_x2, f_i, 'NaN'],
        'PR(>F)': [p_x1, p_x2, p_i, 'NaN']
    }
    columns = ['SS', 'df', 'F', 'PR(>F)']

    # Label the rows with the actual factor names rather than names fixed to
    # one dataset; with x1='Genotype', x2='Treatment' the labels are the
    # same as before.
    table = pd.DataFrame(
        results,
        columns=columns,
        index=[x1, x2, '{}x{}'.format(x1, x2), 'Residual'])
    print(table)

    # interaction plot
    # NOTE(review): two colors/markers are supplied, so this presumably
    # expects x2 to have exactly two levels -- confirm for other data.
    interaction_plot(data[x1],
                     data[x2],
                     response,
                     colors=['red', 'blue'],
                     markers=['D', '^'],
                     ms=10)

    # post-hoc Tukey's test on the combined "<x1>_<x2>" group labels
    x1_x2 = (data[x1] + '_' + data[x2]).tolist()
    print(pairwise_tukeyhsd(response, x1_x2, alpha=0.05))
Esempio n. 30
0
 def test_formatting(self):
     """Custom colors and linestyles still produce a matplotlib Figure."""
     line_style = {'colors': ['r', 'g'], 'linestyles': ['--', '-.']}
     result = interaction_plot(self.weight, self.duration, self.days,
                               **line_style)
     assert_equal(isinstance(result, plt.Figure), True)
     plt.close(result)
Esempio n. 31
0
# Display whatever was plotted before this point.
plt.show()

# Box plots of achieved yield grouped by product mix: first with the pandas
# DataFrame.boxplot API, then with seaborn.
DOE_Plot8_1 = df_consol_final.boxplot(by='Product_mix',
                                      column=['Achieved_Yield_from_Mfg'],
                                      grid=False,
                                      fontsize=5)
plt.show()
DOE_Plot8_2 = sns.boxplot(x='Product_mix',
                          y='Achieved_Yield_from_Mfg',
                          data=df_consol_final,
                          width=0.5,
                          palette="colorblind")
plt.show()

# Interaction plots of achieved yield for each pair of factors
# (x factor, trace factor): machines x operators, machines x product mix,
# operators x product mix.
DOE_Plot9 = interaction_plot(df_consol_final.Machine_Count,
                             df_consol_final.Operators_Count,
                             df_consol_final.Achieved_Yield_from_Mfg,
                             ms=10)
plt.show()

DOE_Plot10 = interaction_plot(df_consol_final.Machine_Count,
                              df_consol_final.Product_mix,
                              df_consol_final.Achieved_Yield_from_Mfg,
                              ms=10)
plt.show()

DOE_Plot11 = interaction_plot(df_consol_final.Operators_Count,
                              df_consol_final.Product_mix,
                              df_consol_final.Achieved_Yield_from_Mfg,
                              ms=10)
plt.show()
Esempio n. 32
0
    def btnTwoWayAnova_Click(self, m_widget):
        """Run a two-way ANOVA (no interaction term) on the GUI table and show it.

        Steps:

        1. Build a DataFrame from ``m_widget``; stop silently if it is empty
           and show an info box if it has fewer than 3 rows.
        2. Make sure every column name is a string starting with a letter,
           otherwise rename the columns to ``column_1..column_3``.
        3. Ask the user (``TwoWayAnovaWizard``) for the dependent variable and
           an optional post hoc variable; the dependent column must be float.
        4. Fit ``dependent ~ C(second) + C(third)`` with OLS and compute a
           type-2 ANOVA table.
        5. If a post hoc variable with at least 3 distinct values was chosen,
           append Bonferroni-corrected multiple comparisons.
        6. Build an interaction plot and a per-group box plot (each on a
           best-effort basis) and open a ``ResultsPopup`` with text and plots.

        Parameters
        ----------
        m_widget : the GUI table widget passed to ``create_pandas_DataFrame``.

        Returns
        -------
        None
        """
        # Create a pandas DataFrame from the GUI table:
        dataframe = self.create_pandas_DataFrame(m_widget)
        if dataframe.empty:
            return
        # The table must have at least 3 rows:
        if len(dataframe) < 3:  # number of rows = len(dataframe)
            tkinter.messagebox.showinfo('Two-way ANOVA', 'You must have at least 3 values for each group.')
            return

        column_names = list(dataframe)  # unsorted
        # ~~~ Each column name must start with a letter (a-z or A-Z)! ~~~
        # Names that are not strings or do not start with a letter are
        # replaced by generic ones.  This is an explicit check instead of
        # ``assert`` inside a bare ``except`` (asserts vanish under -O and the
        # bare except hid unrelated errors).
        pattern = re.compile(r'^[A-Za-z]')
        names_ok = all(isinstance(x, str) and pattern.search(x) for x in column_names)
        if not names_ok:
            dataframe.columns = ['column_1', 'column_2', 'column_3']
            column_names = list(dataframe)

        # ~~~ Launch the two-way ANOVA wizard ~~~
        wiz = ach_generic.TwoWayAnovaWizard(self, settings=tuple(column_names))
        if wiz.result is None:  # The user presses Cancel
            return
        dependent_var = wiz.result[0]  # just the column name
        posthoc_var = wiz.result[1]  # just the column name
        # ~~~ Get the other two variables from the column_names list ~~~
        other_vars = [str(x) for x in column_names if str(x) != dependent_var]
        second_var = other_vars[0]
        third_var = other_vars[1]
        if dataframe.dtypes[dependent_var] != float:
            tk.messagebox.showerror('Statistics', 'The dependent variable must be continuous')
            return

        # statmodels uses R-like model notation.
        # Two-way ANOVA with interactions: formula = 'len ~ C(supp) + C(dose) + C(supp):C(dose)'
        # Two-way ANOVA without interactions: formula = 'len ~ C(supp) + C(dose)'
        formula = '%s ~ C(%s) + C(%s)' % (dependent_var, second_var, third_var)
        model = ols(formula, dataframe).fit()
        aov_table1 = anova_lm(model, typ=2)
        results_string = ''
        results_string += '\n~~~ Two-way ANOVA without interactions ~~~\n'
        results_string += formula + '\n'
        results_string += aov_table1.to_string()
        results_string += '\n'

        # When a post hoc variable was chosen it becomes the third variable
        # and the remaining non-dependent column becomes the second one.
        run_posthoc = posthoc_var != ''
        if run_posthoc:
            second_var = [x for x in column_names if x not in (dependent_var, posthoc_var)][0]
            third_var = posthoc_var
        c1 = dataframe[dependent_var]
        c2 = dataframe[second_var]
        c3 = dataframe[third_var]

        # ~~~ Bonferroni's correction ~~~
        if run_posthoc:
            # The post hoc column must have at least 3 unique values.
            if len(c3.unique()) < 3:
                info = 'Post hoc test \'Bonferroni\' should have at least 3 values in column \'%s\'' % posthoc_var
                tk.messagebox.showerror('Statistics', info)
            else:
                p_v_cor, corresponding_groups = multiple_comparisons_with_bonferroni(c1, c3)
                dataframe_bon = self.create_bonferroni_dataframe(p_v_cor, corresponding_groups)
                results_string += '\n~~~ Post hoc test: Multiple comparisons with Bonferroni correction ~~~\n'
                results_string += dataframe_bon.to_string()

        # Plots:
        plt.close('all')
        plots = []
        # The interaction plot is best-effort: two colors/markers are fixed, and
        # the original author noted that plotting fails when posthoc_var is
        # 'supp'.  A failure is reported on stdout and the popup still opens.
        try:
            plot1 = interaction_plot(c3, c2, c1, colors=['red', 'blue'], markers=['D', '^'], ms=10)
            plots.append(plot1)
        except Exception as e:
            print('Error in plot1:', str(e))

        # Boxplot of the dependent variable, one box per level of the third
        # variable; also best-effort.
        try:
            fig_boxplot = plt.figure()
            groups = [c1[c3 == level] for level in c3.unique()]
            plt.boxplot(groups)
            plt.title('Outlier detection', figure=fig_boxplot)
            plt.xlabel(third_var, figure=fig_boxplot)
            plt.ylabel(dependent_var, figure=fig_boxplot)
            plots.append(fig_boxplot)
        except Exception as e:
            print('Error in Boxplot:', str(e))

        ach_generic.ResultsPopup(self, settings=results_string, plots=plots)
Esempio n. 33
0
def two_way_anova(dataframe_a, dataframe_b, parameter, parm_val_a, parm_val_b,
                  bin_var):
    """Perform a regular two-way ANOVA for a feature measured over bins.

    The two factors are ``parameter`` (two groups, one per input frame) and
    ``'bin'``; the response is the ``bin_var`` column of the long-format
    frames produced by ``bin_dataframe_generator``.

    Arguments
    ---
    dataframe_a, dataframe_b: pandas DataFrame
        Spreadsheet input, in the FastStat case coming from a filtered data
        frame from DataSet object. Both must contain a column named
        ``bin_var + ' bin 1'``.

    parameter: str
        Name of the variable for the two-way measurement. A column with this
        name is written into both binned frames.

    parm_val_a, parm_val_b: str
        Value of ``parameter`` assigned to every row coming from
        ``dataframe_a`` / ``dataframe_b`` respectively.

    bin_var: str
        Name of the bin variable (matched literally against column names).

    Returns
    ---
    tuple
        ``(table, figure)`` where ``table`` is a pandas.DataFrame with
        columns ``SS``, ``DF``, ``F``, ``PR(>F)`` and rows ``parameter``,
        ``'bin'``, ``parameter + ':bin'`` and ``'Residual'``, and ``figure``
        is the interaction plot as a URL-quoted, base64-encoded PNG string.
    """

    # Count the bin columns for the given bin variable. The name is matched
    # literally (regex=False) so that names containing regex metacharacters
    # such as '(' or '.' are counted the same way get_loc() looks them up.
    bin_num = dataframe_a.columns.str.contains(bin_var + ' bin ',
                                               regex=False).sum()

    # NOTE(review): the second DataSet argument is the name of the column
    # located bin_num positions after the first bin column; its meaning is
    # defined by DataSet, which is not visible here.
    subdataset_a = DataSet(
        dataframe_a,
        dataframe_a.columns[dataframe_a.columns.get_loc(bin_var + ' bin 1') +
                            bin_num])
    subdataset_b = DataSet(
        dataframe_b,
        dataframe_b.columns[dataframe_b.columns.get_loc(bin_var + ' bin 1') +
                            bin_num])
    bin_dataset_a = bin_dataframe_generator(
        bins_subset(subdataset_a.data_frame, bin_var), bin_var)

    bin_dataset_b = bin_dataframe_generator(
        bins_subset(subdataset_b.data_frame, bin_var), bin_var)
    bin_dataset_a[parameter] = parm_val_a
    bin_dataset_b[parameter] = parm_val_b
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    anova_dataset = pd.concat([bin_dataset_a, bin_dataset_b])

    fig = interaction_plot(anova_dataset['bin'],
                           anova_dataset[parameter],
                           anova_dataset[bin_var],
                           colors=['red', 'blue'],
                           markers=['D', '^'],
                           ms=10)

    # Render the returned figure explicitly (not whatever pyplot considers
    # "current") and always release it so repeated calls do not accumulate
    # open figures.
    figfile = BytesIO()
    try:
        fig.savefig(figfile, format='png')
    finally:
        plt.close(fig)
    figdata_png = base64.b64encode(figfile.getvalue())

    # Degrees of freedom - df
    # Factor A is `parameter`, factor B is 'bin', matching the sums of
    # squares below and the row order of the result table.

    n = len(anova_dataset[bin_var])
    levels_a = len(anova_dataset[parameter].unique())
    levels_b = len(anova_dataset['bin'].unique())
    df_a = levels_a - 1
    df_b = levels_b - 1
    dfaxb = df_a * df_b
    df_within = n - levels_a * levels_b

    # Sum of squares - ssq (factors A, B and total)
    # transform('mean') yields, for every row, the mean of that row's group,
    # so summing over rows weights each level by its number of observations.

    response = anova_dataset[bin_var]
    grand_mean = response.mean()
    means_by_a = anova_dataset.groupby(parameter)[bin_var].transform('mean')
    means_by_b = anova_dataset.groupby('bin')[bin_var].transform('mean')
    ssq_a = sum((means_by_a - grand_mean)**2)
    ssq_b = sum((means_by_b - grand_mean)**2)
    ssq_total = sum((response - grand_mean)**2)

    # Sum of Squares Within (error/residual)
    # Each input frame holds a single level of `parameter`, so the per-bin
    # mean inside one frame is the cell mean of the design.

    cell_means_a = bin_dataset_a.groupby('bin')[bin_var].transform('mean')
    cell_means_b = bin_dataset_b.groupby('bin')[bin_var].transform('mean')
    ssq_within = sum((bin_dataset_b[bin_var] - cell_means_b)**2) + sum(
        (bin_dataset_a[bin_var] - cell_means_a)**2)

    # Sum of Squares Interaction

    ssqaxb = ssq_total - ssq_a - ssq_b - ssq_within

    # Mean Squares

    ms_a = ssq_a / df_a
    ms_b = ssq_b / df_b
    ms_ax_b = ssqaxb / dfaxb
    ms_within = ssq_within / df_within

    # F-ratio

    f_a = ms_a / ms_within
    f_b = ms_b / ms_within
    faxb = ms_ax_b / ms_within

    # Obtaining p-values (upper tail of the F distribution)

    p_a = stats.f.sf(f_a, df_a, df_within)
    p_b = stats.f.sf(f_b, df_b, df_within)
    paxb = stats.f.sf(faxb, dfaxb, df_within)

    # table with results from ANOVA; the residual row has no F or p-value

    results = {
        'SS': [ssq_a, ssq_b, ssqaxb, ssq_within],
        'DF': [df_a, df_b, dfaxb, df_within],
        'F': [f_a, f_b, faxb, ''],
        'PR(>F)': [p_a, p_b, paxb, '']
    }
    columns = ['SS', 'DF', 'F', 'PR(>F)']

    return pd.DataFrame(
        results,
        columns=columns,
        index=[parameter, 'bin', parameter + ':bin',
               'Residual']), urllib.parse.quote(figdata_png)
Esempio n. 34
0
# Two-factor analysis of the diet data set (gender x Diet on weight change).
# NOTE(review): pd, sns, ols and anova_lm are not imported in this snippet --
# presumably pandas, seaborn, statsmodels.formula.api.ols and
# statsmodels.stats.anova.anova_lm; confirm against the full script.

# Load the data from a hard-coded, machine-specific path.
df = pd.read_csv(
    r"C:\Users\freya\OneDrive\HSLU\6. Semester 2020FS\STAT\SW010\Übungen\Diet.csv"
)
# Response variable: weight after six weeks minus the initial weight, so a
# negative value means weight was lost.
df["weight_loss"] = df["weight6weeks"] - df["pre.weight"]
# Only displays output in an interactive session; the result is discarded.
df.head()

# Series 10
# Exercise 10.1
# a) Box plot of the response per gender, followed by a strip plot of the
#    same variables.
sns.boxplot(x="gender", y="weight_loss", data=df)
sns.stripplot(x="gender", y="weight_loss", data=df)

# b) Interaction plot with gender on the x-axis and one trace per Diet.
from statsmodels.graphics.factorplots import interaction_plot
interaction_plot(x=df["gender"], trace=df["Diet"], response=df["weight_loss"])

# c) Same plot with the roles swapped: Diet on the x-axis, one trace per gender.
interaction_plot(x=df["Diet"], trace=df["gender"], response=df["weight_loss"])

# d) Model with the two main effects only (assuming statsmodels formula
#    syntax, where "+" adds terms without an interaction).
fit = ols("weight_loss~gender+Diet", data=df).fit()
anova_lm(fit)

# e) Model with both main effects and their interaction (assuming statsmodels
#    formula syntax, where "*" expands to main effects plus interaction).
fit = ols("weight_loss~Diet*gender", data=df).fit()
anova_lm(fit)

# Exercise 10.2
# a)
df = pd.read_csv(
Esempio n. 35
0
# Notebook export: interaction of gender and alcohol on attractiveness.
# NOTE(review): df and sns are defined outside this snippet -- presumably a
# pandas DataFrame and seaborn; confirm against the full notebook.

# In[6]:

# Add integer-coded copies of the two factors; the original string columns
# are kept unchanged.
df['genderX'] = df['gender'].replace({'Male': 1, 'Female': 2})
df['alcoholX'] = df['alcohol'].replace({'None': 1, '2 Pints': 2, '4 Pints': 3})

# In[10]:

# Descriptive statistics of attractiveness for every gender/alcohol cell.
df.groupby(['gender', 'alcohol']).describe()['attractiveness']

# In[12]:

# Interaction plot: coded alcohol level on the x-axis, one trace per gender.
from statsmodels.graphics.factorplots import interaction_plot
fig = interaction_plot(df.alcoholX,
                       df.gender,
                       df.attractiveness,
                       colors=['red', 'blue'],
                       markers=['D', '^'],
                       ms=10)

# In[26]:

# Same relationship drawn with seaborn; sort=False keeps the row order of
# the data instead of sorting the x values.
_ = sns.lineplot(x='alcohol',
                 y='attractiveness',
                 hue='gender',
                 err_style="bars",
                 sort=False,
                 data=df,
                 style='gender',
                 markers=['D', '^'])

# In[28]:
Esempio n. 36
0
# Minimal interaction_plot example on simulated integer-coded factors.
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.graphics.factorplots import interaction_plot

# Fixed seed so the simulated data are the same on every run.
np.random.seed(12345)

# Draw order matters for reproducibility: weight, then duration, then days.
weight = np.random.randint(1, 4, size=60)
duration = np.random.randint(1, 3, size=60)
days = np.log(np.random.randint(1, 30, size=60))

# weight on the x-axis, one trace per duration level, days as the response.
fig = interaction_plot(
    x=weight,
    trace=duration,
    response=days,
    colors=['red', 'blue'],
    markers=['D', '^'],
    ms=10,
)
# Uncomment to display the figure interactively.
#plt.show()
Esempio n. 37
0
 def test_plot_rainbow(self, close_figures):
     """Smoke-test interaction_plot with custom markers and no colors given."""
     marker_styles = ['D', '^']
     fig = interaction_plot(self.weight, self.duration, self.days,
                            markers=marker_styles, ms=10)
# -*- coding: utf-8 -*-
"""Plot Interaction of Categorical Factors
"""

# In this example, we will visualize the interaction between
# categorical factors. First, categorical data are initialized
# and then plotted using the interaction_plot function.
#
# Author: Denis A. Engemann

import matplotlib.pyplot as plt
import numpy as np
from pandas import Series
from statsmodels.graphics.factorplots import interaction_plot

# print() call form works on both Python 2 and Python 3; the bare
# ``print __doc__`` statement is a SyntaxError on Python 3.
print(__doc__)

# Fixed seed so the simulated response is the same on every run.
np.random.seed(12345)
# Two categorical factors as named Series, 60 observations each.
weight = Series(np.repeat(['low', 'hi', 'low', 'hi'], 15), name='weight')
nutrition = Series(np.repeat(['lo_carb', 'hi_carb'], 30), name='nutrition')
days = np.log(np.random.randint(1, 30, size=60))

# weight on the x-axis, one trace per nutrition level, days as the response.
fig = interaction_plot(weight, nutrition, days, colors=['red', 'blue'],
                       markers=['D', '^'], ms=10)

plt.show()
# Interaction plot of simulated data: two integer-coded factors and a
# log-transformed response.
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.graphics.factorplots import interaction_plot

# Seed first; the three draws below must stay in this order to reproduce
# the same data.
np.random.seed(12345)
weight = np.random.randint(low=1, high=4, size=60)
duration = np.random.randint(low=1, high=3, size=60)
days = np.log(np.random.randint(low=1, high=30, size=60))

# Styling for the two traces, one entry per trace.
trace_colors = ["red", "blue"]
trace_markers = ["D", "^"]
fig = interaction_plot(
    weight, duration, days, colors=trace_colors, markers=trace_markers, ms=10
)

# plt.show()