def plot_interaction(data_lastDV):
    """
    Plot an interaction plot and a grouped boxplot side by side in one figure.

    Rows containing missing values are dropped before plotting.

    :param data_lastDV: data frame whose first two columns are the independent
        variables (factors) and whose third column is the dependent variable
    :return: None
    """
    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()

    # Create both axes up front so each plot can be handed its own target axes.
    fig, (ax_interaction, ax_box) = plt.subplots(1, 2)

    # NOTE(review): two colors/markers are hard-coded, which presumably requires
    # the second column to have exactly two levels -- confirm against callers.
    interaction_plot(factor_groups[col_names[0]],
                     factor_groups[col_names[1]],
                     factor_groups[col_names[2]],
                     colors=['red', 'blue'],
                     markers=['D', '^'],
                     ms=10,
                     ax=ax_interaction)

    # DataFrame.boxplot opens a brand-new figure unless it is given explicit
    # axes; passing ax= keeps the boxplot in the right-hand subplot.
    factor_groups.boxplot(return_type='axes',
                          column=col_names[2],
                          by=[col_names[0], col_names[1]],
                          ax=ax_box)
    plt.show()
plt.ylabel('Salary')

# From our first look at the data, the difference between Master's and PhD
# in the management group is different than in the non-management group.
# This is an interaction between the two qualitative variables management,M
# and education,E. We can visualize this by first removing the effect of
# experience, then plotting the means within each of the 6 groups using
# interaction.plot.

U = S - X * interX_lm32.params['X']

plt.figure(figsize=(6, 6))
interaction_plot(E,
                 M,
                 U,
                 colors=['red', 'blue'],
                 markers=['^', 'D'],
                 markersize=10,
                 ax=plt.gca())

# ## Minority Employment Data

# Prefer a local copy of the data set; download it only when the local file
# cannot be opened.
try:
    jobtest_table = pd.read_table('jobtest.table')
except OSError:  # don't have data already (file missing or unreadable)
    url = 'http://stats191.stanford.edu/data/jobtest.table'
    jobtest_table = pd.read_table(url)

factor_group = jobtest_table.groupby(['MINORITY'])

fig, ax = plt.subplots(figsize=(6, 6))
Example #3
0
    idx = group.index
    plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],
                s=144, edgecolors='black')
    # drop NA because there is no idx 32 in the final model
    plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),
            ls=lstyle[j], color=colors[i-1])
plt.xlabel('Experience');
plt.ylabel('Salary');


# From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management,M and education,E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction.plot.

U = S - X * interX_lm32.params['X']

plt.figure(figsize=(6,6))
interaction_plot(E, M, U, colors=['red','blue'], markers=['^','D'],
        markersize=10, ax=plt.gca())


# ## Minority Employment Data

# Prefer a local copy of the data set; download it only when the local file
# cannot be opened.
try:
    minority_table = pandas.read_table('minority.table')
except OSError:  # don't have data already (file missing or unreadable)
    url = 'http://stats191.stanford.edu/data/minority.table'
    minority_table = pandas.read_table(url)

factor_group = minority_table.groupby(['ETHN'])

plt.figure(figsize=(6,6))
colors = ['purple', 'green']
markers = ['o', 'v']
    i, j = values
    idx = group.index
    plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i - 1], s=144, edgecolors="black")
    # drop NA because there is no idx 32 in the final model
    plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(), ls=lstyle[j], color=colors[i - 1])
plt.xlabel("Experience")
# @savefig fitted_drop32.png align=center
plt.ylabel("Salary")

# From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management,M and education,E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction.plot.

U = S - X * interX_lm32.params["X"]

plt.figure(figsize=(6, 6))
# @savefig interaction_plot.png align=center
interaction_plot(E, M, U, colors=["red", "blue"], markers=["^", "D"], markersize=10, ax=plt.gca())

# Minority Employment Data
# ------------------------

# Prefer a local copy of the data set; download it only when the local file
# cannot be opened.
try:
    minority_table = pandas.read_table("minority.table")
except OSError:  # don't have data already (file missing or unreadable)
    url = "http://stats191.stanford.edu/data/minority.table"
    minority_table = pandas.read_table(url)

factor_group = minority_table.groupby(["ETHN"])

plt.figure(figsize=(6, 6))
colors = ["purple", "green"]
markers = ["o", "v"]
       Days  Duration  Weight  ID
    0   0.0         1       1   1
    1   2.0         1       1   2
    2   1.0         1       1   3
    3   3.0         1       1   4
    4   0.0         1       1   5

    r = 3 (weight gain)
    m = 2 (duration of treatment)
    n_ij = 10 for all (i, j)
    """
    print("Balanced panel" + "\n")
    fig = interaction_plot(kidney_table['Weight'],
                           kidney_table['Duration'],
                           np.log(kidney_table['Days'] + 1),
                           colors=['red', 'blue'],
                           markers=['D', '^'],
                           ms=10,
                           ax=plt.gca())
    plt.show()

    formula = "np.log(Days+1) ~ C(Duration)"
    lm = fit_linear_model(formula, data=kidney_table)

    formula2 = "np.log(Days+1) ~ C(Weight)"
    lm2 = fit_linear_model(formula2, data=kidney_table)

    formula3 = "np.log(Days+1) ~ C(Duration) + C(Weight)"
    lm3 = fit_linear_model(formula3, data=kidney_table)

    formula4 = "np.log(Days+1) ~ C(Duration) * C(Weight)"
plt.ylabel("Salary")

# From our first look at the data, the difference between Master's and PhD
# in the management group is different than in the non-management group.
# This is an interaction between the two qualitative variables management,M
# and education,E. We can visualize this by first removing the effect of
# experience, then plotting the means within each of the 6 groups using
# interaction.plot.

U = S - X * interX_lm32.params["X"]

plt.figure(figsize=(6, 6))
interaction_plot(E,
                 M,
                 U,
                 colors=["red", "blue"],
                 markers=["^", "D"],
                 markersize=10,
                 ax=plt.gca())

# ## Minority Employment Data

# Prefer a local copy of the data set; download it only when the local file
# cannot be opened.
try:
    jobtest_table = pd.read_table("jobtest.table")
except OSError:  # do not have data already (file missing or unreadable)
    url = "http://stats191.stanford.edu/data/jobtest.table"
    jobtest_table = pd.read_table(url)

factor_group = jobtest_table.groupby(["MINORITY"])

fig, ax = plt.subplots(figsize=(6, 6))
Example #7
0
#
# Measurement of fetal head circumference **hs**, by four observers in three fetuses.

# In[55]:

# https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/altman_12_6.txt
df = pd.read_csv('../data/altman_12_6.txt', names=['hs', 'fetus', 'observer'])
df.head()

# In[56]:

from statsmodels.graphics.api import interaction_plot
plt.figure(figsize=(8, 6))
fig = interaction_plot(df['fetus'],
                       df['observer'],
                       df['hs'],
                       ms=10,
                       ax=plt.gca())

# In[169]:

formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
lm = ols(formula, df).fit()
print(anova_lm(lm))

#
# ###  A chi-squared test
#
# https://en.wikipedia.org/wiki/Chi-squared_test
#
#
Example #8
0
            if len(groups) == 2:

                X = data[X]
                Y = data[S]

                s = 100

                plt.figure(figsize=(8, 6))

                groups = data.groupby(data[E])

                for key, group in groups:  # ERROR (working on it)
                    interaction_plot(X,
                                     group,
                                     np.log(Y + 1),
                                     colors=['r', 'b'],
                                     markers=['D', '^'],
                                     ms=10,
                                     ax=plt.gca())

                    plt.show()  #?

            else:

                fig, ax = plt.subplots(figsize=(8, 6))

                s = 100

                for key, group in groups:  # ERROR (working on it)

                    group.plot(ax=ax,
Example #9
0
gese.loc[gese.stressful_life_events > 10, 'events_cat'] = 10  # Recode labels

#############################################################################################
# Simple plotting
import matplotlib.pyplot as plt
import seaborn

# Histograms
plt.hist(gese.depression)  # slightly right skewed
plt.hist(gese.gene)  # unequally distributed
plt.hist(gese.stressful_life_events)  # heavily right skewed

#############################################################################################
# Testing for interaction
from statsmodels.graphics.api import interaction_plot
fig = interaction_plot(gese.stressful_life_events, gese.gene, gese.depression)
plt.show(
)  # The plot does indicate an interaction between gene and stressful life events

# Correlation
pearsoncorr = stats.pearsonr(
    gese.stressful_life_events, gese.depression
)  # p-value (<.05) indicates a significant correlation between stressful life events and depression outcome

# t test
# Independent two-sample t test of depression scores between the two genotypes.
t_test = stats.ttest_ind(gese.depression[gese.gene == 1],
                         gese.depression[gese.gene == 0])
# Print the result that was just computed (the previous name, t_test_gene,
# was never defined and raised NameError).
# p-value (>.05) indicates mean of depression is not significantly different in these two genotypes
print(t_test)
Example #10
0
    anova_lm_check(res_lm_subset, res_lm_interaction_M_subset)
    """
        df_resid           ssr  df_diff       ss_diff            F        Pr(>F)
    0      40.0  4.320910e+07      0.0           NaN          NaN           NaN
    1      38.0  1.711881e+05      2.0  4.303791e+07  4776.734853  2.291239e-46
    """

    resid_studentized_subset = plot_residuals_studentized(
        result=res_lm_interaction_M_subset, data=salary_table)

    # fitted value plotting
    plot_fitted_values(formula=formula_interaction_M,
                       data=salary_table,
                       drop_idx=drop_idx)

    # the difference between Master's and PhD in the management group is different
    # than in the non-management group. (interaction between the two qualitative variables M and E)
    # => first remove the effect of experience,
    # => then plot the means within each of the 6 groups using interaction.plot.
    U = salary_table.S - salary_table.X * res_lm_interaction_X_subset.params[
        'X']

    # Interaction plot for factor level statistics.
    interaction_plot(x=salary_table.E,
                     trace=salary_table.M,
                     response=U,
                     colors=['red', 'blue'],
                     markers=['^', 'D'],
                     markersize=10,
                     ax=plt.gca())
    plt.show()