Example #1
0
def multiple_linear_regression():
    """Multiple linear regression (chapter 6.3, p. 98).

    Fits ``carbohydrate ~ age + weight + protein`` twice, once with OLS and
    once as a Gaussian GLM, and then the reduced model without ``age``.
    The summaries and ANOVA tables are printed; nothing is returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    p = glm('carbohydrate ~ age + weight + protein',
            family=Gaussian(),
            data=df).fit()
    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the
    # special case where all the usual OLS criteria apply - iid Gaussian
    # noise etc. GLM instead uses an approximate method which is correct
    # asymptotically but may be off for small samples; the tradeoff you get
    # in return is that this method works the same way for all GLM models,
    # including those with non-Gaussian error terms and non-trivial link
    # functions. So that's why they're different.

    print(p.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
Example #2
0
def multiple_linear_regression():
    """Multiple linear regression (chapter 6.3, p. 98).

    Fits ``carbohydrate ~ age + weight + protein`` twice, once with OLS and
    once as a Gaussian GLM, and then the reduced model without ``age``.
    The summaries and ANOVA tables are printed; nothing is returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    # The result must not be bound to the name ``glm``: assigning to it
    # inside this function would make ``glm`` a local variable and the call
    # on the right-hand side would raise UnboundLocalError.
    glm_fit = glm('carbohydrate ~ age + weight + protein',
                  family=Gaussian(), data=df).fit()
    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the
    # special case where all the usual OLS criteria apply - iid Gaussian
    # noise etc. GLM instead uses an approximate method which is correct
    # asymptotically but may be off for small samples; the tradeoff you get
    # in return is that this method works the same way for all GLM models,
    # including those with non-Gaussian error terms and non-trivial link
    # functions. So that's why they're different.

    print(glm_fit.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
Example #3
0
def ancova():
    """ANCOVA (chapter 6.5, p. 117).

    Fits ``y ~ x + method`` by OLS on the achievement-scores table and
    prints the ANOVA table followed by the regression summary. Nothing is
    returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.12 Achievement scores.xls'
    df = get_data(inFile)

    # fit the model
    model = ols('y~x+method', data=df).fit()
    print(anova_lm(model))
    print(model.summary())
Example #4
0
def ancova():
    """ANCOVA (chapter 6.5, p. 117).

    Fits ``y ~ x + method`` by OLS on the achievement-scores table and
    prints the ANOVA table followed by the regression summary. Nothing is
    returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.12 Achievement scores.xls'
    df = get_data(inFile)

    # fit the model
    model = ols('y~x+method', data=df).fit()
    print(anova_lm(model))
    print(model.summary())
Example #5
0
def f_twoway(data, factorA_name, factorB_name, value_name, precision=4):
    """Run a two-way ANOVA with interaction and print the three p-values.

    An OLS model ``value ~ C(A) + C(B) + C(A):C(B)`` is fitted and a type-II
    ANOVA table is computed from it.

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    factorA_name, factorB_name : column names of the two categorical factors
    value_name : column name of the response
    precision : number of decimals used when printing the p-values

    Returns
    -------
    tuple
        ``(results, aov_table, render_table, factorA_f_stat,
        factorA_p_value, factorB_f_stat, factorB_p_value, inter_f_stat,
        inter_p_value)`` where ``render_table`` is a relabelled copy of
        ``aov_table`` with an added ``Total`` row.
    """
    results = smf.ols(
        f'{value_name} ~ C({factorA_name}) + C({factorB_name}) + C({factorA_name}):C({factorB_name})',
        data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table rows are labelled by term name, so pick them by position
    # with .iloc; a bare [0] on a string-labelled Series relies on pandas'
    # deprecated positional fallback.
    f_col = aov_table['F']
    p_col = aov_table['PR(>F)']
    factorA_f_stat, factorA_p_value = f_col.iloc[0], p_col.iloc[0]
    factorB_f_stat, factorB_p_value = f_col.iloc[1], p_col.iloc[1]
    inter_f_stat, inter_p_value = f_col.iloc[2], p_col.iloc[2]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    render_table.index = ['Factor A', 'Factor B', 'Interaction', 'Error']

    # Totals only make sense for the sums of squares and degrees of freedom,
    # so the summed F and p-value cells are blanked afterwards.
    render_table.loc['Total'] = render_table.sum()
    render_table.loc['Total', ['F', 'p-value']] = np.nan
    print(
        f'Factor A\'s p-value: {factorA_p_value:.{precision}f}\nFactor B\'s p-value: {factorB_p_value:.{precision}f}\nInteraction p-value: {inter_p_value:.{precision}f}'
    )
    return results, aov_table, render_table, factorA_f_stat, factorA_p_value, factorB_f_stat, factorB_p_value, inter_f_stat, inter_p_value
Example #6
0
def anova_test_formula(formula, data=None, typ=1):
    """Fit an OLS model from a formula and run ANOVA on the fit.

    Parameters
    ----------
    formula : model formula handed to ``smf.ols``
    data : data set handed to ``smf.ols`` (default ``None``)
    typ : ANOVA type forwarded to ``smt.anova_lm`` (default 1)

    Returns
    -------
    tuple
        ``(anova_table, fitted_model)``
    """
    fitted = smf.ols(formula, data=data).fit()
    return smt.anova_lm(fitted, typ=typ), fitted
Example #7
0
def anova():
    """ANOVA (chapter 6.4, p. 108, and p. 113).

    Fits ``weight ~ group`` as a Gaussian GLM and as OLS, printing the
    summaries and the OLS ANOVA table, then sets up the two-factor models.
    GLM does not work with anova_lm. Nothing is returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109)
    # The model must not be bound to the name ``glm``: assigning to it
    # inside this function would make ``glm`` a local variable and the call
    # on the right-hand side would raise UnboundLocalError.
    glm_model = glm('weight~group', family=Gaussian(), data=df)
    print(glm_model.fit().summary())

    print('-' * 65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    # (the table is computed but neither printed nor returned)
    ols_int = ols('data~A*B', data=df)
    anova_lm(ols_int.fit())

    # The python commands for the other four models are
    # (kept for illustration; the models are constructed but not fitted)
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
Example #8
0
def anova():
    """ANOVA (chapter 6.4, p. 108, and p. 113).

    Fits ``weight ~ group`` as a Gaussian GLM and as OLS, printing the
    summaries and the OLS ANOVA table, then sets up the two-factor models.
    GLM does not work with anova_lm. Nothing is returned.
    """

    # load the data set (get_data is defined elsewhere in the project)
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109)
    p = glm('weight~group', family=Gaussian(), data=df)
    print(p.fit().summary())

    print('-' * 65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    # (the table is computed but neither printed nor returned)
    ols_int = ols('data~A*B', data=df)
    anova_lm(ols_int.fit())

    # The python commands for the other four models are
    # (kept for illustration; the models are constructed but not fitted)
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
Example #9
0
def f_oneway(data, treatment_name, value_name):
    """Run a one-way ANOVA and print its p-value.

    An OLS model ``value ~ C(treatment)`` is fitted and a type-II ANOVA
    table is computed from it.

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    treatment_name : column name of the categorical treatment
    value_name : column name of the response

    Returns
    -------
    tuple
        ``(results, aov_table, render_table, f_stat, p_value)`` where
        ``render_table`` is a copy of ``aov_table`` with readable column
        names and an added ``Total`` row.
    """
    results = smf.ols(f'{value_name} ~ C({treatment_name})', data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table rows are labelled by term name, so pick the first one by
    # position with .iloc; a bare [0] on a string-labelled Series relies on
    # pandas' deprecated positional fallback.
    f_stat = aov_table['F'].iloc[0]
    p_value = aov_table['PR(>F)'].iloc[0]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    # render_table.index = ['Treatment', 'Error']

    # Totals only make sense for the sums of squares and degrees of freedom;
    # blank the summed F and p-value cells, as f_twoway and f_random_block do.
    render_table.loc['Total'] = render_table.sum()
    render_table.loc['Total', ['F', 'p-value']] = np.nan
    print(f'p-value: {p_value}')
    return results, aov_table, render_table, f_stat, p_value
Example #10
0
def f_random_block(data, treatment_name, block_name, value_name, precision=4):
    """Run a randomized-block ANOVA and print the treatment and block p-values.

    An OLS model ``value ~ C(treatment) + C(block)`` is fitted and a type-II
    ANOVA table is computed from it.

    Parameters
    ----------
    data : table passed to ``smf.ols`` as ``data``
    treatment_name : column name of the categorical treatment
    block_name : column name of the blocking factor
    value_name : column name of the response
    precision : number of decimals used when printing the p-values

    Returns
    -------
    tuple
        ``(results, aov_table, render_table, treatment_f_stat,
        treatment_p_value, block_f_stat, block_p_value)`` where
        ``render_table`` is a relabelled copy of ``aov_table`` with an added
        ``Total`` row.
    """
    results = smf.ols(f'{value_name} ~ C({treatment_name}) + C({block_name})',
                      data=data).fit()
    aov_table = sms.anova_lm(results, typ=2)

    # The table rows are labelled by term name, so pick them by position
    # with .iloc; a bare [0] on a string-labelled Series relies on pandas'
    # deprecated positional fallback.
    f_col = aov_table['F']
    p_col = aov_table['PR(>F)']
    treatment_f_stat, treatment_p_value = f_col.iloc[0], p_col.iloc[0]
    block_f_stat, block_p_value = f_col.iloc[1], p_col.iloc[1]

    render_table = aov_table.copy()
    render_table.columns = [
        'Sum of Squares', 'Degree of Freedom', 'F', 'p-value'
    ]
    render_table.index = ['Treatment', 'Block', 'Error']

    # Totals only make sense for the sums of squares and degrees of freedom,
    # so the summed F and p-value cells are blanked afterwards.
    render_table.loc['Total'] = render_table.sum()
    render_table.loc['Total', ['F', 'p-value']] = np.nan
    print(
        f'Treatment p-value (main): {treatment_p_value:.{precision}f}\nBlock p-value: {block_p_value:.{precision}f}'
    )
    return results, aov_table, render_table, treatment_f_stat, treatment_p_value, block_f_stat, block_p_value
# In[24]:


# Constant-only model: crime regressed on an intercept alone. The heading
# below calls this the restricted model.
formula = 'crime ~ 1'
mod = smf.ols(formula, data = df)
reg02 = mod.fit()
print ( reg02.summary())


# ## <font color = blue>Compare your unrestricted model to the constant-only model (the restricted model).</font>

# In[63]:


# NOTE(review): reg01 is defined outside this excerpt; presumably it is the
# unrestricted fit. The statsmodels examples pass the smaller model first
# when comparing nested fits - confirm that the argument order here is the
# intended one. The table is not assigned or printed in this cell.
anova_lm( reg01, reg02 )


# ## <font color = blue> Calculate elasticities.</font>

# In[43]:


# Second entry of reg03's parameter vector, taken by position. reg03 is
# defined outside this excerpt - presumably a fit involving amtunemp; TODO
# confirm which coefficient index 1 refers to.
dYdX = reg03.params[1]
# Scale the slope by the ratio of sample means to get an elasticity.
# NOTE(review): an elasticity at the means is slope * mean(X) / mean(Y); the
# ratio used here is mean(crime) / mean(amtunemp), which matches that only
# if crime is the regressor in reg03 - confirm.
eta = dYdX * (df.crime.mean()/df.amtunemp.mean())
print ('eta = ', round(eta, 4))


# In[42]:

Example #12
0
# Fit the model described by my_formula (built outside this excerpt) on
# boston_df and print its summary.
print(my_formula)
lm_ins1 = smf.ols(my_formula, boston_df)
lm_fit1 = lm_ins1.fit()
print(lm_fit1.summary())

# Interaction Terms
# NOTE: the formula fitted here adds a squared LSTAT term via I(LSTAT**2);
# it contains no interaction between two different variables.
lm_fit2 = smf.ols('MEDV ~ LSTAT + I(LSTAT**2)', boston_df).fit()
print(lm_fit2.summary())

# Non-linear transformations
# import anova function
from statsmodels.stats.api import anova_lm
# Linear fit and the linear-plus-quadratic fit that nests it.
lm_fit = smf.ols('MEDV ~ LSTAT', boston_df).fit()
lm_fit2 = smf.ols('MEDV ~ LSTAT + I(LSTAT**2)', boston_df).fit()
# perform the hypothesis test (see https://en.wikipedia.org/wiki/F-test regression section)
print(anova_lm(lm_fit, lm_fit2))

fig, ax = plt.subplots(figsize=(8, 6))

# Plot the data: MEDV against LSTAT as unfilled markers with blue edges
ax.scatter(boston_df.LSTAT,
           boston_df.MEDV,
           facecolors='none',
           edgecolors='b',
           label="data")
# plot the models fitted values
ax.plot(boston_df.LSTAT,
        lm_fit2.fittedvalues,
        'g',
        marker='o',
        linestyle='none',
            ax.set_title(x)
            ax.text(1,.75, ("%d zero values dropped\nout of %d observations" % (zero_count, total_count)),transform=ax.transAxes, horizontalalignment='right',verticalalignment='top')
            if thou: ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int((x*1e-3)), ",") if x>1000 else format(int(x), ",")))
            remove_border(ax)
        else: #if no data for a specific month
            ax.plot([])
            remove_border(ax, top=False, left=False, right=False, bottom=False)
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)
   
    plt.savefig('./Output/' + filename + "_" + year + ".png", bbox_inches='tight')


# Now graph the small multiples: for every distinct value of dfT.year, draw
# one histogram grid for 'delayHH' and one for 'delay', both split by
# 'display_month'. small_mult_hist is defined earlier in the file.
years = dfT.year.unique()
for year in years:
    small_mult_hist(dfT, 'delayHH', 'display_month', 
                    year, 'Distribution of Delay days per Household Employed among Blocks', filename= 'Dist_delay_HH',
                    rows=4, log=False, bins=50, thou=False)
    small_mult_hist(dfT, 'delay', 'display_month', year, 'Distribution of Delay Days (thousands) among Blocks', 
                    log=False, bins=50, thou=True, filename= "Dist_delay")

    


### Variance analysis using ANOVA

# For each response, fit an OLS model on display_month after dropping rows
# where that response is missing, and write the ANOVA table to a CSV file
# under ./Output/.
anova_lm(ols('delay ~ display_month', dfT.dropna(subset=['delay'])).fit()).to_csv('./Output/delay_ANOVA.csv')
anova_lm(ols('delayHH ~ display_month', dfT.dropna(subset=['delayHH'])).fit()).to_csv('./Output/delayHH_ANOVA.csv')

Example #14
0
# Add an interaction between salary and experience, allowing different intercepts for level of experience.
#
# $$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$

# <codecell>

# Fit salary on the E-by-X interaction plus M and print the summary.
interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print(interX_lm.summary())

# <markdowncell>

# Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm or we can use an F-test.

# <codecell>

# Compare against lm, which is fitted earlier in the notebook (outside this
# excerpt; presumably the model without the interaction terms).
print(anova_lm(lm, interX_lm))

# <codecell>

# The same joint hypothesis written as a string constraint on the two
# interaction coefficients.
print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))

# <codecell>

# The same hypothesis as a restriction matrix over seven coefficients:
# row 1 tests (6th - 7th) = 0 and row 2 tests 7th = 0. This assumes the last
# two design columns are the interaction terms - TODO confirm.
print(interX_lm.f_test([[0, 0, 0, 0, 0, 1, -1], [0, 0, 0, 0, 0, 0, 1]]))

# <markdowncell>

# The contrasts are created here under the hood by patsy.

# <markdowncell>
Example #15
0
    def anova_one_drug_one_feature_custom(self,
                                          drug_id,
                                          feature_name,
                                          formula,
                                          odof=None):
        """Same as :meth:`anova_one_drug_one_feature` but allows any formula

        :param drug_id: drug identifier, used to build *odof* when it is
            not provided
        :param feature_name: feature name, used to build *odof* when it is
            not provided
        :param str formula: regression formula; must contain ``Y``
        :param odof: pre-computed one-drug-one-feature data. If None, it is
            obtained from :meth:`_get_one_drug_one_feature_data`
        :return: full ANOVA table but also populate internal attribute
            anova_pvalues that is a dictionary with pvalues for
            feature, media, msi and tissue (only the terms present in the
            ANOVA table are included)

        Formula must be set in the settings attribute as follows::

            an = ANOVA(...)
            an.settings['regression_formula'] = "Y ~  C(tissue) + feature"

        .. note:: This function is convenient but 3-4 times slower than
            :meth:`anova_one_drug_one_feature`. So if your formula are one of::

                "Y ~  C(tissue) + C(media) + C(msi) + feature"
                "Y ~  C(tissue) + C(msi) + feature"
                "Y ~  C(msi) + feature"
                "Y ~  feature"

            you should rather use :meth:`anova_one_drug_one_feature` instead
            (keeping regression_formula set to 'auto').

        By default, in categories, the first treatment (e.g tissue) is used as a
        reference and is not shown in the results. You may set the reference as
        follows::

            "Y ~ C(tissue, Treatment(reference='breast'))"

        ANOVA pvalues returned are of type I

        .. versionadded:: 0.15.0

        """
        import statsmodels.formula.api as smf
        from statsmodels.stats.api import anova_lm

        if odof is None:
            odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        df = pd.DataFrame({'Y': odof.Y, 'feature': odof.masked_features})

        # Add other categorical explanatory variables if available. This is
        # deliberately best-effort: a factor that cannot be attached is
        # skipped. ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        for factor in ('tissue', 'msi', 'media'):
            try:
                df[factor] = getattr(odof, 'masked_' + factor).values
            except Exception:
                continue

        # "Y ~  C(tissue) + C(msi) + C(media) + feature"
        assert "Y" in formula, "Y must be the LHS of the formula"
        # This returns a Model instance
        model = smf.ols(formula, data=df)

        # Kept on the instance for inspection after the call
        self._debug_custom_df = df
        self._debug_custom_model = model

        anova = anova_lm(model.fit(), typ=1)

        # Map ANOVA row labels to the short keys exposed in anova_pvalues.
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        term_to_key = {
            'C(tissue)': 'tissue',
            'C(msi)': 'msi',
            'C(media)': 'media',
            'feature': 'feature',
        }
        self.anova_pvalues = {
            term_to_key[term]: pvalue
            for term, pvalue in anova['PR(>F)'].items()
            if term in term_to_key
        }
        return anova
 def get_p_fvalue(self, result_fit1, result_fit2):
     """Return the ``Pr(>F)`` entry for the second of two compared fits.

     ``anova_lm`` is called on the ``result_`` attribute of both arguments
     and the value at label 1 of its ``Pr(>F)`` column is returned.
     """
     first, second = result_fit1.result_, result_fit2.result_
     p_column = anova_lm(first, second)["Pr(>F)"]
     return p_column[1]
Example #17
0
# Add an interaction between salary and experience, allowing different intercepts for level of experience.
# 
# $$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$

# <codecell>

# Fit salary on the E-by-X interaction plus M and print the summary.
interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print(interX_lm.summary())

# <markdowncell>

# Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm or we can use an F-test.

# <codecell>

# Compare against lm, which is fitted earlier in the notebook (outside this
# excerpt; presumably the model without the interaction terms).
print(anova_lm(lm, interX_lm))

# <codecell>

# The same joint hypothesis written as a string constraint on the two
# interaction coefficients.
print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))

# <codecell>

# The same hypothesis as a restriction matrix over seven coefficients:
# row 1 tests (6th - 7th) = 0 and row 2 tests 7th = 0. This assumes the last
# two design columns are the interaction terms - TODO confirm.
print(interX_lm.f_test([[0, 0, 0, 0, 0, 1, -1], [0, 0, 0, 0, 0, 0, 1]]))

# <rawcell>

# The contrasts are created here under the hood by patsy.

# <markdowncell>
Example #18
0
    def anova_one_drug_one_feature_custom(self, drug_id, feature_name, formula, odof=None):
        """Same as anova_one_drug_one_feature but allows any formula

        :param drug_id: drug identifier, used to build *odof* when it is
            not provided
        :param feature_name: feature name, used to build *odof* when it is
            not provided
        :param str formula: regression formula; must contain ``Y``
        :param odof: pre-computed one-drug-one-feature data. If None, it is
            obtained from :meth:`_get_one_drug_one_feature_data`
        :return: full ANOVA table but also populate internal attribute
            anova_pvalues that is a dictionary with pvalues for
            feature, media, msi and tissue (only the terms present in the
            ANOVA table are included)

        Formula must be set in the settings attribute as
        settings.regression_formula::

            an = ANOVA(...)
            an.settings.formula = "Y ~  C(tissue) + feature"

        .. note:: This function is convenient but 3 times slower than
            :meth:`anova_one_drug_one_feature`. So if your formula are one of::

                "Y ~  C(tissue) + C(media) + C(msi) + feature"
                "Y ~  C(tissue) + C(msi) + feature"
                "Y ~  C(msi) + feature"
                "Y ~  feature"

            you may prefer :meth:`anova_one_drug_one_feature`.

        By default, in categories, the first treatment (e.g tissue) is used as a
        reference and is not shown in the results. You may set the reference
        as follows::

            "Y ~ C(tissue, Treatment(reference='breast'))"

        ANOVA pvalues returned are of type I

        .. versionadded:: 0.15.0
        """
        import statsmodels.formula.api as smf
        from statsmodels.stats.api import anova_lm

        if odof is None:
            odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        df = pd.DataFrame({"Y": odof.Y, "feature": odof.masked_features})

        # Add other categorical explanatory variables if available. This is
        # deliberately best-effort: a factor that cannot be attached is
        # skipped. ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        for factor in ("tissue", "msi", "media"):
            try:
                df[factor] = getattr(odof, "masked_" + factor).values
            except Exception:
                continue

        # "Y ~  C(tissue) + C(msi) + C(media) + feature"
        assert "Y" in formula, "Y must be the LHS of the formula"
        # This returns a Model instance
        model = smf.ols(formula, data=df)

        # Kept on the instance for inspection after the call
        self._debug_custom_df = df
        self._debug_custom_model = model

        anova = anova_lm(model.fit(), typ=1)

        # Map ANOVA row labels to the short keys exposed in anova_pvalues.
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        term_to_key = {
            "C(tissue)": "tissue",
            "C(msi)": "msi",
            "C(media)": "media",
            "feature": "feature",
        }
        self.anova_pvalues = {
            term_to_key[term]: pvalue
            for term, pvalue in anova["PR(>F)"].items()
            if term in term_to_key
        }
        return anova
 def anova_test(self):
     """Run both regression steps and print the ANOVA comparison of the fits.

     Calls ``stat_multi_regression_b`` and ``stat_multi_include_calculation``
     (presumably these populate ``self.fit`` and ``self.fit2`` - confirm),
     then prints ``anova_lm(self.fit, self.fit2)``. Nothing is returned.
     """
     self.stat_multi_regression_b()
     self.stat_multi_include_calculation()
     table1 = anova_lm(self.fit, self.fit2)
     print(table1)
Example #20
0
# College GPA regressed on ACT alone ...
m = smf.ols('College ~ ACT', data=gpa)
results = m.fit()
print(results.summary())

# ... and on ACT plus high-school GPA.
m = smf.ols('College ~ ACT + HS', data=gpa)
results = m.fit()
print(results.summary())


# In-sample predictions of the second model, plotted as blue dots.
plt.plot(results.predict(), 'bo')
######anova example

# Nested pair of models for the comparison. The original formula listed ACT
# twice; formula terms are combined as a set, so the repeat was dropped
# without changing the model.
comp = smf.ols('College ~ ACT + HS + APHours + Height + KnownLanguages', data=gpa).fit()
simp = smf.ols('College ~ ACT + APHours + Height', data=gpa).fit()

from statsmodels.stats.api import anova_lm
# Smaller model first, then the larger one (the table is not printed here).
anova_lm(simp, comp)

##############
# Load data
url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'
dat = pd.read_csv(url)

# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

# Inspect the results
# (print() call form, matching the rest of this script; the statement form
# is a syntax error on Python 3)
print(results.summary())
plt.plot(results.predict(), "bo")

Example #21
0
# Label the axes and overlay the fitted line (blue) and the population line
# (red); x, yfit and ytrue come from the preceding part of the script.
plt.xlabel("x")
plt.ylabel("y")
plt.plot(x[:, 1], yfit, 'blue', label='model fit')
plt.plot(x[:, 1], ytrue, 'red', label='population')
plt.legend(loc='upper left')

# (g)
# Append the square of column 1 of x as an extra regressor column.
# reshape(-1, 1) takes the row count from the data instead of hard-coding 100.
x2 = np.power(x, 2)[:, 1]
x2 = x2.reshape(-1, 1)
x = np.append(x, x2, 1)

model8 = sm.OLS(y, x)
model8_fit = model8.fit()
# Reuse the fit computed above instead of refitting for every use.
print(model8_fit.summary())

# model7 is defined earlier in the script (outside this excerpt).
table = anova_lm(model7.fit(), model8_fit)
print(table)
# No evidence on polynomial being better.

# (h)-(j) skip

# -------------------------------------------
# Q14
# (a)
# Two regressors, x2 built as 0.5 * x1 plus noise, and a response that is
# linear in both.
np.random.seed(1)
x1 = np.random.uniform(size=(100,))
x2 = 0.5*x1 + np.random.normal(size=(100,))/10.
y = 2 + 2*x1 + 0.3*x2 + np.random.normal(size=(100,))

# (b)
# Correlation matrix of x1 and x2 (the value is not stored or printed).
np.corrcoef(x1, x2)
Example #22
0
# NOTE: the t-test calls below do not store or print their results; run as
# a plain script they produce no output.

#1-sample t-test
# Tests the mean of the VIQ column against 0.
stats.ttest_1samp(data['VIQ'], 0)
#2-sample independent t-test
# VIQ values split by the Gender column.
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)
#2-sample comparative t-test
# Paired test on FSIQ vs PIQ, followed by a 1-sample test on their
# row-wise difference.
stats.ttest_rel(data['FSIQ'], data['PIQ'])
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)

#Exercise
#Q1. Test the difference between weights in males and females.
#Q2. Use non parametric statistics to test the difference between VIQ in males and females.
#(Hint: using scipy.stats.mannwhitneyu())

from statsmodels.formula.api import ols

# VIQ explained by Gender only (the intercept is written out explicitly).
model = ols("VIQ ~ Gender + 1", data)
results = model.fit()
print(results.summary())
# Larger model adding Weight, Height and MRI_Count.
model2 = ols('VIQ ~ Gender + Weight + Height + MRI_Count', data)
results2 = model2.fit()
print(results2.summary())
#Influence point checking
infl = results2.get_influence()
print(infl.summary_table())

from statsmodels.stats.api import anova_lm
#ANOVA for Model Comparison
# Smaller model first, then the larger one.
# NOTE(review): if Weight/Height contain missing values the two fits may
# use different rows - confirm the models are comparable.
table1 = anova_lm(results, results2)
print(table1)