def model_formulas():
    ''' Define models through formulas '''
    
    # Get the data:
    # Development of world record times for the 100m Freestyle, for men and women.
    data = pd.read_csv('swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())
    
    # ANOVAs
    print('----------------- Results ANOVAs: Model 1 -----------------------')
    print(anova_lm(model1))
    
    print('--------------------- Model 2 -----------------------------------')
    print(anova_lm(model2))
    
    print('--------------------- Model 3 -----------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
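    # Added sketch (not in the original example): the nested models can also be compared
    # directly with an F-test, which checks whether the sex:year interaction in model3
    # improves on the additive model2.
    print(anova_lm(model2, model3))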
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
Example #2
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal. '''
    
    # Get the data
    data = getData('altman_910.txt')
    
    # Sort them into groups, according to column 1
    group1 = data[data[:,1]==1,0]
    group2 = data[data[:,1]==2,0]
    group3 = data[data[:,1]==3,0]
    
    # First, check if the variances are equal, with the "Levene"-test
    (W,p) = stats.levene(group1, group2, group3)
    if p<0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))
    
    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    
    # Print the results
    print('Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')
        
    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])    
    model = ols('value ~ C(treatment)', df).fit()
    print(anova_lm(model))
def anova_interaction(data_lastDV):
    """
    Two-way ANOVA and interaction analysis of given data
    http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html

    Note: 2way ANOVAs are for 2+ categorical independent/causal variables, with 2+ levels each
    :param data_lastDV: data frame containing the independent variables in the first two columns and the dependent variable in the third
    :return: None (a usage sketch with toy data follows this function)
    """

    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()
    if len(col_names) < 3:
        print("ERROR in statsMOOC.py: Not enough columns in dataframe to do interaction analysis: " + str(len(col_names)))
        return

    # two-way anova
    formula = col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ")"
    formula_interaction = formula.replace('+', '*')
    interaction_lm = ols(formula, data=factor_groups).fit()  # linear model
    print(interaction_lm.summary())

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " * " + col_names[1] + " Interaction -")
    print(anova_lm(ols(formula_interaction, data=factor_groups).fit(), interaction_lm))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " + " + col_names[1] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[0] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[1] + " + " + col_names[0] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[1] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))
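
# A minimal usage sketch for the data layout the docstring above describes (toy data, not
# from the original source): two categorical IVs in the first two columns, the DV last.
# The formula below is the same interaction formula the function builds from the column names.
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

toy = pd.DataFrame({
    "condition": ["a", "a", "b", "b", "a", "a", "b", "b"],
    "gender":    ["m", "f", "m", "f", "m", "f", "m", "f"],
    "score":     [3.1, 2.8, 4.0, 4.4, 3.3, 2.9, 4.2, 4.1],
})
print(anova_lm(ols("score ~ C(condition) * C(gender)", data=toy).fit()))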
Example #4
def model_formulas():
    ''' Define models through formulas '''
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())
    
    # ANOVAs
    print('-----------------------------------------------------------------')
    print(anova_lm(model1))
    
    print('-----------------------------------------------------------------')
    print(anova_lm(model2))
    
    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
Example #5
    def run_anova(self):
        ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)]

        #ps_lm = mixedlm('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova, groups=ps_table_for_anova['Subject']).fit()
        ps_lm = ols('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rf', anova['F'].values[0:3])
        self.pass_object('pvalue_rf', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_low = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([10,25])]
        print('nsamples =', len(ps_table_for_anova_low))

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_low', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_low', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_low', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_low', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_high = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([100,200])]
        print('nsamples =', len(ps_table_for_anova_high))

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_high', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_high', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_high', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_high', anova['PR(>F)'].values[0:3])
Example #6
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses.'''
    
    # Get the data
    data = getData('altman_12_6.txt')
    
    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])
    
    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    print(anova_lm(lm))
def anova(dv):
    """Perform ANOVA."""

    df = make_summary()
    lm = ols('%s ~ C(group) * age * iq' % dv, data=df).fit()
    divider = '---------'
    print(divider, dv, divider, '\n', anova_lm(lm, typ=2, robust='hc3'))
Example #8
def anova(df, fmla, typ=3):
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    import numpy as np

    # Anova/OLS
    lm = ols(fmla, df).fit()  # 'data' <==> 'df' keyword change with version

    # Grab the pvalues (note we use Type III)
    aov = anova_lm(lm, typ=typ)
    pvals = aov["PR(>F)"]
    pvals.index = ["p_" + s for s in pvals.index]

    # Grab the explainable sum of squares
    ess = aov.drop("Residual").sum_sq
    ess = ess / ess.sum()
    ess.index = ["ess_" + s for s in ess.index]

    # Grab the fit
    fit = lm.params
    fit.index = ["fit_" + s for s in fit.index]

    # I think this happens with pathological inputs; fail loudly instead of dividing by zero
    if np.any(aov["sum_sq"] < 0):
        raise ValueError("Negative sum of squares in the ANOVA table; check the inputs")

    return {"lm": lm, "aov": aov, "pvals": pvals, "ess": ess, "fit": fit}
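
# A short usage sketch for the helper above (toy data assumed): it returns the fitted model,
# the ANOVA table, prefixed p-values, the normalized explainable sum of squares, and the
# coefficient estimates.
import pandas as pd

toy = pd.DataFrame({"rt":   [1.2, 1.5, 0.9, 1.1, 1.6, 1.4, 1.0, 1.3],
                    "cond": ["a", "a", "b", "b", "c", "c", "a", "b"]})
out = anova(toy, "rt ~ C(cond)", typ=2)
print(out["pvals"])   # index is prefixed, e.g. p_C(cond), p_Residual
print(out["ess"])     # share of the explainable sum of squares per term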
Example #9
    def test_results(self):
        new_model = ols("np.log(Days+1) ~ C(Duration) + C(Weight)",
                        self.data).fit()
        results = anova_lm(new_model, self.kidney_lm)

        Res_Df = np.array([56, 54])
        RSS = np.array([29.62486, 28.9892])
        Df = np.array([0, 2])
        Sum_of_Sq = np.array([np.nan, 0.6356584])
        F = np.array([np.nan, 0.5920404])
        PrF = np.array([np.nan, 0.5567479])

        np.testing.assert_equal(results["df_resid"].values, Res_Df)
        np.testing.assert_almost_equal(results["ssr"].values, RSS, 4)
        np.testing.assert_almost_equal(results["df_diff"].values, Df)
        np.testing.assert_almost_equal(results["ss_diff"].values, Sum_of_Sq)
        np.testing.assert_almost_equal(results["F"].values, F)
        np.testing.assert_almost_equal(results["Pr(>F)"].values, PrF)
    def startanova(self):
        from urllib.request import urlopen
        import numpy as np
        import pandas
        import matplotlib.pyplot as plt
        from statsmodels.formula.api import ols
        from statsmodels.graphics.api import interaction_plot, abline_plot
        from statsmodels.stats.anova import anova_lm

        try:
            rehab_table = pandas.read_csv('rehab.table')
        except Exception:
            url = 'http://stats191.stanford.edu/data/rehab.csv'
            # the next line is not necessary with recent versions of pandas
            url = urlopen(url)
            rehab_table = pandas.read_table(url, delimiter=",")
            rehab_table.to_csv('rehab.table')

        print(rehab_table)

        plt.figure(figsize=(6, 6))
        rehab_table.boxplot('Time', 'Fitness', ax=plt.gca())
        rehab_lm = ols('Time ~ C(Fitness)', data=rehab_table).fit()
        table9 = anova_lm(rehab_lm, test=self.test, robust=self.robust)
        print(table9)
        print(rehab_lm.model.data.orig_exog)
        print(rehab_lm.summary())
        plt.show()
def doAnova(data):
    '''one-way ANOVA'''
    
    df = pd.DataFrame(data)
    model = ols('StressReduction ~ C(Treatment)',df).fit()
    
    anovaResults =  anova_lm(model)
    print(anovaResults)
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')
Example #12
def anova_statsmodels():
    """ do the ANOVA with a function """

    # Get the data
    data = pd.read_csv(r"..\Data\data_kaplan\galton.csv")

    anova_results = anova_lm(ols("height ~ 1 + sex", data).fit())
    print('\nANOVA with "statsmodels" ------------------------------')
    print(anova_results)

    return anova_results["F"][0]
Example #13
def anova_statsmodels():
    ''' do the ANOVA with a function '''

    # Get the data
    data = pd.read_csv(r'..\Data\data_kaplan\galton.csv')

    anova_results = anova_lm(ols('height ~ 1 + sex', data).fit())
    print('\nANOVA with "statsmodels" ------------------------------')
    print(anova_results)

    return anova_results['F'][0]
Example #14
    def test_results(self):
        Df = np.array([2, 2, 2, 54])
        sum_sq = np.array([158.6415227, 16.97129, 0.6356584, 28.9892])
        mean_sq = np.array([79.3207613, 8.485645, 0.3178292, 0.536837])
        f_value = np.array([147.7557648, 15.80674, 0.5920404, np.nan])
        pr_f = np.array([1.262324e-22, 3.944502e-06, 0.5567479, np.nan])

        results = anova_lm(self.kidney_lm)
        np.testing.assert_equal(results["df"].values, Df)
        np.testing.assert_almost_equal(results["sum_sq"].values, sum_sq, 4)
        np.testing.assert_almost_equal(results["F"].values, f_value, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, pr_f)
Example #15
    def test_results(self):
        Df = np.array([1, 2, 2, 54])
        sum_sq = np.array([2.339693, 16.97129, 0.6356584, 28.9892])
        mean_sq = np.array([2.339693, 8.485645, 0.3178292, 0.536837])
        f_value = np.array([4.358293, 15.80674, 0.5920404, np.nan])
        pr_f = np.array([0.0415617, 3.944502e-06, 0.5567479, np.nan])

        results = anova_lm(self.kidney_lm)
        np.testing.assert_equal(results['df'].values, Df)
        np.testing.assert_almost_equal(results['sum_sq'].values, sum_sq, 4)
        np.testing.assert_almost_equal(results['F'].values, f_value, 4)
        np.testing.assert_almost_equal(results['PR(>F)'].values, pr_f)
def one_stats(data_lastDV):
    """
    Do basic analysis of one IV onto one DV
    :param data_lastDV: pandas dataframe we are exploring (IV-of-interest in the first column, followed by the other IVs, and the DV in the last column)
    :return: None
    """
    col_names = data_lastDV.columns.values.tolist()  # get the columns' names
    causal = col_names.pop(0)  # first item is the topic
    outcome = col_names.pop()  # remove the last item in the list
    topic_data = data_lastDV[[causal, outcome]]

    # descriptive stats
    print(FORMAT_LINE)
    print(topic_data[causal].describe())
    print(FORMAT_LINE)

    fig = plt.figure()
    # bar chart of topics
    ax1 = fig.add_subplot(121)
    df_compare = topic_data.groupby(causal)[causal].count()  # displays num instances assigned to each condition
    ax1 = df_compare.plot(kind='bar', title=causal, ax=ax1)
    ax1.set_xlabel(causal)
    ax1.set_ylabel("count instances")
    # scatter plot
    ax2 = fig.add_subplot(122)
    df_compare = data_lastDV.groupby(causal)[outcome].mean()  # displays num helpers selected in each topic
    ax2 = df_compare.plot(kind='bar', title=causal, ax=ax2)
    ax2.set_xlabel(causal)
    ax2.set_ylabel("mean " + outcome)
    plt.show()

    # One Way ANOVA
    cond_lm = ols(outcome + " ~ C(" + causal + ")", data=topic_data).fit()
    anova_table = anova_lm(cond_lm)

    print("\n"+FORMAT_LINE)
    print("One-Way ANOVA: " + causal + " --> " + outcome)
    print(FORMAT_LINE)
    print(anova_table)
    #print(cond_lm.model.data.orig_exog)
    print(cond_lm.summary())

    # boxplot of topics --> num helpers selected
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax = topic_data.boxplot(outcome, causal, ax=plt.gca())
    ax.set_xlabel(causal)
    ax.set_ylabel(outcome)
    plt.show()

    for cond in col_names:
        anova_interaction(data_lastDV[[causal, cond, outcome]])
        plot_interaction(data_lastDV[[causal, cond, outcome]])
Example #17
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 2, 2, 51])
        F = np.array([6.972744, 13.7804, 0.1709936, np.nan])
        PrF = np.array([0.01095599, 1.641682e-05, 0.8433081, np.nan])

        results = anova_lm(anova_ii, typ="II", robust="hc0")
        np.testing.assert_equal(results["df"].values, Df)
        # np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #18
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 1, 2, 2, 51])
        F_value = np.array([279.7545, 5.367071, 12.43245, 0.1760025, np.nan])
        PrF = np.array([2.379855e-22, 0.02457384, 3.999431e-05, 0.8391231, np.nan])

        results = anova_lm(anova_iii, typ="III")
        np.testing.assert_equal(results["df"].values, Df)
        np.testing.assert_almost_equal(results["sum_sq"].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F_value, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #19
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 1, 2, 2, 51])
        F = np.array([298.3404, 5.723638, 13.76069, 0.1709936, np.nan])
        PrF = np.array([5.876255e-23, 0.02046031, 1.662826e-05, 0.8433081, np.nan])

        results = anova_lm(anova_iii, typ="III", robust="hc0")
        np.testing.assert_equal(results["df"].values, Df)
        # np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #20
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 1, 2, 2, 51])
        F = np.array([266.9361, 5.12115, 12.3122, 0.1529943, np.nan])
        PrF = np.array([6.54355e-22, 0.02792296, 4.336712e-05, 0.858527, np.nan])

        results = anova_lm(anova_iii, typ="III", robust="hc1")
        np.testing.assert_equal(results["df"].values, Df)
        # np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #21
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 1, 2, 2, 51])
        F = np.array([264.5137, 5.074677, 12.19158, 0.1501224, np.nan])
        PrF = np.array([7.958286e-22, 0.02860926, 4.704831e-05, 0.8609815, np.nan])

        results = anova_lm(anova_iii, typ="III", robust="hc2")
        np.testing.assert_equal(results["df"].values, Df)
        # np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #22
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 1, 2, 2, 51])
        F = np.array([234.4026, 4.496996, 10.79903, 0.1317223, np.nan])
        PrF = np.array([1.037224e-20, 0.03883841, 0.0001228716, 0.8768817, np.nan])

        results = anova_lm(anova_iii, typ="III", robust="hc3")
        np.testing.assert_equal(results["df"].values, Df)
        # np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
Example #23
    def test_results(self):
        data = self.data.drop([0, 1, 2])
        anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum) - 1", data).fit()

        Sum_Sq = np.array([154.7131692, 13.27205, 0.1905093, 27.60181])
        Df = np.array([2, 2, 2, 51])
        F_value = np.array([142.9321191, 12.26141, 0.1760025, np.nan])
        PrF = np.array([1.238624e-21, 4.487909e-05, 0.8391231, np.nan])

        results = anova_lm(anova_ii, typ="II")
        np.testing.assert_equal(results["df"].values, Df)
        np.testing.assert_almost_equal(results["sum_sq"].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results["F"].values, F_value, 4)
        np.testing.assert_almost_equal(results["PR(>F)"].values, PrF)
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.
    
    Twenty-two patients undergoing cardiac bypass surgery were randomized to one of three ventilation groups:
    
    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen for 24 h.
    
    The data show red cell folate levels for the three groups after 24 h of ventilation.
    
    '''
    
    # Get the data
    print('One-way ANOVA: -----------------')
    inFile = 'altman_910.txt'
    data = np.genfromtxt(inFile, delimiter=',')
    
    # Sort them into groups, according to column 1
    group1 = data[data[:,1]==1,0]
    group2 = data[data[:,1]==2,0]
    group3 = data[data[:,1]==3,0]
    
    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W,p) = stats.levene(group1, group2, group3)
    if p<0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))
    
    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---
    
    # Print the results
    print('Data from Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')
        
    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])    
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)
    
    # Check if the two results are equal. If they are, there is no output
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'][0])
    
    return (F_statistic, pVal) # should be (3.711335988266943, 0.043589334959179327)
def do_ANOVA(data):
    '''4.3.2. Perform an ANOVA on the data'''
    
    print('ANOVA: ----------------------------------------------')
    
    # First, I fit a statistical "ordinary least square (ols)"-model to the data, using the
    # formula language from "patsy". The formula 'weight ~ C(group)' says:
    # "weight" is a function of the categorical value "group"
    # and the data are taken from the DataFrame "data", which contains "weight" and "group"
    model = ols('weight ~ C(group)', data).fit()
    
    # "anova_lm" (where "lm" stands for "linear model") extracts the ANOVA-parameters
    # from the fitted model.
    anovaResults = anova_lm(model)
    print(anovaResults)
    
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')
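
# A small sketch of what the formula language described above does under the hood: the patsy
# library behind statsmodels' formula interface expands 'weight ~ C(group)' into an intercept
# column plus treatment-coded dummy columns for "group" (toy data assumed, not the original dataset).
import pandas as pd
from patsy import dmatrix

toy = pd.DataFrame({'weight': [4.2, 4.8, 5.1, 5.6],
                    'group': ['ctrl', 'ctrl', 'trt', 'trt']})
print(dmatrix('C(group)', toy))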
Example #26
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.'''
    
    # Get the data
    data = getData('altman_12_6.txt', subDir=r'..\Data\data_altman')
    
    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])
    
    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    return anovaResults['F'][0]
def anova_interaction():
    """ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.
    """

    # Get the data
    data = getData("altman_12_6.txt", subDir=r"..\Data\data_altman")

    # Bring them in DataFrame-format
    df = pd.DataFrame(data, columns=["hs", "fetus", "observer"])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = "hs ~ C(fetus) + C(observer) + C(fetus):C(observer)"
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    return anovaResults["F"][0]
def ancova(data_covar_lastDV):
    """
    ANCOVA for when you have a numerical covariate to control for. Read more about ANOVA/ANCOVA/etc here: http://www.statsmakemecry.com/smmctheblog/stats-soup-anova-ancova-manova-mancova
    http://elderlab.yorku.ca/~elder/teaching/psyc3031/lectures/Lecture%207%20Analysis%20of%20Covariance%20-%20ANCOVA%20%28GLM%202%29.pdf  (slide 24)

    :param data_covar_lastDV: data frame containing the independent and dependent variables (the covariate is second to last, the DV is the last column)
    :return: None
    """
    col_names = data_covar_lastDV.columns.values.tolist()  # get the columns' names
    outcome = col_names.pop()  # remove the last item in the list
    covariate = col_names.pop()  # remove the [second to] last item in the list

    fig = plt.figure()
    i = 1

    for cond in col_names:
        cond_table = data_covar_lastDV[[cond, covariate, outcome]].dropna()

        cond_lm = ols(outcome + " ~ " + covariate + " + " + cond, data=cond_table).fit()
        anova_table = anova_lm(cond_lm)

        print("\n"+FORMAT_LINE)
        print("ANCOVA: " + cond + " + " + covariate + " --> " + outcome)
        print(FORMAT_LINE)
        print(anova_table)
        #print(cond_lm.model.data.orig_exog)
        print(cond_lm.summary())

        ax = fig.add_subplot(1, len(col_names), i)
        ax = cond_table.boxplot(outcome, cond, ax=plt.gca())
        ax.set_xlabel(cond)
        ax.set_ylabel(outcome)
        i += 1
    # box plot
    # TODO: need to remove the effect of the covariate before plotting (see the sketch after this function)
    # http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html
    user_input = input(">> Display boxplot of conditions? [y/n]: ")
    if is_yes(user_input):
        fig.tight_layout()
        plt.show()
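
# One possible way to handle the TODO above (a sketch, not the original author's code):
# regress the outcome on the covariate alone, then box-plot the residuals re-centered on the
# grand mean, so the group comparison is shown with the covariate's effect removed.
# Usage inside the loop above could be: plot_covariate_adjusted(cond_table, cond, covariate, outcome)
def plot_covariate_adjusted(data, cond, covariate, outcome):
    import matplotlib.pyplot as plt
    from statsmodels.formula.api import ols

    covariate_fit = ols(outcome + " ~ " + covariate, data=data).fit()
    adjusted = covariate_fit.resid + data[outcome].mean()   # covariate-adjusted outcome values
    plot_df = data.assign(adjusted_outcome=adjusted)
    ax = plot_df.boxplot("adjusted_outcome", cond, ax=plt.gca())
    ax.set_xlabel(cond)
    ax.set_ylabel("adjusted " + outcome)
    plt.show()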
Example #29
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal. '''

    # Get the data
    print('One-way ANOVA: -----------------')
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(
            p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results
    print('Data from Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'][0])

    return (F_statistic,
            pVal)  # should be (3.711335988266943, 0.043589334959179327)
Example #30
    def test_results(self):
        data = self.data.drop([0,1,2])
        anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
                                data).fit()

        Sum_Sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
        Df = np.array([1, 2, 2, 51])
        F = np.array([5.633786, 10.89842, 0.1317223, np.nan])
        PrF = np.array([0.02142223, 0.0001145965, 0.8768817, np.nan])

        results = anova_lm(anova_ii, typ="II", robust="hc3")
        np.testing.assert_equal(results['df'].values, Df)
        #np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
        np.testing.assert_almost_equal(results['F'].values, F, 4)
        np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
# l=[]
# for x in range(80):
#     l.append([1,1])
# for y in range(1):
#     l.append([2,1])
# exog=np.array(l)
# q,r = np.linalg.qr(exog)
# print(np.dot(q,r))
a = np.array([[-0.5, -0.5, -0.5, -0.5], [-0.5, -0.5, 0.5, 0.5]])
b = np.array([2, 2, 2, 2])
print(a, b)
print(np.dot(a, b))
# coh_list=[0,1,3,3]
coh_list = []
for x in range(4):
    coh_list.append(2)
id = []
for x in range(2):
    id.append(2)
for x in range(2):
    id.append(1)

data = {'id': id, 'coherence': coh_list}

df = pd.DataFrame(data)
print(df)
print(ols('coherence ~ C(id)', df).fit())
anova_res = anova_lm(ols('coherence ~ C(id)', df).fit(), typ=1)

print(anova_res)
print(type(anova_res.loc['C(id)']['PR(>F)']))
    fig, ax = plot_data(jobtest_table)
    fig = abline_plot(intercept=lm4.params['Intercept'],
                      slope=lm4.params['TEST'],
                      ax=ax,
                      color='purple')

    fig = abline_plot(intercept=lm4.params['Intercept'] +
                      lm4.params['MINORITY'],
                      slope=lm4.params['TEST'] + lm4.params['TEST:MINORITY'],
                      ax=ax,
                      color='green')
    plt.title("JPERF ~ TEST * TEST:MINORITY")
    plt.show()

    # is there any effect of MINORITY on slope or intercept?
    table = anova_lm(lm, lm4)
    print("TEST vs. TEST * MINORITY")
    print(table)
    print("\n")
    """
    TEST vs. TEST * MINORITY
    ========================
        df_resid        ssr  df_diff    ss_diff         F    Pr(>F)
    0      18.0  45.568297      0.0        NaN       NaN       NaN
    1      16.0  31.655473      2.0  13.912824  3.516061  0.054236
    """

    # is there any effect of MINORITY on slope
    # NOTE: assumption. the slope is the same within each group
    table = anova_lm(lm, lm3)
    print("TEST vs. TEST:MINORITY")
Example #33
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as an
from statsmodels.stats.anova import anova_lm

a = pd.DataFrame({
    "Fertilizer": [100, 200, 300, 400, 500, 600, 700],
    "Rainfall": [10, 20, 10, 30, 20, 20, 30],
    "Yield": [40, 50, 50, 70, 65, 65, 80]
})
result = an.ols(formula="Yield ~ Fertilizer + Rainfall", data=a).fit()
print(result.params)
print(anova_lm(result))
print(result.summary())

fig = plt.figure()
axis = fig.add_subplot(111, projection='3d')
axis.scatter(a['Fertilizer'], a['Rainfall'], a['Yield'], c='r', marker='o')
xx, yy = np.meshgrid(a['Fertilizer'], a['Rainfall'])
exog = pd.core.frame.DataFrame({
    'Fertilizer': xx.ravel(),
    'Rainfall': yy.ravel()
})
out = result.predict(exog=exog)
axis.plot_surface(xx,
                  yy,
                  out.values.reshape(xx.shape),
                  rstride=1,
                  cstride=1)
Example #34
# print(df.columns.tolist())

# 2. Data processing
# 1) Mark the attribute types
catCols = ['季度']
intCols = ['GNP', '失业率', '利率']
target = '销量'

# 2) Feature selection (analysis of covariance)
import statsmodels.formula.api as smf
import statsmodels.stats.anova as sms

cols = catCols + intCols
formula = '{} ~ {}'.format(target, '+'.join(cols))
module = smf.ols(formula, df).fit()
dfanova = sms.anova_lm(module)

cond = dfanova['PR(>F)'] < 0.05
cols = dfanova[cond].index.tolist()
print('Factors with a significant effect:', cols)

# Remove the factors without a significant effect
for col in catCols:
    if col not in cols:
        catCols.remove(col)
for col in intCols:
    if col not in cols:
        intCols.remove(col)

# # 3) For forecasting, the columns need to be shifted/lagged and the first row dropped
# shiftCols = ['GNP','失业率','利率']
    def run(self, dfx, tsy):

        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        xl = len(dfx)
        yl = len(tsy)
        if xl != yl:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (xl, yl))
            msg['error'] = 'The length of the input dfx (%s) does not equal the length of the input tsy (%s)' % (xl, yl)
            return {'result': pd.DataFrame(), 'msg': msg}

        if not isSeries(tsy) or not isCategory(tsy):
            logging.error(
                'input tsy is not a pandas Series or not a category data!')
            msg['error'] = 'The input tsy is not categorical data or not a pandas Series'

            return {'result': pd.DataFrame(), 'msg': msg}

        else:
            x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

            if x_numer_cols == []:
                logging.error(
                    'All input dfx are no numeric columns, Please check your input dfx data!'
                )
                msg['error'] = 'None of the columns in the input dfx are numeric; please check the input data'
                return {'result': pd.DataFrame(), 'msg': msg}

            else:

                if x_cate_cols != []:
                    logging.warning(
                        'input dfx has non-numeric columns: %s, will ignore these columns!'
                        % x_cate_cols)

                    msg['warning'] = 'The input dfx contains non-numeric columns: %s; they will be ignored automatically!' % x_cate_cols

                name = tsy.name

                dfu = dfx[x_numer_cols].join(tsy)
                m = dfu.groupby(name).mean().T
                s = dfu.groupby(name).std().T

                def change(ts):
                    v = []
                    for i in ts.index:
                        r = '%s±%s' % (round(ts.loc[i],
                                             2), round(s[ts.name].loc[i], 2))
                        v.append(r)
                    return pd.Series(v, index=ts.index)

                m1 = m.apply(change)

                rs = []
                for i in x_numer_cols:
                    model = ols('%s ~ %s' % (i, tsy.name), dfu).fit()
                    anovat = anova_lm(model)
                    anovat.columns = ['df', 'sum_sq', 'mean_sq', 'F', 'p-value']
                    rs.append(anovat.iloc[0].to_frame(name=i).T)

                res = m1.join(pd.concat(rs))
                res['p-value'] = res['p-value'].apply(lambda x: '{:.5f}'.format(x))

                return {'result': res.round(5), 'msg': msg}
Example #36
# NOTE: this excerpt starts mid-assignment; the import and the opening of the dict are completed here
import numpy as np
df = {'treat1': list(np.random.normal(15, 5, 100)),
      'treat2': list(np.random.normal(20, 5, 100)),
      'treat3': list(np.random.normal(30, 5, 100)),
      'treat4': list(np.random.normal(31, 5, 100))}
# Combine into a DataFrame
import pandas as pd
df = pd.DataFrame(df)
df.head()

df.boxplot(grid=False)
import matplotlib.pyplot as plt
plt.show()

# Reshape the data so that one column holds the treatment and one the value
df_melt = df.melt()
df_melt.head()

df_melt.columns = ['Treat', 'Value']
df_melt.head()

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
model = ols('Value~C(Treat)', data=df_melt).fit()
anova_table = anova_lm(model, typ=2)
print(anova_table)

import seaborn as sns
sns.boxplot(x='Treat', y='Value', data=df_melt)

# Reference: https://zhuanlan.zhihu.com/p/91031244
Example #37
print(model.summary())

## Add a month column and preprocess the data
data1['month'] = data1['date'].dt.month

'''
Question 2:
At α = 0.05 we can see that
F (2.467) < F_critical, i.e. p (0.124) > 0.05, so the null hypothesis H0 holds: month has no effect on sales volume.
'''
formula2='value~month'
model2=ols(formula2,data=data1).fit()
print(model2.summary())

model2_2= ols(formula2,data1).fit()
anovat=anova_lm(model2_2)
print(anovat)

'''
Question 2:
At α = 0.05 we can see that:
1. Region: p (0.020985) < 0.05, so H0 is rejected; region has a significant effect on sales volume.
2. Month: p (0.094860) > 0.05, so H0 cannot be rejected; month has no significant effect on sales volume.
'''

formula3='value~month + item'
model3=ols(formula3,data=data1).fit()
anovat3=anova_lm(model3)
print(anovat3)

np.random.seed(1)
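# NOTE: 'x' is not defined in this excerpt; an evenly spaced grid is assumed here so the
# snippet runs end to end
x = np.linspace(-5, 5, 20)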
# normal distributed noise
y = -5 + 3*x + 4 * np.random.normal(size=x.shape)
# Create a data frame containing all the relevant variables
data = pd.DataFrame({'x': x, 'y': y})

plt.figure(figsize=(5, 4))
plt.plot(data["x"],data["y"], 'o')

from statsmodels.formula.api import ols
model = ols("y ~ x", data).fit()
print(model.summary()) 

# Perform analysis of variance on the fitted linear model
from statsmodels.stats.anova import anova_lm
anova_results = anova_lm(model)
print(anova_results)

# Retrieve the parameter estimates
beta_0, beta_1 = model.params
plt.plot(x, x*beta_1 + beta_0)
plt.xlabel('x')
plt.ylabel('y')

# data with the linear model depicted; run all lines together
plt.plot(data["x"],data["y"], 'o')
plt.plot(x, x*beta_1 + beta_0,color="black")
plt.xlabel('x')
plt.ylabel('y')

Example #39
def get_fairness_analyses(df,
                          group,
                          system_score_column,
                          human_score_column='sc1',
                          base_group=None):
    """
    Compute fairness analyses described in `Loukina et al. 2019 <https://www.aclweb.org/anthology/W19-4401/>`_.

    The function computes how much variance group membership explains in
    overall score accuracy (osa), overall score difference (osd),
    and conditional score difference (csd). See the paper for more
    details.

    Parameters
    ----------
    df: pandas DataFrame
        A dataframe containing columns with numeric human scores,
        columns with numeric system scores and a column with
        group membership.
    group: str
        Name of the column containing group membership.
    system_score_column: str
        Name of the column containing system scores.
    human_score_column: str
        Name of the column containing human scores.
    base_group: str, optional
        Name of the group to use as the reference category.
        Defaults to ``None`` in which case the group with the
        largest number of cases will be used as the reference
        category. Ties are broken alphabetically.

    Returns
    -------
    model_dict: dictionary
        A dictionary with different proposed metrics as keys
        and fitted models as values.
    fairness_container: DataContainer
        A datacontainer with the following datasets:

         - "estimates_<METRIC>_by_<GROUP>" where "<GROUP>" corresponds to
           the given group and "<METRIC>" can be "osa", "osd" and "csd" estimates
           for each group computed by the respective models.
         - "fairness_metrics_by_<GROUP>" - a summary of model fits (R2 and
           p values).
    """
    # compute error and squared error
    df['error'] = df[system_score_column] - df[human_score_column]
    df['SE'] = df['error']**2

    # convert group values to category and reorder them using
    # the largest category as reference

    df['group'] = convert_to_ordered_category(df[group], base_group=base_group)
    base_group = df['group'].cat.categories[0]

    df['sc1_cat'] = convert_to_ordered_category(df[human_score_column])

    # Overall score accuracy (OSA)
    # Variance in squared error explained by L1

    # fit the model
    osa_model = smf.ols(formula='SE ~ group', data=df)
    osa_fit = osa_model.fit()

    # collect the results
    osa_dict = {'R2': osa_fit.rsquared_adj, 'sig': osa_fit.f_pvalue}

    osa_results = pd.Series(osa_dict, name='Overall score accuracy')

    df_coefficients_osa = get_coefficients(osa_fit, base_group)

    # Overall score difference (OSD)
    # variance in signed residuals (raw error) explained by L1

    # fit the model
    osd_model = smf.ols(formula='error ~ group', data=df)
    osd_fit = osd_model.fit()

    # collect the results
    osd_dict = {'R2': osd_fit.rsquared_adj, 'sig': osd_fit.f_pvalue}

    osd_results = pd.Series(osd_dict, name='Overall score difference')

    df_coefficients_osd = get_coefficients(osd_fit, base_group)

    # conditional score difference CSD
    # Variance in score difference conditioned on Native language

    # fit "null" model with human score only
    csd_null_mod = smf.ols(formula='error ~ sc1_cat', data=df)
    csd_null_fit = csd_null_mod.fit()

    # fit model with both human score and group
    csd_mod = smf.ols(formula='error ~ group + sc1_cat', data=df)
    csd_fit = csd_mod.fit()

    # compare the two models using anova_lm
    # we filter warnings for this function because we get
    # runtime warning due to NaNs in the data.
    # these seem to be by design: https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
    np.warnings.filterwarnings('ignore')
    anova_results = anova_lm(csd_null_fit, csd_fit)
    # we reset warnings
    np.warnings.resetwarnings()

    # collect the results. Note that R2 in this case is a difference
    # in R2 between the two models and significance is obtained from anova
    csd_dict = {
        'R2': csd_fit.rsquared_adj - csd_null_fit.rsquared_adj,
        'sig': anova_results.values[1][-1]
    }
    csd_results = pd.Series(csd_dict, name='Conditional score difference')

    df_coefficients_csd = get_coefficients(csd_fit, base_group)

    # create a summary table

    df_r2_all = pd.concat([osa_results, osd_results, csd_results],
                          axis=1,
                          sort=True)
    df_r2_all['base_category'] = base_group

    # assemble all datasets into a DataContainer

    datasets = [{
        'name': 'estimates_osa_by_{}'.format(group),
        'frame': df_coefficients_osa
    }, {
        'name': 'estimates_osd_by_{}'.format(group),
        'frame': df_coefficients_osd
    }, {
        'name': 'estimates_csd_by_{}'.format(group),
        'frame': df_coefficients_csd
    }, {
        'name': 'fairness_metrics_by_{}'.format(group),
        'frame': df_r2_all
    }]

    # assemble all models into a dictionary
    model_dict = {'osa': osa_fit, 'osd': osd_fit, 'csd': csd_fit}

    return model_dict, DataContainer(datasets=datasets)
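
# A stripped-down sketch of the CSD comparison performed above (toy data with hypothetical
# column names, not from the original package): the contribution of 'group' is evaluated by
# an F-test of the full model against the score-only model.
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

toy = pd.DataFrame({
    "error":   [0.2, -0.1, 0.4, 0.0, -0.3, 0.5, 0.1, -0.2],
    "sc1_cat": ["2", "2", "3", "3", "2", "3", "2", "3"],
    "group":   ["g1", "g1", "g1", "g1", "g2", "g2", "g2", "g2"],
})
null_fit = smf.ols("error ~ sc1_cat", data=toy).fit()
full_fit = smf.ols("error ~ sc1_cat + group", data=toy).fit()
print(anova_lm(null_fit, full_fit))   # the first row holds NaNs by design, as noted above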
Example #40
def _oneway_anova(table, response_cols, factor_col):
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col,
                         y=response_col,
                         data=table,
                         order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        fig_box = plt2MD(plt)
        plt.clf()

        model = ols(
            """Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
                response_col=response_col, factor_col=factor_col),
            table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)

        index_list = anova.index.tolist()
        remove_list = ["C(Q('", "'))", "Q('", "')"]
        for v in remove_list:
            index_list = [i.replace(v, "") for i in index_list]
        anova.insert(0, '', index_list)

        anova_df = pandasDF2MD(anova)

        p_value = anova["""PR(>F)"""][0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        | 
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   fig_box=fig_box,
                   anova_df=anova_df,
                   distplot=distplot,
                   qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
Example #41
#
# <b>Reject $H_0$ (reject the null hypothesis)</b>
#
# so there is a significant difference between the proportion of smokers in different genders

# #### d. Is the distribution of bmi across women with no children, one child and two children, the same ?

# In[223]:

filter1 = data[data["sex"] == "female"]
filter2 = filter1[filter1["children"].cat.codes < 3]
filter2["children"] = pd.to_numeric(filter2["children"])
filter2["children"] = pd.Categorical(filter2["children"])
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
formula = "bmi ~ C(children)"
model = ols(formula, filter2).fit()
aov_table = anova_lm(model)
print(aov_table)

# $H_0:$ The mean bmi is the same for women with 0, 1, 2 children, i.e. $\mu_1 = \mu_2 = \mu_3$
#
# $H_1$: The mean bmi is not the same for all three groups, i.e. not all of $\mu_1, \mu_2, \mu_3$ are equal
#
# The ANOVA p-value (0.715) is greater than 0.05, so
#
# <b>we fail to reject $H_0$ (the null hypothesis)</b>
#
# The distribution of bmi across women with 0, 1, 2 children is similar,
# i.e. $\mu_1 = \mu_2 = \mu_3$
Example #42
least_error = np.finfo(np.float64).max
featurewithLeastSimple = ""

#Scale all data between 0 and 1
df3 = input_train[Columnlist]
tempcolumnnames = df3.columns
x_scaled = min_max_scaler.fit_transform(df3)
df3 = pd.DataFrame(x_scaled)
df3.columns = tempcolumnnames

#Simple regression
for col in Columnlist:
    x = input_train[col]
    Linearmodel = ols("y ~ x", x).fit()
    offset, coef = Linearmodel.params
    anova_results = anova_lm(Linearmodel)
    if (least_error > anova_results['mean_sq'][1]):
        least_error = anova_results['mean_sq'][1]
        featurewithLeastSimple = col
print('\nANOVA results')
print(anova_results)
plt.plot(x, x * coef + offset)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
print('Best Feature was: ' + featurewithLeastSimple)

#multiple feature regression
multipleReg = LinearRegression()
multipleReg.fit(df3, y)
print('Weights are : ')
Example #43
def Correlations_Cont_Cat(p_data,
                          p_predictors,
                          p_numeric_cat_index=np.array([]),
                          p_weight=None,
                          p_p_val=0.01,
                          p_subsamplesize=100,
                          p_seed=0):
    """ Use ANOVA to find categorical - continuous relationships. Small differences come through
        as significant with a high number of observations, therefore we use a sample size of 100.
        Also keep in mind that by using ANOVA we assume normally distributed data and equal variances;
        an alternative is to use Kruskal-Wallis (see the sketch after this function).
        Use ICC to define correlations, give box-plots for highly correlated pairs. """
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    cont_index = np.intersect1d(p_predictors,
                                ContCatSplit(p_data, p_numeric_cat_index)[0])
    cat_index = np.intersect1d(p_predictors,
                               ContCatSplit(p_data, p_numeric_cat_index)[1])

    # List of pairs along with correlation above threshold
    cont_cat_corr_list = []

    from random import sample, seed
    seed(p_seed)
    rand_vals = sample(range(len(p_data)), k=p_subsamplesize)

    # Search for the highly correlated pairs
    for i in cont_index:
        for j in cat_index:
            formula = p_data.columns[i] + " ~ " + p_data.columns[j]
            model_fit = ols(formula, data=p_data.iloc[rand_vals, :]).fit()
            anova_model = anova_lm(model_fit)
            p = anova_model.iloc[0, 4]
            if p < p_p_val:
                cont_cat_corr_list.append(
                    [p, i, j])  #store correlation and columns index

    # Order variables by level of correlation
    s_cont_cat_corr_list = sorted(cont_cat_corr_list, key=lambda x: abs(x[0]))

    cont_cat_corr_features = []
    # Print correlations and column names
    print('One-way ANOVA p-values - Predictors')
    for v, i, j in s_cont_cat_corr_list:
        cont_cat_corr_features.append(
            [p_data.columns[i], p_data.columns[j], v])
        print('{} and {} = {:.2}'.format(p_data.columns[i], p_data.columns[j],
                                         v))

    # Box plot of the highly correlated pairs
    for v, i, j in s_cont_cat_corr_list:
        fg, ax = plt.subplots(figsize=(12, 8))
        fg = p_data.boxplot(p_data.columns[i],
                            p_data.columns[j],
                            ax=ax,
                            grid=False)
        plt.xticks(rotation=90)
        plt.show()

    return cont_cat_corr_features
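
# As the docstring above notes, a distribution-free alternative to the per-pair ANOVA is the
# Kruskal-Wallis H-test; a sketch of the equivalent check for one continuous/categorical pair
# (toy data assumed; the chi-square approximation is usually recommended for >= 5 values per group).
import pandas as pd
from scipy import stats

toy = pd.DataFrame({"value": [1.2, 1.4, 1.1, 2.8, 3.1, 2.9, 2.0, 2.2, 2.1],
                    "cat":   ["a", "a", "a", "b", "b", "b", "c", "c", "c"]})
groups = [grp["value"].values for _, grp in toy.groupby("cat")]
H, p = stats.kruskal(*groups)
print("Kruskal-Wallis H = %.3f, p = %.3f" % (H, p))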
def neuron_period_activity_analysis(
    hp,
    log,
    trial_list,
    model_dir,
    rule,
    seltive_epoch,
    analy_epoch,
    n_types=('exh_neurons', 'mix_neurons'),
    norm=True,
    PSTH_log=None,
    last_step=True,
    bin_wid=0.5,
):

    print("Start neuron period activity analysis")
    print(
        "\trule: " + rule + " selective epoch: " + seltive_epoch +
        " analysis epoch: ", analy_epoch)

    with open(model_dir + '/task_info.pkl', 'rb') as tinf:
        task_info = pickle.load(tinf)

    if PSTH_log is None:
        PSTH_log = gen_PSTH_log(hp,
                                trial_list,
                                model_dir,
                                rule,
                                seltive_epoch,
                                n_types=n_types,
                                norm=norm)

    if isinstance(analy_epoch, str):
        start = task_info[rule]['epoch_info'][analy_epoch][0]
        end = task_info[rule]['epoch_info'][analy_epoch][1]
    elif isinstance(analy_epoch, (tuple, list)):
        start = int(analy_epoch[0] / hp["dt"])
        end = int(analy_epoch[1] / hp["dt"])
    else:
        raise ValueError('Wrong analy_epoch format!')

    is_dict = False
    is_list = False
    if isinstance(trial_list, dict):
        temp_list = list()
        is_dict = True
        for value in trial_list[rule].values():
            temp_list += value
        temp_list = sorted(set(temp_list))
    elif isinstance(trial_list, list):
        temp_list = trial_list
        is_list = True

    trial_sort_by_matur = dict()
    fire_rate_dict = dict()

    for trial_num in temp_list:
        growth = log['perf_' + rule][trial_num // log['trials'][1]]
        if (is_list and growth > hp['mid_target_perf']) or (
                is_dict and trial_num in trial_list[rule]['mature']):
            mature = 'mature'
        elif (is_list and growth > hp['early_target_perf']) or (
                is_dict and trial_num in trial_list[rule]['mid']):
            mature = 'mid'
        elif is_list or (is_dict and trial_num in trial_list[rule]['early']):
            mature = 'early'

        if mature not in trial_sort_by_matur:
            trial_sort_by_matur[mature] = list()
            fire_rate_dict[mature] = list()
        trial_sort_by_matur[mature].append(trial_num)

    if last_step:
        for key, value in trial_sort_by_matur.items():
            trial_sort_by_matur[key] = value[-1:]

    for mature_key, sub_trial_list in trial_sort_by_matur.items():
        for trial_num in sub_trial_list:
            fire_rate_dict[mature_key] += list(
                PSTH_log[trial_num][:, start:end].mean(axis=1))

    #ANOVA#
    #f,p = stats.f_oneway(*list(fire_rate_dict.values()))

    dict_melt = dict()
    dict_melt['Maturation'] = list()
    dict_melt['Fire_rate'] = list()
    for key, value in fire_rate_dict.items():
        dict_melt['Maturation'] += [key for i in range(len(value))]
        dict_melt['Fire_rate'] += list(value)

    df_melt = pd.DataFrame(dict_melt)
    model = ols('Fire_rate~C(Maturation)', data=df_melt).fit()
    anova_table = anova_lm(model, typ=2)

    p = anova_table['PR(>F)'][0]
    df_g = anova_table['df'][0]
    df_res = anova_table['df'][1]

    #print("\tP value:",anova_table['PR(>F)'][0])

    # plot #
    colors = {'early': 'green', 'mid': 'blue', 'mature': 'red'}
    save_path = 'figure/figure_' + model_dir.rstrip('/').split('/')[
        -1] + '/' + rule + '/' + seltive_epoch + '/' + '_'.join(n_types) + '/'
    fig, axes = plt.subplots(2, 1, figsize=(12, 15))
    for mature, fire_rate in fire_rate_dict.items():
        axes[0].hist(fire_rate,bins=int(max(fire_rate)/bin_wid)+1,histtype="step",alpha=0.6,\
            color=colors[mature],label=mature+' mean:%.3f'%(np.mean(fire_rate)),density=1)
    axes[0].legend()
    axes[0].set_xlabel("activity")

    m_keys = list(fire_rate_dict.keys())
    axes[1].boxplot([fire_rate_dict[m_key] for m_key in m_keys], labels=m_keys)
    axes[1].set_ylabel("activity")

    fig.suptitle("rule: "+rule+" selective epoch: "+seltive_epoch+" analysis epoch: "+str(analy_epoch)+\
        "\n p value: %.3e"%(p)+" group df: %.1f"%(df_g)+" residual df: %.1f"%(df_res))

    if isinstance(analy_epoch, str):
        plt.savefig(save_path + rule + '_' + analy_epoch +
                    '_activity_oneway_anova_analysis.png')
        plt.savefig(save_path + rule + '_' + analy_epoch +
                    '_activity_oneway_anova_analysis.pdf')
    elif isinstance(analy_epoch, (tuple, list)):
        plt.savefig(save_path + rule + '_' + str(analy_epoch[0]) + '_' +
                    str(analy_epoch[1]) +
                    '_activity_oneway_anova_analysis.png')
        plt.savefig(save_path + rule + '_' + str(analy_epoch[0]) + '_' +
                    str(analy_epoch[1]) +
                    '_activity_oneway_anova_analysis.pdf')
Example #45
        , "FilterImg": FilterImg
        , "ResultImg": ResultImg
    }
)

print(degf)
## Apply regression formula and draw result  in 3d graphs
Reg4 = ols(formula="ResultImg ~ NoisedImg + FilterImg", data=degf)
Fit4 = Reg4.fit()
print(Fit4.summary())
print(Fit4.params)
print(Fit4.fittedvalues)
print(Fit4.resid)
print(Fit4.bse)
print(Fit4.centered_tss)
print(anova_lm(Fit4))
fg = plt.figure()
ax = fg.add_subplot(111, projection="3d")
ax.scatter(
    degf["NoisedImg"]
    , degf["FilterImg"]
    , degf["ResultImg"]
    , color="Blue"
    , marker="+"
    , alpha=0.5
)
##Draw the axis for all values with alpha values changes
ax.set_xlabel("NoisedImg")
ax.set_ylabel("FilterImg")
ax.set_zlabel("ResultImg")
x_surf = numpy.arange(110, 700, 40)
Example #46
            continue            
        genedata = pd.DataFrame({"expr": expr_vals, "SNP": snp_genotypes, "STR": str_genotypes})
        genedata = genedata[~np.isnan(genedata["STR"]) & ~np.isnan(genedata["SNP"]) & ~np.isnan(genedata["expr"])]
        # Remove outlier STR genotypes
        gtcounts = genedata.groupby("STR", as_index=False).agg({"SNP": len})
        keepgt = set(gtcounts[gtcounts["SNP"]>=args.mingt]["STR"])
        genedata = genedata[genedata["STR"].apply(lambda x: x in keepgt)]
#        print("STR r: %s"%scipy.stats.pearsonr(genedata["STR"], genedata["expr"])[0])
#        print("SNP r: %s"%scipy.stats.pearsonr(genedata["SNP"], genedata["expr"])[0])
#        print(genedata.groupby("STR", as_index=False).agg({"SNP": len}))
        # Normalize
        genedata["STR"] = ZNorm(genedata["STR"])
        genedata["SNP"] = ZNorm(genedata["SNP"])
        genedata["expr"] = ZNorm(genedata["expr"])
        formula_snpstr = "expr ~ STR+SNP"
        formula_snp = "expr ~ SNP"
        try:
            lm_snpstr = ols(formula_snpstr, genedata).fit()
        except:
            PROGRESS("Error running snpstr model for gene: %s"%gene)
            continue
        try:
            lm_snp = ols(formula_snp, genedata).fit()
        except:
            PROGRESS("Error running SNP only model for gene: %s"%gene)
            continue            
        anova_results = anova_lm(lm_snp, lm_snpstr)
        pval = anova_results["Pr(>F)"].values[1]
        outitems = [gene, args.chrom+":"+str(str_pos), args.chrom+":"+str(snp_pos), pval]
        outf.write("\t".join([str(item) for item in outitems])+"\n")
Example #47
data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv")
data.head()

from statsmodels.sandbox.stats.runs import mcnemar

crosstab = pd.crosstab(data['BEFORE'],data['AFTER'])
x2, p = mcnemar(crosstab, correction=False)
print('Chi-square=%1.2f, p = %1.2f'%(x2, p))

## Independent *t*-test

vowels = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv")
vowels.head()

t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])
print("t-score=%1.2f, p=%1.2f"%(t,p))

## One-way ANOVA

data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/05-2_reactiontimes.csv")
data

data = data.dropna()

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

model = ols('RT ~ FAMILIARITY', data).fit()
aov = anova_lm(model)
print(aov)
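
## Post-hoc comparisons (added sketch)

# If the omnibus ANOVA above is significant, pairwise Tukey HSD comparisons (also used in a
# later example) show which FAMILIARITY levels differ; the column names follow the file loaded above.
from statsmodels.stats.multicomp import pairwise_tukeyhsd

print(pairwise_tukeyhsd(data['RT'], data['FAMILIARITY']).summary())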
Example #48
]]
display(frame1, frame2)

# %%
frame = pd.concat([frame1, frame2])
frame.index = range(len(frame))
frame['cv'] = frame['cv'].map(lambda e: str(e))
display(frame)

# %%
for value in ['f1score', 'recall', 'precision']:
    print('-' * 80)
    print(value)
    formula = f'{value} ~ subject + method + cv'
    model = ols(formula, data=frame).fit()
    anova = anova_lm(model)
    anova.to_html(f'anova_{value}.html')
    display(anova)

# %%
newframe = frame.groupby(['method', 'subject']).mean()

display(newframe)

plt.style.use('ggplot')
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for j, value in enumerate(['recall', 'precision', 'f1score']):
    ax = axes[j]
    ax.boxplot([newframe.loc['raw'][value], newframe.loc['manifolder'][value]],
import os
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy.stats import shapiro
from variables import DIR_OUT, DIR_JAM
import numpy as np

if __name__ == "__main__":
    df = pd.read_csv(
        os.path.join(DIR_OUT, "derived_tables",
                     "nb_streamlines_hemi_level.csv"))
    df["PP_CS_Depth_Normalised"] = df["PP_CS_Depth"] / df["Max_Geo_Depth"]
    # df['Roi_Area_Normalised']
    # df.to_csv(os.path.join(DIR_OUT,'derived_tables','nb_streamlines_hemi_level_norm.csv'), index=False)
    # model = smf.ols('Nb_Streamlines_Hemi ~ PP_CS_Depth_Normalised + C(Hemisphere) + C(HandednessQ) + Roi_Area',data=df).fit()
    model = smf.ols("PP_CS_Depth_Normalised ~ C(Hemisphere)", data=df).fit()
    resid = model.resid
    W, p = shapiro(resid)
    print(W, p)
    summary = model.summary()
    print(summary)
    anova = anova_lm(model, typ=3)
    print(anova)
Beispiel #50
0
from statsmodels.stats.anova import anova_lm
from pandas import Series, DataFrame

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

hs = [1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00, 9.00, 2.00]
fetus = [1.00, 2.00, 1.00, 2.00, 1.00, 2.00, 2.00, 2.00, 1.00, 1.00]
observer = [1.00, 2.00, 3.00, 2.00, 1.00, 2.00, 4.00, 4.00, 1.00, 1.00]
data = {'hs': hs, 'fetus': fetus, 'observer': observer}
df = DataFrame(data)
# print frame
import json
formula = 'hs ~ fetus'
anova_results = anova_lm(ols(formula, df).fit())
print(anova_results)

formula = 'hs~C(fetus)+C(observer)+C(fetus):C(observer)'
anova_results = anova_lm(ols(formula, df).fit())

print(anova_results)

hsd = pairwise_tukeyhsd(hs, fetus)
print(hsd.summary())

valsDict_rs = {}
# rowKey = colKey, codeID = cols[5], organID = cols[1], examID = cols[6], examTime = cols[2].replace("D", "")
valsDict_rs['rowKey'] = "rowKey6"
valsDict_rs['codeID'] = "codeID4"
valsDict_rs['organID'] = "organID3"
Beispiel #51
0
plt.savefig(PATH + 'grid_of_age.png', dpi=300)
plt.close()

# ANOVA
X1 = PolynomialFeatures(degree=1).fit_transform(wage[['age']])
X2 = PolynomialFeatures(degree=2).fit_transform(wage[['age']])
X3 = PolynomialFeatures(degree=3).fit_transform(wage[['age']])
X4 = PolynomialFeatures(degree=4).fit_transform(wage[['age']])
X5 = PolynomialFeatures(degree=5).fit_transform(wage[['age']])
poly1 = sm.GLS(wage['wage'], X1).fit()
poly2 = sm.GLS(wage['wage'], X2).fit()
poly3 = sm.GLS(wage['wage'], X3).fit()
poly4 = sm.GLS(wage['wage'], X4).fit()
poly5 = sm.GLS(wage['wage'], X5).fit()
# ANOVA, as in chapter 3 notebook
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  ## Supress warnings
    print(anova_lm(poly1, poly2, poly3, poly4, poly5))

# Polynomial regression of degree 5 on orthogonalized X. Refer to chapter 3 notebook.
X5_ortho = ortho_poly_fit(wage[['age']], degree=5)[0]
X5_ortho[:, 0] = 1  # Replace constant column with 1s for Intercept estimation.
poly5_ortho = sm.GLS(wage['wage'], X5_ortho).fit()
print(poly5_ortho.summary())
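ortho_poly_fit comes from the chapter 3 notebook and is not shown here; a minimal sketch of a QR-based orthogonal polynomial basis in the spirit of R's poly(), assuming only the returned basis matrix (index [0]) is used:

import numpy as np

def ortho_poly_fit(x, degree=1):
    # Sketch, not the original helper: orthogonalize 1, x, x^2, ... via QR.
    x = np.asarray(x, dtype=float).flatten()
    X = np.vander(x - x.mean(), degree + 1, increasing=True)  # columns 1, x, x^2, ...
    q, r = np.linalg.qr(X)
    raw = q * np.diag(r)              # rescale the orthonormal columns
    norm2 = (raw ** 2).sum(axis=0)
    Z = raw / np.sqrt(norm2)          # first column is constant; replaced by 1s above
    return Z, norm2, x.mean()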

# Create binary qualitative response
y_clf = (wage.wage > 250).map({False: 0, True: 1})
# Logistic regression
logreg = sm.GLM(y_clf, X4, family=sm.families.Binomial()).fit()
print(logreg.summary())

# Predict on age grid
y_pred_clf = logreg.predict(X_test)
Beispiel #52
0
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# One-way ANOVA
data = pd.read_excel(r'C:\Users\mime\Desktop\统计学学习数据.xlsx', sheet_name=0)
list_value = []
list_variable = []
for i in arange(len(data.columns)):
    x = data.iloc[:, i]
    for value in x:
        list_value.append(value)
        list_variable.append(data.iloc[:, i].name)
data_long = pd.DataFrame([list_variable, list_value], index=['industry', 'Y']).T
formula = 'Y ~ C(industry)'
anova_results = anova_lm(ols(formula, data_long).fit())

mean_data = data.mean(axis=1)
k = len(data.index)
n = (data.count(axis=1)).sum()
mean_all = ((data.sum(axis=1)).sum()) / ((data.count(axis=1)).sum())
SST = (((data - mean_all)**2).sum()).sum()
SSA = float((((mean_data - mean_all)**2).mul(data.count(axis=1),
                                             axis=0)).sum())
SSE = ((data.sub(mean_data, axis=0)**2).sum(axis=1)).sum()
F = (SSA / (k - 1)) / (SSE / (n - k))
F_pval = st.f.sf(F, k - 1, n - k)  # p-value: upper tail of the F distribution
F_alpha = st.f.ppf(1 - 0.05, k - 1, n - k)
R_square = SSA / SST
R = mt.sqrt(SSA / SST)
Beispiel #53
0
#%%
sns.boxplot(x="Typ", y="Druckfestigkeit", data=df)
plt.xlabel("Typ")
plt.ylabel("Druckfestigkeit")
plt.show()

#%% b)

fit = ols("Druckfestigkeit~Typ", data=df).fit()
fit.params

#%% c)

# H_0: mu_1 = mu_2 = mu_3 = mu_4
from statsmodels.stats.anova import anova_lm
anova_lm(fit)

#%% 10.2

from pandas import DataFrame
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
df = DataFrame({
    "Behandlung":
    np.repeat(["A", "B", "C", "D"], [4, 6, 6, 8]),
    "Koagulationszeit": [
        62, 60, 63, 59, 63, 67, 71, 64, 65, 66, 68, 66, 71, 67, 68, 68, 56, 62,
        60, 61, 63, 64, 63, 59
Beispiel #54
0
def two_way_anova(pdf, var_name, grouping_names):
    """Two-way ANOVA

    Arguments:
    pdf (pd dataframe)
    var_name (str):
    grouping_names (list of str):
    """
    # TODO extend it to multi-way ANOVA
    text_result = ''

    # http://statsmodels.sourceforge.net/stable/examples/generated/example_interactions.html#one-way-anova
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    data = pdf.dropna(subset=[var_name] + grouping_names)
    # from IPython import embed; embed()
    # FIXME If there is a variable called 'C', then patsy is confused whether C is the variable or the categorical variable
    # http://gotoanswer.stanford.edu/?q=Statsmodels+Categorical+Data+from+Formula+%28using+pandas%
    # http://stackoverflow.com/questions/22545242/statsmodels-categorical-data-from-formula-using-pandas
    # http://stackoverflow.com/questions/26214409/ipython-notebook-and-patsy-categorical-variable-formula
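    # Hypothetical illustration of that clash: if a column were literally named 'C',
    # patsy's quoting helper Q() could reference the data column while C() stays
    # reserved for marking categorical terms, e.g.
    #     ols("value ~ Q('C') + C(group)", data=data).fit()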
    anova_model = ols(str('%s ~ C(%s) + C(%s) + C(%s):C(%s)' %
                          (var_name, grouping_names[0], grouping_names[1],
                           grouping_names[0], grouping_names[1])),
                      data=data).fit()
    anova_result = anova_lm(anova_model, typ=3)
    text_result += _('Result of two-way ANOVA:' + '\n')
    # Main effects
    for group_i, group in enumerate(grouping_names):
        text_result += _('Main effect of %s: ' % group) + '<i>F</i>(%d, %d) = %0.3g, %s\n' % \
                       (anova_result['df'][group_i+1], anova_result['df'][4], anova_result['F'][group_i+1],
                        cs_util.print_p(anova_result['PR(>F)'][group_i+1]))
    # Interaction effects
    text_result += _('Interaction of %s and %s: ') % (grouping_names[0], grouping_names[1]) + '<i>F</i>(%d, %d) = %0.3g, %s\n' % \
                   (anova_result['df'][3], anova_result['df'][4], anova_result['F'][3], cs_util.print_p(anova_result['PR(>F)'][3]))
    """ # TODO
    # http://en.wikipedia.org/wiki/Effect_size#Omega-squared.2C_.CF.892
    omega2 = (anova_result['sum_sq'][0] - (anova_result['df'][0] * anova_result['mean_sq'][1])) / (
                (anova_result['sum_sq'][0] + anova_result['sum_sq'][1]) + anova_result['mean_sq'][1])
    text_result += _('Effect size: ') + '&omega;<sup>2</sup> = %0.3g\n' % omega2
    """
    """ # TODO
    # http://statsmodels.sourceforge.net/stable/stats.html#multiple-tests-and-multiple-comparison-procedures
    if anova_result['PR(>F)'][0] < 0.05:  # post-hoc
        post_hoc_res = sm.stats.multicomp.pairwise_tukeyhsd(np.array(data[var_name]), np.array(data[grouping_name]),
                                                            alpha=0.05)
        text_result += '\n' + _(u'Groups differ. Post-hoc test of the means.') + '\n'
        text_result += ('<fix_width_font>%s\n<default>' % post_hoc_res).replace(' ', u'\\u00a0')
        ''' # TODO create our own output
        http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.TukeyHSDResults.html#statsmodels.sandbox.stats.multicomp.TukeyHSDResults
        These are the original data:
        post_hoc_res.data
        post_hoc_res.groups

        These are used for the current output:
        post_hoc_res.groupsunique
        post_hoc_res.meandiffs
        post_hoc_res.confint
        post_hoc_res.reject
        '''
    """
    return text_result
Beispiel #55
0
# Visualize the distribution of the data across the groups
#plot_data = [gr1,gr2,gr3]
#plt.boxplot(plot_data)
#plt.show()

f_statistic, p_val = stats.f_oneway(gr1,gr2,gr3)
print('One-way ANOVA result: f_statistic:%f , p_val:%f' % (f_statistic, p_val))
# One-way ANOVA result: f_statistic:3.711336, p_val:0.043589 < 0.05, so the null hypothesis is rejected

# The claim that the test scores differ across the three groups is statistically significant

# One-way ANOVA, method 2 - using a linear model
df = pd.DataFrame(data, columns = ['value','group'])
#print(df)
lmodel = ols('value ~ C(group)', df).fit()  # C(group column): explicitly marks the factor as categorical; PR(>F) = p-value 0.043589
print(anova_lm(lmodel))


# Two-way ANOVA: two grouping factors
url = 'https://raw.githubusercontent.com/pykwon/python/master/testdata_utf8/group3_2.txt'
data = pd.read_csv(url)
print(data.head(3))
print(data.tail(3))

# Null hypothesis: fetal head circumference does not differ by observer or by fetus-count group.
# Alternative hypothesis: fetal head circumference differs by observer and by fetus-count group.

# Visualization
plt.rc('font', family = 'malgun gothic')
data.boxplot(column = '머리둘레' , by='태아수' , grid = True)
#plt.show() # Head circumference appears to differ by fetus count; use ANOVA to test whether there is an interaction with the observer
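The snippet is cut off here; a minimal sketch of the two-way ANOVA with interaction described in the comments above, assuming (hypothetically) that the observer column in group3_2.txt is named '관측자수' (check data.columns before running):

# Sketch only: every column name except '머리둘레' and '태아수' is an assumption.
data2 = data.rename(columns={'머리둘레': 'head', '태아수': 'fetus', '관측자수': 'observer'})
twoway = ols('head ~ C(fetus) + C(observer) + C(fetus):C(observer)', data=data2).fit()
print(anova_lm(twoway, typ=2))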
        #axes[0].yaxis.tick_right()

        for ax in axes.flat:
            ax.margins(0.00)
            ax.grid(True)

        fig.tight_layout(rect=[0, 0.01, 1, 0.97])
        fig.subplots_adjust(wspace=0.0)
        title = '{0} Conv2D Layer {1}'.format(name, layer_id + 1)

        plt.suptitle(title, x=0.55, y=1.0)
        if not os.path.exists('./feat_plots'):
            os.mkdir('feat_plots')
        plt.savefig('./feat_plots/{0}.png'.format(title))
        #fig.savefig("foo.pdf", bbox_inches='tight')
        plt.clf()

    anova_all = np.array(anova_all)
    df = pd.DataFrame(data=anova_all[1:, 1:],
                      index=anova_all[1:, 0].tolist(),
                      columns=anova_all[0, 1:].tolist())
    df.columns = ['y', 'layer_id', 'is_sparse']
    df = df.astype({'y': 'float32', 'layer_id': 'int32', 'is_sparse': 'int32'})
    formula = 'y ~ C(layer_id) + C(is_sparse) + C(layer_id)*C(is_sparse)'
    model = ols(formula, df).fit()
    aov_table = anova_lm(model, typ=1)

    eta_squared(aov_table)
    omega_squared(aov_table)
    print(aov_table)
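eta_squared and omega_squared are not defined in this snippet; a minimal sketch of the usual effect-size helpers they are assumed to be, operating on the anova_lm table:

def eta_squared(aov):
    # Sketch: eta^2 = SS_effect / SS_total, appended as a column to the ANOVA table.
    aov['eta_sq'] = aov['sum_sq'] / aov['sum_sq'].sum()
    return aov

def omega_squared(aov):
    # Sketch: omega^2 = (SS_effect - df_effect * MS_error) / (SS_total + MS_error),
    # where the residual row is assumed to be the last row of the table.
    ms_error = aov['sum_sq'].iloc[-1] / aov['df'].iloc[-1]
    aov['omega_sq'] = (aov['sum_sq'] - aov['df'] * ms_error) / (aov['sum_sq'].sum() + ms_error)
    return aov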
                resid[group.index],
                marker=symbols[j],
                color=colors[i - 1],
                s=144,
                edgecolors='black')
plt.xlabel('Group')
#@savefig residual_groups.png align=center
plt.ylabel('Residuals')

# now we will test some interactions using anova or f_test

interX_lm = ols("S ~ C(E) * X + C(M)", salary_table).fit()
print(interX_lm.summary())

# Do an ANOVA check
table1 = anova_lm(lm, interX_lm)
print(table1)
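The comment above also mentions f_test; a hedged equivalent of the interaction check, with term names that are assumptions (inspect interX_lm.model.exog_names first):

# Jointly test that both education-by-experience interaction terms are zero.
print(interX_lm.f_test("C(E)[T.2]:X = C(E)[T.3]:X = 0"))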

interM_lm = ols("S ~ X + C(E)*C(M)", data=salary_table).fit()
print(interM_lm.summary())

table2 = anova_lm(lm, interM_lm)
print(table2)

# The design matrix as a DataFrame
interM_lm.model.data.orig_exog
# The design matrix as an ndarray
interM_lm.model.exog
interM_lm.model.exog_names

infl = interM_lm.get_influence()
# 3. Levene test for homogeneity of variances
(W, p) = stats.levene(group1, group2, group3)
if p < 0.05:
    print('Warning: the P-value of the Levene test is <0.05: p=', p)

# 4.One-way ANOVA
## 4.1.
F_statistic, pVal = stats.f_oneway(group1, group2, group3)
print((F_statistic, pVal))
if pVal < 0.05:
    print('One of the groups is significantly different.')

## 4.2.
model = ols('value ~ C(treatment)', data).fit()
anovaResults = anova_lm(model)
print(anovaResults)

## 4.3. Compare the two results (no output if they are equal)
np.testing.assert_almost_equal(F_statistic, anovaResults['F'][0])
"""
Post-hoc test """
# 1. Import the libraries
import matplotlib.pyplot as plt
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# 2. Prepare the data
plot_data = [group1, group2, group3]
df = pd.DataFrame(data, columns=["value", "treatment"])

# 3. Tukey post-hoc test
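The snippet is cut off before the actual call; a minimal sketch of the Tukey HSD step it sets up, reusing the df and imports from above:

tukey = pairwise_tukeyhsd(df['value'], df['treatment'], alpha=0.05)
print(tukey.summary())
tukey.plot_simultaneous()  # simultaneous confidence intervals per group
plt.show()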
Beispiel #59
0
    data_frame["X1"]
    , data_frame["X2"]
    , data_frame["Y"]
    , color="blue"
    , marker="o"
    , alpha=1
)


# From here on, the graph code is adapted from classmate "Arsalan Ali"

Reg = ols(formula="Y ~ X1 + X2", data=data_frame)

Fit2 = Reg.fit()
print("\n", Fit2.summary())
print("\n", anova_lm(Fit2))

# Again plotting our Dear 3D-Graph
ax = plt.figure().gca(projection='3d')

# Creating Axis X,Y,Z out of our Data Y,X1,X2
ax.scatter(
    data_frame["X1"]
    , data_frame["X2"]
    , data_frame["Y"]
    , color="blue"
    , marker="o"
    , alpha=1
)

# Title of the graph
Beispiel #60
0
Randomness: the samples are drawn at random
Independence: the samples from different groups are independent of each other
Normality: the samples within each group come from a normal distribution
Homogeneity of variance: the variances of the different groups are equal or similar
"""

# Read the data; d1 corresponds to algorithm a, d2 to algorithm b
df = pd.read_csv("./oneway.csv")
d1 = df[df['algo'] == 'a']['ratio']
d2 = df[df['algo'] == 'b']['ratio']

# Test the normality of the two levels
print('---------------- Normality test for the two levels ----------------')
print(ss.normaltest(d1))
print(ss.normaltest(d2))

# Test the homogeneity of variance of the two levels
print('---------------- Homogeneity-of-variance test for the two levels ----------------')
args = [d1, d2]
print(ss.levene(*args))

# F test, first method
print('---------------- F test, method 1 ----------------')
print(ss.f_oneway(*args))

# F test, second method
print('---------------- F test, method 2 ----------------')
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
print(anovat)