Exemple #1
0
def anova_analysis():
    """Run a one-way ANOVA plus a Tukey HSD post-hoc test for the formula in
    ``var_formula`` and export both result tables to Analysis/ANOVA.xlsx.

    Relies on module-level globals: ``data`` (DataFrame), ``var_formula``
    (widget variable holding a 'dependent~factor' formula), ``multi``,
    ``pd``, ``ols``, ``sm`` and ``os``.
    """
    data_dropped_na = data.dropna()

    # Parse the 'dependent~factor' formula once instead of re-splitting it.
    formula = var_formula.get()
    dependent = formula.split('~')[0]
    factor = formula.split('~')[1]

    mc1 = multi.MultiComparison(data_dropped_na[dependent],
                                data_dropped_na[factor])
    result = mc1.tukeyhsd()
    # tukeyhsd() offers no DataFrame export, so parse its text summary:
    # line 2 holds the column headers, data rows start at line 4.
    t = result.summary().as_text()
    a_list = t.split('\n')
    cols = [c for c in a_list[2].split(' ') if c]
    df = pd.DataFrame(columns=cols)
    for i in range(4, len(a_list) - 1):
        items = [item for item in a_list[i].split(' ') if item]
        df.loc[i - 4] = items

    mod = ols(formula, data=data_dropped_na).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)

    writer = pd.ExcelWriter('Analysis/ANOVA.xlsx')
    caption = pd.DataFrame(columns=[dependent])
    caption.to_excel(writer, sheet_name='Sheet1')
    aov_table.to_excel(writer, sheet_name='Sheet1', startcol=1)
    df.to_excel(writer, sheet_name='Sheet1', startcol=7)
    writer.save()
    # BUG FIX: the original 'Analysis\ANOVA.xlsx' contained the invalid
    # escape sequence '\A' and did not match the path written above.
    os.startfile('Analysis/ANOVA.xlsx')
Exemple #2
0
def test_using_anova(model,
                     model_results,
                     homoskedastic,
                     df,
                     dependent_var,
                     *independent_vars,
                     anova_type=2):
    """
    Generate an ANOVA table for the fitted model and follow it with a
    Tukey HSD multiple comparison, returning both a printable report and
    the raw result objects.
    """
    output = ''
    results = {}

    # Use HC3 robust covariance when the residuals are heteroskedastic.
    robust = None if homoskedastic else 'hc3'
    aov_table = augment_anova_table(
        sm.stats.anova_lm(model_results, typ=anova_type, robust=robust))
    output += f"ANOVA\n{aov_table}\n\n"
    results['anova'] = aov_table

    # Collapse all factor columns into one comma-joined group label per
    # row, then run Tukey's HSD across those composite groups.
    group_labels = df.loc[:, independent_vars].astype(str).agg(','.join,
                                                               axis=1)
    comparison = multicomp.MultiComparison(df[dependent_var], group_labels)
    hsd = comparison.tukeyhsd()
    output += f"Tukey's HSD:\n{hsd}\n\n"
    results['multiple'] = hsd

    return output, results
def bonferroni(item):
    """Pairwise independent t-tests on `item` across Conditions with
    Bonferroni-corrected p-values; prints the summary table.

    Uses the module-level `data` DataFrame, `mc` (statsmodels multicomp)
    and `stats` (scipy.stats).
    """
    subset = data[['Condition', item]].copy()
    subset['Condition'] = subset['Condition'].map(str)

    comparison = mc.MultiComparison(subset[item], subset['Condition'])
    summary_table, _, _ = comparison.allpairtest(stats.ttest_ind,
                                                 method="bonf")
    print(summary_table)
    def tukeys_test(self, Features=None, Clstrs=None, alpha=0.05):
        """
        Tukey's range test comparing the means of all pairs of clusters
        for each requested feature; prints one summary per feature.

        Parameters
        ----------
        Features : 2D_array_like, optional
            Feature columns to test. Defaults to every column of the
            stored data except the last.
        Clstrs : array_like, optional
            Cluster labels to include. Defaults to all unique non-null
            values of the "Clusters" column.
        alpha : float
            Value of FWER at which to calculate HSD.
        """
        if Features is None:
            Features = self.__data.columns[:-1].copy()

        if Clstrs is None:
            cluster_series = self.__data["Clusters"].copy()
            Clstrs = cluster_series.dropna().unique().tolist()

        Clstrs.sort()

        for feature in Features:
            print("\n\n", feature, "\n")
            subset = self.__data[[feature, "Clusters"]].copy().dropna()
            # Restrict the comparison to the requested cluster labels.
            in_clusters = subset["Clusters"].isin(Clstrs)
            comparison = multi.MultiComparison(subset[in_clusters][feature],
                                               subset[in_clusters]["Clusters"])
            print(comparison.tukeyhsd(alpha).summary(), "\n\n")
def tukey(item):
    """Tukey HSD post-hoc comparison of `item` across Conditions.

    Uses the module-level `data` DataFrame and `mc` (statsmodels
    multicomp); prints the summary table.
    """
    modelData = data[['Condition', item]].copy()
    # BUG FIX: the copy only contains the capitalised 'Condition' column,
    # so indexing with lowercase 'condition' raised a KeyError (compare
    # the sibling bonferroni() helper, which uses 'Condition').
    modelData['Condition'] = modelData['Condition'].map(str)

    comp = mc.MultiComparison(modelData[item], modelData['Condition'])
    post_hoc_res = comp.tukeyhsd()
    # The bare `post_hoc_res.summary()` call whose result was discarded
    # has been removed; only the printed summary had any effect.
    print(post_hoc_res.summary())
def compare_many(data):
    """Multiple comparisons: which group is different?

    Runs Tukey's HSD on data['weight'] grouped by data['group'], prints the
    summary table and the unique group names, then plots the pairwise mean
    differences with their confidence intervals.

    NOTE(review): the hand-made plot assumes exactly 3 pairwise comparisons
    (xvals = arange(3), xlim = (-0.5, 2.5)), i.e. exactly 3 groups -- confirm
    against callers.
    """
    
    print('\n MultComp: --------------------------------------')
    
    # An ANOVA is a hypothesis test, and only answers the question: "Are all the groups 
    # from the same distribution?" It does not tell you which one is different.
    # Since we now compare many different groups to each other, we have to adjust the
    # p-values to make sure that we don't get a Type I error. For this, we use the 
    # statsmodels module "multicomp".
    mc = multicomp.MultiComparison(data['weight'], data['group'])
    
    # There are many ways to do multiple comparisons. Here, we choose
    # "Tukey's Honest Significant Difference" test.
    # The summary() gives a table containing the results.
    print(mc.tukeyhsd().summary())
    
    # Show the group names
    print(mc.groupsunique)
    
    # Generate a plot of the pairwise comparisons ----------------
    
    res2 = mc.tukeyhsd()     # Get the data
    
    # Toggle: True uses the built-in one-liner, False builds the plot by hand.
    simple = False
    if simple:
        # You can do the plot with a one-liner, but then this does not - yet - look that great
        res2.plot_simultaneous()
    else:
        # Or you can do it the hard way, i.e. by hand:
        
        # Plot mean differences and their errorbars (half the CI width).
        xvals = np.arange(3)
        plt.plot(xvals, res2.meandiffs, 'o')
        errors = np.ravel(np.diff(res2.confint)/2)
        plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
        
        # Set the x-limits
        xlim = -0.5, 2.5
        # The "*xlim" passes the elements of the variable "xlim" elementwise into the function "hlines"
        plt.hlines(0, *xlim)
        plt.xlim(*xlim)
        
        # Plot labels (this is a bit tricky):
        # "np.array(mc.groupsunique)" holds the group names; indexing it with
        # the stacked pair indices puts the correct group pair on each tick.
        # NOTE(review): res2._multicomp.pairindices is a private statsmodels
        # attribute and may break across versions.
        pair_labels = mc.groupsunique[np.column_stack(res2._multicomp.pairindices)]
        plt.xticks(xvals, pair_labels)
        
        plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
        '\n Pairwise Mean Differences')
    
    plt.show()
Exemple #7
0
def anova_analysis():
    """Run one-way ANOVAs plus Tukey HSD post-hoc tests for each dependent
    variable in the formula (or every continuous column when the formula
    contains '.') and export all tables side by side to
    ../../Analysis/ANOVA.xlsx.

    Relies on module-level globals: ``data``, ``var_formula``, ``multi``,
    ``pd``, ``ols``, ``sm``, ``os`` and ``print_status``.
    """
    formula_text = var_formula.get()
    if formula_text == '':
        print_status("Warning: Formula is missing", 'red')
        return

    data_dropped_na = data.dropna()
    continuous_columns = [
        column for column in data_dropped_na.columns
        if data_dropped_na[column].dtype != 'object'
    ]

    # Parse the 'dep1+dep2~factor' formula once.
    factor = formula_text.split('~')[1]

    # A '.' in the formula means "analyse every continuous column".
    if '.' in formula_text:
        dependent_vars = continuous_columns
    else:
        dependent_vars = formula_text.split('~')[0].split('+')

    col = 1
    writer = pd.ExcelWriter('../../Analysis/ANOVA.xlsx')

    for dependent in dependent_vars:
        mc1 = multi.MultiComparison(data_dropped_na[dependent],
                                    data_dropped_na[factor])
        result = mc1.tukeyhsd()
        # tukeyhsd() offers no DataFrame export, so parse its text summary:
        # line 2 holds the headers, data rows start at line 4.
        # (The comprehension variable is named `c`, not `col`, so it cannot
        # shadow the running column offset.)
        t = result.summary().as_text()
        a_list = t.split('\n')
        cols = [c for c in a_list[2].split(' ') if c]
        df = pd.DataFrame(columns=cols)
        for i in range(4, len(a_list) - 1):
            items = [item for item in a_list[i].split(' ') if item]
            df.loc[i - 4] = items

        mod = ols(dependent + '~' + factor, data=data_dropped_na).fit()
        aov_table = sm.stats.anova_lm(mod, typ=2)

        # Lay the blocks out side by side: caption, ANOVA table, then the
        # Tukey table 7 columns further right; 16 columns per dependent.
        caption = pd.DataFrame(columns=[dependent])
        caption.to_excel(writer, sheet_name='Sheet1', startrow=2, startcol=col)
        aov_table.to_excel(writer,
                           sheet_name='Sheet1',
                           startcol=col,
                           startrow=3)
        df.to_excel(writer, sheet_name='Sheet1', startrow=3, startcol=col + 7)
        col += 16
    writer.save()

    # BUG FIX: the original '../../Analysis\ANOVA.xlsx' mixed path
    # separators and contained the invalid escape sequence '\A'.
    os.startfile('../../Analysis/ANOVA.xlsx')
    print_status('Status: Successful analysis', 'black')
Exemple #8
0
def Stat_Test(df_gene, df_sg):
    comp_groups = raw_input(
        'Choose the groups you want to compare. (e.g. good/bad or good/no ...)'
    )
    tests = raw_input(
        'Which statistical test you want to do? (e.g. unpaired_t_test or mann_whitney_test or fisher_exact_test or mann_whitney_test&fisher_exact_test ...)'
    )
    group_list = comp_groups.split('/')
    test_list = tests.split('&')
    if len(group_list) == 2:
        group1 = group_list[0]
        group2 = group_list[1]
        list_g1 = list(df_sg.loc[df_sg.Group == group1, 'Sample'])
        list_g2 = list(df_sg.loc[df_sg.Group == group2, 'Sample'])
        df_gene = df_gene.loc[:, ['Gene'] + list_g1 + list_g2]
        df_gene[group1 + '_Sum'] = df_gene.loc[:, list_g1].sum(axis=1)
        df_gene[group2 + '_Sum'] = df_gene.loc[:, list_g2].sum(axis=1)
        df_gene = df_gene.loc[(df_gene[group1 + '_Sum'] != 0) &
                              (df_gene[group2 + '_Sum'] != 0), :]
        for t in test_list:
            if t == 'unpaired_t_test':
                df_gene['unpaired_t_test'] = df_gene.apply(
                    lambda row: unpaired_t_test(row, list_g1, list_g2), axis=1)
            elif t == 'mann_whitney_test':
                df_gene['mann_whitney_test'] = df_gene.apply(
                    lambda row: mann_whitney_test(row, list_g1, list_g2),
                    axis=1)
            elif t == 'fisher_exact_test':
                df_gene['fisher_exact_test'] = df_gene.apply(
                    lambda row: fisher_exact_test(row, list_g1, list_g2),
                    axis=1)
            else:
                print 'Please choose from unpaired_t_test, mann_whitney_test and fisher_exact_test'
                sys.exit(1)
    p_cor_flag = raw_input(
        'Do you want to perform mutiple testing correction? (Y|N)')
    if p_cor_flag == 'Y':
        p_cor = raw_input(
            'Which correction method do you want to choose? (e.g. bonferroni or fdr_bh or bonferroni&fdr_bh)'
        )
        p_cor = p_cor.split('&')
        p_val = raw_input(
            'Which p value do you want to correct? (e.g. unpaired_t_test or mann_whitney_test or fisher_exact_test'
        )
    for cor in p_cor:
        df_gene[cor] = ssm.MultiComparison(df_gene[p_val],
                                           alpha=0.05,
                                           method=cor)
    return df_gene
Exemple #9
0
def tukey(data=None, independent=None, dependent=None):
    """Tukey HSD comparison of `dependent` across levels of `independent`.

    Validates the column types first, then displays the summary table and
    the simultaneous confidence-interval plot.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    independent, dependent = str(independent), str(dependent)
    # Bail out when the column check fails (the helper reports the error).
    if input_check_numerical_categorical(data, independent, dependent):
        return

    comparison = multi.MultiComparison(data[dependent], data[independent])
    hsd = comparison.tukeyhsd()
    display(hsd.summary())
    hsd.plot_simultaneous()

    return
Exemple #10
0
def anova(wine_set):
    """One-way ANOVA of total sulfur dioxide across wine quality marks,
    followed by group means/stds and a Tukey HSD post-hoc comparison,
    all printed to stdout."""
    prepared_data = add_categ_quality(wine_set)
    fitted = smf.ols(formula="total_sulfur_dioxide ~ C(quality_mark)",
                     data=prepared_data).fit()
    print(fitted.summary())

    sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]
    grouped = sub.groupby('quality_mark')
    print("\nMeans for total sulfur dioxide by quality marks of wine")
    print(grouped.mean())
    print("\nStandard deviations for total sulfur dioxide by quality marks of wine")
    print(grouped.std(), '\n')

    # Post-hoc test: Tukey's Honestly Significant Difference.
    hsd = multi.MultiComparison(sub['total_sulfur_dioxide'],
                                sub['quality_mark']).tukeyhsd()
    print(hsd.summary())
def anova(wine_set):
    """Print the OLS/ANOVA fit, per-quality-mark means and standard
    deviations, and a Tukey HSD post-hoc summary for total sulfur
    dioxide."""
    prepared_data = add_categ_quality(wine_set)
    model_fit = smf.ols(formula="total_sulfur_dioxide ~ C(quality_mark)",
                        data=prepared_data).fit()
    print(model_fit.summary())

    sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]
    by_mark = sub.groupby('quality_mark')
    print("\nMeans for total sulfur dioxide by quality marks of wine")
    print(by_mark.mean())
    print(
        "\nStandard deviations for total sulfur dioxide by quality marks of wine"
    )
    print(by_mark.std(), '\n')

    comparison = multi.MultiComparison(sub['total_sulfur_dioxide'],
                                       sub['quality_mark'])
    print(comparison.tukeyhsd().summary())
Exemple #12
0
def test_using_kruskal(df,
                       dependent_var,
                       *independent_vars,
                       correction_method='bonf'):
    """
    Test for the significance of factors using the non-parametric
    Kruskal-Wallis test, followed by pairwise Mann-Whitney U tests with
    Bonferroni (or another) correction. Returns a printable report and a
    dict of the raw result objects.
    """
    output = ''
    results = {}

    # One sample of the dependent variable per unique combination of the
    # independent factors.
    unique_values = df.groupby(
        list(independent_vars)).size().reset_index().rename(
            columns={0: 'count'})
    test_data = []
    for row in unique_values.itertuples(index=False):
        if len(independent_vars) > 1:
            # AND together one boolean mask per factor.
            selectors = [(df[v] == getattr(row, v)) for v in independent_vars]
            row_selector = selectors[0]
            for extra in selectors[1:]:
                row_selector = np.logical_and(row_selector, extra)
        else:
            only_var = independent_vars[0]
            row_selector = df[only_var] == getattr(row, only_var)
        test_data.append(df.loc[row_selector, dependent_var])
    assert len(test_data) == unique_values.shape[0]

    test_results = spstats.kruskal(*test_data)
    output += f"Kruskal-Wallis test:\n{test_results.statistic}, {test_results.pvalue}\n\n"
    results['kruskal'] = test_results

    # Pairwise Mann-Whitney U on comma-joined composite group labels;
    # best-effort, so failures are reported rather than raised.
    try:
        group_labels = df.loc[:, independent_vars].astype(str).agg(','.join,
                                                                   axis=1)
        mc = multicomp.MultiComparison(df[dependent_var], group_labels)
        mc_results = mc.allpairtest(spstats.mannwhitneyu,
                                    method=correction_method)
        output += f"Pairwise Mann-Whitney U:\n{mc_results[0]}\n\n"
        results['multiple'] = mc_results[0]
    except Exception as e:
        print("ERROR:", e)

    return output, results
Exemple #13
0
def anova_test(df):
    """One-way ANOVA of call duration across job categories, followed by
    a Tukey HSD post-hoc test.

    Expects `df` to contain 'job' and 'duration' columns; prints all
    results. Uses the module-level aliases `ss` (scipy.stats) and `multi`
    (statsmodels multicomp).
    """
    import pandas as pd  # local import: used only for to_numeric below

    col1 = 'job'
    col2 = 'duration'

    duration_frame = df[[col1, col2]].copy()
    groups = duration_frame.groupby(col1)

    # One duration sample per job category (replaces nine copy-pasted
    # get_group lines).
    job_names = ['admin.', 'blue-collar', 'student', 'housemaid', 'services',
                 'unemployed', 'entrepreneur', 'self-employed', 'retired']
    samples = [groups.get_group(name)['duration'] for name in job_names]

    print('Admin:\n')
    print(samples[0].head())

    F, p = ss.f_oneway(*samples)

    print('{} {}\n'.format(F, p))

    if p < 0.05:
        print(
            'Null hypothesis rejected. Statistical difference found. Conduct post-hoc tests.'
        )
    else:
        print('Not much difference found. Can accept null hypothesis.')

    # Conducting a Post-Hoc test.
    # BUG FIX: DataFrame.convert_objects() was removed from pandas;
    # pd.to_numeric(errors='coerce') is the equivalent lenient conversion
    # (non-numeric entries become NaN instead of raising).
    duration_frame[col2] = pd.to_numeric(duration_frame[col2],
                                         errors='coerce')

    mc = multi.MultiComparison(duration_frame[col2], duration_frame[col1])
    result = mc.tukeyhsd()
    print(result.summary())
Exemple #14
0
    def run_Tukey(self, t, name_list, z):
        """Run Tukey's HSD on the groups in `t`.

        t         : iterable of per-group score arrays.
        name_list : one group label per entry of `t`.
        z         : alpha (FWER) passed to tukeyhsd.
        Returns the statsmodels Tukey HSD results object.
        """
        # Flatten the ragged per-group arrays into one score vector with a
        # parallel vector of group labels.
        scores = []
        labels = []
        for values, group_name in zip(t, name_list):
            scores = np.concatenate((scores, np.reshape(values, len(values))))
            labels.extend([group_name] * len(values))

        frame = pd.DataFrame({'Score': np.asarray(scores),
                              'Group': np.asarray(labels)})
        comparison = ml.MultiComparison(
            pd.to_numeric(frame.Score, errors="ignore"),
            frame.Group)
        return comparison.tukeyhsd(z)
Exemple #15
0
# --- Validate clusters by testing for differences in poker-hand CLASS ---
# Script section; depends on data_test, pokerhand_train and merged_train
# defined earlier in the file (not visible here).
pokerhand_test=data_test['CLASS']
# put into a pandas dataFrame
pokerhand_train=pd.DataFrame(pokerhand_train)
pokerhand_test=pd.DataFrame(pokerhand_test)
pokerhand_train.reset_index(level=0, inplace=True) # reset index
merged_train_all=pd.merge(pokerhand_train, merged_train, on='index') # merge the pokerhand train with merged clusters
sub1 = merged_train_all[['CLASS', 'cluster']].dropna()

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 

# response formula: does mean CLASS differ between clusters? (OLS / ANOVA)
pokermod = smf.ols(formula='CLASS ~ cluster', data=sub1).fit()
print (pokermod.summary())

print ('means for Poker hands by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)

print ('standard deviations for Poker hands by cluster')
m2= sub1.groupby('cluster').std()
print (m2)

# Tukey HSD post-hoc test: which cluster pairs differ?
mc1 = multi.MultiComparison(sub1['CLASS'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())




Exemple #16
0
# validate clusters in training data by examining cluster differences in GPA using ANOVA
# first have to merge GPA with clustering variables and cluster assignment data
# NOTE(review): despite the "GPA" naming, the variable analysed here is
# 'internetuserate'. Script section; depends on data_clean, merged_train,
# train_test_split, aov, df and stats defined elsewhere in the file.
gpa_data = data_clean['internetuserate']
# split GPA data into train and test sets
gpa_train, gpa_test = train_test_split(gpa_data,
                                       test_size=.3,
                                       random_state=123)
gpa_train1 = pd.DataFrame(gpa_train)
gpa_train1.reset_index(level=0, inplace=True)
# attach the cluster assignment to each training observation
merged_train_all = pd.merge(gpa_train1, merged_train, on='index')
sub1 = merged_train_all[['internetuserate', 'cluster']].dropna()

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi

# OLS / ANOVA: does the mean internet-use rate differ between clusters?
gpamod = smf.ols(formula='internetuserate ~ (cluster)', data=sub1).fit()
print(gpamod.summary())

print('means for internetuserate by cluster')
m1 = sub1.groupby('cluster').mean()
print(m1)

print('standard deviations for internetuserate by cluster')
m2 = sub1.groupby('cluster').std()
print(m2)

# Tukey HSD post-hoc test on the cluster pairs
mc1 = multi.MultiComparison(sub1['internetuserate'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
# NOTE(review): 'aov' is defined earlier in the file (presumably an
# anova_lm table with added effect-size columns) -- reorder for display.
cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
aov = aov[cols]

print(aov)

print("                     ")
print(" POST-HOST TESTING ")  # sic: runtime string kept verbatim ("POST-HOC")
# POST-HOC TESTING
print("          ")
# TUKEY HONESTLY SIGNIFFICANCE DIFFERENCE
print("TUKEY HONESTLY SIGNIFFICANCE DIFFERENCE")
#from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
#                                         MultiComparison)
import statsmodels.stats.multicomp as mc

# Tukey HSD on Scores grouped by Method, plus a simultaneous CI plot
comp = mc.MultiComparison(df['Scores'], df['Method'])
post_hoc_res = comp.tukeyhsd()
post_hoc_res.summary()
print(post_hoc_res)
post_hoc_res.plot_simultaneous(ylabel="Method", xlabel="Score Differences")

#BONFERRONI CORRECTION
print("                          ")
print("BONFERRONI CORRECTION")
print("                             ")
import statsmodels.stats.multicomp as mc

# pairwise t-tests with Bonferroni-corrected p-values
comp = mc.MultiComparison(df['Scores'], df['Method'])
tbl, a1, a2 = comp.allpairtest(stats.ttest_ind, method="bonf")

print(tbl)