def analyzeData(results2):

    print('Accuracy')
    print(
        AnovaRM(data=results2,
                depvar='Accuracy',
                subject='Subject',
                within=['Condition'],
                aggregate_func='mean').fit())

    MultiComp = MultiComparison(results2['Accuracy'], results2['Condition'])
    comp = MultiComp.allpairtest(sci.ttest_rel, method='bonf')
    print(comp[0])

    print('Reaction Time')
    print(
        AnovaRM(data=results2,
                depvar='Reaction Time',
                subject='Subject',
                within=['Condition'],
                aggregate_func='mean').fit())

    MultiComp = MultiComparison(results2['Reaction Time'],
                                results2['Condition'])
    comp = MultiComp.allpairtest(sci.ttest_rel, method='bonf')
    print(comp[0])
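
# A hypothetical usage sketch for analyzeData. The imports are assumptions
# about what the original module pulled in (the snippet itself omits them),
# and the data values are purely illustrative:
import pandas as pd
import scipy.stats as sci
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import MultiComparison

results2 = pd.DataFrame({
    'Subject':       [1, 1, 2, 2, 3, 3],
    'Condition':     ['A', 'B'] * 3,
    'Accuracy':      [0.90, 0.80, 0.85, 0.75, 0.95, 0.70],
    'Reaction Time': [350.0, 420.0, 330.0, 400.0, 360.0, 450.0],
})
analyzeData(results2)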
Example #2
def get_multiplecomparisons(self, dataframe, test):
    # If distributions are different then do multiple comparisons
    dataframe = dataframe.dropna()
    print(dataframe)
    cleanbin = dataframe.melt(var_name='Bin', value_name='Value')
    MultiComp = MultiComparison(cleanbin['Value'], cleanbin['Bin'])
    if test == 'ttest':
        comp = MultiComp.allpairtest(scipy.stats.ttest_rel, method='Bonf')
    else:
        comp = MultiComp.allpairtest(scipy.stats.wilcoxon, method='Bonf')
    print(comp[0])
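
# A minimal standalone sketch of the melt + pairwise-comparison pattern used
# above (synthetic wide frame, one column per time bin; all names and values
# here are illustrative):
import pandas as pd
import scipy.stats
from statsmodels.stats.multicomp import MultiComparison

wide = pd.DataFrame({'Bin0': [1.0, 1.2, 0.9, 1.1],
                     'Bin1': [1.5, 1.7, 1.4, 1.6]})
cleanbin = wide.melt(var_name='Bin', value_name='Value')
comp = MultiComparison(cleanbin['Value'], cleanbin['Bin']).allpairtest(
    scipy.stats.ttest_rel, method='Bonf')
print(comp[0])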
def calculate_test(self):
    """Apply the Holm-Bonferroni test to the dataframe. Holm-Bonferroni is a multiple-comparison correction method.
    Discover more at https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method .
    Be sure you are working with normally distributed data."""

    MultiComp = MultiComparison(self.data.values, self.data.index)
    holm = MultiComp.allpairtest(stats.ttest_rel, method='Holm')
    print("\nHolm-Bonferroni test for rows\n" + str(holm) + "\n")
    self.results.write("\nHolm-Bonferroni test for rows\n" + str(holm) + "\n")

    MultiComp = MultiComparison(self.data.T.values, self.data.columns)
    holm2 = MultiComp.allpairtest(stats.ttest_rel, method='Holm')
    print("\nHolm-Bonferroni test for columns\n" + str(holm2) + "\n")
    self.results.write("\nHolm-Bonferroni test for columns\n" + str(holm2) + "\n")

    return (holm, holm2)
def get_significance_booleans(data):
    '''
    Perform multiple comparisons (pairwise t-tests).

    Parameters
    ----------
    data : Series
        Must have a single-level index containing the group labels (id);
        the values are the results to be compared.

    Returns
    -------
    booleans : Series
        Boolean values indicating significance between the groups.
    '''

    id = data.index.values
    value = data.values

    # multiple comparison
    multiple_comparisons = MultiComparison(value, id)  # instantiate multiple comparisons object
    pairwise_holm = multiple_comparisons.allpairtest(ttest_ind, method='holm')  # perform pairwise t-tests
    significance_matrix = DataFrame(pairwise_holm[2])  # store results in dataframe
    groups_as_index = significance_matrix.set_index(['group1', 'group2'])
    significance_booleans = groups_as_index['reject']

    return significance_booleans
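
# A hypothetical usage sketch (the function body assumes DataFrame from
# pandas, ttest_ind from scipy.stats, and MultiComparison from
# statsmodels.stats.multicomp are already imported; values illustrative):
from pandas import DataFrame, Series
from scipy.stats import ttest_ind
from statsmodels.stats.multicomp import MultiComparison

scores = Series([1.0, 1.2, 0.9, 3.1, 3.3, 2.9],
                index=['a', 'a', 'a', 'b', 'b', 'b'])
print(get_significance_booleans(scores))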
Example #5
def pairwise_ttest(val_vec, cnf):
    df = pd.DataFrame()
    cluster = []
    score = []
    for subc, dic_conf in val_vec.items():
        cluster += [str(subc) for idx in range(len(dic_conf[cnf]))]
        score.extend(dic_conf[cnf])
    df['subcluster'] = cluster
    df['score'] = score
    #    all_comb = list(combinations(df.subcluster, 2))
    #    p_vals = []
    #    for comb in all_comb:
    #        g1 = df[(df.subcluster == comb[0])]['score']
    #        g2 = df[(df.subcluster == comb[1])]['score']
    #        stat, pval = ttest_ind(g1, g2, equal_var=False)
    #        p_vals.append(pval)
    #    reject_list, corrected_p_vals = multipletests(p_vals, method='bonferroni')[:2]
    #    for comb, pv, cpv, r in zip(all_comb, p_vals, corrected_p_vals, reject_list):
    #        print("Comparison: {0} -- p={1}, corr_p={2}, rej={3}".format(
    #              comb, pv, cpv, r))
    MultiComp = MultiComparison(df['score'], df['subcluster'])
    comp = MultiComp.allpairtest(ttest_ind, method='bonf')
    print(comp[0])
    pd.options.display.float_format = '{:.3f}'.format
    print(df.groupby(['subcluster']).describe())
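
# A hypothetical usage sketch showing the expected shape of val_vec: a dict
# mapping each subcluster label to a dict of metric-name -> list of scores.
# All names and values are illustrative; the imports mirror what the function
# body assumes:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multicomp import MultiComparison

val_vec = {
    'c0': {'silhouette': [0.61, 0.58, 0.64, 0.60]},
    'c1': {'silhouette': [0.42, 0.45, 0.40, 0.44]},
}
pairwise_ttest(val_vec, 'silhouette')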
Example #6
def Holm_Bonferroni(multiComp: MultiComparison) -> float:
    """ Instead of Tukey's test, we can do pairwise t-tests

    Parameters
    ----------
    multiComp : an instance of 'MultiComparison'

    Returns
    -------
    checkVal : the test parameter used for checking correct execution

    """
    
    # First, with the "Holm" correction
    rtp = multiComp.allpairtest(stats.ttest_rel, method='Holm')
    print((rtp[0]))
    
    # and then with the Bonferroni correction
    print((multiComp.allpairtest(stats.ttest_rel, method='b')[0]))
    
    # Any value, for testing the program for correct execution
    checkVal = rtp[1][0][0,0]
    return checkVal
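
# A hypothetical usage sketch (values illustrative; stats.ttest_rel inside
# the function requires equal-sized groups):
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import MultiComparison

demo_values = np.array([2, 3, 2, 4, 4, 5, 1, 2, 1, 3, 3, 2])
demo_groups = np.repeat(['a', 'b', 'c'], 4)
print(Holm_Bonferroni(MultiComparison(demo_values, demo_groups)))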
    def get_anova_multiplecomp(self, accuracy_dataframe):
        # ANOVA followed by Holm-corrected pairwise t-tests on the groups
        f, p = scipy.stats.f_oneway(accuracy_dataframe['Task1'], accuracy_dataframe['Task2'],
                                    accuracy_dataframe['Task2b'])
        print('Anova %0.5f' % p)

        # If distributions are different then do multiple comparisons
        if p < 0.05:
            df_melt = accuracy_dataframe.melt(var_name='Task', value_name='Error')
            df_melt = df_melt[df_melt.Task != 'Task3']
            # print(df_melt)
            MultiComp = MultiComparison(df_melt['Error'],
                                        df_melt['Task'])
            comp = MultiComp.allpairtest(scipy.stats.ttest_rel, method='Holm')
            print(comp[0])
    def get_anova_multiplecomp_bytimebin(self, bin_df):
        # ANOVA followed by Holm-corrected pairwise comparisons on the groups
        # bin_df = bin_df.drop()
        f, p = scipy.stats.f_oneway(bin_df[bin_df.Bin == 'Bin0']['Value'], bin_df[bin_df.Bin == 'Bin1']['Value'],
                                    bin_df[bin_df.Bin == 'Bin2']['Value'], bin_df[bin_df.Bin == 'Bin3']['Value'])
        print('Anova %0.5f' % p)

        # Remove uneven bins before comparison
        clean_bin_df = bin_df[bin_df['Bin'].isin(['Bin0', 'Bin1', 'Bin2', 'Bin3'])]
        # print(clean_bin_df)
        # If distributions are different then do multiple comparisons
        if p < 0.05:
            # print(df_melt)
            MultiComp = MultiComparison(clean_bin_df['Value'],
                                        clean_bin_df['Bin'])
            comp = MultiComp.allpairtest(scipy.stats.kruskal, method='Holm')
            print(comp[0])
Example #9
def kruskal(*args):
    localargs = locals()['args']
    print(localargs)
    statsk = stats.kruskal(*args)
    print(statsk)
    df = pd.DataFrame(localargs)
    print(df)
    stacked_data = df.stack().reset_index()
    print(stacked_data)
    stacked_data = stacked_data.rename(columns={
        'level_0': 'genotype',
        0: 'result'
    })
    print(stacked_data)
    MultiComp = MultiComparison(stacked_data['result'],
                                stacked_data['genotype'])
    print(MultiComp.allpairtest(stats.mannwhitneyu, method='Holm'))
def hypothesis_test_four(cleaned_data):
    """
    This function takes in cleaned data, then uses create sample dists to grab
    the required categories. From there the function performs fishers lSD
    analysis and displays a chart of all the pairwise compairisons and the 
    p-values
    :param alpha: the critical value of choice
    :param cleaned_data: our cleaned dataset
    :return:
    """
    # Get data for tests
    categories = ['NISS', 'FORD', 'HOND', 'TOY']
    comparison_groups = create_sample_dists(cleaned_data,
                                            y_var='ticket',
                                            x_var='make',
                                            categories=categories,
                                            seed=4)
    list_for_lsd = []
    for i in range(len(categories)):
        cat_list = [categories[i]] * 50
        tk_lsd = zip(list(comparison_groups[i]), cat_list)
        list_for_lsd += list(tk_lsd)

    df_lsd = pd.DataFrame(list_for_lsd)

    # perform fisher LSD for the groups
    mult_comp = MultiComparison(df_lsd[0], df_lsd[1])
    result = mult_comp.allpairtest(stats.ttest_ind, method='Holm')
    # we need to convert the simpletable result object into a dataframe
    result_summary = result[0].as_html()
    lsd_df = pd.read_html(result_summary, header=0, index_col=0)[0]
    lsd_df = lsd_df.drop(columns=['stat', 'pval_corr'])
    lsd_df.reject = lsd_df.pval.apply(compare_pval_alpha_tf)
    lsd_df.columns = ["Second Make", "P Value", "Significant Difference?"]
    lsd_df.index.names = ['First Make']

    return lsd_df
# ## Tukey's multi-comparison method
#
# See https://en.wikipedia.org/wiki/Tukey's_range_test
#
# This method tests at P<0.05 (correcting for the fact that multiple comparisons are being made, which would normally increase the probability of a significant difference being identified). A result of 'reject = True' means that a significant difference has been observed.

# In[26]:

from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)

# Set up the data for comparison (creates a specialised object)
MultiComp = MultiComparison(stacked_data['result'], stacked_data['treatment'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp.tukeyhsd().summary())
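
# A minimal runnable sketch of the same call on synthetic data (everything
# named 'demo' is illustrative and not part of this notebook's data, which
# builds 'stacked_data' in earlier cells):
import numpy as np
import pandas as pd

demo_rng = np.random.default_rng(0)
demo = pd.DataFrame({'treatment': np.repeat(['a', 'b', 'c'], 10),
                     'result': demo_rng.normal(size=30)})
print(MultiComparison(demo['result'], demo['treatment']).tukeyhsd().summary())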

# ## Holm-Bonferroni Method
#
# See: https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method
#
# The Holm-Bonferroni method is an alternative method.

# In[27]:

comp = MultiComp.allpairtest(stats.ttest_rel, method='Holm')
print(comp[0])
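
# The same pairwise t-test run on the synthetic 'demo' frame from the sketch
# above (stats is assumed imported, as in the cell before; ttest_rel pairs
# equal-sized groups):
demo_mc = MultiComparison(demo['result'], demo['treatment'])
print(demo_mc.allpairtest(stats.ttest_rel, method='Holm')[0])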
Example #12
plt.xlim(*xlim)
pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
plt.xticks([0,1,2], pair_labels)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
          '\n Pairwise Mean Differences')          

# Save to outfile
outFile = 'MultComp.png'
plt.savefig(outFile, dpi=200)
print('Figure written to {0}'.format(outFile))

plt.show()

# Instead of Tukey's test, we can do pairwise t-tests
# First, with the "Holm" correction
rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
print(rtp[0])

# and then with the Bonferroni correction
print(mod.allpairtest(stats.ttest_rel, method='b')[0])

# Done this way, the variance is calculated at each comparison.
# If you want the joint variance across all samples, you have to
# use a few tricks (http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
studentized_mean = res2.meandiffs
studentized_variance = res2.variance

t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
dof = len(dta2) - len(mod.groupsunique)
my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided
Example #13
def position_stats(df, name_mapping=None):

    # print '### position stats'
    from statsmodels.stats.weightstats import ztest
    from functools import partial, wraps  # functools32 was the Python 2 backport
    POS = df.position.unique()
    POS.sort()
    model = 'value ~ group'
    allpvals = None
    header = None
    DF = None

    ttest_log_wrap = wraps(
        partial(ttest_ind_log, equal_var=False))(ttest_ind_log)
    ttest_ind_nev = wraps(
        partial(stats.ttest_ind, equal_var=False))(stats.ttest_ind)
    mwu_test = wraps(partial(stats.mannwhitneyu, use_continuity=False))(
        stats.mannwhitneyu)

    bootstrap_sample_num = 1000
    # print df

    stats_test = ttest_ind_nev
    GROUPS = df.group.unique()
    # GROUPS = [0,3]

    for pos in POS:
        # print pos
        data = df[df.position == pos]
        data = data.groupby(['sid']).mean()
        data = resample_data(data, num_sample_per_pos=BOOTSTRAP_NUM)
        # print data
        # print data.group.unique()
        # data = df[(df.group == 0) | (df.group == 3)]
        # print data
        # sys.exit()

        #cross = smf.ols(model, data=data).fit()
        #anova = sm.stats.anova_lm(cross, type=1)
        # print data.group

        mcp = MultiComparison(data.value, data.group.astype(int))

        rtp = mcp.allpairtest(stats_test, method='bonf')
        mheader = []
        for itest in rtp[2]:
            name1 = itest[0]
            name2 = itest[1]
            if name_mapping is not None:
                name1 = name_mapping[str(name1)]
                name2 = name_mapping[str(name2)]

            mheader.append("{} - {}".format(name1, name2))

        if not header or len(mheader) > len(header):
            header = mheader

        # get the uncorrected p-values
        pvals = rtp[1][0][:, 1]

        ndf = pd.DataFrame(data=[pvals], columns=mheader)
        if allpvals is None:
            allpvals = ndf
        else:
            allpvals = pd.concat([allpvals, ndf])

    # return allpvals
    # corr_pvals = allpvals
    # print allpvals
    # return allpvals

    flatten = allpvals.values.ravel()
    flatten = flatten * 2
    mcpres = multipletests(flatten, alpha=0.05, method='bonf')
    # print mcpres
    corr_pvals = np.array(mcpres[1])
    # print corr_pvals
    corr_pvals = np.reshape(corr_pvals, (len(POS), -1))

    # print corr_pvals,corr_pvals.shape,header
    data = pd.DataFrame(data=corr_pvals, columns=header)
    data = data[data.columns[:3]]
    return data
Example #14
#Check for heteroskedasticity
sm.qqplot(anova_reg.resid, line='s')
plt.show()

######
#Post Hoc Tests for One-way ANOVA

#Tukey test - good when groups are the same size and have homogeneous variance
postHoc = pairwise_tukeyhsd(alldata['Fare_Per_Person'], alldata['Embarked'], alpha=0.05)
print(postHoc)

#Pairwise comparison using Bonferroni correction of p-values
mc = MultiComparison(alldata['Fare_Per_Person'], alldata['Embarked'])
#print(mc.allpairtest(stats.ttest_rel, method='Holm')[0])  #For paired t-test
print(mc.allpairtest(stats.ttest_ind, method='b')[0])     #For independent t-test

######
#ANCOVA

#Look for heteroskedasticity
plt.plot(alldata[(alldata['Pclass']==2) & (alldata['Sex_male']==1)]['Fare_Per_Person'], alldata[(alldata['Pclass']==2) &(alldata['Sex_male']==1)]['Group_Size'], 'bo')
plt.show()
#Second class male passengers with a fare price > 0 seem OK
#There are a couple group sizes with only 1 observation with these criteria though, so make sure to filter them out too

#Test for heteroskedasticity (build the filtered subset once and reuse it)
sub = alldata[(alldata['Pclass']==2) & (alldata['Sex_male']==1) & (alldata['Fare']>0) & (alldata['Group_Size'].isin([1,2,3,4,8,9,10,11]))]
print(levenes_test(sub['Fare_Per_Person'], sub['Group_Size']))
print(bartlett_test(sub['Fare_Per_Person'], sub['Group_Size']))
Example #15
def main():
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    
    # Set up the data, as a structured array.
    # The first and last field are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
    (  1,   'mental',  2 ),
    (  2,   'mental',  2 ),
    (  3,   'mental',  3 ),
    (  4,   'mental',  4 ),
    (  5,   'mental',  4 ),
    (  6,   'mental',  5 ),
    (  7,   'mental',  3 ),
    (  8,   'mental',  4 ),
    (  9,   'mental',  4 ),
    ( 10,   'mental',  4 ),
    ( 11, 'physical',  4 ),
    ( 12, 'physical',  4 ),
    ( 13, 'physical',  3 ),
    ( 14, 'physical',  5 ),
    ( 15, 'physical',  4 ),
    ( 16, 'physical',  1 ),
    ( 17, 'physical',  1 ),
    ( 18, 'physical',  2 ),
    ( 19, 'physical',  3 ),
    ( 20, 'physical',  3 ),
    ( 21,  'medical',  1 ),
    ( 22,  'medical',  2 ),
    ( 23,  'medical',  2 ),
    ( 24,  'medical',  2 ),
    ( 25,  'medical',  3 ),
    ( 26,  'medical',  2 ),
    ( 27,  'medical',  3 ),
    ( 28,  'medical',  1 ),
    ( 29,  'medical',  3 ),
    ( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                    ('Treatment', '|S8'),
                                    ('StressReduction', '<i4')])
    
    # First, do a one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)',df).fit()
    
    anovaResults =  anova_lm(model)
    print(anovaResults)
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')
    
    #Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print(mod.tukeyhsd().summary())
    
    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    #print res2[0]
    
    # Show the group names
    print(mod.groupsunique)
    
    # Generate a plot
    import matplotlib.pyplot as plt
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    #plt.errorbar(xvals, res2.meandiffs, yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
    errors = np.ravel(np.diff(res2.confint)/2)
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')          
    
    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig(outFile, dpi=200)
    print('Figure written to {0}'.format(outFile))
    
    plt.show()
    
    # Instead of Tukey's test, we can do pairwise t-tests
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print(rtp[0])
    
    # and then with the Bonferroni correction
    print(mod.allpairtest(stats.ttest_rel, method='b')[0])
    
    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to
    # use a few tricks (http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance
    
    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided
    
    # Now with the Bonferroni correction
    from statsmodels.stats.multitest import multipletests
    res_b = multipletests(my_pvalues, method='b')
    
    return res2.variance
Example #16
w, p_bf = stats.levene(edg['WPM'], graf['WPM'], uni['WPM'], center='median')
check_p('brown forsythe test',
        assumption='homogeneity of variance',
        p_val=p_bf)
# non-significance shows we don't have a violation

# now that we know our assumptions have not been violated, we can fit the ANOVA. This is the omnibus test
alpha_lm = ols('WPM ~ C(Alphabet)', data=alpha).fit()
logger.info(f'ANOVA summary: \n\n {alpha_lm.summary()}')
# Prob (F-statistic) shows that there is some difference between the different Alphabets but does not tell us where the
# difference is. For that we do the pairwise comparisons

# tukey comparison followed by holm adjustment (not sure how to combine the two)
mc = MultiComparison(alpha['WPM'], alpha['Alphabet'])
logger.info(f'tukey comparison2: \n {mc.tukeyhsd()}')
comp = mc.allpairtest(stats.ttest_ind, method='Holm')
logger.info(f'holm corrected version: \n {comp[0]}')

# non parametric version of one-way ANOVA
chi, p = stats.kruskal(edg['WPM'], graf['WPM'], uni['WPM'])
check_p(descr='Kruskal chi squared test', assumption='', p_val=p)

# mann whitney
mw, p_eg = stats.mannwhitneyu(edg['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat edg vs. graf: {mw}, p value: {p_eg}')
mw, p_ug = stats.mannwhitneyu(uni['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat uni vs. graf: {mw}, p value: {p_ug}')
mw, p_ue = stats.mannwhitneyu(uni['WPM'], edg['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat uni vs. edg: {mw}, p value: {p_ue}')
rej, p_vals, _, _ = multitest.multipletests([p_eg, p_ug, p_ue], method='holm')
for num, pv in enumerate(p_vals):
    # (loop body truncated in the source; presumably each Holm-corrected
    # p-value is logged)
    logger.info(f'holm corrected p value {num}: {pv}')
def KMWU(pathname='',
         pulsedurs=[5],
         genders=["_male", "_female", "_matedFemale"],
         neuronparts=["medial", "lateral"],
         identifiers=[".mat", "10ms", "40Hz"],
         key="pulsedff",
         compareOn="genders",
         multicompmethod='holm'):
    '''performs a Kruskal-Wallis test followed by multiple comparisons with Mann-Whitney-U tests'''

    currentdir = os.getcwd()
    if pathname:
        if pathname[0] == '/':
            fullpath = pathname
        else:
            fullpath = os.path.join(currentdir, pathname)
    else:
        fullpath = currentdir

    dirlist = os.listdir(fullpath)
    ps = []
    for pulsedur in pulsedurs:
        if pulsedur < 1:

            pulsedurstring = str(int(1000 * pulsedur)) + 'ms'
        else:
            pulsedurstring = str(int(pulsedur)) + 's'

        filelists = []

        for n in neuronparts:
            for g in genders:
                gfiles = [
                    filename for filename in dirlist
                    if g in filename and pulsedurstring in filename and all(
                        [identifier in filename for identifier in identifiers])
                ]
                nfiles = [filename for filename in gfiles if n in filename]
                if nfiles:
                    filelists.append(nfiles)

        pulsedffs = []

        for filelist in filelists:

            fullfile = os.path.join(fullpath, filelist[0])
            data = scipy.io.loadmat(fullfile, matlab_compatible=True)
            pulsedff = [dat[0] for dat in data[key]]
            pulsedffs.append(pulsedff)

        if compareOn == "genders":
            for npart in range(len(neuronparts)):
                neuronpart = neuronparts[npart]
                print(neuronpart)
                groupnum = npart * len(genders)
                numgenders = len(genders)
                data = tuple(pulsedffs[groupnum:(groupnum + numgenders)])
                df = pd.DataFrame(pulsedffs[groupnum:(groupnum + numgenders)])
                df = df.rename(index={0: "female", 1: 'matedFemale', 2: 'male'})
                statsk = scipy.stats.kruskal(*data)
                print(statsk)

                stacked_data = df.stack().reset_index()

                stacked_data = stacked_data.rename(columns={
                    'level_0': 'genotype',
                    0: 'result'
                })

                MultiComp = MultiComparison(stacked_data['result'],
                                            stacked_data['genotype'])
                print(
                    MultiComp.allpairtest(scipy.stats.ranksums, method='Holm'))

        elif compareOn == "neuronparts":
            for gend in range(len(genders)):

                ind = [(numnpart * len(genders) + gend)
                       for numnpart in range(len(neuronparts))]
                data = tuple([pulsedffs[index] for index in ind])
                df = pd.DataFrame([pulsedffs[index] for index in ind])
                df = df.rename(index={0: "medial", 1: 'lateral'})
                statsk = scipy.stats.kruskal(*data)
                print(statsk)

                stacked_data = df.stack().reset_index()

                stacked_data = stacked_data.rename(columns={
                    'level_0': 'neuronpart',
                    0: 'result'
                })

                MultiComp = MultiComparison(stacked_data['result'],
                                            stacked_data['neuronpart'])
                print(
                    MultiComp.allpairtest(scipy.stats.ranksums, method='Holm'))

        else:
            print(
                "not a valid selection for compareOn - must be \"genders\" or \"neuronparts\""
            )
Example #18
    parser.add_argument('--output',
                        required=True,
                        default='MultiComparison.csv',
                        help='out file name.')

    args = parser.parse_args()
    method = args.method
    in_path = args.input
    # output = args.output
    col1 = args.col1
    col2 = args.col2

    df = pd.read_csv(in_path)
    df[col2] = df[col2].astype("float64")
    # print(df.head(3))

    multiComp = MultiComparison(df[col2], df[col1])

    if method == 'Tukey':
        result = multiComp.tukeyhsd().summary()
        print(result)
        resultdf = pd.DataFrame(result)
        resultdf.to_csv(args.output, header=None, index=False)

    else:
        result = multiComp.allpairtest(stats.ttest_rel, method=method)[-1]
        print(result)
        resultdf = pd.DataFrame(result)
        resultdf.to_csv(args.output, index=False)

#python MultiComparison.py --method "Tukey" --input ../../data/sample.csv --col1 Treatment --col2 "StressReduction"  --output ../../out/MultiComparison.csv