Example #1
    def test_incorrect_output(self):
        # too few groups
        assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 4)
        # too many groups
        assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 6)
        # just one group
        assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1] * 10)

        # group_order doesn't select all observations, only one group left
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                     [1, 2] * 5, group_order=[1])

        # group_order doesn't select all observations,
        # we do tukey_hsd with reduced set of observations
        data = np.arange(15)
        groups = np.repeat([1, 2, 3], 5)
        mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
        res1 = mod1.tukeyhsd(alpha=0.01)
        mod2 = MultiComparison(np.array(data[:10]), groups[:10])
        res2 = mod2.tukeyhsd(alpha=0.01)

        attributes = ['confint', 'data', 'df_total', 'groups', 'groupsunique',
                     'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs',
                     'variance']
        for att in attributes:
            err_msg = att + ' failed'
            assert_allclose(getattr(res1, att), getattr(res2, att), rtol=1e-14,
                            err_msg=err_msg)

        attributes = ['data', 'datali', 'groupintlab', 'groups', 'groupsunique',
                      'ngroups', 'nobs', 'pairindices']
        for att in attributes:
            err_msg = att + ' failed'
            assert_allclose(getattr(mod1, att), getattr(mod2, att), rtol=1e-14,
                            err_msg=err_msg)
Example #2
 def test_table_names_custom_group_order(self):
     # if the group_order parameter is used, the groups should
     # be reported in the specified order
     mc = MultiComparison(self.endog, self.groups,
                          group_order=[b'physical', b'medical', b'mental'])
     res = mc.tukeyhsd(alpha=self.alpha)
     #print(res)
     t = res._results_table
     expected_order = [(b'physical', b'medical'),
                       (b'physical', b'mental'),
                       (b'medical', b'mental')]
     for i in range(1, 4):
         first_group = t[i][0].data
         second_group = t[i][1].data
         assert_((first_group, second_group) == expected_order[i - 1])
Example #3
class CheckTuckeyHSD(object):

    @classmethod
    def setup_class_(self):
        self.mc = MultiComparison(self.endog, self.groups)
        self.res = self.mc.tukeyhsd(alpha=self.alpha)

    def test_multicomptukey(self):
        meandiff1 = self.res[1][2]
        assert_almost_equal(meandiff1, self.meandiff2, decimal=14)

        confint1 = self.res[1][4]
        assert_almost_equal(confint1, self.confint2, decimal=2)

        reject1 = self.res[1][1]
        assert_equal(reject1, self.reject2)

    def test_group_tukey(self):
        res_t = get_thsd(self.mc, alpha=self.alpha)
        assert_almost_equal(res_t[4], self.confint2, decimal=2)

    def test_shortcut_function(self):
        #check wrapper function
        res = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha)
        assert_almost_equal(res[1][4], self.res[1][4], decimal=14)
Example #4
	print('One-way ANOVA')

	print('F value:', f)
	if p <= 0.05:
		print('P value: {} <= 0.05'.format(p))
		print("=> Reject H0\n")
	else:
		print('P value: {} > 0.05'.format(p))
		print("=> Fail to reject H0\n")

	'''
	Perform Tukey HSD test
	'''

	mc = MultiComparison(data['Waiting'], data['Policy'])
	result = mc.tukeyhsd(alpha=0.05)
	with open('results/svr_rbf_'+sensor_name+'_waiting_plot_diff_means.txt', 'w') as fh:
		fh.write(str(result.summary()))
	 
	result.plot_simultaneous(comparison_name='policyOST')
	plt.savefig('results/svr_rbf_'+sensor_name+'_waiting_plot_diff_means'+'.png')
print("================================================")

print("\n\n===For Dataset 2 using Linear Regression===")
for sensor_name in d2_data:
	print("Sensor name", sensor_name)
	data = np.rec.array(d2_data[sensor_name], dtype = [('Policy','|U10'),('Waiting', '<i8')])
	f, p = stats.f_oneway(data[data['Policy'] == 'policyA'].Waiting,
	                      data[data['Policy'] == 'policyC'].Waiting,
	                      data[data['Policy'] == 'policyM'].Waiting,
Example #5
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison

tratamento = pd.read_csv('dados/anova.csv', sep=';')

tratamento.boxplot(by='Remedio', grid=False)

modelo1 = ols('Horas ~ Remedio', data=tratamento).fit()
resultados1 = sm.stats.anova_lm(modelo1)

modelo2 = ols('Horas ~ Remedio * Sexo', data=tratamento).fit()
resultados2 = sm.stats.anova_lm(modelo2)

mc = MultiComparison(tratamento['Horas'], tratamento['Remedio'])
resultado_teste = mc.tukeyhsd()
print(resultado_teste)
resultado_teste.plot_simultaneous()
Example #6
_, p = f_oneway(grupo_a, grupo_b, grupo_c)

print(p)

alpha = 0.05

if p <= alpha:
    print("null hypothesis rejected")
else:
    print("failed to reject the null hypothesis")


# Tukey test

dados = {"valores": [165, 152, 143, 140, 155, 130, 169, 164, 143, 154, 163, 158, 154, 149, 156],
         "grupo": ['A','A','A','A','A','B','B','B','B','B','C','C','C','C','C']}

import pandas as pd

dados_pd = pd.DataFrame(dados)

print(dados_pd)

from statsmodels.stats.multicomp import MultiComparison

compara_grupos = MultiComparison(dados_pd['valores'], dados_pd['grupo'])

teste = compara_grupos.tukeyhsd()

print(teste)
Example #7
      round(outliers, 4))

# New data frame that omits this range; look at the newly computed means:

# In[11]:

df_filtered = df[(df['Pvs_per_session'] < outliers)]
df_filtered.groupby(['Variant']).mean()

# ## Significance testing

# We can now perform our parametric statistical test. The code below will execute an analysis of variance (ANOVA) style test; although ANOVA can handle multiple independent variables/groups, the individual test performed between each pair of groups is a t-test, which is the most widely recognised assessment for significance testing under a frequentist framework:

# In[12]:

mc = MultiComparison(df_filtered['Pvs_per_session'], df_filtered['Variant'])
mc_results = mc.tukeyhsd()
null_hypothesis = mc_results.reject

print(mc_results)
print(
    "Reject null hypothesis and significant difference between experimental groups:",
    null_hypothesis,
)

# As seen above, the results from this experiment are statistically non-significant: we have failed to reject our null hypothesis (that no difference exists between independent groups). This means you can confidently tell your stakeholders that they needn't worry about the impact on Pvs_per_session of introducing the new widget. It may have increased clicks, but it failed to move the needle for our consumption metric. Below, we will look at some useful and user-friendly visualisations that you can present to stakeholders.

# ## Visualisations

# Prior to plotting data, we are going to get it in a format that makes visualisations easy to interpret. This process involves taking samples (1500) from our Pvs_per_session data for each of our experimental conditions and plotting the distribution of all of the samples. As governed by the central limit theorem, this means the data will follow a normal/bell-shaped distribution with a sample mean that reflects the overall mean of the data. This is useful for visualisations as there will be no heavy skews/tails in the data whilst still maintaining the mean values for the overall mean of each experimental condition:
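# A minimal sketch of that sampling step, reusing df_filtered from above; the
# resampling-of-means loop is an illustrative reconstruction, not the original
# notebook's code:
import pandas as pd

sample_means = pd.DataFrame({
    variant: [grp['Pvs_per_session'].sample(frac=1, replace=True).mean()
              for _ in range(1500)]
    for variant, grp in df_filtered.groupby('Variant')
})
sample_means.plot.kde()  # one bell-shaped curve per experimental condition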
Example #8
    def GroupTukeyHSD(self, continuous, categorical):

        mc = MultiComparison(continuous, categorical)
        result = mc.tukeyhsd()
        reject = result.reject
        UniqueGroup = mc.groupsunique
        group1 = [UniqueGroup[index] for index in mc.pairindices[0]]
        group2 = [UniqueGroup[index] for index in mc.pairindices[1]]
        meandiffs = [
            round(float(meandiff), 3) for meandiff in result.meandiffs
        ]
        columns = ['Group 1', "Group 2", "Mean Difference", "Reject"]
        TukeyResult = pd.DataFrame(np.column_stack(
            (group1, group2, meandiffs, reject)),
                                   columns=columns)
        '''
        Once the Tukey HSD test is done, select only the entries with
        Reject=False; these are the pairs whose distributions are similar.
        Then group them into distinct distributions.
        '''
        # np.column_stack above coerced every column to strings, hence 'False'
        TukeyResult_false = TukeyResult[TukeyResult['Reject'] == 'False']
        overall_distribution_list = []
        same_distribution_list = []
        if len(TukeyResult_false) > 0:
            for group1 in TukeyResult_false['Group 1'].unique():
                if group1 not in overall_distribution_list:
                    temp_list = []
                    temp_result = TukeyResult_false[
                        TukeyResult_false['Group 1'] == group1]
                    overall_distribution_list.append(group1)
                    for entry in list(temp_result['Group 2'].unique()):
                        if entry not in overall_distribution_list:
                            overall_distribution_list.append(entry)
                            temp_list.append(entry)
                    temp_list.append(group1)
                    #         if temp_result['Group 2'].nunique()>1:
                    #             temp_list.extend((temp_result['Group 2'].unique()))
                    #         else:
                    #             temp_list.append((temp_result['Group 2'].unique()[0]))
                    same_distribution_list.append(
                        dict(list_name=group1.replace(" ", "_"),
                             lists=temp_list,
                             length=len(temp_list)))
            if len(set(categorical.unique()) -
                   set(overall_distribution_list)) > 0:
                missing_categories = list(
                    set(categorical.unique()) - set(overall_distribution_list))
                for group1 in missing_categories:
                    same_distribution_list.append(
                        dict(list_name=group1.replace(" ", "_"),
                             lists=[group1],
                             length=1))

        else:
            for group1 in categorical.unique():
                same_distribution_list.append(
                    dict(list_name=group1.replace(" ", "_"),
                         lists=[group1],
                         length=1))

        g1 = pd.DataFrame(same_distribution_list)
        return (g1.sort_values('length', ascending=False))
Example #9
Spyder Editor

This is a temporary script file.
"""

import numpy as np
import pandas as pd

datafile = "/Users/rachelrigg/Google Drive/Lab/Projects/Hsp70/Flow chamber/Stats/Flow chamber data VER PIF.csv"
data = pd.read_csv(datafile)

from scipy import stats

data.boxplot('y', by='trt', figsize=(8, 6))

import statsmodels.api as sm
from statsmodels.formula.api import ols

mod = ols('y ~ trt', data=data).fit()

aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

from statsmodels.stats.multicomp import MultiComparison

mc = MultiComparison(data['y'], data['trt'])
result = mc.tukeyhsd()

print(result)
print(mc.groupsunique)
Example #10
def multi_by_position(df, to_plot):
    df = df.dropna(subset=[to_plot, 'position_grouping'])
    mc = MultiComparison(df[to_plot], df['position_grouping'])
    mc_results = mc.tukeyhsd()
    return mc_results
Example #11
cortexsummary = rp.summary_cont(data_frame['cortex'].groupby(data_frame['Slide']))
print(cortexsummary)
cerebsummary = rp.summary_cont(data_frame['cerebellum'].groupby(data_frame['Slide']))
print(cerebsummary)

hipresults = ols('hippocampus ~ C(Slide)', data=data_frame).fit()
hip_table = sm.stats.anova_lm(hipresults, typ=2)

cerresults = ols('cerebellum ~ C(Slide)', data=data_frame).fit()
cer_table = sm.stats.anova_lm(cerresults, typ=2)

cortresults = ols('cortex ~ C(Slide)', data=data_frame).fit()
cort_table = sm.stats.anova_lm(cortresults, typ=2)

print('HIPPOCAMPUS')
hipmc = MultiComparison(data_frame['hippocampus'], data_frame['Slide'])
hipmc_results = hipmc.tukeyhsd()
print(hipmc_results)
print('')

print('CEREBELLUM')
cermc = MultiComparison(data_frame['cerebellum'], data_frame['Slide'])
cermc_results = cermc.tukeyhsd()
print(cermc_results)
print('')

print('CORTEX')
cortmc = MultiComparison(data_frame['cortex'], data_frame['Slide'])
cortmc_results = cortmc.tukeyhsd()
print(cortmc_results)
Example #12
# 4: Day 2, end

data = []
groups = []

for i, session in enumerate(D1):
    for trial in session[:10]:
        # print(trial, i, 1)
        data.append(trial)
        groups.append(1)
    for trial in session[-10:]:
        # print(trial, i, 2)
        data.append(trial)
        groups.append(2)

for i, session in enumerate(D2):
    for trial in session[:10]:
        # print(trial, i, 3)
        data.append(trial)
        groups.append(3)
    for trial in session[-10:]:
        # print(trial, i, 4)
        data.append(trial)
        groups.append(4)

data = np.array(data)
groups = np.array(groups)

from statsmodels.stats.multicomp import MultiComparison
print(MultiComparison(data, groups).tukeyhsd())
Example #13
    for_export.insert(for_export.shape[1], 'Tumour anatomy (full description)', obj.meta.structure_name)
    for_export.to_excel(os.path.join(outdir, "ivygap_signature_scores_and_subgroups.xlsx"))


    # boxplots showing signature scores in the different niches
    bplot = {}
    anova_res = {}
    tukey_res = {}
    for k in genesets:
        the_data = es_z.loc[k]
        bplot[k] = collections.OrderedDict()
        for sg in group_list:
            bplot[k][sg] = the_data.loc[groups.fillna('').str.contains(sg)].values

        anova_res[k] = stats.f_oneway(*bplot[k].values())
        mc = MultiComparison(the_data, groups, group_order=group_list)
        tukey_res[k] = mc.tukeyhsd(alpha=alpha)

        lbl, tmp = zip(*bplot[k].items())
        tmp = [list(t) for t in tmp]
        fig = plt.figure(num=k, figsize=(5, 4))
        ax = fig.add_subplot(111)
        sns.boxplot(data=tmp, orient='v', ax=ax, color='0.5')
        ax.set_xticklabels(lbl, rotation=45)
        ax.set_ylabel("Normalised ssGSEA score")
        fig.tight_layout()
        fig.savefig(os.path.join(outdir, '%s_ssgsea_by_subgroup_tcga.png' % k.lower()), dpi=200)
        fig.savefig(os.path.join(outdir, '%s_ssgsea_by_subgroup_tcga.pdf' % k.lower()))

    # can annotate these manually based on statistics?
Example #14
gtvs = sorted(glob.glob('/run/user/1000/gvfs/smb-share:server=ad,share=fs'
                              '/E210-Projekte/miss-classified/big_gtv/Tumor_*.nrrd'))
mask_dir = '/run/user/1000/gvfs/smb-share:server=ad,share=fs/E210-Projekte/miss-classified/big_gtv_bet/'

gtv_vols_big, distances_big = feat_calc_miss_classified(gtvs, mask_dir)

gtvs = sorted(glob.glob('/run/user/1000/gvfs/smb-share:server=ad,share=fs'
                              '/E210-Projekte/miss-classified/gtv_middle/Tumor_*.nrrd'))
mask_dir = '/run/user/1000/gvfs/smb-share:server=ad,share=fs/E210-Projekte/miss-classified/gtv_middle_bet/'

gtv_vols_mid, distances_mid = feat_calc_miss_classified(gtvs, mask_dir)

gt_als = np.transpose(np.hstack((gtv_vols_mid, gtv_vols_big, gtv_vols_cor)))
ds_als = np.transpose(np.hstack((distances_mid, distances_big, distances_cor)))
labels = np.transpose(np.hstack((['middle']*len(gtv_vols_mid), ['big']*len(gtv_vols_big), ['corr']*len(gtv_vols_cor))))
mod_gtv = MultiComparison(gt_als, labels)
print(mod_gtv.tukeyhsd())
mod_ds = MultiComparison(ds_als, labels)
print(mod_ds.tukeyhsd())

print(distances_big)
# gtv_vols, distances = feat_calc_corr_classified()

# print('Correlation between distances and volumes: {}'.format(np.corrcoef(gtv_vols, distances)[0, 1]))
# print('Median distance: {}'.format(np.median(distances)))
# print('Median GTV volume: {}'.format(np.median(gtv_vols)))
# plot.scatter(gtv_vols, distances)
# plot.show()
# print(distances)
Example #15
import matplotlib.pyplot as plt

for i in ["regular", "death", "emergentjobs", "hardjobs", "suddentasks"]:
    for joblength in [0, 5, 10, 15]:
        print("starting to show results for {} with job length {}".format(
            i, joblength))
        data = np.loadtxt(
            "/home/drew/tmp/jumpNN/{}/{}/allTimeResults.txt".format(
                i, joblength),
            dtype={
                'names': ('mean', 'group'),
                'formats': ('f4', 'S20')
            })

        # http://www.statsmodels.org/stable/_modules/statsmodels/sandbox/stats/multicomp.html#MultiComparison
        mc = MultiComparison(data['mean'], data['group'])
        result = mc.tukeyhsd()

        print(result)
        print(mc.groupsunique)

        a = result.plot_simultaneous()
        #a.title("Mean Wait Time Tukey Test Results for {} with Job Length {}".format(i, joblength))
        a.show()

        input("Press Enter to continue...")

# for i in ["regular", "emergentjobs", "hardjobs", "death", "suddentasks"]:
# 	for joblength in [0, 5, 10, 15]:
# 		print("starting to show results for {} with job length {}".format(i, joblength))
# 		data = np.loadtxt("/home/drew/tmp/jumpNN/{}/{}/allBountyResults.txt".format(i, joblength), dtype={'names': ('mean', 'group'), 'formats': ('f4', 'S20')})
Example #16
print(train_data2)
#    Performed an ANOVA on the 'Sex' column using 'Survived' as the independent variable. 


#for the anova we need to create our one-way model:
model = ols('Sex ~ Survived', data=train_data2).fit()

#actually run the anova:
aov_table = sm.stats.anova_lm(model, typ=2)
print("ANOVA 1 results:")
print(aov_table)

#So, we will compare the three groups against each other:
print("\nTukey HSD results:")
mc = MultiComparison(train_data2['Sex'], train_data2['Survived'])
result = mc.tukeyhsd()

print(result)

print("There is a correlation to sex to survival. The female's had a higher chance of surviving.")


#    Perform a similar ANOVA on Pclass using 'Survived' as the independent variable.

#for the anova we need to create our one-way model:
model = ols('Pclass ~ Survived', data=train_data2).fit()

#actually run the anova:
aov_table = sm.stats.anova_lm(model, typ=2)
print("ANOVA 2 results:")
Example #17
fontsize = 20    
fig, axes = plt.subplots()

test = sns.violinplot('Coating', param, data=df, ax=axes)
axes.set_ylabel('')    
axes.set_xlabel('')

figure = test.get_figure()    
figure.savefig(param+ '.png', dpi=400)


f,p = stats.kruskal(df[df['Coating']=='L2P'][param],
                     df[df['Coating']=='N2P'][param],
                     df[df['Coating']=='P10'][param])

mc = MultiComparison(df[param], df['Coating'])
result = mc.tukeyhsd()

print('Nonparametric One-way ANOVA (Kruskal-Wallis)')
print('=============')

print('H value:', f)
print('P value:', p, '\n')

print(result)
print(mc.groupsunique)


# PCA of dfCoat

X = df.iloc[:,3:13] # table of all the values (ignore well and duplicate, cols 1 and 2)
Example #18
# brown-forsythe test
w, p_bf = stats.levene(edg['WPM'], graf['WPM'], uni['WPM'], center='median')
check_p('brown forsythe test',
        assumption='homogeneity of variance',
        p_val=p_bf)
# non-significance shows we don't have a violation

# now that we know our assumptions have not been violated, we can fit the ANOVA. This is the omnibus test
alpha_lm = ols('WPM ~ C(Alphabet)', data=alpha).fit()
logger.info(f'ANOVA summary: \n\n {alpha_lm.summary()}')
# Prob (F-statistic) shows that there is some difference between the different Alphabets but does not tell us where the
# difference is. For that we do the pairwise comparisons

# tukey comparison followed by holm adjustment (not sure how to combine the two)
mc = MultiComparison(alpha['WPM'], alpha['Alphabet'])
logger.info(f'tukey comparison2: \n {mc.tukeyhsd()}')
comp = mc.allpairtest(stats.ttest_ind, method='Holm')
logger.info(f'holm corrected version: \n {comp[0]}')

# non parametric version of one-way ANOVA
chi, p = stats.kruskal(edg['WPM'], graf['WPM'], uni['WPM'])
check_p(descr='Kruskal chi squared test', assumption='', p_val=p)

# mann whitney
mw, p_eg = stats.mannwhitneyu(edg['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat edg vs. graf: {mw}, p value: {p_eg}')
mw, p_ug = stats.mannwhitneyu(uni['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat uni vs. graf: {mw}, p value: {p_ug}')
mw, p_ue = stats.mannwhitneyu(uni['WPM'], edg['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat uni vs. edg: {mw}, p value: {p_ue}')
Example #19
    ##
    ##   TUKEY
    ##
    print('\n\n\n#### TUKEY TEST ####')

    # build a labelled record array of best fitness per algorithm run
    arrayegua = []
    for i in range(num_execs): arrayegua.append(("GA", best_ga[i]))
    for i in range(num_execs): arrayegua.append(("ED", best_ed[i]))
    for i in range(num_execs): arrayegua.append(("PSO", best_pso[i]))
    for i in range(num_execs): arrayegua.append(("EP", best_ep[i]))
    for i in range(num_execs): arrayegua.append(("ABC", best_abc[i]))

    data_arr = np.rec.array(arrayegua, dtype=[('Algoritmo', '|U5'), ('Fitness', float)])

    mc = MultiComparison(data_arr['Fitness'], data_arr['Algoritmo'])
    tukey_result = mc.tukeyhsd()

    print(tukey_result)
    print(mc.groupsunique)

##########################

# bounds = [(-5,5)]*30
# result = differential_evolution(rosen, bounds)
# print (result.x)
# print("\n")
# print (result.fun)

#########################
Example #20
def run_stats(experiment):
    '''Run independent T-test or one-way ANOVA dependent on number of groups.

    Args:
        experiment (Experiment instance): An instance of the Experiment class.

    Returns:
        A new Pandas data frame with p values, adjusted p values and Tukey HSD
        post-hoc results if there are > 2 groups.

    '''

    groups = experiment.get_groups()
    samples = experiment.get_sampleids()
    df = experiment.df
    all_vals = []

## Get values for each group, ready for T-test or ANOVA.

    for group in groups:
        sample_re = re.compile(group + r"_\d+$")
        ids = [sample for sample in samples if sample_re.match(sample)]
        vals = list(map(list, df[ids].values))
        all_vals.append(vals)

## Decide whether to use T-test or ANOVA dependent on number of groups.
    if len(groups) == 2:
        p_vals = [ttest_ind(all_vals[0][i], all_vals[1][i])[1] for i in range(len(all_vals[0]))]
    else:
        p_vals = []
        for i in range(len(all_vals[0])):
            row_vals = [all_vals[j][i] for j in range(len(groups))]
            p_val = f_oneway(*row_vals)[1]
            p_vals.append(p_val)

## Adjust the p values and create a new data frame with them in.
    p_val_adj = list(multipletests(p_vals, method='fdr_bh')[1])
    new_df = df.iloc[:, :5].copy()
    new_df['p_val'] = pd.Series(p_vals, index=new_df.index)
    new_df['p_val_adj'] = pd.Series(p_val_adj, index=new_df.index)

    ## Post-hoc test.

    ## Only do the post-hoc test if there are more than 2 groups, duh!
    if len(groups) > 2:
        vals_df = df[samples]
        group_ids = [sample.split('_')[0] for sample in vals_df.columns.values]
        posthoc_results = {}

        ## Run the post-hoc test on each row.
        for row in range(len(vals_df)):
            row_vals = vals_df.iloc[row]
            mc = MultiComparison(row_vals, group_ids)
            mc_groups = mc.groupsunique
            results = mc.tukeyhsd()
            significant = results.reject
            pairs = list(zip(*[x.tolist() for x in mc.pairindices]))

            ## Go through each pair and add results to the posthoc_results dictionary.
            for i in range(len(pairs)):
                pair = list(pairs[i])
                pair.sort()
                pair_name = str(mc_groups[pair[0]]) + '_' + str(mc_groups[pair[1]])
                if pair_name in posthoc_results:
                    posthoc_results[pair_name].append(significant[i])
                else:
                    posthoc_results[pair_name] = [significant[i]]

        ## Add the post-hoc results to the data frame.
        for pair_name in posthoc_results:
            new_df['significant_' + pair_name] = posthoc_results[pair_name]

    return new_df
Example #21
        #   order=['Sibs','Tau','Lesion'],
    )
    p.legend_.remove()
    # p.set_yticks(np.arange(0.1,0.52,0.04))
    sns.despine(trim=True)

    condition_s = set(std_plt['condition'].values)
    condition_s = list(condition_s)

    if len(condition_s) == 2:
        # Paired T Test for 2 conditions
        # Separate data by condition.
        std_cond1 = std_plt.loc[std_plt['condition'] ==
                                condition_s[0]].sort_values(by='excluded_exp')
        std_cond2 = std_plt.loc[std_plt['condition'] ==
                                condition_s[1]].sort_values(by='excluded_exp')
        ttest_res, ttest_p = ttest_rel(std_cond1['std(posture)'],
                                       std_cond2['std(posture)'])
        print(
            f'* Age {age}: {condition_s[0]} v.s. {condition_s[1]} paired t-test p-value = {ttest_p}'
        )
    elif len(condition_s) > 2:
        multi_comp = MultiComparison(
            ang_std_all['std(posture)'],
            ang_std_all['dpf'] + ang_std_all['condition'])
        print(f'* Age {age}:')
        print(multi_comp.tukeyhsd().summary())
    else:
        pass
plt.show()
Example #22
( 29,  'medical',  3 ),
( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                ('Treatment', '|S8'),
                                ('StressReduction', '<i4')])

# First, do an one-way ANOVA
df = pd.DataFrame(dta2)
model = ols('StressReduction ~ C(Treatment)',df).fit()

anovaResults = anova_lm(model)
print(anovaResults)
if anovaResults['PR(>F)'][0] < 0.05:
    print('One of the groups is different.')

#Then, do the multiple testing
mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
print(mod.tukeyhsd().summary())

# The following code produces the same printout
res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
#print(res2)

# Show the group names
print(mod.groupsunique)

# Generate a plot
import matplotlib.pyplot as plt
plt.plot([0, 1, 2], res2.meandiffs, 'o')
plt.errorbar([0, 1, 2], res2.meandiffs,
             yerr=np.abs(res2.confint.T - res2.meandiffs), ls='none')
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
Example #23
anova_results = anova_lm(anova_reg)
print('\nANOVA results\n', anova_results)

#Check for heteroskedasticity
sm.qqplot(anova_reg.resid, line='s')
plt.show()

######
#Post Hoc Tests for One-way ANOVA

#Tukey test - good when groups are the same size and have homogeneous variance
postHoc = pairwise_tukeyhsd(alldata['Fare_Per_Person'], alldata['Embarked'], alpha=0.05)
print(postHoc)

#Pairwise comparison using Bonferroni correction of p-values
mc = MultiComparison(alldata['Fare_Per_Person'], alldata['Embarked'])
#print(mc.allpairtest(stats.ttest_rel, method='Holm')[0])  #For paired t-test
print(mc.allpairtest(stats.ttest_ind, method='b')[0])     #For independent t-test

######
#ANCOVA

#Look for heteroskedasticity
mask = (alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1)
plt.plot(alldata[mask]['Fare_Per_Person'], alldata[mask]['Group_Size'], 'bo')
plt.show()
#Second class male passengers with a fare price > 0 seem OK
#There are a couple group sizes with only 1 observation with these criteria though, so make sure to filter them out too

#Test for heteroskedasticity
mask = ((alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1) & (alldata['Fare'] > 0)
        & (alldata['Group_Size'].isin([1, 2, 3, 4, 8, 9, 10, 11])))
print(levenes_test(alldata[mask]['Fare_Per_Person'], alldata[mask]['Group_Size']))
print(bartlett_test(alldata[mask]['Fare_Per_Person'], alldata[mask]['Group_Size']))
Example #24
                      data[data['Game'] == groupNames[1]].Score,
                      data[data['Game'] == groupNames[2]].Score)

# print output and conclusion of the one-way ANOVA between the groups
anovaPrintout = 'One-way ANOVA\n=============\nF-value: ' + str(
    f) + '\np-value: ' + str(p) + '\n'
anovaConclusion = ('CONCLUSION:\n-> Significant DIFFERENCE found!' if p < 0.05
                   else 'CONCLUSION:\n-> NO significant difference found...')

# append to output file
outputFileName = 'output_cohendadded.txt'
with open(outputFileName, 'a') as fl:
    fl.write('\nData File: ' + fileName + '\n===========================\n' +
             anovaPrintout + anovaConclusion + '\n')

if p < 0.05:
    mc = MultiComparison(data['Score'], data['Game'])
    result = mc.tukeyhsd()
    with open(outputFileName, 'a') as f:
        f.write(str(result) + '\n')
        f.write('\t\tmeandiff = mean(group2) - mean(group1)\n')
        # f.write('Unique groups:\n')
        # f.write(str(mc.groupsunique)+'\n')
        f.write('Significantly higher:\n')
        f.write(checkWhichSigHigher(str(result), groupNames))

with open(outputFileName, 'a') as f:
    f.write('Cohen\'s d effect sizes\n======================\n')

# now get Cohen's d effect sizes:
fileData = formatDataForCohenCalc(fileData)
for i in range(len(groupNames)):
Example #25
                  jitter=True,
                  data=final_df[(final_df.Event == 'Cells') |
                                (final_df.Event == treatments[ind])],
                  ax=panel)

plt.savefig('respiration.svg')
plt.show()

#statistical analysis

final_df['Slope'] = final_df.Slope.astype(float)

#fit linear model
model = ols('Slope ~ Event + Experiment', data=final_df).fit()
print(model.summary())

print('ANOVA analysis')

aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

print('Post hoc tukey')

mc = MultiComparison(final_df['Slope'], final_df['Event'])
mc_results = mc.tukeyhsd()
print(mc_results.summary())

# Recover approximate p-values for each pairwise comparison from the
# studentized range distribution: psturng(|meandiff| / std_pairs, k, df)
p_values = psturng(np.abs(mc_results.meandiffs / mc_results.std_pairs),
                   len(mc_results.groupsunique), mc_results.df_total)

print('p_values: ', p_values)
Example #26
                   virginica['sepal_length']))
print("\nPetal Width:")
print(
    stats.f_oneway(setosa['petal_width'], versicolor['petal_width'],
                   virginica['petal_width']))
print("\nPetal Length:")
print(
    stats.f_oneway(setosa['petal_length'], versicolor['petal_length'],
                   virginica['petal_length']))

# Posthoc test- Tukey's HSD
print(
    "\nPost-hoc test performed to determine which groups are statistically significantly different from each other"
)
print("Sepal Width:")
mc1 = MultiComparison(df['sepal_width'], df['variety'])
print(mc1.tukeyhsd())
print()
print("\nSepal Length:")
mc2 = MultiComparison(df['sepal_length'], df['variety'])
print(mc2.tukeyhsd())
print()
print("\nPetal Width:")
mc3 = MultiComparison(df['petal_width'], df['variety'])
print(mc3.tukeyhsd())
print()
print("\nPetal Length:")
mc4 = MultiComparison(df['petal_length'], df['variety'])
print(mc4.tukeyhsd())

#Using Matplotlib to generate histograms of each variable
Example #27
def main():
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    
    # Set up the data, as a structured array.
    # The first and last fields are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
    (  1,   'mental',  2 ),
    (  2,   'mental',  2 ),
    (  3,   'mental',  3 ),
    (  4,   'mental',  4 ),
    (  5,   'mental',  4 ),
    (  6,   'mental',  5 ),
    (  7,   'mental',  3 ),
    (  8,   'mental',  4 ),
    (  9,   'mental',  4 ),
    ( 10,   'mental',  4 ),
    ( 11, 'physical',  4 ),
    ( 12, 'physical',  4 ),
    ( 13, 'physical',  3 ),
    ( 14, 'physical',  5 ),
    ( 15, 'physical',  4 ),
    ( 16, 'physical',  1 ),
    ( 17, 'physical',  1 ),
    ( 18, 'physical',  2 ),
    ( 19, 'physical',  3 ),
    ( 20, 'physical',  3 ),
    ( 21,  'medical',  1 ),
    ( 22,  'medical',  2 ),
    ( 23,  'medical',  2 ),
    ( 24,  'medical',  2 ),
    ( 25,  'medical',  3 ),
    ( 26,  'medical',  2 ),
    ( 27,  'medical',  3 ),
    ( 28,  'medical',  1 ),
    ( 29,  'medical',  3 ),
    ( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                    ('Treatment', '|S8'),
                                    ('StressReduction', '<i4')])
    
    # First, do an one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)',df).fit()
    
    anovaResults =  anova_lm(model)
    print(anovaResults)
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')
    
    #Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print((mod.tukeyhsd().summary()))
    
    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    #print res2[0]
    
    # Show the group names
    print((mod.groupsunique))
    
    # Generate a plot
    import matplotlib.pyplot as plt
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    #plt.errorbar(xvals, res2.meandiffs, yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
    errors = np.ravel(np.diff(res2.confint)/2)
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, ls='none')
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')          
    
    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig('MultComp.png', dpi=200)
    print(('Figure written to {0}'.format(outFile)))
    
    plt.show()
    
    # Instead of the Tukey's test, we can do pairwise t-test
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print((rtp[0]))
    
    # and then with the Bonferroni correction
    print((mod.allpairtest(stats.ttest_rel, method='b')[0]))
    
    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to 
    # use a few tricks:(http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance
    
    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided
    
    # Now with the Bonferroni correction
    from statsmodels.stats.multitest import multipletests
    res_b = multipletests(my_pvalues, method='b')
    
    return res2.variance
Example #28
 def setup_class_(self):
     self.mc = MultiComparison(self.endog, self.groups)
     self.res = self.mc.tukeyhsd(alpha=self.alpha)
Example #29
 def setup_class_(cls):
     cls.mc = MultiComparison(cls.endog, cls.groups)
     cls.res = cls.mc.tukeyhsd(alpha=cls.alpha)
Example #30
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison

tratamento = pd.read_csv('database/anova.csv', sep=';')  # data to be used

# build and fit the model ('dependent ~ independent')
modelo1 = ols('Horas ~ Remedio', data=tratamento).fit()
resul1 = sm.stats.anova_lm(modelo1)  # evaluate with ANOVA

# build and fit the model ('dependent ~ independent * independent')
modelo2 = ols('Horas ~ Remedio * Sexo', data=tratamento).fit()
resul2 = sm.stats.anova_lm(modelo2)  # evaluate with ANOVA

# compare Horas across the Remedio groups
mc = MultiComparison(tratamento['Horas'], tratamento['Remedio'])
resul_test = mc.tukeyhsd()  # run the Tukey HSD test

grafico = resul_test.plot_simultaneous()  # dedicated plot of the results
#plt.show()
Example #31
ms_8 = [36, 39, 39, 38, 44, 42, 48, 38, 46, 37, 46]
ms_9 = [35, 29, 39, 37, 40, 36, 43, 48, 41, 44, 42]

data = np.rec.array([
(  47,   'ms_1'), (  47,   'ms_1'), (  49,   'ms_1'), (  45,   'ms_1'), (  42,   'ms_1'), ( 43 ,   'ms_1'), (  39,   'ms_1'), (  48,   'ms_1'), (  43,   'ms_1'),
(  41,   'ms_2'), (  45,   'ms_2'), (  39,   'ms_2'), (  48,   'ms_2'), (  39,   'ms_2'), (  37,   'ms_2'), (  42,   'ms_2'), (  47,   'ms_2'), (  44,   'ms_2'), ( 42,   'ms_2'),
(  47,   'ms_3'), (  42,   'ms_3'), (  45 ,  'ms_3'), (  43,   'ms_3'), (  47,   'ms_3'), (  37,   'ms_3'), (  43,   'ms_3'), (  47,   'ms_3'), (  41,   'ms_3'), ( 39,   'ms_3'),
(  38,   'ms_4'), (  38,   'ms_4'), (  37,   'ms_4'), (  45,   'ms_4'), (  41,   'ms_4'), (  41,   'ms_4'), (  41,   'ms_4'), (  46,   'ms_4'), (  53,   'ms_4'), (  45,   'ms_4'), (  44,   'ms_4'),
(  47,   'ms_5'), (  47,   'ms_5'), (  49,   'ms_5'), (  44,   'ms_5'), (  49,   'ms_5'), (  46,   'ms_5'), (  41,   'ms_5'), (  41,   'ms_5'), (  42,   'ms_5'), (  46,   'ms_5'),
(  41,   'ms_6'), (  41,   'ms_6'), (  45,   'ms_6'), (  38,   'ms_6'), (  42,   'ms_6'), (  33,   'ms_6'), (  45,   'ms_6'), (  43,   'ms_6'), (  44,   'ms_6'), (  44,   'ms_6'), (  46,   'ms_6'),
(  35,   'ms_7'), (  36,   'ms_7'), (  44,   'ms_7'), (  32,   'ms_7'), (  40,   'ms_7'), (  41,   'ms_7'), (  43,   'ms_7'), (  45,   'ms_7'), (  48,   'ms_7'), (  48,   'ms_7'),
(  36,   'ms_8'), (  39,   'ms_8'), (  39,   'ms_8'), (  38,   'ms_8'), (  44,   'ms_8'), (  42,   'ms_8'), (  48,   'ms_8'), (  38,   'ms_8'), (  46,   'ms_8'), (  37,   'ms_8'), (  46,   'ms_8'),
(  35,   'ms_9'), (  29,   'ms_9'), (  39,   'ms_9'), (  37,   'ms_9'), (  40,   'ms_9'), (  36,   'ms_9'), (  43,   'ms_9'), (  48,   'ms_9'), (  41,   'ms_9'), (  44,   'ms_9'), (  42,   'ms_9')],
dtype = [('score', '<i4'), ('student', '|S8')])


print("Results from Levene's test, dealing with homogeneity of variance:")
print(stats.levene(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))

print("Results from one-way ANOVA:")
print(stats.f_oneway(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))

print("Results of the Kruskla Wallis Test:")
print(stats.mstats.kruskalwallis(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))

mc = MultiComparison(data['score'], data['student'])
results = mc.tukeyhsd(alpha=0.1)


print(results)
Example #32
        sigfeats_out.to_csv(sigfeats_outpath, header=False)

        if TukeyHSD:
            # Tally total number of significantly different pairwise comparisons
            n_sigdiff_pairwise_beforeBF = 0
            n_sigdiff_pairwise_afterBF = 0

            # Tukey HSD post-hoc pairwise differences between dates for each feature
            for feature in feature_colnames:
                # Tukey HSD post-hoc analysis (no Bonferroni correction!)
                tukeyHSD = pairwise_tukeyhsd(OP50_control_df[feature],
                                             OP50_control_df['date_yyyymmdd'])
                n_sigdiff_pairwise_beforeBF += sum(tukeyHSD.reject)

                # Tukey HSD post-hoc analysis via MultiComparison (note: this
                # runs the same Tukey test as above; tukeyhsd() itself applies
                # no Bonferroni correction)
                tukeyHSD_BF = MultiComparison(OP50_control_df[feature],
                                              OP50_control_df['date_yyyymmdd'])
                n_sigdiff_pairwise_afterBF += sum(
                    tukeyHSD_BF.tukeyhsd().reject)

            total_comparisons = len(feature_colnames) * 6
            reject_H0_percentage = n_sigdiff_pairwise_afterBF / total_comparisons * 100
            print("%d / %d (%.1f%%) of pairwise-comparisons of imaging dates (%d features) show significant variation for OP50 control (TukeyHSD)" %\
                  (n_sigdiff_pairwise_afterBF, total_comparisons, reject_H0_percentage, len(feature_colnames)))

            # TODO: Reverse-engineer p-values using mean/std
            #from statsmodels.stats.libqsturng import psturng
            ##studentized range statistic
            #rs = res2[1][2] / res2[1][3]
            #pvalues = psturng(np.abs(rs), 3, 27)
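            # A sketch of that reverse-engineering, following the psturng
            # recipe used elsewhere in these examples (res here would be
            # tukeyHSD_BF.tukeyhsd(); left commented like the notes above):
            #   rs = res.meandiffs / res.std_pairs
            #   pvalues = psturng(np.abs(rs), len(res.groupsunique), res.df_total)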

            # Mantel test?
Example #33
print('F value:', f)
print('P value:', p, '\n')

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import itertools

headers = [
    'P-PG', 'P-TD', 'P-RMSPROP', 'NEAT', 'NEAT-EM-P-PG', 'NEAT-EM-P-TD',
    'NEAT-EM-P-RMSPROP'
]
group_names = []
for header in headers:
    group_names += list(itertools.repeat(header, min_length))

mc = MultiComparison(np.asarray(stripped_groups).flatten(), group_names)
result = mc.tukeyhsd()

from statsmodels.stats.libqsturng import psturng

print(result)
print(mc.groupsunique)
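# psturng(|meandiff| / std_pairs, number of groups, df_total) recovers
# approximate p-values for each pairwise comparison from the studentized
# range distribution: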
print(
    psturng(np.abs(result.meandiffs / result.std_pairs),
            len(result.groupsunique), result.df_total))


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
Example #34
print('Accuracy')

path_to_data = 'D:/Sasha/subversion/trunk/AuthorshipAttributionRussianTexts/results/statistical_tests_data.json'
with codecs.open(path_to_data, 'r') as f:
    data = f.read()
    data = json.loads(data)

for k in data:
    values = {}
    mesure = data[k]
    values[k] = {}

    values[k]['SGD'] = [m['value'] for m in mesure if m['key'] == 'SGD']
    values[k]['LSV'] = [m['value'] for m in mesure if m['key'] == 'LSV']
    values[k]['PA'] = [m['value'] for m in mesure if m['key'] == 'PA']
    values[k]['COMP'] = [m['value'] for m in mesure if m['key'] == 'COMP']

    f, p = stats.f_oneway(values[k]['SGD'], values[k]['LSV'], values[k]['PA'],
                          values[k]['COMP'])

    print('One-way ANOVA')
    print('=============')
    print('F value:', f)
    print('P value:', p, '\n')

    mc = MultiComparison([int(m['value']) for m in mesure],
                         [m['key'] for m in mesure])
    result = mc.tukeyhsd()
    print(result)
    print(mc.groupsunique)
Example #35
 def setup_class_(self):
     self.mc = MultiComparison(self.endog, self.groups)
     self.res = self.mc.tukeyhsd(alpha=self.alpha)
Example #36
    "The standard deviation of the CUPED-adjusted metric is % s." %
    round(std_CUPED, 4))

print(
    "The relative reduction in standard deviation was % s" %
    round(relative_diff * 100, 5),
    "%",
)

# As you can see, we have managed to reduce the relative degree of variance (as measured by standard deviation) by ~4%; now we can perform statistical analysis on our newly computed metric, in order to determine whether there was a statistical effect or not:
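# For reference, a minimal self-contained sketch of the CUPED adjustment
# itself; the synthetic arrays below stand in for the notebook's metric and
# its pre-experiment covariate, whose construction is not shown here:
import numpy as np

rng = np.random.default_rng(0)
pre = rng.normal(10, 2, size=1000)        # pre-experiment covariate
post = pre + rng.normal(0, 1, size=1000)  # experiment-period metric
theta = np.cov(post, pre)[0, 1] / np.var(pre, ddof=1)
cuped = post - theta * (pre - pre.mean())
print(post.std(ddof=1), cuped.std(ddof=1))  # shows the variance reduction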

# ## Significance testing (post CUPED-Analysis)

# In[10]:

mc = MultiComparison(df['CUPED-adjusted_metric'], df['Variant'])
mc_results = mc.tukeyhsd()
null_hypothesis = mc_results.reject
df_grouped = df[['Variant', 'CUPED-adjusted_metric']]
Control_Matrix = df_grouped[(df_grouped['Variant'] == 'Control_Matrix')]
Variant_BT = df_grouped[(df_grouped['Variant'] == 'Variant_BT')]
Mean_control = round(statistics.mean(Control_Matrix['CUPED-adjusted_metric']),
                     4)
Mean_variant = round(statistics.mean(Variant_BT['CUPED-adjusted_metric']), 4)

print(mc_results)
print(
    "The mean of the Control (Matrix) group is:",
    round(statistics.mean(Control_Matrix['CUPED-adjusted_metric']), 4),
)
print(
Example #37
def position_stats(df, name_mapping=None):

    # print '### position stats'
    from statsmodels.stats.weightstats import ztest
    from functools import partial, wraps
    POS = df.position.unique()
    POS.sort()
    model = 'value ~ group'
    allpvals = None
    header = None
    DF = None

    ttest_log_wrap = wraps(
        partial(ttest_ind_log, equal_var=False))(ttest_ind_log)
    ttest_ind_nev = wraps(
        partial(stats.ttest_ind, equal_var=False))(stats.ttest_ind)
    mwu_test = wraps(partial(stats.mannwhitneyu, use_continuity=False))(
        stats.mannwhitneyu)

    bootstrap_sample_num = 1000
    # print df

    stats_test = ttest_ind_nev
    GROUPS = df.group.unique()
    # GROUPS = [0,3]

    for pos in POS:
        # print pos
        data = df[df.position == pos]
        data = data.groupby(['sid']).mean()
        data = resample_data(data, num_sample_per_pos=BOOTSTRAP_NUM)
        # print data
        # print data.group.unique()
        # data = df[(df.group == 0) | (df.group == 3)]
        # print data
        # sys.exit()

        #cross = smf.ols(model, data=data).fit()
        #anova = sm.stats.anova_lm(cross, type=1)
        # print data.group

        mcp = MultiComparison(data.value, data.group.astype(int))

        rtp = mcp.allpairtest(stats_test, method='bonf')
        mheader = []
        for itest in rtp[2]:
            name1 = itest[0]
            name2 = itest[1]
            if name_mapping is not None:
                name1 = name_mapping[str(name1)]
                name2 = name_mapping[str(name2)]

            mheader.append("{} - {}".format(name1, name2))

        if not header or len(mheader) > len(header):
            header = mheader

        # get the uncorrecte pvals
        pvals = rtp[1][0][:, 1]

        ndf = pd.DataFrame(data=[pvals], columns=mheader)
        if allpvals is None:
            allpvals = ndf
        else:
            allpvals = pd.concat([allpvals, ndf])

    # return allpvals
    # corr_pvals = allpvals
    # print allpvals
    # return allpvals

    flatten = allpvals.values.ravel()
    flatten = flatten * 2
    mcpres = multipletests(flatten, alpha=0.05, method='bonf')
    # print mcpres
    corr_pvals = np.array(mcpres[1])
    # print corr_pvals
    corr_pvals = np.reshape(corr_pvals, (len(POS), -1))

    # print corr_pvals,corr_pvals.shape,header
    data = pd.DataFrame(data=corr_pvals, columns=header)
    data = data[data.columns[:3]]
    return data
Example #38
def ttp_histogram():
    geometries = ("G1", "G3", "G7", "GX")
    ttp = []
    fluomax = []
    for i, pattern in enumerate(geometries):
        ttp.append([])
        fluomax.append([])
        for infile in glob.glob("data/raw/U1_{}_*.h5".format(pattern)):
            data = filereader.Simulation(infile)
            ttp[i].append(data.ttp)
            fluomax[i].append(data.fluomax)

        ttp[i] = np.array(ttp[i])
        fluomax[i] = np.array(fluomax[i])

    for i in range(4):
        ttp[i] = ttp[i][fluomax[i] > 0.4]

    nr_sparks = [len(data) for data in ttp]
    nr_runs = [len(data) for data in fluomax]

    print("  Case | Runs | Sparks | Fidelity")
    fidstring = "{:>6} | {:>4} | {:>6} | {:>7.1%}"
    print("-----------------------------------")
    for geom, sparks, runs in zip(geometries, nr_sparks, nr_runs):
        print(fidstring.format(geom, runs, sparks, sparks / float(runs)))
    print("-----------------------------------\n")

    print("  Case | Mean TTP ± std. err [ms]")
    print("-----------------------------------")
    for geom, data, n in zip(geometries, ttp, nr_sparks):
        print("{:>6} |    {:.3f} ± {:.3f}".format(geom, np.mean(data),
                                                  np.std(data) / np.sqrt(n)))
    print("-----------------------------------\n")

    bins = np.arange(0, 25, 2.5)
    weights = [np.ones(len(t)) / len(t) for t in ttp]

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_axes((0.2, 0.2, 0.7, 0.7))
    color = ('blue', (1, 0.28, 0.1), (0.9, 0.18, 0), (0.7, 0.14, 0))

    ax.hist(ttp, bins, weights=weights, histtype='bar', color=color)

    ax.set_xticks((5, 10, 15, 20, 25))
    ax.set_xlabel('Time to Peak [ms]')
    ax.set_ylabel('Fraction')
    plotutil.simpleax(ax)

    ax = fig.add_axes((0.6, 0.6, 0.3, 0.3))
    for i, t in enumerate(ttp):
        ax.bar(i + 1,
               np.mean(t),
               yerr=np.std(t) / np.sqrt(len(t)),
               color=color[i],
               ecolor='black')

    ax.axis((0.5, 5.3, 0, 12))
    ax.set_ylabel('TTP [ms]')
    ax.set_xticks(())
    ax.set_yticks((0, 4, 8, 12))

    print(scipy.stats.f_oneway(*ttp))

    grouplabels = []
    grouplabels.extend(['1'] * nr_sparks[0])
    grouplabels.extend(['3'] * nr_sparks[1])
    grouplabels.extend(['7'] * nr_sparks[2])
    grouplabels.extend(['X'] * nr_sparks[3])
    endog = []
    endog.extend(ttp[0])
    endog.extend(ttp[1])
    endog.extend(ttp[2])
    endog.extend(ttp[3])

    mc = MultiComparison(np.array(endog), np.array(grouplabels))
    result = mc.tukeyhsd(alpha=0.05)
    print(result)
    print(mc.groupsunique)
Example #39
                                        spectraTransform[np.where(dominant == listDominant[10])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[11])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[12])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[13])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[14])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[15])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[16])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[17])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[18])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[19])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[20])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[21])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[22])[0], w])
        # If the anova turns back a pvalue < 0.05, do multicomparison to figure out what samples are different
        if anovaResults[w, 1] < 0.05:
            mc = MultiComparison(spectraTransform[:, w], dominant)  # http://statsmodels.sourceforge.net/0.6.0/_modules/statsmodels/stats/multicomp.html
            result = mc.tukeyhsd()  # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.MultiComparison.tukeyhsd.html
            inResults = np.array([mc.groupsunique[mc.pairindices[0]], mc.groupsunique[mc.pairindices[1]], result.meandiffs, result.confint[:, 0], result.confint[:, 1], result.std_pairs, result.reject]).T
            inResults = np.column_stack((np.repeat(wavelengths[w], len(result.reject)), inResults))
            tukeyResults = np.vstack((tukeyResults, inResults))

# Set up csv file to output statistical results
outStats = open(outLocation + dateTag + '_statistical_analysis.csv', 'wb')  # Opening in write mode
row1 = np.hstack(('normal distribution p value for original spectra', normalStats))
row2 = np.hstack(('kurtosis p value for original spectra', kurtosisStats))
row3 = np.hstack(('skew p value for original spectra', skewStats))
row4 = np.hstack(('normal distribution p value for transformed spectra', normalTransformStats))
row5 = np.hstack(('kurtosis p value for transformed spectra', kurtosisTransformStats))
row6 = np.hstack(('skew p value for transformed spectra', skewTransformStats))
row7 = np.hstack(('anova results for transformed spectra', anovaResults[:, 1]))
inRows = np.vstack((row1, row2, row3, row4, row5, row6, row7))
Example #40
('Pat', 9),
('Pat', 4),
('Jack', 4),
('Jack', 8),
('Jack', 7),
('Jack', 5),
('Jack', 1),
('Jack', 5),
('Alex', 9),
('Alex', 8),
('Alex', 8),
('Alex', 10),
('Alex', 5),
('Alex', 10)], dtype = [('Archer','|U5'),('Score', '<i8')])

f, p = stats.f_oneway(data[data['Archer'] == 'Pat'].Score,
                      data[data['Archer'] == 'Jack'].Score,
                      data[data['Archer'] == 'Alex'].Score)

print('One-way ANOVA')
print('=============')

print('F value:', f)
print('P value:', p, '\n')

mc = MultiComparison(data['Score'], data['Archer'])
result = mc.tukeyhsd()

print(result)
print(mc.groupsunique)