def test_incorrect_output(self):
    # too few groups
    assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 4)
    # too many groups
    assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 6)
    # just one group
    assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1] * 10)

    # group_order does not select all observations, only one group left
    assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                  [1, 2] * 5, group_order=[1])

    # group_order does not select all observations,
    # we do tukey_hsd with a reduced set of observations
    data = np.arange(15)
    groups = np.repeat([1, 2, 3], 5)
    mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
    res1 = mod1.tukeyhsd(alpha=0.01)
    mod2 = MultiComparison(np.array(data[:10]), groups[:10])
    res2 = mod2.tukeyhsd(alpha=0.01)

    attributes = ['confint', 'data', 'df_total', 'groups', 'groupsunique',
                  'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs',
                  'variance']
    for att in attributes:
        err_msg = att + ' failed'
        assert_allclose(getattr(res1, att), getattr(res2, att), rtol=1e-14,
                        err_msg=err_msg)

    attributes = ['data', 'datali', 'groupintlab', 'groups', 'groupsunique',
                  'ngroups', 'nobs', 'pairindices']
    for att in attributes:
        err_msg = att + ' failed'
        assert_allclose(getattr(mod1, att), getattr(mod2, att), rtol=1e-14,
                        err_msg=err_msg)
def test_table_names_custom_group_order(self):
    # if the group_order parameter is used, the groups should
    # be reported in the specified order
    mc = MultiComparison(self.endog, self.groups,
                         group_order=[b'physical', b'medical', b'mental'])
    res = mc.tukeyhsd(alpha=self.alpha)
    # print(res)
    t = res._results_table
    expected_order = [(b'physical', b'medical'),
                      (b'physical', b'mental'),
                      (b'medical', b'mental')]
    for i in range(1, 4):
        first_group = t[i][0].data
        second_group = t[i][1].data
        assert_((first_group, second_group) == expected_order[i - 1])
class CheckTuckeyHSD(object):

    @classmethod
    def setup_class_(self):
        self.mc = MultiComparison(self.endog, self.groups)
        self.res = self.mc.tukeyhsd(alpha=self.alpha)

    def test_multicomptukey(self):
        meandiff1 = self.res[1][2]
        assert_almost_equal(meandiff1, self.meandiff2, decimal=14)

        confint1 = self.res[1][4]
        assert_almost_equal(confint1, self.confint2, decimal=2)

        reject1 = self.res[1][1]
        assert_equal(reject1, self.reject2)

    def test_group_tukey(self):
        res_t = get_thsd(self.mc, alpha=self.alpha)
        assert_almost_equal(res_t[4], self.confint2, decimal=2)

    def test_shortcut_function(self):
        # check wrapper function
        res = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha)
        assert_almost_equal(res[1][4], self.res[1][4], decimal=14)
print('One-way ANOVA')
print('F value:', f)
if p <= 0.05:
    print('P value: {} <= 0.05'.format(p))
    print("=> Reject H0\n")
else:
    print('P value: {} > 0.05'.format(p))
    print("=> Fail to reject H0\n")

# Perform the Tukey HSD test
mc = MultiComparison(data['Waiting'], data['Policy'])
result = mc.tukeyhsd(alpha=0.05)
# use a distinct name for the file handle so it does not shadow the F value
with open('results/svr_rbf_' + sensor_name + '_waiting_plot_diff_means.txt', 'w') as fh:
    fh.write(str(result.summary()))
result.plot_simultaneous(comparison_name='policyOST')
plt.savefig('results/svr_rbf_' + sensor_name + '_waiting_plot_diff_means' + '.png')
print("================================================")

print("\n\n===For Dataset 2 using Linear Regression===")
for sensor_name in d2_data:
    print("Sensor name", sensor_name)
    data = np.rec.array(d2_data[sensor_name],
                        dtype=[('Policy', '|U10'), ('Waiting', '<i8')])
    f, p = stats.f_oneway(data[data['Policy'] == 'policyA'].Waiting,
                          data[data['Policy'] == 'policyC'].Waiting,
                          data[data['Policy'] == 'policyM'].Waiting,
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison

tratamento = pd.read_csv('dados/anova.csv', sep=';')
tratamento.boxplot(by='Remedio', grid=False)

modelo1 = ols('Horas ~ Remedio', data=tratamento).fit()
resultados1 = sm.stats.anova_lm(modelo1)

modelo2 = ols('Horas ~ Remedio * Sexo', data=tratamento).fit()
resultados2 = sm.stats.anova_lm(modelo2)

mc = MultiComparison(tratamento['Horas'], tratamento['Remedio'])
resultado_teste = mc.tukeyhsd()
print(resultado_teste)
resultado_teste.plot_simultaneous()
_, p = f_oneway(grupo_a, grupo_b, grupo_c)
print(p)

alpha = 0.05
if p <= alpha:
    print("null hypothesis rejected")
else:
    print("failed to reject the null hypothesis")

# Tukey test
dados = {"valores": [165, 152, 143, 140, 155, 130, 169, 164, 143, 154,
                     163, 158, 154, 149, 156],
         "grupo": ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B',
                   'C', 'C', 'C', 'C', 'C']}

import pandas as pd
dados_pd = pd.DataFrame(dados)
print(dados_pd)

from statsmodels.stats.multicomp import MultiComparison
compara_grupos = MultiComparison(dados_pd['valores'], dados_pd['grupo'])
teste = compara_grupos.tukeyhsd()
print(teste)
round(outliers, 4))

# New data frame that omits this range; look at the newly computed means:

# In[11]:

df_filtered = df[(df['Pvs_per_session'] < outliers)]
df_filtered.groupby(['Variant']).mean()

# ## Significance testing
# We can now perform our parametric statistical test. The code below executes an
# analysis of variance (ANOVA); although ANOVA can handle multiple independent
# variables/groups, the individual test performed between each pair of groups is
# a t-test, the most widely recognised assessment for significance testing in a
# frequentist framework:

# In[12]:

mc = MultiComparison(df_filtered['Pvs_per_session'], df_filtered['Variant'])
mc_results = mc.tukeyhsd()
null_hypothesis = mc_results.reject
print(mc_results)
print(
    "Reject null hypothesis and significant difference between experimental groups:",
    null_hypothesis,
)

# As seen above, the results from this experiment are statistically
# non-significant: we have failed to reject our null hypothesis (that no
# difference exists between independent groups). This means you can confidently
# tell your stakeholders that they needn't worry about the impact on
# Pvs_per_session of introducing the new widget. It may have increased clicks,
# but it failed to move the needle for our consumption metric. Below, we will
# look at some useful and user-friendly visualisations that you can present to
# stakeholders.

# ## Visualisations
# Prior to plotting, we are going to get the data into a format that makes
# visualisations easy to interpret. This involves drawing samples (1500) from
# our Pvs_per_session data for each experimental condition and plotting the
# distribution of the sample means. By the central limit theorem, those means
# will follow an approximately normal/bell-shaped distribution whose centre
# reflects the overall mean of the data. This is useful for visualisations as
# there will be no heavy skews/tails in the data whilst still preserving the
# mean of each experimental condition (a sketch of this resampling step follows
# below):
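# A minimal sketch of the resampling step described above. The column names
# 'Pvs_per_session' and 'Variant' and the 1500 samples come from the text; the
# per-sample size and the seed are assumptions added for illustration:
import numpy as np
import pandas as pd

def sample_means(df, metric='Pvs_per_session', group_col='Variant',
                 n_samples=1500, sample_size=100, seed=0):
    """Return the distribution of sample means for each experimental condition."""
    rng = np.random.default_rng(seed)
    rows = []
    for variant, grp in df.groupby(group_col):
        values = grp[metric].to_numpy()
        for _ in range(n_samples):
            # draw a bootstrap sample and record its mean
            draw = rng.choice(values, size=sample_size, replace=True)
            rows.append({group_col: variant, 'sample_mean': draw.mean()})
    return pd.DataFrame(rows)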
def GroupTukeyHSD(self, continuous, categorical):
    mc = MultiComparison(continuous, categorical)
    result = mc.tukeyhsd()
    reject = result.reject
    meandiffs = [round(float(meandiff), 3) for meandiff in result.meandiffs]
    UniqueGroup = mc.groupsunique
    group1 = [UniqueGroup[index] for index in mc.pairindices[0]]
    group2 = [UniqueGroup[index] for index in mc.pairindices[1]]
    columns = ['Group 1', 'Group 2', 'Mean Difference', 'Reject']
    # np.column_stack coerces everything to strings, so the 'Reject' column
    # holds the strings 'True'/'False' rather than booleans
    TukeyResult = pd.DataFrame(np.column_stack((group1, group2, meandiffs, reject)),
                               columns=columns)
    '''
    Once the Tukey HSD test is done, select only the entries with Reject=False,
    i.e. only the pairs whose distributions are similar. Once selected, group
    them into distinct distributions.
    '''
    TukeyResult_false = TukeyResult[TukeyResult['Reject'] == 'False']
    overall_distribution_list = []
    same_distribution_list = []
    if len(TukeyResult_false) > 0:
        for group1 in TukeyResult_false['Group 1'].unique():
            if group1 not in overall_distribution_list:
                temp_list = []
                temp_result = TukeyResult_false[TukeyResult_false['Group 1'] == group1]
                overall_distribution_list.append(group1)
                for entry in list(temp_result['Group 2'].unique()):
                    if entry not in overall_distribution_list:
                        overall_distribution_list.append(entry)
                        temp_list.append(entry)
                temp_list.append(group1)
                same_distribution_list.append(
                    dict(list_name=group1.replace(" ", "_"),
                         lists=temp_list,
                         length=len(temp_list)))
        if len(set(categorical.unique()) - set(overall_distribution_list)) > 0:
            missing_categories = list(set(categorical.unique()) -
                                      set(overall_distribution_list))
            for group1 in missing_categories:
                same_distribution_list.append(
                    dict(list_name=group1.replace(" ", "_"),
                         lists=[group1],
                         length=1))
    else:
        for group1 in categorical.unique():
            same_distribution_list.append(
                dict(list_name=group1.replace(" ", "_"),
                     lists=[group1],
                     length=1))
    g1 = pd.DataFrame(same_distribution_list)
    return g1.sort_values('length', ascending=False)
Spyder Editor

This is a temporary script file.
"""
import numpy as np
import pandas as pd

datafile = "/Users/rachelrigg/Google Drive/Lab/Projects/Hsp70/Flow chamber/Stats/Flow chamber data VER PIF.csv"
data = pd.read_csv(datafile)

from scipy import stats
data.boxplot('y', by='trt', figsize=(8, 6))

import statsmodels.api as sm
from statsmodels.formula.api import ols
mod = ols('y ~ trt', data=data).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

from statsmodels.stats.multicomp import MultiComparison
mc = MultiComparison(data['y'], data['trt'])
result = mc.tukeyhsd()
print(result)
print(mc.groupsunique)
def multi_by_position(df, to_plot):
    df = df.dropna(subset=[to_plot, 'position_grouping'])
    mc = MultiComparison(df[to_plot], df['position_grouping'])
    mc_results = mc.tukeyhsd()
    return mc_results
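# A hypothetical usage sketch for the helper above; the DataFrame contents and
# the metric column name 'sprint_speed' are assumptions, not from the original:
import pandas as pd
from statsmodels.stats.multicomp import MultiComparison

df = pd.DataFrame({
    'sprint_speed': [27.1, 26.4, 28.0, 25.9, 27.7, 26.8, 28.3, 26.1],
    'position_grouping': ['IF', 'OF', 'OF', 'IF', 'OF', 'IF', 'OF', 'IF'],
})
print(multi_by_position(df, 'sprint_speed'))  # prints the Tukey HSD summary table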
cortexsummary = rp.summary_cont(data_frame['cortex'].groupby(data_frame['Slide']))
print(cortexsummary)
cerebsummary = rp.summary_cont(data_frame['cerebellum'].groupby(data_frame['Slide']))
print(cerebsummary)

hipresults = ols('hippocampus ~ C(Slide)', data=data_frame).fit()
hip_table = sm.stats.anova_lm(hipresults, typ=2)
cerresults = ols('cerebellum ~ C(Slide)', data=data_frame).fit()
cer_table = sm.stats.anova_lm(cerresults, typ=2)
cortresults = ols('cortex ~ C(Slide)', data=data_frame).fit()
cort_table = sm.stats.anova_lm(cortresults, typ=2)

print('HIPPOCAMPUS')
hipmc = MultiComparison(data_frame['hippocampus'], data_frame['Slide'])
hipmc_results = hipmc.tukeyhsd()
print(hipmc_results)
print('')

print('CEREBELLUM')
cermc = MultiComparison(data_frame['cerebellum'], data_frame['Slide'])
cermc_results = cermc.tukeyhsd()
print(cermc_results)
print('')

print('CORTEX')
cortmc = MultiComparison(data_frame['cortex'], data_frame['Slide'])
cortmc_results = cortmc.tukeyhsd()
print(cortmc_results)
# 4: Day 2, end
data = []
groups = []
for i, session in enumerate(D1):
    for trial in session[:10]:
        # print(trial, i, 1)
        data.append(trial)
        groups.append(1)
    for trial in session[-10:]:
        # print(trial, i, 2)
        data.append(trial)
        groups.append(2)
for i, session in enumerate(D2):
    for trial in session[:10]:
        # print(trial, i, 3)
        data.append(trial)
        groups.append(3)
    for trial in session[-10:]:
        # print(trial, i, 4)
        data.append(trial)
        groups.append(4)

data = np.array(data)
groups = np.array(groups)

from statsmodels.stats.multicomp import MultiComparison
print(MultiComparison(data, groups).tukeyhsd())
for_export.insert(for_export.shape[1], 'Tumour anatomy (full description)',
                  obj.meta.structure_name)
for_export.to_excel(os.path.join(outdir, "ivygap_signature_scores_and_subgroups.xlsx"))

# boxplots showing signature scores in the different niches
bplot = {}
anova_res = {}
tukey_res = {}
for k in genesets:
    the_data = es_z.loc[k]
    bplot[k] = collections.OrderedDict()
    for sg in group_list:
        bplot[k][sg] = the_data.loc[groups.fillna('').str.contains(sg)].values
    anova_res[k] = stats.f_oneway(*bplot[k].values())
    mc = MultiComparison(the_data, groups, group_order=group_list)
    tukey_res[k] = mc.tukeyhsd(alpha=alpha)

    lbl, tmp = zip(*bplot[k].items())
    tmp = [list(t) for t in tmp]
    fig = plt.figure(num=k, figsize=(5, 4))
    ax = fig.add_subplot(111)
    sns.boxplot(data=tmp, orient='v', ax=ax, color='0.5')
    ax.set_xticklabels(lbl, rotation=45)
    ax.set_ylabel("Normalised ssGSEA score")
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, '%s_ssgsea_by_subgroup_tcga.png' % k.lower()), dpi=200)
    fig.savefig(os.path.join(outdir, '%s_ssgsea_by_subgroup_tcga.pdf' % k.lower()))

# can annotate these manually based on statistics?
gtvs = sorted(glob.glob('/run/user/1000/gvfs/smb-share:server=ad,share=fs'
                        '/E210-Projekte/miss-classified/big_gtv/Tumor_*.nrrd'))
mask_dir = '/run/user/1000/gvfs/smb-share:server=ad,share=fs/E210-Projekte/miss-classified/big_gtv_bet/'
gtv_vols_big, distances_big = feat_calc_miss_classified(gtvs, mask_dir)

gtvs = sorted(glob.glob('/run/user/1000/gvfs/smb-share:server=ad,share=fs'
                        '/E210-Projekte/miss-classified/gtv_middle/Tumor_*.nrrd'))
mask_dir = '/run/user/1000/gvfs/smb-share:server=ad,share=fs/E210-Projekte/miss-classified/gtv_middle_bet/'
gtv_vols_mid, distances_mid = feat_calc_miss_classified(gtvs, mask_dir)

gt_als = np.transpose(np.hstack((gtv_vols_mid, gtv_vols_big, gtv_vols_cor)))
ds_als = np.transpose(np.hstack((distances_mid, distances_big, distances_cor)))
labels = np.transpose(np.hstack((['middle'] * len(gtv_vols_mid),
                                 ['big'] * len(gtv_vols_big),
                                 ['corr'] * len(gtv_vols_cor))))

mod_gtv = MultiComparison(gt_als, labels)
print(mod_gtv.tukeyhsd())
mod_ds = MultiComparison(ds_als, labels)
print(mod_ds.tukeyhsd())
print(distances_big)

# gtv_vols, distances = feat_calc_corr_classified()
# print('Correlation between distances and volumes: {}'.format(np.corrcoef(gtv_vols, distances)[0, 1]))
# print('Median distance: {}'.format(np.median(distances)))
# print('Median GTV volume: {}'.format(np.median(gtv_vols)))
# plot.scatter(gtv_vols, distances)
# plot.show()
# print(distances)
import matplotlib.pyplot as plt

for i in ["regular", "death", "emergentjobs", "hardjobs", "suddentasks"]:
    for joblength in [0, 5, 10, 15]:
        print("starting to show results for {} with job length {}".format(
            i, joblength))
        data = np.loadtxt(
            "/home/drew/tmp/jumpNN/{}/{}/allTimeResults.txt".format(
                i, joblength),
            dtype={
                'names': ('mean', 'group'),
                'formats': ('f4', 'S20')
            })
        # http://www.statsmodels.org/stable/_modules/statsmodels/sandbox/stats/multicomp.html#MultiComparison
        mc = MultiComparison(data['mean'], data['group'])
        result = mc.tukeyhsd()

        print(result)
        print(mc.groupsunique)

        a = result.plot_simultaneous()
        # a.title("Mean Wait Time Tukey Test Results for {} with Job Length {}".format(i, joblength))
        a.show()
        input("Press Enter to continue...")  # raw_input in the original Python 2 code

# for i in ["regular", "emergentjobs", "hardjobs", "death", "suddentasks"]:
#     for joblength in [0, 5, 10, 15]:
#         print("starting to show results for {} with job length {}".format(i, joblength))
#         data = np.loadtxt("/home/drew/tmp/jumpNN/{}/{}/allBountyResults.txt".format(i, joblength), dtype={'names': ('mean', 'group'), 'formats': ('f4', 'S20')})
print(train_data2)

# Perform an ANOVA on the 'Sex' column using 'Survived' as the independent variable.
# For the ANOVA we need to create our one-way model:
model = ols('Sex ~ Survived', data=train_data2).fit()
# actually run the ANOVA:
aov_table = sm.stats.anova_lm(model, typ=2)
print("ANOVA 1 results:")
print(aov_table)

# So, we will compare the groups against each other:
print("\nTukey HSD results:")
mc = MultiComparison(train_data2['Sex'], train_data2['Survived'])
result = mc.tukeyhsd()
print(result)
print("There is a correlation between sex and survival: females had a higher chance of surviving.")

# Perform a similar ANOVA on Pclass using 'Survived' as the independent variable.
# For the ANOVA we need to create our one-way model:
model = ols('Pclass ~ Survived', data=train_data2).fit()
# actually run the ANOVA:
aov_table = sm.stats.anova_lm(model, typ=2)
print("ANOVA 2 results:")
fontsize = 20
fig, axes = plt.subplots()
test = sns.violinplot('Coating', param, data=df, ax=axes)
axes.set_ylabel('')
axes.set_xlabel('')
figure = test.get_figure()
figure.savefig(param + '.png', dpi=400)

# Kruskal-Wallis returns the H statistic, not an F value
h, p = stats.kruskal(df[df['Coating'] == 'L2P'][param],
                     df[df['Coating'] == 'N2P'][param],
                     df[df['Coating'] == 'P10'][param])
mc = MultiComparison(df[param], df['Coating'])
result = mc.tukeyhsd()
print('Nonparametric one-way ANOVA (Kruskal-Wallis)')
print('=============')
print('H statistic:', h)
print('P value:', p, '\n')
print(result)
print(mc.groupsunique)

# PCA of dfCoat
X = df.iloc[:, 3:13]  # table of all the values (ignore well and duplicate, cols 1 and 2)
# Brown-Forsythe test
w, p_bf = stats.levene(edg['WPM'], graf['WPM'], uni['WPM'], center='median')
check_p('brown forsythe test', assumption='homogeneity of variance', p_val=p_bf)
# non-significance shows we don't have a violation

# Now that we know our assumptions have not been violated, we can fit the ANOVA.
# This is the omnibus test.
alpha_lm = ols('WPM ~ C(Alphabet)', data=alpha).fit()
logger.info(f'ANOVA summary: \n\n {alpha_lm.summary()}')
# Prob (F-statistic) shows that there is some difference between the different
# Alphabets but does not tell us where the difference is. For that we do the
# pairwise comparisons.

# Tukey comparison followed by Holm adjustment (not sure how to combine the two)
mc = MultiComparison(alpha['WPM'], alpha['Alphabet'])
logger.info(f'tukey comparison2: \n {mc.tukeyhsd()}')
comp = mc.allpairtest(stats.ttest_ind, method='Holm')
logger.info(f'holm corrected version: \n {comp[0]}')

# non-parametric version of one-way ANOVA
chi, p = stats.kruskal(edg['WPM'], graf['WPM'], uni['WPM'])
check_p(descr='Kruskal chi squared test', assumption='', p_val=p)

# Mann-Whitney
mw, p_eg = stats.mannwhitneyu(edg['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat edg vs. graf: {mw}, p value: {p_eg}')
mw, p_ug = stats.mannwhitneyu(uni['WPM'], graf['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat uni vs. graf: {mw}, p value: {p_ug}')
mw, p_ue = stats.mannwhitneyu(uni['WPM'], edg['WPM'], alternative='two-sided')
logger.info(f'mann-whitney stat EC vs. PC: {mw}, p value: {p_ue}')
##
## TUKEY
##
print('\n\n\n#### TUKEY TEST ####')
# arrayegua = []
# for i in range(num_execs): arrayegua.append(("GA", best_ga[i]))
# for i in range(num_execs): arrayegua.append(("ED", best_ed[i]))
# for i in range(num_execs): arrayegua.append(("PSO", best_pso[i]))
# for i in range(num_execs): arrayegua.append(("EP", best_ep[i]))
# for i in range(num_execs): arrayegua.append(("ABC", best_abc[i]))
#
# data_arr = np.rec.array(arrayegua, dtype=[('Algoritmo', '|U5'), ('Fitness', float)])

mc = MultiComparison(data_arr['Fitness'], data_arr['Algoritmo'])
tukey_result = mc.tukeyhsd()

print(tukey_result)
print(mc.groupsunique)

###########################
# bounds = [(-5,5)]*30
# result = differential_evolution(rosen, bounds)
# print(result.x)
# print("\n")
# print(result.fun)
###########################
def run_stats(experiment):
    '''Run independent T-test or one-way ANOVA depending on the number of groups.

    Args:
        experiment (Experiment instance): An instance of the Experiment class.

    Returns:
        A new Pandas data frame with p values, adjusted p values and
        Tukey HSD post-hoc results if there are > 2 groups.
    '''
    groups = experiment.get_groups()
    samples = experiment.get_sampleids()
    df = experiment.df
    all_vals = []

    ## Get values for each group, ready for T-test or ANOVA.
    for group in groups:
        sample_re = re.compile(group + r"_\d+$")
        ids = [sample for sample in samples if sample_re.match(sample)]
        vals = list(map(list, df[ids].values))
        all_vals.append(vals)

    ## Decide whether to use T-test or ANOVA depending on the number of groups.
    if len(groups) == 2:
        p_vals = [ttest_ind(all_vals[0][i], all_vals[1][i])[1]
                  for i in range(len(all_vals[0]))]
    else:
        p_vals = []
        for i in range(len(all_vals[0])):
            row_vals = [all_vals[j][i] for j in range(len(groups))]
            p_val = f_oneway(*row_vals)[1]
            p_vals.append(p_val)

    ## Adjust the p values and create a new data frame with them in.
    p_val_adj = list(multipletests(p_vals, method='fdr_bh')[1])
    new_df = df.iloc[:, :5].copy()  # .ix is removed in modern pandas
    new_df['p_val'] = pd.Series(p_vals, index=new_df.index)
    new_df['p_val_adj'] = pd.Series(p_val_adj, index=new_df.index)

    ## Post-hoc test.
    ## Only do the post-hoc test if there are more than 2 groups, duh!
    if len(groups) > 2:
        vals_df = df[samples]
        group_ids = [sample.split('_')[0] for sample in vals_df.columns.values]
        posthoc_results = {}

        ## Run the post-hoc test on each row.
        for row in range(len(vals_df)):
            row_vals = vals_df.iloc[row]
            mc = MultiComparison(row_vals, group_ids)
            mc_groups = mc.groupsunique
            results = mc.tukeyhsd()
            significant = results.reject
            pairs = list(zip(*[x.tolist() for x in mc.pairindices]))

            ## Go through each pair and add results to the posthoc_results dictionary.
            for i in range(len(pairs)):
                pair = list(pairs[i])
                pair.sort()
                pair_name = str(mc_groups[pair[0]]) + '_' + str(mc_groups[pair[1]])
                if pair_name in posthoc_results:
                    posthoc_results[pair_name].append(significant[i])
                else:
                    posthoc_results[pair_name] = [significant[i]]

        ## Add the post-hoc results to the data frame.
        for pair_name in posthoc_results:
            new_df['significant_' + pair_name] = posthoc_results[pair_name]

    return new_df
#                   order=['Sibs','Tau','Lesion'],
                  )
p.legend_.remove()
# p.set_yticks(np.arange(0.1,0.52,0.04))
sns.despine(trim=True)

condition_s = set(std_plt['condition'].values)
condition_s = list(condition_s)

if len(condition_s) == 2:
    # Paired t-test for 2 conditions.
    # Separate data by condition.
    std_cond1 = std_plt.loc[std_plt['condition'] == condition_s[0]].sort_values(by='excluded_exp')
    std_cond2 = std_plt.loc[std_plt['condition'] == condition_s[1]].sort_values(by='excluded_exp')
    ttest_res, ttest_p = ttest_rel(std_cond1['std(posture)'], std_cond2['std(posture)'])
    print(
        f'* Age {age}: {condition_s[0]} v.s. {condition_s[1]} paired t-test p-value = {ttest_p}'
    )
elif len(condition_s) > 2:
    multi_comp = MultiComparison(
        ang_std_all['std(posture)'],
        ang_std_all['dpf'] + ang_std_all['condition'])
    print(f'* Age {age}:')
    print(multi_comp.tukeyhsd().summary())
else:
    pass

plt.show()
    ( 29, 'medical',  3 ),
    ( 30, 'medical',  1 )],
    dtype=[('idx', '<i4'), ('Treatment', '|S8'), ('StressReduction', '<i4')])

# First, do a one-way ANOVA
df = pd.DataFrame(dta2)
model = ols('StressReduction ~ C(Treatment)', df).fit()
anovaResults = anova_lm(model)
print(anovaResults)
if anovaResults['PR(>F)'][0] < 0.05:
    print('One of the groups is different.')

# Then, do the multiple testing
mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
print(mod.tukeyhsd()[0])

# The following code produces the same printout
res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
# print(res2[0])

# Show the group names
print(mod.groupsunique)

# Generate a plot
import matplotlib.pyplot as plt
plt.plot([0, 1, 2], res2[1][2], 'o')
plt.errorbar([0, 1, 2], res2[1][2],
             yerr=np.abs(res2[1][4].T - res2[1][2]), ls='none')
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
anova_results = anova_lm(anova_reg)
print('\nANOVA results\n', anova_results)

# Check for heteroskedasticity
sm.qqplot(anova_reg.resid, line='s')
plt.show()

######
# Post-hoc tests for one-way ANOVA
# Tukey test - good when groups are the same size and have homogeneous variance
postHoc = pairwise_tukeyhsd(alldata['Fare_Per_Person'], alldata['Embarked'], alpha=0.05)
print(postHoc)

# Pairwise comparison using Bonferroni correction of p-values
mc = MultiComparison(alldata['Fare_Per_Person'], alldata['Embarked'])
# print(mc.allpairtest(stats.ttest_rel, method='Holm')[0])  # for paired t-test
print(mc.allpairtest(stats.ttest_ind, method='b')[0])  # for independent t-test

######
# ANCOVA
# Look for heteroskedasticity
plt.plot(alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1)]['Fare_Per_Person'],
         alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1)]['Group_Size'], 'bo')
plt.show()

# Second class male passengers with a fare price > 0 seem OK.
# There are a couple of group sizes with only 1 observation with these criteria,
# though, so make sure to filter them out too.

# Test for heteroskedasticity
print(levenes_test(alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1) &
                           (alldata['Fare'] > 0) &
                           (alldata['Group_Size'].isin([1, 2, 3, 4, 8, 9, 10, 11]))]['Fare_Per_Person'],
                   alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1) &
                           (alldata['Fare'] > 0) &
                           (alldata['Group_Size'].isin([1, 2, 3, 4, 8, 9, 10, 11]))]['Group_Size']))
print(bartlett_test(alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1) &
                            (alldata['Fare'] > 0) &
                            (alldata['Group_Size'].isin([1, 2, 3, 4, 8, 9, 10, 11]))]['Fare_Per_Person'],
                    alldata[(alldata['Pclass'] == 2) & (alldata['Sex_male'] == 1) &
                            (alldata['Fare'] > 0) &
                            (alldata['Group_Size'].isin([1, 2, 3, 4, 8, 9, 10, 11]))]['Group_Size']))
                      data[data['Game'] == groupNames[1]].Score,
                      data[data['Game'] == groupNames[2]].Score)

# print output and conclusion of the one-way ANOVA between the groups
anovaPrintout = ('One-way ANOVA\n=============\nF-value: ' + str(f) +
                 '\np-value: ' + str(p) + '\n')
anovaConclusion = ('CONCLUSION:\n-> Significant DIFFERENCE found!' if p < 0.05
                   else 'CONCLUSION:\n-> NO significant difference found...')

# append to output file
outputFileName = 'output_cohendadded.txt'
with open(outputFileName, 'a') as fl:
    fl.write('\nData File: ' + fileName + '\n===========================\n' +
             anovaPrintout + anovaConclusion + '\n')

if p < 0.05:
    mc = MultiComparison(data['Score'], data['Game'])
    result = mc.tukeyhsd()
    with open(outputFileName, 'a') as fl:
        fl.write(str(result) + '\n')
        fl.write('\t\tmeandiff = mean(group2) - mean(group1)\n')
        # fl.write('Unique groups:\n')
        # fl.write(str(mc.groupsunique)+'\n')
        fl.write('Significantly higher:\n')
        fl.write(checkWhichSigHigher(str(result), groupNames))

with open(outputFileName, 'a') as fl:
    fl.write('Cohen\'s d effect sizes\n======================\n')

# now get Cohen's d effect sizes:
fileData = formatDataForCohenCalc(fileData)
for i in range(len(groupNames)):
              jitter=True,
              data=final_df[(final_df.Event == 'Cells') |
                            (final_df.Event == treatments[ind])],
              ax=panel)
plt.savefig('respiration.svg')
plt.show()

# statistical analysis
final_df['Slope'] = final_df.Slope.astype(float)  # np.float is removed in recent NumPy

# fit linear model
model = ols('Slope ~ Event + Experiment', data=final_df).fit()
print(model.summary())

print('ANOVA analysis')
aov_table = sm.stats.anova_lm(model, typ=2)  # the keyword is 'typ', not 'type'
print(aov_table)

print('Post hoc Tukey')
mc = MultiComparison(final_df['Slope'], final_df['Event'])
mc_results = mc.tukeyhsd()
print(mc_results.summary())
p_values = psturng(np.abs(mc_results.meandiffs / mc_results.std_pairs),
                   len(mc_results.groupsunique), mc_results.df_total)
print('p_values: ', p_values)
                     virginica['sepal_length']))
print("\nPetal Width:")
print(stats.f_oneway(setosa['petal_width'], versicolor['petal_width'],
                     virginica['petal_width']))
print("\nPetal Length:")
print(stats.f_oneway(setosa['petal_length'], versicolor['petal_length'],
                     virginica['petal_length']))

# Post-hoc test: Tukey's HSD
print("\nPost-hoc test performed to determine which groups are statistically "
      "significantly different from each other")
print("Sepal Width:")
mc1 = MultiComparison(df['sepal_width'], df['variety'])
print(mc1.tukeyhsd())
print()
print("\nSepal Length:")
mc2 = MultiComparison(df['sepal_length'], df['variety'])
print(mc2.tukeyhsd())
print()
print("\nPetal Width:")
mc3 = MultiComparison(df['petal_width'], df['variety'])
print(mc3.tukeyhsd())
print()
print("\nPetal Length:")
mc4 = MultiComparison(df['petal_length'], df['variety'])
print(mc4.tukeyhsd())

# Using Matplotlib to generate histograms of each variable
def main():
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    # Set up the data, as a structured array.
    # The first and last field are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
        (  1, 'mental',   2 ), (  2, 'mental',   2 ), (  3, 'mental',   3 ),
        (  4, 'mental',   4 ), (  5, 'mental',   4 ), (  6, 'mental',   5 ),
        (  7, 'mental',   3 ), (  8, 'mental',   4 ), (  9, 'mental',   4 ),
        ( 10, 'mental',   4 ), ( 11, 'physical', 4 ), ( 12, 'physical', 4 ),
        ( 13, 'physical', 3 ), ( 14, 'physical', 5 ), ( 15, 'physical', 4 ),
        ( 16, 'physical', 1 ), ( 17, 'physical', 1 ), ( 18, 'physical', 2 ),
        ( 19, 'physical', 3 ), ( 20, 'physical', 3 ), ( 21, 'medical',  1 ),
        ( 22, 'medical',  2 ), ( 23, 'medical',  2 ), ( 24, 'medical',  2 ),
        ( 25, 'medical',  3 ), ( 26, 'medical',  2 ), ( 27, 'medical',  3 ),
        ( 28, 'medical',  1 ), ( 29, 'medical',  3 ), ( 30, 'medical',  1 )],
        dtype=[('idx', '<i4'), ('Treatment', '|S8'), ('StressReduction', '<i4')])

    # First, do a one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')

    # Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print(mod.tukeyhsd().summary())

    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    # print(res2[0])

    # Show the group names
    print(mod.groupsunique)

    # Generate a plot
    import matplotlib.pyplot as plt
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    # plt.errorbar(xvals, res2.meandiffs, yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
    errors = np.ravel(np.diff(res2.confint) / 2)
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, ls='none')
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig(outFile, dpi=200)
    print('Figure written to {0}'.format(outFile))
    plt.show()

    # Instead of Tukey's test, we can do pairwise t-tests.
    # First, with the "Holm" correction:
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print(rtp[0])

    # and then with the Bonferroni correction:
    print(mod.allpairtest(stats.ttest_rel, method='b')[0])

    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to use a few
    # tricks (http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html):
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance

    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided

    # Now with the Bonferroni correction
    from statsmodels.stats.multitest import multipletests
    res_b = multipletests(my_pvalues, method='b')

    return res2.variance
def setup_class_(self):
    self.mc = MultiComparison(self.endog, self.groups)
    self.res = self.mc.tukeyhsd(alpha=self.alpha)
def setup_class_(cls):
    cls.mc = MultiComparison(cls.endog, cls.groups)
    cls.res = cls.mc.tukeyhsd(alpha=cls.alpha)
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison

tratamento = pd.read_csv('database/anova.csv', sep=';')  # data to be used

# create and fit the model ('dependent ~ independent')
modelo1 = ols('Horas ~ Remedio', data=tratamento).fit()
resul1 = sm.stats.anova_lm(modelo1)  # evaluate with ANOVA

# create and fit the model ('dependent ~ independent * independent')
modelo2 = ols('Horas ~ Remedio * Sexo', data=tratamento).fit()
resul2 = sm.stats.anova_lm(modelo2)  # evaluate with ANOVA

# pairwise comparison between the two variables
mc = MultiComparison(tratamento['Horas'], tratamento['Remedio'])
resul_test = mc.tukeyhsd()  # run the Tukey HSD test

grafico = resul_test.plot_simultaneous()  # dedicated plot for the results
# plt.show()
ms_8 = [36, 39, 39, 38, 44, 42, 48, 38, 46, 37, 46]
ms_9 = [35, 29, 39, 37, 40, 36, 43, 48, 41, 44, 42]

data = np.rec.array([
    (47, 'ms_1'), (47, 'ms_1'), (49, 'ms_1'), (45, 'ms_1'), (42, 'ms_1'),
    (43, 'ms_1'), (39, 'ms_1'), (48, 'ms_1'), (43, 'ms_1'),
    (41, 'ms_2'), (45, 'ms_2'), (39, 'ms_2'), (48, 'ms_2'), (39, 'ms_2'),
    (37, 'ms_2'), (42, 'ms_2'), (47, 'ms_2'), (44, 'ms_2'), (42, 'ms_2'),
    (47, 'ms_3'), (42, 'ms_3'), (45, 'ms_3'), (43, 'ms_3'), (47, 'ms_3'),
    (37, 'ms_3'), (43, 'ms_3'), (47, 'ms_3'), (41, 'ms_3'), (39, 'ms_3'),
    (38, 'ms_4'), (38, 'ms_4'), (37, 'ms_4'), (45, 'ms_4'), (41, 'ms_4'),
    (41, 'ms_4'), (41, 'ms_4'), (46, 'ms_4'), (53, 'ms_4'), (45, 'ms_4'),
    (44, 'ms_4'),
    (47, 'ms_5'), (47, 'ms_5'), (49, 'ms_5'), (44, 'ms_5'), (49, 'ms_5'),
    (46, 'ms_5'), (41, 'ms_5'), (41, 'ms_5'), (42, 'ms_5'), (46, 'ms_5'),
    (41, 'ms_6'), (41, 'ms_6'), (45, 'ms_6'), (38, 'ms_6'), (42, 'ms_6'),
    (33, 'ms_6'), (45, 'ms_6'), (43, 'ms_6'), (44, 'ms_6'), (44, 'ms_6'),
    (46, 'ms_6'),
    (35, 'ms_7'), (36, 'ms_7'), (44, 'ms_7'), (32, 'ms_7'), (40, 'ms_7'),
    (41, 'ms_7'), (43, 'ms_7'), (45, 'ms_7'), (48, 'ms_7'), (48, 'ms_7'),
    (36, 'ms_8'), (39, 'ms_8'), (39, 'ms_8'), (38, 'ms_8'), (44, 'ms_8'),
    (42, 'ms_8'), (48, 'ms_8'), (38, 'ms_8'), (46, 'ms_8'), (37, 'ms_8'),
    (46, 'ms_8'),
    (35, 'ms_9'), (29, 'ms_9'), (39, 'ms_9'), (37, 'ms_9'), (40, 'ms_9'),
    (36, 'ms_9'), (43, 'ms_9'), (48, 'ms_9'), (41, 'ms_9'), (44, 'ms_9'),
    (42, 'ms_9')],
    dtype=[('score', '<i4'), ('student', '|S8')])

print("Results from Levene's test, dealing with homogeneity of variance:")
print(stats.levene(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))
print("Results from one-way ANOVA:")
print(stats.f_oneway(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))
print("Results of the Kruskal-Wallis test:")
print(stats.mstats.kruskalwallis(ms_1, ms_2, ms_3, ms_4, ms_5, ms_6, ms_7, ms_8, ms_9))

mc = MultiComparison(data['score'], data['student'])
results = mc.tukeyhsd(alpha=0.1)
print(results)
sigfeats_out.to_csv(sigfeats_outpath, header=False)

if TukeyHSD:
    # Tally the total number of significantly different pairwise comparisons
    n_sigdiff_pairwise_beforeBF = 0
    n_sigdiff_pairwise_afterBF = 0

    # Tukey HSD post-hoc pairwise differences between dates for each feature
    for feature in feature_colnames:
        # Tukey HSD post-hoc analysis (no Bonferroni correction!)
        tukeyHSD = pairwise_tukeyhsd(OP50_control_df[feature],
                                     OP50_control_df['date_yyyymmdd'])
        n_sigdiff_pairwise_beforeBF += sum(tukeyHSD.reject)

        # Tukey HSD post-hoc analysis (Bonferroni correction)
        tukeyHSD_BF = MultiComparison(OP50_control_df[feature],
                                      OP50_control_df['date_yyyymmdd'])
        n_sigdiff_pairwise_afterBF += sum(tukeyHSD_BF.tukeyhsd().reject)

    total_comparisons = len(feature_colnames) * 6
    reject_H0_percentage = n_sigdiff_pairwise_afterBF / total_comparisons * 100
    print("%d / %d (%.1f%%) of pairwise-comparisons of imaging dates (%d features) show significant variation for OP50 control (TukeyHSD)" %
          (n_sigdiff_pairwise_afterBF, total_comparisons,
           reject_H0_percentage, len(feature_colnames)))

    # TODO: Reverse-engineer p-values using mean/std
    # from statsmodels.stats.libqsturng import psturng
    # # studentized range statistic
    # rs = res2[1][2] / res2[1][3]
    # pvalues = psturng(np.abs(rs), 3, 27)

    # Mantel test?
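# A possible working version of the TODO above, sketched from the psturng usage
# elsewhere in this collection. `thsd` is a hypothetical result of a tukeyhsd()
# call on a single feature; the attributes used are real TukeyHSDResults fields:
import numpy as np
from statsmodels.stats.libqsturng import psturng

thsd = MultiComparison(OP50_control_df[feature_colnames[0]],
                       OP50_control_df['date_yyyymmdd']).tukeyhsd()
rs = thsd.meandiffs / thsd.std_pairs  # studentized range statistic per pair
pvalues = psturng(np.abs(rs), len(thsd.groupsunique), thsd.df_total)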
print('F value:', f)
print('P value:', p, '\n')

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import itertools

headers = ['P-PG', 'P-TD', 'P-RMSPROP', 'NEAT',
           'NEAT-EM-P-PG', 'NEAT-EM-P-TD', 'NEAT-EM-P-RMSPROP']
group_names = []
for header in headers:
    group_names += list(itertools.repeat(header, min_length))

mc = MultiComparison(np.asarray(stripped_groups).flatten(), group_names)
result = mc.tukeyhsd()

from statsmodels.stats.libqsturng import psturng

print(result)
print(mc.groupsunique)
print(psturng(np.abs(result.meandiffs / result.std_pairs),
              len(result.groupsunique), result.df_total))


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
print('Accuracy')
path_to_data = 'D:/Sasha/subversion/trunk/AuthorshipAttributionRussianTexts/results/statistical_tests_data.json'
with codecs.open(path_to_data, 'r') as f:
    data = f.read()
data = json.loads(data)

for k in data:
    values = {}
    mesure = data[k]
    values[k] = {}
    values[k]['SGD'] = [m['value'] for m in mesure if m['key'] == 'SGD']
    values[k]['LSV'] = [m['value'] for m in mesure if m['key'] == 'LSV']
    values[k]['PA'] = [m['value'] for m in mesure if m['key'] == 'PA']
    values[k]['COMP'] = [m['value'] for m in mesure if m['key'] == 'COMP']

    f, p = stats.f_oneway(values[k]['SGD'], values[k]['LSV'],
                          values[k]['PA'], values[k]['COMP'])
    print('One-way ANOVA')
    print('=============')
    print('F value:', f)
    print('P value:', p, '\n')

    mc = MultiComparison([int(m['value']) for m in mesure],
                         [m['key'] for m in mesure])
    result = mc.tukeyhsd()
    print(result)
    print(mc.groupsunique)
"The standard deviation of the CUPED-adjusted metric is % s." % round(std_CUPED, 4)) print( "The relative reduction in standard deviation was % s" % round(relative_diff * 100, 5), "%", ) # As you can see, we have managed to reduce the relative degree of variance (as measured by standard deviation) by ~4%; now we can perform statistical analysis on our newly computed metric, in order to determine whether there was a statistical effect or not: # ## Significance testing (post CUPED-Analysis) # In[10]: mc = MultiComparison(df['CUPED-adjusted_metric'], df['Variant']) mc_results = mc.tukeyhsd() null_hypothesis = mc_results.reject df_grouped = df[['Variant', 'CUPED-adjusted_metric']] Control_Matrix = df_grouped[(df_grouped['Variant'] == 'Control_Matrix')] Variant_BT = df_grouped[(df_grouped['Variant'] == 'Variant_BT')] Mean_control = round(statistics.mean(Control_Matrix['CUPED-adjusted_metric']), 4) Mean_variant = round(statistics.mean(Variant_BT['CUPED-adjusted_metric']), 4) print(mc_results) print( "The mean of the Control (Matrix) group is:", round(statistics.mean(Control_Matrix['CUPED-adjusted_metric']), 4), ) print(
def position_stats(df, name_mapping=None):
    # print('### position stats')
    from statsmodels.stats.weightstats import ztest
    from functools import partial, wraps  # functools32 only backports lru_cache

    POS = df.position.unique()
    POS.sort()

    model = 'value ~ group'
    allpvals = None
    header = None
    DF = None

    ttest_log_wrap = wraps(partial(ttest_ind_log, equal_var=False))(ttest_ind_log)
    ttest_ind_nev = wraps(partial(stats.ttest_ind, equal_var=False))(stats.ttest_ind)
    mwu_test = wraps(partial(stats.mannwhitneyu, use_continuity=False))(stats.mannwhitneyu)

    bootstrap_sample_num = 1000

    stats_test = ttest_ind_nev
    GROUPS = df.group.unique()
    # GROUPS = [0, 3]

    for pos in POS:
        data = df[df.position == pos]
        data = data.groupby(['sid']).mean()
        data = resample_data(data, num_sample_per_pos=BOOTSTRAP_NUM)
        # data = df[(df.group == 0) | (df.group == 3)]

        # cross = smf.ols(model, data=data).fit()
        # anova = sm.stats.anova_lm(cross, type=1)

        mcp = MultiComparison(data.value, data.group.astype(int))
        rtp = mcp.allpairtest(stats_test, method='bonf')

        mheader = []
        for itest in rtp[2]:
            name1 = itest[0]
            name2 = itest[1]
            if name_mapping is not None:
                name1 = name_mapping[str(name1)]
                name2 = name_mapping[str(name2)]
            mheader.append("{} - {}".format(name1, name2))

        if not header or len(mheader) > len(header):
            header = mheader

        # get the uncorrected pvals
        pvals = rtp[1][0][:, 1]

        ndf = pd.DataFrame(data=[pvals], columns=mheader)
        if allpvals is None:
            allpvals = ndf
        else:
            allpvals = pd.concat([allpvals, ndf])

    flatten = allpvals.values.ravel()
    flatten = flatten * 2
    mcpres = multipletests(flatten, alpha=0.05, method='bonf')
    corr_pvals = np.array(mcpres[1])
    corr_pvals = np.reshape(corr_pvals, (len(POS), -1))

    data = pd.DataFrame(data=corr_pvals, columns=header)
    data = data[data.columns[:3]]
    return data
def ttp_histogram():
    geometries = ("G1", "G3", "G7", "GX")
    ttp = []
    fluomax = []
    for i, pattern in enumerate(geometries):
        ttp.append([])
        fluomax.append([])
        for infile in glob.glob("data/raw/U1_{}_*.h5".format(pattern)):
            data = filereader.Simulation(infile)
            ttp[i].append(data.ttp)
            fluomax[i].append(data.fluomax)
        ttp[i] = np.array(ttp[i])
        fluomax[i] = np.array(fluomax[i])

    for i in range(4):
        ttp[i] = ttp[i][fluomax[i] > 0.4]

    nr_sparks = [len(data) for data in ttp]
    nr_runs = [len(data) for data in fluomax]

    print("  Case | Runs | Sparks | Fidelity")
    fidstring = "{:>6} | {:>4} | {:>6} | {:>7.1%}"
    print("-----------------------------------")
    for geom, sparks, runs in zip(geometries, nr_sparks, nr_runs):
        print(fidstring.format(geom, runs, sparks, sparks / float(runs)))
    print("-----------------------------------\n")

    print("  Case | Mean TTP ± std. err [ms]")
    print("-----------------------------------")
    for geom, data, n in zip(geometries, ttp, nr_sparks):
        print("{:>6} | {:.3f} ± {:.3f}".format(geom, np.mean(data),
                                               np.std(data) / np.sqrt(n)))
    print("-----------------------------------\n")

    bins = np.arange(0, 25, 2.5)
    weights = [np.ones(len(t)) / len(t) for t in ttp]

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_axes((0.2, 0.2, 0.7, 0.7))
    color = ('blue', (1, 0.28, 0.1), (0.9, 0.18, 0), (0.7, 0.14, 0))
    ax.hist(ttp, bins, weights=weights, histtype='bar', color=color)
    ax.set_xticks((5, 10, 15, 20, 25))
    ax.set_xlabel('Time to Peak [ms]')
    ax.set_ylabel('Fraction')
    plotutil.simpleax(ax)

    ax = fig.add_axes((0.6, 0.6, 0.3, 0.3))
    for i, t in enumerate(ttp):
        ax.bar(i + 1, np.mean(t), yerr=np.std(t) / np.sqrt(len(t)),
               color=color[i], ecolor='black')
    ax.axis((0.5, 5.3, 0, 12))
    ax.set_ylabel('TTP [ms]')
    ax.set_xticks(())
    ax.set_yticks((0, 4, 8, 12))

    print(scipy.stats.f_oneway(*ttp))

    grouplabels = []
    grouplabels.extend(['1'] * nr_sparks[0])
    grouplabels.extend(['3'] * nr_sparks[1])
    grouplabels.extend(['7'] * nr_sparks[2])
    grouplabels.extend(['X'] * nr_sparks[3])
    endog = []
    endog.extend(ttp[0])
    endog.extend(ttp[1])
    endog.extend(ttp[2])
    endog.extend(ttp[3])

    mc = MultiComparison(np.array(endog), np.array(grouplabels))
    result = mc.tukeyhsd(alpha=0.05)
    print(result)
    print(mc.groupsunique)
                   spectraTransform[np.where(dominant == listDominant[10])[0], w],
                   spectraTransform[np.where(dominant == listDominant[11])[0], w],
                   spectraTransform[np.where(dominant == listDominant[12])[0], w],
                   spectraTransform[np.where(dominant == listDominant[13])[0], w],
                   spectraTransform[np.where(dominant == listDominant[14])[0], w],
                   spectraTransform[np.where(dominant == listDominant[15])[0], w],
                   spectraTransform[np.where(dominant == listDominant[16])[0], w],
                   spectraTransform[np.where(dominant == listDominant[17])[0], w],
                   spectraTransform[np.where(dominant == listDominant[18])[0], w],
                   spectraTransform[np.where(dominant == listDominant[19])[0], w],
                   spectraTransform[np.where(dominant == listDominant[20])[0], w],
                   spectraTransform[np.where(dominant == listDominant[21])[0], w],
                   spectraTransform[np.where(dominant == listDominant[22])[0], w])

# If the ANOVA returns a p-value < 0.05, do a multicomparison to figure out
# which samples are different
if anovaResults[w, 1] < 0.05:
    # http://statsmodels.sourceforge.net/0.6.0/_modules/statsmodels/stats/multicomp.html
    mc = MultiComparison(spectraTransform[:, w], dominant)
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.MultiComparison.tukeyhsd.html
    result = mc.tukeyhsd()
    inResults = np.array([mc.groupsunique[mc.pairindices[0]],
                          mc.groupsunique[mc.pairindices[1]],
                          result.meandiffs,
                          result.confint[:, 0],
                          result.confint[:, 1],
                          result.std_pairs,
                          result.reject]).T
    inResults = np.column_stack((np.repeat(wavelengths[w], len(result.reject)),
                                 inResults))
    tukeyResults = np.vstack((tukeyResults, inResults))

# Set up csv file to output statistical results
# (Python 2's file() replaced with open(); note 'wb' is write mode, not append)
outStats = open(outLocation + dateTag + '_statistical_analysis.csv', 'wb')
row1 = np.hstack(('normal distribution p value for original spectra', normalStats))
row2 = np.hstack(('kurtosis p value for original spectra', kurtosisStats))
row3 = np.hstack(('skew p value for original spectra', skewStats))
row4 = np.hstack(('normal distribution p value for transformed spectra', normalTransformStats))
row5 = np.hstack(('kurtosis p value for transformed spectra', kurtosisTransformStats))
row6 = np.hstack(('skew p value for transformed spectra', skewTransformStats))
row7 = np.hstack(('anova results for transformed spectra', anovaResults[:, 1]))
inRows = np.vstack((row1, row2, row3, row4, row5, row6, row7))
                    ('Pat', 9), ('Pat', 4),
                    ('Jack', 4), ('Jack', 8), ('Jack', 7),
                    ('Jack', 5), ('Jack', 1), ('Jack', 5),
                    ('Alex', 9), ('Alex', 8), ('Alex', 8),
                    ('Alex', 10), ('Alex', 5), ('Alex', 10)],
                   dtype=[('Archer', '|U5'), ('Score', '<i8')])

f, p = stats.f_oneway(data[data['Archer'] == 'Pat'].Score,
                      data[data['Archer'] == 'Jack'].Score,
                      data[data['Archer'] == 'Alex'].Score)

print('One-way ANOVA')
print('=============')
print('F value:', f)
print('P value:', p, '\n')

mc = MultiComparison(data['Score'], data['Archer'])
result = mc.tukeyhsd()
print(result)
print(mc.groupsunique)