def compute_anova_rev_restrict_t(topdir: str, m: int):
    """Print a two-way ANOVA and Tukey HSD tables for REV by tolerance and neighbors.

    One case table is loaded per (neighbors, tolerance) combination from
    ``<topdir>/nn_<tol>_<n>`` through ``ac.compute_stored_runs``. The tables are
    concatenated, rows with ``TOL == 1.0`` are excluded, and an OLS model with
    both categorical factors and their interaction is fitted. The OLS summary,
    the type-2 ANOVA table and the Tukey HSD tables (grouped by ``TOL`` and by
    ``NNN``) are printed; nothing is returned.

    Parameters
    ----------
    topdir : str
        Directory that contains one sub-directory per case.
    m : int
        Forwarded unchanged to ``ac.compute_stored_runs``.
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']

    dfs = []
    for n in neighbors:
        for tol in tolerances:
            casedir = topdir + '/' + 'nn' + '_' + tol + '_' + n
            casetable = ac.compute_stored_runs(casedir, m, None)
            # Scalar assignment broadcasts to every row, so the table no
            # longer has to contain exactly five runs.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            dfs.append(casetable)

    dfa = pd.concat(dfs).reset_index(drop=True)
    df = dfa[dfa['TOL'] != 1.0]

    # Perform a regression with the data
    results = ols('REV ~ C(TOL) + C(NNN) + C(TOL):C(NNN)', data=df).fit()
    print(results.summary())
    print('\n\n\n')

    aov_table = sm.stats.anova_lm(results, typ=2)
    print(aov_table)
    print('\n\n\n')

    # Post-hoc pairwise comparisons, one factor at a time
    for factor in ('TOL', 'NNN'):
        print(MultiComparison(df['REV'], df[factor]).tukeyhsd())
def test_incorrect_output(self):
    """Check MultiComparison input validation and the ``group_order`` subset path."""

    def assert_same_attributes(first, second, names):
        # Every listed attribute must agree between the two objects.
        for att in names:
            assert_allclose(getattr(first, att), getattr(second, att),
                            rtol=1e-14, err_msg=att + 'failed')

    # too few groups, too many groups, just one group
    for bad_groups in ([1, 2] * 4, [1, 2] * 6, [1] * 10):
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                      bad_groups)

    # group_order doesn't select all observations, only one group left
    assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                  [1, 2] * 5, group_order=[1])

    # group_order doesn't select all observations,
    # we do tukey_hsd with reduced set of observations
    data = np.arange(15)
    groups = np.repeat([1, 2, 3], 5)

    mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
    res1 = mod1.tukeyhsd(alpha=0.01)
    mod2 = MultiComparison(np.array(data[:10]), groups[:10])
    res2 = mod2.tukeyhsd(alpha=0.01)

    assert_same_attributes(res1, res2, [
        'confint', 'data', 'df_total', 'groups', 'groupsunique',
        'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs', 'variance',
    ])
    assert_same_attributes(mod1, mod2, [
        'data', 'datali', 'groupintlab', 'groups', 'groupsunique',
        'ngroups', 'nobs', 'pairindices',
    ])
def ANOVA_TimePoints(self, combine_sexes=True):
    """Run a one-way ANOVA and Tukey HSD on ``Count`` by ``Treatment`` per time point.

    Hardcoded to use count as the parameter and treatment as the grouping
    identifier, because those names are embedded in the model formula string.

    Parameters
    ----------
    combine_sexes : bool
        When truthy, every row of a time point is analysed together. When
        falsy, the time points are analysed first for rows whose treatment is
        in ``self.male_treatment_labels`` and then for rows whose treatment is
        in ``self.female_treatment_labels``.

    Returns
    -------
    tuple of (list, list)
        The fitted OLS results and the Tukey HSD results, in the order they
        were computed. Both modes return this pair; previously the combined
        mode built the lists but fell through without returning them.
    """
    summary_to_return = []
    mc_results_to_return = []

    def analyse_all_timepoints(treatment_labels=None):
        # Fit, print and record one ANOVA + Tukey HSD per time point,
        # optionally restricted to the given treatment labels.
        for timepoint in self.pr_columns:
            print("Time point: " + timepoint)
            selected = self.df_dropna['Time Point'] == timepoint
            if treatment_labels is not None:
                selected = selected & self.df_dropna['Treatment'].isin(
                    treatment_labels)
            df_timepoint = self.df_dropna.loc[selected]

            results = ols('Count ~ C(Treatment)', data=df_timepoint).fit()
            print(results.summary())

            mc = MultiComparison(df_timepoint['Count'],
                                 df_timepoint['Treatment'])
            mc_results = mc.tukeyhsd()
            print(mc_results)

            summary_to_return.append(results)
            mc_results_to_return.append(mc_results)

    if combine_sexes:
        analyse_all_timepoints()
    else:
        # Males first, then females, appended to the same result lists.
        analyse_all_timepoints(self.male_treatment_labels)
        analyse_all_timepoints(self.female_treatment_labels)

    return summary_to_return, mc_results_to_return
def test_incorrect_output(self):
    """Check MultiComparison validation, the ``group_order`` warning and subset results."""

    def assert_same_attributes(first, second, names):
        # Every listed attribute must agree between the two objects.
        for att in names:
            assert_allclose(getattr(first, att), getattr(second, att),
                            rtol=1e-14, err_msg=att + 'failed')

    # too few groups, too many groups, just one group
    for bad_groups in ([1, 2] * 4, [1, 2] * 6, [1] * 10):
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                      bad_groups)

    # group_order doesn't select all observations, only one group left
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                      [1, 2] * 5, group_order=[1])

    # group_order doesn't select all observations,
    # we do tukey_hsd with reduced set of observations
    data = np.arange(15)
    groups = np.repeat([1, 2, 3], 5)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
        # Exactly one UserWarning is expected for the dropped observations.
        assert_equal(len(w), 1)
        assert issubclass(w[0].category, UserWarning)

    res1 = mod1.tukeyhsd(alpha=0.01)
    mod2 = MultiComparison(np.array(data[:10]), groups[:10])
    res2 = mod2.tukeyhsd(alpha=0.01)

    assert_same_attributes(res1, res2, [
        'confint', 'data', 'df_total', 'groups', 'groupsunique',
        'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs', 'variance',
    ])
    assert_same_attributes(mod1, mod2, [
        'data', 'datali', 'groupintlab', 'groups', 'groupsunique',
        'ngroups', 'nobs', 'pairindices',
    ])
def tukey(structure, alpha, valutation):
    """Run Tukey HSD for one evaluation measure, then plot and save the result.

    The data and group labels come from ``make_datagroup(structure,
    valutation)``. The simultaneous confidence-interval plot is saved as a
    PNG and the textual result as a TXT file under
    ``path + "results/run/plot/"``; the result is also printed.

    Parameters
    ----------
    structure
        Passed through to ``make_datagroup``.
    alpha
        Significance level given to ``tukeyhsd``.
    valutation : str
        ``'ap'`` or ``'p_10'``; any other value is treated as R-precision.
    """
    # x-axis label and file-name suffix for each measure
    if valutation == 'ap':
        xlabel, suffix = "Average Precision (AP)", "ap"
    elif valutation == 'p_10':
        xlabel, suffix = "P(10)", "p10"
    else:
        xlabel, suffix = "Rprec", "rprec"

    # Tukey calculation pairwise and multiple comparisons
    data, group = make_datagroup(structure, valutation)
    mc = MultiComparison(data, group)
    result = mc.tukeyhsd(alpha)

    # Plot group confidence intervals
    fig = result.plot_simultaneous()
    fig.set_figwidth(30)
    fig.set_figheight(20)
    axes = fig.gca()
    fig.suptitle('Tukey_HSD test', fontsize=40)
    axes.set_xlabel(xlabel, fontsize=30)
    axes.tick_params(labelsize=30)
    fileplot = path + "results/run/plot/Tukey_HSD_test_" + suffix + ".png"
    fig.savefig(fileplot, dpi=300)

    # The context manager closes the report even if writing fails.
    with open(path + "results/run/plot/tukey_HSD_" + suffix + ".txt", "w") as fw:
        fw.write(str(result))
    print(result)
def tukey_test(self):
    """Apply the Tukey multi-comparison test to ``self.df`` by rows and by columns.

    See https://en.wikipedia.org/wiki/Tukey%27s_range_test for background.
    The test assumes normally distributed data. Each summary is printed and
    written to ``self.results``.

    Returns
    -------
    tuple
        ``(row_summary, column_summary)``.
    """
    # Rows as groups
    row_comparison = MultiComparison(self.df.values, self.df.index)
    tukey = row_comparison.tukeyhsd().summary()
    row_report = "\nTukey test for rows\n" + str(tukey) + "\n"
    print(row_report)
    self.results.write(row_report)

    # Columns as groups
    column_comparison = MultiComparison(self.df.T.values, self.df.columns)
    tukey2 = column_comparison.tukeyhsd().summary()
    column_report = "\nTukey test for columns\n" + str(tukey2) + "\n"
    print(column_report)
    self.results.write(column_report)

    return (tukey, tukey2)
def tukeyhsd(statistics_table: pandas.DataFrame,
             column: str) -> Dict[str, TukeyHSDResults]:
    """
        Performs Tukey multiple-comparison statistics.
    Parameters
    ----------
    statistics_table:
        Table with 'plate', 'strain' and 'condition' columns plus the value
        column. A 'condition:strain' column is added to it in place.
    column:
        The column with the relevant values. Should be identical to the `y`
        variable used when generating figures.
    Returns
    -------
    Tukey HSD results keyed by subject ('plate', 'strain', 'condition' when
    more than one condition is present) and by 'condition_strain'.
    """
    subjects = ['plate', 'strain']
    if statistics_table['condition'].nunique() != 1:
        # More than one condition: compare across conditions as well.
        subjects.append('condition')

    tukey_results = dict()
    for subject in subjects:
        logger.debug(f"tukey subject: '{subject}'")
        logger.debug(
            f"tukey subject values: {statistics_table[subject].unique()}")
        # Subjects with only two (or fewer) categories are skipped.
        if statistics_table[subject].nunique() <= 2:
            continue
        comparison = MultiComparison(statistics_table[column],
                                     statistics_table[subject])
        tukey_results[subject] = comparison.tukeyhsd()

    combined_label = statistics_table['condition'] + "-" + statistics_table['strain']
    statistics_table['condition:strain'] = combined_label
    combined = MultiComparison(statistics_table[column],
                               statistics_table['condition:strain'])
    tukey_results['condition_strain'] = combined.tukeyhsd()
    return tukey_results
def get_tukey(exp, df_all, measure):
    """Run a Tukey HSD post-hoc test across strains and save the pairwise p-values.

    The test runs only when ``df_all`` holds at least three strains; otherwise
    an empty table with the same columns is produced. Rows whose ``measure``
    is not finite are excluded. Every pairwise comparison reported by
    statsmodels is included, so more than three strains are handled too.

    References:
      https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
      https://code.google.com/archive/p/qsturng-py/
      https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    Parameters
    ----------
    exp
        Object providing ``name`` (used in the file name) and ``dir_tukey``
        (passed to ``pfiles.save_csv``).
    df_all
        Table with a 'strain' column and the ``measure`` column.
    measure : str
        Name of the column to compare.

    Returns
    -------
    DataFrame with columns 'group1', 'group2' and 'p_value' (rounded to 4
    decimals).
    """
    if len(df_all.groupby('strain').count()) >= 3:
        df_tukey = df_all[np.isfinite(df_all[measure])]
        mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
        result = mc.tukeyhsd()
        # p-values from the studentized range distribution
        p = psturng(np.abs(result.meandiffs / result.std_pairs),
                    len(result.groupsunique), result.df_total)
        # Row 0 of the results table is the header; each later row is one pair.
        pair_rows = list(result._results_table)[1:]
        df_pairs = pd.DataFrame({
            'group1': [row[0] for row in pair_rows],
            'group2': [row[1] for row in pair_rows],
            'p_value': list(np.around(np.atleast_1d(p), 4)),
        })
    else:
        df_pairs = pd.DataFrame({'group1': [], 'group2': [], 'p_value': []})

    file_out = exp.name + '_coupling_' + measure + '_' + '_tukey_' + '.csv'
    pfiles.save_csv(df_pairs, file_out, exp.dir_tukey, False)
    return df_pairs
def scipy_anova_post_hoc_tests(df=None,
                               flight_status_col='flight status new',
                               sig_test=stats.f_oneway):
    """
    Compare Pre-/Mid-/Post-Flight 'count per cell' for each aberration type.

    df should be melted by aberration type. For every aberration the
    significance test is printed; when its p-value is <= 0.05 a Tukey HSD
    table and the pairwise p-values are printed as well.
    """
    phases = ('Pre-Flight', 'Mid-Flight', 'Post-Flight')

    # loop through aberrations & perform anovas between pre/mid/post
    for aberr in list(df['aberration type'].unique()):
        of_this_type = df[df['aberration type'] == aberr]
        samples = [
            of_this_type[of_this_type[flight_status_col] == phase]['count per cell']
            for phase in phases
        ]
        statistic, p_value = sig_test(*samples)
        print(aberr, p_value)

        # only continue to post-hoc tests when the anova detects a difference
        if not p_value <= 0.05:
            continue

        mc = MultiComparison(of_this_type['count per cell'],
                             of_this_type[flight_status_col])
        res = mc.tukeyhsd()
        print(res)
        pair_pvalues = list(psturng(np.abs(res.meandiffs / res.std_pairs),
                                    len(res.groupsunique), res.df_total))
        print(f'pvalues: {pair_pvalues}')
        print('\n')
def get_tukey(df_groups, measure, groups):
    """Run a Tukey HSD post-hoc test across strains and flag significance.

    The table at ``df_groups[3]`` is used; rows whose ``measure`` is not
    finite are excluded. Every pairwise comparison reported by statsmodels is
    included rather than only the first three.

    References:
      https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
      https://code.google.com/archive/p/qsturng-py/
      https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    Parameters
    ----------
    df_groups
        Indexable collection; element 3 must have 'strain' and ``measure``
        columns.
    measure : str
        Name of the column to compare.
    groups
        Unused; kept so existing callers keep working.

    Returns
    -------
    DataFrame with columns 'group1', 'group2', 'p_value' (rounded to 4
    decimals) and 'significance' (from ``get_stars``).
    """
    source = df_groups[3]
    df_tukey = source[np.isfinite(source[measure])]
    mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
    result = mc.tukeyhsd()
    # p-values from the studentized range distribution
    p = psturng(np.abs(result.meandiffs / result.std_pairs),
                len(result.groupsunique), result.df_total)

    # Row 0 of the results table is the header; each later row is one pair.
    pair_rows = list(result._results_table)[1:]
    p_values = list(np.around(np.atleast_1d(p), 4))
    df_pairs = pd.DataFrame({
        'group1': [row[0] for row in pair_rows],
        'group2': [row[1] for row in pair_rows],
        'p_value': p_values,
    })
    df_pairs['significance'] = [get_stars(value) for value in p_values]
    return df_pairs
def anova_all(type, file_output, *datas):
    """Append a one-way ANOVA (and Tukey HSD if significant) to a report file.

    All frames in ``datas`` are concatenated; the 'result' values of the
    'om', 'opr' and 'ompr' strategies are compared with ``stats.f_oneway``.
    A line ``<type>:<p-value>`` is appended to ``file_output``. When the
    p-value is below 0.05 the rounded p-value and the Tukey HSD table over
    all strategies present in the data are appended too.

    Parameters
    ----------
    type : str
        Label written in front of the p-value (name kept for compatibility
        although it shadows the builtin).
    file_output : str
        Path of the report file, opened in append mode.
    *datas
        DataFrames with 'strategy' and 'result' columns.
    """
    together = pd.concat(datas) if datas else pd.DataFrame()

    # Group by a scalar key so get_group() accepts plain strategy names on
    # every pandas version (a length-1 list key requires tuple lookups on
    # recent releases).
    groups = together.groupby('strategy')
    fvalue, pvalue = stats.f_oneway(
        groups.get_group("om")["result"],
        groups.get_group("opr")["result"],
        groups.get_group("ompr")["result"],
    )

    # The context manager guarantees the report is flushed and closed.
    with open(file_output, "a") as f:
        f.write(type + ":" + str(pvalue) + "\n")
        if pvalue < 0.05:
            mc = MultiComparison(together["result"], together["strategy"])
            mc_results = mc.tukeyhsd()
            print(str(round(pvalue, 4)), file=f)
            print(mc_results, file=f)
class CheckTuckeyHSD(object):
    """Shared Tukey HSD checks.

    Relies on ``endog``, ``groups``, ``alpha``, ``meandiff2``, ``confint2``
    and ``reject2`` being available as class attributes.
    """

    @classmethod
    def setup_class_(cls):
        # Build the comparison once and cache the Tukey result on the class.
        cls.mc = MultiComparison(cls.endog, cls.groups)
        cls.res = cls.mc.tukeyhsd(alpha=cls.alpha)

    def test_multicomptukey(self):
        details = self.res[1]
        assert_almost_equal(details[2], self.meandiff2, decimal=14)
        assert_almost_equal(details[4], self.confint2, decimal=2)
        assert_equal(details[1], self.reject2)

    def test_group_tukey(self):
        res_t = get_thsd(self.mc, alpha=self.alpha)
        assert_almost_equal(res_t[4], self.confint2, decimal=2)

    def test_shortcut_function(self):
        # check wrapper function
        shortcut = pairwise_tukeyhsd(self.endog, self.groups,
                                     alpha=self.alpha)
        assert_almost_equal(shortcut[1][4], self.res[1][4], decimal=14)
def create_df_with_all_post_hoc(df, hue='value'):
    """Collect Tukey HSD post-hoc rows for every measured variable.

    For each variable an OLS model ``<variable> ~ C(level)`` is fitted and a
    Tukey HSD across 'level' is computed. When the model F-test p-value is
    below 0.05 the post-hoc rows are stored; otherwise the variable's rows
    are filled with the string 'anova ns'.

    Returns a DataFrame indexed by (variable, comparison).
    """
    variables = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm",
        "dtml", "rcm"
    ]
    comparisons = ["comparaison 0", "comparaison 1", "comparaison 2"]
    columns = ['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject']

    index = pd.MultiIndex.from_product([variables, comparisons])
    # Start from an all-NaN table and fill it variable by variable.
    blank = np.full((len(variables) * len(comparisons), len(columns)), np.nan)
    full_df = pd.DataFrame(blank, index=index, columns=columns)

    for v in variables:
        anova_data = get_data_for_anova(df, v, hue)
        results = ols("{} ~ C(level)".format(v), data=anova_data).fit()
        post_hoc = MultiComparison(anova_data[v], anova_data['level']).tukeyhsd()
        df_ph = create_df_post_hoc(post_hoc, v)
        significant = results.f_pvalue < 0.05
        full_df.loc[v] = df_ph.values if significant else 'anova ns'

    return full_df
def print_hpcontrast(self, data: list, labels: list, alpha: float = 0.05):
    """Contrast the hypotheses of the given scores and print the outcome in the report.

    A Kruskal-Wallis test is run first; when its p-value is at most ``alpha``
    a Tukey HSD comparison between the models is appended.

    :param data: List containing the metric results of the models
    :type data: list
    :param labels: List containing the model tags
    :type labels: list
    :param alpha: Significance threshold of the tests, defaults to 0.05
    :type alpha: float, optional
    """
    _, pVal = stats.kruskal(*data)
    report = f"p-valor KrusW:{pVal}\n"

    if pVal > alpha:
        report = report + "Hypotheses are being accepted: the models are equal"
        self.print_noformat(report)
        return

    report += "Hypotheses are being rejected: the models are different\n"
    stacked_data = np.vstack(data).ravel()
    # One label per score, repeated for the number of scores of the first model.
    cv = len(data[0])
    stacked_model = np.vstack(
        [np.repeat("model" + tag, cv) for tag in labels]).ravel()
    comp = MultiComparison(stacked_data, stacked_model).tukeyhsd(alpha=alpha)
    report += str(comp)
    self.print_noformat(report)
def Tukey(self, Categorical, Continuous):
    """
    Calculate the Tukey Honest Significance Difference (HSD) test to identify
    the groups whose distributions are significantly different.

    Rows of ``self.df`` containing missing values are dropped first. Returns
    a DataFrame with columns 'Group 1', 'Group 2', 'Mean Difference' and
    'Reject'. When ``self.debug == 'YES'`` the elapsed time is printed.
    """
    complete_rows = self.df.dropna()
    started = time.time()

    mc = MultiComparison(complete_rows[Continuous], complete_rows[Categorical])
    result = mc.tukeyhsd()

    labels = mc.groupsunique
    first_idx, second_idx = mc.pairindices[0], mc.pairindices[1]
    group1 = [labels[position] for position in first_idx]
    group2 = [labels[position] for position in second_idx]
    meandiffs = [round(float(diff), 3) for diff in result.meandiffs]

    TukeyResult = pd.DataFrame(
        np.column_stack((group1, group2, meandiffs, result.reject)),
        columns=['Group 1', "Group 2", "Mean Difference", "Reject"])

    finished = time.time()
    if self.debug == 'YES':
        print('Tukey', finished - started)
    return TukeyResult
def anova_analysis(df):
    """
    anova_analysis takes in a data frame, sums seats sold per week and
    holiday period, and runs a one-way ANOVA across the four periods
    (hypothesis test 1). It prints the test results and the group means,
    saves a box plot to 'ANOVA.png' and prints the Tukey HSD comparison.
    """
    time_periods = df.groupby(['week_ending', 'Holiday'],
                              as_index=False)[['seats_sold']].sum()

    def seats_for(period):
        # Weekly seat totals belonging to one holiday period.
        return time_periods.loc[time_periods['Holiday'] == period, 'seats_sold']

    period_names = ('ThanksGiving', 'WinterBreak', 'SummerBreak', 'Not Holiday')
    f, p = stats.f_oneway(*[seats_for(period) for period in period_names])
    print('The f and p of ANOVA analysis are:')
    print(f, p)

    ## plot the mean of each group
    time_periods.boxplot('seats_sold', by='Holiday', figsize=(12, 8))
    fileName = 'ANOVA.png'
    plt.savefig(fileName)

    print("The mean seats sold of each time periods:")
    print(time_periods.groupby('Holiday')['seats_sold'].mean())

    pairwise = MultiComparison(time_periods['seats_sold'],
                               time_periods['Holiday'])
    result = pairwise.tukeyhsd()
    print(pairwise)
    print(result)
def posthoc_turron_by_gender(melted, variable):
    """Print a Tukey HSD comparison of 'value' across turron/gender combinations.

    Only rows of ``melted`` whose 'variable' equals ``variable`` are used.
    ``melted`` itself is left untouched.
    """
    # Filter first and copy the selection, so adding the label column below
    # writes to an independent frame instead of a slice (avoids pandas'
    # chained-assignment warning).
    df = melted[melted['variable'] == variable].copy()
    df['turron:gender'] = df['turron'] + "_" + df['gender']
    mc = MultiComparison(df['value'], df['turron:gender'])
    result = mc.tukeyhsd()
    print(result)
def tukey_multi_metrics(gather_df,
                        col_indices=list(range(10)) + list(range(15, 20)) +
                        list(range(25, 65)) + list(range(145, 149)) +
                        list(range(150, 180)),
                        alpha=0.05):
    """Run a Tukey HSD comparison between models for each selected metric.

    Parameters
    ----------
    gather_df
        DataFrame whose index has 'model', 'set' and 'fold' levels ('model'
        being the first level). Only the 'test_metrics' set is used and the
        'Folds Mean' / 'Folds Median' rows are dropped.
    col_indices : list of int
        Positions of the metric columns to analyse. The default list is never
        modified.
    alpha : float
        Significance level given to ``tukeyhsd``.

    Returns
    -------
    dict
        Metric name -> Tukey HSD result.
    """
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])
    tukey_dict = {}

    # drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')

    # one group label per remaining fold of each model
    model_names_rep = []
    for m in model_names:
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m] * k)

    for metric in metric_names:
        # sort_index() returns a new Series instead of sorting a column
        # slice in place.
        m_df = gather_df[metric].sort_index()
        m_df = m_df.loc[model_names]
        # .values replaces Series.as_matrix(), which pandas 1.0 removed.
        m_df_mat = np.around(m_df.values, decimals=4)
        mc_obj = MultiComparison(m_df_mat, model_names_rep)
        tukey_dict[metric] = mc_obj.tukeyhsd(alpha=alpha)

    return tukey_dict
def tukey_hsd(data, groups, metric):
    """Tukey HSD post-hoc comparison from statsmodels.

    The Tukey HSD post-hoc comparison test controls for type I error and
    maintains the familywise error rate at 0.05.
    The group1 and group2 columns are the groups being compared,
    meandiff is the difference in means of the two groups (group2 - group1),
    lower/upper are the boundaries of the 95% confidence interval,
    reject states whether or not the null hypothesis should be rejected.

    Args:
        data (Dataframe): Dataframe holding the ``groups`` and ``metric`` columns.
        groups (string): Name of the column whose values define the groups, i.e. label_type.
        metric (string): Name of the column holding the result values, i.e. values/times.

    Returns:
        tukey_frame (Dataframe): One row per pair of groups, with the
            statsmodels 'p-adj' column renamed to 'p-value'.
              group1   group2  meandiff  p-value   lower   upper  reject
            0      ap  ap type  -0.1167   0.8059 -0.5825  0.3492   False
            1      ap       da   0.1000   0.8543 -0.3659  0.5659   False
            2 ap type       da   0.2167   0.5158 -0.2492  0.6825   False
    """
    # Compare the results (metric) across the values of the grouping column
    comparison = MultiComparison(data=data[metric], groups=data[groups])
    outcome = comparison.tukeyhsd()

    # First row of the results table is the header, the rest are the pairs
    header, *pair_rows = outcome._results_table.data
    tukey_frame = pd.DataFrame(data=pair_rows, columns=header)
    return tukey_frame.rename(columns={'p-adj': 'p-value'})
def _calc_firing_rate(self, num_peaks: pd.DataFrame, epoch: str = "All_cells"):
    """Compare peak counts between the columns of ``num_peaks`` with Tukey HSD.

    The table is stacked into one long series and the original column labels
    (index level 1 after stacking) are used as groups. The Tukey table and
    its p-values are printed; if the test raises ``ValueError`` a notice is
    printed instead. The per-group mean is printed in every case.

    :param num_peaks: table of peak counts, one column per group
        (presumably one row per cell and three epoch columns, given the
        ``// 3`` in the printed cell count — TODO confirm).
    :param epoch: label used in the printed p-value header.
    :return: None
    """
    # NOTE(review): the original comment said stacking removes silent cells;
    # that only holds if they are stored as NaN and stack() drops NaN.
    split_data = num_peaks.stack()
    mc = MultiComparison(split_data.values,
                         split_data.index.get_level_values(1).values)
    try:
        res = mc.tukeyhsd()
    except ValueError:
        aprint("<yellow>Failed during the p-value calculation.</yellow>")
    else:
        print(res)
        print(
            f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
            psturng(
                np.abs(res.meandiffs / res.std_pairs),
                len(res.groupsunique),
                res.df_total,
            ),
        )
    finally:
        # Series.mean(level=...) was removed in pandas 2.0; an unsorted
        # groupby on the same level gives the same per-group means.
        print(split_data.groupby(level=1, sort=False).mean())
def preform_anova(df_anova, post_hoc=False):
    """Fit ``err ~ u + m + n + h + q + prob`` and report a type-2 ANOVA table.

    An 'r_sq' column is added to the table: each term's sum of squares
    divided by (that sum of squares + the residual sum of squares). The table
    is printed and written to 'analysis/anova_results.csv'.

    Parameters
    ----------
    df_anova
        DataFrame with the 'err' response and the factor columns named in the
        formula.
    post_hoc : bool
        When true, a Tukey HSD comparison of 'err' across the levels of every
        term with p < 0.05 is printed.
    """
    # Alternative model formulas, kept for reference:
    #   'err ~ C(u) + C(m) + C(n) + C(h) + C(q) + C(prob)'   (no interaction)
    #   'err ~ C(u) * C(m) * C(n) * C(h) * C(q) * C(prob)'   (with interactions)
    formula = 'err ~ u + m + n + h + q + prob'
    model = ols(formula, data=df_anova).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)  # Type 2 is for ANOVA DataFrame

    ### effect size (r_sq) per term
    # The residual is the last row; .iloc makes the positional access
    # explicit because the table is indexed by term name.
    sum_sq = aov_table['sum_sq']
    effects = sum_sq.iloc[:-1]
    residual = sum_sq.iloc[-1]
    aov_table.loc[:, 'r_sq'] = effects / (effects + residual)
    print(aov_table)

    if post_hoc:
        for name, row in aov_table.iterrows():
            # The residual row has a NaN p-value, so it never passes this test.
            if row['PR(>F)'] < 5e-2:
                # MultiComparison takes (data, groups): the response goes
                # first and the factor supplies the group labels.
                mc = MultiComparison(df_anova['err'], df_anova[name])
                mc_results = mc.tukeyhsd()
                print('post_hoc')
                print(mc_results)

    aov_table.to_csv('analysis/anova_results.csv')
def calc_Tukey(f):
    '''
    Run Tukey HSD on the 'value' column of one feature, grouped by 'KD'.

    f = name of the feature, used as key into data.grouped_features
    '''
    feature_table = data.grouped_features[f]
    comparison = MultiComparison(feature_table['value'], feature_table['KD'])
    return comparison.tukeyhsd()
def posthoc_turron_by_first_time_tasting(melted, variable):
    """Print Tukey HSD p-values and table for 'value' across turron/first-time-tasting groups.

    Only rows whose 'variable' equals ``variable`` and that contain no
    missing values are used.
    """
    from statsmodels.stats.libqsturng import psturng

    working = melted.copy()
    working = working[working['variable'] == variable].dropna()
    label_column = 'turron:first_time_tasting'
    working[label_column] = working['turron'] + "_" + working['first_time_tasting']

    res = MultiComparison(working['value'], working[label_column]).tukeyhsd()

    # p-values from the studentized range distribution
    studentized = np.abs(res.meandiffs / res.std_pairs)
    p_values = psturng(studentized, len(res.groupsunique), res.df_total)
    print(p_values)
    print(res)
def tukey(prediction: 'array of ints', model: 'array of string'):
    """
    Perform the Tukey HSD test.

    input: array(int(predictions)), array(str(model_names))
    The test tells whether there are significant differences between the
    classes. The result is printed and returned.
    """
    outcome = MultiComparison(prediction, model).tukeyhsd()
    print(outcome)
    return outcome
def perform_post_hoc_tukey(data, factor1, factor2, factor3, factor2_idx):
    """Print a Tukey HSD comparison of ``factor1`` by ``factor3`` for each ``factor2`` level.

    Levels of ``factor2`` are taken to be 1 .. ``len(factor2_idx)``.

    Parameters
    ----------
    data
        DataFrame holding the ``factor1``, ``factor2`` and ``factor3`` columns.
    factor1 : str
        Column with the values to compare.
    factor2 : str
        Column used to split the data into levels.
    factor3 : str
        Column providing the group labels inside each level.
    factor2_idx
        Sized collection; only its length is used.
    """
    # A separate loop variable keeps the ``factor2_idx`` argument intact.
    for level in range(1, len(factor2_idx) + 1):
        print(factor2, ': ', level)
        # Values and group labels come from the same filtered rows of
        # ``data``; the labels were previously read from an external global
        # table instead of the argument.
        subset = data[data[factor2] == level]
        mc = MultiComparison(subset[factor1], subset[factor3])
        result = mc.tukeyhsd()
        print(result)
        print(mc.groupsunique)
def tukey(data, names):
    """Flatten the nested ``data`` and print its Tukey HSD table and unique groups.

    ``names`` supplies one group label per flattened value.
    """
    names = np.array(names)
    # One flat array holding every value of every inner sequence, in order.
    flattened = np.array([val for item in data for val in item])

    comparison = MultiComparison(flattened, names)
    print(comparison.tukeyhsd())
    print(comparison.groupsunique)
def tukey(exp):
    """Print the Tukey HSD table and unique groups for ``exp``.

    Group labels are read from the module-level ``groups`` variable.
    """
    comparison = MultiComparison(exp, groups)
    print(comparison.tukeyhsd())
    print(comparison.groupsunique)


# calc_p_value(exp2__saccade_count, exp3__saccade_count)
# mean_calcs(exp3__saccade_count)
# anova(exp2__fix_dur)
def tukey_hsd(self, stacked_df, colname):
    '''
    Set up Tukey HSD as the post-ANOVA significance test.

    input:
        stacked_df: from table_transform, a stacked df with a 'state' column
        colname: string, the column whose values are compared across states
    return:
        the tukeyhsd summary table
    '''
    values = stacked_df[colname]
    states = stacked_df['state']
    outcome = MultiComparison(values, states).tukeyhsd()
    return outcome.summary()
def GroupTukeyHSD(self, continuous, categorical):
    """Group categories that Tukey HSD does not separate.

    A Tukey HSD test is run on ``continuous`` grouped by ``categorical``.
    Pairs with Reject == False are treated as sharing a distribution and are
    merged greedily: each not-yet-assigned 'Group 1' value starts a group and
    absorbs its not-yet-assigned 'Group 2' partners. Categories that end up
    in no group are reported as singletons.

    Parameters
    ----------
    continuous
        Values to compare.
    categorical
        Group labels; must offer ``.unique()``.

    Returns
    -------
    DataFrame with columns 'list_name', 'lists' and 'length', sorted by
    'length' descending. Best effort: an empty DataFrame is returned if any
    step raises an ordinary exception.
    """
    try:
        mc = MultiComparison(continuous, categorical)
        result = mc.tukeyhsd()

        unique_groups = mc.groupsunique
        group1 = [unique_groups[index] for index in mc.pairindices[0]]
        group2 = [unique_groups[index] for index in mc.pairindices[1]]
        meandiffs = [round(float(meandiff), 3) for meandiff in result.meandiffs]
        columns = ['Group 1', "Group 2", "Mean Difference", "Reject"]
        # column_stack turns every cell into a string, which is why Reject is
        # compared with the text 'False' below.
        TukeyResult = pd.DataFrame(
            np.column_stack((group1, group2, meandiffs, result.reject)),
            columns=columns)

        # Keep only the pairs that were NOT rejected, i.e. the pairs with a
        # similar distribution, then merge them into groups.
        not_rejected = TukeyResult[TukeyResult['Reject'] == 'False']
        assigned = []
        same_distribution_list = []

        if len(not_rejected) > 0:
            for leader in not_rejected['Group 1'].unique():
                if leader in assigned:
                    continue
                assigned.append(leader)
                members = [leader]
                partners = not_rejected[not_rejected['Group 1'] == leader]
                for entry in partners['Group 2'].unique():
                    if entry not in assigned:
                        assigned.append(entry)
                        members.append(entry)
                same_distribution_list.append(
                    dict(list_name=leader, lists=members, length=len(members)))
            # Categories not covered above, visited in first-appearance order
            # so the output does not depend on set ordering.
            # NOTE(review): ``assigned`` holds the stringified labels from
            # the table; non-string categories will not match them — confirm.
            leftovers = [category for category in categorical.unique()
                         if category not in assigned]
        else:
            leftovers = list(categorical.unique())

        for category in leftovers:
            same_distribution_list.append(
                dict(list_name=category, lists=[category], length=1))

        g1 = pd.DataFrame(same_distribution_list).sort_values(
            'length', ascending=False)
    except Exception:
        # Deliberate best-effort fallback; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        g1 = pd.DataFrame()
    return g1
def tukey_test(score_array):
    """Run a pairwise Tukey HSD test (alpha 0.05) between groups of scores.

    Parameters
    ----------
    score_array
        Sequence of 1-D score sequences, one per group. Groups may have
        different lengths.

    Returns
    -------
    The Tukey HSD result; its summary is also printed. Groups are labelled
    with the integers 1, 2, ... in input order.
    """
    # Integer label for every score. Building the flat label array directly
    # avoids creating a ragged 2-D array, which recent NumPy rejects.
    indexes = np.concatenate([
        np.full(len(scores), position, dtype=int)
        for position, scores in enumerate(score_array, start=1)
    ])
    values = np.concatenate([np.asarray(scores) for scores in score_array])

    # Perform the pairwise test once and reuse it for printing and returning.
    result = MultiComparison(values, indexes).tukeyhsd(0.05)
    print(result.summary())
    return result
class TukeyHSD(Difference):
    """Tukey HSD difference test.

    Assumes normality, IID, one-to-one data pairing.
    """
    # from https://cleverowl.uk/2015/07/01/using-one-way-anova-and-tukeys-test-to-compare-data-sets/
    name = "tukeyhsd"

    def run(self, data, groups):
        """Run the test and return ``(result, unique_groups)``; both are cached on the instance."""
        comparison = MultiComparison(data, groups)
        self.mc = comparison
        self.result = comparison.tukeyhsd()
        return self.result, self.mc.groupsunique

    def report(self):
        """Log the stored result and the unique groups through ``info``."""
        info("{} results:".format(self.name))
        info("result: {}".format(str(self.result)))
        info("Unique groups: {}".format(self.mc.groupsunique))
def test_table_names_custom_group_order(self):
    """Groups must be reported in the order given by ``group_order``."""
    custom_order = [b'physical', b'medical', b'mental']
    mc = MultiComparison(self.endog, self.groups, group_order=custom_order)
    res = mc.tukeyhsd(alpha=self.alpha)
    t = res._results_table

    expected_order = [(b'physical', b'medical'),
                      (b'physical', b'mental'),
                      (b'medical', b'mental')]
    # Row 0 of the table is the header, so the pairs start at row 1.
    for row_number, expected_pair in enumerate(expected_order, start=1):
        reported_pair = (t[row_number][0].data, t[row_number][1].data)
        assert_(reported_pair == expected_pair)
( 30, 'medical', 1 )], dtype=[('idx', '<i4'), ('Treatment', '|S8'), ('StressReduction', '<i4')]) # First, do an one-way ANOVA df = pd.DataFrame(dta2) model = ols('StressReduction ~ C(Treatment)',df).fit() anovaResults = anova_lm(model) print anovaResults if anovaResults['PR(>F)'][0] < 0.05: print('One of the groups is different.') #Then, do the multiple testing mod = MultiComparison(dta2['StressReduction'], dta2['Treatment']) print mod.tukeyhsd()[0] # The following code produces the same printout res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment']) #print res2[0] # Show the group names print mod.groupsunique # Generate a print import matplotlib.pyplot as plt plt.plot([0,1,2], res2[1][2], 'o') plt.errorbar([0,1,2], res2[1][2], yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o') xlim = -0.5, 2.5 plt.hlines(0, *xlim) plt.xlim(*xlim)
spectraTransform[np.where(dominant == listDominant[11])[0], w], spectraTransform[np.where(dominant == listDominant[12])[0], w], spectraTransform[np.where(dominant == listDominant[13])[0], w], spectraTransform[np.where(dominant == listDominant[14])[0], w], spectraTransform[np.where(dominant == listDominant[15])[0], w], spectraTransform[np.where(dominant == listDominant[16])[0], w], spectraTransform[np.where(dominant == listDominant[17])[0], w], spectraTransform[np.where(dominant == listDominant[18])[0], w], spectraTransform[np.where(dominant == listDominant[19])[0], w], spectraTransform[np.where(dominant == listDominant[20])[0], w], spectraTransform[np.where(dominant == listDominant[21])[0], w], spectraTransform[np.where(dominant == listDominant[22])[0], w]) # If the anova turns back a pvalue < 0.05, do multicomparison to figure out what samples are different if anovaResults[w, 1] < 0.05: mc = MultiComparison(spectraTransform[:, w], dominant) # http://statsmodels.sourceforge.net/0.6.0/_modules/statsmodels/stats/multicomp.html result = mc.tukeyhsd() # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.MultiComparison.tukeyhsd.html inResults = np.array([mc.groupsunique[mc.pairindices[0]], mc.groupsunique[mc.pairindices[1]], result.meandiffs, result.confint[:, 0], result.confint[:, 1], result.std_pairs, result.reject]).T inResults = np.column_stack((np.repeat(wavelengths[w], len(result.reject)), inResults)) tukeyResults = np.vstack((tukeyResults, inResults)) # Set up csv file to output statistical results outStats = file(outLocation + dateTag + '_statistical_analysis.csv', 'wb') # Opening in append mode row1 = np.hstack(('normal distribution p value for original spectra', normalStats)) row2 = np.hstack(('kurtosis p value for original spectra', kurtosisStats)) row3 = np.hstack(('skew p value for original spectra', skewStats)) row4 = np.hstack(('normal distribution p value for transformed spectra', normalTransformStats)) row5 = 
np.hstack(('kurtosis p value for transformed spectra', kurtosisTransformStats)) row6 = np.hstack(('skew p value for transformed spectra', skewTransformStats)) row7 = np.hstack(('anova results for transformed spectra', anovaResults[:, 1])) inRows = np.vstack((row1, row2, row3, row4, row5, row6, row7)) np.savetxt(outStats, inRows, fmt='%s', delimiter=',')
def main():
    """Run a one-way ANOVA and several multiple-comparison tests on sample data.

    The function builds a small fixed data set (30 observations, three
    treatment groups), fits ``StressReduction ~ C(Treatment)``, prints the
    ANOVA table, runs Tukey's HSD test, plots the pairwise mean differences
    with their confidence intervals to ``MultComp.png``, and finally prints
    pairwise t-tests with Holm and Bonferroni corrections.

    Returns:
        The ``variance`` attribute of the Tukey HSD result object.
    """
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    from statsmodels.stats.multitest import multipletests
    import matplotlib.pyplot as plt

    # Set up the data, as a structured array.
    # The first and last field are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
        (1, 'mental', 2),
        (2, 'mental', 2),
        (3, 'mental', 3),
        (4, 'mental', 4),
        (5, 'mental', 4),
        (6, 'mental', 5),
        (7, 'mental', 3),
        (8, 'mental', 4),
        (9, 'mental', 4),
        (10, 'mental', 4),
        (11, 'physical', 4),
        (12, 'physical', 4),
        (13, 'physical', 3),
        (14, 'physical', 5),
        (15, 'physical', 4),
        (16, 'physical', 1),
        (17, 'physical', 1),
        (18, 'physical', 2),
        (19, 'physical', 3),
        (20, 'physical', 3),
        (21, 'medical', 1),
        (22, 'medical', 2),
        (23, 'medical', 2),
        (24, 'medical', 2),
        (25, 'medical', 3),
        (26, 'medical', 2),
        (27, 'medical', 3),
        (28, 'medical', 1),
        (29, 'medical', 3),
        (30, 'medical', 1)],
        dtype=[('idx', '<i4'),
               ('Treatment', '|S8'),
               ('StressReduction', '<i4')])

    # First, do a one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)
    # The ANOVA table is indexed by term name, so the first row has to be
    # selected by position explicitly.
    if anovaResults['PR(>F)'].iloc[0] < 0.05:
        print('One of the groups is different.')

    # Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print(mod.tukeyhsd().summary())

    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])

    # Show the group names
    print(mod.groupsunique)

    # Generate a plot of the pairwise mean differences
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    # Half-width of each confidence interval, used as symmetric error bar
    errors = np.ravel(np.diff(res2.confint) / 2)
    # 'o' is a marker code, not a line style, so it belongs in ``fmt``
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')

    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[
        np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig(outFile, dpi=200)
    print('Figure written to {0}'.format(outFile))
    plt.show()

    # Instead of the Tukey's test, we can do pairwise t-test
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print(rtp[0])

    # and then with the Bonferroni correction
    print(mod.allpairtest(stats.ttest_rel, method='b')[0])

    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to
    # use a few tricks:
    # (http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    mean_diffs = res2.meandiffs
    # The studentized statistic divides by the pairwise standard error
    # (``std_pairs``), not by the pooled variance itself.
    pair_std = res2.std_pairs
    t_stat = (mean_diffs / pair_std) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided

    # Now with the Bonferroni correction
    res_b = multipletests(my_pvalues, method='b')

    return res2.variance
def run_stats(experiment):
    '''Run independent T-test or one-way ANOVA dependent on number of groups.

    Sample columns are assigned to a group when their name is the group name
    followed by an underscore and a number (e.g. ``ctrl_1``). Each row of the
    experiment's data frame is tested separately.

    Args:
        experiment (Experiment instance): An instance of the Experiment class.
            Only ``get_groups()``, ``get_sampleids()`` and ``df`` are used.

    Returns:
        A new Pandas data frame holding the first five columns of the
        experiment's data frame plus ``p_val``, ``p_val_adj`` (Benjamini-
        Hochberg) and, if there are > 2 groups, one ``significant_<a>_<b>``
        column of Tukey HSD reject flags per pair of groups.
    '''
    groups = experiment.get_groups()
    samples = experiment.get_sampleids()
    df = experiment.df

    ## Get values for each group, ready for T-test or ANOVA.
    ## all_vals[g][r] is the list of values of group g in row r.
    all_vals = []
    for group in groups:
        # Escape the group name so that regex metacharacters in it are
        # matched literally.
        sample_re = re.compile(re.escape(group) + r"_\d+$")
        ids = [sample for sample in samples if sample_re.match(sample)]
        all_vals.append(df[ids].values.tolist())

    ## Decide whether to use T-test or ANOVA dependent on number of groups.
    ## zip(*all_vals) walks the rows, yielding one value list per group.
    if len(groups) == 2:
        p_vals = [ttest_ind(first, second)[1]
                  for first, second in zip(*all_vals)]
    else:
        p_vals = [f_oneway(*row_vals)[1] for row_vals in zip(*all_vals)]

    ## Adjust the p values and create a new data frame with them in.
    p_val_adj = list(multipletests(p_vals, method='fdr_bh')[1])
    # .ix was removed from pandas; the slice is positional, hence .iloc.
    new_df = df.iloc[:, :5].copy()
    new_df['p_val'] = pd.Series(p_vals, index=new_df.index)
    new_df['p_val_adj'] = pd.Series(p_val_adj, index=new_df.index)

    ## Post-hoc test.
    ## Only do the post-hoc test if there are more than 2 groups.
    if len(groups) > 2:
        vals_df = df[samples]
        group_ids = [sample.split('_')[0]
                     for sample in vals_df.columns.values]
        posthoc_results = {}

        ## Run the post-hoc test on each row.
        for row in range(len(vals_df)):
            row_vals = vals_df.iloc[row]
            mc = MultiComparison(row_vals, group_ids)
            mc_groups = mc.groupsunique
            results = mc.tukeyhsd()
            first_idx, second_idx = (x.tolist() for x in mc.pairindices)

            ## Go through each pair and add results to the
            ## posthoc_results dictionary.
            for idx_a, idx_b, significant in zip(first_idx, second_idx,
                                                 results.reject):
                low, high = sorted((idx_a, idx_b))
                pair_name = str(mc_groups[low]) + '_' + str(mc_groups[high])
                posthoc_results.setdefault(pair_name, []).append(significant)

        ## Add the post-hoc results to the data frame.
        for pair_name, flags in posthoc_results.items():
            new_df['significant_' + pair_name] = flags

    return new_df
('Pat', 9), ('Pat', 4), ('Jack', 4), ('Jack', 8), ('Jack', 7), ('Jack', 5), ('Jack', 1), ('Jack', 5), ('Alex', 9), ('Alex', 8), ('Alex', 8), ('Alex', 10), ('Alex', 5), ('Alex', 10)], dtype = [('Archer','|U5'),('Score', '<i8')]) f, p = stats.f_oneway(data[data['Archer'] == 'Pat'].Score, data[data['Archer'] == 'Jack'].Score, data[data['Archer'] == 'Alex'].Score) print ('One-way ANOVA') print ('=============') print ('F value:', f) print ('P value:', p, '\n') mc = MultiComparison(data['Score'], data['Archer']) result = mc.tukeyhsd() print(result) print(mc.groupsunique)