def compute_anova_rev_restrict_t(topdir: str, m: int):
    """Print a two-way ANOVA of REV on tolerance and neighbour count.

    One table is loaded per (neighbour count, tolerance) combination from
    the directory ``<topdir>/nn_<tol>_<n>`` via ``ac.compute_stored_runs``.
    The tables are concatenated, the rows with tolerance 1.0 are discarded,
    and the following are printed: the OLS summary for
    ``REV ~ C(TOL) + C(NNN) + C(TOL):C(NNN)``, the type-2 ANOVA table, and
    Tukey HSD comparisons of REV across tolerances and across neighbour
    counts.

    Parameters
    ----------
    topdir:
        Directory that contains the per-case sub-directories.
    m:
        Forwarded unchanged to ``ac.compute_stored_runs``.
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']

    case_tables = []
    for n in neighbors:
        for tol in tolerances:
            casedir = topdir + '/' + 'nn' + '_' + tol + '_' + n
            casetable = ac.compute_stored_runs(casedir, m, None)
            # Scalar assignment broadcasts to however many rows the case
            # table has, instead of assuming exactly five runs per case.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            case_tables.append(casetable)

    dfa = pd.concat(case_tables).reset_index(drop=True)
    # The tolerance 1.0 cases are loaded but excluded from the analysis.
    df = dfa[dfa['TOL'] != 1.0]

    # Perform a regression with the data
    results = ols('REV ~ C(TOL) + C(NNN) + C(TOL):C(NNN)', data=df).fit()
    print(results.summary())
    print('\n\n\n')

    aov_table = sm.stats.anova_lm(results, typ=2)
    print(aov_table)
    print('\n\n\n')

    # Post-hoc pairwise comparisons for each factor separately.
    for factor in ('TOL', 'NNN'):
        print(MultiComparison(df['REV'], df[factor]).tukeyhsd())
def tukeyhsd(statistics_table: pandas.DataFrame,
             column: str) -> Dict[str, TukeyHSDResults]:
    """
    Performs Tukey multiple-comparison statistics.

    A Tukey HSD test is run for every grouping column ('plate', 'strain'
    and, when more than one condition is present, 'condition') that has
    more than two distinct values. A combined condition/strain comparison
    is always run and stored under the key 'condition_strain'.

    Note: a 'condition:strain' column is added to `statistics_table`
    in place.

    Parameters
    ----------
    statistics_table: A table with each subject as a separate column
    column: The column with the relevant values. Should be identical to the
        `y` variable used when generating figures.

    Returns
    -------
    A dictionary mapping the grouping name to its Tukey HSD result.
    """
    has_multiple_conditions = statistics_table['condition'].nunique() != 1
    grouping_columns = ['plate', 'strain']
    if has_multiple_conditions:
        grouping_columns.append('condition')

    results_by_group = dict()
    for grouping in grouping_columns:
        logger.debug(f"tukey subject: '{grouping}'")
        logger.debug(
            f"tukey subject values: {statistics_table[grouping].unique()}")

        # MultiComparison doesn't work when there are only two possible
        # groups, so skip any grouping with two or fewer categories.
        if statistics_table[grouping].nunique() <= 2:
            continue
        comparison = MultiComparison(statistics_table[column],
                                     statistics_table[grouping])
        results_by_group[grouping] = comparison.tukeyhsd()

    # Combined label such as "<condition>-<strain>" for the joint comparison.
    combined_labels = statistics_table['condition'] + "-" + statistics_table['strain']
    statistics_table['condition:strain'] = combined_labels
    joint_comparison = MultiComparison(statistics_table[column],
                                       statistics_table['condition:strain'])
    results_by_group['condition_strain'] = joint_comparison.tukeyhsd()
    return results_by_group
def analyzeData(results2):
    """Print repeated-measures ANOVA and pairwise tests per measure.

    For each of the 'Accuracy' and 'Reaction Time' columns of `results2`
    this prints the measure name, the fitted `AnovaRM` result (subject
    column 'Subject', within-factor 'Condition', cells aggregated by mean)
    and the Bonferroni-corrected table of paired t-tests between
    conditions.
    """
    for measure in ('Accuracy', 'Reaction Time'):
        print(measure)
        anova = AnovaRM(data=results2,
                        depvar=measure,
                        subject='Subject',
                        within=['Condition'],
                        aggregate_func='mean')
        print(anova.fit())

        pairwise = MultiComparison(results2[measure], results2['Condition'])
        # allpairtest returns a tuple; element 0 is the printable table.
        outcome = pairwise.allpairtest(sci.ttest_rel, method='bonf')
        print(outcome[0])
def ANOVA_TimePoints(self, combine_sexes=True):
    """
    Run a one-way ANOVA and Tukey HSD for every time point.

    Hardcoded to use count as the parameter and treatment as the grouping
    identifier. This is a limitation due to these values being in a string,
    can work around this later by building a string according to the format
    below.

    Parameters
    ----------
    combine_sexes:
        When truthy, every treatment is analysed together. Otherwise the
        analysis is run first for the treatments listed in
        ``self.male_treatment_labels`` and then for those in
        ``self.female_treatment_labels``.

    Returns
    -------
    (summary_to_return, mc_results_to_return):
        The fitted OLS results and the Tukey HSD results, one entry per
        analysed time point, in the order they were computed. Both branches
        return this tuple; previously the combined branch returned nothing.
    """
    summary_to_return = []
    mc_results_to_return = []

    def _analyse_all_timepoints(treatment_labels=None):
        # Fit and report one model per time point, optionally restricted
        # to the given treatment labels.
        for timepoint in self.pr_columns:
            print("Time point: " + timepoint)
            selection = self.df_dropna['Time Point'] == timepoint
            if treatment_labels is not None:
                selection = selection & self.df_dropna['Treatment'].isin(
                    treatment_labels)
            df_timepoint = self.df_dropna.loc[selection]

            results = ols('Count ~ C(Treatment)', data=df_timepoint).fit()
            print(results.summary())

            mc = MultiComparison(df_timepoint['Count'],
                                 df_timepoint['Treatment'])
            mc_results = mc.tukeyhsd()
            print(mc_results)

            summary_to_return.append(results)
            mc_results_to_return.append(mc_results)

    if combine_sexes:
        _analyse_all_timepoints()
    else:
        _analyse_all_timepoints(self.male_treatment_labels)
        _analyse_all_timepoints(self.female_treatment_labels)

    return summary_to_return, mc_results_to_return
def test_incorrect_output(self):
    """Check MultiComparison input validation and `group_order` subsetting."""
    # too few groups, too many groups, just one group
    for invalid_groups in ([1, 2] * 4, [1, 2] * 6, [1] * 10):
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                      invalid_groups)

    # group_order doesn't select all observations, only one group left
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        assert_raises(ValueError,
                      MultiComparison,
                      np.array([1] * 10), [1, 2] * 5,
                      group_order=[1])

    # group_order doesn't select all observations,
    # we do tukey_hsd with reduced set of observations
    data = np.arange(15)
    groups = np.repeat([1, 2, 3], 5)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
        assert_equal(len(caught), 1)
        assert issubclass(caught[0].category, UserWarning)

    res1 = mod1.tukeyhsd(alpha=0.01)
    mod2 = MultiComparison(np.array(data[:10]), groups[:10])
    res2 = mod2.tukeyhsd(alpha=0.01)

    # The restricted model must match one built from the first two groups
    # only: first compare the results, then the models themselves.
    comparisons = (
        (res1, res2, [
            'confint', 'data', 'df_total', 'groups', 'groupsunique',
            'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs',
            'variance'
        ]),
        (mod1, mod2, [
            'data', 'datali', 'groupintlab', 'groups', 'groupsunique',
            'ngroups', 'nobs', 'pairindices'
        ]),
    )
    for restricted, reference, attributes in comparisons:
        for att in attributes:
            assert_allclose(getattr(restricted, att),
                            getattr(reference, att),
                            rtol=1e-14,
                            err_msg=att + 'failed')
def tukey(structure, alpha, valutation):
    """Run a Tukey HSD test for one measure, then plot and save the result.

    The data and group labels come from ``make_datagroup(structure,
    valutation)``. The simultaneous confidence-interval plot is saved as
    ``Tukey_HSD_test_<suffix>.png`` and the textual result as
    ``tukey_HSD_<suffix>.txt`` under ``path + "results/run/plot/"``; the
    result is also printed.

    Parameters
    ----------
    structure:
        Forwarded to ``make_datagroup``.
    alpha:
        Significance level passed to ``tukeyhsd``.
    valutation:
        Measure identifier. ``'ap'`` and ``'p_10'`` select the AP and P(10)
        labels; any other value is treated as Rprec.
    """
    # Axis label and file-name suffix are the only per-measure differences.
    if valutation == 'ap':
        xlabel, suffix = "Average Precision (AP)", "ap"
    elif valutation == 'p_10':
        xlabel, suffix = "P(10)", "p10"
    else:
        xlabel, suffix = "Rprec", "rprec"

    data, group = make_datagroup(structure, valutation)
    result = MultiComparison(data, group).tukeyhsd(alpha)

    fig = result.plot_simultaneous()  # Plot group confidence intervals
    fig.set_figwidth(30)
    fig.set_figheight(20)
    axes = fig.gca()
    fig.suptitle('Tukey_HSD test', fontsize=40)
    axes.set_xlabel(xlabel, fontsize=30)
    axes.tick_params(labelsize=30)
    fig.savefig(path + "results/run/plot/Tukey_HSD_test_" + suffix + ".png",
                dpi=300)

    # The context manager closes the report file even if writing fails.
    with open(path + "results/run/plot/tukey_HSD_" + suffix + ".txt",
              "w") as report:
        report.write(str(result))
    print(result)
def tukey_test(self):
    """Apply Tukey's test to the dataframe, by rows and by columns.

    Tukey is a multi-comparison method. Discover more at
    https://en.wikipedia.org/wiki/Tukey%27s_range_test .
    Be sure you are working with a normal distribution.

    Each summary is printed and written to ``self.results``.

    Returns a tuple ``(row_summary, column_summary)``.
    """

    def _run_and_report(values, labels, heading):
        # Run the test, then echo the same text to stdout and the results
        # file.
        summary = MultiComparison(values, labels).tukeyhsd().summary()
        report = "\n" + heading + "\n" + str(summary) + "\n"
        print(report)
        self.results.write(report)
        return summary

    row_summary = _run_and_report(self.df.values, self.df.index,
                                  "Tukey test for rows")
    column_summary = _run_and_report(self.df.T.values, self.df.columns,
                                     "Tukey test for columns")
    return (row_summary, column_summary)
def tukey_multi_metrics(gather_df,
                        col_indices=list(range(10)) + list(range(15, 20)) +
                        list(range(25, 65)) + list(range(145, 149)) +
                        list(range(150, 180)),
                        alpha=0.05):
    """Run a Tukey HSD comparison between models for each selected metric.

    Parameters
    ----------
    gather_df:
        DataFrame whose index has the levels 'model', 'set' and 'fold'
        (the first index level supplies the model names). Only the
        'test_metrics' set is used, and the 'Folds Mean' / 'Folds Median'
        rows are dropped.
    col_indices:
        Positions of the metric columns to analyse. The default list is
        never modified by this function.
    alpha:
        Significance level passed to ``tukeyhsd``.

    Returns
    -------
    dict
        Maps each metric name to its Tukey HSD result.
    """
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])
    tukey_dict = {}

    # drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')

    # get fold count: repeat each model name once per remaining fold row so
    # the labels line up with the values selected below
    model_names_rep = []
    for m in model_names:
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m] * k)

    for metric in metric_names:
        # sort_index() returns a new object, so the extracted column is
        # not modified in place.
        m_df = gather_df[metric].sort_index()
        m_df = m_df.loc[model_names]
        # `.values` replaces `as_matrix()`, which was removed from pandas.
        m_df_mat = np.around(m_df.values, decimals=4)
        mc_obj = MultiComparison(m_df_mat, model_names_rep)
        tukey_dict[metric] = mc_obj.tukeyhsd(alpha=alpha)

    return tukey_dict
def anova_all(type, file_output, *datas):
    """Append a one-way ANOVA p-value, and Tukey HSD if significant, to a file.

    All given frames are concatenated. The 'result' values of the "om",
    "opr" and "ompr" strategies are compared with a one-way ANOVA, and the
    line ``<type>:<pvalue>`` is appended to `file_output`. When the p-value
    is below 0.05, the rounded p-value and a Tukey HSD table are appended
    as well.

    Parameters
    ----------
    type:
        Label written in front of the p-value.
    file_output:
        Path of the text file to append to.
    *datas:
        DataFrames with at least the columns 'strategy' and 'result'.
    """
    # Leading empty frame keeps the result identical to concatenating the
    # inputs one by one onto an empty DataFrame.
    together = pd.concat([pd.DataFrame(), *datas])

    # Grouping by a plain column name keeps get_group(<scalar>) valid
    # across pandas versions.
    groups = together.groupby('strategy')
    # Only the "o*" strategies are tested; "m", "pr", "mpr" and "ols" were
    # deliberately left out of the ANOVA.
    fvalue, pvalue = stats.f_oneway(
        groups.get_group("om")["result"],
        groups.get_group("opr")["result"],
        groups.get_group("ompr")["result"],
    )

    # The context manager closes (and flushes) the report file.
    with open(file_output, "a") as f:
        f.write(type + ":" + str(pvalue) + "\n")
        if pvalue < 0.05:
            # NOTE(review): the post-hoc test uses every strategy present
            # in the data, not just the three tested above - confirm this
            # is intended.
            mc = MultiComparison(together["result"], together["strategy"])
            mc_results = mc.tukeyhsd()
            print(str(round(pvalue, 4)), file=f)
            print(mc_results, file=f)
def get_tukey(exp, df_all, measure):
    """Run a Tukey HSD test across strains and save the pairwise p-values.

    The test is only run when `df_all` contains at least three strains;
    otherwise an empty table is produced. Rows whose `measure` value is not
    finite are excluded. The table is saved through ``pfiles.save_csv``
    into ``exp.dir_tukey`` and returned.

    Tukey posthoc analysis references:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones
    q, res_table, std_pairs, etc can be found from print(dir(result)), which
    lists all possible calculations.

    Returns
    -------
    DataFrame with the columns 'group1', 'group2' and 'p_value', one row
    per pair of strains.
    """
    if len(df_all.groupby('strain').count()) >= 3:
        df_tukey = df_all[np.isfinite(df_all[measure])]
        mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
        result = mc.tukeyhsd()
        p = np.atleast_1d(
            psturng(np.abs(result.meandiffs / result.std_pairs),
                    len(result.groupsunique), result.df_total))

        # Row 0 of the results table is the header. Every remaining row is
        # one pairwise comparison, so report all of them rather than only
        # the first three.
        pair_rows = result._results_table[1:]
        df_pairs = pd.DataFrame({
            'group1': [row[0] for row in pair_rows],
            'group2': [row[1] for row in pair_rows],
            'p_value': [np.around(value, 4) for value in p],
        })
    else:
        df_pairs = pd.DataFrame({'group1': [], 'group2': [], 'p_value': []})

    file_out = exp.name + '_coupling_' + measure + '_' + '_tukey_' + '.csv'
    pfiles.save_csv(df_pairs, file_out, exp.dir_tukey, False)
    return df_pairs
def scipy_anova_post_hoc_tests(df=None,
                               flight_status_col='flight status new',
                               sig_test=stats.f_oneway):
    """
    Compare pre/mid/post-flight counts for each aberration type.

    df should be melted by aberration type: it needs the columns
    'aberration type', 'count per cell' and `flight_status_col`.

    For every aberration type the p-value of `sig_test` across the
    'Pre-Flight', 'Mid-Flight' and 'Post-Flight' groups is printed. When
    that p-value is at most 0.05, a Tukey HSD table and its pairwise
    p-values are printed as well.
    """
    flight_phases = ('Pre-Flight', 'Mid-Flight', 'Post-Flight')

    # loop through aberrations & perform anovas between pre/mid/post
    for aberr in list(df['aberration type'].unique()):
        is_aberr = df['aberration type'] == aberr
        samples = [
            df[(df[flight_status_col] == phase) & is_aberr]['count per cell']
            for phase in flight_phases
        ]
        statistic, p_value = sig_test(*samples)
        print(aberr, p_value)

        # if anova detects sig diff, perform post-hoc tests
        if p_value > 0.05:
            continue

        aberr_rows = df[df['aberration type'] == aberr]
        comparison = MultiComparison(
            aberr_rows['count per cell'],
            df[df['aberration type'] == aberr][flight_status_col])
        res = comparison.tukeyhsd()
        print(res)
        print(
            f'pvalues: {list(psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total))}'
        )
        print('\n')
def get_tukey(df_groups, measure, groups):
    """Run a Tukey HSD test across strains and annotate significance.

    The data is taken from ``df_groups[3]``; rows whose `measure` value is
    not finite are excluded. `groups` is currently unused.

    Tukey posthoc analysis references:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones
    q, res_table, std_pairs, etc can be found from print(dir(result)), which
    lists all possible calculations.

    Returns
    -------
    DataFrame with the columns 'group1', 'group2', 'p_value' and
    'significance' (the output of ``get_stars`` for each p-value), one row
    per pair of strains.
    """
    source = df_groups[3]
    df_tukey = source[np.isfinite(source[measure])]

    mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
    result = mc.tukeyhsd()
    p = np.atleast_1d(
        psturng(np.abs(result.meandiffs / result.std_pairs),
                len(result.groupsunique), result.df_total))

    # Row 0 of the results table is the header. Every remaining row is one
    # pairwise comparison, so report all of them rather than only the
    # first three.
    pair_rows = result._results_table[1:]
    df_pairs = pd.DataFrame({
        'group1': [row[0] for row in pair_rows],
        'group2': [row[1] for row in pair_rows],
        'p_value': [np.around(value, 4) for value in p],
    })
    df_pairs['significance'] = [
        get_stars(p_value) for p_value in df_pairs['p_value']
    ]
    return df_pairs
def posthoc_turron_by_gender(melted, variable):
    """Print a Tukey HSD table for turron/gender combinations.

    Only the rows of `melted` whose 'variable' column equals `variable`
    are used. The groups are the labels "<turron>_<gender>" and the
    compared values come from the 'value' column. `melted` itself is not
    modified.
    """
    working = melted.copy()
    selected = working[working['variable'] == variable]
    selected['turron:gender'] = selected['turron'] + "_" + selected['gender']

    comparison = MultiComparison(selected['value'], selected['turron:gender'])
    print(comparison.tukeyhsd())
def calculate_test(self):
    """Apply the Holm-Bonferroni test to the dataframe, by rows and columns.

    Bonferroni is a multi-comparison method. Discover more at
    https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method .
    Be sure you are working with a normal distribution.

    All pairs are compared with a paired t-test (``stats.ttest_rel``) and
    corrected with the 'Holm' method. Each result is printed and written
    to ``self.results``.

    Returns a tuple ``(row_result, column_result)``.
    """

    def _run_and_report(values, labels, heading):
        # Run the pairwise tests, then echo the same text to stdout and
        # the results file.
        outcome = MultiComparison(values, labels).allpairtest(
            stats.ttest_rel, method='Holm')
        report = "\n" + heading + "\n" + str(outcome) + "\n"
        print(report)
        self.results.write(report)
        return outcome

    row_result = _run_and_report(self.data.values, self.data.index,
                                 "Holm-Bonferroni test for rows")
    column_result = _run_and_report(self.data.T.values, self.data.columns,
                                    "Holm-Bonferroni test for columns")
    return (row_result, column_result)
def tukey_hsd(data, groups, metric):
    """Tukey HSD post-hoc comparison from statsmodels, as a dataframe.

    The Tukey HSD post-hoc comparison test controls for type I error and
    maintains the familywise error rate at 0.05.
    The group1 and group2 columns are the groups being compared,
    meandiff column is the difference in means of the two groups being
    calculated as group2 - group1,
    lower/upper columns are the lower/upper boundaries of the 95%
    confidence interval,
    reject column states whether or not the null hypothesis should be
    rejected.

    Args:
        data (Dataframe): Dataframe grouped by dialogue group and
            label_type values.
        groups (string): Name of the column whose values define the groups
            to compare i.e. label_type.
        metric (string): Name of the column that holds the result values
            i.e. values/times.

    Returns:
        tukey_frame (Dataframe): One row per compared pair, with the
            'p-adj' column renamed to 'p-value'.
              group1  group2    meandiff  p-value   lower   upper   reject
            0 ap      ap type   -0.1167   0.8059    -0.5825 0.3492  False
            1 ap      da        0.1000    0.8543    -0.3659 0.5659  False
            2 ap type da        0.2167    0.5158    -0.2492 0.6825  False
    """
    # Compare the results (metric) across the values of the grouping column
    comparison = MultiComparison(data=data[metric], groups=data[groups])
    table_rows = comparison.tukeyhsd()._results_table.data

    # The first row of the results table is the header, the rest is data
    header = table_rows[0]
    body = table_rows[1:]
    tukey_frame = pd.DataFrame(data=body, columns=header)
    tukey_frame.rename(columns={'p-adj': 'p-value'}, inplace=True)
    return tukey_frame
def print_hpcontrast(self, data: list, labels: list, alpha: float = 0.05):
    """Contrast the hypotheses of the given scores and print the results in
    the report, using the Kruskal-Wallis and Tukey HSD tests.

    The Tukey HSD table is only added when the Kruskal-Wallis p-value is
    at most ``alpha``. Samples may differ in size and labels may be any
    objects that can be formatted into a string.

    :param data: List containing the metric results of the models, one
        sequence of scores per model
    :type data: list
    :param labels: List containing the model tags, one per entry of
        ``data``
    :type labels: list
    :param alpha: Significance level of the tests, defaults to 0.05
    :type alpha: float, optional
    """
    _, pVal = stats.kruskal(*data)
    str_toprint = f"p-valor KrusW:{pVal}\n"

    if pVal <= alpha:
        str_toprint += (
            "Hypotheses are being rejected: the models are different\n")
        # Flatten the scores and build one "model<tag>" label per score.
        # Sizes are taken per sample, so unequal numbers of scores work.
        samples = [np.asarray(scores).ravel() for scores in data]
        stacked_data = np.concatenate(samples)
        stacked_model = np.repeat([f"model{tag}" for tag in labels],
                                  [sample.size for sample in samples])
        multi_comp = MultiComparison(stacked_data, stacked_model)
        comp = multi_comp.tukeyhsd(alpha=alpha)
        str_toprint += str(comp)
    else:
        str_toprint += "Hypotheses are being accepted: the models are equal"

    self.print_noformat(str_toprint)
def pairwise_ttest(val_vec, cnf):
    """Print Bonferroni-corrected pairwise t-tests between subclusters.

    `val_vec` maps a subcluster identifier to a mapping that holds, under
    the key `cnf`, the scores of that subcluster. After the test table,
    descriptive statistics per subcluster are printed.

    Note: this sets the global pandas option ``display.float_format`` to
    three decimals and does not restore it.
    """
    cluster = []
    score = []
    for subc, dic_conf in val_vec.items():
        values = dic_conf[cnf]
        # One subcluster label per score
        cluster.extend([str(subc)] * len(values))
        score.extend(values)

    df = pd.DataFrame({'subcluster': cluster, 'score': score})

    comparison = MultiComparison(df['score'], df['subcluster'])
    # allpairtest returns a tuple; element 0 is the printable table
    print(comparison.allpairtest(ttest_ind, method='bonf')[0])

    pd.options.display.float_format = '{:.3f}'.format
    print(df.groupby(['subcluster']).describe())
def anova_analysis(df):
    """
    anova_analysis takes in a data frame and performs an anova test for
    hypothesis testing 1, then prints out the test results.

    Seats sold are summed per ('week_ending', 'Holiday') and compared
    across the four holiday categories. A box plot is saved as
    'ANOVA.png', and the group means and a Tukey HSD comparison are
    printed.
    """
    time_periods = df.groupby(['week_ending', 'Holiday'],
                              as_index=False)[['seats_sold']].sum()

    # One sample of weekly totals per holiday category
    categories = ('ThanksGiving', 'WinterBreak', 'SummerBreak', 'Not Holiday')
    samples = [
        time_periods.loc[time_periods['Holiday'] == category, 'seats_sold']
        for category in categories
    ]
    f, p = stats.f_oneway(*samples)
    print('The f and p of ANOVA analysis are:')
    print(f, p)

    ## plot the distribution of each group
    time_periods.boxplot('seats_sold', by='Holiday', figsize=(12, 8))
    fileName = 'ANOVA.png'
    plt.savefig(fileName)

    print("The mean seats sold of each time periods:")
    print(time_periods.groupby('Holiday')['seats_sold'].mean())

    pairwise = MultiComparison(time_periods['seats_sold'],
                               time_periods['Holiday'])
    result = pairwise.tukeyhsd()
    print(pairwise)
    print(result)
def get_significance_booleans(data):
    '''
    Perform multiple comparisons (pairwise t-tests with Holm correction).

    parameters
    ----------
    data: Series
        must have a single level index containing the group labels (id).
        values are the results to be compared

    returns
    -------
    booleans: Series
        the 'reject' flags of the pairwise tests, indexed by
        ('group1', 'group2')
    '''
    group_labels = data.index.values
    observations = data.values

    comparison = MultiComparison(observations, group_labels)
    # Pairwise independent t-tests; element 2 of the returned tuple is the
    # array of per-pair results.
    holm_outcome = comparison.allpairtest(ttest_ind, method='holm')
    pair_table = DataFrame(holm_outcome[2])

    return pair_table.set_index(['group1', 'group2'])['reject']
def create_df_with_all_post_hoc(df, hue='value'):
    """Collect the Tukey HSD post-hoc results of every variable in one table.

    For each variable, a one-way ANOVA on 'level' is fitted and a Tukey
    HSD test is run on the data returned by ``get_data_for_anova``. The
    post-hoc rows are stored when the ANOVA p-value is below 0.05;
    otherwise the rows of that variable are filled with 'anova ns'.

    Returns a DataFrame indexed by (variable, comparison) with the columns
    'group1', 'group2', 'meandiff', 'lower', 'upper' and 'reject'.
    """
    variables = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm",
        "dtml", "rcm"
    ]
    comparisons = ["comparaison 0", "comparaison 1", "comparaison 2"]
    columns = ['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject']

    # Start from an all-NaN table with one row per (variable, comparison)
    blank = np.full((len(variables) * len(comparisons), len(columns)), np.nan)
    full_df = pd.DataFrame(blank,
                           index=pd.MultiIndex.from_product(
                               [variables, comparisons]),
                           columns=columns)

    for v in variables:
        data = get_data_for_anova(df, v, hue)
        results = ols("{} ~ C(level)".format(v), data=data).fit()
        post_hoc = MultiComparison(data[v], data['level']).tukeyhsd()
        df_ph = create_df_post_hoc(post_hoc, v)
        full_df.loc[v] = df_ph.values if results.f_pvalue < 0.05 else 'anova ns'

    return full_df
def _calc_firing_rate(self,
                      num_peaks: pd.DataFrame,
                      epoch: str = "All_cells"):
    """
    Compare the values of `num_peaks` between its columns with a Tukey HSD
    test and print the result, the pairwise p-values and the mean of each
    column.

    The column means are printed even when the Tukey computation fails.

    :param num_peaks: Table whose columns are the groups to compare
        (presumably one column per epoch - confirm against the caller).
    :param epoch: Label used in the printed p-value header.
    :return: None
    """
    # Long format: one value per (row, column) pair of the input table.
    split_data = num_peaks.stack()
    mc = MultiComparison(split_data.values,
                         split_data.index.get_level_values(1).values)
    try:
        res = mc.tukeyhsd()
    except ValueError:
        aprint("<yellow>Failed during the p-value calculation.</yellow>")
    else:
        print(res)
        # NOTE(review): the cell count divides by 3, which assumes three
        # groups - confirm.
        print(
            f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
            psturng(
                np.abs(res.meandiffs / res.std_pairs),
                len(res.groupsunique),
                res.df_total,
            ),
        )
    finally:
        # `Series.mean(level=...)` no longer exists in current pandas;
        # grouping on the index level is the supported equivalent, and
        # sort=False keeps the original group order.
        print(split_data.groupby(level=1, sort=False).mean())
def Tukey(self, Categorical, Continuous):
    """
    Calculate Tukey Honest Significance Difference (HSD) Test, to identify
    the groups whose distributions are significantly different.

    Rows of ``self.df`` with missing values are dropped first.

    Returns a DataFrame with the columns 'Group 1', 'Group 2',
    'Mean Difference' and 'Reject', one row per pair of groups. The
    columns are stacked into a single array before the frame is built.
    """
    temp_df = self.df.dropna()
    start = time.time()

    mc = MultiComparison(temp_df[Continuous], temp_df[Categorical])
    result = mc.tukeyhsd()

    # Translate the pair indices back into group labels
    labels = mc.groupsunique
    first_of_pair, second_of_pair = mc.pairindices[0], mc.pairindices[1]
    group1 = [labels[position] for position in first_of_pair]
    group2 = [labels[position] for position in second_of_pair]

    meandiffs = [round(float(diff), 3) for diff in result.meandiffs]
    stacked = np.column_stack((group1, group2, meandiffs, result.reject))
    TukeyResult = pd.DataFrame(
        stacked,
        columns=['Group 1', "Group 2", "Mean Difference", "Reject"])

    end = time.time()
    if self.debug == 'YES':
        print('Tukey', end - start)
    return TukeyResult
def preform_anova(df_anova, post_hoc=False):
    """Fit ``err ~ u + m + n + h + q + prob`` and report a type-2 ANOVA.

    An effect-size column 'r_sq' is added to the ANOVA table, computed per
    term as ``sum_sq / (sum_sq + residual sum_sq)``. The table is printed
    and written to 'analysis/anova_results.csv'.

    Parameters
    ----------
    df_anova:
        DataFrame with the columns 'err', 'u', 'm', 'n', 'h', 'q', 'prob'.
    post_hoc:
        When true, a Tukey HSD comparison of 'err' across the values of
        each term with p < 0.05 is printed.
    """
    ### perform anova
    # Alternatives that were considered: categorical main effects
    # ('err ~ C(u) + C(m) + ...') and the full interaction model
    # ('err ~ C(u) * C(m) * ...').
    formula = 'err ~ u + m + n + h + q + prob'
    model = ols(formula, data=df_anova).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)  # Type 2 ANOVA DataFrame

    ### effect size
    # The last row of the table is the residual; .iloc makes the positional
    # access explicit on this label-indexed Series.
    sum_sq = aov_table['sum_sq']
    effect_sq = sum_sq.iloc[:-1]
    aov_table.loc[:, 'r_sq'] = effect_sq / (effect_sq + sum_sq.iloc[-1])
    print(aov_table)

    if post_hoc:
        for name, row in aov_table.iterrows():
            # The residual row has a NaN p-value, so it never passes.
            if row['PR(>F)'] < 5e-2:
                # MultiComparison takes (data, groups): the response 'err'
                # is the data and the significant term defines the groups.
                mc = MultiComparison(df_anova['err'], df_anova[name])
                mc_results = mc.tukeyhsd()
                print('post_hoc')
                print(mc_results)

    aov_table.to_csv('analysis/anova_results.csv')
def calc_Tukey(f):
    '''
    Run a Tukey HSD test for one feature.

    f= name of feature; the 'value' column of ``data.grouped_features[f]``
    is compared between the groups given by its 'KD' column.

    Returns the Tukey HSD result.
    '''
    feature_table = data.grouped_features[f]
    comparison = MultiComparison(feature_table['value'],
                                 data.grouped_features[f]['KD'])
    return comparison.tukeyhsd()
def posthoc_turron_by_first_time_tasting(melted, variable):
    """Print Tukey HSD p-values and table for turron/first-time groups.

    Only the rows of `melted` whose 'variable' column equals `variable`
    are used, and rows with missing values are dropped. The groups are the
    labels "<turron>_<first_time_tasting>" and the compared values come
    from the 'value' column. `melted` itself is not modified.
    """
    from statsmodels.stats.libqsturng import psturng

    working = melted.copy()
    selected = working[working['variable'] == variable].dropna()
    selected['turron:first_time_tasting'] = (
        selected['turron'] + "_" + selected['first_time_tasting'])

    comparison = MultiComparison(selected['value'],
                                 selected['turron:first_time_tasting'])
    res = comparison.tukeyhsd()

    # Pairwise p-values from the studentized range distribution
    studentized = np.abs(res.meandiffs / res.std_pairs)
    print(psturng(studentized, len(res.groupsunique), res.df_total))
    print(res)
def tukey(prediction: 'array of ints', model: 'array of string'):
    """
    Perform a Tukey HSD test, print the result and return it.

    input: array(int(predictions)), array(str(model_names))
    The test tells whether there are significant differences between the
    classes.
    """
    outcome = MultiComparison(prediction, model).tukeyhsd()
    print(outcome)
    return outcome
def perform_post_hoc_tukey(data, factor1, factor2, factor3, factor2_idx):
    """Print a Tukey HSD table of `factor1` by `factor3` per `factor2` level.

    Parameters
    ----------
    data:
        DataFrame containing the columns named by the three factors.
    factor1:
        Column holding the values to compare.
    factor2:
        Column used to split the data. Its levels are taken to be the
        integers 1..k.
    factor3:
        Column holding the group labels.
    factor2_idx:
        Sized collection with one entry per level of `factor2`; only its
        length is used.
    """
    for level in range(1, len(factor2_idx) + 1):
        print(factor2, ': ', level)
        # Values and group labels both come from the same rows of `data`,
        # so they stay aligned and no module-level frame is needed.
        level_rows = data[data[factor2] == level]
        mc = MultiComparison(level_rows[factor1], level_rows[factor3])
        result = mc.tukeyhsd()
        print(result)
        print(mc.groupsunique)
def tukey_hsd(self, stacked_df, colname):
    '''
    Tukey HSD for post-ANOVA comparison between states.

    input:
        stacked_df: from table_transform, a stacked df
        colname: string, the column holding the values to compare
    return:
        summary table of the Tukey HSD result, grouped by the 'state'
        column
    '''
    comparison = MultiComparison(stacked_df[colname], stacked_df['state'])
    tukey_result = comparison.tukeyhsd()
    return tukey_result.summary()
def get_multiplecomparisons(self, dataframe, test):
    """Print Bonferroni-corrected pairwise comparisons between columns.

    Rows with missing values are dropped and the cleaned frame is printed.
    Its columns are then compared pairwise with a paired t-test when
    `test` is 'ttest', or with the Wilcoxon test for any other value.
    """
    # If distributions are different then do multiple comparisons
    dataframe = dataframe.dropna()
    print(dataframe)

    # Long format: one row per (Bin, Value) observation
    cleanbin = dataframe.melt(var_name='Bin', value_name='Value')
    comparison = MultiComparison(cleanbin['Value'], cleanbin['Bin'])

    pair_test = (scipy.stats.ttest_rel
                 if test == 'ttest' else scipy.stats.wilcoxon)
    comp = comparison.allpairtest(pair_test, method='Bonf')
    print(comp[0])
def tukey(data, names):
    """Print a Tukey HSD table and the unique groups.

    `data` is a sequence of sequences of values; it is flattened in order.
    `names` holds one group label per flattened value.
    """
    names = np.array(names)
    flattened = np.array([value for item in data for value in item])

    comparison = MultiComparison(flattened, names)
    print(comparison.tukeyhsd())
    print(comparison.groupsunique)