def kruskal_wallis(df, independent_variable, dependent_variable):
    """Run a Kruskal-Wallis test and package the outcome for display.

    Args:
        df: Data handed to ``kruskal`` as its ``data`` argument.
        independent_variable: Column name used as the grouping factor
            (``between``).
        dependent_variable: Column name holding the measured values (``dv``).

    Returns:
        A ``(p_value, result)`` tuple. ``p_value`` is the uncorrected p-value
        limited to four decimal places; ``result`` is a dict describing the
        test (name, p-value, both variables, null hypothesis, assumptions).
    """
    kruskal_result = kruskal(data=df,
                             dv=dependent_variable,
                             between=independent_variable)

    # Read the single result row by position. An integer key on a Series whose
    # index is not integer-based relies on a deprecated pandas fallback.
    raw_p_value = kruskal_result["p-unc"].iloc[0]
    # Keep four decimal places only.
    p_value = float("%.4f" % raw_p_value)

    info = (
        "Assumes that dependent variable ('{0}') is ordinal or continuous, "
        "that the independent variable ('{1}') consists of more than 2 groups "
        "and that these groups follow the same distribution "
        "(the shape on a histogram).\n NOTE: It is also possible to perform "
        "this test on categories containing just 2 groups, however we have "
        "not done so as it could conflict with results from Mann-Whitney U "
        "test (performed on categories with 2 groups only)."
    ).format(dependent_variable, independent_variable)

    result = {
        "test": "Kruskall Wallis Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
        "info": info,
    }
    return p_value, result
def calculate_kruskalwallish(dataframe, col_name="", p_adj_method="none",
                             stats_id="", save_files=False,
                             save_dir="/Users/kyleweber/Desktop/"):
    """Kruskal-Wallis test across the "Cohort" column plus pairwise follow-ups.

    Args:
        dataframe: Data containing ``col_name``, "Cohort" and "Participant".
        col_name: Name of the dependent-variable column.
        p_adj_method: Forwarded to ``pg.pairwise_ttests`` as ``padjust``.
        stats_id: Prefix for the CSV file names when saving.
        save_files: When true, both result tables are written as CSV.
        save_dir: Directory prefix; it is concatenated directly with
            ``stats_id``, so it must end with a path separator.

    Returns:
        ``(stats_main, stats_pairwise_t)``; each table carries a "Sig." column
        holding "*" where the uncorrected p-value is below .05, else " ".
    """
    # Omnibus test.
    stats_main = pg.kruskal(data=dataframe, dv=col_name, between="Cohort",
                            detailed=True)
    main_flag = "*" if stats_main["p-unc"].iloc[0] < .05 else " "
    stats_main["Sig."] = [main_flag]

    # Pairwise comparisons are always computed, whatever the omnibus result.
    pairwise_options = dict(
        data=dataframe,
        dv=col_name,
        between="Cohort",
        within=None,
        subject="Participant",
        parametric=False,
        marginal=True,
        alpha=.05,
        tail="two-sided",
        padjust=p_adj_method,
        effsize="cohen",
        correction='auto',
    )
    stats_pairwise_t = pg.pairwise_ttests(**pairwise_options)
    stats_pairwise_t["Sig."] = [
        "*" if p_unc < .05 else " " for p_unc in stats_pairwise_t["p-unc"]
    ]

    if save_files:
        print(f"Saving files to {save_dir}")
        file_prefix = save_dir + stats_id
        stats_main.to_csv(file_prefix + "_Main.csv")
        stats_pairwise_t.to_csv(file_prefix + "_PairwiseT.csv", index=False)

    return stats_main, stats_pairwise_t
def apply(self, alpha=0.05, plot=True, filename="kruskal", use_latex=False):
    """Run the Kruskal-Wallis test and, when significant, the post-hoc steps.

    Args:
        alpha: Threshold compared against the uncorrected p-value.
        plot: When true, draw a box/strip plot of the data and save it.
        filename: Output path without extension for the saved figure.
        use_latex: Save the figure as ``.pgf`` instead of ``.pdf``.

    Returns:
        ``(kruskal, post_hoc)``. ``kruskal`` is the pingouin result table.
        ``post_hoc`` is ``None`` when the p-value is not below ``alpha``;
        otherwise a list with the Nemenyi post-hoc result, the multiple
        comparisons result and the ``VD_A`` effect size.
    """
    kruskal = pg.kruskal(dv=self.val_col, between=self.group_col, data=self.df)
    # Read the single result row by position. An integer key on a Series whose
    # index is not integer-based relies on a deprecated pandas fallback.
    pvalue = kruskal['p-unc'].iloc[0]

    if plot:
        chi_squared = kruskal['H'].iloc[0]
        degree_freed = kruskal['ddof1'].iloc[0]

        # Report the p-value as the tightest conventional bound it satisfies.
        if pvalue < 0.001:
            p = "< 0.001"
        elif pvalue < 0.01:
            p = "< 0.01"
        elif pvalue < 0.05:
            p = "< 0.05"
        else:
            p = round(pvalue, 3)

        plt.figure(figsize=(70, 8))
        sns.boxplot(x=self.group_col, y=self.val_col, data=self.df)
        # Jittered BoxPlots
        sns.stripplot(x=self.group_col, y=self.val_col, data=self.df,
                      size=4, jitter=True, edgecolor="gray")
        # Add mean and median lines
        plt.axhline(y=self.df[self.val_col].mean(), color='r',
                    linestyle='--', linewidth=1.5)
        plt.axhline(y=self.df[self.val_col].median(), color='b',
                    linestyle='--', linewidth=2)
        plt.title("")
        plt.suptitle("")
        plt.xlabel(
            f"\nKruskal-Wallis chi-squared = {chi_squared}, df = {degree_freed}, p = {p}",
            labelpad=20)
        plt.ylabel('')
        plt.savefig(filename + ('.pgf' if use_latex else '.pdf'),
                    bbox_inches='tight')
        plt.clf()

    # If the Kruskal-Wallis test is significant, a post-hoc analysis can be
    # performed to determine which levels of the independent variable differ
    # from each other level.
    if pvalue < alpha:
        eff = effect_size(self.df, self.val_col, self.group_col)
        return kruskal, [
            self._post_hoc_nemenyi(),
            self._kruskal_multiple_comparisons(),
            eff.VD_A(),
        ]

    return kruskal, None
def kw_test(err_or_dt, var, min_exp_bound=-float('inf'), ignore_exp=()):
    """Print a Kruskal-Wallis test of ``var`` for each pair of adjacent steps.

    Args:
        err_or_dt: Selector forwarded to ``get_err_or_dt_dict``; also used to
            build the results directory name.
        var: Name of the dependent-variable column to test.
        min_exp_bound: Forwarded to ``get_err_or_dt_dict``.
        ignore_exp: Forwarded to ``get_err_or_dt_dict``.

    Returns:
        None. The step pair and the test table are printed.
    """
    # Function-scope import: DataFrame.append was removed in pandas 2.0, so
    # the two frames are combined with pandas.concat instead.
    import pandas as pd

    err_or_dt_dict, _min_exp, _max_exp = get_err_or_dt_dict(
        f'results/diff_dens/diff_{err_or_dt}',
        err_or_dt,
        min_exp_bound=min_exp_bound,
        ignore_exp=ignore_exp)

    step_items = tuple(err_or_dt_dict.items())

    # Tag every row with its step so it can serve as the grouping factor.
    for s, df in step_items:
        df['step'] = np.repeat(s, len(df))

    # Compare each step against the one that follows it.
    for (s1, df1), (s2, df2) in zip(step_items[:-1], step_items[1:]):
        print(s1, s2)
        data = pd.concat([df1, df2])
        print(pg.kruskal(data, dv=var, between='step'))
        print()
def apply(self, ax, alpha=0.05, plot=True, ylabel=''):
    """Run the Kruskal-Wallis test, optionally plot on ``ax``, then post-hoc.

    Args:
        ax: Matplotlib axes that receive the box/strip plot.
        alpha: Threshold compared against the uncorrected p-value.
        plot: When true, draw the plot on ``ax``.
        ylabel: Label for the y axis.

    Returns:
        ``(kruskal, post_hoc)``. ``post_hoc`` is ``None`` when the result
        table has no ``'p-unc'`` column or the p-value is not below ``alpha``;
        otherwise a list with the Nemenyi post-hoc result and the ``VD_A_DF``
        effect-size table.
    """
    kruskal = pg.kruskal(dv=self.val_col, between=self.group_col, data=self.df)

    # Without a p-value there is nothing to plot or follow up on. Returning
    # here also guarantees `pvalue` is always bound below.
    if 'p-unc' not in kruskal.columns:
        return kruskal, None

    # Read the single result row by position. An integer key on a Series whose
    # index is not integer-based relies on a deprecated pandas fallback.
    pvalue = kruskal['p-unc'].iloc[0]

    if plot:
        # Report the p-value as the tightest conventional bound it satisfies.
        if pvalue < 0.001:
            p = "< 0.001"
        elif pvalue < 0.01:
            p = "< 0.01"
        elif pvalue < 0.05:
            p = "< 0.05"
        else:
            p = round(pvalue, 3)

        sns.boxplot(x=self.group_col, y=self.val_col, data=self.df, ax=ax)
        # Jittered BoxPlots
        sns.stripplot(x=self.group_col, y=self.val_col, data=self.df,
                      size=4, jitter=True, edgecolor="gray", ax=ax)
        # Add mean and median lines
        ax.axhline(y=self.df[self.val_col].mean(), color='r',
                   linestyle='--', linewidth=1.5)
        ax.axhline(y=self.df[self.val_col].median(), color='b',
                   linestyle='--', linewidth=2)
        ax.set_ylabel(ylabel)
        ax.set_xlabel(f"\nKruskal-Wallis p-value = {p}", labelpad=15)

    # If the Kruskal-Wallis test is significant, a post-hoc analysis can be
    # performed to determine which levels of the independent variable differ
    # from each other level.
    if pvalue < alpha:
        return kruskal, [
            self._post_hoc_nemenyi(),
            VD_A_DF(self.df, self.val_col, self.group_col),
        ]

    return kruskal, None
def kruskall_wallis(survey_id, df, independent_variable, dependent_variable,
                    form):
    """Run a Kruskal-Wallis test and redirect to the result page.

    Args:
        survey_id: Identifier passed on to the result page as ``survey``.
        df: Data handed to ``kruskal``.
        independent_variable: Grouping column name (``between``).
        dependent_variable: Measured column name (``dv``).
        form: Form re-rendered when the dependent variable is rejected.

    Returns:
        The re-rendered analysis page (with a flashed error) when the
        dependent variable has a string dtype; otherwise a redirect to
        ``analysis.result`` carrying the p-value formatted to four decimals.
    """
    if is_string_dtype(df[dependent_variable]):
        flash(
            "Dependent Variable '" + dependent_variable +
            "' is not numeric.", "danger")
        return render_template("analysis/analysedata.html", form=form)

    kruskal_result = kruskal(data=df,
                             dv=dependent_variable,
                             between=independent_variable)

    # Read the single result row by position. An integer key on a Series whose
    # index is not integer-based relies on a deprecated pandas fallback.
    # The p-value is formatted to four decimal places.
    p_value = "%.4f" % kruskal_result["p-unc"].iloc[0]

    return redirect(
        url_for('analysis.result',
                survey=survey_id,
                test="Kruskall Wallis Test",
                p_value=p_value,
                independent_variable=independent_variable,
                dependent_variable=dependent_variable))
def kruskal_test(df, dependent_variable, between):
    """Run a Kruskal-Wallis one-way analysis of variance.

    Args:
        df: Data handed to ``kruskal`` as ``data``.
        dependent_variable: Column name of the measured values (``dv``).
        between: Column name of the grouping factor.

    Returns:
        Whatever ``kruskal`` returns for these arguments.
    """
    test_arguments = {
        "data": df,
        "dv": dependent_variable,
        "between": between,
    }
    return kruskal(**test_arguments)
def analyse(survey_id):
    """Show the analysis form for a survey and run the selected test.

    The survey is loaded by id (404 when missing) and access is limited to its
    owner (403 otherwise). The form's variable selectors are filled with the
    data file's column names. On a valid submission the chosen test is run and
    the user is redirected to the result page. Any validation problem flashes
    a message and re-renders the form.

    Args:
        survey_id: String id of the survey document.

    Returns:
        A rendered template or a redirect response.
    """
    template = "analysis/analysedata.html"

    form = StatisticalTestForm()
    survey = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    if survey["user"] != current_user._id:
        flash("You do not have access to that page", "danger")
        abort(403)
    df = read_file(survey["fileName"])

    # Populate the select options in the form with all the variables
    for variable in list(df.columns.values):
        form.independent_variable.choices.append((variable, variable))
        form.dependent_variable.choices.append((variable, variable))

    if not form.validate_on_submit():
        return render_template(template, form=form)

    independent_variable = form.independent_variable.data
    dependent_variable = form.dependent_variable.data

    # Ensure the user hasn't selected the same variable for both
    if independent_variable == dependent_variable:
        flash("You can't select the same variable for both.", "danger")
        return render_template(template, form=form)

    test = form.test.data

    # Chi-Square goodness of fit is handled by a separate URL
    if test == "Chi-Square goodness of fit":
        return redirect(
            url_for('analysis.chi_goodness',
                    variable=independent_variable,
                    survey_id=survey_id))

    # The other tests all require a dependent variable
    if dependent_variable == "":
        flash("You must select a dependent variable for this test.", "danger")
        return render_template(template, form=form)

    # Both rank-based tests need a non-string dependent variable.
    rank_based_tests = ("Kruskall Wallis Test", "Mann-Whitney U Test")
    if test in rank_based_tests and is_string_dtype(df[dependent_variable]):
        flash(
            "Dependent Variable '" + dependent_variable +
            "' is not numeric.", "danger")
        return render_template(template, form=form)

    if test == "Kruskall Wallis Test":
        kruskal_result = kruskal(data=df,
                                 dv=dependent_variable,
                                 between=independent_variable)
        # Read the single result row by position. An integer key on a Series
        # whose index is not integer-based relies on a deprecated pandas
        # fallback. The p-value is formatted to four decimal places.
        p_value = "%.4f" % kruskal_result["p-unc"].iloc[0]
    elif test == "Mann-Whitney U Test":
        # AT THE MOMENT, THIS TEST IS 2 TAILED.
        # MAY WANT TO ADD OPTIONS FOR 1 TAILED TESTS
        group_by = df.groupby(independent_variable)
        group_array = [group_by.get_group(x) for x in group_by.groups]
        if len(group_array) > 2:
            flash(
                "Independent variable '" + independent_variable +
                "' has too many groups, only 2 allowed for Mann-Whitney U Test.",
                "danger")
            return render_template(template, form=form)
        if len(group_array) < 2:
            # Previously reported as "too many groups", which was misleading.
            flash(
                "Independent variable '" + independent_variable +
                "' has too few groups, exactly 2 required for Mann-Whitney U Test.",
                "danger")
            return render_template(template, form=form)
        x = group_array[0][dependent_variable].values
        y = group_array[1][dependent_variable].values
        mwu_result = mwu(x, y)
        p_value = "%.4f" % mwu_result['p-val'].values[0]
    elif test == "Chi-Square Test":
        contingency_table = pd.crosstab(df[independent_variable],
                                        df[dependent_variable])
        _, p_value, _, _ = chi2_contingency(contingency_table,
                                            correction=False)
    else:
        # No branch computed a p-value. Without this guard the redirect below
        # would fail on an unbound local.
        flash("Test '" + str(test) + "' is not supported.", "danger")
        return render_template(template, form=form)

    return redirect(
        url_for('analysis.result',
                survey=survey_id,
                test=test,
                p_value=p_value,
                independent_variable=independent_variable,
                dependent_variable=dependent_variable))
# Let the user pick the palette used by the plots.
color_palette = expand.selectbox(
    "Choose color palette",
    ("Set2", "Accent", "Blues", "BrBG", "Dark2", "GnBu", "Greys", "Oranges",
     "Paired", "Pastel1", "Purples", "Set1", "Set3", "Spectral", "Wistia",
     "autumn", "binary", "cividis", "cool", "coolwarm", "icefire", "inferno",
     "magma", "ocean", "plasma", "rainbow", "summer", "twilight", "viridis",
     "winter"))

st.header("Difference in means between groups results")

# Per-group descriptive statistics of the dependent variable.
st.success("Descriptive statistics are being calculated")
function_dict = {x_var: ["mean", "std", "sem", "count"]}
new = pd.DataFrame(df.groupby(y_var).aggregate(function_dict))
st.write(new)

# Omnibus test.
results = pg.kruskal(data=df, dv=x_var, between=y_var, detailed=True)
st.success("Kruskal-Wallis non-parametric ANOVA results")
st.write(results)

# Post-hoc pairwise comparisons.
st.success("Games-Howell multiple comparisons")
# Keep the unrounded table for the significance filter. Filtering on values
# rounded to 3 decimals turns p-values in [0.0495, 0.05) into 0.05 and drops
# pairs that are actually significant.
games_howell_raw = pg.pairwise_gameshowell(dv=x_var, between=y_var, data=df)
games_howell = games_howell_raw.round(3)
st.write(games_howell)
df_filtered = games_howell[games_howell_raw['pval'] < 0.05][['A', 'B']]
tuples = [tuple(x) for x in df_filtered.to_numpy()]

st.markdown("## ")
st.success("Bar plots with errors are being generated")
fig = plt.figure(figsize=(12, 6))
error = None
df = pd.read_excel(file, header=0, index_col=0) df.index = df['condition'] df = df.loc[groups] for cat, indx in zip(['Local', 'Distal'], range(2)): print('') print('{} side'.format(cat)) sn.boxplot(x='condition', y=cat, data=df, ax=ax[indx], palette=colors) sn.swarmplot(x='condition', y=cat, data=df, ax=ax[indx], color='0.5') print(pg.kruskal(data=df, dv=cat, between='condition')) print('') # kw = stats.kruskal(df.loc[groups[0],cat].values, # df.loc[groups[1],cat].values, # df.loc[groups[2],cat].values, # df.loc[groups[3],cat].values, # df.loc[groups[4],cat].values, # df.loc[groups[5],cat].values, # df.loc[groups[6],cat].values) # print (kw) for group in groups:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg

# Load the nesting data and keep only the rows for the 1.0 mg/Kg dose.
nest_raw = pd.read_csv(
    '/Users/labc02/Documents/PDCB_data/MK-project/Nesting Data.csv')
nest_1mg = nest_raw.loc[nest_raw['Dose (mg/Kg)'].eq(1.0)]
nest_1mg

# The score is ordinal, so a Kruskal-Wallis test is used.
# Check homoscedasticity first.
pg.homoscedasticity(data=nest_1mg, dv='Nesting Score', group='Genotype')
nest_kw = pg.kruskal(data=nest_1mg, dv='Nesting Score', between='Genotype')
nest_kw
#%%
# Box plot of the nesting score per treatment, split by genotype.
nest_fig = plt.figure(figsize=(4, 3))
mean_marker = {'marker': '+', 'markeredgecolor': 'k'}
genotype_colors = ['forestgreen', 'royalblue']
sns.boxplot(x='Tx',
            y='Nesting Score',
            hue='Genotype',
            data=nest_1mg,
            palette=genotype_colors,
            showmeans=True,
            meanprops=mean_marker,
            width=.5)
plt.legend(frameon=False, loc='lower right')
plt.xlabel('Treatment')
test=test, text_format='star', loc='inside', verbose=2, comparisons_correction=correction) sn.boxplot(data=distributionsDf, palette=pal, ax=box[4]) #sn.swarmplot(data=distributionsDf,ax=box[4],color='0.25') if saveData == True: allData.to_excel('{}/Average_Amplitudes.xlsx'.format(saveDir)) if statsToDo == True: kruskalAvg = pingouin.kruskal(data=allData, dv='Average amplitudes (pA)', between='Condition') print('Avg Amp') print(kruskalAvg) box[0].set_title('p(KW)={}'.format(round(kruskalAvg['p-unc'][0], 7))) kruskalTotal = pingouin.kruskal(data=allData, dv='Total amplitudes (pA)', between='Condition') print('Total Amp') print(kruskalTotal) box[1].set_title('p(KW)={}'.format(round(kruskalTotal['p-unc'][0], 7))) kruskalProportion = pingouin.kruskal(data=allData, dv='Propotion (%)', between='Condition')
print(' {} vs {} p-val={}'.format( colA, colB, mwu_test[1])) else: print('KW test failed') plt.tight_layout() #Inter group stats print('') print('------Inter Group Statistics-------') globalDf = pd.concat(globalDf, axis=0) for col in deepValues: interKruskal = pg.kruskal(data=globalDf, dv=col, between='condition') print('Range:{}'.format(col)) print(interKruskal) print('') #post hoc MWU for groupA in groups: serieA = globalDf[col].loc[(globalDf['condition'] == groupA)] for groupB in groups: serieB = globalDf[col].loc[(globalDf['condition'] == groupB)] interPostMwu = stats.mannwhitneyu(serieA, serieB, alternative='two-sided')
def qualOrdinalUnpaired(imgDir, sheetName, sheetDf, sheetScale, silent=False):
    """Analyse one sheet of unpaired ordinal ratings and plot the outcome.

    Each column of ``sheetDf`` is one modality. A Kruskal-Wallis test over all
    modalities is run when there are more than two of them. Every pair of
    modalities is then compared with a two-sided Mann-Whitney test, and the
    pairwise p-values are Holm-corrected. The results are handed to
    ``StackedBarPlotter``.

    Args:
        imgDir: Directory that receives ``<sheetName>.png``.
        sheetName: Name used for the banner, plot title and file name.
        sheetDf: Wide table, one column per modality.
        sheetScale: Number of scale steps; steps ``0..sheetScale-1`` missing
            from the contingency table are added with zero counts.
        silent: When true, the banner is not printed.

    Returns:
        None.
    """
    if not silent:
        print("######################################## ", sheetName,
              " ########################################")

    meltedSheetDf = sheetDf.melt(var_name='Factor', value_name='Variable')
    contingencySheetDf = pd.crosstab(index=meltedSheetDf['Variable'],
                                     columns=meltedSheetDf['Factor'])

    # Fill in scale values that nobody selected.
    for sheetStep in range(sheetScale):
        if sheetStep not in contingencySheetDf.index.values:
            contingencySheetDf.loc[sheetStep] = (
                [0] * len(contingencySheetDf.columns.values))
    contingencySheetDf.sort_index(inplace=True)

    statColumns = ['COMPARISON', 'TEST', 'STATISTICS', 'P-VALUE', 'EFFECT SIZE']
    # Rows are collected in a list and turned into a frame once.
    # DataFrame.append was removed in pandas 2.0.
    statRows = []

    # ALL MODALITY
    if len(contingencySheetDf.columns) > 2:
        sheetDf_long = sheetDf.melt(ignore_index=False).reset_index()
        kruskal_stats = pg.kruskal(data=sheetDf_long,
                                   dv="value",
                                   between="variable")
        source, ddof1, hvalue, pvalue = kruskal_stats.values[0]
        statRows.append({
            'COMPARISON': 'ALL',
            'TEST': "Kruskal-Wallis",
            'STATISTICS': hvalue,
            'P-VALUE': pvalue,
            'EFFECT SIZE': -1
        })

    # BETWEEN MODALITY
    modality_names = sheetDf.columns.values
    # Rows from this position on are the pairwise tests to be corrected.
    uncorrectedStatIndex = len(statRows)
    for i, firstName in enumerate(modality_names):
        for secondName in modality_names[i + 1:]:
            stats_mannwhitney = pg.mwu(x=sheetDf.loc[:, firstName],
                                       y=sheetDf.loc[:, secondName],
                                       alternative='two-sided')
            uvalue, alternative, pvalue, RBC, CLES = (
                stats_mannwhitney.values[0])
            statRows.append({
                'COMPARISON': firstName + '|' + secondName,
                'TEST': "Mann-Whitney",
                'STATISTICS': uvalue,
                'P-VALUE': pvalue,
                'EFFECT SIZE': RBC
            })

    statDf = pd.DataFrame(statRows, columns=statColumns)

    # Holm-correct the pairwise p-values. Skipped when there are no pairs,
    # e.g. a sheet with a single modality.
    if len(statDf.index) > uncorrectedStatIndex:
        reject, correctedPvalues = pg.multicomp(
            statDf.loc[uncorrectedStatIndex:, 'P-VALUE'].values,
            alpha=0.05,
            method="holm")
        statDf.loc[uncorrectedStatIndex:, 'P-VALUE'] = correctedPvalues

    StackedBarPlotter.StackedBarPlotter(filename=imgDir + '/' + sheetName +
                                        '.png',
                                        title=sheetName,
                                        dataDf=sheetDf,
                                        histDf=contingencySheetDf,
                                        statDf=statDf)
file = 'E:/03_FORMATED_DATA/BEHAVIOR/Catwalk_Norm_Profiles_Cuff_Sham_Ctrl.xlsx' palette = ['royalblue','0.5','lightcoral'] fig, ax = plt.subplots(1,4, sharex=False, sharey=True) df = pd.read_excel(file, header=0) #Easy first, peak amplitude at D15 postOp15 = df[['post_op_15','Condition']] sn.boxplot(x='Condition',y='post_op_15',data=df,ax=ax[0],palette=['lightcoral','0.5','royalblue']) #sn.swarmplot(x='Condition',y='post_op_15',data=postOp15,ax=ax[0],color='black') #[1] --------------------Stats on PostOp15 peak------------------------------------- postOp15_KW = pg.kruskal(data=postOp15,dv='post_op_15',between='Condition') postOp15_Anova = pg.anova(data=postOp15,dv='post_op_15',between='Condition') print('Analysis of behavioral features') print ('[1]--------------------- Post Op 15 ------------------------') print('Average values') print(postOp15.groupby('Condition').mean()) print('') print('STD') print(postOp15.groupby('Condition').std()) print('') print('Multi condition test') print ('Kruskal Wallis') print (postOp15_KW) if postOp15_KW['p-unc'].values <= 0.05:
stackeddelta = np.hstack((np.squeeze(summarydelta['state1delta']), np.squeeze(summarydelta['state2delta']))) whichstate = np.ones(stackeddelta.shape[0]) * 2 whichstate[0:(np.squeeze(summarydelta['state1delta']).size)] = 1 delta_df = pd.DataFrame({ 'standarddelta': stackeddelta, 'brainstate': whichstate }) aov = anova(dv='standarddelta', between='brainstate', data=delta_df, detailed=True) allFstat[p] = aov['F'][0] ANOVApvals[p] = aov['p-unc'][0] kw = kruskal(dv='standarddelta', between='brainstate', data=delta_df, detailed=True) allHstat[p] = kw['H'][0] KWpvals[p] = kw['p-unc'][0] nANOVAsig001 = np.sum(ANOVApvals < .001) print( f'There were {nANOVAsig001} significant differences by ANOVA (alpha = .001) of {npatients} patients between mean standardized delta across both brain-derived states' ) nANOVAsig01 = np.sum(ANOVApvals < .01) print( f'There were {nANOVAsig01} significant differences by ANOVA (alpha = .01) of {npatients} patients between mean standardized delta across both brain-derived states' ) nANOVAsig05 = np.sum(ANOVApvals < .05) print( f'There were {nANOVAsig05} significant differences by ANOVA (alpha = .05) of {npatients} patients between mean standardized delta across both brain-derived states'
def _concat_frames(frames):
    """Stack per-target result frames into one table (empty if none given)."""
    if not frames:
        return pandas.DataFrame()
    return pandas.concat(frames, ignore_index=True)


def stats(model, quantity, data, targets, tw, rm, nd):
    """Run the statistical tests for every target and collect result tables.

    The test family is chosen from the flags:

    * ``nd == 'True'`` (parametric): t-test for two groups; one-way or two-way
      ANOVA (repeated-measures when ``rm == 'True'``) with pairwise t-tests
      for three or more groups.
    * otherwise (non-parametric): Wilcoxon (``rm == 'True'``) or Mann-Whitney U
      for two groups; Friedman (``rm == 'True'``) or Kruskal-Wallis with
      non-parametric pairwise tests for three or more groups.

    Args:
        model: ``'absolute'`` uses column ``NormMean`` (dropping
            ``NormQuant``); any other value uses ``rqMean`` (dropping ``rq``).
        quantity: Number of groups; 2 or >= 3 selects the test.
        data: Intermediate table with at least the columns ``Outliers``,
            ``Target Name``, ``Group`` (or ``Group1``/``Group2`` for two-way)
            and ``Sample Name`` for repeated measures.
        targets: Values of ``Target Name`` to analyse.
        tw: The string ``'False'`` for one-way designs; anything else is
            treated as two-way.
        rm: The string ``'True'`` for repeated (paired) measures.
        nd: The string ``'True'`` when the data is normally distributed.

    Returns:
        ``(stats_dfs, posthoc_dfs)``: the main test table and the post-hoc
        table. Both are empty frames when ``quantity`` is below 2, and the
        post-hoc table is empty for two-group comparisons.
    """
    # The flags arrive as strings. bool('False') is True, so they are compared
    # explicitly instead of being cast.
    repeated = rm == 'True'
    one_way = tw == 'False'

    if model == 'absolute':
        data = data.drop(['NormQuant'], axis=1)
        data['NormMean'] = data['NormMean'].astype(float)
        mean = 'NormMean'
    else:
        data = data.drop(['rq'], axis=1)
        data['rqMean'] = data['rqMean'].astype(float)
        mean = 'rqMean'

    # prepare data from intermediate dataframe
    data = data[data['Outliers'].eq(False)]
    data = data.drop_duplicates(keep='first')

    # Defaults so an unsupported quantity returns empty tables instead of
    # failing on unbound locals.
    stats_dfs = pandas.DataFrame()
    posthoc_dfs = pandas.DataFrame()
    # Per-target frames are collected and concatenated once.
    # DataFrame.append was removed in pandas 2.0.
    stats_frames = []
    posthoc_frames = []

    # t-test and anova for normally distributed data
    if nd == 'True':
        if quantity == 2:
            # T-Test between 2 groups
            group = data['Group'].dropna()
            group = group.drop_duplicates(keep='first').values.tolist()
            for item in targets:
                df = data[data['Target Name'].eq(item)]
                group1 = df[df['Group'].eq(group[0])][mean]
                group2 = df[df['Group'].eq(group[1])][mean]
                # Previously `paired=bool(rm)`, which paired every test.
                t_test = ttest(group1, group2, paired=repeated)
                t_test['paired'] = 'TRUE' if repeated else 'FALSE'
                t_test['Target Name'] = item
                stats_frames.append(t_test)
            stats_dfs = _concat_frames(stats_frames)

            # reformat output table
            stats_dfs = stats_dfs.rename(columns={
                'cohen-d': 'effect size',
                'BF10': 'Bayes factor',
                'dof': 'DF'
            })
            cols = [
                'Target Name', 'DF', 'T', 'tail', 'paired', 'p-val',
                'effect size', 'power', 'Bayes factor'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)

        elif quantity >= 3:
            # ANOVA test
            pvals = []
            for item in targets:
                target_data = data[data['Target Name'].eq(item)]
                if repeated:
                    if one_way:
                        # one-way repeated measure anova
                        aov = pg.rm_anova(dv=mean,
                                          data=target_data,
                                          within='Group',
                                          subject='Sample Name',
                                          detailed=True)
                        pvals.append(aov['p-unc'].iloc[0])
                        aov = aov.drop([1])
                        aov['measures'] = ['dependent']
                        aov['Target Name'] = item
                    else:
                        # two-way repeated measure anova
                        aov = pg.rm_anova(dv=mean,
                                          data=target_data,
                                          within=['Group1', 'Group2'],
                                          subject='Sample Name',
                                          detailed=True)
                        reject_tw, pval_corr_tw = pg.multicomp(
                            list(aov['p-unc']), alpha=0.05, method='bonf')
                        aov['p-value corrected'] = pval_corr_tw
                        aov['measures'] = ['dependent'] * 3
                        aov['Target Name'] = [item] * 3
                        # The 'eps' column needs no explicit drop; the final
                        # reindex below keeps only the listed columns.
                    ph = pairwise_ttests(data=target_data,
                                         dv=mean,
                                         within='Group',
                                         subject='Sample Name',
                                         padjust='fdr_bh')
                    ph['Target Name'] = item
                    ph['Test'] = 'T-Test'
                else:
                    if one_way:
                        # one-way anova
                        aov = pg.anova(dv=mean,
                                       between='Group',
                                       data=target_data,
                                       detailed=True)
                        pvals.append(aov['p-unc'].iloc[0])
                        aov = aov.drop([1])
                        aov['measures'] = ['independent']
                        aov['Target Name'] = item
                        ph = pairwise_ttests(data=target_data,
                                             dv=mean,
                                             between='Group',
                                             padjust='fdr_bh')
                        ph['Test'] = 'T-Test'
                    else:
                        # two-way anova
                        aov = pg.anova(dv=mean,
                                       between=['Group1', 'Group2'],
                                       data=target_data,
                                       detailed=False)
                        aov = aov.drop([3])
                        reject_tw, pval_corr_tw = pg.multicomp(
                            list(aov['p-unc']), alpha=0.05, method='bonf')
                        aov['p-value corrected'] = pval_corr_tw
                        aov['measures'] = ['independent'] * 3
                        aov['Target Name'] = [item] * 3
                        ph = pairwise_ttests(data=target_data,
                                             dv=mean,
                                             between=['Group1', 'Group2'],
                                             padjust='fdr_bh')
                        ph['Test'] = 'T-Test'
                    ph['Target Name'] = item
                stats_frames.append(aov)
                posthoc_frames.append(ph)

            stats_dfs = _concat_frames(stats_frames)
            posthoc_dfs = _concat_frames(posthoc_frames)

            # reformat output tables
            stats_dfs = stats_dfs.rename(columns={
                'p-unc': 'p-value',
                'np2': 'effect size'
            })
            if one_way:
                # Across-target correction. Only the one-way design collects
                # one p-value per target.
                reject, pvals_corr = pg.multicomp(pvals,
                                                  alpha=0.05,
                                                  method='bonf')
                stats_dfs['p-value corrected'] = pvals_corr
                stats_dfs['distribution'] = ['parametric'] * len(targets)
                stats_dfs['test'] = ['ANOVA'] * len(targets)
                stats_dfs['statistic'] = ['NA'] * len(targets)
            else:
                stats_dfs['distribution'] = ['parametric'] * (len(targets) * 3)
                stats_dfs['test'] = ['ANOVA'] * (len(targets) * 3)
                stats_dfs['statistic'] = ['NA'] * (len(targets) * 3)
            cols = [
                'Target Name', 'Source', 'DF', 'F', 'MS', 'SS', 'p-value',
                'p-value corrected', 'measures', 'distribution', 'test',
                'statistic', 'effect size'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)

            if one_way:
                posthoc_dfs = posthoc_dfs.drop(['Contrast', 'T'], axis=1)
            else:
                posthoc_dfs = posthoc_dfs.drop(['T'], axis=1)
            posthoc_dfs = posthoc_dfs.rename(
                columns={
                    'hedges': 'effect size',
                    'p-corr': 'p-value corrected',
                    'p-unc': 'p-value',
                    'p-adjust': 'correction method',
                    'BF10': 'Bayes factor',
                    'dof': 'DF'
                })
            if one_way:
                cols2 = [
                    'Target Name', 'A', 'B', 'DF', 'p-value corrected',
                    'p-value', 'correction method', 'Paired', 'Parametric',
                    'Test', 'effect size', 'Bayes factor'
                ]
            else:
                cols2 = [
                    'Target Name', 'Contrast', 'Group1', 'A', 'B', 'DF',
                    'p-value corrected', 'p-value', 'correction method',
                    'Paired', 'Parametric', 'Test', 'effect size',
                    'Bayes factor'
                ]
            posthoc_dfs = posthoc_dfs.reindex(columns=cols2)

    # nonparametric tests for not normally distributed data
    else:
        if quantity == 2:
            group = data['Group'].dropna()
            group = group.drop_duplicates(keep='first').values.tolist()
            for item in targets:
                df = data[data['Target Name'].eq(item)]
                group1 = df[df['Group'].eq(group[0])][mean]
                group2 = df[df['Group'].eq(group[1])][mean]
                # The two branches were previously swapped. Repeated (paired)
                # measures take the Wilcoxon signed-rank test and independent
                # groups take Mann-Whitney U, matching the >= 3 group branch
                # below (Friedman/Wilcoxon vs Kruskal/Mann-Whitney).
                if repeated:
                    # Wilcoxon
                    test = wilcoxon(group1, group2)
                else:
                    # Mann-Whitney U test
                    test = mannwhitneyu(group1, group2)
                test = pandas.DataFrame(
                    {
                        'Target Name': item,
                        'pvalue': test.pvalue,
                        'statistic': test.statistic
                    },
                    index=[0])
                stats_frames.append(test)
            stats_dfs = _concat_frames(stats_frames)

        elif quantity >= 3:
            pvals = []
            for item in targets:
                target_data = data[data['Target Name'].eq(item)]
                if repeated:
                    # friedman test for repeated measurements
                    df = pg.friedman(dv=mean,
                                     within='Group',
                                     subject='Sample Name',
                                     data=target_data)
                    # Positional access: the result row is not labelled 0.
                    pvals.append(df['p-unc'].iloc[0])
                    df['test'] = ['Friedman Q']
                    df['measures'] = ['dependent']
                    df = df.rename(columns={'Q': 'statistic'})
                    df['Target Name'] = item
                    df['DF'] = 'NA'
                    ph = pairwise_ttests(data=target_data,
                                         dv=mean,
                                         within='Group',
                                         subject='Sample Name',
                                         padjust='fdr_bh',
                                         parametric=False)
                    ph['Target Name'] = item
                    ph['DF'] = 'NA'
                    ph['Bayes factor'] = 'NA'
                    ph['Test'] = 'Wilcoxon'
                else:
                    # Kruskal-Wallis H test
                    df = pg.kruskal(dv=mean,
                                    between='Group',
                                    data=target_data)
                    # Positional access: the result row is not labelled 0.
                    pvals.append(df['p-unc'].iloc[0])
                    df['test'] = ['Kruskal-Wallis H']
                    df['measures'] = ['independent']
                    df = df.rename(columns={'H': 'statistic'})
                    df['Target Name'] = item
                    df['DF'] = 'NA'
                    ph = pairwise_ttests(data=target_data,
                                         dv=mean,
                                         between='Group',
                                         padjust='fdr_bh',
                                         parametric=False)
                    ph['Target Name'] = item
                    ph['DF'] = 'NA'
                    ph['Bayes factor'] = 'NA'
                    ph['Test'] = 'Mann-Whitney U'
                stats_frames.append(df)
                posthoc_frames.append(ph)

            stats_dfs = _concat_frames(stats_frames)
            posthoc_dfs = _concat_frames(posthoc_frames)

            reject, pvals_corr = pg.multicomp(pvals, alpha=0.05, method='bonf')

            # reformat output tables
            stats_dfs = stats_dfs.rename(columns={
                'dof': 'DF',
                'p-unc': 'p-value'
            })
            stats_dfs['p-value corrected'] = pvals_corr
            stats_dfs['distribution'] = ['non-parametric'] * len(targets)
            stats_dfs['MS'] = ['NA'] * len(targets)
            stats_dfs['SS'] = ['NA'] * len(targets)
            stats_dfs['effect size'] = ['NA'] * len(targets)
            cols = [
                'Target Name', 'DF', 'MS', 'SS', 'p-value',
                'p-value corrected', 'measures', 'distribution', 'test',
                'statistic', 'effect size'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)

            posthoc_dfs = posthoc_dfs.drop(['Contrast'], axis=1)
            posthoc_dfs = posthoc_dfs.rename(
                columns={
                    'hedges': 'effect size',
                    'p-corr': 'p-value corrected',
                    'p-unc': 'p-value',
                    'p-adjust': 'correction method',
                    'BF10': 'Bayes factor'
                })
            cols2 = [
                'Target Name', 'A', 'B', 'DF', 'p-value corrected', 'p-value',
                'correction method', 'Paired', 'Parametric', 'Test',
                'effect size', 'Bayes factor'
            ]
            posthoc_dfs = posthoc_dfs.reindex(columns=cols2)

    return stats_dfs, posthoc_dfs