def test_result_attributes(self):
    x = [1, 3, 5, 7, 9]
    y = [2, 4, 6, 8, 10]
    res = mstats.kruskal(x, y)
    attributes = ('statistic', 'pvalue')
    check_named_results(res, attributes, ma=True)
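# Outside the test suite, the same call works directly on plain lists; a minimal
# sketch (the data here is illustrative, assuming only that scipy is installed):
from scipy.stats import mstats

res = mstats.kruskal([1, 3, 5, 7, 9], [2, 4, 6, 8, 10])
print(res.statistic, res.pvalue)  # named fields of the returned KruskalResult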
# Aggregate per-product spend for group 1 (same pattern as the group-2 loop below)
for val in total_spend_per_product_group1:
    if val[0] in expenses_group1:
        expenses_group1[val[0]] += int(val[1])
    else:
        expenses_group1[val[0]] = int(val[1])
for val in total_spend_per_product_group2:
    if val[0] in expenses_group2:
        expenses_group2[val[0]] += int(val[1])
    else:
        expenses_group2[val[0]] = int(val[1])
#print(expenses_group1, expenses_group2)

# Statistical study
# Kruskal-Wallis: test the null hypothesis "The average expenditure for each
# group is the same" (strictly, it compares the groups' distributions, not their means)
from scipy.stats.mstats import kruskal, mannwhitneyu

# x[1] contains the expenditure of each product
st, pvalue = kruskal([x[1] for x in customers_group1],
                     [x[1] for x in customers_group2])
if pvalue < 0.05:
    print("The null hypothesis:\n\t'The average expenditure for each group is the same'\nis rejected")

# Mann-Whitney for each pair of groups,
# e.g. milk_group1 vs milk_group2, delicatessen_group1 vs delicatessen_group2
for product in range(2, 8):
    product_group1 = []
    product_group2 = []
    # Get the expenditures per type of product
    for value in customers_group1:
        product_index = data[value[0]].index(value[1])
        if product_index == product:
            product_group1.append(value[1])
    for value in customers_group2:
        product_index = data[value[0]].index(value[1])
        if product_index == product:
            product_group2.append(value[1])
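# The loop above runs six separate Mann-Whitney tests, so a multiplicity
# correction is advisable; a minimal sketch of a check that could sit at the
# bottom of the loop body, using a Bonferroni-corrected threshold (the 0.05
# level and the product_group1/product_group2 lists follow the code above):
alpha_corrected = 0.05 / 6  # six per-product comparisons
stat_mw, p_mw = mannwhitneyu(product_group1, product_group2)
if p_mw < alpha_corrected:
    print("Product", product, "differs between the two groups (p = %.4f)" % p_mw)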
print("\n")
# Pairwise Mann-Whitney: each method vs. the finetuned Fiji U-Net
print(mannwhitneyu(all_data["Gaussian"], all_data["Finetuned\nFiji U-Net"])[1])
print(mannwhitneyu(all_data["Hessian"], all_data["Finetuned\nFiji U-Net"])[1])
print(mannwhitneyu(all_data["Laplacian"], all_data["Finetuned\nFiji U-Net"])[1])
print(mannwhitneyu(all_data["Ilastik"], all_data["Finetuned\nFiji U-Net"])[1])
#print(np.median(all_data["Hessian"]), np.median(all_data["Ilastik"]), np.median(all_data["MitoSegNet"]))
#print(np.average(all_data["Hessian"]), np.average(all_data["Ilastik"]), np.average(all_data["MitoSegNet"]))

# Omnibus Kruskal-Wallis across all six segmentation methods
print(kruskal(all_data["Gaussian"].tolist(), all_data["Hessian"].tolist(),
              all_data["Laplacian"].tolist(), all_data["Ilastik"].tolist(),
              all_data["MitoSegNet"].tolist(),
              all_data["Finetuned\nFiji U-Net"].tolist()))

# Dunn's post-hoc test on the same six groups
dt = posthoc_dunn([all_data["Gaussian"].tolist(), all_data["Hessian"].tolist(),
                   all_data["Laplacian"].tolist(), all_data["Ilastik"].tolist(),
                   all_data["MitoSegNet"].tolist(),
                   all_data["Finetuned\nFiji U-Net"].tolist()])
dt.to_excel("ed_posthoc.xlsx")


# pooled standard deviation for calculation of effect size (cohen's d)
def cohens_d(data1, data2):
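    # A minimal sketch of the pooled-standard-deviation Cohen's d named in the
    # comment above (assumes numpy is available as np, as elsewhere in the script):
    n1, n2 = len(data1), len(data2)
    pooled_sd = np.sqrt(((n1 - 1) * np.var(data1, ddof=1) +
                         (n2 - 1) * np.var(data2, ddof=1)) / (n1 + n2 - 2))
    return (np.mean(data1) - np.mean(data2)) / pooled_sd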
print(mannwhitneyu(m_l, ga_l)[1])
print(mannwhitneyu(m_l, il_l)[1])
print(mannwhitneyu(m_l, u_l_pt)[1])
#print(mannwhitneyu(m_l, u_l)[1])

"""
all_data["Gaussian"] = ga_l
all_data["Hessian"] = h_l
all_data["Laplacian"] = la_l
all_data["Ilastik"] = il_l
all_data["MitoSegNet"] = m_l
all_data["Finetuned\nFiji U-Net"] = u_l_pt
"""

p_val = kruskal(ga_l, h_l, la_l, il_l, m_l, u_l_pt)
print(p_val)

dt = posthoc_dunn([ga_l, h_l, la_l, il_l, m_l, u_l_pt])
#dt = posthoc_dunn(all_data, val_col="MitoSegNet", group_col="Ilastik")
print(dt)
dt.to_excel("dc_posthoc.xlsx")

print("\n")
print(mannwhitneyu(u_l_pt, h_l)[1])
print(mannwhitneyu(u_l_pt, la_l)[1])
print(mannwhitneyu(u_l_pt, ga_l)[1])
print(mannwhitneyu(u_l_pt, il_l)[1])
print("\n")
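# Assuming posthoc_dunn is the scikit-posthocs implementation (the val_col/group_col
# keywords above match that API), the many pairwise p-values can also be adjusted
# for multiple comparisons via its p_adjust argument; a sketch:
dt_adj = posthoc_dunn([ga_l, h_l, la_l, il_l, m_l, u_l_pt], p_adjust="bonferroni")
print(dt_adj)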
dice_l.append(dice)
all_data[folder] = dice_l
#print(all_data)
#all_data.to_csv(path + "/dice_coefficient_table.csv")

"""
# checking if data is normally distributed
for seg in all_data:
    print(seg, normaltest(all_data[seg]))
"""

# hypothesis testing
# NB: each group must be passed to kruskal as a separate argument, not as one list
kwt = kruskal(all_data["MitoSegNet"], all_data["Finetuned Fiji U-Net"],
              all_data["Ilastik"], all_data["Gaussian"],
              all_data["Hessian"], all_data["Laplacian"])
#print(kwt)

dt = posthoc_dunn([all_data["MitoSegNet"], all_data["Finetuned Fiji U-Net"],
                   all_data["Ilastik"], all_data["Gaussian"],
                   all_data["Hessian"], all_data["Laplacian"]])
#print(dt)
#dt.to_excel("dc_posthoc.xlsx")

#print(cohens_d(all_data["MitoSegNet"], all_data["Ilastik"]))

# pos_y and pos_x determine position of bar, p sets the number of asterisks,
# y_dist sets y distance of the asterisk to the bar
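# The plotting helper those parameters belong to is not shown; a minimal sketch
# of what such a significance-bar function might look like, assuming a matplotlib
# Axes object ax (the name sig_bar is hypothetical):
def sig_bar(ax, pos_x, pos_y, p, y_dist):
    # horizontal bar with short end ticks between the two x positions
    ax.plot([pos_x[0], pos_x[0], pos_x[1], pos_x[1]],
            [pos_y - y_dist, pos_y, pos_y, pos_y - y_dist], lw=1.2, c="black")
    # p asterisks centred above the bar
    ax.text((pos_x[0] + pos_x[1]) / 2, pos_y + y_dist, "*" * p,
            ha="center", va="bottom")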
def independence_test(df_test, column_value="speed", column_factor="city_id",
                      alpha=0.05, decimals=6):
    """
    Carries out an independence test given a numerical column and a factor column.
    First, it checks whether the assumptions for an ANOVA test are fulfilled.
    If so, the ANOVA test is carried out; if not, a non-parametric test
    (Kruskal-Wallis) is performed.

    Args:
        A dataframe, a numerical column, a factor column, an alpha level for
        the test, and the number of decimals to round the p-value to.

    Returns:
        A dataframe with the results of the test.
    """
    ## Extract all possible pairwise combinations of factor levels:
    n_cases = list(df_test[column_factor].unique())
    tuple_combinations = [(x, y) for x in n_cases for y in n_cases
                          if n_cases.index(y) > n_cases.index(x)]

    ## Shapiro test to check the normal distribution of residuals.
    ## Null hypothesis: the data is drawn from a normal distribution.
    w_shapiro, pvalue_shapiro = stats.shapiro(df_test[column_value].values)

    df_out = pd.DataFrame(tuple_combinations)

    ## The Kruskal-Wallis test is performed in any case, and used where ANOVA is not applicable.
    kruskal_test = [
        kruskal(
            df_test[df_test[column_factor] == combination[0]][column_value].values,
            df_test[df_test[column_factor] == combination[1]][column_value].values)
        for combination in tuple_combinations
    ]
    kruskal_pvalue = [round(x.pvalue, decimals) for x in kruskal_test]
    kruskal_H = [x.statistic for x in kruskal_test]

    ## If the Shapiro test is passed, we continue with ANOVA:
    if pvalue_shapiro > 0.05:
        print("Shapiro test p-value: %s" % round(pvalue_shapiro, decimals))
        print("Null hypothesis cannot be rejected. We assume residuals are normally distributed")

        ## Check every possible combination with the Bartlett test (equal variances):
        bartlett_test = [
            stats.bartlett(
                df_test[df_test[column_factor] == combination[0]][column_value],
                df_test[df_test[column_factor] == combination[1]][column_value])
            for combination in tuple_combinations
        ]
        ## Extract p-values and statistic values:
        bartlett_pvalue = [x.pvalue for x in bartlett_test]
        bartlett_w = [x.statistic for x in bartlett_test]

        anova_test = [
            stats.f_oneway(
                df_test[df_test[column_factor] == combination[0]][column_value].values,
                df_test[df_test[column_factor] == combination[1]][column_value].values)
            for combination in tuple_combinations
        ]
        anova_pvalue = [x.pvalue for x in anova_test]
        anova_f = [x.statistic for x in anova_test]

        df_out = pd.concat([df_out,
                            pd.DataFrame(bartlett_pvalue),
                            pd.DataFrame(anova_pvalue),
                            pd.DataFrame(kruskal_pvalue)], axis=1)
        df_out.columns = ["First_value", "Second_value", "Bartlett_pvalue",
                          "Anova_pvalue", "kruskal_pvalue"]
        df_out["Reject Bartlett null hyp"] = np.where(
            df_out["Bartlett_pvalue"] < alpha, True, False)
        df_out["Reject Anova null hyp"] = np.where(
            df_out["Anova_pvalue"] < alpha, True, False)
        ## Keep the Kruskal-Wallis p-value only where Bartlett rejects equal
        ## variances (i.e. where the ANOVA result is not trustworthy):
        df_out["kruskal_pvalue"] = np.where(
            df_out["Reject Bartlett null hyp"] == True,
            df_out["kruskal_pvalue"], np.nan)
    else:
        print("Shapiro test p-value: %s" % round(pvalue_shapiro, 4))
        print("Null hypothesis rejected. We cannot assume residuals are normally distributed")
        print("Using Kruskal-Wallis test")
        df_out = pd.concat(
            [df_out, pd.DataFrame(kruskal_pvalue), pd.DataFrame(kruskal_H)],
            axis=1)
        # df_out = pd.concat([df_out, pd.DataFrame([kruskal_H])], axis=1)
        df_out.columns = ["First_value", "Second_value",
                          "kruskal_p_value", "kruskal_H_value"]
        df_out["Reject Kruskal null hyp"] = np.where(
            df_out["kruskal_p_value"] < alpha, True, False)

    return df_out
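# A minimal usage sketch on synthetic data (column names follow the defaults
# above; the imports are the ones the function body relies on, and the numbers
# are illustrative only):
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import kruskal

rng = np.random.default_rng(0)
df_demo = pd.DataFrame({
    "city_id": ["A"] * 30 + ["B"] * 30 + ["C"] * 30,
    "speed": np.concatenate([rng.normal(50, 5, 30),
                             rng.normal(55, 5, 30),
                             rng.normal(52, 5, 30)]),
})
print(independence_test(df_demo, column_value="speed", column_factor="city_id"))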