Exemple #1
0
    def test_result_attributes(self):
        """Hand the result of ``mstats.kruskal`` on two samples to ``check_named_results``.

        The expected field names are ``statistic`` and ``pvalue``; the helper is
        called with ``ma=True``.
        """
        odd_values = [1, 3, 5, 7, 9]
        even_values = [2, 4, 6, 8, 10]
        expected_fields = ('statistic', 'pvalue')

        outcome = mstats.kruskal(odd_values, even_values)
        check_named_results(outcome, expected_fields, ma=True)
    def test_result_attributes(self):
        # NOTE(review): a method with this exact name is also defined directly
        # above. If both live in the same class body, this later definition
        # replaces the earlier one -- confirm whether the duplicate is intended.
        #
        # Runs mstats.kruskal on two five-element samples and passes the result,
        # together with the expected field names, to check_named_results.
        check_named_results(
            mstats.kruskal([1, 3, 5, 7, 9], [2, 4, 6, 8, 10]),
            ('statistic', 'pvalue'),
            ma=True,
        )
        expenses_group1[val[0]] = int(val[1])

# Accumulate the integer amount per key for group 2: val[0] is used as the
# dictionary key and val[1] is parsed with int() and added to the running total.
# dict.get(key, 0) starts missing keys at zero, replacing the membership test
# on .keys() plus a second lookup with a single expression.
for val in total_spend_per_product_group2:
    expenses_group2[val[0]] = expenses_group2.get(val[0], 0) + int(val[1])

#print(expenses_group1, expenses_group2)

# Statistical study
# Kruskal-Wallis: Check whether the null hypothesis: "The average expenditure for each group is the same" is true
from scipy.stats.mstats import kruskal
from scipy.stats.mstats import mannwhitneyu
st, pvalue = kruskal([x[1] for x in customers_group1],
                     [x[1] for x in customers_group2
                      ])  # x[1] contains the expenditure of each product
if pvalue < 0.05:
    print(
        "The null hypothesis: \n\t'The average expenditure for each group is the same'\nis False"
    )
    # Mann-Whitney for each pair of groups. Eg.: milk_group1 - milk_group2, delicatessen_group1 - delicatessen_group2
    for product in range(2, 8):
        product_group1 = []
        product_group2 = []
        # Get the expenditures per type of product
        for value in customers_group1:
            product_index = data[value[0]].index(value[1])
            if product_index == product:
                product_group1.append(value[1])
        for value in customers_group2:
Exemple #4
0

print("\n")

# Print element [1] of the Mann-Whitney result for each listed column compared
# against the "Finetuned\nFiji U-Net" column.
for _method in ("Gaussian", "Hessian", "Laplacian", "Ilastik"):
    print(mannwhitneyu(all_data[_method], all_data["Finetuned\nFiji U-Net"])[1])

#print(np.median(all_data["Hessian"]), np.median(all_data["Ilastik"]), np.median(all_data["MitoSegNet"]))
#print(np.average(all_data["Hessian"]), np.average(all_data["Ilastik"]), np.average(all_data["MitoSegNet"]))

# Column order shared by the Kruskal-Wallis call and the Dunn post-hoc call.
_columns = ("Gaussian", "Hessian", "Laplacian", "Ilastik", "MitoSegNet",
            "Finetuned\nFiji U-Net")

# Each column is converted with .tolist() and passed as its own positional group.
print(kruskal(*[all_data[_name].tolist() for _name in _columns]))

# posthoc_dunn receives the same columns, freshly converted, as one list of lists.
dt = posthoc_dunn([all_data[_name].tolist() for _name in _columns])

dt.to_excel("ed_posthoc.xlsx")


# pooled standard deviation for calculation of effect size (cohen's d)
def cohens_d(data1, data2):
Exemple #5
0
# Print element [1] of the Mann-Whitney result for m_l against three other lists.
for _other in (ga_l, il_l, u_l_pt):
    print(mannwhitneyu(m_l, _other)[1])

#print(mannwhitneyu(m_l, u_l)[1])
"""
all_data["Gaussian"] = ga_l
all_data["Hessian"] = h_l
all_data["Laplacian"] = la_l
all_data["Ilastik"] = il_l

all_data["MitoSegNet"] = m_l
all_data["Finetuned\nFiji U-Net"] = u_l_pt
"""

# The six lists, in the order used by both the omnibus and the post-hoc test.
_score_lists = [ga_l, h_l, la_l, il_l, m_l, u_l_pt]

# Despite its name, p_val holds whatever kruskal returns and is printed as-is.
p_val = kruskal(*_score_lists)
print(p_val)

dt = posthoc_dunn(_score_lists)

#dt = posthoc_dunn(all_data, val_col="MitoSegNet", group_col="Ilastik")
print(dt)

dt.to_excel("dc_posthoc.xlsx")

# Same comparison as at the top, this time with u_l_pt as the first sample.
print("\n")
for _other in (h_l, la_l, ga_l, il_l):
    print(mannwhitneyu(u_l_pt, _other)[1])
print("\n")
        dice_l.append(dice)

    all_data[folder] = dice_l

#print(all_data)
#all_data.to_csv(path + "/dice_coefficient_table.csv")
"""
# checking if data is normally distributed
for seg in all_data:
    print(seg, normaltest(all_data[seg]))
"""

# hypothesis testing
# The six all_data entries are listed once so that both tests receive the same
# samples in the same order.
method_samples = [
    all_data["MitoSegNet"], all_data["Finetuned Fiji U-Net"],
    all_data["Ilastik"], all_data["Gaussian"], all_data["Hessian"],
    all_data["Laplacian"]
]

# Kruskal-Wallis expects one positional argument per group. Handing over the
# whole list as a single argument is seen as one group by scipy.stats.kruskal
# (which then raises "Need at least two groups"), so the list is unpacked.
# NOTE(review): which kruskal is imported is not visible here; unpacking is
# accepted by both scipy.stats.kruskal and scipy.stats.mstats.kruskal.
kwt = kruskal(*method_samples)
#print(kwt)

# posthoc_dunn takes the groups as a single list argument.
dt = posthoc_dunn(method_samples)
#print(dt)
#dt.to_excel("dc_posthoc.xlsx")

#print(cohens_d(all_data["MitoSegNet"], all_data["Ilastik"]))

# pos_y and pos_x determine position of bar, p sets the number of asterisks, y_dist sets y distance of the asterisk to
def independence_test(df_test,
                      column_value="speed",
                      column_factor="city_id",
                      alpha=0.05,
                      decimals=6):
    """
    Compare a numerical column between every pair of levels of a factor column.

    A Shapiro-Wilk test is run once on the whole ``column_value`` column (the
    raw values, not model residuals) and its p-value is printed.

    * If that p-value is above 0.05, every pair of levels is compared with
      Bartlett's test and a one-way ANOVA. The Kruskal-Wallis p-value is kept
      only for pairs whose Bartlett null hypothesis is rejected and is NaN
      otherwise.
    * Otherwise only the Kruskal-Wallis p-value and H statistic are reported.

    Args:
        df_test: DataFrame containing both columns.
        column_value: Name of the numerical column to compare.
        column_factor: Name of the column whose unique values define the groups.
        alpha: Level below which a pairwise p-value rejects its null
            hypothesis. The Shapiro-Wilk gate uses a fixed 0.05 threshold.
        decimals: Number of decimals used to round the Kruskal-Wallis p-values
            and the printed Shapiro-Wilk p-value.

    Returns:
        A DataFrame with one row per pair of levels. Normal branch columns:
        ``First_value``, ``Second_value``, ``Bartlett_pvalue``,
        ``Anova_pvalue``, ``kruskal_pvalue``, ``Reject Bartlett null hyp``,
        ``Reject Anova null hyp``. Non-normal branch columns: ``First_value``,
        ``Second_value``, ``kruskal_p_value``, ``kruskal_H_value``,
        ``Reject Kruskal null hyp``.
    """
    # NOTE(review): the normality gate is hard-coded rather than tied to
    # `alpha`; kept as in the original so existing results do not change.
    normality_threshold = 0.05

    # Every unordered pair of levels, in order of first appearance.
    levels = list(df_test[column_factor].unique())
    pairs = [(first, second)
             for position, first in enumerate(levels)
             for second in levels[position + 1:]]

    # Extract each level's values once instead of re-filtering the frame for
    # every pair and every test.
    samples = {
        level: df_test.loc[df_test[column_factor] == level, column_value].values
        for level in levels
    }

    # Null hypothesis: the data is drawn from a normal distribution.
    _, shapiro_pvalue = stats.shapiro(df_test[column_value].values)

    # Kruskal-Wallis is computed in any case; it is the only result reported
    # when the normality gate fails.
    kruskal_results = [kruskal(samples[a], samples[b]) for a, b in pairs]
    kruskal_pvalues = [round(res.pvalue, decimals) for res in kruskal_results]
    kruskal_h = [res.statistic for res in kruskal_results]

    df_out = pd.DataFrame({
        "First_value": [a for a, _ in pairs],
        "Second_value": [b for _, b in pairs],
    })

    if shapiro_pvalue > normality_threshold:
        print("Saphiro test p-value: %s" % round(shapiro_pvalue, decimals))
        print(
            "Null hypothesis cannot be rejected. We asume residuals are normally distributed"
        )

        # Bartlett (equal variances) and one-way ANOVA for every pair.
        df_out["Bartlett_pvalue"] = [
            stats.bartlett(samples[a], samples[b]).pvalue for a, b in pairs
        ]
        df_out["Anova_pvalue"] = [
            stats.f_oneway(samples[a], samples[b]).pvalue for a, b in pairs
        ]
        df_out["kruskal_pvalue"] = kruskal_pvalues

        df_out["Reject Bartlett null hyp"] = df_out["Bartlett_pvalue"] < alpha
        df_out["Reject Anova null hyp"] = df_out["Anova_pvalue"] < alpha

        # Keep the Kruskal-Wallis p-value only where Bartlett's null hypothesis
        # was rejected. np.nan is used because the np.NaN alias no longer
        # exists in NumPy 2.0.
        df_out["kruskal_pvalue"] = np.where(
            df_out["Reject Bartlett null hyp"], df_out["kruskal_pvalue"],
            np.nan)
    else:
        # Rounded with `decimals`, consistent with the other branch.
        print("Saphiro test p-value: %s" % round(shapiro_pvalue, decimals))
        print(
            "Null hypothesis rejected. We cannot asume residuals are normally distributed"
        )
        print("Using kruskal wallis test")

        df_out["kruskal_p_value"] = kruskal_pvalues
        df_out["kruskal_H_value"] = kruskal_h
        df_out["Reject Kruskal null hyp"] = df_out["kruskal_p_value"] < alpha

    return df_out