Beispiel #1
0
def kruskal_posthoc_tests(benchmark_snapshot_df):
    """Build p-value tables from several post-hoc tests for a Kruskal test.

    Every test is run on |benchmark_snapshot_df| with 'fuzzer' as the group
    column and 'edges_covered' as the value column. All tests except Nemenyi
    are passed p_adjust='holm'; Nemenyi is called without a p_adjust
    argument.

    The results should be considered only if the Kruskal test rejects the
    null hypothesis.

    Returns:
        A dict keyed by test name ('mann_whitney', 'conover', 'wilcoxon',
        'dunn', 'nemenyi') whose values are whatever the matching
        sp.posthoc_* function returns.
    """
    shared_kwargs = dict(a=benchmark_snapshot_df,
                         group_col='fuzzer',
                         val_col='edges_covered',
                         sort=True)
    correction = 'holm'

    # Entries of a dict display are evaluated top to bottom, so the tests run
    # in the order they are listed here.
    return {
        'mann_whitney':
            sp.posthoc_mannwhitney(**shared_kwargs, p_adjust=correction),
        'conover':
            sp.posthoc_conover(**shared_kwargs, p_adjust=correction),
        'wilcoxon':
            sp.posthoc_wilcoxon(**shared_kwargs, p_adjust=correction),
        'dunn':
            sp.posthoc_dunn(**shared_kwargs, p_adjust=correction),
        'nemenyi':
            sp.posthoc_nemenyi(**shared_kwargs),
    }
                arch][5][i][1249]

    #########################################
    # Statistics: Test for differences between groups
    #
    # We analyzed the learning performance of the unpaired samples from the different
    # architectures using the non-parametric Kruskal-Wallis test [Kruskal & Wallis 1952]
    # (as the data do not appear to be normally distributed). For the post-hoc analysis
    # we used the Dunn post-hoc test [Dunn & Dunn 1961] with Bonferroni correction,
    # following [Raff 2019].
    #########################################
    from scipy import stats
    import scikit_posthocs as sp
    stats.kruskal(architecture_samples_at312[0], architecture_samples_at312[1],
                  architecture_samples_at312[2], architecture_samples_at312[3])
    sp.posthoc_mannwhitney(architecture_samples_at312, p_adjust='holm')
    sp.posthoc_mannwhitney(architecture_samples_at312, p_adjust='bonferroni')
    # OR posthoc_dunn, posthoc_conover
    stats.kruskal(architecture_samples_at625[0], architecture_samples_at625[1],
                  architecture_samples_at625[2], architecture_samples_at625[3])
    sp.posthoc_mannwhitney(architecture_samples_at625, p_adjust='holm')
    sp.posthoc_mannwhitney(architecture_samples_at625, p_adjust='bonferroni')

    stats.kruskal(architecture_samples_at1250[0],
                  architecture_samples_at1250[1],
                  architecture_samples_at1250[2],
                  architecture_samples_at1250[3])
    sp.posthoc_mannwhitney(architecture_samples_at1250, p_adjust='holm')
    sp.posthoc_mannwhitney(architecture_samples_at1250, p_adjust='bonferroni')

    stats.kruskal(architecture_learn_perf_samples_at1250[0],
Beispiel #3
0
# tp -> (column index inside a ' - '-separated log line,
#        log file base name,
#        label used as the value column / y axis)
_BOXPLOT_METRICS = {
    'evol': (1, 'evolution', 'Fitness'),
    'nModules': (2, 'bestFeatures', 'Number of Modules'),
    'brokenConn': (11, 'meanFeatures', 'Number of Broken Connections'),
    'nConn': (19, 'bestFeatures', 'Average Connections per Module'),
}


def _read_log_values(path, column, last_only):
    """Read one column of a ' - '-separated log file as a list of floats.

    Args:
        path: Path of the log file.
        column: Index of the wanted field after splitting a line on ' - '.
        last_only: If true, return only the value from the last line.

    Returns:
        A list of floats: a single element when last_only is true, otherwise
        one element per non-blank line.
    """
    # The context manager guarantees the file handle is released.
    with open(path, newline='') as log_file:
        # Only the first CSV field of each line is looked at; it is then split
        # on ' - ' into the actual log columns. Blank lines produce an empty
        # row from csv.reader and are skipped.
        records = [
            row[0].split(" - ") for row in csv.reader(log_file) if row
        ]
    if last_only:
        return [float(records[-1][column])]
    return [float(record[column]) for record in records]


def boxplotResults(mainFolder, folders, tp, rep, indiv, lastGen, plotType,
                   saveFile, annotatePairs):
    """Plot one logged metric per folder and print/plot significance tests.

    For every entry of |folders| the log files
    './<mainFolder>/<folder>xL/<run>/log/<name>.txt' for run = 1..rep are
    read, the selected column is collected into a long-format DataFrame with
    the columns <metric label> and 'Length', and the distribution per folder
    is drawn with seaborn. The figure is saved, shown, and a Kruskal test
    over the per-folder groups is printed. Unless tp is 'brokenConn', the
    plot is also annotated with Mann-Whitney results and a pairwise
    Mann-Whitney post-hoc sign plot is drawn.

    Args:
        mainFolder: Directory (relative to the working directory) that
            contains the per-folder result directories.
        folders: Folder name prefixes; each is also used as the 'Length'
            group label and the list defines the plotting order.
        tp: Metric to plot: 'evol', 'nModules', 'brokenConn' or 'nConn'.
        rep: Number of runs; run directories are numbered 1..rep.
        indiv: If true a new default-sized figure is created for every
            folder, otherwise a single 15x10 figure is created up front.
        lastGen: If true only the last line of each log is used, otherwise
            every line.
        plotType: 'box', 'swarm', 'strip' or 'violin'.
        saveFile: Prefix of the output file; the file is written to
            saveFile + tp + plotType + '.eps'.
        annotatePairs: Passed as box_pairs to add_stat_annotation.

    Raises:
        ValueError: If tp or plotType is not one of the supported values.
    """
    if tp not in _BOXPLOT_METRICS:
        raise ValueError('unknown tp {!r}; expected one of {}'.format(
            tp, sorted(_BOXPLOT_METRICS)))
    logCol, log_name, variable = _BOXPLOT_METRICS[tp]

    plotters = {
        'box': sns.boxplot,
        'swarm': sns.swarmplot,
        'strip': sns.stripplot,
        'violin': sns.violinplot,
    }
    if plotType not in plotters:
        raise ValueError('unknown plotType {!r}; expected one of {}'.format(
            plotType, sorted(plotters)))

    if not indiv:
        fig = plt.figure(figsize=(15, 10))
        fig.gca()

    frames = []
    for folder in folders:
        if indiv:
            # NOTE(review): the seaborn call below is given no ax argument,
            # so only the most recently created figure ends up being drawn
            # on - confirm this is intended.
            fig = plt.figure()
            fig.gca()

        values = []
        for run in range(1, rep + 1):
            path = './{}/{}xL/{}/log/{}.txt'.format(mainFolder, folder, run,
                                                    log_name)
            values.extend(_read_log_values(path, logCol, lastGen))

        dfPartial = pd.DataFrame(values, columns=[variable])
        dfPartial['Length'] = folder
        frames.append(dfPartial)

    # DataFrame.append no longer exists in pandas >= 2.0; concatenate once.
    # pd.concat rejects an empty list, so keep the empty-frame result for an
    # empty |folders|.
    dfAll = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    x = "Length"
    y = variable
    order = folders
    ax = plotters[plotType](data=dfAll, x=x, y=y, order=order)

    # The pairwise statistics are skipped for 'brokenConn' (the reason is not
    # visible in this function).
    if tp != 'brokenConn':
        add_stat_annotation(ax,
                            data=dfAll,
                            x=x,
                            y=y,
                            order=order,
                            box_pairs=annotatePairs,
                            test='Mann-Whitney',
                            text_format='star',
                            loc='outside',
                            verbose=2)

    plt.savefig(saveFile + tp + plotType + '.eps', bbox_inches="tight")
    plt.show()
    print(
        scp_stats.kruskal(
            *[group[variable].values for _, group in dfAll.groupby('Length')]))

    if tp != 'brokenConn':
        # Pairwise Mann-Whitney post-hoc test, rendered as a sign plot.
        postHoc = sp.posthoc_mannwhitney(dfAll,
                                         val_col=variable,
                                         group_col='Length')
        heatmap_args = {
            'linewidths': 0.25,
            'linecolor': '0.5',
            'clip_on': False,
            'square': True,
            'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
        }
        sp.sign_plot(postHoc, **heatmap_args)
# Shared styling for the significance heatmaps drawn below.
heatmap_args = dict(
    linewidths=0.25,
    linecolor='0.5',
    clip_on=False,
    square=True,
    cbar_ax_bbox=[0.80, 0.35, 0.04, 0.3],
)

# Post hoc with the Conover test
pc = sp.posthoc_conover(x)
sp.sign_plot(pc, **heatmap_args)

# In[39]:

# Post hoc with the Mann-Whitney test (same heatmap styling as above)
pc2 = sp.posthoc_mannwhitney(x)
sp.sign_plot(pc2, **heatmap_args)

# In[40]:

# Create knowledge bins. With pd.cut's default right-closed intervals this
# maps (0, 3] -> 'Low', (3, 6] -> 'Mid' and (6, 8] -> 'High'.
bins = [0, 3, 6, 8]
labels = ['Low', 'Mid', 'High']
data['Knowledge_bin'] = pd.cut(data['Knowledge'], bins=bins, labels=labels)