def run_stats(input_df):
    """Run a Kruskal-Wallis H test followed by Conover post-hoc comparisons.

    The Kruskal-Wallis test is the non-parametric analogue of a one-way ANOVA.
    The Conover test (Holm-adjusted) is used for pairwise post-hoc testing.
    NOTE: the post-hoc p-values should only be interpreted when the omnibus
    test gives a significant result.

    Every column of ``input_df`` except ``'huc8'`` is treated as one group.
    Infinite values are treated as missing, and missing values are removed
    from each group before testing.

    Returns:
        ``(H, p, conover)`` where ``conover`` is the post-hoc p-value table
        labelled with the group column names, or ``None`` when fewer than two
        groups contain data or the post-hoc test fails.
    """
    # Convert +/-inf to NaN *before* dropping empty columns, so a column that
    # holds nothing but inf/NaN is removed together with the all-NaN ones.
    input_df = input_df.replace([np.inf, -np.inf], np.nan)
    input_df = input_df.dropna(axis=1, how='all')

    # 'huc8' is an identifier column, not a sample to compare.
    group_cols = [column for column in input_df.columns if column != 'huc8']
    if len(group_cols) < 2:
        # Nothing to compare; kruskal needs at least two groups.
        return None

    # One NaN-free array per group. NaNs must not reach the post-hoc test,
    # which would count them as observations.
    data = [input_df[column].dropna().to_numpy() for column in group_cols]

    # run the kruskal-wallis
    H, p = stats.kruskal(*data, nan_policy='omit')

    try:
        # run the post-hoc test
        conover = sp.posthoc_conover(data, p_adjust='holm')
        # Label with the columns actually tested; using every DataFrame
        # column would mismatch in length whenever 'huc8' is present.
        conover.columns = group_cols
        conover.index = group_cols
    except Exception as e:
        print('Error is: ', e)
        return None

    return H, p, conover
コード例 #2
0
def sign_barplot(df, val_col, group_col, test="HSD"):
    """Show a bar plot of the data next to a significance plot of a post-hoc test.

    Args:
        df: data passed both to the post-hoc test and to ``sns.barplot``.
        val_col: name of the value column.
        group_col: name of the grouping column.
        test: which post-hoc test to run. One of ``"HSD"`` (the ``tukey_hsd``
            helper), ``"tukey"``, ``"ttest"``, ``"scheffe"``, ``"dscf"`` or
            ``"conover"`` (the matching ``sp.posthoc_*`` function).

    Raises:
        ValueError: if ``test`` is not one of the supported names.
    """
    supported = ("HSD", "tukey", "ttest", "scheffe", "dscf", "conover")
    if test not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # ``result_df`` further down.
        raise ValueError(
            "Unsupported test {!r}; expected one of {}".format(test, supported))

    if test == "HSD":
        result_df = tukey_hsd(df, val_col, group_col)
    else:
        # Every other supported name maps onto sp.posthoc_<name>.
        result_df = getattr(sp, "posthoc_" + test)(df, val_col, group_col)

    # Plot layout and heatmap appearance
    fig, ax = plt.subplots(1, 2, figsize=(10, 6))
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {
        'cmap': cmap,
        'linewidths': 0.25,
        'linecolor': '0.5',
        'clip_on': False,
        'square': True
    }

    # Draw the test result (right panel)
    sp.sign_plot(result_df, ax=ax[1], **heatmap_args)

    # Draw the data that was tested (left panel)
    sns.barplot(data=df, x=group_col, y=val_col, capsize=0.1,
                ax=ax[0])
    plt.show()
コード例 #3
0
def run_kruskall_posthoc(x, y, df):
    """Run a Kruskal-Wallis test and Holm-adjusted Conover post-hoc comparisons.

    Args:
        x: name of the grouping column.
        y: name of the measurement column.
        df: input data; it is first passed through ``remove_outliers``.

    Returns:
        The Conover p-value table with its index reset and three extra
        columns: ``'Measurement'`` (always ``y``), ``'Comparison Group'``
        (the original row label) and ``'Krusal-Wallis p value'`` (the omnibus
        p-value repeated on every row).
    """
    group_col, value_col = x, y

    # Keep only what the analysis needs, with outliers removed.
    df = remove_outliers(group_col, value_col, df)

    # Very small samples make the omnibus test unreliable; warn but continue.
    if len(df) < 5:
        print(
            'Warning: Sample Size Smaller Than 5. Kruskall Wallace Test Value Suspect'
        )

    # One list of measurements per distinct group, in order of first appearance.
    labels = np.array(df[group_col].drop_duplicates().values)
    samples = [
        df[df[group_col] == label][value_col].values.tolist()
        for label in labels
    ]

    # Omnibus test across all groups.
    kw_result = st.mstats.kruskalwallis(*samples)

    # Pairwise post-hoc comparisons (run regardless of the omnibus result).
    table = sp.posthoc_conover(df,
                               val_col=value_col,
                               group_col=group_col,
                               p_adjust='holm')

    # Annotate the comparison table so several results can be stacked later.
    table['Measurement'] = [value_col] * len(table)
    table['Comparison Group'] = table.index.values
    table = table.reset_index()
    table['Krusal-Wallis p value'] = [kw_result[1]] * len(table)
    return table
コード例 #4
0
def do_stats_stuff(m):
    """Print Kruskal-Wallis and Conover results for labelled samples.

    Args:
        m: mapping from a label to a sequence of numeric values ("ratios").

    Returns:
        The result object of ``stats.kruskal`` over all samples.

    Side effects:
        Prints the Kruskal-Wallis result, the Bonferroni-adjusted Conover
        post-hoc table and, for every label containing
        ``'External To External'``, the values at or above the upper outlier
        fence (Q3 + 1.5 * IQR).
    """
    s = stats.kruskal(*m.values())
    print("Kruskal Result: " + str(s))

    # Long format: one (label, value) row per observation.
    lt = [(k, v1) for k, v in m.items() for v1 in v]
    df = pd.DataFrame(lt, columns=['label', 'ratio'])

    f = posthocs.posthoc_conover(df,
                                 sort=True,
                                 p_adjust='bonferroni',
                                 group_col='label',
                                 val_col='ratio')
    # The test run above is Conover's, so label the output accordingly.
    print("Conover's Result")
    print(str(f))

    for k, v in m.items():
        if 'External To External' in k:
            # The upper fence uses the third quartile (75th percentile);
            # the 25th percentile would be Q1 and flag far too many values.
            q3 = np.percentile(v, 75, interpolation='midpoint')
            upper_fence = q3 + 1.5 * stats.iqr(v, interpolation='midpoint')
            outlier = [v1 for v1 in v if v1 >= upper_fence]
            if outlier:
                print(outlier)

    return s
コード例 #5
0
ファイル: test.py プロジェクト: atlefren/geodiff_stats
def stats_tests(df, metrics, groups, group_by):
  """Run normality, Kruskal and Conover tests for every metric.

  Args:
    df: data frame holding one column per metric plus the ``group_by`` column.
    metrics: iterable of metric column names.
    groups: mapping from group label to the row ids (``df.loc`` keys) of that
      group.
    group_by: name of the grouping column handed to the Conover test.

  Returns:
    A dict keyed by metric. Each value holds ``'normality'`` (group label ->
    result of ``is_normal``), ``'from_same_distribution'`` (result of the
    ``kruskal`` helper) and ``'post_hoc'`` (the Conover table, or ``None``).

  NOTE(review): ``'post_hoc'`` is filled in when at least one pair is *not*
  significant (``False in ...``); confirm this is the intended condition.
  """
  alpha = 0.05
  results = {}
  for metric in metrics:
    # Normality check per group.
    normality = {
      differ: is_normal(df.loc[ids, metric].values, alpha)
      for differ, ids in groups.items()
    }

    # One array per group for the selected metric.
    # H0: all samples come from the same distribution.
    samples = [df.loc[ids, metric].values for ids in groups.values()]
    from_same_distribution = kruskal(samples, alpha)

    conover_result = sp.posthoc_conover(df,
                                        val_col=metric,
                                        group_col=group_by,
                                        p_adjust='holm')

    # Significance flag for every ordered pair of distinct groups.
    pair_is_significant = [
      conover_result.loc[m1, m2] < alpha
      for m1 in groups
      for m2 in groups
      if m1 != m2
    ]

    if False in pair_is_significant:
      post_hoc = conover_result
    else:
      post_hoc = None

    results[metric] = {
      'normality': normality,
      'from_same_distribution': from_same_distribution,
      'post_hoc': post_hoc,
    }
  return results
    def get_significant_pairs(self, df, metric):
        """Find the condition pairs whose Conover p-value is at most ``self.alpha``.

        A Holm-adjusted Conover test is run on ``metric`` grouped by the
        ``'condition'`` column, and every unordered pair of groups is checked.

        Returns:
            ``(pairs, p_values)``: ``pairs`` holds ``[pos_a, pos_b]`` plot
            positions looked up in ``self.label_to_plot_pos``; ``p_values``
            holds the matching adjusted p-values, in the same order.
        """
        p_table = sp.posthoc_conover(df,
                                     val_col=metric,
                                     group_col='condition',
                                     p_adjust='holm')
        # TO DO: Wilcoxon won't work for mode switches because not truly paired test (conditions have different lengths)
        # p_table = sp.posthoc_wilcoxon(df, val_col=metric, group_col='condition', p_adjust='holm')

        labels = p_table.keys().to_list()

        pairs = []
        p_values = []
        # Walk every possible pairwise comparison once.
        for first, second in list(itertools.combinations(labels, 2)):
            p_val = p_table.loc[first, second]
            if p_val > self.alpha:
                continue  # not significant, nothing to annotate
            pairs.append([
                self.label_to_plot_pos[first],
                self.label_to_plot_pos[second]
            ])
            p_values.append(p_val)

        return pairs, p_values
	def get_significant_pairs(self, df, metric, label_to_plot_pos):
		"""Return plot positions and p-values of the significant first/second pairs.

		A ``trial`` column (``condition`` + " " + ``block``) is added to ``df``
		in place, a Holm-adjusted Conover test is run across all trials, and
		only the three hard-coded first-vs-second comparisons are inspected.

		Args:
			df: data with ``condition``, ``block`` and ``metric`` columns.
			metric: name of the value column to compare.
			label_to_plot_pos: mapping from trial label to plot position.

		Returns:
			``(pairs, p_values)``: ``pairs`` holds ``[pos_a, pos_b]`` for every
			inspected comparison whose p-value is at most ``self.alpha``;
			``p_values`` holds the matching adjusted p-values.
		"""
		# NOTE: this mutates the caller's DataFrame (kept for compatibility).
		df["trial"] = df["condition"]+" "+df["block"]
		pairwise_comparisons = sp.posthoc_conover(df, val_col=metric, group_col='trial', p_adjust='holm')
		# TO DO: Wilcoxon won't work for mode switches because not truly paired test (conditions have different lengths)
		# pairwise_comparisons = sp.posthoc_wilcoxon(df, val_col=metric, group_col='condition', p_adjust='holm')

		# Only first-vs-second block within the same condition is of interest,
		# so the full set of pairwise combinations is not generated.
		combinations = [('Corrective First', 'Corrective Second'),('Filtered First', 'Filtered Second'),('No Assistance First', 'No Assistance Second')]
		pairs = []
		p_values = []
		for first, second in combinations:
			p_val = pairwise_comparisons.loc[first, second]
			if p_val <= self.alpha:  # significant difference: record where to draw it
				pairs.append([label_to_plot_pos[first], label_to_plot_pos[second]])
				p_values.append(p_val)

		return pairs, p_values
コード例 #8
0
def compare_values(values_to_compare):
    """Compare groups on several values with Kruskal-Wallis and Conover tests.

    For every entry of ``values_to_compare`` (a column of the module-level
    ``avg_props`` frame) a Kruskal-Wallis test is run across
    ``groups_to_compare``; the resulting p-values are Holm-adjusted together.
    Then, per value, the per-group mean and standard deviation, the adjusted
    p-value and the Holm-adjusted Conover table are printed. Non-integer
    values are additionally passed to ``plot_health``.

    Args:
        values_to_compare: column keys of ``avg_props``; integer keys are
            printed via the ``translation`` lookup.
    """
    pvals = []

    for value in values_to_compare:
        samples = [
            avg_props[avg_props['group'] == group_name][value]
            for group_name in groups_to_compare
        ]

        _, pval = stats.kruskal(*samples)
        pvals.append(pval)

    # Correct the omnibus p-values for the number of values tested.
    adj_pvals = multipletests(pvals, alpha=0.05, method='holm')[1]

    for idx, value in enumerate(values_to_compare):
        is_code = isinstance(value, int)
        name = translation[value] if is_code else value
        print("Comparing {}".format(name))

        for group_name in groups_to_compare:
            members = avg_props[avg_props['group'] == group_name]
            # Print the group name directly: ``members['group'][0]`` is a
            # label lookup and fails when the filtered rows do not include
            # index label 0.
            print("{}: {} (+/- {})".format(group_name,
                                           members[value].mean(),
                                           members[value].std()))

        print("H-test adjusted p-value: {}".format(adj_pvals[idx]))
        print()

        # option_context restores the float format even if the test raises.
        with pd.option_context('display.float_format', '{:.3g}'.format):
            print(
                sp.posthoc_conover(avg_props,
                                   val_col=value,
                                   group_col='group',
                                   p_adjust='holm'))
        print()

        if not is_code:
            plot_health(avg_props, value)
コード例 #9
0
def kruskal_posthoc_tests(benchmark_snapshot_df):
    """Return p-value tables for several Kruskal post-hoc tests.

    The tests compare ``edges_covered`` between ``fuzzer`` groups. All tests
    except Nemenyi are called with Holm adjustment. The tables should only be
    considered when the Kruskal test rejects the null hypothesis.

    Returns:
        Dict with the keys ``'mann_whitney'``, ``'conover'``, ``'wilcoxon'``,
        ``'dunn'`` and ``'nemenyi'``, each mapping to that test's result.
    """
    shared = dict(a=benchmark_snapshot_df,
                  group_col='fuzzer',
                  val_col='edges_covered',
                  sort=True)
    adjusted = dict(shared, p_adjust='holm')

    return {
        'mann_whitney': sp.posthoc_mannwhitney(**adjusted),
        'conover': sp.posthoc_conover(**adjusted),
        'wilcoxon': sp.posthoc_wilcoxon(**adjusted),
        'dunn': sp.posthoc_dunn(**adjusted),
        # Nemenyi is called without a p_adjust argument.
        'nemenyi': sp.posthoc_nemenyi(**shared),
    }
コード例 #10
0
 # Kruskal-Wallis test across the five byGeneticDiversity0FR* samples,
 # followed by a Holm-adjusted Conover post-hoc test when p <= 0.1.
 try:
     f_value, p_value = stats.kruskal(byGeneticDiversity0FR0,
                                      byGeneticDiversity0FR1,
                                      byGeneticDiversity0FR2,
                                      byGeneticDiversity0FR3,
                                      byGeneticDiversity0FR4)
     # Tally how many comparisons reach each significance level; the 0.05 and
     # 0.01 counters are only updated inside the 0.1 branch.
     if (p_value <= 0.1):
         numberOfComparisonsp1 += 1
         if (p_value <= 0.05):
             numberOfComparisonsp05 += 1
         if (p_value <= 0.01):
             numberOfComparisonsp01 += 1
         print("KW for failures in column " + str(columnName) +
               " RT " + str(i) + "*** " + str(p_value))
         try:
             # ph0 is defined outside this view -- presumably the same samples
             # in a form accepted by posthoc_conover; TODO confirm.
             pvals = sp.posthoc_conover(ph0, p_adjust='holm')
             # Keep entries with 0 <= p <= 0.1 (the lower bound presumably
             # excludes a negative diagonal marker -- verify).
             truth = np.logical_and(pvals <= 0.1, pvals >= 0)
             if (np.any(truth)):
                 print("significant comparison:")
                 print(pvals)
         except Exception as e:
             print('Could not compute posthoc conover: ' + str(e))
 # NOTE(review): this bare except hides every failure (including
 # KeyboardInterrupt) and prints only an empty line.
 except:
     print("")
 try:
     f_value, p_value = stats.kruskal(byGeneticDiversity1FR0,
                                      byGeneticDiversity1FR1,
                                      byGeneticDiversity1FR2,
                                      byGeneticDiversity1FR3,
                                      byGeneticDiversity1FR4)
     if (p_value <= 0.1):
コード例 #11
0
        cond = (sens_perf['decision'] == decision) & (sens_perf['auction']
                                                      == auc)
        sens_perf.loc[cond,
                      'rank'] = sens_perf.loc[cond,
                                              'delta'].rank(method='average',
                                                            ascending=False)

# %%  ANOVA and posthoc tests
# For every (auction, decision) combination: run a Kruskal-Wallis test across
# the columns listed in delta_perf['names'], then a Holm-adjusted Conover
# post-hoc test on the melted (long-format) table.
anova_perf = {}
for decision in decision_types:
    for auc in auction_types:
        # One array of values per named column.
        data = [
            delta_dict_perf[auc, decision][x].values
            for x in delta_perf['names']
        ]
        H, p = ss.kruskal(*data)
        # Long format: 'variable' holds the column name, 'value' the data.
        # Note that all columns are melted here, not only delta_perf['names'].
        df = pd.melt(delta_dict_perf[auc, decision],
                     id_vars=[],
                     value_vars=delta_dict_perf[auc, decision].columns)
        ph = sp.posthoc_conover(df,
                                val_col='value',
                                group_col='variable',
                                p_adjust='holm')
        # 'anova_p' stores the Kruskal-Wallis p-value.
        anova_perf[auc, decision] = {'anova_p': p, 'posthoc_matrix': ph}

# %%  Manually correct ranks
# Corrected ranks are read from a CSV and assigned by index alignment.
corrected = pd.read_csv('Results_perf.csv')
sens_perf['rank_corrected'] = corrected['rank_corrected']
# Persist all post-processing results in a single pickle file.
with open('postprocess_dicts_sens_synth.pkl', 'wb') as f:
    pickle.dump([corr, sens_perf, delta_dict_perf, anova_perf], f)
コード例 #12
0
# Wrap the diversity values in a one-column frame named after the metric.
div_df = pd.DataFrame(div, columns=[metric])

# Write the raw values as a tab-separated text file.
div_df.to_csv("alphadiversity_" + mname + "_" + var + ".txt", sep="\t")

# Join diversity values with the mapping data column-wise and turn the index
# into a 'sample' column, then order rows by the grouping variable.
combined_df = pd.concat([div_df, map_df], axis=1).reset_index()
combined_df.rename(columns={'index':'sample'}, inplace=True)
combined_df = combined_df.sort_values(by=[var])

#Kruskal-Wallis test
# One list of metric values per distinct level of `var`.
cat_dict = {}
for upar in getattr(combined_df, var).unique():
    cat_dict[str(upar)] = list(combined_df[combined_df[var] == upar][metric])
H, p = ss.kruskal(*cat_dict.values())

#Post-hoc test with Benjamini-Hochberg correction
con = sp.posthoc_conover(combined_df, val_col=metric, group_col=var, p_adjust = 'fdr_bh')
# Write both test results to a text report.
with open("statistics_" + mname + "_" + var + ".txt", "w") as st:
    st.write("Kruskal-Wallis H-test:\n\n")
    st.write("H\t" + str(H) + "\n")
    st.write("p-value\t" + str(p) + "\n\n\n")
    st.write("Conover post-hoc test with Benjamini/Hochberg correction:\n\n")
    st.write(con.to_string())

# Bar plot of the metric per group with standard-deviation error bars.
sns.set_style("ticks", {"ytick.major.size": "2.0"})
ax = sns.barplot(data=combined_df, x=var, y=metric, color=col, ci="sd", errwidth=0.6, capsize=0.1)
sns.despine(right=True)
plt.ylabel(label)

plt.savefig(figname, dpi=dpi)

# Timestamp taken after the figure is saved.
end = time.time()
コード例 #13
0
ファイル: KrusKal-Wallis.py プロジェクト: renato145/DENN
from denn import *
from scipy.stats import kruskal
import scikit_posthocs as sp
import pylustrator

pylustrator.start()

path = Path('../../data/results/experiment4')

# fitness plots: load the 'mof' results of the four experiment variants
no_nn = pd.read_csv(path/'no_nn_mof.csv')
nn_normal_rand = pd.read_csv(path/'nn-normal-random_mof.csv')
nn_dist_rand = pd.read_csv(path/'nn-distribution-random_mof.csv')
nn_dropout_rand= pd.read_csv(path/'nn-dropout-random_mof.csv')
labels = ['no_nn', 'nn_normal_rand', 'nn_dist_rand', 'nn_drop_rand']

# One row of 'mof' values per variant (requires equally long result files).
x=np.array([no_nn.mof, nn_normal_rand.mof, nn_dist_rand.mof,nn_dropout_rand.mof])

# Omnibus test on the same 'mof' samples that the post-hoc test uses.
# Passing the whole DataFrames would hand kruskal 2-D tables, not samples.
stat, p = kruskal(no_nn.mof, nn_normal_rand.mof, nn_dist_rand.mof,
                  nn_dropout_rand.mof)
# Holm-adjusted pairwise Conover comparisons.
pc=sp.posthoc_conover(x, p_adjust='holm', val_col='values', group_col='groups')
print('Statistics=%.3f, p=%.3f' % (stat, p))
print(pc)
heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}

# Significance heatmap, relabelled with the variant names.
ax,cbar = sp.sign_plot(pc, **heatmap_args)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.show()
コード例 #14
0
# Rows for the vocals target scored with the SDR metric.
df_sort_by = agg_df[(agg_df.metric == "SDR") & (agg_df.target == "vocals")]

# Method names ordered by ascending median SDR score.
methods_by_sdr = df_sort_by.score.groupby(
    df_sort_by.method).median().sort_values().index.tolist()

f = plt.figure(figsize=(22, 20))
# resort them by median SDR
# Get sorting keys (sorted by median of SDR:vocals score)
# NOTE(review): the filter below selects metric == "SAR", not "SDR" as the
# comments and variable names suggest -- confirm which metric is intended.
df_voc = agg_df[(agg_df.target == 'vocals') & (agg_df.metric == "SAR")]

targets_by_voc_sdr = df_voc.score.groupby(
    df_voc.method).median().sort_values().index.tolist()

# prepare the pairwise statistics
# Conover test between methods (no p_adjust argument is passed).
pc_voc = sp.posthoc_conover(df_voc,
                            val_col='score',
                            group_col='method',
                            sort=True)
print(pc_voc)

# A second figure is created here; the first one above is left untouched.
f = plt.figure(figsize=(10, 10))
# Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
cmap = ['1', '#ff2626', '#ffffff', '#fcbdbd', '#ff7272']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.90, 0.35, 0.04, 0.3]
}
# Significance heatmap of the pairwise p-values.
sp.sign_plot(pc_voc, **heatmap_args)
コード例 #15
0
# df = pd.DataFrame(rastringin).T
# df.to_excel(excel_writer = "C:/Users/Tulio/Desktop/Mestrado/Busca_e_Otimizacao/search_and_optmization/src/utils/test.xlsx")

# Samples to compare; `a280` is defined outside this view (presumably one
# sequence of results per algorithm -- TODO confirm).
data = a280

# print(statistics.pstdev(data[2]))

# Friedman test across the groups
print(stats.friedmanchisquare(*data))

# Kruskal-Wallis test across the groups
print(stats.kruskal(*data))

# Conover post-hoc test based on Kruskal-Wallis
pc = sp.posthoc_conover(data)

# Rename the index and columns of the resulting DataFrame if needed
# (the three labels assume exactly three groups in `data`).
pc.columns = ['GRASP 2-opt', 'GRASP mBUC', 'HC mBUC']
pc.index = ['GRASP 2-opt', 'GRASP mBUC', 'HC mBUC']

print(pc)

# Heatmap of the Conover test
cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
コード例 #16
0
                  data.loc[data['Symptom'] == 2, 'Knowledge'])

# Report the statistic and p-value of the test computed just above.
print('Statistics=%.3f \n p=%.4f' % (stat, p))

# In[37]:

# 'Knowledge' values split by 'Symptom' level (0, 1, 2): one sample per level.
x = [
    data.loc[data['Symptom'] == 0, 'Knowledge'],
    data.loc[data['Symptom'] == 1, 'Knowledge'], data.loc[data['Symptom'] == 2,
                                                          'Knowledge']
]

# In[38]:

# Post hoc with the Conover test (no p_adjust argument is passed), shown as
# a significance heatmap.
pc = sp.posthoc_conover(x)
heatmap_args = {
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
}
sp.sign_plot(pc, **heatmap_args)

# In[39]:

# Post hoc with pairwise Mann-Whitney tests on the same samples
pc2 = sp.posthoc_mannwhitney(x)
heatmap_args = {
    'linewidths': 0.25,
コード例 #17
0
def compareBases(mainFolders, folders, tp, rep, lastGen, plotType, saveFile):
    """Load experiment logs, test the groups for differences and plot them.

    Log files are read from
    ``./<mainFolder>/<folder>xL/<run>/log/<file>.txt`` for runs ``1..rep``.
    Each row's first CSV field is split on ``" - "`` and the field at the
    column selected by ``tp`` is parsed as a float.

    Args:
        mainFolders: base directory names (the ``'Base'`` factor).
        folders: length directory prefixes (the ``'Length'`` factor).
        tp: what to analyse: ``'evol'``, ``'nModules'``, ``'brokenConn'`` or
            ``'nConn'``; add the suffix ``'Base'`` to group by base only.
        rep: number of repetitions (run directories) per configuration.
        lastGen: if true, use only the last row of each log; otherwise use
            every row.
        plotType: ``'box'``, ``'swarm'``, ``'strip'`` or ``'violin'``; any
            other value draws nothing before the figure is saved.
        saveFile: prefix of the output ``.eps`` file name.

    Raises:
        ValueError: if ``tp`` is not one of the supported analysis types.

    Side effects:
        Prints the Kruskal-Wallis result, may show a Conover significance
        plot, and saves and shows the figure.
    """
    # tp -> (log column, log file stem, displayed variable name)
    settings = {
        'evol': (1, 'evolution', 'Fitness'),
        'evolBase': (1, 'evolution', 'Fitness'),
        'nModules': (2, 'bestFeatures', 'Number of Modules'),
        'nModulesBase': (2, 'bestFeatures', 'Number of Modules'),
        'brokenConn': (11, 'meanFeatures', 'Number of Broken Connections'),
        'brokenConnBase': (11, 'meanFeatures',
                           'Number of Broken Connections'),
        'nConn': (19, 'bestFeatures', 'Average Connections per Module'),
        'nConnBase': (19, 'bestFeatures', 'Average Connections per Module'),
    }
    if tp not in settings:
        # Previously an unknown tp failed later on an unbound variable.
        raise ValueError('Unknown analysis type: {!r}'.format(tp))
    logCol, logName, variable = settings[tp]
    byBase = tp in ('evolBase', 'nModulesBase', 'brokenConnBase', 'nConnBase')

    baseFrames = []
    for base in mainFolders:
        lengthFrames = []
        for length in folders:
            data = []
            for i in range(rep):
                logPath = ('./' + base + '/' + length + 'xL/' + str(i + 1) +
                           '/log/' + logName + '.txt')
                # Context manager so the file handle is always released.
                with open(logPath) as csv_file:
                    rows = [
                        row[0].split(" - ") for row in csv.reader(csv_file)
                    ]
                if lastGen:
                    data.append(float(rows[-1][logCol]))
                else:
                    data.extend(float(row[logCol]) for row in rows)

            dfPartial = pd.DataFrame(data, columns=[variable])
            dfPartial['Length'] = length
            lengthFrames.append(dfPartial)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        dfBase = (pd.concat(lengthFrames, ignore_index=True)
                  if lengthFrames else pd.DataFrame())
        dfBase['Base'] = base
        baseFrames.append(dfBase)

    dfAll = (pd.concat(baseFrames, ignore_index=True)
             if baseFrames else pd.DataFrame())

    # One sample per group: by base only, or by (length, base).
    groupKeys = ['Base'] if byBase else ['Length', 'Base']
    samples = [
        group[variable].values for name, group in dfAll.groupby(groupKeys)
    ]
    print(scp_stats.kruskal(*samples))

    if not byBase and tp != 'brokenConn':
        # Conover post-hoc test on the same (length, base) samples.
        postHoc = sp.posthoc_conover(samples)

        heatmap_args = {
            'linewidths': 0.25,
            'linecolor': '0.5',
            'clip_on': False,
            'square': True,
            'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
        }
        sp.sign_plot(postHoc, **heatmap_args)

    plt.figure(figsize=(15, 10))

    if byBase:
        plotArgs = dict(data=dfAll, x='Base', y=variable, order=mainFolders)
    else:
        plotArgs = dict(data=dfAll,
                        x="Length",
                        y=variable,
                        order=folders,
                        hue="Base")

    plotters = {
        'box': sns.boxplot,
        'swarm': sns.swarmplot,
        'strip': sns.stripplot,
        'violin': sns.violinplot,
    }
    plotter = plotters.get(plotType)
    if plotter is not None:
        plotter(**plotArgs)

    plt.savefig(saveFile + tp + plotType + '.eps', bbox_inches="tight")
    plt.show()
コード例 #18
0
ファイル: stat_test.py プロジェクト: GAIPS/ILU-RL
        lbls = args.labels # Custom labels.
    else:
        lbls = [Path(exp_path).name for exp_path in args.experiments_paths] # Default labels.


    # Shapiro tests.
    # Shapiro-Wilk normality test on the travel times of each experiment.
    print('Shapiro tests:')
    for (df, lbl) in zip(dfs,lbls):
        shapiro_test = stats.shapiro(df['travel_time'])
        print(f'\t{lbl}: {shapiro_test}')

    # NOTE: `args` is rebound here to the list of travel-time samples; the
    # earlier `args` object (used above for labels/paths) is no longer
    # reachable under this name afterwards.
    args = [df['travel_time'] for df in dfs]
    print(f'\nLevene\'s test: {stats.levene(*args)}')

    print(f'\nANOVA test: {stats.f_oneway(*args)}')

    # Flatten all samples into parallel value / label lists for Tukey's HSD.
    data = []
    groups = []
    for (df, lbl) in zip(dfs, lbls):
        data.extend(df['travel_time'].tolist())
        groups.extend([lbl for _ in range(len(df['travel_time'].tolist()))])

    print('\nTukeyHSD:', pairwise_tukeyhsd(data, groups))

    # Non-parametric test.
    print('\nKruskal (non-parametric) test:', stats.kruskal(*args))

    # Post-hoc non-parametric comparisons.
    # Holm-adjusted Conover test on one list of travel times per experiment.
    data = [df['travel_time'].tolist() for df in dfs]
    print(sp.posthoc_conover(data, p_adjust = 'holm'))
コード例 #19
0
def vary_thresholds(fn, thresholds, cthres):
    """Plot Kruskal-Wallis and Conover q-values as the misfit threshold varies.

    For every sample of every survey year the misfits over ``krange`` are
    computed (cached and in parallel). For each threshold, the number of
    misfits above the threshold plus one is recorded per year, unless all
    misfits exceed it. The per-year distributions are then compared with a
    Kruskal-Wallis test and a Conover post-hoc test between consecutive years.

    Args:
        fn: field data source handed to ``read_field_data_year``.
        thresholds: iterable of misfit thresholds to evaluate.
        cthres: the threshold actually used elsewhere; marked on the plot.

    Side effects:
        Writes ``figures/vary_thresholds_gamma=<gamma>.txt`` (test results)
        and ``figures/vary_thresholds_gamma=<gamma>.pdf`` (the plot).
        Thresholds whose tests fail are recorded as NaN.
    """
    pp = PdfPages('figures/vary_thresholds_gamma={}.pdf'.format(gamma))

    years = ["1996", "2001", "2007", "2012"]

    def ind(yr):
        return years.index(yr)

    _, ax = plt.subplots(1, 1)
    ax.grid(False)

    # color-blind spectrum: http://personal.sron.nl/~pault/colourschemes.pdf
    colors = [
        "#88ccee", "#44aa99", "#999933", "#DDCC77", "#CC6677", "#882255",
        "#AA4499"
    ]

    # Crude per-threshold bookkeeping
    hm = {t: {v: [] for v in krange} for t in thresholds}
    total = {t: {ind(y): 0 for y in years} for t in thresholds}
    byyear = {t: {ind(y): [] for y in years} for t in thresholds}
    # Consecutive year pairs to compare in the post-hoc test.
    compy = [(years[k], years[k + 1]) for k in range(len(years) - 1)]

    def process(tup):
        x, yr, sample = tup
        # Run the misfits calculation
        mf = s.misfits(x, krange, gamma=gamma)
        return x, mf, yr, sample

    cached_process = memory.cache(process)

    for year in tqdm(years):
        fdata = Parallel(n_jobs=num_cores)(
            delayed(cached_process)(tup)
            for tup in read_field_data_year(fn, [year]))

        for x, mf, yr, sample in fdata:
            mf = np.array(mf)
            # Find first drop below threshold
            tqdm.write("Length:{}. Interesting bases:{}. Misfits: {}".format(
                len(x), len([xx for xx in x if 1.0 - 1e-6 > xx > 1e-6]), mf))
            for thres in thresholds:
                best = sum(mf > thres)
                if best >= len(mf):
                    # No misfit at or below the threshold: skip the sample.
                    continue
                best += 1
                if best not in hm[thres]:
                    hm[thres][best] = []
                byyear[thres][ind(year)] += [best]
                hm[thres][best] += [ind(year)]
                total[thres][ind(year)] += 1

    x = {}
    kruskals = {}
    conovers = {(a, b): {} for (a, b) in compy}
    # The context manager closes the report even if something unexpected
    # escapes the loop.
    with open('figures/vary_thresholds_gamma={}.txt'.format(gamma),
              'w') as of:
        for thres in tqdm(thresholds):
            x[thres] = [byyear[thres][ind(y)] for y in years]

            try:
                kr = sts.kruskal(*x[thres])
                kruskals[thres] = kr[1]  # p-value

                of.write("\n{}\n{}\tKruskal-Willis:\n{}\n".format(
                    "*" * 80, thres, kr))

                pc = sp.posthoc_conover(x[thres],
                                        val_col='values',
                                        group_col='groups',
                                        p_adjust='fdr_tsbky')
                for (a, b) in compy:
                    # Groups are labelled 1..k, hence the +1 offsets.
                    conovers[(a, b)][thres] = pc[ind(a) + 1][ind(b) + 1]

                of.write("{}\tConover:\n{}\n".format(thres, pc))
            except Exception:
                # Best effort: record NaN for this threshold and carry on.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                tqdm.write('Exception with threshold {}, NaNing'.format(thres))

                of.write("{}\tKruskal-Willis:\nNaN\n".format(thres))
                of.write("{}\tConover:\nNaN\n".format(thres))
                kruskals[thres] = float('nan')
                for (a, b) in compy:
                    conovers[(a, b)][thres] = float('nan')

    plt.ylabel('$q$-value')
    plt.xlabel('MOI Misfit Threshold ($T$)')
    plt.yscale('log')
    plt.xscale('log')

    plt.plot(thresholds, [kruskals[t] for t in thresholds],
             color='orange',
             linewidth=1.3,
             alpha=0.7,
             label='Kruskal-Willis')
    for i, (a, b) in enumerate(compy):
        plt.plot(thresholds, [conovers[a, b][t] for t in thresholds],
                 color=colors[i],
                 alpha=0.7,
                 label='Conover-Imam %s vs. %s' % (a, b))
    plt.legend(prop={'size': 10}, loc='lower right')

    # Vertical marker at the threshold used in the main analysis.
    plt.axvline(x=cthres,
                color='k',
                linestyle='--',
                linewidth=0.5,
                label='',
                alpha=0.8)
    plt.text(cthres,
             1e-5,
             'Threshold used',
             horizontalalignment='left',
             size='small',
             color='k',
             alpha=0.8)

    # Horizontal marker at the 0.05 significance level.
    plt.axhline(y=0.05,
                color='b',
                linestyle='--',
                linewidth=0.5,
                label='0.05',
                alpha=0.7)
    plt.text(cthres,
             0.05,
             '$q$=0.05',
             horizontalalignment='right',
             size='small',
             color='b',
             alpha=0.7)

    pp.savefig()
    pp.close()
コード例 #20
0
            return pd.pivot_table(df,
                                  index=x,
                                  columns=c,
                                  values=y,
                                  aggfunc="count")


for i in group:  # print the information for every group
    fenbu(i)

# Analysis of variance and post-hoc test
f, p = stats.f_oneway(*args)
print(f, p)

# One list of observations per group for the pairwise comparison.
x = [list(args[1]), list(args[2]), list(args[3])]
# With list input posthoc_conover labels the groups itself, so no column
# names are passed (the previous call passed the data list as a column name).
print(sp.posthoc_conover(x, p_adjust='holm'))

# Independent-samples t-test
ttest_group1 = df[df['GHQ分2类(1-很好;2-较差)'] == 1]['GHQ总分']
ttest_group2 = df[df['GHQ分2类(1-很好;2-较差)'] == 2]['GHQ总分']

group_mean = df.groupby('GHQ分2类(1-很好;2-较差)')
group_mean['GHQ总分.1'].agg("mean")

t, p = stats.ttest_ind(ttest_group1, ttest_group2)
print(t, p)

#事后检验
x = pd.DataFrame({
    "k": [1, 2, 4, 5, 6],
    "j": [1, 3, 5, 7, 66],
コード例 #21
0
        nasa_df[nasa_df.experiment_type == "CONTROL"][item],
        nasa_df[nasa_df.experiment_type == "BUTTON"][item],
        nasa_df[nasa_df.experiment_type == "TOUCH"][item],
        center='median'
        )

    # If any of the four normality p-values (computed above this view) is
    # below 0.05, use the non-parametric branch; otherwise use
    # repeated-measures ANOVA with Tukey HSD.
    if norm_p1 < 0.05 or norm_p2 < 0.05 or norm_p3 < 0.05 or norm_p4 < 0.05:
        # Friedman test across the four experiment types for this item.
        _, anova_p = stats.friedmanchisquare(
            nasa_df[nasa_df.experiment_type == "BASELINE"][item],
            nasa_df[nasa_df.experiment_type == "CONTROL"][item],
            nasa_df[nasa_df.experiment_type == "BUTTON"][item],
            nasa_df[nasa_df.experiment_type == "TOUCH"][item],
        )
        print("anova(friedman test)", anova_p)
        # Post-hoc comparisons only when the omnibus test is significant.
        # NOTE(review): posthoc_conover is called without p_adjust and is the
        # Kruskal-Wallis-style post-hoc test -- confirm it is the intended
        # follow-up to a Friedman test.
        if anova_p < 0.05:
            print(sp.posthoc_conover(nasa_df, val_col=item, group_col="experiment_type"))
    else:
        # Long format: one row per (name, experiment_type, type) with its rate.
        melted_df = pd.melt(nasa_df, id_vars=["name", "experiment_type"],  var_name="type", value_name="rate")
        # Repeated-measures ANOVA on this item, with 'name' as the subject
        # and 'experiment_type' as the within-subject factor.
        aov = stats_anova.AnovaRM(melted_df[melted_df.type == item], "rate", "name", ["experiment_type"])
        print("reperted anova: ", aov.fit())
        # Tukey HSD pairwise comparison between experiment types.
        multicomp_result = multicomp.MultiComparison(nasa_df[item], nasa_df.experiment_type)
        print(multicomp_result.tukeyhsd().summary())



# Long format for plotting: the first two columns of nasa_df are kept as
# identifiers, every other column becomes an ('args', 'value') row.
melted_df = pd.melt(nasa_df, id_vars=nasa_df.columns.values[:2], var_name="args", value_name="value")
# plot = sns.boxplot(x='args', y="value", hue="experiment_type", data=melted_df,showmeans=True, meanline=True, meanprops={"linestyle":"--", "color":"Red"})
# Grouped bar plot: one cluster per scale, one bar per experiment type.
axes = sns.barplot(x='args', y="value", hue="experiment_type", data=melted_df)
axes.set_ylim([0, 10])
axes.set_ylabel('Workload Rating', fontsize=15)
axes.set_xlabel('Scale', fontsize=15)
コード例 #22
0
    df_acc.method).median().sort_values().index.tolist()

# Methods present in both orderings, kept in the vocals order.
targets_by_voc_sdr_acc = [
    x for x in targets_by_voc_sdr if x in targets_by_acc_sdr
]

# get the two sortings
# Make 'method' an ordered categorical following each ranking.
# astype('category', categories=..., ordered=...) was removed from pandas;
# set_categories gives the same result (values outside the list become NaN).
df_voc['method'] = df_voc['method'].astype('category').cat.set_categories(
    targets_by_voc_sdr, ordered=True)
df_acc['method'] = df_acc['method'].astype('category').cat.set_categories(
    targets_by_acc_sdr, ordered=True)

# prepare the pairwise plots
# Conover test between methods (no p_adjust argument is passed).
pc_voc = sp.posthoc_conover(df_voc, val_col='score', group_col='method')
pc_acc = sp.posthoc_conover(df_acc, val_col='score', group_col='method')

f = plt.figure(figsize=(10, 10))
# Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
cmap = ['1', '#ff2626', '#ffffff', '#fcbdbd', '#ff7272']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.90, 0.35, 0.04, 0.3]
}
# Significance heatmap for the vocals comparison.
sp.sign_plot(pc_voc, **heatmap_args)
コード例 #23
0
# Kruskal–Wallis one-way analysis of variance
# One test per outcome variable, comparing the four quartile groups (0..3)
# taken from the df_grouped_quart groupby object.
hstat_quart_lsnum, hpval_quart_lsnum = stats.kruskal(
    *[df_grouped_quart.get_group(i).ls_num for i in range(4)])
hstat_quart_lsgsenum, hpval_quart_lsgsenum = stats.kruskal(
    *[df_grouped_quart.get_group(i).ls_gse_num for i in range(4)])
hstat_quart_lsprivnum, hpval_quart_lsprivnum = stats.kruskal(
    *[df_grouped_quart.get_group(i).ls_priv_num for i in range(4)])
hstat_quart_secnum, hpval_quart_secnum = stats.kruskal(
    *[df_grouped_quart.get_group(i).sec_num for i in range(4)])
'''NOTE: The Kruskal-Wallis test is to try out stuff. At least one group is dominating the other.
    As another try-out, let's try a conover test.
'''

# Conover test, p-adjust????
# Pairwise Conover tests per outcome, grouped by 'quart_distance'. No p_adjust
# argument is passed, so the open question above about adjustment stands.
cpval_quart_lsnum = sp.posthoc_conover(df,
                                       val_col='ls_num',
                                       group_col='quart_distance')
cpval_quart_lsgsenum = sp.posthoc_conover(df,
                                          val_col='ls_gse_num',
                                          group_col='quart_distance')
cpval_quart_lsprivnum = sp.posthoc_conover(df,
                                           val_col='ls_priv_num',
                                           group_col='quart_distance')
cpval_quart_secnum = sp.posthoc_conover(df,
                                        val_col='sec_num',
                                        group_col='quart_distance')

#------------------------------------------------------------
# Decile grouping
#df['dec_distance'] = pd.qcut(df.log_min_distance, q = 10, labels = False)
コード例 #24
0
                                          get_precision(smote),
                                          get_precision(smote_data_aug))
# The four result sets being compared, in the order every test below uses.
_methods = (baseline, umce, smote, smote_data_aug)

# Kruskal-Wallis omnibus tests across the four methods.
recall_stat, recall_p_val = kruskal(*[get_recall(m) for m in _methods])
f1_stat, f1_p_val = kruskal(*[get_f1(m) for m in _methods])

for _label, _stat, _p_val in (
        ("precision: ", precision_stat, precision_p_val),
        ("recall: ", recall_stat, recall_p_val),
        ("f1: ", f1_stat, f1_p_val),
):
    print(_label, _stat, _p_val)

# Pairwise Conover post-hoc comparisons, one matrix per metric.
print("precision: ")
posthoc_precison = posthoc_conover([get_precision(m) for m in _methods])
print(posthoc_precison)

print("recall: ")
posthoc_recall = posthoc_conover([get_recall(m) for m in _methods])
print(posthoc_recall)

print("f1: ")
posthoc_f1 = posthoc_conover(
コード例 #25
0
def _plot_strain_histogram(ax, years, hm, total, lk, colors):
    """Draw the stacked strain-count histogram on ``ax`` and label its segments.

    Args:
        ax: Matplotlib axes to draw on.
        years: Survey-year labels, in x-axis order.
        hm: Maps each strain count to the year indices (one entry per sample)
            of the samples assigned that count.
        total: Maps a year index to the number of samples counted that year.
        lk: Maps year label -> strain count -> number of samples.
        colors: One bar colour per entry of the module-level ``krange``.
    """
    ax.set_ylabel('% samples')
    ax.set_xlabel('Survey year')
    ax.set_xticks(np.arange(len(years)))
    ax.set_xticklabels(years)
    ax.set_ylim([0, 100])

    m = [hm[v] for v in krange]
    # Every sample weighs 100/N of its year so each year's stack is in percent.
    # Kept as plain nested lists: the rows have different lengths, and
    # np.array() rejects such ragged input on recent NumPy releases.
    weights = [[100.0 / float(total[int(y)]) for y in hm[v]] for v in krange]
    bins = np.arange(len(years) + 1) - 0.5
    _, _, patches = ax.hist(
        m,
        bins=bins,
        histtype='bar',
        stacked=True,
        weights=weights,
        rwidth=0.5,
        color=colors,
        label=[
            "%s%d strain%s" % ("=" if v != krange[-1] else r"$\geq$", v,
                               "s" if v != krange[0] else "")
            for v in krange
        ])
    ax.legend(
        bbox_to_anchor=(1.04, 0.5),
        loc="center left",
        borderaxespad=0,
        prop={'size': 10},
    )

    # Write the (truncated) percentage in the middle of every non-empty segment.
    for j, bc in enumerate(patches):
        for i, p in enumerate(bc):
            count = lk[years[i]][krange[j]]
            if count == 0:
                continue
            print("{} {}".format(p, count))
            z = 100.0 * count / float(sum(lk[years[i]].values()))
            ax.text(p.get_x() + p.get_width() / 2.,
                    p.get_y() + p.get_height() / 2.,
                    "%d%%" % int(z),
                    ha="center",
                    va="center",
                    color="black",
                    fontsize=12,
                    fontweight="bold")


def field_longitudinal(fn, thres):
    """Tabulate, plot and test the per-sample strain count by survey year.

    For every sample returned by ``read_field_data_year`` the misfits over
    ``krange`` are computed. The sample's strain count is one more than the
    number of misfit values above ``thres``; samples whose misfits are all
    above ``thres`` are skipped.

    Files written (all under ``figures/``):

    * ``strains.out`` - one DOMSTR line per sample with its largest strain
      proportion, and one STRAIN line per strain with proportion >= 5%.
    * ``field_longitudinal_gamma=<gamma>_thres=<thres>.txt`` - per-year
      counts, means and medians, followed by Mann-Whitney, Kruskal-Wallis
      and Conover test results.
    * ``field_longitudinal_gamma=<gamma>_thres=<thres>.pdf`` - the stacked
      histogram of strain counts per year.

    Relies on the module-level names ``gamma``, ``krange``, ``s``,
    ``memory`` and ``num_cores``.

    Args:
        fn: Field-data source, passed through to ``read_field_data_year``.
        thres: Misfit threshold used to pick the strain count.
    """
    years = ["1996", "2001", "2007", "2012"]

    def ind(yr):
        return years.index(yr)

    fig, ax = plt.subplots(1, 1)
    ax.grid(False)

    # color-blind spectrum: http://personal.sron.nl/~pault/colourschemes.pdf
    colors = [
        "#88ccee", "#44aa99", "#999933", "#DDCC77", "#CC6677", "#882255",
        "#AA4499"
    ][:len(krange)]

    hm = {v: [] for v in krange}  # strain count -> year index per sample
    total = {ind(y): 0 for y in years}  # year index -> samples counted
    byyear = {ind(y): [] for y in years}  # year index -> strain counts

    # NOTE: process_vc is intentionally left exactly as it was. `memory` is
    # presumably a joblib.Memory, which invalidates cached results when the
    # cached function's source text changes - TODO confirm.
    def process_vc(tup):
        x, yr, sample = tup
        # Run the misfits calculation
        mf = s.misfits(x, krange, gamma=gamma)
        sr = {k: s.compute(x, k, gamma=gamma) for k in krange}
        return x, mf, yr, sample, sr

    cached_process = memory.cache(process_vc)

    with open('figures/strains.out', 'w') as stf:
        for year in tqdm(years):
            year_idx = ind(year)
            fdata = Parallel(n_jobs=num_cores)(
                delayed(cached_process)(tup)
                for tup in read_field_data_year(fn, [year]))

            for x, mf, yr, sample, sr in fdata:
                mf = np.array(mf)
                tqdm.write(
                    "Length:{}. Interesting bases:{}. Misfits: {}".format(
                        len(x),
                        len([xx for xx in x if 1.0 - 1e-6 > xx > 1e-6]), mf))
                # Number of misfits still above the threshold; stored as a
                # plain int so the report prints the same on every NumPy.
                best = int(np.count_nonzero(mf > thres))
                if best >= len(mf):
                    continue
                best += 1
                hm.setdefault(best, []).append(year_idx)
                byyear[year_idx].append(best)
                total[year_idx] += 1

                # Print out the strain sequences of at least 5% proportion
                # in a sample
                assert best in sr
                strains, fractions = sr[best][0], sr[best][1]
                print(fractions, max(fractions))
                # Proportions are fractions, so scale to percent here exactly
                # as the STRAIN lines below do.
                stf.write("DOMSTR\t{}\t{:.2f}%\n".format(
                    yr, 100.0 * max(fractions)))
                for i, f in enumerate(fractions):
                    if f >= 0.05:
                        stf.write("STRAIN\t{}\t{}\t{}\t{:.2f}%\t{}\n".format(
                            sample, yr, i, 100.0 * f, "".join(
                                "{}".format(int(z))
                                if not np.isnan(z) else "N"
                                for z in strains[i])))

    # Number of samples per (year, strain count); shared by report and plot.
    lk = {
        year: {v: hm[v].count(ind(year))
               for v in krange}
        for year in years
    }

    report_path = 'figures/field_longitudinal_gamma={}_thres={}.txt'.format(
        gamma, thres)
    pdf_path = 'figures/field_longitudinal_gamma={}_thres={}.pdf'.format(
        gamma, thres)

    with open(report_path, 'w') as of:
        for year in years:
            picks = byyear[ind(year)]
            of.write("Year %s\t" % year +
                     "\t".join("%d=%d" % (v, lk[year][v]) for v in krange))
            of.write("\tAverage (including 5+):\t%2.4f"
                     "\tAverage (excluding 5):\t%2.4f" %
                     (np.mean(picks), np.mean([sc for sc in picks
                                               if sc < 5])))
            of.write("\tMedian:\t%2.4f\n" % np.median(picks))

        _plot_strain_histogram(ax, years, hm, total, lk, colors)
        with PdfPages(pdf_path) as pp:
            pp.savefig(fig, bbox_inches="tight")

        for y in years:
            of.write("%s: length %d\n" % (y, len(byyear[ind(y)])))
        of.write("{}\n".format(byyear[ind("1996")]))

        of.write("H1\t{}\t1996 vs 2001:\t{}\n".format(
            thres, sts.mannwhitneyu(byyear[ind("1996")],
                                    byyear[ind("2001")])))
        of.write("H2\t{}\t2007 vs 2012:\t{}\n".format(
            thres, sts.mannwhitneyu(byyear[ind("2007")],
                                    byyear[ind("2012")])))

        groups = [byyear[ind(y)] for y in years]

        kr = sts.kruskal(*groups)
        # Label spelling ("Willis") kept as-is so that anything parsing
        # existing reports keeps working.
        of.write("Kruskal-Willis:\n{}\n".format(kr))

        pc = sp.posthoc_conover(groups,
                                val_col='values',
                                group_col='groups',
                                p_adjust='fdr_tsbky')
        of.write("Conover:\n{}\n".format(pc))

    # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {
        'cmap': cmap,
        'linewidths': 0.25,
        'linecolor': '0.5',
        'clip_on': False,
        'square': True,
        'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
    }
    # NOTE(review): this runs after the PDF has been written and no axes are
    # passed, so the significance plot is not saved by this function -
    # confirm whether it should be.
    sp.sign_plot(pc, **heatmap_args)
コード例 #26
0
# In[141]:


# Restrict the comparison to five countries and keep only the grouping column
# and the measured value. The trailing space in 'Life expectancy ' is part of
# the column name as used here.
countries_of_interest = [
    'Slovenia', 'Denmark', 'Cyprus', 'Japan', 'Switzerland'
]
data = df[df['Country'].isin(countries_of_interest)]
df2 = data[['Country', 'Life expectancy ']]


# In[142]:


# Pairwise Conover post-hoc test of life expectancy between the countries.
scikit_posthocs.posthoc_conover(a=df2,
                                val_col='Life expectancy ',
                                group_col='Country')


# The p-value for the country pairs (Cyprus, Japan), (Cyprus, Switzerland), (Denmark, Japan), (Denmark, Switzerland), (Japan, Slovenia) and (Slovenia, Switzerland) is less than alpha, so life expectancy differs significantly between the two countries in each of these pairs.

# ### Test-4
# <b>Test the claim that population depends upon the status and Year</b>

# In[143]:


# Keep only the three columns used by this test and drop every row that has a
# missing value in any of them.
df3 = df[['Status', 'Year', 'Population']].dropna()
# Bare expression: displays the frame when run as a notebook cell.
df3


# The null and alternative hypothesis<br>
コード例 #27
0
    tracker.write("\nANOVA results: cor %f| p %f\n" % cor)
else:
    cor = stats.kruskal(
        df['total_score'][(df['conflict'] == "Standoff")
                          & (df['is_kashmir'] == True)],
        df['total_score'][(df['conflict'] == "Mumbai")
                          & (df['is_kashmir'] == True)],
        df['total_score'][(df['conflict'] == "Burhan")
                          & (df['is_kashmir'] == True)],
        df['total_score'][(df['conflict'] == "Non-conflict")
                          & (df['is_kashmir'] == True)])
    tracker.write("\nKruskal Wallis results: cor %f| p %f\n" % cor)

# Pairwise Conover post-hoc test of total_score between conflict periods,
# restricted to rows flagged is_kashmir, appended to the tracker output.
tracker.write("\nConover Post Hoc Test\n")
# `== True` is kept on purpose: it also yields a boolean mask when the flag
# column is not strictly boolean.
_kashmir_rows = df[df["is_kashmir"] == True]
_kashmir_conover = sp.posthoc_conover(_kashmir_rows,
                                      val_col='total_score',
                                      group_col='conflict')
_kashmir_conover.to_csv(tracker, mode="a")

# HYPOTHESIS 3: Pakistan conflict vs. Pakistan non-conflict
tracker.write(
    "\n\n\n\nHYPOTHESIS 3: Pakistan-related headlines will have more negative sentiment scores on average in conflict periods than Pakistan-related headlines in non-conflict  periods\r\n"
)

# Descriptive summary of total_score for every (is_pakistan, conflict) group.
_pakistan_scores = df.groupby(['is_pakistan', 'conflict'])['total_score']
_pakistan_summary = rp.summary_cont(_pakistan_scores)
_pakistan_summary.to_csv(tracker, mode="a")

levene = stats.levene(
    df['total_score'][(df['conflict'] == "Standoff")
                      & (df['is_pakistan'] == True)],
    df['total_score'][(df['conflict'] == "Mumbai")
コード例 #28
0
 def posthoc(self, df, x, y):
     """Return pairwise Conover post-hoc p-values of ``y`` between groups of ``x``.

     Only columns ``x`` and ``y`` are used, and rows with a missing value in
     either are dropped first. P-values are adjusted with the 'fdr_bh'
     method.
     """
     complete_rows = df[[x, y]].dropna()
     return sp.posthoc_conover(complete_rows,
                               val_col=y,
                               group_col=x,
                               p_adjust='fdr_bh')