Example 1
import numpy as np
import matplotlib.pyplot as plt
import scikit_posthocs as sp
from scipy import stats
from jmetal.util.termination_criterion import StoppingByEvaluations
# CloneAlg and DataObserver are user-defined (see the sketch after this example);
# dim, mut_pb and repetitions are module-level settings


def solve(problem, cloning_param, mutation):
    final_data = []
    final_problem = problem(dim)
    final_mutation = mutation(mut_pb, 20)

    for x in range(repetitions):
        algorithm = CloneAlg(
            problem=final_problem,
            population_size=100,
            offspring_population_size=100,
            mutation=final_mutation,
            cloning_param=cloning_param,
            termination_criterion=StoppingByEvaluations(max_evaluations=5000))
        data = []
        dataobserver = DataObserver(1.0, data)
        algorithm.observable.register(observer=dataobserver)
        algorithm.run()
        final_data.append(data)

    trans_list = np.array(final_data).T.tolist()

    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_axes([0, 0, 1, 1])
    bp = ax.boxplot(trans_list)
    plt.title(
        "Problem: {0} benchmark, dim: {1}, cloning_param: {2}, mutation: {3}".
        format(final_problem.get_name(), dim, algorithm.get_cloning_param(),
               final_mutation.get_name()))
    plt.show()

    # Kruskal-Wallis and Dunn tests (print the Dunn table, otherwise it is discarded)
    print(stats.kruskal(trans_list[0], trans_list[1], trans_list[-1]))
    print(sp.posthoc_dunn([trans_list[0], trans_list[1], trans_list[-1]],
                          p_adjust='holm'))
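
Examples 1 and 9 rely on a DataObserver class that is not shown. A minimal sketch, assuming the jMetalPy Observer interface (the attribute names and the bookkeeping are assumptions):

from jmetal.core.observer import Observer


class DataObserver(Observer):

    def __init__(self, frequency: float, data: list) -> None:
        # record one fitness value every `frequency` evaluations (assumed)
        self.observe_frequency = frequency
        self.data = data

    def update(self, *args, **kwargs):
        evaluations = kwargs['EVALUATIONS']
        if evaluations % self.observe_frequency == 0:
            solutions = kwargs['SOLUTIONS']
            best = solutions[0] if isinstance(solutions, list) else solutions
            self.data.append(best.objectives[0])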
Example 2
def plot_statistical_test(df, column, group_names, title, savefile):
    """
    Performs a dunn test,  plots and saves results
    :param df: the dataframe with the results, we assume the existence of column 'config'
    :param column: column in the dataframe representing the random variable to be tested
    :param group_names: map or function mapping configs to strings (group names), can be a pd.DataSeries
    :param title: title for the plot
    :param savefile: pdf or png file to save the results to
    :return:
    """
    dunnRes = sp.posthoc_dunn(df,
                              group_col='config',
                              val_col=column,
                              p_adjust='holm')

    # reorder configs
    # dunnRes = dunnRes.loc[cols, cols] # cols must be a permutation of the group names
    dunnRes = dunnRes.rename(group_names, axis=0).rename(group_names, axis=1)

    columnOrder = dunnRes.columns

    # melt for plotting
    dunnRes = dunnRes.reset_index().melt(id_vars='index',
                                         value_vars=dunnRes.columns)
    dunnRes['stat'] = dunnRes.value.map(stat_range_values)

    # plot
    p = ggplot(dunnRes, aes('index', 'variable', fill='factor(stat)'))\
        + geom_tile(aes(width=.95, height=.95)) + ggtitle(title)\
        + scale_fill_manual(values=stat_colors, name='p value') \
        + scale_x_discrete(limits=columnOrder) \
        + scale_y_discrete(limits=columnOrder) \
        + labs(x='', y='', title=title) \
        + theme(axis_text_x=element_text(rotation=45, hjust=1))
    ggsave(p, savefile, dpi=300)
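
The globals stat_range_values and stat_colors are not shown. A plausible sketch, assuming the usual significance bins (both the names and the cut-offs are assumptions):

# hypothetical binning helper and palette assumed by plot_statistical_test
def stat_range_values(p):
    # bin a p-value into one of four significance levels
    if p < 0.001:
        return 'p < 0.001'
    if p < 0.01:
        return 'p < 0.01'
    if p < 0.05:
        return 'p < 0.05'
    return 'n.s.'

stat_colors = ['#b2182b', '#ef8a62', '#fddbc7', '#f7f7f7']  # one color per bin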
Example 3
def kruskal_wallis(df, cat_col, num_col, notebook=True):
    """
    Perform kruskal wallis test between the selected columns of the given dataframe.
    Columns need to be continuous
    :param df:
    :param cat_group:
    :param num_col:
    :return:
    """
    variables = []
    for idx, cat_group in df.groupby(cat_col):
        # NAN values not included in computation
        variables.append(cat_group[num_col][cat_group[num_col].notnull()])
    kruskal_h, kruskal_p = ss.kruskal(*variables)
    if notebook:
        print(f"H-Value: {kruskal_h}, p-Value: {kruskal_p}")
    output = f"\tTest: Kruskal-Wallis\n"
    output += f"\tH-Value: {kruskal_h}, p-Value: {kruskal_p}\n"
    if kruskal_p <= 0.05:
        output += "\tSignificance found \n"
        output += "\tPost-Hoc Tests: Dunns with Bonferonni Correction\n"
        # Remove nan values
        selector = df[cat_col].notnull() & df[num_col].notnull()
        posthoc_data = df[selector]
        posthoc_result = sp.posthoc_dunn(posthoc_data,
                                         num_col,
                                         cat_col,
                                         p_adjust="bonferroni")
        if notebook:
            print(posthoc_result)
        output += str(posthoc_result)
        output += "\n"

    return output
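
A hypothetical call, assuming the module-level imports used above (scipy.stats as ss, scikit_posthocs as sp) and invented column names:

import pandas as pd

df = pd.DataFrame({
    'diet': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
    'weight': [2.1, 2.3, 2.0, 3.4, 3.1, 3.3, 2.8, 2.7, None],
})
report = kruskal_wallis(df, cat_col='diet', num_col='weight', notebook=False)
print(report)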
Example 4
def my_kruskal_3samp(x1, x2, x3, tag1, tag2, tag3):

    n1 = len(x1)
    n2 = len(x2)
    n3 = len(x3)
    sample_sizes = (n1, n2, n3)
    mdn1, mdn2, mdn3 = np.median(x1), np.median(x2), np.median(x3)
    lqr1, hqr1 = np.quantile(x1, 0.25), np.quantile(x1, 0.75)
    lqr2, hqr2 = np.quantile(x2, 0.25), np.quantile(x2, 0.75)
    lqr3, hqr3 = np.quantile(x3, 0.25), np.quantile(x3, 0.75)
    kruskal_results = kruskal(x1, x2, x3)
    Hstat, kruskal_p = kruskal_results

    dunntable = posthoc_tests.posthoc_dunn([x1, x2, x3],
                                           p_adjust='fdr_bh').to_numpy()
    p12 = dunntable[0, 1]
    p23 = dunntable[1, 2]
    p13 = dunntable[0, 2]
    dunn_pvals = (p12, p23, p13)

    ktxt = r'Kruskal-Wallis, %s $(n=%d)$ vs %s $(n=%d)$ vs %s $(n=%d)$, $H_{(2)}=%0.2f, p=%s$'% \
           (tag1, n1, tag2, n2, tag3, n3, Hstat, p2str(kruskal_p))

    dunntxt12 = r'%s vs %s, $p=%s$' % (tag1, tag2, p2str(p12))
    dunntxt23 = r'%s vs %s, $p=%s$' % (tag2, tag3, p2str(p23))
    dunntxt13 = r'%s vs %s, $p=%s$' % (tag1, tag3, p2str(p13))
    dunntxt = r'\textit{post hoc} Dunn’s test with Benjamini-Hochberg correction: %s, %s, %s' % (
        dunntxt12, dunntxt23, dunntxt13)

    txt = ktxt + '; ' + dunntxt

    descrips = ((mdn1, lqr1, hqr1), (mdn2, lqr2, hqr2), (mdn3, lqr3, hqr3))

    return kruskal_p, dunn_pvals, sample_sizes, descrips, txt
Example 5
def dunn_posthoc_test(df, dependent_variable, between):
    """dunn_posthoc tests with bonferroni multiple correction"""
    return sp.posthoc_dunn(
        df,
        val_col=dependent_variable,
        group_col=between,
        p_adjust="bonferroni",
    )
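
A hypothetical call with a long-format frame (column names invented for illustration):

import pandas as pd

df = pd.DataFrame({
    'score': [4.2, 4.0, 3.9, 5.1, 5.3, 5.0, 6.2, 6.4, 6.1],
    'group': ['a'] * 3 + ['b'] * 3 + ['c'] * 3,
})
pvals = dunn_posthoc_test(df, dependent_variable='score', between='group')
print(pvals)  # symmetric matrix of Bonferroni-adjusted p-values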
Example 6
def create_var_stats(data,
                     var_list,
                     type,
                     test_func,
                     out_prefix,
                     force_unique=True,
                     plot_dunn=True):
    out_csv = config.OUT_EVALS_DIR + "/" + out_prefix
    out_png = config.OUT_PLOT_DIR + "/" + out_prefix

    res = {}
    stat = ['s', 'p']
    res_posthoc = None
    plot_list = []
    statistics_dct = {}
    statistics_idx_col = "var"
    statistics_dct[statistics_idx_col] = []
    statistics_dct['n'] = []
    column_names = [statistics_idx_col, 'n'] + stat

    for var in var_list:
        res[var] = {}
        dct = data.to_single_type_dict(type, var, force_unique)
        lst = data.to_single_type_list(type, var, force_unique)

        # statistic test
        res[var][stat[0]], res[var][stat[1]] = test_func(
            *lst)  # '*' splits list into list of arguments
        if 'n' not in res[var]:
            res[var]['n'] = len(lst[0])

        statistics_dct[statistics_idx_col].append(var)
        statistics_dct['n'].append(len(lst[0]))
        for st in stat:
            if st not in statistics_dct:
                statistics_dct[st] = []
            val = utils.format_number(res[var][st], config.PRINT_PRECISION)
            statistics_dct[st].append(val)

        # dunn
        if plot_dunn:
            frame = pd.DataFrame.from_dict(dct)
            frame = frame.melt(var_name='groups', value_name='values')
            res_posthoc = sp.posthoc_dunn(frame,
                                          val_col='values',
                                          group_col='groups',
                                          p_adjust='bonferroni')
            path = out_png + "heat_" + var + "_" + sp.posthoc_dunn.__name__ + "." + config.OUT_PNG_EXT
            plot_list.append(plots.saveHeatMapPlot(res_posthoc, path))

    # statistics to df
    statistics_df = pd.DataFrame(statistics_dct, columns=column_names)
    # set_index returns a copy and must be reassigned
    statistics_df = statistics_df.set_index(statistics_idx_col)
    out_var_path = out_csv + "question_groups_" + type.name + "_" + test_func.__name__ + "." + config.OUT_CSV_EXT
    statistics_df.to_csv(out_var_path)

    return res, plot_list
Example 7
def compare_languages(list_lang_results, feat_name, list_corpus_names, p_threshold=0.05, dict_path="feature_dict_checked.json"):
	list_lang_no_nan = list()
	corpus_names = OrderedDict()
	for lang_values, corpus_name in zip(list_lang_results, list_corpus_names):
		no_nans = lang_values[lang_values.notnull()]
		if len(no_nans) > 0:
			list_lang_no_nan.append(no_nans)
			corpus_names[corpus_name] = len(no_nans)
	if len(list_lang_no_nan) == 0:
		return 0,0
	# scale_of_measurement = check_scale(list_lang_no_nan[0])
	scale_of_measurement = check_scale_from_dict(dict_path, "paired", feat_name)
	# # 0: nominal, 1: ordinal, 2: interval, 3: ratio
	normal_distribution = check_distribution(list_lang_no_nan, p_threshold=p_threshold)
	variance_homogeneity = check_variance_homogeneity(list_lang_no_nan, p_threshold=p_threshold)
	if scale_of_measurement >= 2 and normal_distribution and variance_homogeneity:
		# does the language affect the value of the feature? Do the simplifications for each language behave similarly?
		t_value, p_value = stats.f_oneway(*list_lang_no_nan)
		return ("ANOVA", p_value)
		#if p_value <= p_threshold:
			# posthoc: which languages are different?
			# stats.multicomp.pairwise_tukeyhsd
			# if two different ones found, use pearson to get effect size
			#effect_size = stats.pearsonr(complex_values, simple_values)[0]
			# effec_size = cohend(complex_values, simple_values)
	elif scale_of_measurement >= 1:
		try:
			h_statistic, p_value = stats.kruskal(*list_lang_no_nan)
		except ValueError:
			return 0,0
		if 0 < p_value <= p_threshold:
			if p_value <= 0.01:
				p_value = "p<=.01"
			elif p_value <= 0.05:
				p_value = "p<=.05"
			else:
				p_value = "p>0.05"
			output_list = list()
			posthoc_frame = scikit_posthocs.posthoc_dunn(list_lang_no_nan, p_adjust="holm")
			posthoc_frame_z = posthoc_dunn_z(list_lang_no_nan)
			for i, name_corpus_col in zip(posthoc_frame.columns.values, corpus_names.keys()):
				for n, name_corpus_row in zip(range(0, len(posthoc_frame)), corpus_names.keys()):
					if p_threshold >= posthoc_frame.iloc[n][i] > 0:
						effect_size = abs(posthoc_frame_z.iloc[n][i]/math.sqrt(corpus_names[name_corpus_col]+corpus_names[name_corpus_row]))
						if effect_size >= 0.1:
							output_list.append(["Kruskal ", p_value, "effectsize", str(round(effect_size, 4)),
												"h", str(round(h_statistic, 4)), "z", str(round(posthoc_frame_z.iloc[n][i],4)), name_corpus_col, name_corpus_row])
					#pos_col = list(corpus_names.keys()).index(name_corpus_col)
						#pos_row = list(corpus_names.keys()).index(name_corpus_row)
						#effect_size_pearson = stats.pearsonr(list_lang_no_nan[pos_col], list_lang_no_nan[pos_row])[0]
						# print(len(list_lang_no_nan[pos_col]), len(list_lang_no_nan[pos_row]))
						# effect_size_cohen = cohend(list_lang_no_nan[pos_col], list_lang_no_nan[pos_row])
			return output_list
		else:
			return 0, 0
	else:
		return 0, 0
Example 8
    def posthoc_dunn(self, data_df, predictor, response):
        data_df[response] = pd.to_numeric(data_df[response], errors='coerce')

        dunn_df = scikit.posthoc_dunn(data_df,
                                      val_col=response,
                                      group_col=predictor,
                                      p_adjust='bonferroni')
        dunn_df = dunn_df.applymap(lambda x: np.nan if x < 0 else x)
        dunn_df = dunn_df.round(2)
        dunn_df = dunn_df.applymap(lambda x: "< 0.001" if x < 0.001 else x)

        return dunn_df
Example 9
def evaluate(crossover_algo, problem):
  alldata = []
  series = []
  for x in range(10):
    algorithm = GeneticAlgorithm(
        problem=problem,
        population_size=100,
        offspring_population_size=100,
        mutation=PolynomialMutation(1.0 / problem.number_of_variables, 20.0),
        crossover=crossover_algo,
        selection=BinaryTournamentSelection(),
        termination_criterion=StoppingByEvaluations(max_evaluations=500000)
    )
    data = []
    dataobserver = DataObserver(1.0, data)
    algorithm.observable.register(observer=dataobserver)
    algorithm.run()
    result = algorithm.get_result().objectives[0]
    series.append(result) 
    alldata.append(data)

  numpy_array = np.array(alldata)
  transpose = numpy_array.T
  transpose_list = transpose.tolist()

  fig = plt.figure(figsize=(60, 42))
  ax = fig.add_axes([0, 0, 1, 1])
  bp = ax.boxplot(transpose_list)
  plt.show()

  print(stats.kruskal(transpose_list[0], transpose_list[1], transpose_list[-1]))
  print(np.average(series))

  # print the Dunn table, otherwise it is discarded
  print(sp.posthoc_dunn([transpose_list[0], transpose_list[1], transpose_list[-1]],
                        p_adjust='holm'))
Example 10
def post_hoc_df(df, Y_col, X_col, posthoc="tukey", alpha=0.05):
    """
    Returns a df with pairwise comparisons with reject column calculated according to alpha
    
    TODO: Add more posthoc tests to this function
    """
    if posthoc == "Statsmodels_tukey":
        comp = multi.MultiComparison(df[Y_col], df['comb'])
        results = comp.tukeyhsd(alpha=alpha)
        results = pd.DataFrame(data=results._results_table.data[1:],
                               columns=results._results_table.data[0])
    if posthoc == "dunn":
        results = scikit_results_munger(
            sp.posthoc_dunn(df,
                            val_col=Y_col,
                            group_col=X_col,
                            p_adjust='holm'), alpha)
    if posthoc == "tukey":
        results = scikit_results_munger(
            sp.posthoc_tukey(df, val_col=Y_col, group_col=X_col), alpha)
    return results
Example 11
def process_instance(list_files):
    ## Load all the algorithms into a different class
    Wins = [0] * len(list_files)
    Losts = [0] * len(list_files)
    Ties = [0] * len(list_files)
    data = list()
    for i, val1 in enumerate(list_files):
        data.append(loadtxt(val1))
    ## Run Kruskal test; a p-value > alpha means we fail to reject the null hypothesis
    T, p = ss.kruskal(*data)
    ## Null hypothesis rejected: run a post-hoc analysis with a correction procedure
    if p < alpha:
        ptable = sp.posthoc_dunn(data, p_adjust='hommel')
        for i in range(0, len(list_files)):
            for j in range(0, i):
                if ptable.iat[i, j] > alpha:
                    Ties[i] += 1
                    Ties[j] += 1
                else:
                    # reuse the already-loaded arrays instead of re-reading the files
                    if check_means(data[i], data[j]) == 1:
                        Wins[i] += 1
                        Losts[j] += 1
                    elif check_means(data[i], data[j]) == -1:
                        Wins[j] += 1
                        Losts[i] += 1
                    else:
                        Ties[i] += 1
                        Ties[j] += 1
    else:
        # no significant differences: every pair counts as a tie
        Ties = [t + len(list_files) - 1 for t in Ties]
    for i, val1 in enumerate(list_files):
        sys.stdout.write(" " + str(Wins[i]) + " " + str(Losts[i]) + " " +
                         str(Ties[i]))
        sys.stdout.flush()
    print("")
Example 12
def kruskal_posthoc_tests(benchmark_snapshot_df):
    """Returns p-value tables for various Kruskal posthoc tests.

    Results should considered only if Kruskal test rejects null hypothesis.
    """
    common_args = {
        'a': benchmark_snapshot_df,
        'group_col': 'fuzzer',
        'val_col': 'edges_covered',
        'sort': True
    }
    p_adjust = 'holm'

    posthoc_tests = {}
    posthoc_tests['mann_whitney'] = sp.posthoc_mannwhitney(**common_args,
                                                           p_adjust=p_adjust)
    posthoc_tests['conover'] = sp.posthoc_conover(**common_args,
                                                  p_adjust=p_adjust)
    posthoc_tests['wilcoxon'] = sp.posthoc_wilcoxon(**common_args,
                                                    p_adjust=p_adjust)
    posthoc_tests['dunn'] = sp.posthoc_dunn(**common_args, p_adjust=p_adjust)
    posthoc_tests['nemenyi'] = sp.posthoc_nemenyi(**common_args)

    return posthoc_tests
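
Each returned table is a symmetric DataFrame of pairwise p-values indexed by fuzzer name. A short sketch (not part of the original code) of how the significant pairs could be pulled out of one of them:

import itertools

posthoc = kruskal_posthoc_tests(benchmark_snapshot_df)['dunn']
significant_pairs = [(a, b)
                     for a, b in itertools.combinations(posthoc.columns, 2)
                     if posthoc.loc[a, b] < 0.05]
print(significant_pairs)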
Example 13
stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}
test_df = learned_2.loc[learned_2['lab_number'].isin(
    ['Lab 1', 'Lab 2', 'Lab 3', 'Lab 4', 'Lab 5', 'Lab 6', 'Lab 7'])]

for i, var in enumerate(['perf_easy', 'reaction_time', 'threshold', 'bias']):
    # normaltest returns (statistic, p-value); p < 0.05 rejects normality
    _, normal = stats.normaltest(test_df[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[
            group[var].values for name, group in test_df.groupby('lab_number')
        ])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(test_df,
                                      val_col=var,
                                      group_col='lab_number')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[
            group[var].values for name, group in test_df.groupby('lab_number')
        ])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(test_df,
                                       val_col=var,
                                       group_col='lab_number')
        else:
            posthoc = np.nan
Example 14
for df_col in df_cols_of_interest:
    lst_lst_data = [
        df[df_col][df['logo_class'] == c_of_i] for c_of_i in lst_ces_of_int
    ]
    print(f'Stat test  {df_col}:')
    print(stats.kruskal(*lst_lst_data))

# results all seem to have significantly different distributions

# %%
# Conduct Dunn-Bonferroni post-hoc
import scikit_posthocs as sp
dunn_b_p_val = pd.Series(dtype=object)  # will hold one p-value DataFrame per column
for df_col in df_cols_of_interest:
    dunn_b_p_val[df_col] = sp.posthoc_dunn(df,
                                           val_col=df_col,
                                           group_col='logo_class',
                                           p_adjust='bonferroni')
    print(f'Dunn-Bonferroni results for {df_col}:')
    print(dunn_b_p_val[df_col]['jp_morgan'])
# get means and std for reporting
df_img_results = pd.DataFrame()
for df_col in df_cols_of_interest:
    print(f'{df_col}:')
    dunn_b_p_val[df_col]
    (df_img_results[f'mean_{df_col}'], df_img_results[f'std_{df_col}'], _, _,
     _) = calc_class_stats(df, df_col, lst_ces_of_int, lst_ces_of_int_dis)
print(df_img_results)

df_img_results

Example 15
# h (the hierarchy data) is loaded earlier in the original script
cd = pd.read_csv("/Users/ameya/ICSE2020Data/CommonSuperType.csv")
c = pd.read_csv("/Users/ameya/ICSE2020Data/Composition.csv")
# nr = pd.read_csv("/Users/ameya/NoRelationship.csv")
# plt.figure(figsize=(11,2))

h_c = stats.ranksums(h.iloc[:, 0], c.iloc[:, 0])  #,alternative='less')
h_cd = stats.ranksums(h.iloc[:, 0], cd.iloc[:, 0])  #,alternative='less')
c_cd = stats.ranksums(cd.iloc[:, 0], c.iloc[:, 0])  #,alternative='less')
print("p value for hierarchy and composition", h_c)
print("p value for hierarchy and common st", h_cd)
print("p value for common st and composition", c_cd)

print(stats.f_oneway(h.iloc[:, 0], cd.iloc[:, 0], c.iloc[:, 0]))

f = (stats.kruskal(h.iloc[:, 0], cd.iloc[:, 0], c.iloc[:, 0]))
f1 = posthocs.posthoc_dunn([h.iloc[:, 0], c.iloc[:, 0], cd.iloc[:, 0]])

print("H: ", h.iloc[:, 0].mean(), h.iloc[:, 0].median())
print("CD: ", cd.iloc[:, 0].mean(), cd.iloc[:, 0].median())
print("C: ", c.iloc[:, 0].mean(), c.iloc[:, 0].median())

sns.set(font_scale=1.5)

fig, axes = plt.subplots(figsize=(8, 3))

#, sns.color_pallete("muted")
r = axes.violinplot(dataset=[h.iloc[:, 0], cd.iloc[:, 0], c.iloc[:, 0]],
                    showmeans=True,
                    showmedians=True)
r['cmeans'].set_color('r')
r['cmedians'].set_color('g')
Example 16
    for i in range(len(H)):
        F.append([np.mean(T[i]), np.std(T[i]), np.median(T[i]), np.min(T[i]), np.max(T[i])])
    F = pd.DataFrame(F, index=H)
    F.to_csv('stats_analysis/resume/resume_'+ids[c_ids], sep='\t', header=['mean', 'std', 'median', 'min', 'max'], float_format='%.6f')


## P_VALUE
if not os.path.exists('stats_analysis/p_value/'):
    os.makedirs('stats_analysis/p_value/')

kruskal_values = []
for c_ids in range(len(ids)):
    T = pd.read_csv(folder+ids[c_ids], sep='\t')
    H = T.columns.values
    T = np.array(T).T
    dunn_test = sp.posthoc_dunn(T)
    kruskal_values.append(stats.kruskal(*T)[1])
    #for i in range(len(H)):
        #dunn_test[i][i] = 1
    # posthoc_dunn already returns a DataFrame; relabel it instead of reindexing
    dunn_test = pd.DataFrame(dunn_test.values, index=H, columns=H)
    dunn_test.to_csv('stats_analysis/p_value/dunn_test_'+ids[c_ids], sep='\t', header=H, float_format='%.6f')
    
kruskal_values = pd.DataFrame(np.array(kruskal_values).T, index=ids)
kruskal_values.to_csv('stats_analysis/p_value/kruskal.txt', sep='\t', header=['p_value'], float_format='%.6f')


## BOXPLOT
if not os.path.exists('stats_analysis/boxplot/'):
    os.makedirs('stats_analysis/boxplot/')

def significance(T, P):
Example 17
def plot_consistency_within_day(res, start, end, shuffle, pretraining,
                                figure_path):
    d = list(zip(start, end))
    res_temp = filter.filter_days_per_mouse(res, d)
    if pretraining:
        res_temp = filter.filter(res_temp, {'odor_valence': ['PT CS+']})
    else:
        res_temp = filter.filter(res_temp, {'odor_valence': ['CS+', 'CS-']})
    corr_res = _correlation(res_temp)
    corr_res.pop('data')

    analysis.add_naive_learned(corr_res, start, end, '0', '1')
    res_ = reduce.new_filter_reduce(
        corr_res,
        filter_keys=['mouse', 'odor_standard', 'training_day'],
        reduce_key='consistency_corrcoef')
    res_.pop('consistency_corrcoef_sem')
    filter.assign_composite(res_, loop_keys=['training_day', 'odor_valence'])

    if shuffle:
        s = '_shuffled'
    else:
        s = ''

    ax_args_copy = ax_args.copy()
    ax_args_copy.update({
        'xlim': [-.5, 2.5],
        'ylim': [0, .55],
        'yticks': np.arange(0, 1.1, .1)
    })

    swarm_args_copy = swarm_args.copy()
    if pretraining:
        swarm_args_copy.update({'palette': ['gray', 'orange', 'green', 'red']})
    else:
        swarm_args_copy.update({'palette': ['gray', 'gray', 'green', 'red']})

    ix = res_['training_day_odor_valence'] == '1_PT CS+'
    res_['training_day_odor_valence'][ix] = '1_APT CS+'
    plot.plot_results(res_,
                      x_key='training_day_odor_valence',
                      y_key='consistency_corrcoef',
                      path=figure_path,
                      plot_args=swarm_args_copy,
                      plot_function=sns.stripplot,
                      ax_args=ax_args_copy,
                      reuse=False,
                      save=False,
                      sort=True,
                      name_str=s)

    summary = reduce.new_filter_reduce(res_,
                                       filter_keys='training_day_odor_valence',
                                       reduce_key='consistency_corrcoef')
    plot.plot_results(summary,
                      x_key='training_day_odor_valence',
                      y_key='consistency_corrcoef',
                      error_key='consistency_corrcoef_sem',
                      colors='black',
                      path=figure_path,
                      plot_args=error_args,
                      plot_function=plt.errorbar,
                      save=True,
                      reuse=True,
                      legend=False,
                      name_str=s)

    print(summary['consistency_corrcoef'])

    ix_a = res_['training_day_odor_valence'] == '0_CS+'
    ix_b = res_['training_day_odor_valence'] == '0_CS-'
    ix_c = res_['training_day_odor_valence'] == '1_CS+'
    ix_d = res_['training_day_odor_valence'] == '1_CS-'
    a = res_['consistency_corrcoef'][ix_a]
    b = res_['consistency_corrcoef'][ix_b]
    c = res_['consistency_corrcoef'][ix_c]
    d = res_['consistency_corrcoef'][ix_d]

    from scipy.stats import ranksums, wilcoxon, kruskal
    import scikit_posthocs

    print(kruskal(a, b, c))
    x = scikit_posthocs.posthoc_dunn(a=[a, b, c, d], p_adjust=None)
    print(x)
Example 18
for i, var in enumerate([
        'perf_easy', 'threshold_l', 'threshold_r', 'threshold_n', 'bias_l',
        'bias_r', 'bias_n'
]):

    # Remove any animals with NaNs
    test_fits = biased_fits[biased_fits[var].notnull()]

    # Test for normality
    _, normal = stats.normaltest(test_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(test_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(test_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan

    posthoc_tests['posthoc_' + str(var)] = posthoc
    stats_tests.loc[i, 'variable'] = var
    stats_tests.loc[i, 'test_type'] = test_type
    stats_tests.loc[i, 'p_value'] = test[1]
Example 19
def rest():
    df = q1_median_q3_rep_wide
    pops = ["pdc", "dc-cd11b", "dc-cd8a"]

    stats_l = []
    for stat, (popa, popb) in product(["Q1", "median", "Q3"],
                                      product(pops, pops)):
        print(stat, popa, popb)

        popa = "hsc"
        popb = "pdc"
        stat = "median"

        mw_u, pvalue = scipy.stats.mannwhitneyu(
            [0.8, 0.81, 0.79],
            [0.4, 0.39, 0.41],
            # df.query("Population == @popa")[stat].to_numpy(),
            # df.query("Population == @popb")[stat].to_numpy(),
            use_continuity=True,
            alternative="two-sided",
        )
        pvalue

        stats_l.append([stat, popa, popb, mw_u, pvalue])
    stats_df = pd.DataFrame(stats_l).set_axis(
        ["stat", "popA", "popB", "U", "pvalue"], axis=1)

    kruskal_format_means = pd.pivot(
        q1_median_q3_rep_wide.query("Population in @pops"),
        index="Population",
        columns="Replicate",
        values="mean",
    )

    import scikit_posthocs

    stat, p_value = scipy.stats.kruskal(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops], )

    dunn_res_df = scikit_posthocs.posthoc_dunn(
        kruskal_format_means.to_numpy(),
        p_adjust='fdr_bh',
        sort=True,
    )

    stat, pvalue = scipy.stats.f_oneway(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops], )

    import statsmodels.stats.multicomp

    df = kruskal_format_means.stack().reset_index()

    kruskal_format_means

    res = statsmodels.stats.multicomp.pairwise_tukeyhsd(
        df[0], df['Population'].to_numpy(), alpha=0.05)

    res.pvalues
    res.summary()

    # wilcox.test(c(0.8, 0.79, 0.81), c(0.4, 0.39, 0.41), paired=F, exact=F)

    plot_pops = ["pdc", "dc-cd8a", "dc-cd11b"]

    results_dir = "/icgc/dkfzlsdf/analysis/hs_ontogeny/notebook-data/gNs4xcMJscaLLwlt"
    point_plot_quartiles_png = results_dir + "/point-plot-quartiles.png"

    q1_median_q3_rep_wide

    ggplot_data = (
        q1_median_q3_rep_long.query("Population in @plot_pops").sort_values(
            "value",
            ascending=False,
        ).groupby(["Population", "stat"]).apply(
            lambda df: df.assign(group_order=np.arange(1, df.shape[0] + 1))))

    g = (gg.ggplot(ggplot_data) + gg.aes_string(
        x="Population", y="value", group="group_order", color="stat") +
         gg.geom_point(position=gg.position_dodge(width=0.5), size=1) +
         mh_rpy2_styling.gg_paper_theme + gg.labs(y='Methylation (%)', x=''))
    a = 3

    rpy2_utils.image_png2(g, (ut.cm(6), ut.cm(6)))

    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        # additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(6),
    )

    q1_median_q3_rep_wide

    g = (
        gg.ggplot(
            q1_median_q3_rep_wide.query("Population in @plot_pops").assign(
                sample=lambda df: df["Population"].astype(str) + df[
                    "Replicate"].astype(str))) + gg.geom_boxplot(
                        gg.aes_string(
                            x="Population",
                            fill="Population",
                            group="sample",
                            lower="Q1",
                            upper="Q3",
                            middle="median",
                            ymin="min1",
                            ymax="max99",
                            # position=gg.position_dodge(width=0.5),
                        ),
                        stat="identity",
                    )
        # + mh_rpy2_styling.gg_paper_theme
        + gg.theme(axis_text_x=gg.element_text(angle=90, hjust=1)) +
        gg.scale_fill_brewer(guide=False))
    a = 3
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(7),
    )
    # image_png2(g, (ut.cm(12), ut.cm(12)))

    beta_values.loc[:, ("hsc", "1")]
Example 20
# %% Statistics
stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate(['threshold_l', 'threshold_r', 'lapselow_l', 'lapselow_r', 'lapsehigh_l',
                         'lapsehigh_r', 'bias_l', 'bias_r']):
    _, normal = stats.normaltest(biased_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[group[var].values
                               for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(biased_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[group[var].values
                                for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(biased_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan

    posthoc_tests['posthoc_'+str(var)] = posthoc
    stats_tests.loc[i, 'variable'] = var
    stats_tests.loc[i, 'test_type'] = test_type
    stats_tests.loc[i, 'p_value'] = test[1]
Example 21
# pr, pck, c and s (project / package / class / selective data) are loaded earlier in the original script
# nr = pd.read_csv("/Users/ameya/NoRelationship.csv")
# plt.figure(figsize=(11,2))
sns.set(font_scale=1.5)

print("PRmax", pr.iloc[:, 0].median(), pr.iloc[:, 0].mean())
print("Pckgmax", pck.iloc[:, 0].median(), pck.iloc[:, 0].mean())
print("Clsmax", c.iloc[:, 0].median(), c.iloc[:, 0].mean())
# print("MMean", m.median(), m.mean())
print("Smean", s.iloc[:, 0].median(), s.iloc[:, 0].mean())

print(c.values.shape)

ii = np.concatenate([c.values, pr.values], axis=0)

print(stats.kruskal(pr.iloc[:, 0], pck.iloc[:, 0], c.iloc[:, 0], s.iloc[:, 0]))
f = posthocs.posthoc_dunn(
    [pr.iloc[:, 0], c.iloc[:, 0], s.iloc[:, 0], pck.iloc[:, 0]])
# f.compare_dunn(0,1)

print(f.shape)
print(f.values)

print("Project-Package",
      stats.mannwhitneyu(pr.iloc[:, 0], pck.iloc[:, 0], alternative='greater'))
print("Project-class",
      stats.mannwhitneyu(pr.iloc[:, 0], c.iloc[:, 0], alternative='greater'))
print("project-selective",
      stats.mannwhitneyu(pr.iloc[:, 0], s.iloc[:, 0], alternative='greater'))
print("package-selective",
      stats.mannwhitneyu(pck.iloc[:, 0], s.iloc[:, 0], alternative='greater'))
print("class-selective",
      stats.mannwhitneyu(s.iloc[:, 0], c.iloc[:, 0], alternative='greater'))
Example 22
#print(mannwhitneyu(m_l, u_l)[1])
"""
all_data["Gaussian"] = ga_l
all_data["Hessian"] = h_l
all_data["Laplacian"] = la_l
all_data["Ilastik"] = il_l

all_data["MitoSegNet"] = m_l
all_data["Finetuned\nFiji U-Net"] = u_l_pt
"""

p_val = kruskal(ga_l, h_l, la_l, il_l, m_l, u_l_pt)  # returns (H statistic, p-value)
print(p_val)

dt = posthoc_dunn([ga_l, h_l, la_l, il_l, m_l, u_l_pt])

#dt = posthoc_dunn(all_data, val_col="MitoSegNet", group_col="Ilastik")
print(dt)

dt.to_excel("dc_posthoc.xlsx")

print("\n")
print(mannwhitneyu(u_l_pt, h_l)[1])
print(mannwhitneyu(u_l_pt, la_l)[1])
print(mannwhitneyu(u_l_pt, ga_l)[1])
print(mannwhitneyu(u_l_pt, il_l)[1])
print("\n")


# pooled standard deviation for calculation of effect size (cohen's d)
Example 23
def peakPosKruskal(features,
                   csv="top100-high-level-complete.csv",
                   popSize=10,
                   plot=False,
                   dateRange=(1950, 2010)):
    features.append(" peakPos")
    features = list(set(features))
    data = getTimeSeries(features, csv)
    data = data[pd.notnull(data[' peakPos'])]
    alpha = 0.001

    data[' peakPos'] = data[' peakPos'].map(
        lambda a: math.floor(a / popSize) * popSize)

    print("\n")
    count = 0
    for i in features:
        # mod = sm.OLS(i+" ~ tags:originaldate", data=data)
        print("| ", i, ": ")
        clean_data = data[pd.notnull(data[i])]
        unique_pos = pd.unique(clean_data[' peakPos'])
        unique_pos.sort()

        l = pd.DataFrame()
        l2 = []
        for j in unique_pos:
            t = clean_data[clean_data[' peakPos'] == j]
            t = t[i]
            l[j] = t
            l2.append(t)

        kruskal = stats.kruskal(*l2)
        post_hoc = ph.posthoc_dunn(clean_data, group_col=" peakPos", val_col=i)
        dunn_significant = set()
        for y1 in range(post_hoc.shape[0]):
            for y2 in range(y1):
                if post_hoc.iloc[y1, y2] < alpha and y1 != y2:
                    dunn_significant.add((post_hoc.index.values[y1],
                                          post_hoc.columns.values[y2]))

        print("| ", kruskal)
        print("|  significant pairs: ", dunn_significant)
        print("| ", post_hoc < alpha)
        print("|___________________\n")
        if plot:
            f, axes = mpl.subplots(nrows=1, ncols=1, constrained_layout=True)
            for j in unique_pos:
                clean_data.boxplot(i, by=" peakPos", ax=axes)
            # clean_data.boxplot(i,by=" peakPos")

            axes.set_title("")
            axes.set_xlabel("")
            # for j in unique_pos:
            # sns.distplot(clean_data[clean_data[' peakPos']==j].loc[:,i],label=str(j),bins=40,norm_hist=True,ax=axes[0])
            # axes[0].legend(loc="upper right")
            # axes[0].set_xlabel("")
            mpl.suptitle(i)
            mpl.gcf().subplots_adjust(bottom=0.25)

            dunnStr = "P<" + str(alpha) + " (Dunn post-hoc): "
            c = len(dunnStr)
            for i in str(dunn_significant):
                c += 1
                dunnStr += i
                if (c % 55 == 0):
                    dunnStr += "\n"
            mpl.xlabel(dunnStr, fontsize=14)
        count += 1
    mpl.show()
Example 24
# Stats
stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate(
    ['perf_easy', 'reaction_time', 'threshold', 'bias', 'n_trials']):
    _, normal = stats.normaltest(learned[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[
            group[var].values for name, group in learned.groupby('lab_number')
        ])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(learned,
                                      val_col=var,
                                      group_col='lab_number')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[
            group[var].values for name, group in learned.groupby('lab_number')
        ])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(learned,
                                       val_col=var,
                                       group_col='lab_number')
        else:
            posthoc = np.nan
Example 25
    def kruskal_wallis(self):
        dir_name = self.dir_name_stats
        file_name = dir_name + 'kruskal wallis results 0'
        i = 0
        while True:
            if not (os.path.isfile(file_name)):
                f = open(file_name, 'w')
                break
            else:
                i += 1
                file_name = file_name[:-1] + str(i)
        lst_groups = [
            self.df_for_box[col].dropna().values
            for col in self.treatments_to_plot
        ]
        k_w_res = stats.kruskal(*lst_groups)
        h_val, p_val = k_w_res
        f.write('h value = {0}\n'.format(h_val))
        f.write('p value = {0}\n'.format(p_val))
        print(
            "\nperformed Kruskal-Wallis test, results in stats -> kruskal wallis results "
            + str(i))
        self.signature = self.get_signature()
        if p_val < 0.05:
            print(
                'the results are significant, perform post hoc tests? (y for yes)'
            )
            # inp_krus = 'y'
            inp_krus = input()
            if inp_krus == 'y':
                post_hoc_df = sp.posthoc_dunn(lst_groups, p_adjust='holm')
                cols = post_hoc_df.columns
                post_hoc_df = post_hoc_df.rename(columns={
                    cols[i]: self.treatments_to_plot[i]
                    for i in range(len(cols))
                })
                post_hoc_df = post_hoc_df.rename(index={
                    cols[i]: self.treatments_to_plot[i]
                    for i in range(len(cols))
                })
                post_hoc_df.to_excel(dir_name + "post hoc res " + str(i) +
                                     '.xlsx')
                groups = {}
                critical_val = 0.05
                means = [np.nanmean(l) for l in lst_groups]
                keys = list(self.treatments_to_plot)
                keys_sorted = [
                    x for _, x in sorted(zip(means, keys), reverse=True)
                ]
                seen_groups = []
                for i in range(len(keys_sorted)):
                    g = keys_sorted[i]
                    seen_groups.append(g)
                    groups[g] = []
                    groups_to_check = list(set(keys_sorted) - set(seen_groups))
                    values = post_hoc_df[g][groups_to_check]
                    for col in values.index:
                        if values[col] > critical_val and col not in seen_groups:
                            groups[g].append(col)
                letters = {}
                c = 'a'
                seen_groups = []
                for k in keys_sorted:
                    if (len(groups[k]) > 0 and not already_contained(groups[k], letters)) or\
                            (len(groups[k]) == 0 and k not in seen_groups):
                        letters[c] = [k] + groups[k]
                        c = chr(ord(c) + 1)
                        seen_groups += groups[k]
                groups_letter = {}
                for t in self.treatments_to_plot:
                    groups_letter[t] = [c for c in letters if t in letters[c]]

                print(
                    "\nperformed Dunn test, results in stats -> post hoc res")
                print(
                    'treatments are in the same group if they are not statistically different (p val is more than 0.05)'
                )
                print(
                    '\nshow results from post hoc test on graph? (y for yes)')

                f.write("results are significant!!\n\n")
                f.write(
                    'post hoc results (treatments in the same group are not statistically different):\n'
                )
                f.write("\n".join([
                    "{0:<5s}:{1}".format(key, str(val))
                    for key, val in letters.items()
                ]))
                inp = input()
                # inp = 'y'
                if inp == 'y':
                    self.groups_letter_without = groups_letter
                    self.kruskal = True

        else:
            print('results are not significant')
        f.close()
Example 26
with open('accuracies.json') as json_file:
    accuracies_json = json.load(json_file)

accs = [accuracies_json[k] for k in accuracies_json]
len(accs)

from scipy.stats import friedmanchisquare

friedmanchisquare(*accs)

#get_ipython().system(' pip install scikit-posthocs')

from scikit_posthocs import posthoc_dunn

p_values = posthoc_dunn(a=accs, p_adjust='holm', sort=True)
print('Post hoc Dunn p-values: ')
p_values.head()




pvalues = p_values.values
classifiers = accs.copy()
best = 0
best_acc = np.mean(accs[0])
for i in range(len(accs)):  # one row of the p-value matrix per classifier
    acc_i = np.mean(accs[i])
    rejected = np.where(pvalues[i,:]<0.05)[0]
    rejected = [x for x in rejected if x != i]
Example 27
def getKWmultiComp(data, labels, verbose=False):
    # posthoc_dunn returns a DataFrame; convert it for positional indexing
    pVals = sp.posthoc_dunn(data, p_adjust='bonferroni').to_numpy()
    if verbose:
        print(np.hstack((np.transpose([0] + labels).reshape(4, 1),
                         np.vstack((labels, pVals)))))
    return [pVals[1, 0], pVals[2, 0], pVals[2, 1]]
Example 28
def dunn(df: pd.DataFrame, group_variables, variable):
    df = prepare_data_frame(df, group_variables, variable)
    return sp.posthoc_dunn(df, val_col='values', group_col='groups', p_adjust='holm')
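
The prepare_data_frame helper is not shown. A plausible sketch, assuming it reshapes a wide frame into the 'groups'/'values' long format that the posthoc_dunn call expects (the body is an assumption):

import pandas as pd

def prepare_data_frame(df: pd.DataFrame, group_variables, variable):
    # melt to long format, then join the grouping columns into one label
    long_df = df.melt(id_vars=group_variables, value_vars=[variable],
                      value_name='values')
    long_df['groups'] = long_df[group_variables].astype(str).agg('_'.join, axis=1)
    return long_df[['groups', 'values']]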
Example 29
training_time = pd.DataFrame(columns=['sessions'], data=ses.groupby('subject_nickname').size())
training_time['trials'] = ses.groupby('subject_nickname').sum()  # assumes ses carries a single numeric trial-count column
training_time['lab'] = ses.groupby('subject_nickname')['institution_short'].apply(list).str[0]

# Change lab name into lab number
training_time['lab_number'] = training_time.lab.map(institution_map()[0])
training_time = training_time.sort_values('lab_number')

# Statistics
# Test normality
_, normal = stats.normaltest(training_time['sessions'])
if normal < 0.05:
    kruskal = stats.kruskal(*[group['sessions'].values
                              for name, group in training_time.groupby('lab')])
    if kruskal[1] < 0.05:  # Proceed to posthocs
        posthoc = sp.posthoc_dunn(training_time, val_col='sessions',
                                  group_col='lab_number')
else:
    anova = stats.f_oneway(*[group['sessions'].values
                             for name, group in training_time.groupby('lab')])
    if anova[1] < 0.05:
        posthoc = sp.posthoc_tukey(training_time, val_col='sessions',
                                   group_col='lab_number')


# %% PLOT

# Set figure style and color palette
use_palette = [[0.6, 0.6, 0.6]] * len(np.unique(training_time['lab']))
use_palette = use_palette + [[1, 1, 0.2]]
lab_colors = group_colors()
Example 30
print(
    mannwhitneyu(all_data["Laplacian"], all_data["Finetuned\nFiji U-Net"])[1])
print(mannwhitneyu(all_data["Ilastik"], all_data["Finetuned\nFiji U-Net"])[1])

#print(np.median(all_data["Hessian"]), np.median(all_data["Ilastik"]), np.median(all_data["MitoSegNet"]))
#print(np.average(all_data["Hessian"]), np.average(all_data["Ilastik"]), np.average(all_data["MitoSegNet"]))

print(
    kruskal(all_data["Gaussian"].tolist(), all_data["Hessian"].tolist(),
            all_data["Laplacian"].tolist(), all_data["Ilastik"].tolist(),
            all_data["MitoSegNet"].tolist(),
            all_data["Finetuned\nFiji U-Net"].tolist()))

dt = posthoc_dunn([
    all_data["Gaussian"].tolist(), all_data["Hessian"].tolist(),
    all_data["Laplacian"].tolist(), all_data["Ilastik"].tolist(),
    all_data["MitoSegNet"].tolist(),
    all_data["Finetuned\nFiji U-Net"].tolist()
])

dt.to_excel("ed_posthoc.xlsx")


# pooled standard deviation for calculation of effect size (cohen's d)
def cohens_d(data1, data2):

    p_std = np.sqrt(
        ((len(data1) - 1) * np.var(data1) +
         (len(data2) - 1) * np.var(data2)) / (len(data1) + len(data2) - 2))

    return np.abs(np.average(data1) - np.average(data2)) / p_std
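
A hypothetical call, reusing two of the segmentation result columns compared above:

d = cohens_d(all_data["MitoSegNet"].tolist(),
             all_data["Finetuned\nFiji U-Net"].tolist())
print("Cohen's d:", round(d, 3))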