Example no. 1
def friedman_test():
    df_acc = pd.read_csv(csv_file_name_acuracia)
    df_sim = pd.read_csv(csv_file_name_similaridade)

    # rename the dataframes so they can be concatenated
    df_acc = df_acc[['acc_min', 'classifier_type', 'dataset', 'acc_mean']].rename(columns={'acc_min': 'parametro', 'acc_mean': 'measurement'})
    df_sim = df_sim[['qtde_classifiers', 'classifier_type', 'dataset', 'acc_mean']].rename(columns={'qtde_classifiers': 'parametro', 'acc_mean': 'measurement'})
    df_acc['metodo'] = 'acuracia'
    df_sim['metodo'] = 'similaridade'

    # concatenate the dataframes
    df_all = pd.concat([df_acc, df_sim])

    # combine metodo, classifier_type and parametro into a single column
    df_all = df_all.astype({'parametro': str})
    df_all = df_all.set_index(keys=['metodo', 'classifier_type', 'parametro'])
    df_all.index = df_all.index.map('-'.join)
    df_all.reset_index(inplace=True)


    # each algorithm becomes a column, each row corresponds to a dataset, and the values are the accuracies
    df_all_pivoted = df_all.pivot(index='dataset', columns='index', values='measurement')

    # Friedman test
    statistic, pvalue = friedmanchisquare(*df_all_pivoted.values.tolist())
    print(f'p-value={pvalue}')

    # Nemenyi test
    nemenyi = posthoc_nemenyi_friedman(df_all, melted=True, group_col='index', block_col='dataset', y_col='measurement')

    nemenyi.to_csv('resultado_nemenyi.csv')
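For reference, a minimal hedged sketch of the melted call used above, with made-up data and the same column roles ('index' as group, 'dataset' as block, 'measurement' as value):

import pandas as pd
from scikit_posthocs import posthoc_nemenyi_friedman

# hypothetical long-format table: one accuracy per (algorithm, dataset) pair
df_melted = pd.DataFrame({
    'index': ['algo_a', 'algo_b', 'algo_c'] * 3,                          # group column
    'dataset': ['d1'] * 3 + ['d2'] * 3 + ['d3'] * 3,                      # block column
    'measurement': [0.81, 0.79, 0.84, 0.74, 0.70, 0.77, 0.90, 0.88, 0.93],
})

# melted=True tells scikit-posthocs which columns hold the groups, blocks and values
print(posthoc_nemenyi_friedman(df_melted, melted=True, group_col='index',
                               block_col='dataset', y_col='measurement'))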
Example no. 2
def calcstats(file, name, type):
    dat = pd.read_excel(file)
    df1 = dat[["Anger", "Fear", "Joy", "Sadness"]].copy()  # copy to avoid SettingWithCopyWarning when adding columns below
    friedman = st.friedmanchisquare(*df1.values)
    print(friedman)
    p_values = hocs.posthoc_nemenyi_friedman(df1.T)
    p_values.to_excel(r"Final1\results_" + type + "_" + name +
                      "_p_values.xlsx")
    ranks = pd.DataFrame(columns=df1.keys())
    for key in df1.keys():
        ranks[key] = df1[key].rank(ascending=False)
    df1["Totals"] = df1.sum(axis=1) / 4
    df1["Ranks"] = ranks.mean(axis=1)
    df1 = df1.sort_values(by=["Ranks"])
    df1 = df1.reset_index()
    R_1 = df1["Ranks"].iloc[0]
    df1["P-values"] = df1.apply(lambda row: min(
        test_pairs(4, len(df1.index), R_1, row["Ranks"]) * (row.name + 1), 1),
                                axis=1)
    for index, row in df1.iterrows():
        if index != len(df1.index) - 1:
            df1["P-values"].iloc[index] = max(
                df1["P-values"].iloc[index + 1:].max(),
                df1["P-values"].iloc[index])
    df1 = df1.sort_values(by=["index"])
    df1 = df1.reset_index(drop=True)
    df1.to_excel(r"Final1\results_" + type + "_" + name + ".xlsx")
Example no. 3
def friedman_posthoc_tests(experiment_pivot_df):
    """Returns p-value tables for various Friedman posthoc tests.

    Results should be considered only if the Friedman test rejects the null hypothesis.
    """
    posthoc_tests = {}
    posthoc_tests['conover'] = sp.posthoc_conover_friedman(experiment_pivot_df)
    posthoc_tests['nemenyi'] = sp.posthoc_nemenyi_friedman(experiment_pivot_df)
    return posthoc_tests
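A minimal usage sketch (not part of the original snippet), assuming a wide table with one row per benchmark and one column per tool; it makes the gating from the docstring explicit by running the post-hoc tests only when the Friedman test rejects the null hypothesis:

import pandas as pd
import scipy.stats as ss
import scikit_posthocs as sp

# hypothetical wide table: rows are benchmarks (blocks), columns are tools (groups)
experiment_pivot_df = pd.DataFrame({
    'tool_a': [0.81, 0.74, 0.90, 0.66, 0.72],
    'tool_b': [0.79, 0.70, 0.88, 0.61, 0.69],
    'tool_c': [0.84, 0.77, 0.93, 0.70, 0.75],
})

_, p_value = ss.friedmanchisquare(*[experiment_pivot_df[c] for c in experiment_pivot_df])
if p_value < 0.05:  # only then are the post-hoc p-values worth reading
    print(sp.posthoc_conover_friedman(experiment_pivot_df))
    print(sp.posthoc_nemenyi_friedman(experiment_pivot_df))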
Example no. 4
def do_statistical_test():
    """
    do the friedman and post-hoc tests that include the meta-learning results
    """
    df_results = pd.read_csv('meta_learning/average_results.csv')
    model_names = df_results.columns[1:]
    t_stat, p_val = friedmanchisquare(*[df_results[i] for i in model_names])
    print('\nfriedman test p-val = %s' % p_val)
    post_hoc_p_vals = posthoc_nemenyi_friedman(df_results.drop(columns='dataset').to_numpy())
    post_hoc_p_vals.columns = model_names
    print('\npost hoc p-vals:\n%s' % post_hoc_p_vals)
    post_hoc_p_vals.to_csv('meta_learning/post_hoc.csv', index=False)
def do_nemenyi_test(ranked_data, plot=False):
    ranks_per_dataset = ranked_data.iloc[:, 1:]

    if plot:
        names = list(ranked_data.columns)[1:]
        avg_ranks = ranks_per_dataset.mean(axis=0)
        cd = Orange.evaluation.compute_CD(
            avg_ranks, ranked_data.shape[0], alpha='0.05', test='nemenyi')
        Orange.evaluation.graph_ranks(avg_ranks, names, cd=cd, width=10, textspace=1.5)
        plt.show()

    return posthoc_nemenyi_friedman(ranks_per_dataset)
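A hedged usage sketch for the function above (the frame and column names are made up): ranked_data is expected to hold the dataset name in its first column and one rank column per model, since everything after the first column is passed to the post-hoc test.

import pandas as pd

# hypothetical ranks: first column names the dataset, the rest are per-model ranks
ranked_data = pd.DataFrame({
    'dataset': ['d1', 'd2', 'd3', 'd4'],
    'model_a': [1, 1, 2, 1],
    'model_b': [2, 3, 1, 2],
    'model_c': [3, 2, 3, 3],
})

# with plot=False only the Nemenyi p-value matrix is returned
print(do_nemenyi_test(ranked_data, plot=False))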
Example no. 6
def runFriedmanPython_array(data):
    import scipy.stats as ss
    import scikit_posthocs as sp
    p_statistic, p_value = ss.friedmanchisquare(*data.T)
    # https://scikit-posthocs.readthedocs.io/en/latest/generated/scikit_posthocs.posthoc_nemenyi_friedman/#id2
    # P. Nemenyi (1963) Distribution-free Multiple Comparisons. Ph.D. thesis, Princeton University.
    pc = sp.posthoc_nemenyi_friedman(data)
    return FriedmanResult("",
                          p_value,
                          None,
                          cmp_matrix=pc,
                          binary_cmp_matrix=False,
                          cmp_method="nemenyi")
Example no. 7
def benchmark_average(benchmark, posthocs=False):
    sums = None
    for prev_line in benchmark:
        line = prev_line[1:]
        if not isinstance(line[0], str):
            if sums is None:
                sums = [[value] for value in line]
            else:
                for values, value in zip(sums, line):
                    values.append(value)
        yield prev_line
    if sums is not None:
        yield ["Average"] + [sum(values) / len(values) for values in sums]
    if sums is not None and posthocs:
        import scikit_posthocs as ph
        print(ph.posthoc_nemenyi_friedman(np.array(sums).T))
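A hedged usage sketch (the benchmark rows are made up and assume the generator above is in scope with numpy imported as np): each row starts with a label, a header row of strings is yielded unchanged but excluded from the averages, and the final "Average" row is followed by the optional Nemenyi print.

import numpy as np

# hypothetical benchmark rows: a label, then one score per solver
benchmark = [
    ["case", "solver_a", "solver_b", "solver_c"],
    ["run-1", 1.2, 1.5, 1.1],
    ["run-2", 0.9, 1.4, 1.0],
    ["run-3", 1.1, 1.6, 1.2],
]

for row in benchmark_average(benchmark, posthocs=True):
    print(row)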
Example no. 8
    def tabulate(self, ensemble=False):
        offset = 2 if ensemble else 1
        column_name = 'Non-parametric (Friedman, Nemenyi) (New)' if ensemble else 'Non-parametric (Friedman, Nemenyi)'
        self.weights = []
        for query_ in self.data:
            if ensemble:
                #  Appending Ensemble model as new row to each query
                results = [
                    sum(a * b for a, b in zip(self.ensemble, c))
                    for c in np.array(self.data[query_].iloc[1:(
                        len(self.methods) + 1), 1:-2].values.tolist()).T
                ]
                new_row = pd.Series(
                    dict(
                        zip(self.data[query_].columns,
                            ['Ensemble-model'] + results + [query_, '0'])))
                self.data[query_] = self.data[query_].append(new_row,
                                                             ignore_index=True)

            f_data = self.data[query_].iloc[1:(len(self.methods) + offset),
                                            1:-1 * offset].values.tolist()
            p = friedmanchisquare(*f_data)[1]
            ph_data = scikit_posthocs.posthoc_nemenyi_friedman(
                np.array(f_data).T)
            #ph_min = [math.sqrt(-1*np.prod(ph_data[i])) for i in range(len(ph_data[0]))]
            ph_min = [max(ph_data[i]) for i in range(len(ph_data[0]))]
            weights = [n / sum(ph_min) for n in ph_min]
            self.data[query_][column_name] = ['p=%f' % p] + weights
            self.table = self.table.append(self.data[query_])
            self.weights.append(weights)
        if ensemble:
            self.table = self.table[
                ['Queries', 'Model'] +
                ['Article{}/Ranking'.format(i + 1) for i in range(self.k)] +
                ['Non-parametric (Friedman, Nemenyi)'] +
                ['Non-parametric (Friedman, Nemenyi) (New)']].set_index(
                    'Queries', append=True).swaplevel(0, 1)
        else:
            self.table = self.table[
                ['Queries', 'Model'] +
                ['Article{}/Ranking'.format(i + 1) for i in range(self.k)] +
                ['Non-parametric (Friedman, Nemenyi)']].set_index(
                    'Queries', append=True).swaplevel(0, 1)
        self.ensembles = np.asarray(self.weights).T.tolist()
        return self.print_results()
Example no. 9
def friedman_test_acuracia():
    df = pd.read_csv(csv_file_name_acuracia)
    df = df[['acc_min', 'classifier_type', 'dataset', 'acc_mean']]

    # combine classifier_type and acc_min into a single column
    df = df.astype({'acc_min': str})
    df = df.set_index(keys=['classifier_type', 'acc_min'])
    df.index = df.index.map('_'.join)
    df.reset_index(inplace=True)

    # Friedman test
    df_pivoted = df.pivot(index='dataset', columns='index', values='acc_mean')
    statistic, pvalue = friedmanchisquare(*df_pivoted.values.tolist())
    print(f'p-value={pvalue}')

    # Nemenyi test
    nemenyi = posthoc_nemenyi_friedman(df, melted=True, group_col='index', block_col='dataset', y_col='acc_mean')

    nemenyi.to_csv('resultado_nemenyi_acuracia.csv')
Example no. 10
    def SignificancePlot(self, methods=None, metric='MAE'):

        # -- Method(s)
        if methods is None:
            methods = self.methods
        else:
            if not set(methods) <= set(self.methods):
                raise ValueError("Some method is wrong!")
            else:
                self.methods = methods

        # -- set metric
        self.metric = metric
        self.mag = self.metricSort[metric]

        # -- get data from dataset(s)
        if self.multidataset:
            Y = self.__getData()
        else:
            Y = self.__getDataMono()

        # -- Significance plot, a heatmap of p values
        methodNames = [x.upper() for x in self.methods]
        Ypd = pd.DataFrame(Y, columns=methodNames)
        ph = sp.posthoc_nemenyi_friedman(Ypd)
        cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
        heatmap_args = {
            'cmap': cmap,
            'linewidths': 0.25,
            'linecolor': '0.5',
            'clip_on': False,
            'square': True,
            'cbar_ax_bbox': [0.85, 0.35, 0.04, 0.3]
        }

        plt.figure(figsize=(5, 4))
        sp.sign_plot(ph, cbar=True, **heatmap_args)
        plt.title('p-vals')

        fname = 'SP_' + self.metric + '.pdf'
        plt.savefig(fname)
        plt.show()
Example no. 11
def allStats(fs):
    fsa = np.array(fs)
    print(len(fsa))
    # print(fsa.T)
    res = {
        'm_std': [{
            "mean": np.mean(f),
            'std1': np.std(f)
        } for f in fsa]
        # 'friedman': stats.friedmanchisquare(*fsa),
        # 'nemenyi': sp.posthoc_nemenyi_friedman(fsa.T)
    }
    if len(fsa) == 2:
        res["tvalue"] = stats.ttest_ind(fsa[0], fsa[1], equal_var=False)
        res["uvalue"] = stats.mannwhitneyu(fsa[0],
                                           fsa[1],
                                           alternative='greater')
    elif len(fsa) > 2:
        res["friedman"] = stats.friedmanchisquare(*fsa)
        res["nemenyi"] = sp.posthoc_nemenyi_friedman(fsa.T)
    return res
Example no. 12

# stats analysis of reaction times for recall data
rt_recall = rt_main.rt(df, sess1, ans_imm, ans_del, numRo1, worperro, conds,
                       ids, control, recog=False)

#analysis of reaction times for recog data
rt_recog = rt_main.rt(df, sess1, ans_imm, ans_del, numRo1, worperro, conds,
                      ids, control, recog=True)




friedman(diffs_mean)

nemeny_p = sp.posthoc_nemenyi_friedman(rt_recog.iloc[:, [0, 1, 2]]).round(3)

#------------------------------SESSION 2--------------------------------------

print('------------SESSION2----------')


#LONGTERM
#import data from sess2
df2 = all_import_data.import_df2(sess2.datapath, control)

#get long-term results
_, lt1, lt2, ids2_ordered, lt_res_mean, lt_res_std =  lt_main.longterm(
        df2, sess1, sols, conds, worperro)
Example no. 13
    print(
        friedmanchisquare(split_data[curr_data]['DLIS(False)'],
                          split_data[curr_data]['JW-OS'],
                          split_data[curr_data]['DLCS'],
                          split_data[curr_data]['Dummy'],
                          split_data[curr_data]['RandomFalse'],
                          split_data[curr_data]['Random'],
                          split_data[curr_data]['DLIS(True)']))

    # pvalue matrix for pairwise test
    # Indicates that for all metrics Dummy and DLIS(True) are not
    # significantly different. Neither are the other solvers between
    # them.
    print('\nPairwise comparison:')
    print('\tDLIS(F)\tJW-OS\tDLCS\tDummy\tRandomF\tRandom\tDLIS(T)')
    print(posthoc_nemenyi_friedman(split_data[curr_data].values))

# Test whether there is a significant difference between Dummy and DLIS(True). #todo: this part is flawed
'''for curr_data in split_data:
    if False:
        split_data[curr_data].boxplot(column=['Dummy', 'DLIS(True)'])
        plt.title(curr_data)
        plt.ylim(0, 40)
        plt.show()
    print('\nWilcox test for Dummy against DLIS(True) for ' + curr_data + ' counts: ')
    print('means: randomfalse: ' + str(split_data[curr_data]['RandomFalse'].mean()) + '\t dlis(true): ' +str(split_data[curr_data]['DLIS(True)'].mean()))
    one=np.array(split_data[curr_data]['RandomFalse']).flatten()
    two=np.array(split_data[curr_data]['DLIS(True)']).squeeze()
    print(wilcoxon(np.array(split_data[curr_data]['RandomFalse']),
                   np.array(split_data[curr_data]['DLIS(True)'])))'''
Example no. 14
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 28 23:54:01 2018

@author: Delgado
"""

import scikit_posthocs as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
x = np.array([[79.52, 92.06, 79.59], [43.38, 54.54, 46.82],
              [79.43, 88.60, 79.57]])
sol = sp.posthoc_nemenyi_friedman(x)
from nonparametric_tests import friedman_aligned_ranks_test as ft
import Orange

data_MAE_df = pd.DataFrame(data_MAE, columns=all_methods)
print('\nFriedman Test MAE:')
#print(ss.friedmanchisquare(*data_MAE.T))
#print(' ')
t, p, ranks_mae, piv_mae = ft(data_MAE[:, 0], data_MAE[:, 1], data_MAE[:, 2],
                              data_MAE[:, 3], data_MAE[:, 4], data_MAE[:, 5],
                              data_MAE[:, 6], data_MAE[:, 7])
avranksMAE = list(np.divide(ranks_mae, n_datasets))

print('statistic: ' + str(t))
print('pvalue: ' + str(p))
print(' ')
pc = sp.posthoc_nemenyi_friedman(data_MAE_df)
cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
}

plt.figure()
sp.sign_plot(pc, **heatmap_args)
plt.title('Nemenyi Test MAE')

data_CC_df = pd.DataFrame(data_CC, columns=all_methods)
Example no. 16
def main():
    PART_NUMBER = 0
    dataset_paths = [
        os.path.join(CLASS_DBS_PATH, dataset_name)
        for dataset_name in sorted(os.listdir(CLASS_DBS_PATH))
    ]
    # [("db_name", read_cvs)]
    raw_dbs = [(os.path.basename(dataset_path), pd.read_csv(dataset_path))
               for dataset_path in dataset_paths]
    # [("db_name", X, y)]
    raw_dbs = [(raw_db[0], \
                raw_db[1].loc[:, raw_db[1].columns != raw_db[1].columns[-1]], \
                raw_db[1].loc[:, raw_db[1].columns[-1]]) \
               for raw_db in raw_dbs]

    raw_dbs = sorted(raw_dbs, key=lambda x: len(x[1]))  # sort by db length

    if len(sys.argv) > 1:
        # For distributed training of multiple dbs over multiple servers
        num_parts = int(sys.argv[1])
        curr_part = int(sys.argv[2])
        assert curr_part <= num_parts
        assert curr_part >= 1

        PART_NUMBER = curr_part

        print("working on dbs %s" %
              str(list(range(curr_part - 1, len(raw_dbs), num_parts))))
        raw_dbs = [
            raw_dbs[i] for i in range(curr_part - 1, len(raw_dbs), num_parts)
        ]

    preprocessing = DelayedColumnTransformer([(np.object, [
        SimpleImputer(strategy='constant'),
        OneHotEncoder(handle_unknown='ignore')
    ]), (np.number, [SimpleImputer(strategy='mean'),
                     VarianceThreshold(0.0)])])

    eval_metric = balanced_accuracy_score

    kf = StratifiedKFold(n_splits=EVAL_FOLDS, random_state=RANDOM_SEED)
    model = Pipeline(steps=[('model', RBoost() if USE_RBOOST else ELPBoost())])
    comp_model = Pipeline(steps=[('model', lgb.LGBMClassifier())])
    ova_model = OneVsRestClassifier(model)
    ova_comp_model = OneVsRestClassifier(comp_model)

    # {db_name: {our_model: results, compare_model: results}}
    dbs_results = {}

    with open(os.path.join(WORKING_DIR, "bad-dbs.txt"), "w") as f:
        pass

    os.system('mkdir -p {}'.format(MODELS_DIR))

    for db_name, X, y in raw_dbs:
        dbs_results[db_name] = {}
        X, y = db_encode(db_name, X, y)

        N = len(X) * (1 - (1 / EVAL_FOLDS))
        # Our Model Hyper-Params
        model_params = {
            'estimator__model__kappa': [1 / 3, 1 / N, 2 / N, 3 / N],
            'estimator__model__T': [3, 5, 10],
            'estimator__model__reg': [1, 10, 20, 50, 100],
            'estimator__model__silent': [True],
            'estimator__model__verbose': [False]
        }
        fold_num = 1

        # list of results per fold
        folds_results = []
        comp_folds_results = []

        is_binary = len(y.unique()) == 2  # No special case for binary
        try:
            for train_index, test_index in kf.split(X, y):
                print("{}:{}:Fold_{}".format(datetime.now(), db_name,
                                             fold_num))
                # --- get fold and preprocess --- #
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]

                invalid_labels = set(y_test.unique()) - set(y_train.unique())
                # Introduce new labels, should occur only for outliers due to StratifiedKFold
                # Main assumption in classification is that all labels are known upfront
                if len(invalid_labels) > 0:
                    X_train = pd.concat([
                        X_train,
                        pd.DataFrame(
                            [[np.nan for _ in range(len(X_train.columns))]
                             for _ in range(len(invalid_labels))],
                            columns=X_train.columns)
                    ],
                                        ignore_index=True)
                    y_train = y_train.append(pd.Series(list(invalid_labels)),
                                             ignore_index=True)
                    X_train, y_train = db_encode(db_name, X_train, y_train)

                preprocessing.fit(X_train, y_train)
                X_train = preprocessing.transform(X_train)
                X_test = preprocessing.transform(X_test)

                # --- random search --- #
                cv = RandomizedSearchCV(estimator=ova_model,
                                        param_distributions=model_params,
                                        scoring=make_scorer(eval_metric),
                                        cv=HPT_FOLDS,
                                        n_iter=RANDOM_CV_ITER,
                                        random_state=RANDOM_SEED)
                comp_cv = RandomizedSearchCV(
                    estimator=ova_comp_model,
                    param_distributions=comp_model_params,
                    scoring=make_scorer(eval_metric),
                    cv=HPT_FOLDS,
                    n_iter=RANDOM_CV_ITER,
                    random_state=RANDOM_SEED)

                curr_fold_results = {'fold_num': fold_num}
                curr_fold_comp_results = {'fold_num': fold_num}

                # --- measure times - FIT + INFER --- #
                print("Training our model")
                curr_fold_results['train_time'], curr_fold_results[
                    'infer_time'] = get_time_metrics(cv, X_train, y_train,
                                                     X_test)
                print("Finished training our model")
                print("Training comparison model")
                curr_fold_comp_results['train_time'], curr_fold_comp_results[
                    'infer_time'] = get_time_metrics(comp_cv, X_train, y_train,
                                                     X_test)
                print("Finished training comparison model")

                # --- save trained models --- #
                model_path = MODELS_DIR + "/model_fold_" + str(
                    fold_num) + "_db_name_" + db_name
                comp_model_path = MODELS_DIR + "/comp_model_fold_" + str(
                    fold_num) + "_db_name_" + db_name
                dill.dump(cv.best_estimator_, open(model_path, 'wb'))
                dill.dump(comp_cv.best_estimator_, open(comp_model_path, 'wb'))

                # --- register best params --- #
                best_comp_params = comp_cv.best_params_
                best_params = cv.best_params_
                curr_fold_comp_results['best_params'] = best_comp_params
                curr_fold_results['best_params'] = best_params
                # --- get predictions for MultiRBoost --- #
                y_test_pred_per_label_scores = cv.predict_proba(X_test)
                y_test_pred = cv.predict(X_test)

                train_labels = cv.best_estimator_.classes_
                comp_train_labels = comp_cv.best_estimator_.classes_  # can be sorted differently

                # --- get predictions for LightGBM --- #
                y_test_pred_comp_per_label_scores = comp_cv.predict_proba(
                    X_test)
                y_test_pred_comp = comp_cv.predict(X_test)

                # --- replace nans with uniform - fixes an error in OneVsRest --- #
                y_test_pred_comp_per_label_scores[np.isnan(y_test_pred_comp_per_label_scores)] = \
                    1.0 / y_test_pred_comp_per_label_scores.shape[1]

                y_test_pred_per_label_scores[np.isnan(y_test_pred_per_label_scores)] = \
                    1.0 / y_test_pred_per_label_scores.shape[1]

                # metrics applicable in multiclass setting ---accuracy, precision--- #
                multiclass_metrics_dict = {0: 'accuracy', 1: 'precision'}

                multiclass_metrics = get_multiclass_metrics(
                    y_test, y_test_pred)
                multiclass_comp_metrics = get_multiclass_metrics(
                    y_test, y_test_pred_comp)

                for metric_pos, metric_name in multiclass_metrics_dict.items():
                    curr_fold_results[metric_name] = multiclass_metrics[
                        metric_pos]
                    curr_fold_comp_results[
                        metric_name] = multiclass_comp_metrics[metric_pos]

                # Metrics only applicable in a binary setting ---fpr, tpr, pr_auc, roc-auc--- #
                binary_metrics_dict = {
                    0: 'fpr',
                    1: 'tpr',
                    2: 'pr_auc',
                    3: 'roc_auc'
                }

                binary_metrics = get_binary_metrics(y_test, y_test_pred, y_test_pred_per_label_scores, \
                                                    train_labels)
                binary_comp_metrics = get_binary_metrics(y_test, y_test_pred_comp,
                                                         y_test_pred_comp_per_label_scores, \
                                                         comp_train_labels)

                for metric_pos, metric_name in binary_metrics_dict.items():
                    curr_fold_results[metric_name] = binary_metrics[metric_pos]
                    curr_fold_comp_results[metric_name] = binary_comp_metrics[
                        metric_pos]

                # add the current fold results to the results list
                folds_results.append(curr_fold_results)
                comp_folds_results.append(curr_fold_comp_results)

                fold_num += 1
            dbs_results[db_name][OUR_MODEL] = folds_results
            dbs_results[db_name][COMP_MODEL] = comp_folds_results

            write_single_db_results(dbs_results[db_name], db_name)

        except Exception as e:
            print("ERROR!", e)
            # catching weird values
            with open(os.path.join(WORKING_DIR, "bad-dbs.txt"), "a") as f:
                dbs_results.pop(db_name)
                f.write("{db_name}: {error}\n".format(db_name=db_name,
                                                      error=e))

            continue

    print(dbs_results)
    write_all_results(dbs_results, PART_NUMBER)

    print("Done writing results in part %d" % PART_NUMBER)

    # --- Statistical Tests Section --- #
    # --- Friedman Test --- #
    models_measures = np.zeros(shape=(len(dbs_results), 2))

    for model_idx, model_name in enumerate(MODELS_LIST):
        for db_idx, db_name in enumerate(dbs_results):
            models_measures[db_idx][model_idx] = np.average([dbs_results[db_name][model_name][i][STAT_CHOSEN_METRIC] \
                                                             for i in range(EVAL_FOLDS)])

    stats_per_db = [
        models_measures[i, :] for i in range(models_measures.shape[0])
    ]

    p_value = friedmanchisquare(*stats_per_db).pvalue
    print(p_value)

    if p_value <= P_THRESH:
        print("Statistically significant!")
        post_hoc_res = posthoc_nemenyi_friedman(models_measures)
        print("nemenyi post-hoc result: {res}".format(res=post_hoc_res))

    else:
        print("Not statistically significant!")

    # --- Meta Learning Section --- #
    per_dataset_winner = {}
    for db_name in dbs_results:
        our_model_metrics = dbs_results[db_name][OUR_MODEL]
        comp_model_metrics = dbs_results[db_name][COMP_MODEL]
        we_win = summarize_metrics(our_model_metrics) >= summarize_metrics(
            comp_model_metrics)
        per_dataset_winner[db_name.split('.')[0]] = 1 if we_win else -1

    X_raw = pd.read_csv(META_DBS_PATH, header=0, index_col='dataset')
    X = X_raw.loc[list(per_dataset_winner.keys()), :]
    y = pd.Series(
        [per_dataset_winner[db_name] for db_name in per_dataset_winner])

    db_names = [db_name for db_name in per_dataset_winner]

    loo = LeaveOneOut()
    meta_model_results = {}

    os.system('mkdir -p {plots_dir}/{inner_dir}'.format(
        plots_dir=PLOTS_DIR, inner_dir=IMPORTANCE_DIR))
    os.system('mkdir -p {plots_dir}/{inner_dir}'.format(plots_dir=PLOTS_DIR,
                                                        inner_dir=SHAP_DIR))

    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        curr_dataset = X_test.index[0]
        meta_model = xgb.XGBClassifier(booster='gbtree')
        meta_model.fit(X_train, y_train)
        y_pred = meta_model.predict(X_test)[0]

        meta_model_results[curr_dataset] = y_pred

        generate_importance(meta_model, db_names[test_index[0]])
        generate_shap(meta_model, db_names[test_index[0]], X_test)

    y_pred = pd.Series(
        [meta_model_results[db_name] for db_name in meta_model_results])
    print("Meta Model Accuracy: %f" % accuracy_score(y, y_pred))

    meta_results = pd.DataFrame(data={
        'db_name': X.index,
        'y_true': y,
        'y_pred': y_pred
    }).set_index('db_name')
    meta_results.to_csv(os.path.join('/kaggle/working', 'meta-results.csv'))
    meta_results.describe().to_csv(
        os.path.join('/kaggle/working', 'meta-results-describe.csv'))

    dbs_we_won = list(meta_results[meta_results.y_true == 1].index)
    dbs_we_lost = list(meta_results[meta_results.y_true == -1].index)

    won_dbs_lengths = [
        len(X) for (db_name, X, y) in raw_dbs
        if db_name.split('.')[0] in dbs_we_won
    ]
    lost_dbs_lengths = [
        len(X) for (db_name, X, y) in raw_dbs
        if db_name.split('.')[0] in dbs_we_lost
    ]

    print("Average winning dbs length: %f" % np.mean(won_dbs_lengths))
    print("Average losing dbs length: %f" % np.mean(lost_dbs_lengths))
Example no. 17
    'dataset': dataset_names
}  # will contain one row per dataset and models over columns
groups_by_model = df_results.groupby('Algorithm Name')
for model_name in model_names:
    df_model = groups_by_model.get_group(model_name)
    groups_by_dataset = df_model.groupby('Dataset Name')
    model_mean = []
    for dataset_name in dataset_names:
        model_mean.append(
            groups_by_dataset.get_group(dataset_name)
            [metric].mean())  # average over folds
    average_results[model_name] = model_mean
df_results = pd.DataFrame(average_results)
df_results.to_csv('results/average_results.csv', index=False)

# save ranks of algorithms (1 is best, |models| is worst)
df = df_results.drop(columns='dataset')
ranks = rankdata(df.to_numpy(), method='dense', axis=1)
df = pd.DataFrame(ranks, columns=df.columns)
df['dataset'] = df_results['dataset']
df.to_csv('results/ranks.csv')

# friedman and post hoc tests
t_stat, p_val = friedmanchisquare(*[df_results[i] for i in model_names])
print('\nfriedman test p-val = %s' % p_val)
post_hoc_p_vals = posthoc_nemenyi_friedman(
    df_results.drop(columns='dataset').to_numpy())
post_hoc_p_vals.columns = model_names
print('\npost hoc p-vals:\n%s' % post_hoc_p_vals)
post_hoc_p_vals.to_csv('results/post_hoc.csv', index=False)
Example no. 18
import pandas as pd
from scikit_posthocs import posthoc_nemenyi_friedman
from statsmodels.sandbox.stats.multicomp import multipletests

data = pd.read_csv("algo_performance.csv")
p = posthoc_nemenyi_friedman(data.values)

for i in range(0, 5):
    print("For the", i + 1,
          "algorithm and a=0.05 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.05)[0])
    print("For the", i + 1,
          "algorithm and a=0.1 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.1)[0])
    print("For the", i + 1,
          "algorithm and a=0.25 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.25)[0])
    print()
Example no. 19
def process_results(results, groupby='Detector', latex_path=None, fig_path=None,
        bold_best=False, alpha=0.05, one_fig=True, cd_diagram=True):
    '''
    args:
        results (dataframe): table of results.
        groupby (string, list of strings): the name of the column which groups values together
        latex_path (string or None): the location to write the latex table to (if not None)
        fig_path (string or None): the location to write the CD diagram figure(s) to (if not None)
        bold_best (bool): should the best value in each column be bolded?
    '''
    full_results = results.drop(columns='dataset_name').groupby(groupby)#, as_index=False)
    results_summary = full_results.agg(lambda x: f'{np.mean(x):.2f} ({np.std(x):.2f})')

    ## Embolden the best values for each column ##
    results_means = full_results.mean()
    if bold_best:
        for col in results_means.columns:
            if col==groupby or col in groupby or col=='dataset_name':
                continue
            if col in ['Precision', 'F1', 'Recall']:
                idxbf = results_means[col].idxmax()
            elif col in ['Err-rate', 'Memory', 'Runtime', 'Mean Delay']:
                idxbf = results_means[col].idxmin()
            else:
                raise ValueError(f"Is it good if {col} is high or low?")
            results_summary.loc[idxbf, col] = '{bf ' + results_summary.loc[idxbf, col] + '}'

    # Make table latex ready
    results_latex = results_summary.to_latex()#index=False)

    # replace ll...l align with lr...r align
    (l_start, l_fin) = re.search('l(l+)', results_latex).span()
    l_start += 1
    results_latex = results_latex[:l_start ] + 'r'*(l_fin-l_start) + results_latex[l_fin: ]

    ## Make names more readable ##
    before_after = {
        'Mean Delay': 'Mean Delay',
        'Memory': 'Memory (bytes)',
        'Runtime': 'Runtime (ms)',
        'Err-rate': 'Err-rate (\%)',
        'PageHinkley': 'PH',
        'FHDDMS.add': 'FHDDMS$_{add}$',
        'MDDM.A.100': 'MDDM$_A$',
        'MDDM.E.100': 'MDDM$_E$',
        'MDDM.G.100': 'MDDM$_G$',
        'NO\_DETECTION': 'Null',
        'NO_DETECTION': 'Null',
        'NAIVE BAYES': 'NB',
        'PERCEPTRON': 'PR',
        'HOEFFDING TREE': 'HT',
        'LEDConceptDrift': 'LED',
        '\{bf ': '{\\fontseries{b}\\selectfont ',
        '\}': '}'
    }
    before_after.update({
        f'HDDM.{x}.test': f'HDDM$_{x}$' for x in 'AW'
    })
    # Replace underscores with spaces in mode names
    if 'Mode' in results.columns:
        before_after.update({
            x.replace('_', '\_'): ' '.join(x.split('_')).title() for x in results['Mode'].unique()
        })
        # print(before_after)
    for before, after in before_after.items():
        # if after != '}':
        #     after = after.rjust(len(before))
        results_latex = results_latex.replace(before, after)

    # Write the LaTeX table to disk
    if latex_path:
        with open(os.path.abspath(latex_path), 'w') as f:
            f.write(results_latex)
            print('Writing LaTeX table to', latex_path)

    ## CD DIAGRAMS ##
    if not cd_diagram:
        return results_latex
    if one_fig:
        nfigs = len(results_summary.columns)# if results not in ['Detector', 'dataset_name'])
        nfigs = nfigs+1 if nfigs%2==0 else nfigs
        nrows = nfigs // 2
        ncols = 2
        fig_i = 1
        width=10
        height=4
        fig = plt.figure(figsize=(width*2+1, height*nrows+1))
        fig.set_facecolor('white')
    # change names of detectors according to before_after dictionary
    results.loc[:, 'Detector'] = results.Detector.map(before_after).fillna(results['Detector'])
    for col in results_means.columns:

        # print(f'Processing {col}')

        # Figure out if the plot should be reversed or not
        if col==groupby or col in groupby or col=='dataset_name':
            continue
        if col in ['Precision', 'F1', 'Recall']:
            reverse=True
        elif col in ['Err-rate', 'Memory', 'Runtime', 'Mean Delay']:
            reverse=False
        else:
            raise ValueError(f"Is it good if {col} is high or low?")

        # Convert the column data into matrix form
        dets = results.Detector.unique()
        dsets = results.dataset_name.unique()
        data = []
        for dset in dsets:
            row = []
            for det in dets:
                # print(dset)
                x = list(results[(results['Detector']==det) & (results['dataset_name']==dset)][col])[0]
                row.append(x)
            data.append(row)
        data = np.array(data)

        # Replace all the NaNs in the data with zeros
        data = np.nan_to_num(data)

        # Perform Nemenyi-Friedman test
        nem = sp.posthoc_nemenyi_friedman(data)
        # print('Post-hocs computed.')

        # Put p-values in a form that the cd-diagram code can use
        p_vals = []
        for i, det1 in enumerate(dets):
            for j, det2 in enumerate(dets[i+1: ]):
                p_val = nem[i][j+i+1]
                p_vals.append(( det1, det2, p_val, p_val<alpha ))

        # Set span of CD-diagram and compute average values or average rank
        lowv, highv = None, None
        if col in ['Precision', 'Recall', 'F1']: # , 'Err-rate'
            lowv, highv = 0, 1
            average_vals = results.groupby('Detector').mean()[col]
            if col=='Err-rate':
                lowv, highv = 0, 100
        else:
            # Compute average rank
            average_vals = pd.DataFrame(columns=['Detector', col])
            average_vals[col] = -average_vals[col]
            for dset in results.dataset_name:
                results_i = results[ results['dataset_name']==dset ]
                # print(col)
                # print(results_i[['Detector', col]])
                results_i.loc[:, col] = results_i[col].rank(ascending=True)
                # print(results_i[['Detector', col]])
                average_vals = average_vals.append(results_i[['Detector', col]])
                # break
            # sys.exit()
            # print(col)
            # print(average_vals)
            average_vals = average_vals.groupby('Detector').mean()[col]
            # print(average_vals)
        # print('Average vals computed.')


        # Put the average values in a form that the cd-diagram code can use
        average_vals = average_vals.sort_values()
        if reverse:
            average_vals = average_vals[::-1]
        # minv = average_vals.min()
        # maxv = average_vals.max()

        # Plot the cd diagram
        if one_fig:
            ax = fig.add_subplot(nrows, ncols, fig_i)
            fig_i += 1
        else:
            ax = None
        graph_ranks(
            average_vals.values,
            average_vals.keys(),
            p_vals,
            cd=None,
            reverse=reverse,
            textspace=1, labels=False,
            highv=highv, lowv=lowv,
            ax=ax,
            width=width,
            height=height,
            # highv = int(maxv + (maxv-minv)*0.1),
            # lowv = int(minv - (maxv-minv)*0.1)
        )
        font = {'family': 'sans-serif',
            'color':  'black',
            'weight': 'normal',
            'size': 22,
        }
        ax.set_title(col,fontdict=font, x=0.5, y=0.95) # 0.9
        if not one_fig:
            # fig_path = os.path.abspath(path)+f"-{col.replace(' ', '_')}.pdf"
            plt.savefig(fig_path,bbox_inches='tight')

        # print(f'Completed plot for {col}')

    if one_fig:
        # fig_path = os.path.abspath(path)+".pdf"
        # plt.show()
        print('Writing cd diagrams to', fig_path)
        plt.savefig(fig_path, bbox_inches='tight')

    return results_latex
Example no. 20
for system_order in ['finite', 'infinite']:
    print(system_order.upper())

    for eval in range(len(function_evals_of_interest)):
        algorithms_at_fes = []
        for algorithm in ['sa', 'acfsa', 'pso', 'aiwpso', 'acor', 'baacor']:
            print(algorithm)
            # Load test costs of a given metaheuristic for a given system, considering some number of objective function evaluations
            base_filename = './results/' + algorithm + '_' + system_order
            test_costs_mat = np.load(base_filename + '_test_costs.npy')
            test_costs_of_interest = test_costs_mat[:, evals_mask]
            costs_fe = test_costs_of_interest[:, eval]

            algorithms_at_fes.append(list(costs_fe))
            print(
                str(function_evals_of_interest[eval]) + ':  \t' +
                str(np.mean(costs_fe)))

        algorithms_at_fes = np.array(algorithms_at_fes)
        print('\n Statistical significance')
        print(np.shape(algorithms_at_fes))
        print('Friedman p-val = ' +
              str(scipy.stats.friedmanchisquare(*algorithms_at_fes)[1]) +
              '\n\n')
        nm_posthoc = sp.posthoc_nemenyi_friedman(algorithms_at_fes.T)
        plt.figure()
        sp.sign_plot(nm_posthoc, **heatmap_args)
        plt.show()
        print('\n')
Example no. 21
def nemenyi():
    n = len(models)
    size = int(math.factorial(n) / (math.factorial(n - 2) * math.factorial(2)))

    print(size)
    nemenyi_results = {}

    for dataset in data1.keys():
        print(dataset)

        results1 = data1[dataset]
        results5 = data5[dataset]
        results10 = data10[dataset]
        results20 = data20[dataset]

        nemenyi_results[dataset] = np.zeros(shape=(size, 5), dtype=object)
        index = 0
        matrix1 = np.zeros((30, 7), dtype=float)
        matrix5 = np.zeros((30, 7), dtype=float)
        matrix10 = np.zeros((30, 7), dtype=float)
        matrix20 = np.zeros((30, 7), dtype=float)

        res1 = None
        res5 = None
        res10 = None
        res20 = None

        for i, model in enumerate(models):
            matrix1[:, i] = results1[model]
            matrix5[:, i] = results5[model]
            matrix10[:, i] = results10[model]
            matrix20[:, i] = results20[model]

        res1 = sp.posthoc_nemenyi_friedman(matrix1)
        res5 = sp.posthoc_nemenyi_friedman(matrix5)
        res10 = sp.posthoc_nemenyi_friedman(matrix10)
        res20 = sp.posthoc_nemenyi_friedman(matrix20)

        col = 1
        for row in range(res1.shape[0]):
            for m in range(len(models) - col):
                p1, p5, p10, p20 = None, None, None, None
                p1 = round(res1.iloc[row, col + m], 4)
                p5 = round(res5.iloc[row, col + m], 4)
                p10 = round(res10.iloc[row, col + m], 4)
                p20 = round(res20.iloc[row, col + m], 4)

                print("\\hline")
                if (p1 < 0.05):
                    p1 = "\\textit{%f}" % (p1)
                else:
                    p1 = "%f" % (p1)

                if (p5 < 0.05):
                    p5 = "\\textit{%f}" % (p5)
                else:
                    p5 = "%f" % (p5)

                if (p10 < 0.05):
                    p10 = "\\textit{%f}" % (p10)
                else:
                    p10 = "%f" % (p10)

                if (p20 < 0.05):
                    p20 = "\\textit{%f}" % (p20)
                else:
                    p20 = "%f" % (p20)

                comp = "%s vs %s" % (models[row], models[col + m])

                nemenyi_results[dataset][index, 0] = comp
                nemenyi_results[dataset][index, 1] = p1
                nemenyi_results[dataset][index, 2] = p5
                nemenyi_results[dataset][index, 3] = p10
                nemenyi_results[dataset][index, 4] = p20
                index += 1
                print("\\textbf{%s vs %s} & %s & %s & %s & %s \\\\" %
                      (models[row], models[col + m], p1, p5, p10, p20))
            col += 1

    return nemenyi_results
Example no. 22
    filename_, file_extension = os.path.splitext(filename)
    if file_extension == ".xlsx":
        data = pd.read_excel(dirname+'\\'+filename,usecols=['dataset_name', 'algorithm_name', 'roc_auc'])
    else:
        data = pd.read_csv(dirname+'\\'+filename, usecols=['dataset_name', 'algorithm_name', 'roc_auc'])
    # average auc for each dataset
    avg_auc = data.groupby(['dataset_name', 'algorithm_name'], as_index= False).mean()
    res_df = res_df.append(avg_auc)
res_df.reset_index(inplace=True, drop=True)
# get all datasets names with results from all four algorithms
dataset_names = res_df.groupby('dataset_name', as_index = False).count()
dataset_names = list(dataset_names[dataset_names['algorithm_name'] == 4]['dataset_name'])
# filter result to contain only datasets that are in dataset_names
res_df = res_df[res_df['dataset_name'].apply(lambda x: x in dataset_names)]
alog_names = list(res_df['algorithm_name'].unique())

# Friedman test
stat, p = stats.friedmanchisquare(res_df[res_df['algorithm_name'] == alog_names[0]].sort_values(by='dataset_name')['roc_auc'],
                              res_df[res_df['algorithm_name'] == alog_names[1]].sort_values(by='dataset_name')['roc_auc'],
                              res_df[res_df['algorithm_name'] == alog_names[2]].sort_values(by='dataset_name')['roc_auc'],
                              res_df[res_df['algorithm_name'] == alog_names[3]].sort_values(by='dataset_name')['roc_auc'])

# interpret results
alpha = 0.05
print('Statistics=%.3f, p=%.3f' % (stat, p))
if(p < alpha):
    print('null hypothesis rejected')
    # perform post-hoc test
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(sp.posthoc_nemenyi_friedman(res_df, y_col='roc_auc',block_col='dataset_name', group_col='algorithm_name',melted=True))
Example no. 23
pip install scikit-posthocs
import pandas as pd
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Friedman test
# First we build a dataframe with the views 0: fac, 1: fou, 2: kar, and row 3 holding the sum rule.
# The columns are bg: Gaussian Bayes, bkv: Bayesian K-nearest neighbors, and par: Parzen
tbacc = pd.DataFrame.from_dict({'view': {0: 0, 1: 1, 2: 2, 3: 3},
                                'bg': {0: 0.805, 1: 0.58, 2: 0.735, 3: 0.815},
                                'bkv': {0: 0.79, 1: 0.63, 2: 0.775, 3: 0.855},
                                'par': {0: 0.835, 1: 0.615, 2: 0.82, 3: 0.835}})
print(tbacc)
# Apply the Friedman test to the accuracies of the three classifiers
result = friedmanchisquare(tbacc["bg"], tbacc["bkv"], tbacc["par"])
print(result)

# Then, if the null hypothesis is rejected, we run the Nemenyi post-hoc test
dados = pd.DataFrame.from_dict({'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3,
                                           8: 0, 9: 1, 10: 2, 11: 3},
                                'groups': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1,
                                           8: 2, 9: 2, 10: 2, 11: 2},
                                'y': {0: 0.805, 1: 0.58, 2: 0.735, 3: 0.815, 4: 0.79, 5: 0.63,
                                      6: 0.775, 7: 0.855, 8: 0.835, 9: 0.615, 10: 0.820, 11: 0.835}})

print(dados)

sp.posthoc_nemenyi_friedman(dados, y_col='y', block_col='blocks', group_col='groups', melted= True)
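For reference (not in the original snippet): because posthoc_nemenyi_friedman treats columns as groups and rows as blocks by default, the same p-value matrix can be obtained directly from the wide table, without building the melted dados frame by hand:

# equivalent call on the wide table: bg/bkv/par are the groups, the four views are the blocks
sp.posthoc_nemenyi_friedman(tbacc[["bg", "bkv", "par"]])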
Example no. 24
    t = round(t, 2)
    p = round(p, 2)
    res.append([t, p])
    return res


friedman_l = []
for i in range(len(all_data)):
    res2 = friedman(all_data[i])
    friedman_l.extend(res2)

friedman_l = pd.DataFrame(friedman_l)

friedman_recog = friedman(rt_recog)
rr_normality.qq_plot(rt_recog, recall=True)
nemeny_recog = sp.posthoc_nemenyi_friedman(rt_recog.iloc[:, [0, 1, 2]])
print('-----------', friedman_recog, nemeny_recog, '-------')


def anova(data):

    data = pd.melt(data,
                   id_vars='sub_id',
                   var_name='cond',
                   value_name='performance')

    #    #perform anova
    anovarm = AnovaRM(data, 'performance', 'sub_id', within=['cond'])
    res = anovarm.fit()

    #rounded p value
Example no. 25
def main(dataset, alpha=.05):
    os.chdir(os.path.dirname(os.path.realpath(__file__)) + '/../')

    directory = os.path.dirname(os.path.realpath(__file__)) + '/' + dataset + '/info/'
    files = glob.glob(directory + '*.json')

    BA_AUCs = {}
    BA_10s = {}

    print(os.getcwd())
    for file in files:
        fs_class = file.split('.')[-2].split('_')[-1]
        with open(file, 'r') as outfile:
            stats = json.load(outfile)
        n_features = np.asarray(stats['classification']['n_features'])
        for key in ['BA', 'svc_BA', 'model_BA']:
            if key not in stats['classification']:
                continue
            BA_key = fs_class + '_' + key
            BA = np.asarray(stats['classification'][key]).T
            BA_AUC = (.5 * (BA[:, 1:] + BA[:, :-1]) * (n_features[1:] - n_features[:-1]) / (n_features[-1] - n_features[0])).sum(axis=-1)
            BA_AUCs[BA_key] = BA_AUC
            BA_10s[BA_key] = BA[:, 0]
            print('method : ', fs_class)
            print('BA', key, ' : ', BA.mean(axis=0), '+-', BA.std(axis=0))
            print('BA_10', key, ' : ', BA_10s[BA_key].mean(axis=0), '+-', BA_10s[BA_key].std(axis=0))
            print('BA_AUC', key, ' : ', BA_AUC.mean(axis=0), '+-', BA_AUC.std(axis=0))


    for t, BA_dict in enumerate([BA_10s, BA_AUCs]):
        print('BA 10 features' if t == 0 else 'BA_AUC')

        keys = list(BA_dict.keys())
        # wilcoxon_matrix = np.zeros((len(keys), len(keys)))
        # for i in range(len(keys) - 1):
        #     BA_i = BA_dict[keys[i]]
        #     for j in range(i+1, len(keys)):
        #         BA_j = BA_dict[keys[j]]
        #         t, p_value = wilcoxon(BA_i, BA_j)
        #         if p_value < alpha:
        #             if BA_i.mean() > BA_j.mean():
        #                 wilcoxon_matrix[i, j] = 1
        #                 wilcoxon_matrix[j, i] = -1
        #             else:
        #                 wilcoxon_matrix[i, j] = -1
        #                 wilcoxon_matrix[j, i] = 1
        #
        # # print(keys)
        # # print(wilcoxon_matrix)
        #
        # min_wilkoxon = wilcoxon_matrix.min(axis=-1)
        # max_wilkoxon = wilcoxon_matrix.max(axis=-1)
        # best_methods = np.where((min_wilkoxon + 1) * max_wilkoxon > 0)[0]
        # print('wilcoxon best methods : ', np.asarray(keys)[best_methods])

        auc = tuple(list(BA_dict.values()))
        _, p_value = friedmanchisquare(*auc)
        print('friedman p_value : ', p_value)
        nemenyi = sp.posthoc_nemenyi_friedman(np.array(auc).T).values
        nemenyi_matrix = np.zeros((len(keys), len(keys)))
        for i in range(len(keys) - 1):
            BA_i = BA_dict[keys[i]]
            for j in range(i + 1, len(keys)):
                BA_j = BA_dict[keys[j]]
                p_value = nemenyi[i,j]
                if p_value < alpha:
                    if BA_i.mean() > BA_j.mean():
                        nemenyi_matrix[i, j] = 1
                        nemenyi_matrix[j, i] = -1
                    else:
                        nemenyi_matrix[i, j] = -1
                        nemenyi_matrix[j, i] = 1

        min_nemenyi = nemenyi_matrix.min(axis=-1)
        max_nemenyi = nemenyi_matrix.max(axis=-1)
        best_methods = np.where((min_nemenyi + 1) * max_nemenyi > 0)[0]
        print('nemenyi best methods : ', np.asarray(keys)[best_methods])