def run_eval(dataset, iterations):
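    """Load one fairness dataset, train each method in its own Process, and
    plot statistical-parity results collected from the per-method temp files."""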
    suffixes = ['Adaboost', 'AdaFair', 'SMOTEBoost', 'Fish et al.']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "credit":
        X, y, sa_index, p_Group, x_control = load_credit()
    elif dataset == "diabetes":
        X, y, sa_index, p_Group, x_control = load_diabetes()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()

    else:
        exit(1)
    create_temp_files(dataset, suffixes)
    threads = []
    mutex = []
    for _ in range(8):  # pre-create locks shared with the worker processes
        mutex.append(Lock())
    print(dataset)
    random.seed(int(time.time()))

    for iteration in range(iterations):

        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=iteration)
        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # One process per method: the results loop below unpickles a file
            # for every suffix, so all four methods must be trained.
            # 500 base learners, matching the previously hard-coded AdaFair call.
            for proc in range(len(suffixes)):
                threads.append(Process(target=train_classifier,
                                       args=(X_train, X_test, y_train, y_test,
                                             sa_index, p_Group, dataset + suffixes[proc],
                                             mutex[proc], proc, 500, 1, dataset)))
            break  # n_splits=1 yields a single split; break keeps that explicit
    for process in threads:
        process.start()

    for process in threads:
        process.join()

    results = []
    for suffix in suffixes:
        with open(dataset + suffix, 'rb') as infile:
            results.append(pickle.load(infile).performance)

    plot_my_results_sp(results, suffixes, "Images/StatisticalParity/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
def load_datasets(dataset, names):
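    """Return [n_samples, n_features, imbalance_ratio] for the named dataset.

    Names not matched explicitly are fetched via imblearn's fetch_datasets();
    the dataset name is also recorded in the caller-supplied `names` set."""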
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = [
            "feature_" + str(i) for i in range(0, data['data'].shape[1])
        ]
        X = data['data']
        y = data['target']
    y[y != 1] = 0

    names.add(dataset)
    # n_samples, n_features, and the class-imbalance ratio (negatives per positive).
    output = [X.shape[0],
              X.shape[1],
              round(float(len(y[y != 1]) / len(y[y == 1])), 2)]

    return output
def get_dataset(dataset):
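    """Load the named dataset and return (X, y, cl_names) with y binarised to {0, 1}."""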
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset not in ['bloob', 'circle', 'moon']:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
    else:
        # Guard against falling through with X, y, cl_names unbound,
        # which would raise a NameError at the relabelling step below.
        raise ValueError("Synthetic dataset '" + dataset + "' is not handled by get_dataset")
    y[y != 1] = 0

    return X, y, cl_names
def run_eval(dataset, base_learners, methods):
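    """Train each method on the full dataset in parallel, plot the cumulative
    margin distributions (positive, negative, overall), and return the CDF
    data keyed by method name."""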
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']

    y[y != 1] = 0

    list_of_scores = []
    processes = []

    for method in methods:
        p = Process(target=train_classifier, args=(X, y, base_learners, method, cl_names))  # one worker process per method
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    for method in methods:
        with open('temp/' + method, 'rb') as filehandle:
            # read the data as binary data stream
            list_of_scores.append(pickle.load(filehandle))

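    # Relabel to {+1, -1} so that score * y is a signed margin
    # (positive when the ensemble's prediction matches the true label).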
    y[y != 1] = -1

    for idx in range(0, len(list_of_scores)):
        list_of_scores[idx] = numpy.array(list_of_scores[idx]) * y

    overall_confs = []
    positive_confs = []
    negative_confs = []

    for conf in list_of_scores:
        overall_confs.append(conf)
        positive_confs.append(conf[y == 1])
        negative_confs.append(conf[y == -1])

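    # Plot empirical CDFs of the margins: positives, negatives, and all examples.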
    num_bins = 40
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
    plt.rcParams.update({'font.size': 12})
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato']
    default_cycler = (cycler(color=colors) +
                      cycler(linestyle=['-', (0, (1, 1)), '--', '-.',
                                        (0, (5, 10)),
                                        (0, (5, 1)),
                                        '-', (0, (1, 1)), '--', '-.',
                                        (0, (5, 10))]))

    ax1.set_prop_cycle(default_cycler)
    ax2.set_prop_cycle(default_cycler)
    ax3.set_prop_cycle(default_cycler)

    ax1.set_title("Positive CDF")
    ax1.grid(True)
    ax1.set_xlim(-1, 1)
    ax2.set_xlim(-1, 1)
    ax3.set_xlim(-1, 1)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato', 'indigo', 'lightskyblue']  # only used by the commented-out explicit-color plot calls

    output = defaultdict(list)

    for idx in range(0, len(positive_confs)):
        pos_conf = positive_confs[idx]
        counts_positives, bin_edges_positives = numpy.histogram(pos_conf, bins=num_bins, density=True)
        cdf_positives = numpy.cumsum(counts_positives)
        # ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx],color=colors[idx])
        ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_positives[:-1])
        output[methods[idx]].append(cdf_positives)

    # ax1.legend(loc='best')
    ax1.set_xlabel("Margin")

    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)

    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    ax2.set_ylabel("Cumulative Distribution")
    ax2.set_xlabel("Margin")

    for idx in range(0, len(negative_confs)):
        neg_conf = negative_confs[idx]
        counts_negatives, bin_edges_negatives = numpy.histogram(neg_conf, bins=num_bins, density=True)
        cdf_negatives = numpy.cumsum(counts_negatives)
        # ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx],color=colors[idx])
        ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_negatives[:-1])
        output[methods[idx]].append(cdf_negatives)

    ax3.grid(True)

    ax3.axhline(0, color='black')
    ax3.axvline(0, color='black')
    ax3.set_title("Overall CDF")

    ax3.set_ylabel("Cumulative Distribution")
    ax3.set_xlabel("Margin")

    for idx in range(0, len(overall_confs)):
        over_conf = overall_confs[idx]
        counts_overall, bin_edges_overall = numpy.histogram(over_conf, bins=num_bins, density=True)
        cdf_overall = numpy.cumsum(counts_overall)
        # ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx], color=colors[idx])
        ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_overall[:-1])
        output[methods[idx]].append(cdf_overall)

    plt.legend(loc='upper center', bbox_to_anchor=(-0.7, 1.305), ncol=5)

    if not os.path.exists("Images/cdf_plots/" + dataset):
        os.makedirs("Images/cdf_plots/" + dataset)

    plt.savefig("Images/cdf_plots/" + dataset + "/cdf_" + str(base_learners) + ".png", bbox_inches='tight',
                dpi=200)
    return output
def run_eval(dataset, baseL, methods):
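    """Train each method in parallel, then plot amortised vs. non-amortised
    statistics and return one stats dict per method."""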
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']

    y[y != 1] = 0
    print("===============-- " + dataset + " --===============")

    processes = []
    for method in methods:
        p = Process(target=train_and_predict, args=(X, y, baseL, method))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    list_of_dicts = []

    for method in methods:
        with open('temp_preds_adaac/' + method, 'rb') as filehandle:
            list_of_dicts.append(update_stats(pickle.load(filehandle)))

    plot_amort_vs_non_amort(methods, list_of_dicts, baseL, "Images/Amort_vs_non_amort/" + dataset + "/")
    return list_of_dicts
def run_eval(dataset, folds, iterations, baseL, methods):
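    """Evaluate each method with repeated stratified k-fold cross-validation
    across the ensemble sizes in `baseL`, plotting performance and resource
    statistics; returns the per-method performance and resource dicts."""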
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "rain_aus":
        X, y, cl_names = load_rain_aus()
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']
    y[y != 1] = 0

    unique_attr = set([i.split("?")[0] for i in cl_names])
    # Dataset name, #attributes, #positives, #negatives, and imbalance ratio.
    positives = int(sum(y[y == 1]))
    negatives = len(y[y != 1])
    print(dataset + "\t" + str(len(unique_attr)) + "\t" + f'{positives:,}' + "\t"
          + f'{negatives:,}' + "\t1:" + format(negatives / positives, '.2f'))

    list_of_dicts = []
    list_of_dicts_stats = []

    for _ in range(len(methods)):
        list_of_dicts.append(defaultdict(dict))
        list_of_dicts_stats.append(defaultdict(dict))

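    # Pre-allocate one nested dict per method, keyed by ensemble size.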
    for weak_learners in baseL:
        for item in list_of_dicts:
            item[weak_learners] = defaultdict(list)
        for item in list_of_dicts_stats:
            item[weak_learners] = defaultdict(list)

    for samples in range(0, iterations):

        sss = StratifiedKFold(n_splits=folds, shuffle=True, random_state=int(time.time()))
        for weak_learners in baseL:
            print("iteration=", samples, " weak learners=", weak_learners)

            for train_index, test_index in sss.split(X, y):

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                processes = []
                for method in methods:
                    p = Process(target=train_and_predict,
                                args=(X_train, y_train, X_test, weak_learners, method, cl_names))
                    p.start()
                    processes.append(p)

                for p in processes:
                    p.join()

                for index, method in enumerate(methods):
                    with open('temp_preds/' + method, 'rb') as filehandle:
                        list_of_dicts[index] = update_performance_stats(
                            calculate_performance(y_test, pickle.load(filehandle)),
                            list_of_dicts[index],
                            weak_learners
                        )

                    with open('temp_preds/stats_' + method, 'rb') as filehandle:
                        list_of_dicts_stats[index] = update_resource_stats(pickle.load(filehandle),
                                                                           list_of_dicts_stats[index],
                                                                           weak_learners,
                                                                           method
                                                                           )
    plot_single_dataset(methods, list_of_dicts, "Images/Performance/" + dataset + "/", baseL)
    plot_resource_stats_time(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL)
    plot_resource_stats_scores(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL)
    return list_of_dicts, list_of_dicts_stats
def run_eval(dataset, base_learners, methods):
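    """Train each method in parallel and plot a stacked bar chart of the
    per-method feature importances read back from the temp files."""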
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = [
            "feature_" + str(i) for i in range(0, data['data'].shape[1])
        ]
        X = data['data']
        y = data['target']

    y[y != 1] = 0

    processes = []

    for method in methods:
        p = Process(target=train_classifier,
                    args=(X, y, base_learners, method, cl_names))  # one worker process per method
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    raw_data = dict()

    for method in methods:
        with open('temp_features/' + method, 'rb') as filehandle:
            # read the data as binary data stream
            model = pickle.load(filehandle)
            # print (method, model.feature_importances_)
            raw_data[method] = model.feature_importances_
            f_num = len(model.feature_importances_)
    index = ["Feature " + str(k) for k in range(1, f_num + 1)]
    df = pd.DataFrame(raw_data, index=index)
    df = df.transpose()

    ax = df.plot.bar(stacked=True, alpha=0.75, rot=25)
    ax.set_ylabel("Feature importance")
    ax.set_xlabel("Methods")
    ax.legend(loc='center left', bbox_to_anchor=(0.1, 1.07),
              ncol=3)  # place the legend above the axes

    ax.figure.savefig('Images/features/' + dataset + '.png',
                      bbox_inches='tight',
                      dpi=200)