def check_if_dominance_interval(iterations=100):
    """Check to see wheter the evaluation is inside a given interval found."""
    random.seed(0)
    datasets = ("SHA", "EPI", "HR")
    header = ["", "Neither", "OR", "AND"]
    n = 100

    res = []
    for dataset in datasets:
        print("\n"*2, "-"*35, dataset, "-"*35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[:2]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])

        res_dataset = [0, 0, 0]  # counters: Neither / OR / AND (cf. header)
        for it in range(iterations):
            iteration_res = check_if_interval_iteration(A, n, k)
            for col in iteration_res:
                res_dataset[col] += 1
        res.append([dataset] + [o/iterations for o in res_dataset])

    helpers.printmatrix([header] + res)
def check_good_dominance_interval(iterations):
    """Check to see wheter the evaluation is inside a given interval found."""
    random.seed(0)
    datasets = ("SHA", "EPI", "HR")
    header = [""] + list(datasets)
    n = 100
    percentiles = (0, 12.5, 25, 37.5, 50, 62.5, 75, 87.5, 100)

    res_tot = [[p] for p in percentiles]
    res_tot.append(['av'])

    for dataset in datasets:
        print("\n"*2, "-"*35, dataset, "-"*35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[:2]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])

        res = [[p, 0] for p in percentiles]
        res.append(['av', 0])

        for it in range(iterations):
            iteration_res = check_good_interval_iteration(A, n, k, percentiles)
            for col in iteration_res:
                res[col][1] += 1

        for i in range(len(res)):
            res[i][1] /= iterations
            res_tot[i].append(res[i][1])

        helpers.printmatrix(res)
    helpers.printmatrix([header] + res_tot)
def estimate_by_dom_with_criteria(A_plus, c, a_miss, indices):
    """Estimate the evaluation of a_miss on c with dominace extrapolation."""
    # helpers.printmatrix(A_plus)
    # print(a_miss)
    # print()
    better = A_plus
    worse = A_plus

    # Sanity check: a string evaluation would break the comparisons below.
    for i in indices:
        if isinstance(a_miss[i], str):
            helpers.printmatrix(A_plus)
            print()
            print(a_miss)

    for i in indices:
        better = [a for a in better if a[i] >= a_miss[i]]
        worse = [a for a in worse if a[i] <= a_miss[i]]

    # better.sort(key=better[c], reverse=True)
    better_c = [a[c] for a in better]
    worse_c = [a[c] for a in worse]

    if not better_c and not worse_c:
        # print('no dominated neither dominant')
        # all_c = [a[c] for a in A_plus]
        # return (np.mean(all_c), -11)
        return ('no dominated neither dominant', -11)

    elif not worse_c:
        # Only dominating alternatives: min(better_c) is the tightest upper bound.
        return (min(better_c), -10)
    elif not better_c:
        # Only dominated alternatives: max(worse_c) is the tightest lower bound.
        return (max(worse_c), -1)
    else:
        # Both bounds exist: estimate by the midpoint of the dominance interval.
        estimation = (max(worse_c) + min(better_c)) / 2
        return (estimation, abs(max(worse_c) - min(better_c)))
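# A minimal sanity check for estimate_by_dom_with_criteria on a hypothetical
# 2x3 matrix (illustrative values, not taken from the datasets above): the
# estimate is the midpoint between the tightest lower and upper bounds.
def _demo_estimate_by_dom():
    # One alternative dominated by a_miss, one dominating it on criteria 0, 1.
    A_plus = [[0.25, 0.25, 0.25],
              [0.75, 0.75, 0.75]]
    a_miss = [0.5, 0.5, None]  # evaluation on criterion 2 is missing
    est, width = estimate_by_dom_with_criteria(A_plus, 2, a_miss, [0, 1])
    print(est, width)  # -> 0.5 0.5: midpoint of [0.25, 0.75], interval width 0.5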
def test_layers():
    random.seed(0)
    A = [[random.random() for i in range(3)] for j in range(15)]

    layers = layer_rg.compute_layers(A)

    for i, layer in enumerate(layers):
        print("layer :", i)
        helpers.printmatrix(layer)
        print()
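# layer_rg.compute_layers is defined elsewhere; the sketch below is one
# plausible reading (successive Pareto fronts under maximisation), NOT
# necessarily the implementation used by layer_rg.
def _pareto_layers_sketch(A):
    remaining = [list(a) for a in A]
    layers = []
    while remaining:
        # An alternative stays in the current front if nothing left dominates it.
        front = [a for a in remaining
                 if not any(b != a and all(b[i] >= a[i] for i in range(len(a)))
                            for b in remaining)]
        layers.append(front)
        remaining = [a for a in remaining if a not in front]
    return layers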
def check_dominance_assumption(iterations=10):
    """Test if dominance is still respected."""
    datasets = ("SHA", "EPI", "HR")
    header = ["", "MEAN", "STD"]
    n = 100
    res = []

    for dataset in datasets:
        print("\n"*2, "-"*35, dataset, "-"*35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[:2]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])

        res = [[] for _ in range(9)]  # one list per statistic (labels below)

        for it in range(iterations):
            i = random.randint(0, n - 1)
            c = random.randint(0, k - 1)
            a = A[i]
            del A[i]
            a_miss = a[:]
            a_miss[c] = NULL
            indices = de.train_dom(A, c, a_miss)

            dominant, dominated = de.count_dominant_alts(A, indices, a_miss)
            indices.append(c)
            dominant_c, dominated_c = de.count_dominant_alts(A, indices, a)

            res[0].append(dominant)
            res[1].append(dominant_c)

            res[2].append(dominant_c/dominant if dominant else 0)

            res[3].append(dominated)
            res[4].append(dominated_c)
            res[5].append(dominated_c/dominated if dominated else 0)

            res[6].append(dominated + dominant)
            res[7].append(dominated_c + dominant_c)
            res[8].append((dominated_c + dominant_c)/(dominated + dominant)
                          if (dominated + dominant) else 0)

            A.insert(i, a)

        final_res = [[" ", "   ", "MEAN", "STD"]]
        lines = ["Dom+", "Dc+", "ratio", "dom-", "dc-", "ratio",
                 "Tot", "tot_c", "ratio"]

        for i in range(9):
            final_res.append([lines[i], " ", np.mean(res[i]), np.std(res[i])])

        helpers.printmatrix(final_res, width=5)
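# de.count_dominant_alts is external; a plausible reading, given how it is
# used above (an assumption, not the verified implementation): count how many
# alternatives dominate / are dominated by `a` on the criteria in `indices`.
def _count_dominant_alts_sketch(A, indices, a):
    dominant = sum(all(b[i] >= a[i] for i in indices) for b in A)
    dominated = sum(all(b[i] <= a[i] for i in indices) for b in A)
    return dominant, dominated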
def estimate_by_dom_with_criteria_knn(A_plus, c, a_miss, indices, k):
    """Estimate the evaluation of a_miss on c with dominace extrapolation."""
    # helpers.printmatrix(A_plus)
    # print(a_miss)
    # print()
    better = A_plus
    worse = A_plus

    # Sanity check: the criteria used for filtering must not be missing.
    for i in indices:
        if a_miss[i] == NULL:
            helpers.printmatrix(A_plus)
            print(a_miss)

    for i in indices:
        better = [a for a in better if a[i] >= a_miss[i]]
        worse = [a for a in worse if a[i] <= a_miss[i]]

    # better.sort(key=better[c], reverse=True)
    better_ind = [[a[i] for i in indices] for a in better]
    worse_ind = [[a[i] for i in indices] for a in worse]

    # kneighbors expects a 2-D array: one row per query point.
    target = np.array([a_miss[i] for i in indices]).reshape(1, -1)

    if len(better_ind) > 0:
        better_ind = np.array(better_ind)
        nnbrg = NearestNeighbors(n_neighbors=min(k, len(better_ind)))
        nnbrg = nnbrg.fit(better_ind)
        distances, best_k_indices = nnbrg.kneighbors(target)
        # print('best k ind:', best_k_indices, k)
        best_k_indices = best_k_indices[0]
        # Average over the neighbours actually found (may be fewer than k).
        better_estimation = sum(better[j][c]
                                for j in best_k_indices) / len(best_k_indices)

    if len(worse_ind) > 0:
        worse_ind = np.array(worse_ind)
        nnbrg = NearestNeighbors(n_neighbors=min(k, len(worse_ind)))
        nnbrg = nnbrg.fit(worse_ind)
        distances, worse_k_indices = nnbrg.kneighbors(target)
        worse_k_indices = worse_k_indices[0]
        # Average over the neighbours actually found (may be fewer than k).
        worse_estimation = sum(worse[j][c]
                               for j in worse_k_indices) / len(worse_k_indices)

    if len(better_ind) == 0 and len(worse_ind) == 0:
        # print('no dominated neither dominant')
        # all_c = [a[c] for a in A_plus]
        # return (np.mean(all_c), -11)
        return 'no dominated neither dominant'

    elif len(better_ind) == 0:
        return worse_estimation
    elif len(worse_ind) == 0:
        return better_estimation
    else:
        estimation = (worse_estimation + better_estimation) / 2
        return estimation
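# sklearn's kneighbors expects a 2-D query array (one row per query point),
# hence the reshape(1, -1) above.  A self-contained check on toy data:
def _demo_kneighbors_shape():
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    pts = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
    nn = NearestNeighbors(n_neighbors=2).fit(pts)
    dist, ind = nn.kneighbors(np.array([0.4, 0.4]).reshape(1, -1))
    print(ind[0])  # -> [0 1], the two nearest rows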
def compare_rankings(alt_num=20, it=500, del_num=1):
    """Compare strategies."""
    random.seed(1)
    datasets = ('HR', 'SHA', 'EPI', 'HP')
    # datasets = ('SHA',)
    header = ["    "] + list(datasets) + ["mean", "std"]
    methods = {
        # 'creg': mv.replace_by_creg,
        # 'ereg': mv.replace_by_ereg,
        'sreg': mv.replace_by_sreg,
        'dom': mv.replace_by_dominance,
        'd_diff': mv.replace_by_dominance_smallest_diff,
        'knn': mv.replace_by_knn,
        'mean': mv.replace_by_mean,
        'med': mv.replace_by_med
    }
    # 'pij': mv.replace_by_pij

    results = {method: [] for method in methods}
    meth_std = {method: [] for method in methods}

    for dataset in datasets:
        print('---------------------- ', dataset, ' -----------------------')
        t0 = time.time()
        results_dataset = {method: [] for method in methods}

        filename = 'data/' + dataset + '/raw.csv'
        all_alts, weights = dr.open_raw(filename)[:2]
        if weights == []:
            weights = None

        for i in range(it):
            taus = compare_rankings_once(all_alts, alt_num, weights, del_num,
                                         methods)
            # print(taus)
            for method in methods:
                results_dataset[method].append(taus[method])

        for method in methods:
            results[method].append(sum(results_dataset[method]) / it)
            meth_std[method] += results_dataset[method]
        print('time:', time.time() - t0)

    final_matrix = [header]
    for m in methods:
        results[m].append(np.mean(results[m]))
        results[m].append(np.std(meth_std[m]))
        final_matrix.append([m] + results[m])

    helpers.printmatrix(final_matrix)
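# compare_rankings_once is not shown here; the taus it returns are presumably
# rank correlations (e.g. Kendall's tau) between the ranking on the complete
# data and the ranking obtained after imputation.  Computing such a tau:
def _demo_kendall_tau():
    from scipy.stats import kendalltau
    tau, _ = kendalltau([1, 2, 3, 4], [1, 3, 2, 4])
    print(tau)  # -> 0.666...: one discordant pair out of six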
def main(n=25):
    random.seed(0)
    # test_dominates()
    # test_layers()
    dataset = "SHA"
    A = helpers.get_dataset(dataset, n)
    k = len(A[0])
    i, j = random.randint(0, n - 1), random.randint(0, k - 1)
    print(A[i][j])
    A[i][j] = NULL

    helpers.printmatrix(A)

    res = layer_rg.layer_regression(A)

    print("\n", res)
def get_quantiles(A, perc):
    """Return, per criterion, the [perc[0], perc[1]] percentiles of the
    absolute pairwise differences between evaluations."""
    helpers.printmatrix(A)
    print(perc)
    ceils = []
    n = len(A)
    k = len(A[0])
    for c in range(k):
        # All absolute pairwise differences on criterion c.
        diffs = []
        for i in range(n - 1):
            for j in range(i + 1, n):
                diffs.append(abs(A[i][c] - A[j][c]))

        ceil = [
            np.percentile(np.array(diffs), perc[0]),
            np.percentile(np.array(diffs), perc[1])
        ]
        ceils.append(ceil)
    return ceils
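# Worked example of the pairwise-difference quantiles computed above, on a
# hypothetical single criterion: for values [0.1, 0.4, 0.9] the absolute
# pairwise differences are [0.3, 0.8, 0.5], so the 0th and 100th percentiles
# are 0.3 and 0.8 (up to float rounding).
def _demo_quantiles():
    import numpy as np
    diffs = [abs(a - b) for a, b in ((0.1, 0.4), (0.1, 0.9), (0.4, 0.9))]
    print(np.percentile(np.array(diffs), 0),
          np.percentile(np.array(diffs), 100))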
def compare_evaluations(alt_num=100,
                        iterations=2,
                        outputdir='res/local_regression/'):
    """Compare strategies.

    Output in different files:
        1. All the errors for each dataset (prefix dataset):
            i, j, ev, reg, ...

        2. Statistics for each dataset (prefix dataset_statistics):
                 MEAN   STD
            reg
            ...

        3. Global statistics (prefix Global):
                 SHA ... MEAN   STD
            reg
            ...
    """
    # datasets = ('SHA', )
    datasets = ('HDI', 'SHA', 'HP', 'CPU')
    global_header = ["    ", "mean", "std"]
    methods = {
        'reg': rg.get_regression,
        # 'lrg': lrg.get_estimation_by_local_regression,
        # 'dom': de.get_estimations_by_dominance,
        'lay_all': layrg.layer_regression_all,
        'lay_guess': layrg.layer_regression_guess_layer,
        # 'diff': de.get_estimations_by_dominance_diff,
        # 'dk': de.get_estimations_by_dominance_knn,
        # 'dk2': de.get_estimations_by_dominance_knn_2,
        # 'dk3': de.get_estimations_by_dominance_knn_3,
        # 'dk4': de.get_estimations_by_dominance_knn_4,
        # 'knn': knn.get_knn,
        'mean': mv.get_mean,
        'med': mv.get_med
    }

    dataset_header = [
        'i',
        'c',
        'ev',
        'lay_all',
        "lay_guess",
        # 'lrg',
        'reg',
        # 'dom', 'diff', 'dk', 'dk2',
        # 'dk3', 'dk4', 'knn',
        'mean',
        'med'
    ]

    row_methods_order = dataset_header[3:]

    global_res = {method: [] for method in methods}
    # global_std = {method: [] for method in methods}

    for dataset in datasets:
        print('---------------------- ', dataset, ' -----------------------')
        t0 = time.time()

        # output file for dataset
        dataset_output = outputdir + dataset + '.csv'
        dataset_statistics_output = outputdir + dataset + '_statistics.csv'

        dataset_res = []
        dataset_res.append(dataset_header)
        # used for std and mean
        dataset_res_dico = {method: [] for method in methods}

        filename = 'data/' + dataset + '/raw.csv'
        all_alts, weights = dr.open_raw(filename)[:2]

        A = random.sample(all_alts, alt_num)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]

        for it in range(iterations):
            res_it = []
            i = random.randint(0, len(A) - 1)
            c = random.randint(0, len(A[0]) - 1)

            res_it.append(i)
            res_it.append(c)

            ev = A[i][c]
            A[i][c] = NULL
            errors = compare_evaluations_once(A, ev, methods)
            A[i][c] = ev

            res_it.append(ev)

            for m in row_methods_order:
                res = errors[m]
                res_it.append(res)
                dataset_res_dico[m].append(res)

            dataset_res.append(res_it)

        # print(dataset_res)
        # helpers.matrix_to_csv(dataset_res, dataset_output)

        # Make the matrix for the statistics of the given dataset
        dataset_statistics_res = []
        dataset_statistics_res.append([dataset, "MEAN", "STD"])

        for method in methods:
            # keep all the errors for the global statistics
            global_res[method] += dataset_res_dico[method]

            line = [
                method,
                np.mean(dataset_res_dico[method]),
                np.std(dataset_res_dico[method])
            ]

            dataset_statistics_res.append(line)

        helpers.printmatrix(dataset_statistics_res)
        # helpers.matrix_to_csv(dataset_statistics_res, dataset_statistics_output)

        print('time:', time.time() - t0)

    global_matrix = [global_header]
    for m in methods:
        std = np.std(global_res[m])
        mean = np.mean(global_res[m])
        global_matrix.append([m, mean, std])

    helpers.printmatrix(global_matrix)
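# compare_evaluations_once is not shown; presumably it estimates the hidden
# evaluation with every method and returns one error per method.  A minimal
# sketch under that assumption (absolute error, hypothetical signatures):
def _compare_evaluations_once_sketch(A, ev, methods):
    return {name: abs(estimate(A) - ev) for name, estimate in methods.items()}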