# Imports reconstructed for this module; the project-local modules (dr,
# helpers, de, mv, rg, layrg, layer_rg, knn) and the NULL sentinel are
# assumptions about the surrounding code base.
import random
import time

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

import dr        # assumed: open_raw
import helpers   # assumed: printmatrix, get_dataset, matrix_to_csv
import de        # assumed: train_dom, count_dominant_alts, ...
import mv        # assumed: replace_by_*, get_mean, get_med
import rg        # assumed: get_regression
import layrg     # assumed: layer_regression_all, layer_regression_guess_layer
import layer_rg  # assumed: compute_layers, layer_regression
import knn       # assumed: get_knn

NULL = None  # missing-value sentinel (assumption; the original may differ)


def check_if_dominance_interval(iterations=100):
    """Check whether the evaluation falls inside the dominance interval found."""
    random.seed(0)
    datasets = ("SHA", "EPI", "HR")
    header = ["", "Neither", "OR", "AND"]
    n = 100
    res = []
    for dataset in datasets:
        print("\n" * 2, "-" * 35, dataset, "-" * 35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[0], dr.open_raw(filename)[1]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])
        res_dataset = [0 for o in range(3)]
        for it in range(iterations):
            iteration_res = check_if_interval_iteration(A, n, k)
            for col in iteration_res:
                res_dataset[col] += 1
        res.append([dataset] + [o / iterations for o in res_dataset])
    helpers.printmatrix([header] + res)

def check_good_dominance_interval(iterations):
    """Check whether the evaluation falls inside the percentile intervals found."""
    random.seed(0)
    datasets = ("SHA", "EPI", "HR")
    header = [""] + list(datasets)
    n = 100
    percentiles = (0, 12.5, 25, 37.5, 50, 62.5, 75, 87.5, 100)
    res_tot = [[p] for p in percentiles]
    res_tot.append(['av'])
    for dataset in datasets:
        print("\n" * 2, "-" * 35, dataset, "-" * 35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[0], dr.open_raw(filename)[1]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])
        res = [[p, 0] for p in percentiles]
        res.append(['av', 0])
        for it in range(iterations):
            iteration_res = check_good_interval_iteration(A, n, k, percentiles)
            for col in iteration_res:
                res[col][1] += 1
        for i in range(len(res)):
            res[i][1] /= iterations
            res_tot[i].append(res[i][1])
        helpers.printmatrix(res)
    helpers.printmatrix([header] + res_tot)

def estimate_by_dom_with_criteria(A_plus, c, a_miss, indices):
    """Estimate the evaluation of a_miss on c by dominance extrapolation."""
    better = A_plus
    worse = A_plus
    # Debug guard: a string evaluation means the data was not cleaned.
    for i in indices:
        if isinstance(a_miss[i], str):
            helpers.printmatrix(A_plus)
            print()
            print(a_miss)
    # Keep the alternatives at least as good (resp. at least as bad) as
    # a_miss on every criterion in indices.
    for i in indices:
        better = [a for a in better if a[i] >= a_miss[i]]
        worse = [a for a in worse if a[i] <= a_miss[i]]
    better_c = [a[c] for a in better]
    worse_c = [a[c] for a in worse]
    if better_c == [] and worse_c == []:
        # Alternative fallback: return (np.mean([a[c] for a in A_plus]), -11)
        return ('no dominated nor dominant alternative', -11)
    elif worse_c == []:
        # Only dominant alternatives: the estimation is bounded from above.
        return min(better_c), -10
    elif better_c == []:
        # Only dominated alternatives: the estimation is bounded from below.
        return max(worse_c), -1
    else:
        # a_miss lies between its best dominated and worst dominant neighbours.
        estimation = (max(worse_c) + min(better_c)) / 2
        return (estimation, abs(max(worse_c) - min(better_c)))

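# Usage sketch added for illustration (not from the original source): on a
# tiny hand-made matrix, [0.5, NULL] is dominated by [0.6, 0.7] and dominates
# [0.4, 0.3] on criterion 0, so the estimation on criterion 1 is the midpoint
# (0.3 + 0.7) / 2 = 0.5 with an interval width of 0.4.
def _demo_estimate_by_dom():
    A_plus = [[0.6, 0.7], [0.4, 0.3], [0.9, 0.8]]
    a_miss = [0.5, NULL]
    estimation, width = estimate_by_dom_with_criteria(A_plus, c=1,
                                                      a_miss=a_miss,
                                                      indices=[0])
    print(estimation, width)  # expected: 0.5 0.4
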
def test_layers():
    """Print the dominance layers of a small random alternative matrix."""
    random.seed(0)
    A = [[random.random() for i in range(3)] for j in range(15)]
    layers = layer_rg.compute_layers(A)
    for i, layer in enumerate(layers):
        print("layer:", i)
        helpers.printmatrix(layer)
        print()

def check_dominance_assumption(iterations=10):
    """Test whether dominance is still respected once a criterion is removed."""
    datasets = ("SHA", "EPI", "HR")
    n = 100
    for dataset in datasets:
        print("\n" * 2, "-" * 35, dataset, "-" * 35, "\n")
        filename = 'data/' + dataset + '/raw.csv'
        A, weights = dr.open_raw(filename)[0], dr.open_raw(filename)[1]
        A = random.sample(A, n)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        k = len(A[0])
        res = [[] for i in range(9)]
        for it in range(iterations):
            i = random.randint(0, n - 1)
            c = random.randint(0, k - 1)
            a = A[i]
            del A[i]
            a_miss = a[:]
            a_miss[c] = NULL
            indices = de.train_dom(A, c, a_miss)
            dominant, dominated = de.count_dominant_alts(A, indices, a_miss)
            indices.append(c)
            dominant_c, dominated_c = de.count_dominant_alts(A, indices, a)
            res[0].append(dominant)
            res[1].append(dominant_c)
            res[2].append(dominant_c / dominant if dominant else 0)
            res[3].append(dominated)
            res[4].append(dominated_c)
            res[5].append(dominated_c / dominated if dominated else 0)
            res[6].append(dominated + dominant)
            res[7].append(dominated_c + dominant_c)
            res[8].append((dominated_c + dominant_c) / (dominated + dominant)
                          if (dominated + dominant) else 0)
            A.insert(i, a)
        final_res = [[" ", " ", "MEAN", "STD"]]
        lines = ["Dom+", "Dc+", "ratio", "dom-", "dc-", "ratio",
                 "Tot", "tot_c", "ratio"]
        for i in range(9):
            final_res.append([lines[i], " ", np.mean(res[i]), np.std(res[i])])
        helpers.printmatrix(final_res, width=5)

def estimate_by_dom_with_criteria_knn(A_plus, c, a_miss, indices, k):
    """Estimate the evaluation of a_miss on c by dominance extrapolation (k-NN variant)."""
    better = A_plus
    worse = A_plus
    # Debug guard: every criterion in indices should be known in a_miss.
    for i in indices:
        if a_miss[i] == NULL:
            helpers.printmatrix(A_plus)
            print(a_miss)
    for i in indices:
        better = [a for a in better if a[i] >= a_miss[i]]
        worse = [a for a in worse if a[i] <= a_miss[i]]
    better_ind = [[a[i] for i in indices] for a in better]
    worse_ind = [[a[i] for i in indices] for a in worse]
    # kneighbors expects a 2D query array.
    target = np.array([a_miss[i] for i in indices]).reshape(1, -1)
    better_estimation = None
    worse_estimation = None
    if len(better_ind) > 0:
        nnbrg = NearestNeighbors(n_neighbors=min(k, len(better_ind)))
        nnbrg.fit(np.array(better_ind))
        distances, best_k_indices = nnbrg.kneighbors(target)
        best_k_indices = best_k_indices[0]
        # Mean over the neighbours actually returned (there may be fewer
        # than k dominant alternatives).
        better_estimation = np.mean([better[j][c] for j in best_k_indices])
    if len(worse_ind) > 0:
        nnbrg = NearestNeighbors(n_neighbors=min(k, len(worse_ind)))
        nnbrg.fit(np.array(worse_ind))
        distances, worse_k_indices = nnbrg.kneighbors(target)
        worse_k_indices = worse_k_indices[0]
        worse_estimation = np.mean([worse[j][c] for j in worse_k_indices])
    if better_estimation is None and worse_estimation is None:
        return 'no dominated nor dominant alternative'
    elif better_estimation is None:
        return worse_estimation
    elif worse_estimation is None:
        return better_estimation
    else:
        return (worse_estimation + better_estimation) / 2

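# Usage sketch added for illustration (not from the original source): with
# k=2, the estimation on criterion 2 averages the evaluations of the two
# nearest dominant and the two nearest dominated alternatives on criteria
# 0 and 1. With the reconstruction above this prints
# (mean(0.8, 0.9) + mean(0.2, 0.4)) / 2 = 0.575.
def _demo_estimate_by_dom_knn():
    A_plus = [
        [0.6, 0.7, 0.8],
        [0.7, 0.9, 0.9],
        [0.9, 0.8, 0.7],
        [0.4, 0.3, 0.2],
        [0.3, 0.2, 0.4],
    ]
    a_miss = [0.5, 0.5, NULL]
    print(estimate_by_dom_with_criteria_knn(A_plus, c=2, a_miss=a_miss,
                                            indices=[0, 1], k=2))
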
def compare_rankings(alt_num=20, it=500, del_num=1):
    """Compare the replacement strategies on the resulting rankings."""
    random.seed(1)
    datasets = ('HR', 'SHA', 'EPI', 'HP')
    # datasets = ('SHA',)
    header = [" "] + list(datasets) + ["mean", "std"]
    methods = {
        'sreg': mv.replace_by_sreg,
        'dom': mv.replace_by_dominance,
        'd_diff': mv.replace_by_dominance_smallest_diff,
        'knn': mv.replace_by_knn,
        'mean': mv.replace_by_mean,
        'med': mv.replace_by_med,
        # 'creg': mv.replace_by_creg,
        # 'ereg': mv.replace_by_ereg,
        # 'pij': mv.replace_by_pij,
    }
    results = {method: [] for method in methods}
    meth_std = {method: [] for method in methods}
    for dataset in datasets:
        print('---------------------- ', dataset, ' -----------------------')
        t0 = time.time()
        results_dataset = {method: [] for method in methods}
        filename = 'data/' + dataset + '/raw.csv'
        all_alts, weights = dr.open_raw(filename)[0], dr.open_raw(filename)[1]
        if weights == []:
            weights = None
        for i in range(it):
            taus = compare_rankings_once(all_alts, alt_num, weights, del_num,
                                         methods)
            for method in methods:
                results_dataset[method].append(taus[method])
        for method in methods:
            results[method].append(sum(results_dataset[method]) / it)
            meth_std[method] += results_dataset[method]
        print('time:', time.time() - t0)
    final_matrix = [header]
    for m in methods:
        results[m].append(np.mean(results[m]))
        results[m].append(np.std(meth_std[m]))
        final_matrix.append([m] + results[m])
    helpers.printmatrix(final_matrix)

def main(n=25):
    """Delete one random evaluation from SHA and estimate it by layer regression."""
    random.seed(0)
    # test_dominates()
    # test_layers()
    dataset = "SHA"
    A = helpers.get_dataset(dataset, n)
    k = len(A[0])
    i, j = random.randint(0, n - 1), random.randint(0, k - 1)
    print(A[i][j])
    A[i][j] = NULL
    helpers.printmatrix(A)
    res = layer_rg.layer_regression(A)
    print("\n", res)

def get_quantiles(self, A, perc):
    """Return, for each criterion, the perc[0]-th and perc[1]-th percentiles
    of the pairwise absolute differences of the evaluations."""
    helpers.printmatrix(A)  # debug
    print(perc)             # debug
    ceils = []
    n = len(A)
    k = len(A[0])
    for c in range(k):
        diffs = []
        for i in range(n - 1):
            for j in range(i + 1, n):
                diffs.append(abs(A[i][c] - A[j][c]))
        ceil = [
            np.percentile(np.array(diffs), perc[0]),
            np.percentile(np.array(diffs), perc[1]),
        ]
        ceils.append(ceil)
    return ceils

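# Usage sketch added for illustration (not from the original source; the
# enclosing class is not shown here and self is unused, so the function is
# called directly with None). For a single criterion with evaluations
# 0, 1, 2, the pairwise absolute differences are 1, 2, 1, so perc = (0, 100)
# yields the interval [min, max] = [1.0, 2.0].
def _demo_get_quantiles():
    A = [[0.0], [1.0], [2.0]]
    ceils = get_quantiles(None, A, perc=(0, 100))
    print(ceils)  # expected: [[1.0, 2.0]]
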
def compare_evaluations(alt_num=100, iterations=2,
                        outputdir='res/local_regression/'):
    """Compare the estimation strategies.

    Output in different files:

    1. All the errors for each dataset (prefix: dataset):
           i, j, ev, reg, ...
    2. Statistics for each dataset (prefix: dataset_statistics):
                  MEAN  STD
           reg    ...
    3. Global statistics over all datasets (prefix: global):
                  MEAN  STD
           reg    ...
    """
    datasets = ('HDI', 'SHA', 'HP', 'CPU')
    # datasets = ('SHA',)
    global_header = [" ", "mean", "std"]
    methods = {
        'reg': rg.get_regression,
        'lay_all': layrg.layer_regression_all,
        'lay_guess': layrg.layer_regression_guess_layer,
        'mean': mv.get_mean,
        'med': mv.get_med,
        # 'lrg': lrg.get_estimation_by_local_regression,
        # 'dom': de.get_estimations_by_dominance,
        # 'diff': de.get_estimations_by_dominance_diff,
        # 'dk': de.get_estimations_by_dominance_knn,
        # 'dk2': de.get_estimations_by_dominance_knn_2,
        # 'dk3': de.get_estimations_by_dominance_knn_3,
        # 'dk4': de.get_estimations_by_dominance_knn_4,
        # 'knn': knn.get_knn,
    }
    dataset_header = [
        'i', 'c', 'ev',
        'lay_all', 'lay_guess', 'reg', 'mean', 'med',
        # 'lrg', 'dom', 'diff', 'dk', 'dk2', 'dk3', 'dk4', 'knn',
    ]
    row_methods_order = dataset_header[3:]
    global_res = {method: [] for method in methods}
    for dataset in datasets:
        print('---------------------- ', dataset, ' -----------------------')
        t0 = time.time()
        # Output files for the dataset.
        dataset_output = outputdir + dataset + '.csv'
        dataset_statistics_output = outputdir + dataset + '_statistics.csv'
        dataset_res = [dataset_header]
        # Used for the std and the mean.
        dataset_res_dico = {method: [] for method in methods}
        filename = 'data/' + dataset + '/raw.csv'
        all_alts, weights = dr.open_raw(filename)[0], dr.open_raw(filename)[1]
        A = random.sample(all_alts, alt_num)
        A = normalize(A, axis=0, copy=True, norm='max')
        A = [list(alt) for alt in A]
        for it in range(iterations):
            i = random.randint(0, len(A) - 1)
            c = random.randint(0, len(A[0]) - 1)
            res_it = [i, c]
            ev = A[i][c]
            A[i][c] = NULL
            errors = compare_evaluations_once(A, ev, methods)
            A[i][c] = ev
            res_it.append(ev)
            for m in row_methods_order:
                res = errors[m]
                res_it.append(res)
                dataset_res_dico[m].append(res)
            dataset_res.append(res_it)
        # helpers.matrix_to_csv(dataset_res, dataset_output)
        # Matrix with the statistics of the given dataset.
        dataset_statistics_res = [[dataset, "MEAN", "STD"]]
        for method in methods:
            # Keep all the errors for the global statistics.
            global_res[method] += dataset_res_dico[method]
            line = [method,
                    np.mean(dataset_res_dico[method]),
                    np.std(dataset_res_dico[method])]
            dataset_statistics_res.append(line)
        helpers.printmatrix(dataset_statistics_res)
        # helpers.matrix_to_csv(dataset_statistics_res,
        #                       dataset_statistics_output)
        print('time:', time.time() - t0)
    global_matrix = [global_header]
    for m in methods:
        global_matrix.append([m, np.mean(global_res[m]), np.std(global_res[m])])
    helpers.printmatrix(global_matrix)