import math

import numpy as np

# Project-local modules (assumed importable from this repository).
import algs
import workload


def run_experiment(datasets, alg_engine, epsilons, seed, num_bins):
    total_runs = len(epsilons) * len(datasets)
    print("total runs:", total_runs)
    num_done = 0
    experiment_results = []
    for i in range(len(datasets)):
        dataset = datasets[i]
        # Skip the '0' data vectors that appear for some reason; branching
        # also requires the length to be at least 3.
        if sum(dataset) == 0 or len(dataset) <= 2:
            print("bad dataset")
            continue
        dataset = np.array(dataset)
        # Per-dataset statistics recorded alongside each result.
        scale = sum(dataset)
        domain_size = len(dataset)
        data_range = max(dataset) - min(dataset)
        std_dev = math.sqrt(np.var(dataset))
        uniform_distance = algs.uniform_distance(dataset)
        for epsilon in epsilons:
            w = workload.Prefix1D(domain_shape_int=len(dataset))
            dataset_hat = alg_engine.Run(w, dataset, epsilon, seed)
            histogram, bin_size = algs.get_histogram(dataset, num_bins)
            private_hist, bin_size = algs.get_histogram(dataset_hat, num_bins)
            error = algs.get_scaled_error(histogram, private_hist)
            # i is the index of the dataset this result came from.
            experiment_results.append((scale, domain_size, error, data_range,
                                       std_dev, uniform_distance, epsilon, i))
            num_done += 1
            if num_done % 50 == 0:
                print("num done:", num_done)
    return experiment_results
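# The helpers below are NOT the repository's implementations; they are a
# minimal sketch of what run_experiment assumes about algs.get_histogram and
# algs.get_scaled_error: an equal-width histogram over the data vector, and an
# error metric for the private histogram normalized by the dataset's scale.
def get_histogram_sketch(data_vector, num_bins):
    # Equal-width bins over the index domain; returns (bin counts, bin width).
    data_vector = np.asarray(data_vector, dtype=float)
    bin_size = int(math.ceil(len(data_vector) / float(num_bins)))
    counts = [data_vector[j:j + bin_size].sum()
              for j in range(0, len(data_vector), bin_size)]
    return np.array(counts), bin_size


def get_scaled_error_sketch(histogram, private_hist):
    # L1 distance between true and private histograms, scaled by total count.
    histogram = np.asarray(histogram, dtype=float)
    private_hist = np.asarray(private_hist, dtype=float)
    return np.abs(histogram - private_hist).sum() / max(histogram.sum(), 1.0)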
def get_error_data():
    # Relies on module-level reps, dataset_vectors_ext, epsilons, num_bins,
    # private_histograms_data, and the progress counter done.
    global done
    for i in range(reps):
        for data_vector in dataset_vectors_ext:
            for epsilon in epsilons:
                # num_iterations = 5 * math.log(sum(data_vector))
                num_iterations = 2
                queries = algs.get_queries(num_bins, len(data_vector))
                private_dataset = algs.mwem(data_vector, queries,
                                            num_iterations, epsilon)
                histogram, bin_size = algs.get_histogram(data_vector, num_bins)
                private_hist, bin_size = algs.get_histogram(private_dataset,
                                                            num_bins)
                # collect statistics
                scale = sum(data_vector)
                domain_size = len(data_vector)
                data_range = max(data_vector) - min(data_vector)
                std_dev = math.sqrt(np.var(data_vector))
                uniform_distance = algs.uniform_distance(data_vector)
                private_histograms_data.append(
                    (private_hist, histogram, sum(histogram), domain_size,
                     data_range, std_dev, uniform_distance, epsilon))
                done += 1
                if done % 500 == 0:
                    print("num done =", done)
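# Sketch of the MWEM loop that algs.mwem is assumed to implement (Hardt,
# Ligett & McSherry); this is NOT the repository's code. queries is taken to
# be a list of 0/1 indicator vectors over the domain, and the per-iteration
# budget is split evenly between query selection and measurement.
def mwem_sketch(data_vector, queries, num_iterations, epsilon, rng=np.random):
    x = np.asarray(data_vector, dtype=float)
    queries = [np.asarray(q, dtype=float) for q in queries]
    scale = x.sum()
    # Start from the uniform distribution with the same total count.
    a = np.full(len(x), scale / len(x))
    eps_round = epsilon / num_iterations
    for _ in range(num_iterations):
        # Exponential mechanism: prefer queries where the estimate is worst.
        errors = np.array([abs(q @ x - q @ a) for q in queries])
        probs = np.exp((eps_round / 2) * (errors - errors.max()) / 2)
        q = queries[rng.choice(len(queries), p=probs / probs.sum())]
        # Laplace measurement of the chosen query.
        measurement = q @ x + rng.laplace(scale=2.0 / eps_round)
        # Multiplicative-weights update toward the noisy measurement.
        a *= np.exp(q * (measurement - q @ a) / (2.0 * scale))
        a *= scale / a.sum()
    return a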
error_errors = []
all_results = []
num_correct = 0
for i in range(len(data_files)):
    data_file = data_files[i]
    dataset = np.load(data_file)
    epsilon = .01
    w = workload.Prefix1D(domain_shape_int=len(dataset))
    results = {}
    predicted_error = predictions[i]['dataset_stat'][2]
    for alg_engine in alg_engines:
        # Run each engine at the epsilon predicted for it and record the
        # scaled error it actually achieves on this dataset.
        predicted_epsilon = predictions[i][alg_engine.short_name][0]
        dataset_hat = alg_engine.Run(w, dataset, predicted_epsilon, seed)
        histogram, bin_size = algs.get_histogram(dataset, num_bins)
        private_hist, bin_size = algs.get_histogram(dataset_hat, num_bins)
        error = algs.get_scaled_error(histogram, private_hist)
        error_errors.append(abs(predicted_error - error))
        results[alg_engine.short_name] = error
    actual_best = min(results, key=results.get)
    # Isolate the predicted epsilons (every key except 'dataset_stat').
    predictions_algs = {}
    for key in predictions[i].keys():
        if key == 'dataset_stat':
            continue
        predictions_algs[key] = predictions[i][key][0]
    predicted_best = min(predictions_algs, key=predictions_algs.get)
    if actual_best == predicted_best:
        num_correct += 1
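# Illustrative (hypothetical) shape of one predictions[i] entry, as assumed by
# the accuracy check above: 'dataset_stat' is a tuple whose index 2 is the
# predicted error, and every other key maps an engine's short_name to a tuple
# whose first element is that engine's predicted epsilon. The engine names,
# values, and the meaning of the first two 'dataset_stat' fields are made up
# here for illustration only.
example_prediction = {
    'dataset_stat': (10000, 256, 0.042),   # (scale?, domain size?, predicted error)
    'identity':     (0.50,),               # predicted epsilon per engine
    'mwem':         (0.35,),
    'dawa':         (0.20,),
}
# The "best" engine is the one predicted to need the smallest epsilon.
pred_eps = {k: v[0] for k, v in example_prediction.items() if k != 'dataset_stat'}
predicted_best_example = min(pred_eps, key=pred_eps.get)  # -> 'dawa' here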
num_total = reps * len(dataset_vectors_ext) * len(epsilons)
print("num total:", num_total)
# for data_vector in dataset_vectors_ext:
#     print("my scale:", sum(data_vector))
#     print("my domain size:", len(data_vector))
for i in range(reps):
    for data_vector in dataset_vectors_ext:
        for epsilon in epsilons:
            # num_iterations = 5 * math.log(sum(data_vector))
            num_iterations = 2
            queries = algs.get_queries(num_bins, len(data_vector))
            private_dataset = algs.mwem(data_vector, queries,
                                        num_iterations, epsilon)
            histogram, bin_size = algs.get_histogram(data_vector, num_bins)
            private_hist, bin_size = algs.get_histogram(private_dataset,
                                                        num_bins)
            # collect statistics
            scale = sum(data_vector)
            domain_size = len(data_vector)
            data_range = max(data_vector) - min(data_vector)
            std_dev = math.sqrt(np.var(data_vector))
            uniform_distance = algs.uniform_distance(data_vector)
            private_histograms_data.append(
                (private_hist, histogram, sum(histogram), domain_size,
                 data_range, std_dev, uniform_distance, epsilon))
            done += 1
            if done % 500 == 0:
                print("num done =", done)
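# Sketch of the "uniform distance" statistic collected above; this is an
# assumption about what algs.uniform_distance computes, not the repository's
# code: total variation distance between the normalized data vector and the
# uniform distribution over the same domain.
def uniform_distance_sketch(data_vector):
    p = np.asarray(data_vector, dtype=float)
    p = p / p.sum()
    uniform = np.full(len(p), 1.0 / len(p))
    return 0.5 * np.abs(p - uniform).sum()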