def main():
    data_path = os.getcwd() + '/../data'
    dataset_name = 'housing'
    original_dataset, attributes = read_data_path(
        data_path + '/' + dataset_name + '/' + dataset_name + '.csv')
    anonymised_dataset = pd.read_csv(
        data_path + '/result/sc_test/1_1/' + dataset_name + '/datasets/eps-1.0_1.csv')
    # dataset_postfix = '/test_dataset/test_dataset.csv'
    # anonymised_postfix = '/test_dataset/test_anonymised.csv'
    # original_dataset, attributes = read_data_path(data_path + dataset_postfix)
    # anonymised_dataset = pd.read_csv(data_path + anonymised_postfix)
    # taxonomies = [Taxonomy({'*': [TaxNode('*', None, True)]}, 1) for i in range(len(attributes))]
    cropped_dataset = original_dataset.values[:1000]
    original_dataset = pd.DataFrame(cropped_dataset, columns=attributes)
    cropped_dataset_a = anonymised_dataset.values[:1000]
    anonymised_dataset = pd.DataFrame(cropped_dataset_a, columns=attributes)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    print('Taxonomies found')
    # reverse_mapping(original_dataset, anonymised_dataset, taxonomies)
    # permutation_distance(original_dataset.values[2], anonymised_dataset, taxonomies)
    start_time = time.time()
    record_linkages = record_linkage(anonymised_dataset, original_dataset, taxonomies)
    print('record linkage', record_linkages, 'time:', time.time() - start_time)
    return
def test_3_1(dataset_name, model):
    iterations = 25
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    epsilons = ['1.0', '2.0']
    original_dataset_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'
    dataset_top_path = path + 'data/result/' + model + '_test/3_1/' + dataset_name
    datasets_path = dataset_top_path + '/datasets'
    metrics = ['sse', 'record_linkage']
    original_dataset, attributes = read_data_path(original_dataset_path)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    if model == 'safepub':
        ks = [45, 74]
        filename_combinations = generate_filename_combos(
            'granularity_eps-', '.csv', [epsilons], iterations)
    elif model == 'sc':
        ks = [32] * len(epsilons)
        filename_combinations = generate_filename_combos(
            'eps-', '.csv', [epsilons], iterations)
    elif model == 'k-anonym':
        ks = [5]
        filename_combinations = [
            generate_filename_combos('k5_suppression', '.csv', [[""]])
        ]
    else:
        raise RuntimeError('Does not recognise model', model)
    compute_score_iterations2(datasets_path, dataset_top_path,
                              filename_combinations, metrics, ks, taxonomies,
                              original_dataset)
    result_filenames = [
        filen for filen in os.listdir(dataset_top_path)
        if re.match("^result_", filen)
    ]
    normalise_scores(dataset_name, dataset_top_path, result_filenames, ks)
    norm_result_filenames = [
        filen for filen in os.listdir(dataset_top_path)
        if re.match("^norm_result_", filen)
    ]
    compute_mean_var(dataset_top_path, norm_result_filenames)
    return
def run_experiment_3(dataset_name, attributes, k, num_attributes, float_vals,
                     num_decimals):
    epsilon = 1.0
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    # Attribute-subset inputs, one sub-directory per number of attributes (see test_2_1)
    datasets_path = path + 'data/' + dataset_name + '/attribute_subsets'
    iterations = 25
    attr_to_idx = {attr: n for n, attr in enumerate(attributes)}
    all_taxonomies = [create_taxonomy(dataset_name, attribute) for attribute in attributes]
    add_semantic_distances(all_taxonomies)
    for taxonomy in all_taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    for a in range(num_attributes, num_attributes + 1):
        current_datasets_path = datasets_path + '/' + str(a)
        files = os.listdir(current_datasets_path)
        output_path = path + 'data/result/sc_test/2_1/' + dataset_name + '/datasets/' + str(a)
        cluster_root_path = path + 'anonymisation/S_C/clusters/' + dataset_name \
            + '/clusters_attribute_subsets/' + str(a)
        for f, filename in enumerate(files):
            data, current_attrs = read_data_path(current_datasets_path + '/' + filename)
            current_taxonomies = [all_taxonomies[attr_to_idx[attr]] for attr in current_attrs]
            cluster_path = cluster_root_path + '/' + '-'.join(current_attrs) + '_' + str(k) + '.csv'
            current_float_vals = [float_vals[attr_to_idx[attr]] for attr in current_attrs]
            current_num_decimals = [num_decimals[attr_to_idx[attr]] for attr in current_attrs]
            for i in range(iterations):
                output_file = output_path + '/' + '-'.join(current_attrs) + '_' + str(i + 1) + '.csv'
                try:
                    with open(output_file, 'r') as test_file:
                        print("File found")
                except FileNotFoundError:
                    X_bar = microaggregation(data, k, current_taxonomies, epsilon,
                                             add_cluster_noise=True,
                                             cluster_path=cluster_path)
                    anon_data = sanitise(X_bar.values, epsilon, k, current_taxonomies,
                                         current_float_vals, current_num_decimals)
                    anonymised = pd.DataFrame(anon_data, columns=data.columns)
                    anonymised.to_csv(output_file, index=False)
            print("---------------- file", f, "of", len(files), "done")
        print("#########################", a, 'attributes done')
    return
def run_experiment_2(dataset_name, float_vals, num_decimals):
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    dataset_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'
    data, attributes = read_data_path(dataset_path)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    if dataset_name == 'adult':
        ks = list(range(200, 6000, 300))[:16]
    elif dataset_name == 'housing':
        ks = list(range(200, 6000, 300))[:16]
    else:
        raise RuntimeError('Does not recognise dataset', dataset_name)
    iterations = 50
    epsilon = 1.0
    output_path = path + 'data/result/sc_test/1_2/' + dataset_name + '/datasets'
    for k in ks:
        cluster_root_path = path + 'anonymisation/S_C/clusters/' + dataset_name \
            + '/k_' + str(k) + '.csv'
        print('####################### k ' + str(k))
        for i in range(iterations):
            output_file = output_path + '/k_' + str(k) + '_' + str(i + 1) + '.csv'
            try:
                with open(output_file, 'r') as test_file:
                    print("File found")
            except FileNotFoundError:
                X_bar = microaggregation(data, k, taxonomies, epsilon,
                                         add_cluster_noise=True,
                                         cluster_path=cluster_root_path)
                anon_data = sanitise(X_bar.values, epsilon, k, taxonomies,
                                     float_vals, num_decimals)
                anonymised = pd.DataFrame(anon_data, columns=data.columns)
                anonymised.to_csv(output_file, index=False)
def run_experiment4(dataset_name, k, float_vals, num_decimals):
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    dataset_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'
    data, attributes = read_data_path(dataset_path)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    iterations = 25
    epsilons = [1.0, 2.0]
    output_path = path + 'data/result/sc_test/3_1/' + dataset_name + '/datasets'
    cluster_root_path = path + 'anonymisation/S_C/clusters/' + dataset_name \
        + '/k_' + str(k) + '.csv'
    for epsilon in epsilons:
        print('####################### epsilon ' + str(epsilon))
        for i in range(iterations):
            output_file = output_path + '/eps-' + str(epsilon) + '_' + str(i + 1) + '.csv'
            X_bar = microaggregation(data, k, taxonomies, epsilon,
                                     add_cluster_noise=True,
                                     cluster_path=cluster_root_path)
            anon_data = sanitise(X_bar.values, epsilon, k, taxonomies,
                                 float_vals, num_decimals)
            anonymised = pd.DataFrame(anon_data, columns=data.columns)
            anonymised.to_csv(output_file, index=False)
def test_2_1(dataset_name, model, attributes):
    # Compute the scores and the normalised scores for each attribute combination
    iterations = 25
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    original_dataset_path = path + 'data/' + dataset_name + '/attribute_subsets'
    metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    attr_to_idx = {attr: n for n, attr in enumerate(attributes)}
    if dataset_name == 'adult':
        attribute_range = list(range(2, 9))
    elif dataset_name == 'housing':
        attribute_range = list(range(2, 10))
    elif dataset_name == 'musk':
        attribute_range = list(range(2, 21))
    else:
        raise RuntimeError("Does not recognise dataset", dataset_name)
    if model == 'safepub':
        # SafePub
        top_path = path + 'data/result/safepub_test/2_1/' + dataset_name
        dataset_paths = top_path + '/datasets'
        if dataset_name == 'musk':
            ks = [45]
        else:
            ks = [59]
        for a in attribute_range:
            output_path = top_path + '/' + str(a)
            original_filenames = os.listdir(original_dataset_path + '/' + str(a))
            for n, filename in enumerate(original_filenames):
                original_dataset, current_attrs = read_data_path(
                    original_dataset_path + '/' + str(a) + '/' + filename)
                filenames = [
                    '-'.join(current_attrs) + '_' + str(i + 1) + '.csv'
                    for i in range(iterations)
                ]
                # current_taxonomies = [taxonomies[attr_to_idx[attr]] for attr in current_attrs]
                # compute_score_iterations2(dataset_paths + '/' + str(a), output_path,
                #                           [filenames], metrics, ks, current_taxonomies,
                #                           original_dataset)
                print("File", n, "of", len(original_filenames))
            result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^result_", filen)
            ]
            normalise_scores_subattributes(dataset_name, output_path,
                                           result_filenames,
                                           ks * len(result_filenames), a)
            norm_result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^norm_result_", filen)
            ]
            compute_mean_var(output_path, norm_result_filenames)
            print("Attribute", a, "done")
    elif model == 'sc':
        # S-C
        top_path = path + 'data/result/sc_test/2_1/' + dataset_name
        dataset_paths = top_path + '/datasets'
        if dataset_name == 'adult':
            ks = [174]
        elif dataset_name == 'housing':
            ks = [143]
        elif dataset_name == 'musk':
            ks = [82]
        else:
            raise RuntimeError('Does not recognise dataset', dataset_name)
        for a in attribute_range:
            output_path = top_path + '/' + str(a)
            original_filenames = os.listdir(original_dataset_path + '/' + str(a))
            for n, filename in enumerate(original_filenames):
                original_dataset, current_attrs = read_data_path(
                    original_dataset_path + '/' + str(a) + '/' + filename)
                filenames = [
                    '-'.join(current_attrs) + '_' + str(i + 1) + '.csv'
                    for i in range(iterations)
                ]
                current_taxonomies = [
                    taxonomies[attr_to_idx[attr]] for attr in current_attrs
                ]
                compute_score_iterations2(dataset_paths + '/' + str(a),
                                          output_path, [filenames], metrics, ks,
                                          current_taxonomies, original_dataset)
                print("File", n, "of", len(original_filenames))
            result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^result_", filen)
            ]
            normalise_scores_subattributes(dataset_name, output_path,
                                           result_filenames,
                                           ks * len(result_filenames), a)
            norm_result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^norm_result_", filen)
            ]
            compute_mean_var(output_path, norm_result_filenames)
            print("Attribute", a, "done")
    return
def test_1_2(dataset_name, model):
    # Read the datasets, compute the score for each dataset
    iterations = 50
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    original_dataset, attributes = read_data_path(
        path + 'data/' + dataset_name + '/' + dataset_name + '.csv')
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    if model == 'safepub':
        # SafePub
        top_path = path + "data/result/safepub_test/1_2/" + dataset_name
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        ks = [
            59, 74, 88, 100, 114, 129, 141, 155, 170, 184, 199, 211, 225, 240,
            252, 266
        ]
        deltas = [
            1E-5, 1E-6, 1E-7, 1E-8, 1E-9, 1E-10, 1E-11, 1E-12, 1E-13, 1E-14,
            1E-15, 1E-16, 1E-17, 1E-18, 1E-19, 1E-20
        ]
        deltas_string = [
            '1.0E-5', '1.0E-6', '1.0E-7', '1.0E-8', '1.0E-9', '1.0E-10',
            '1.0E-11', '1.0E-12', '1.0E-13', '1.0E-14', '1.0E-15', '1.0E-16',
            '1.0E-17', '1.0E-18', '1.0E-19', '1.0E-20'
        ]
        filename_combinations = generate_filename_combos(
            'delta_', '.csv', [deltas_string], iterations)
        compute_score_iterations2(dataset_path, top_path, filename_combinations,
                                  metrics, ks, taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_delta_', '.csv', [deltas_string], 1)
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_combinations = generate_filename_combos('norm_result_delta_',
                                                     '.csv', [deltas_string])
        compute_mean_var(top_path, norm_combinations)
    elif model == 'sc':
        # Soria-Comas
        top_path = path + "data/result/sc_test/1_2/" + dataset_name
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        if dataset_name == 'adult':
            ks = list(range(200, 4701, 300))
        elif dataset_name == 'housing':
            ks = list(range(200, 4701, 300))
        else:
            raise RuntimeError('Does not recognise dataset name', dataset_name)
        ks_string = [str(k) for k in ks]
        filename_combinations = generate_filename_combos(
            'k_', '.csv', [ks_string], iterations)
        compute_score_iterations2(dataset_path, top_path, filename_combinations,
                                  metrics, ks, taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_k_', '.csv', [ks_string], 1)
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_combinations = generate_filename_combos('norm_result_k_', '.csv',
                                                     [ks_string])
        compute_mean_var(top_path, norm_combinations)
def test_1_1(dataset_name, model):
    iterations = 50
    epsilons = [
        2.0, 1.5, 1.25, 1.0986122886681098, 1.0, 0.75, 0.6931471805599453,
        0.5, 0.1, 0.01
    ]
    eps_str = [str(eps) for eps in epsilons]
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)
    original_dataset, attributes = read_data(dataset_name)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    # SafePub
    if model == 'safepub':
        top_path = path + "data/result/safepub_test/1_1/" + dataset_name
        data_metrics = [
            'granularity', 'intensity', 'discernibility', 'entropy', 'groupsize'
        ]
        ks = [103, 72, 67, 61, 59, 54, 53, 50, 50, 50] * len(data_metrics)
        dataset_path = top_path + '/datasets'
        eval_metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        filename_combinations = generate_filename_combos(
            '', '.csv', [data_metrics, ['eps'], eps_str], iterations)
        filename_combinations = [[
            filename.replace('eps_', 'eps-') for filename in file_list
        ] for file_list in filename_combinations]
        compute_score_iterations2(dataset_path, top_path, filename_combinations,
                                  eval_metrics, ks, taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_', '.csv', [data_metrics, ['eps'], eps_str])
        result_file_combinations = [
            filename.replace('eps_', 'eps-')
            for filename in result_file_combinations
        ]
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_result_combinations = generate_filename_combos(
            'norm_result_', '.csv', [data_metrics, ['eps'], eps_str])
        norm_result_combinations = [
            filename.replace('eps_', 'eps-')
            for filename in norm_result_combinations
        ]
        compute_mean_var(top_path, norm_result_combinations)
    elif model == 'sc':
        # Soria-Comas
        top_path = path + "data/result/sc_test/1_1/" + dataset_name
        if dataset_name == 'adult':
            ks = [174] * len(epsilons)
        elif dataset_name == 'housing':
            ks = [143] * len(epsilons)
        else:
            raise RuntimeError('Does not recognise the dataset')
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        filename_combinations = generate_filename_combos(
            'eps-', '.csv', [eps_str], iterations)
        compute_score_iterations2(dataset_path, top_path, filename_combinations,
                                  metrics, ks, taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_eps-', '.csv', [eps_str])
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_result_combinations = generate_filename_combos(
            'norm_result_eps-', '.csv', [eps_str])
        compute_mean_var(top_path, norm_result_combinations)
    elif model == 'sc_spec':
        top_path = path + "data/result/sc_spec_test/1_1/" + dataset_name
        ks = [800] * len(epsilons)
        result_file_combinations = generate_filename_combos(
            'result_eps-', '.csv', [eps_str])
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_result_combinations = generate_filename_combos(
            'norm_result_eps-', '.csv', [eps_str])
        compute_mean_var(top_path, norm_result_combinations)
    elif model == 'k-anonym':
        # k5-anon
        k_name = 'k5_suppression.csv'
        top_path = path + "data/result/k-anonym_test/1_1/" + dataset_name
        dataset_path = top_path + "/datasets"
        data = read_data_path(dataset_path + '/' + k_name)[0]
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        k = 5
        scores = np.array(
            compute_metric_scores(data, metrics, k, taxonomies,
                                  original_dataset)).reshape(1, -1)
        df = pd.DataFrame(scores, columns=metrics)
        df.to_csv(top_path + '/result_' + k_name, index=False)
        # normalise_scores(dataset_name, top_path, ['result_' + k_name], [k])
    else:
        raise RuntimeError('Does not recognise model', model)
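# Example entry point (a minimal sketch, not part of the original pipeline): it
# shows one way the generation and evaluation functions above might be chained
# for a single dataset/model pair. The choice of 'housing' and 'sc', and the
# commented placeholder arguments, are illustrative assumptions; real runs
# supply dataset-specific k, float_vals and num_decimals values.
if __name__ == '__main__':
    example_dataset = 'housing'  # assumed example; 'adult' and 'musk' also appear above
    example_model = 'sc'         # assumed example; 'safepub' and 'k-anonym' are also handled

    # Step 1 (optional): generate the anonymised datasets for experiment 3.1.
    # float_vals / num_decimals are per-attribute placeholders for the chosen dataset.
    # run_experiment4(example_dataset, k=32, float_vals=[...], num_decimals=[...])

    # Step 2: score and normalise the generated datasets.
    test_3_1(example_dataset, example_model)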