def process_my_own_algo_db(db_loc, epsilon, batch_id=None):
    """
    Takes a synthetic database generated by the my_own_algo algorithm
    and generates error statistics to be used in further analyses.

    Copies the synthetic database into the data/test_output folder (in
    a batch if specified) and calculates errors compared to the
    original database.

    Also generates a summary file with the same format as kendall
    analyses, however since there is no parameter `k`, leave that
    field as 0.

    db_loc: path to the synthetic database produced by my_own_algo
    epsilon: privacy parameter the database was generated with (used
        only for naming and reporting)
    batch_id: optional str identifier used to group analyses
    """
    with open(params.attribute_file, 'r') as file:
        attr_list = [attr.strip() for attr in file.readlines()]

    # Create relevant directories and copy file
    batch_dir = get_batch_directory(batch_id)
    timestamp = datetime.now().strftime(TIMESTAMP_FMT)
    epsilon_str = str(epsilon).replace('.', '-')
    data_dir = f'{batch_dir}/my_own_algo_{timestamp}_{epsilon_str}'
    makedirs(data_dir, exist_ok=True)
    db_file = f'{data_dir}/synthetic_data.csv'
    if not path.exists(db_file):
        copy(db_loc, db_file)

    # Load the synthetic data. This algorithm formats the database as a
    # list of binary attributes with values of 1 if the row has the
    # attribute and 0 otherwise.
    synthetic_data = np.genfromtxt(db_file, dtype=int, delimiter=',')

    original_db = Database()
    original_db.load_from_file(params.input_data_file, params.attribute_file)

    abs_errors = []
    rel_errors = []
    with open(f'{data_dir}/histogram_comparison.csv', 'w+') as file:
        file.write('Attribute,Original count,Synthetic count,'
                   'Absolute error,Relative Error\n')
        for index, attr in enumerate(attr_list):
            original = original_db.one_way_histogram[attr]
            synthetic = synthetic_data[:, index].sum()
            abs_error = abs(synthetic - original)
            if original:
                rel_error = round(abs_error / original * 100, 2)
            else:
                # Attribute absent from the original data: relative
                # error is undefined, so report inf for any deviation
                # (0.0 for an exact match) instead of crashing with a
                # ZeroDivisionError.
                rel_error = float('inf') if abs_error else 0.0
            file.write(f'{attr},{original},{synthetic},{abs_error},'
                       f'{rel_error}%\n')
            abs_errors.append(abs_error)
            rel_errors.append(rel_error)

    # Summary uses k = 0 since my_own_algo has no budget-split ratio.
    with open(f'{data_dir}/summary.txt', 'w+') as file:
        file.write('epsilon,k,avg absolute error,avg relative error,'
                   'median absolute error,median relative error\n')
        file.write(f'{epsilon},0,{np.mean(abs_errors)},'
                   f'{np.mean(rel_errors)},{np.median(abs_errors)},'
                   f'{np.median(rel_errors)}\n')
def test_get_marginal_histogram_shorter(self):
    """Smoke-test marginal histograms on the short auxiliary database.

    Loads the short original database, prints its raw and numerical
    tables, then prints the first exact marginal and its DP version.
    """
    db = Database()
    db.load_from_file('data/auxiliary/short_orig.csv',
                      'data/input/adult_columns.csv')
    print(db.data)
    print(db.numerical_attr_table)

    first_marginal = db.get_marginal_histogram(0)
    print(first_marginal)

    noisy_marginals = get_dp_marginals(db, 1)
    print(noisy_marginals[0])
def setup_files(self):
    """
    Copies all data files required for the experiment into the
    experiment folder. Generates new Kendall data but reuses
    my_own_algo data as it takes much longer to generate. Copies
    files for all values of epsilon.
    """
    super().setup_files()

    # Mirror both source folders (data then attributes) into the
    # experiment directory.
    for source_dir in (self.data_dir, self.attr_dir):
        for filename in os.listdir(source_dir):
            shutil.copy(f'{source_dir}/{filename}',
                        f'{self.exp_path}/{filename}')

    original_db = Database()
    original_db.load_from_file(self.original_data_src,
                               f'{self.exp_path}/adult_columns.csv')

    # Regenerate Kendall synthetic data for every epsilon, splitting
    # the privacy budget 8:1 between the two stages.
    for eps in self.epsilons:
        synthetic_rows = kendall_algorithm(original_db,
                                           8 / 9 * eps,
                                           1 / 9 * eps)
        synthetic_db = Database(synthetic_rows, original_db.attr_table)
        synthetic_db.save_to_file(f'{self.exp_path}/'
                                  f'kendall_synthetic_{eps}.csv')
def single_analysis(epsilon, k=8, batch_id=None, replace=False):
    """
    Generates a single set of synthetic data and a one-way histogram
    count of all attribute values.

    epsilon: the total privacy parameter for the algorithm
    k: the ratio of epsilon1 to epsilon2
    batch_id: a str identifier to organise different analyses
    replace: should replace analyses with same epsilon value

    Results are saved into 'DPCopula/data/test_output/' with each
    analysis with the same batch_id being saved in a batch_[batch_id]
    folder.
    """
    # Pick batch folder
    batch_dir = get_batch_directory(batch_id)
    makedirs(batch_dir, exist_ok=True)

    if replace:
        # Remove any earlier kendall run with the same epsilon.
        # NOTE(review): assumes dir names are kendall_<ts>_<eps> with
        # no extra underscores in the timestamp — verify TIMESTAMP_FMT.
        for existing_dir in listdir(batch_dir):
            if 'kendall' not in existing_dir:
                continue
            prev_eps = float(existing_dir.split('_')[2].replace('-', '.'))
            if round(prev_eps, 6) == round(epsilon, 6):
                rmtree(f'{batch_dir}/{existing_dir}')

    epsilon = round(epsilon, 6)  # Ensure no floating point errors
    epsilon1 = k / (k + 1) * epsilon
    epsilon2 = 1 / (k + 1) * epsilon

    epsilon_str = str(epsilon).replace('.', '-')
    timestamp = datetime.now().strftime(TIMESTAMP_FMT)
    output_dir = f'{batch_dir}/kendall_{timestamp}_{epsilon_str}'
    makedirs(output_dir, exist_ok=True)
    print(f'Running DPCopula-Kendall\tε = {epsilon}')

    # Load data and create DP synthetic data
    original_db = Database()
    original_db.load_from_file(params.input_data_file,
                               params.attribute_file)
    synthetic_rows = kendall_algorithm(original_db, epsilon1, epsilon2)
    synthetic_db = Database(synthetic_rows, original_db.attr_table)
    synthetic_db.save_to_file(f'{output_dir}/synthetic_data.csv')

    # Calculate error statistics
    abs_errors, rel_errors = compare_databases(original_db, synthetic_db,
                                               output_dir)

    # Calculate summary statistics
    with open(f'{output_dir}/summary.txt', 'w+') as file:
        file.write('epsilon,k,avg absolute error,avg relative error,'
                   'median absolute error,median relative error\n')
        file.write(f'{epsilon},{k},{np.mean(abs_errors)},'
                   f'{np.mean(rel_errors)},{np.median(abs_errors)},'
                   f'{np.median(rel_errors)}\n')
def generate_two_way_histogram(self, database_file):
    """
    Generates a two-way histogram for a single synthetic database
    specified by the name of the algorithm used and the value of
    epsilon.

    This operates identically to the function generating the one-way
    histogram except the output file will have the format
    '[attribute value 1],[attribute value 2],[count]'.

    database_file: exact filename of the data file without extension;
        the leading token selects the branch (original / kendall /
        my_own_algo).
    """
    col_file = f'{self.exp_path}/adult_columns.csv'

    def write_db_histogram(output_name):
        # Shared path for row-format databases (original and kendall):
        # load the database, compute its two-way histogram and write
        # one '[key1],[key2],[count]' line per attribute-value pair.
        db = Database()
        db.load_from_file(f'{self.exp_path}/{database_file}.csv',
                          col_file)
        two_way_hist = db.generate_two_way_hist()
        with open(f'{self.exp_path}/{output_name}', 'w+') as file:
            for pair, count in two_way_hist:
                key1 = Database.avp_to_key(pair[0])
                key2 = Database.avp_to_key(pair[1])
                file.write(f'{key1},{key2},{count}\n')

    database_type = database_file.split('_')[0]
    if database_type == 'original':
        write_db_histogram('original_histogram.csv')
    elif database_type == 'kendall':
        epsilon = database_file.split('_')[2]
        write_db_histogram(f'kendall_histogram_{epsilon}.csv')
    elif database_type == 'my_own_algo':
        epsilon = database_file.split('_')[2]
        # my_own_algo data is stored as 0/1 attribute columns, so the
        # pair counts come straight from the (reordered) binary matrix
        # rather than from a Database instance.
        reordered_data = self.reorder_my_own_algo_data(database_file)
        attr_table = Database.get_attrs_from_file(col_file)
        attr_val_pairs = [(attr, val) for attr in attr_table
                          for val in attr_table[attr]]
        with open(col_file, 'r') as file:
            attr_list = [attr.strip() for attr in file.readlines()]
        with open(f'{self.exp_path}/my_own_algo_histogram_{epsilon}.csv',
                  'w+') as file:
            for avp1 in attr_val_pairs:
                for avp2 in attr_val_pairs:
                    # Emit each unordered attribute pair once and skip
                    # pairs drawn from the same attribute.
                    if avp1[0] >= avp2[0]:
                        continue
                    key1 = Database.avp_to_key(avp1)
                    key2 = Database.avp_to_key(avp2)
                    col1 = attr_list.index(key1)
                    col2 = attr_list.index(key2)
                    # Rows having both binary attributes set.
                    count = sum(reordered_data[:, col1]
                                & reordered_data[:, col2])
                    file.write(f'{key1},{key2},{count}\n')
def generate_one_way_histogram(self, database_file):
    """
    Generates a one-way histogram for a specific data file.

    Parameters:
        database_file = exact filename of the data file without the
            extension

    This assumes the database file has either of the following formats:
        [algorithm]_synthetic_[epsilon].csv
        original_data.csv

    The file used will have the following format:
        data/experiments/[folder_name]/[database_file].csv

    The histogram will be stored in [algorithm]_histogram_[epsilon].csv
    if the database is synthetic or original_histogram.csv if it is the
    original database. Each line has the format
    '[attribute value],[count]'.

    If the database is my_own_algo synthetic, the columns will be
    reordered as to generate the same histogram order as the original
    and Kendall synthetic data produces.
    """
    database_type = database_file.split('_')[0]
    col_file = f'{self.exp_path}/adult_columns.csv'
    with open(col_file, 'r') as file:
        attr_list = [attr.strip() for attr in file.readlines()]

    def write_db_histogram(output_name):
        # Shared path for row-format databases (original and kendall):
        # load the database, compute its one-way histogram and write
        # one '[attribute value],[count]' line per attribute value.
        db = Database()
        db.load_from_file(f'{self.exp_path}/{database_file}.csv',
                          col_file)
        one_way_hist = db.generate_one_way_hist()
        with open(f'{self.exp_path}/{output_name}', 'w+') as file:
            for attr in attr_list:
                file.write(f'{attr},{one_way_hist[attr]}\n')

    if database_type == 'original':
        write_db_histogram('original_histogram.csv')
    elif database_type == 'kendall':
        epsilon = database_file.split('_')[2]
        write_db_histogram(f'kendall_histogram_{epsilon}.csv')
    elif database_type == 'my_own_algo':
        epsilon = database_file.split('_')[2]
        # Binary-column format: counts are column sums of the matrix,
        # reordered to match the attribute order of the other outputs.
        reordered_data = self.reorder_my_own_algo_data(database_file)
        with open(f'{self.exp_path}/my_own_algo_histogram_{epsilon}.csv',
                  'w+') as file:
            for index, attr in enumerate(attr_list):
                file.write(f'{attr},{sum(reordered_data[:, index])}\n')
def setUpClass(cls):
    """Load the original database once, shared by every test."""
    db = Database()
    db.load_from_file(params.input_data_file, params.attribute_file)
    cls.db = db