def mutual_information_heatmap(self):
    """Plot pairwise mutual information of both datasets side by side.

    Computes ``pairwise_attributes_mutual_information`` for
    ``self.private_df`` and ``self.synthetic_df`` and draws each result as
    a seaborn heatmap ("GnBu" colormap) in a single 15x6 figure: the
    private data in the left subplot, the synthetic data in the right one.

    The figure is neither returned nor explicitly shown; displaying or
    saving it is left to the caller / the active matplotlib backend.
    """
    mi_private = pairwise_attributes_mutual_information(self.private_df)
    mi_synthetic = pairwise_attributes_mutual_information(self.synthetic_df)

    figure = plt.figure(figsize=(15, 6))
    figure.suptitle(
        'Pairwise Mutual Information Comparison (Private vs Synthetic)',
        fontsize=20,
    )

    # Create both axes up front (1 row x 2 columns), then fill them.
    left_axis = figure.add_subplot(121)
    right_axis = figure.add_subplot(122)
    panels = (
        (left_axis, mi_private, 'Private, max=1'),
        (right_axis, mi_synthetic, 'Synthetic, max=1'),
    )

    for axis, matrix, _ in panels:
        sns.heatmap(matrix, ax=axis, cmap="GnBu")
    for axis, _, title in panels:
        axis.set_title(title, fontsize=15)

    figure.autofmt_xdate()
    figure.tight_layout()
    # Applied after tight_layout; presumably keeps headroom for the suptitle.
    plt.subplots_adjust(top=0.83)
def test_datasynthesizer():
    """End-to-end test: describe a dataset, synthesize from it, compare.

    Steps:
      1. Describe ``data/adult_tiny.csv`` in correlated attribute mode and
         save the description to ``data/description.json``.
      2. Generate 10000 tuples in correlated attribute mode
         (``data/output.csv``) and in random mode
         (``data/output_uniform.csv``) from that description.
      3. For every input column, require the ``ks_test`` result (for
         ``age``) to be below 0.1, or the ``kl_test`` result (all other
         columns) to be below 0.01, between input and correlated output.
      4. Require the summed absolute difference of the pairwise mutual
         information matrices (output vs input) to be below five times
         the same quantity for the random-mode data.

    Note: all generated files are written next to this test's ``data``
    directory.
    """

    def summed_abs_difference(candidate_mi, reference_mi):
        # Total of |candidate - reference| over every cell of the matrix.
        return (candidate_mi - reference_mi).abs().sum().sum()

    # --- file locations -------------------------------------------------
    base = Path(__file__).parent / 'data'
    source_csv = base / 'adult_tiny.csv'
    description_json = base / 'description.json'
    correlated_csv = base / 'output.csv'
    random_csv = base / 'output_uniform.csv'

    # --- synthesis settings ---------------------------------------------
    category_threshold = 20
    is_categorical = {'education': True}
    privacy_epsilon = 1
    network_degree = 2
    n_rows = 10000

    # --- describe the input dataset -------------------------------------
    describer = DataDescriber(category_threshold=category_threshold)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=source_csv,
        epsilon=privacy_epsilon,
        k=network_degree,
        attribute_to_is_categorical=is_categorical,
    )
    describer.save_dataset_description_to_file(description_json)

    # --- generate correlated-mode and random-mode datasets --------------
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(n_rows, description_json)
    generator.save_synthetic_data(correlated_csv)
    generator.generate_dataset_in_random_mode(n_rows, description_json)
    generator.save_synthetic_data(random_csv)

    # --- load everything back -------------------------------------------
    df_input = pd.read_csv(source_csv, skipinitialspace=True)
    df_output = pd.read_csv(correlated_csv)
    df_uniform = pd.read_csv(random_csv)

    # --- per-column comparison ------------------------------------------
    for column in df_input:
        if column == 'age':
            assert ks_test(df_input, df_output, column) < 0.1
            continue
        assert kl_test(df_input, df_output, column) < 0.01

    # --- pairwise mutual information comparison -------------------------
    mi_input = pairwise_attributes_mutual_information(df_input)
    mi_output = pairwise_attributes_mutual_information(df_output)
    mi_uniform = pairwise_attributes_mutual_information(df_uniform)

    output_diff = summed_abs_difference(mi_output, mi_input)
    uniform_diff = summed_abs_difference(mi_uniform, mi_input)

    assert output_diff < 5 * uniform_diff
def get_heatmap_data(dataset_filename):
    """Return pairwise mutual information of a CSV as ``[x, y, value]`` rows.

    The CSV at ``dataset_filename`` is loaded with ``pd.read_csv`` and
    passed to ``pairwise_attributes_mutual_information``. For every ordered
    pair of column labels of the result, one entry ``[x, y, value]`` is
    produced, where ``x`` and ``y`` are the positions of the two labels in
    ``columns`` and ``value`` is the cell at row label ``x`` / column label
    ``y``, rounded to three decimal places.

    Entries are ordered with ``x`` as the outer index and ``y`` as the
    inner index.
    """
    mi_matrix = pairwise_attributes_mutual_information(pd.read_csv(dataset_filename))
    labels = mi_matrix.columns

    cells = []
    for row_pos, row_label in enumerate(labels):
        cells.extend(
            # Scale by 1000, round to the nearest integer, scale back:
            # keeps three decimal places.
            [row_pos, col_pos, int(round(1000 * mi_matrix.loc[row_label, col_label])) / 1000]
            for col_pos, col_label in enumerate(labels)
        )
    return cells