def compare_histograms(self, attribute):
    """Plot the distribution of `attribute` in the private and synthetic datasets side by side.

    Silently returns without plotting for attributes where a histogram
    comparison is not meaningful: DateTime attributes (converted into
    timestamps), non-categorical String attributes, and candidate keys.

    Parameters
    ----------
    attribute : str
        Column name expected in both self.private_df and self.synthetic_df.
    """
    datatype = self.attribute_description[attribute]['data_type']
    is_categorical = self.attribute_description[attribute]['is_categorical']
    # ignore datetime attributes, since they are converted into timestamps
    if datatype == 'DateTime':
        return
    # ignore non-categorical string attributes
    elif datatype == 'String' and not is_categorical:
        return
    elif attribute in self.candidate_keys:
        return
    else:
        fig = plt.figure(figsize=(15, 5), dpi=120)
        ax1 = fig.add_subplot(121)  # private dataset
        ax2 = fig.add_subplot(122)  # synthetic dataset
        if is_categorical:
            dist_priv = self.private_df[attribute].value_counts()
            dist_synt = self.synthetic_df[attribute].value_counts()
            # Align the two categorical distributions on a common support:
            # a category observed in only one dataset gets a zero count in
            # the other.
            # FIX: Series.iteritems() was deprecated in pandas 1.5 and removed
            # in pandas 2.0. The counts were never used, so iterate the index
            # directly instead.
            for idx in dist_priv.index:
                if idx not in dist_synt.index:
                    dist_synt.loc[idx] = 0
            for idx in dist_synt.index:
                if idx not in dist_priv.index:
                    dist_priv.loc[idx] = 0
            # Stringify category labels so mixed-type indices sort consistently.
            dist_priv.index = [str(i) for i in dist_priv.index]
            dist_synt.index = [str(i) for i in dist_synt.index]
            dist_priv.sort_index(inplace=True)
            dist_synt.sort_index(inplace=True)
            pos_priv = list(range(len(dist_priv)))
            pos_synt = list(range(len(dist_synt)))
            ax1.bar(pos_priv, normalize_given_distribution(dist_priv.values))
            ax2.bar(pos_synt, normalize_given_distribution(dist_synt.values))
            # One tick per category, labelled with the category name.
            ax1.set_xticks(arange(min(pos_priv), max(pos_priv) + 1, 1.0))
            ax2.set_xticks(arange(min(pos_synt), max(pos_synt) + 1, 1.0))
            ax1.set_xticklabels(dist_priv.index.tolist(), fontsize=15)
            ax2.set_xticklabels(dist_synt.index.tolist(), fontsize=15)
        # the rest are non-categorical numeric attributes.
        else:
            ax1.hist(self.private_df[attribute].dropna(), bins=15, align='left', density=True)
            ax2.hist(self.synthetic_df[attribute].dropna(), bins=15, align='left', density=True)
            # Give both subplots identical axis limits so the two histograms
            # are visually comparable.
            ax1_x_min, ax1_x_max = ax1.get_xlim()
            ax2_x_min, ax2_x_max = ax2.get_xlim()
            ax1_y_min, ax1_y_max = ax1.get_ylim()
            ax2_y_min, ax2_y_max = ax2.get_ylim()
            x_min = min(ax1_x_min, ax2_x_min)
            x_max = max(ax1_x_max, ax2_x_max)
            y_min = min(ax1_y_min, ax2_y_min)
            y_max = max(ax1_y_max, ax2_y_max)
            ax1.set_xlim([x_min, x_max])
            ax1.set_ylim([y_min, y_max])
            ax2.set_xlim([x_min, x_max])
            ax2.set_ylim([y_min, y_max])
        fig.autofmt_xdate()
def infer_distribution(self): if self.is_categorical: distribution = self.data_dropna.value_counts() for value in set(self.distribution_bins) - set(distribution.index): distribution[value] = 0 distribution.sort_index(inplace=True) self.distribution_probabilities = normalize_given_distribution(distribution) self.distribution_bins = np.array(distribution.index) else: distribution = np.histogram(self.timestamps, bins=self.histogram_size, range=(self.min, self.max)) self.distribution_probabilities = normalize_given_distribution(distribution[0])
def construct_noisy_conditional_distributions(bayesian_network, encoded_dataset, epsilon=0.1): """See more in Algorithm 1 in PrivBayes. """ k = len(bayesian_network[-1][1]) conditional_distributions = {} # first k+1 attributes root = bayesian_network[0][1][0] kplus1_attributes = [root] for child, _ in bayesian_network[:k]: kplus1_attributes.append(child) noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes( kplus1_attributes, encoded_dataset, epsilon) # generate noisy distribution of root attribute. root_stats = noisy_dist_of_kplus1_attributes.loc[:, [root, 'count']].groupby( root).sum()['count'] conditional_distributions[root] = normalize_given_distribution( root_stats).tolist() for idx, (child, parents) in enumerate(bayesian_network): conditional_distributions[child] = {} if idx < k: stats = noisy_dist_of_kplus1_attributes.copy( ).loc[:, parents + [child, 'count']] else: stats = get_noisy_distribution_of_attributes( parents + [child], encoded_dataset, epsilon) stats = DataFrame(stats.loc[:, parents + [child, 'count']].groupby(parents + [child]).sum()) if len(parents) == 1: for parent_instance in stats.index.levels[0]: dist = normalize_given_distribution( stats.loc[parent_instance]['count']).tolist() conditional_distributions[child][str([parent_instance])] = dist else: for parents_instance in product(*stats.index.levels[:-1]): dist = normalize_given_distribution( stats.loc[parents_instance]['count']).tolist() conditional_distributions[child][str( list(parents_instance))] = dist return conditional_distributions
def infer_distribution(self): if self.is_categorical: distribution = self.data_dropna.value_counts() for value in set(self.distribution_bins) - set(distribution.index): distribution[value] = 0 distribution.sort_index(inplace=True) self.distribution_probabilities = utils.normalize_given_distribution(distribution) self.distribution_bins = np.array(distribution.index) else: distribution = np.histogram(self.data_dropna_len, bins=self.histogram_size) self.distribution_bins = distribution[1][:-1] self.distribution_probabilities = utils.normalize_given_distribution(distribution[0])
def inject_laplace_noise(self, epsilon, num_valid_attributes): if epsilon > 0: sensitivity = 2 / self.data.size privacy_budget = epsilon / num_valid_attributes noise_scale = sensitivity / privacy_budget laplace_noises = np.random.laplace(0, scale=noise_scale, size=len(self.distribution_probabilities)) noisy_distribution = self.distribution_probabilities + laplace_noises self.distribution_probabilities = utils.normalize_given_distribution(noisy_distribution)
def exponential_mechanism(dataset, mutual_info_list, epsilon=0.1): """Applied in Exponential Mechanism to sample outcomes.""" num_tuples, num_attributes = dataset.shape mi_array = np.array(mutual_info_list) mi_array = mi_array / (2 * delta(num_attributes, num_tuples, epsilon)) mi_array = np.exp(mi_array) mi_array = normalize_given_distribution(mi_array) return mi_array
def inject_laplace_noise(self, epsilon=0.1, num_valid_attributes=10): if epsilon > 0: noisy_scale = num_valid_attributes / (epsilon * self.data.size) laplace_noises = np.random.laplace( 0, scale=noisy_scale, size=len(self.distribution_probabilities)) noisy_distribution = self.distribution_probabilities + laplace_noises self.distribution_probabilities = utils.normalize_given_distribution( noisy_distribution)
def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary, num_tuples, num_attributes): """Applied in Exponential Mechanism to sample outcomes.""" delta_array = [] for (child, parents) in parents_pair_list: sensitivity = calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary) delta = calculate_delta(num_attributes, sensitivity, epsilon) delta_array.append(delta) mi_array = np.array(mutual_info_list) / (2 * np.array(delta_array)) mi_array = np.exp(mi_array) mi_array = normalize_given_distribution(mi_array) return mi_array