Example #1
0
    def compare_histograms(self, attribute):
        """Plot side-by-side distributions of *attribute* in the private vs. synthetic data.

        Left subplot shows the private dataset, right subplot the synthetic one,
        with shared x/y limits so the two are visually comparable.

        Skipped entirely (no figure created) for:
          - DateTime attributes (already converted into timestamps elsewhere),
          - non-categorical String attributes,
          - candidate-key attributes.
        """
        datatype = self.attribute_description[attribute]['data_type']
        is_categorical = self.attribute_description[attribute]['is_categorical']

        # ignore datetime attributes, since they are converted into timestamps
        if datatype == 'DateTime':
            return
        # ignore non-categorical string attributes
        if datatype == 'String' and not is_categorical:
            return
        if attribute in self.candidate_keys:
            return

        fig = plt.figure(figsize=(15, 5), dpi=120)
        ax1 = fig.add_subplot(121)
        ax2 = fig.add_subplot(122)

        if is_categorical:
            dist_priv = self.private_df[attribute].value_counts()
            dist_synt = self.synthetic_df[attribute].value_counts()
            # Align both distributions on the union of observed categories,
            # filling categories missing from one side with a count of 0.
            # reindex replaces the old Series.iteritems() loops, which were
            # removed in pandas 2.0.
            all_categories = dist_priv.index.union(dist_synt.index)
            dist_priv = dist_priv.reindex(all_categories, fill_value=0)
            dist_synt = dist_synt.reindex(all_categories, fill_value=0)
            # Stringify labels first so mixed-type categories sort consistently.
            dist_priv.index = [str(i) for i in dist_priv.index]
            dist_synt.index = [str(i) for i in dist_synt.index]
            dist_priv.sort_index(inplace=True)
            dist_synt.sort_index(inplace=True)
            pos_priv = list(range(len(dist_priv)))
            pos_synt = list(range(len(dist_synt)))
            # Plot normalized frequencies so both sides are on the same scale.
            ax1.bar(pos_priv, normalize_given_distribution(dist_priv.values))
            ax2.bar(pos_synt, normalize_given_distribution(dist_synt.values))
            ax1.set_xticks(arange(min(pos_priv), max(pos_priv) + 1, 1.0))
            ax2.set_xticks(arange(min(pos_synt), max(pos_synt) + 1, 1.0))
            ax1.set_xticklabels(dist_priv.index.tolist(), fontsize=15)
            ax2.set_xticklabels(dist_synt.index.tolist(), fontsize=15)
        # the rest are non-categorical numeric attributes.
        else:
            ax1.hist(self.private_df[attribute].dropna(), bins=15, align='left', density=True)
            ax2.hist(self.synthetic_df[attribute].dropna(), bins=15, align='left', density=True)

        # Harmonize axis limits so both subplots share the same scale.
        ax1_x_min, ax1_x_max = ax1.get_xlim()
        ax2_x_min, ax2_x_max = ax2.get_xlim()
        ax1_y_min, ax1_y_max = ax1.get_ylim()
        ax2_y_min, ax2_y_max = ax2.get_ylim()
        x_min = min(ax1_x_min, ax2_x_min)
        x_max = max(ax1_x_max, ax2_x_max)
        y_min = min(ax1_y_min, ax2_y_min)
        y_max = max(ax1_y_max, ax2_y_max)
        ax1.set_xlim([x_min, x_max])
        ax1.set_ylim([y_min, y_max])
        ax2.set_xlim([x_min, x_max])
        ax2.set_ylim([y_min, y_max])
        fig.autofmt_xdate()
Example #2
0
 def infer_distribution(self):
     """Estimate this attribute's distribution from its non-null values.

     Categorical attributes get per-category frequencies (padded so every
     known bin appears); otherwise a fixed-size histogram is taken over
     the timestamp values.
     """
     if self.is_categorical:
         frequency = self.data_dropna.value_counts()
         # Pad bins that never occur in the data with a zero count.
         for absent_bin in set(self.distribution_bins) - set(frequency.index):
             frequency[absent_bin] = 0
         frequency.sort_index(inplace=True)
         self.distribution_probabilities = normalize_given_distribution(frequency)
         self.distribution_bins = np.array(frequency.index)
     else:
         counts, _ = np.histogram(self.timestamps, bins=self.histogram_size, range=(self.min, self.max))
         self.distribution_probabilities = normalize_given_distribution(counts)
Example #3
0
def construct_noisy_conditional_distributions(bayesian_network,
                                              encoded_dataset,
                                              epsilon=0.1):
    """See more in Algorithm 1 in PrivBayes.

    Build noisy conditional distributions P(child | parents) for every edge
    of *bayesian_network*, plus the marginal of the root attribute.

    Parameters are assumed to follow the PrivBayes conventions:
    bayesian_network is a list of (child, parents) pairs, encoded_dataset a
    DataFrame of encoded attribute values, and epsilon the privacy budget
    passed through to get_noisy_distribution_of_attributes — TODO confirm
    against the callers.

    Returns a dict mapping each attribute to either a plain probability list
    (root) or a dict keyed by str(parent-instance list) with a probability
    list per parent combination.
    """

    # k = maximum number of parents (degree of the network); the last edge
    # added always has the full k parents.
    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # first k+1 attributes
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root]
    for child, _ in bayesian_network[:k]:
        kplus1_attributes.append(child)

    # One joint noisy distribution over the first k+1 attributes is reused
    # below for the first k edges, saving privacy budget.
    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(
        kplus1_attributes, encoded_dataset, epsilon)

    # generate noisy distribution of root attribute.
    root_stats = noisy_dist_of_kplus1_attributes.loc[:,
                                                     [root, 'count']].groupby(
                                                         root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(
        root_stats).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}

        # The first k edges are covered by the joint distribution computed
        # above; later edges each need their own noisy distribution.
        if idx < k:
            stats = noisy_dist_of_kplus1_attributes.copy(
            ).loc[:, parents + [child, 'count']]
        else:
            stats = get_noisy_distribution_of_attributes(
                parents + [child], encoded_dataset, epsilon)

        # Aggregate counts over (parents..., child); the groupby yields a
        # MultiIndex whose leading levels are the parent attributes.
        stats = DataFrame(stats.loc[:, parents +
                                    [child, 'count']].groupby(parents +
                                                              [child]).sum())

        if len(parents) == 1:
            # Single parent: index level 0 enumerates its values directly.
            for parent_instance in stats.index.levels[0]:
                dist = normalize_given_distribution(
                    stats.loc[parent_instance]['count']).tolist()
                conditional_distributions[child][str([parent_instance])] = dist
        else:
            # Multiple parents: iterate the Cartesian product of all parent
            # levels (every combination, whether or not observed).
            for parents_instance in product(*stats.index.levels[:-1]):
                dist = normalize_given_distribution(
                    stats.loc[parents_instance]['count']).tolist()
                conditional_distributions[child][str(
                    list(parents_instance))] = dist

    return conditional_distributions
 def infer_distribution(self):
     """Estimate this attribute's probability distribution from non-null data.

     Categorical attributes use per-category frequencies padded with every
     known bin; numeric attributes use a fixed-size histogram whose bins
     are identified by their left edges.
     """
     if self.is_categorical:
         frequency = self.data_dropna.value_counts()
         # Make sure every known bin is present, even if unseen in the data.
         for absent_bin in set(self.distribution_bins) - set(frequency.index):
             frequency[absent_bin] = 0
         frequency.sort_index(inplace=True)
         self.distribution_probabilities = utils.normalize_given_distribution(frequency)
         self.distribution_bins = np.array(frequency.index)
     else:
         counts, edges = np.histogram(self.data_dropna_len, bins=self.histogram_size)
         # Drop the final right edge so each bin maps to its left edge.
         self.distribution_bins = edges[:-1]
         self.distribution_probabilities = utils.normalize_given_distribution(counts)
 def inject_laplace_noise(self, epsilon, num_valid_attributes):
     """Perturb the inferred distribution with Laplace noise for differential privacy.

     No-op when epsilon is not positive. The total budget *epsilon* is split
     evenly across *num_valid_attributes*.
     """
     if epsilon <= 0:
         return
     # Per-attribute share of the privacy budget.
     budget_per_attribute = epsilon / num_valid_attributes
     # Sensitivity of a normalized histogram is 2 / N.
     scale = (2 / self.data.size) / budget_per_attribute
     noise = np.random.laplace(0, scale=scale, size=len(self.distribution_probabilities))
     self.distribution_probabilities = utils.normalize_given_distribution(
         self.distribution_probabilities + noise)
Example #6
0
def exponential_mechanism(dataset, mutual_info_list, epsilon=0.1):
    """Applied in Exponential Mechanism to sample outcomes."""
    num_tuples, num_attributes = dataset.shape
    # Scale each mutual-information score by 2*delta, exponentiate, then
    # normalize into a sampling distribution.
    scaled_scores = np.array(mutual_info_list) / (2 * delta(num_attributes, num_tuples, epsilon))
    return normalize_given_distribution(np.exp(scaled_scores))
 def inject_laplace_noise(self, epsilon=0.1, num_valid_attributes=10):
     """Add Laplace noise to the inferred distribution for differential privacy.

     No-op when epsilon is not positive. The noise scale grows with the
     number of attributes sharing the budget and shrinks with data size.
     """
     if epsilon <= 0:
         return
     scale = num_valid_attributes / (epsilon * self.data.size)
     perturbed = self.distribution_probabilities + np.random.laplace(
         0, scale=scale, size=len(self.distribution_probabilities))
     self.distribution_probabilities = utils.normalize_given_distribution(perturbed)
def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary, num_tuples, num_attributes):
    """Applied in Exponential Mechanism to sample outcomes."""
    # Per-pair delta derived from each (child, parents) pair's sensitivity.
    delta_array = np.array([
        calculate_delta(num_attributes,
                        calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary),
                        epsilon)
        for child, parents in parents_pair_list
    ])

    # Scale mutual-information scores, exponentiate, and normalize into
    # a sampling distribution.
    scores = np.exp(np.array(mutual_info_list) / (2 * delta_array))
    return normalize_given_distribution(scores)