def infer_domain_of_string_attribute(self, attribute):
        datatype = self.attribute_to_datatype[attribute]
        column_values = self.input_dataset[attribute]
        column_dropna = column_values.dropna()
        column_value_lengths = column_dropna.astype(str).map(len)

        is_categorical_attribute = self.is_categorical(attribute)
        if is_categorical_attribute:
            distribution = column_dropna.value_counts()
            distribution.sort_index(inplace=True)
            distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
            distribution_bins = np.array(distribution.index).tolist()
        else:
            distribution = np.histogram(column_value_lengths, bins=self.histogram_size)
            distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
            distribution_bins = distribution[1][:-1].tolist()
            distribution_bins[0] = distribution_bins[0] - 0.001 * (distribution_bins[1] - distribution_bins[0])

        attribute_info = {'datatype': datatype,
                          'is_categorical': is_categorical_attribute,
                          'min_length': int(column_value_lengths.min()),
                          'max_length': int(column_value_lengths.max()),
                          'distribution_bins': distribution_bins,
                          'distribution_probabilities': distribution_probabilities,
                          'missing_rate': column_values.isnull().sum() / column_values.index.size}

        return attribute_info
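A minimal, self-contained sketch of the categorical branch above. The local normalize_given_distribution below is an assumed stand-in for the utils helper (here it just rescales non-negative counts to sum to 1):

import numpy as np
import pandas as pd


def normalize_given_distribution(counts):
    # assumed stand-in for utils.normalize_given_distribution:
    # rescale non-negative counts so they sum to 1
    counts = np.asarray(counts, dtype=float)
    return counts / counts.sum()


column_values = pd.Series(['red', 'blue', 'red', None, 'green', 'red'])
column_dropna = column_values.dropna()

distribution = column_dropna.value_counts().sort_index()
distribution_bins = np.array(distribution.index).tolist()
distribution_probabilities = normalize_given_distribution(distribution).tolist()

print(distribution_bins)           # ['blue', 'green', 'red']
print(distribution_probabilities)  # [0.2, 0.2, 0.6]
print(column_values.isnull().sum() / column_values.index.size)  # missing_rate = 1/6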
    def infer_domain(self, column):
        assert isinstance(column, Series)
        self.data = column
        self.data_dropna = self.data.dropna()
        self.missing_rate = (self.data.size -
                             self.data_dropna.size) / self.data.size
        epoch_datetime = parse('1970-01-01')
        timestamps = self.data_dropna.map(lambda x: int(
            (parse(x) - epoch_datetime).total_seconds()))
        self.min = float(timestamps.min())
        self.max = float(timestamps.max())

        if self.is_categorical:
            distribution = self.data_dropna.value_counts()
            distribution.sort_index(inplace=True)
            self.distribution_probabilities = normalize_given_distribution(
                distribution).tolist()
            self.distribution_bins = np.array(distribution.index).tolist()
        else:
            distribution = np.histogram(timestamps, bins=self.histogram_size)
            self.distribution_probabilities = normalize_given_distribution(
                distribution[0]).tolist()
            bins = distribution[1][:-1].tolist()
            bins[0] = bins[0] - 0.001 * (bins[1] - bins[0])
            self.distribution_bins = bins
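A rough illustration of the timestamp conversion above, assuming parse is dateutil.parser.parse: date strings become seconds since the epoch, and the non-categorical branch then histograms those numbers.

import numpy as np
import pandas as pd
from dateutil.parser import parse

data_dropna = pd.Series(['1970-01-02', '1999-12-31', '2000-01-01'])

epoch_datetime = parse('1970-01-01')
timestamps = data_dropna.map(
    lambda x: int((parse(x) - epoch_datetime).total_seconds()))

print(float(timestamps.min()), float(timestamps.max()))  # 86400.0 946684800.0

# the non-categorical branch histograms these numeric timestamps
counts, edges = np.histogram(timestamps, bins=4)
print(counts.tolist())       # four bin counts
print(edges[:-1].tolist())   # the retained left bin edges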
    def infer_domain_of_numeric_attribute(self, attribute):
        datatype = self.attribute_to_datatype[attribute]
        column_values = self.input_dataset[attribute]
        column_dropna = column_values.dropna()

        # use timestamp to represent datetime
        if datatype == 'datetime':
            column_dropna = column_dropna.map(lambda x: parse(x).timestamp())

        is_categorical_attr = self.is_categorical(attribute)
        if is_categorical_attr:
            distribution = column_dropna.value_counts()
            distribution.sort_index(inplace=True)
            distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
            distribution_bins = np.array(distribution.index).tolist()
        else:
            distribution = np.histogram(column_dropna, bins=self.histogram_size)
            distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
            distribution_bins = distribution[1][:-1].tolist()
            distribution_bins[0] = distribution_bins[0] - 0.001 * (distribution_bins[1] - distribution_bins[0])

        attribute_info = {'datatype': datatype,
                          'is_categorical': is_categorical_attr,
                          'min': float(column_dropna.min()),
                          'max': float(column_dropna.max()),
                          'distribution_bins': distribution_bins,
                          'distribution_probabilities': distribution_probabilities,
                          'missing_rate': column_values.isnull().sum() / column_values.index.size}

        if datatype == 'integer':
            attribute_info['min'] = int(column_dropna.min())
            attribute_info['max'] = int(column_dropna.max())

        return attribute_info
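A small sketch of the non-categorical (histogram) branch on toy numeric data, again with an assumed stand-in for utils.normalize_given_distribution. Note how only the left bin edges are kept and the first edge is nudged slightly below the observed minimum, exactly as in the code above.

import numpy as np
import pandas as pd


def normalize_given_distribution(counts):
    # assumed stand-in for the library helper: rescale counts to sum to 1
    counts = np.asarray(counts, dtype=float)
    return counts / counts.sum()


column_dropna = pd.Series([1.0, 2.5, 3.0, 7.5, 10.0])
histogram_size = 4

counts, edges = np.histogram(column_dropna, bins=histogram_size)
distribution_probabilities = normalize_given_distribution(counts).tolist()

# keep only the left edges, then nudge the first one slightly below the minimum
distribution_bins = edges[:-1].tolist()
distribution_bins[0] = distribution_bins[0] - 0.001 * (distribution_bins[1] - distribution_bins[0])

print(distribution_probabilities)  # [0.6, 0.0, 0.2, 0.2]
print(distribution_bins)           # approximately [0.99775, 3.25, 5.5, 7.75]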
Example #4
def construct_noisy_conditional_distributions(bayesian_network,
                                              encoded_dataset,
                                              epsilon=0.1):
    """See more in Algorithm 1 in PrivBayes.

    """

    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # first k+1 attributes
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root]
    for child, _ in bayesian_network[:k]:
        kplus1_attributes.append(child)

    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(
        kplus1_attributes, encoded_dataset, epsilon)

    # generate noisy distribution of root attribute.
    root_stats = noisy_dist_of_kplus1_attributes.loc[:,
                                                     [root, 'count']].groupby(
                                                         root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(
        root_stats).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}

        if idx < k:
            stats = noisy_dist_of_kplus1_attributes.copy(
            ).loc[:, parents + [child, 'count']]
        else:
            stats = get_noisy_distribution_of_attributes(
                parents + [child], encoded_dataset, epsilon)

        stats = DataFrame(stats.loc[:, parents +
                                    [child, 'count']].groupby(parents +
                                                              [child]).sum())

        if len(parents) == 1:
            for parent_instance in stats.index.levels[0]:
                dist = normalize_given_distribution(
                    stats.loc[parent_instance]['count']).tolist()
                conditional_distributions[child][str([parent_instance])] = dist
        else:
            for parents_instance in product(*stats.index.levels[:-1]):
                dist = normalize_given_distribution(
                    stats.loc[parents_instance]['count']).tolist()
                conditional_distributions[child][str(
                    list(parents_instance))] = dist

    return conditional_distributions
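Stripped of the Laplace noise (get_noisy_distribution_of_attributes is skipped), the heart of the loop above is a grouped count followed by per-parent normalization. A toy sketch on a hand-made dataset, with a local stand-in for normalize_given_distribution; the real encoded dataset holds bin indices for every attribute.

import numpy as np
import pandas as pd


def normalize_given_distribution(counts):
    # assumed stand-in for the library helper: rescale counts to sum to 1
    counts = np.asarray(counts, dtype=float)
    return counts / counts.sum()


encoded_dataset = pd.DataFrame({'education': ['HS', 'HS', 'BS', 'BS', 'BS', 'HS'],
                                'income_bin': [0, 1, 0, 1, 1, 0]})
encoded_dataset['count'] = 1

child, parents = 'income_bin', ['education']

# noiseless counterpart of the grouped counts used above
stats = encoded_dataset.groupby(parents + [child]).sum()

conditional_distributions = {}
for parent_instance in stats.index.levels[0]:
    dist = normalize_given_distribution(stats.loc[parent_instance]['count']).tolist()
    conditional_distributions[str([parent_instance])] = dist

print(conditional_distributions)
# {"['BS']": [0.33.., 0.66..], "['HS']": [0.66.., 0.33..]}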
def infer_distribution(self):
    if self.is_categorical:
        # counts per observed value, zero-filled for domain values not seen here
        distribution = self.data_dropna.value_counts()
        for value in set(self.distribution_bins) - set(distribution.index):
            distribution[value] = 0
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = utils.normalize_given_distribution(distribution)
        self.distribution_bins = np.array(distribution.index)
    else:
        # non-categorical strings: histogram over the pre-computed value lengths
        distribution = np.histogram(self.data_dropna_len, bins=self.histogram_size)
        self.distribution_bins = distribution[1][:-1]
        self.distribution_probabilities = utils.normalize_given_distribution(distribution[0])
def construct_noisy_conditional_distributions(bayesian_network,
                                              encoded_dataset,
                                              epsilon=0.1):
    """See more in Algorithm 1 in PrivBayes."""

    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # first k+1 attributes
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root]
    for child, _ in bayesian_network[:k]:
        kplus1_attributes.append(child)

    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(
        kplus1_attributes, encoded_dataset, epsilon)

    # generate noisy distribution of root attribute.
    root_stats = noisy_dist_of_kplus1_attributes.loc[:,
                                                     [root, 'count']].groupby(
                                                         root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(
        root_stats).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}

        if idx <= k - 2:
            stats = noisy_dist_of_kplus1_attributes.copy(
            ).loc[:, parents + [child, 'count']]
            stats = stats.groupby(parents + [child], as_index=False).sum()
        elif idx == k - 1:
            stats = noisy_dist_of_kplus1_attributes.loc[:, parents +
                                                        [child, 'count']]
        else:
            stats = get_noisy_distribution_of_attributes(
                parents + [child], encoded_dataset, epsilon)
            stats = stats.loc[:, parents + [child, 'count']]

        for parents_instance, stats_sub in stats.groupby(parents):
            stats_sub = stats_sub.sort_values(by=child)
            dist = normalize_given_distribution(stats_sub['count']).tolist()

            if len(parents) == 1:
                parents_key = str([parents_instance])
            else:
                parents_key = str(list(parents_instance))

            conditional_distributions[child][parents_key] = dist

    return conditional_distributions
def infer_distribution(self):
    if self.is_categorical:
        distribution = self.data_dropna.value_counts()
        for value in set(self.distribution_bins) - set(distribution.index):
            distribution[value] = 0
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = normalize_given_distribution(
            distribution)
        self.distribution_bins = np.array(distribution.index)
    else:
        distribution = np.histogram(self.timestamps,
                                    bins=self.histogram_size,
                                    range=(self.min, self.max))
        self.distribution_probabilities = normalize_given_distribution(
            distribution[0])
Example #8
def inject_laplace_noise(self, epsilon=0.1, num_valid_attributes=10):
    # noise scale grows with the number of attributes and shrinks with dataset size
    noisy_scale = num_valid_attributes / (epsilon * self.data.size)
    laplace_noises = np.random.laplace(
        0, scale=noisy_scale, size=len(self.distribution_probabilities))
    noisy_distribution = np.asarray(
        self.distribution_probabilities) + laplace_noises
    self.distribution_probabilities = utils.normalize_given_distribution(
        noisy_distribution).tolist()
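A self-contained sketch of the same idea on a toy probability vector. The stand-in helper below clips negative entries before rescaling, since Laplace noise can push probabilities below zero; that clipping behaviour is an assumption about the real utils helper.

import numpy as np


def normalize_given_distribution(frequencies):
    # assumed stand-in: clip negatives, then rescale to sum to 1
    dist = np.clip(np.asarray(frequencies, dtype=float), 0, None)
    total = dist.sum()
    return dist / total if total > 0 else np.full_like(dist, 1 / dist.size)


rng = np.random.default_rng(0)

distribution_probabilities = np.array([0.5, 0.3, 0.2])
epsilon, num_valid_attributes, n = 0.1, 10, 1000

# same scale as the snippet above: num_valid_attributes / (epsilon * n)
noisy_scale = num_valid_attributes / (epsilon * n)
noisy_distribution = distribution_probabilities + rng.laplace(
    0, scale=noisy_scale, size=distribution_probabilities.size)

print(normalize_given_distribution(noisy_distribution).tolist())  # still sums to 1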
Example #9
def exponential_mechanism(dataset, mutual_info_list, epsilon=0.1):
    """Applied in Exponential Mechanism to sample outcomes."""
    num_tuples, num_attributes = dataset.shape
    mi_array = np.array(mutual_info_list)
    mi_array = mi_array / (2 * delta(num_attributes, num_tuples, epsilon))
    mi_array = np.exp(mi_array)
    mi_array = normalize_given_distribution(mi_array)
    return mi_array
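A worked toy version of the weighting above: every mutual-information score becomes a weight proportional to exp(score / (2 * delta)), and the normalized weights form the sampling distribution. The delta value here is a made-up placeholder, not what delta() would return.

import numpy as np


def normalize_given_distribution(frequencies):
    # assumed stand-in for the library helper: rescale non-negative weights to sum to 1
    weights = np.asarray(frequencies, dtype=float)
    return weights / weights.sum()


mutual_info_list = [0.8, 0.5, 0.1]  # scores of three candidate (child, parents) pairs
delta_value = 0.2                   # hypothetical scale; the real one depends on n and epsilon

weights = np.exp(np.array(mutual_info_list) / (2 * delta_value))
sampling_distribution = normalize_given_distribution(weights)

# higher mutual information -> proportionally higher probability of being sampled
print(sampling_distribution.tolist())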
def inject_laplace_noise_into_distribution_per_attribute(self, epsilon=0.1):
    h = self.input_dataset.columns.size
    for attr in self.dataset_description['attribute_description']:
        distribution = self.dataset_description['attribute_description'][attr]['distribution_probabilities']
        noisy_scale = h / (epsilon * self.input_dataset.shape[0])
        laplace_noises = np.random.laplace(0, scale=noisy_scale, size=len(distribution))
        noisy_distribution = np.asarray(distribution) + laplace_noises
        noisy_distribution = utils.normalize_given_distribution(noisy_distribution).tolist()
        self.dataset_description['attribute_description'][attr]['distribution_probabilities'] = noisy_distribution
    def infer_domain(self, column):
        """ Infer domain, including min, max, and 1-D distribution."""
        assert isinstance(column, pd.Series)
        self.data = column
        self.data_dropna = self.data.dropna()
        self.missing_rate = (self.data.size - self.data_dropna.size) / self.data.size
        self.min = float(self.data_dropna.min())
        self.max = float(self.data_dropna.max())

        if self.is_categorical:
            distribution = self.data_dropna.value_counts()
            distribution.sort_index(inplace=True)
            self.distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
            self.distribution_bins = np.array(distribution.index).tolist()
        else:
            distribution = np.histogram(self.data_dropna, bins=self.histogram_size)
            self.distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
            bins = distribution[1][:-1].tolist()
            bins[0] = bins[0] - 0.001 * (bins[1] - bins[0])
            self.distribution_bins = bins
def inject_laplace_noise(self, epsilon, num_valid_attributes):
    if epsilon > 0:
        sensitivity = 2 / self.data.size
        privacy_budget = epsilon / num_valid_attributes
        noise_scale = sensitivity / privacy_budget
        laplace_noises = np.random.laplace(
            0,
            scale=noise_scale,
            size=len(self.distribution_probabilities))
        noisy_distribution = self.distribution_probabilities + laplace_noises
        self.distribution_probabilities = utils.normalize_given_distribution(
            noisy_distribution)
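In this variant the scale is built explicitly from a sensitivity of 2 / n and a per-attribute budget of epsilon / num_valid_attributes, which works out to 2 * num_valid_attributes / (epsilon * n), twice the scale used in the earlier inject_laplace_noise example; the epsilon > 0 guard also lets callers disable the noise entirely by passing epsilon = 0.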
def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list,
                          attr_to_is_binary, num_tuples, num_attributes):
    """Applied in Exponential Mechanism to sample outcomes."""
    delta_array = []
    for (child, parents) in parents_pair_list:
        sensitivity = calculate_sensitivity(num_tuples, child, parents,
                                            attr_to_is_binary)
        delta = calculate_delta(num_attributes, sensitivity, epsilon)
        delta_array.append(delta)

    mi_array = np.array(mutual_info_list) / (2 * np.array(delta_array))
    mi_array = np.exp(mi_array)
    mi_array = normalize_given_distribution(mi_array)
    return mi_array
def describe_dataset_in_random_mode(self,
                                    dataset_file,
                                    attribute_to_datatype={},
                                    attribute_to_is_categorical={},
                                    seed=0):
    self.describe_dataset_in_independent_attribute_mode(
        dataset_file,
        attribute_to_datatype=attribute_to_datatype,
        attribute_to_is_categorical=attribute_to_is_categorical,
        seed=seed)
    # After running independent attribute mode, 1) make all distributions uniform; 2) set missing rate to zero.
    for attr in self.dataset_description['attribute_description']:
        distribution = self.dataset_description['attribute_description'][attr]['distribution_probabilities']
        uniform_distribution = np.ones_like(distribution)
        uniform_distribution = utils.normalize_given_distribution(uniform_distribution).tolist()
        self.dataset_description['attribute_description'][attr]['distribution_probabilities'] = uniform_distribution
        self.dataset_description['attribute_description'][attr]['missing_rate'] = 0
    def compare_histograms(self, attribute):
        datatype = self.attribute_description[attribute]['data_type']
        is_categorical = self.attribute_description[attribute][
            'is_categorical']

        # ignore datetime attributes, since they are converted into timestamps
        if datatype == 'DateTime':
            return
        # ignore non-categorical string attributes
        elif datatype == 'String' and not is_categorical:
            return
        elif attribute in self.candidate_keys:
            return
        else:
            fig = plt.figure(figsize=(15, 5), dpi=120)
            ax1 = fig.add_subplot(121)
            ax2 = fig.add_subplot(122)

            if is_categorical:
                dist_priv = self.private_df[attribute].value_counts()
                dist_synt = self.synthetic_df[attribute].value_counts()
                # make both value_counts cover the same set of categories
                for idx, number in dist_priv.items():
                    if idx not in dist_synt.index:
                        dist_synt.loc[idx] = 0
                for idx, number in dist_synt.items():
                    if idx not in dist_priv.index:
                        dist_priv.loc[idx] = 0
                dist_priv.index = [str(i) for i in dist_priv.index]
                dist_synt.index = [str(i) for i in dist_synt.index]
                dist_priv.sort_index(inplace=True)
                dist_synt.sort_index(inplace=True)
                pos_priv = list(range(len(dist_priv)))
                pos_synt = list(range(len(dist_synt)))
                ax1.bar(pos_priv,
                        normalize_given_distribution(dist_priv.values))
                ax2.bar(pos_synt,
                        normalize_given_distribution(dist_synt.values))
                ax1.set_xticks(arange(min(pos_priv), max(pos_priv) + 1, 1.0))
                ax2.set_xticks(arange(min(pos_synt), max(pos_synt) + 1, 1.0))
                ax1.set_xticklabels(dist_priv.index.tolist(), fontsize=15)
                ax2.set_xticklabels(dist_synt.index.tolist(), fontsize=15)
            # the rest are non-categorical numeric attributes.
            else:
                ax1.hist(self.private_df[attribute].dropna(),
                         bins=15,
                         align='left',
                         density=True)
                ax2.hist(self.synthetic_df[attribute].dropna(),
                         bins=15,
                         align='left',
                         density=True)

            ax1_x_min, ax1_x_max = ax1.get_xlim()
            ax2_x_min, ax2_x_max = ax2.get_xlim()
            ax1_y_min, ax1_y_max = ax1.get_ylim()
            ax2_y_min, ax2_y_max = ax2.get_ylim()
            x_min = min(ax1_x_min, ax2_x_min)
            x_max = max(ax1_x_max, ax2_x_max)
            y_min = min(ax1_y_min, ax2_y_min)
            y_max = max(ax1_y_max, ax2_y_max)
            ax1.set_xlim([x_min, x_max])
            ax1.set_ylim([y_min, y_max])
            ax2.set_xlim([x_min, x_max])
            ax2.set_ylim([y_min, y_max])
            fig.autofmt_xdate()
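The category-alignment loops in compare_histograms can also be expressed with reindex; a small self-contained sketch of the same alignment (not the library's own code):

import pandas as pd

private = pd.Series(['a', 'a', 'b'])
synthetic = pd.Series(['a', 'c'])

dist_priv = private.value_counts()
dist_synt = synthetic.value_counts()

# give both distributions the same category set, filling absent values with 0
all_values = dist_priv.index.union(dist_synt.index)
dist_priv = dist_priv.reindex(all_values, fill_value=0).sort_index()
dist_synt = dist_synt.reindex(all_values, fill_value=0).sort_index()

print(dist_priv.index.tolist(), dist_priv.tolist())  # ['a', 'b', 'c'] [2, 1, 0]
print(dist_synt.index.tolist(), dist_synt.tolist())  # ['a', 'b', 'c'] [1, 0, 1]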