Example #1
    def get_preprocessor(self):
        """DP Count preprocessor"""
        if self.has_error():
            return

        # Have we already assembled it?
        if self.preprocessor is not None:
            # Yes!
            return self.preprocessor

        preprocessor = (
            # Select a column of the dataframe as a Vec<str>
            make_select_column(key=self.col_index, TOA=str) >>
            # Cast the column to str
            make_cast(TIA=str, TOA=str) >>
            # Impute missing values
            make_impute_constant(self.fixed_value) >>
            # Count!
            make_count(TIA=str))

        self.scale = binary_search(
            lambda s: self.check_scale(s, preprocessor, 1, self.epsilon),
            bounds=(0.0, 1000.0))

        preprocessor = preprocessor >> make_base_geometric(self.scale)

        # Keep a reference to the preprocessor to re-use in .run_chain(...)
        self.preprocessor = preprocessor

        return preprocessor
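A minimal standalone sketch of the same DP-count pattern, assuming the legacy OpenDP combinator API (module paths such as opendp.transformations and opendp.measurements vary across versions); the separator, column names, and input string are hypothetical:

    from opendp.transformations import (
        make_split_dataframe, make_select_column, make_cast,
        make_impute_constant, make_count)
    from opendp.measurements import make_base_geometric
    from opendp.mod import enable_features, binary_search

    enable_features('floating-point', 'contrib')

    epsilon = 1.0
    # The same chain as get_preprocessor, fed from a raw CSV string
    counter = (
        make_split_dataframe(separator=",", col_names=["A", "B"]) >>
        make_select_column(key="A", TOA=str) >>
        make_cast(TIA=str, TOA=str) >>
        make_impute_constant("") >>
        make_count(TIA=str))

    # Smallest noise scale that is epsilon-DP at dataset distance 1
    scale = binary_search(
        lambda s: (counter >> make_base_geometric(s)).check(1, epsilon),
        bounds=(0.0, 1000.0))

    dp_count = counter >> make_base_geometric(scale)
    print(dp_count("alice,1\nbob,2\ncarol,3"))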
Example #2
    def check_scale(self, scale, preprocessor, dataset_distance, epsilon):
        """
        Return T/F
        :param scale:
        :param preprocessor:
        :param dataset_distance:
        # :param epsilon:
        :return:
        """
        if self.has_error():
            return

        return (preprocessor >> make_base_geometric(scale)).check(
            dataset_distance, epsilon)
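The privacy test here is Measurement.check(d_in, d_out), which in the legacy OpenDP API returns True when the measurement maps datasets differing by at most d_in records to output distributions at most d_out apart. For the geometric mechanism the relation is epsilon = d_in / scale, so a quick sketch:

    from opendp.transformations import make_count
    from opendp.measurements import make_base_geometric
    from opendp.mod import enable_features

    enable_features('floating-point', 'contrib')

    meas = make_count(TIA=str) >> make_base_geometric(scale=1.0)
    print(meas.check(1, 1.0))  # True: scale 1.0 achieves epsilon = 1.0
    print(meas.check(1, 0.5))  # False: epsilon = 0.5 would need scale >= 2.0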
Example #3
    def _compute_noise_scale(self):
        """Find the smallest geometric noise scale that satisfies epsilon."""
        if self.scale is not None:
            return
        lower = self.lower
        upper = self.upper
        max_contrib = self.max_contrib
        # TODO: should probably just check and throw if not int
        bounds = (int(lower), int(upper))
        enable_features('floating-point', 'contrib')
        bounded_sum = (
            make_clamp(bounds=bounds) >> make_bounded_sum(bounds=bounds))
        discovered_scale = binary_search_param(
            lambda s: bounded_sum >> make_base_geometric(scale=s),
            d_in=max_contrib,
            d_out=self.epsilon)
        self.scale = discovered_scale
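Where Example #1 binary-searched a hand-written predicate, binary_search_param searches the mechanism parameter directly: given a constructor from scale to measurement, it returns the smallest scale whose privacy map satisfies (d_in, d_out). A standalone sketch with made-up bounds and budget:

    from opendp.transformations import make_clamp, make_bounded_sum
    from opendp.measurements import make_base_geometric
    from opendp.mod import enable_features, binary_search_param

    enable_features('floating-point', 'contrib')

    bounds = (0, 100)   # hypothetical clamping bounds
    max_contrib = 1     # one record per individual
    epsilon = 1.0

    bounded_sum = make_clamp(bounds=bounds) >> make_bounded_sum(bounds=bounds)
    scale = binary_search_param(
        lambda s: bounded_sum >> make_base_geometric(scale=s),
        d_in=max_contrib, d_out=epsilon)

    dp_sum = bounded_sum >> make_base_geometric(scale=scale)
    print(dp_sum([3, 141, 59, 26]))  # out-of-range values are clamped first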
Example #4
    def __init__(self,
                 data,
                 output_info,
                 log_frequency,
                 *ignore,
                 per_column_epsilon=None,
                 discrete_column_category_prob=None,
                 **kwargs):
        self._data = data
        self._per_column_epsilon = per_column_epsilon

        if per_column_epsilon:
            if per_column_epsilon <= 0.0:
                raise ValueError("per_column_epsilon must be positive")
            bounds = (0, 1)
            max_contrib = 1
            enable_features('contrib')
            bounded_sum = (
                make_clamp(bounds=bounds) >> make_bounded_sum(bounds=bounds))
            discovered_scale = binary_search_param(
                lambda s: bounded_sum >> make_base_geometric(scale=s),
                d_in=max_contrib,
                d_out=(self._per_column_epsilon))
            self._per_column_scale = discovered_scale
        else:
            self._per_column_scale = None
            if discrete_column_category_prob is None:
                warnings.warn(
                    "per_column_epsilon is not set, and no cached probabilites have been provided. "
                    "Sampler will not privatize frequencies, which may cause privacy leaks"
                )

        def is_discrete_column(column_info):
            return (len(column_info) == 1
                    and column_info[0].activation_fn == "softmax")

        n_discrete_columns = sum([
            1 for column_info in output_info if is_discrete_column(column_info)
        ])

        self._discrete_column_matrix_st = np.zeros(n_discrete_columns,
                                                   dtype="int32")

        # Store the row ids for each category in each discrete column.
        # For example, _rid_by_cat_cols[a][b] is a list of all rows whose
        # a-th discrete column equals value b.
        self._rid_by_cat_cols = []

        # Compute _rid_by_cat_cols
        st = 0
        for column_info in output_info:
            if is_discrete_column(column_info):
                span_info = column_info[0]
                ed = st + span_info.dim

                rid_by_cat = []
                for j in range(span_info.dim):
                    rid_by_cat.append(np.nonzero(data[:, st + j])[0])
                self._rid_by_cat_cols.append(rid_by_cat)
                st = ed
            else:
                st += sum([span_info.dim for span_info in column_info])
        assert st == data.shape[1]

        # Prepare an interval matrix for efficiently sampling the conditional vector
        max_category = max([
            column_info[0].dim
            for column_info in output_info if is_discrete_column(column_info)
        ],
                           default=0)

        self._discrete_column_cond_st = np.zeros(n_discrete_columns,
                                                 dtype='int32')
        self._discrete_column_n_category = np.zeros(n_discrete_columns,
                                                    dtype='int32')
        self._discrete_column_category_prob = np.zeros(
            (n_discrete_columns, max_category))
        self._n_discrete_columns = n_discrete_columns
        self._n_categories = sum([
            column_info[0].dim for column_info in output_info
            if is_discrete_column(column_info)
        ])

        eps_tot = 0.0
        st = 0
        current_id = 0
        current_cond_st = 0
        for column_info in output_info:
            if is_discrete_column(column_info):
                span_info = column_info[0]
                ed = st + span_info.dim
                category_freq = np.sum(data[:, st:ed], axis=0)
                # Privatize the raw category counts with the geometric mechanism
                if self._per_column_scale:
                    geom = make_base_geometric(self._per_column_scale)
                    category_freq = [geom(int(v)) for v in category_freq]
                    eps_tot += self._per_column_epsilon
                # Clamp noisy counts to at least 1
                category_freq = [1 if v < 1 else v for v in category_freq]
                if np.sum(category_freq) < 100:
                    # not enough data; use uniform distribution
                    category_freq = [1 for _ in category_freq]
                category_freq = np.array(category_freq, dtype='float64')
                if log_frequency:
                    category_freq = np.log(category_freq + 1)
                category_prob = category_freq / np.sum(category_freq)
                self._discrete_column_category_prob[
                    current_id, :span_info.dim] = (category_prob)
                self._discrete_column_cond_st[current_id] = current_cond_st
                self._discrete_column_n_category[current_id] = span_info.dim
                current_cond_st += span_info.dim
                current_id += 1
                st = ed
            else:
                st += sum([span_info.dim for span_info in column_info])
        self.total_spent = eps_tot

        if discrete_column_category_prob is not None:
            assert len(discrete_column_category_prob) == n_discrete_columns
            for i in range(n_discrete_columns):
                self._discrete_column_category_prob[
                    i, :] = discrete_column_category_prob[i]
            self.total_spent = 0.0  # no budget is spent when using cached probabilities
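Isolating the privatization step above: each discrete column's raw category counts receive independent geometric noise, so every privatized column spends per_column_epsilon and total_spent is their sum. A reduced sketch of just that step, with a hypothetical one-hot block and a pre-computed scale:

    import numpy as np
    from opendp.measurements import make_base_geometric
    from opendp.mod import enable_features

    enable_features('floating-point', 'contrib')

    # Hypothetical one-hot block for one discrete column with 3 categories
    rng = np.random.default_rng(0)
    data = np.eye(3)[rng.integers(0, 3, size=500)]

    category_freq = np.sum(data, axis=0)   # raw count per category
    geom = make_base_geometric(scale=2.0)  # scale found as in __init__ above
    noisy = [max(1, geom(int(v))) for v in category_freq]

    category_prob = np.array(noisy, dtype='float64')
    category_prob /= category_prob.sum()   # sampling distribution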
Example #5
    def release(self, vals):
        """Privatize each value with the geometric mechanism and return the noisy list."""
        enable_features('floating-point', 'contrib')
        meas = make_base_geometric(self.scale)
        return [meas(int(v)) for v in vals]
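Since make_base_geometric builds a measurement over a single integer, release applies it element-wise with fresh noise per value; repeated calls over the same data compose, so the spent budget adds up across releases. Hypothetical usage, assuming an object with .scale already set:

    noisy_counts = obj.release([10, 42, 7])  # e.g. [9, 44, 7]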