def get_preprocessor(self):
    """Preprocessor for DP Sum (float)"""
    if self.has_error():
        return None

    # Have we already assembled it?
    #
    if self.preprocessor is not None:
        # Yes!
        return self.preprocessor

    # Build the preprocessor!
    #
    preprocessor = (
        make_select_column(self.col_index, TOA=str)
        >> make_cast(TIA=str, TOA=float)
        >> make_impute_constant(constant=self.fixed_value)
        >> make_clamp(bounds=self.get_bounds())
        >> make_bounded_resize(
            size=self.dataset_size,
            bounds=self.get_bounds(),
            constant=self.fixed_value)
        >> make_sized_bounded_sum(
            size=self.dataset_size,
            bounds=self.get_bounds()))

    self.scale = binary_search(
        lambda s: self.check_scale(s, preprocessor, 1, self.epsilon),
        bounds=(0.0, 1000.0))

    preprocessor = preprocessor >> make_base_laplace(self.scale)

    # keep a pointer to the preprocessor to re-use for .run_chain(...)
    self.preprocessor = preprocessor

    return preprocessor
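# --- Hedged usage sketch (not part of the original source). A minimal,
# standalone version of the clamp -> resize -> sum -> Laplace pattern that
# get_preprocessor() assembles, assuming the legacy OpenDP 0.x API used
# above; the bounds, size, and epsilon values are illustrative only.
from opendp.transformations import (
    make_clamp, make_bounded_resize, make_sized_bounded_sum)
from opendp.measurements import make_base_laplace
from opendp.mod import enable_features, binary_search_param

enable_features('contrib', 'floating-point')

bounds = (0.0, 100.0)   # assumed clamping bounds
size = 100              # assumed dataset size
epsilon = 1.0           # assumed privacy budget

sum_chain = (
    make_clamp(bounds=bounds)
    >> make_bounded_resize(size=size, bounds=bounds, constant=0.0)
    >> make_sized_bounded_sum(size=size, bounds=bounds))

# Find a Laplace scale that satisfies epsilon for one added/removed row,
# then chain the noise onto the sum and invoke it on raw data.
scale = binary_search_param(
    lambda s: sum_chain >> make_base_laplace(scale=s),
    d_in=1, d_out=epsilon)
dp_sum = (sum_chain >> make_base_laplace(scale=scale))([3.5, 42.0, 17.2])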
def _compute_noise_scale(self):
    if self.scale is not None:
        return

    lower = self.lower
    upper = self.upper
    max_contrib = self.max_contrib

    # should probably just check and throw if not int
    bounds = (int(lower), int(upper))

    enable_features('floating-point', 'contrib')

    bounded_sum = (
        make_clamp(bounds=bounds) >>
        make_bounded_sum(bounds=bounds))

    discovered_scale = binary_search_param(
        lambda s: bounded_sum >> make_base_geometric(scale=s),
        d_in=max_contrib,
        d_out=(self.epsilon))

    self.scale = discovered_scale
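# --- Hedged sketch (not part of the original source): the same geometric
# scale search as _compute_noise_scale above, run standalone with made-up
# integer bounds, assuming the legacy OpenDP 0.x API.
from opendp.transformations import make_clamp, make_bounded_sum
from opendp.measurements import make_base_geometric
from opendp.mod import enable_features, binary_search_param

enable_features('floating-point', 'contrib')

bounds = (0, 10)    # assumed integer clamping bounds
max_contrib = 1     # assumed max rows contributed by one individual
epsilon = 0.5       # assumed budget for this query

bounded_sum = make_clamp(bounds=bounds) >> make_bounded_sum(bounds=bounds)

# Smallest geometric noise scale that makes the chain epsilon-DP when one
# individual can add or remove at most max_contrib rows.
discovered_scale = binary_search_param(
    lambda s: bounded_sum >> make_base_geometric(scale=s),
    d_in=max_contrib, d_out=epsilon)
print(discovered_scale)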
def _compute_noise_scale(self):
    if self.scale is not None:
        return

    lower = self.lower
    upper = self.upper
    max_contrib = self.max_contrib

    bounds = (float(lower), float(upper))
    sensitivity = upper - lower

    if sensitivity > 1000:
        self.scale = (float(sensitivity) * max_contrib) / self.epsilon
    else:
        enable_features('floating-point', 'contrib')

        bounded_sum = (
            make_clamp(bounds=bounds) >>
            make_bounded_sum(bounds=bounds))

        discovered_scale = binary_search_param(
            lambda s: bounded_sum >> make_base_laplace(scale=s),
            d_in=max_contrib,
            d_out=(self.epsilon))

        self.scale = discovered_scale
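# --- Hedged sketch (not part of the original source): the wide-bounds branch
# above in isolation, with made-up numbers. For very wide bounds the code
# skips the parameter search and applies the standard Laplace calibration
# scale = sensitivity / epsilon, with sensitivity taken here as
# (upper - lower) * max_contrib.
lower, upper = 0.0, 100_000.0   # assumed wide column bounds
max_contrib = 2                 # assumed per-individual contribution
epsilon = 1.0                   # assumed budget

sensitivity = upper - lower                           # 100000.0
scale = (float(sensitivity) * max_contrib) / epsilon  # 200000.0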
def __init__(self, data, output_info, log_frequency, *ignore,
             per_column_epsilon=None, discrete_column_category_prob=None,
             **kwargs):
    self._data = data

    self._per_column_epsilon = per_column_epsilon
    if per_column_epsilon:
        if per_column_epsilon <= 0.0:
            raise ValueError("per_column_epsilon must be positive")
        bounds = (0, 1)
        max_contrib = 1
        enable_features('contrib')
        bounded_sum = (
            make_clamp(bounds=bounds) >>
            make_bounded_sum(bounds=bounds))
        discovered_scale = binary_search_param(
            lambda s: bounded_sum >> make_base_geometric(scale=s),
            d_in=max_contrib,
            d_out=(self._per_column_epsilon))
        self._per_column_scale = discovered_scale
    else:
        self._per_column_scale = None
        if discrete_column_category_prob is None:
            warnings.warn(
                "per_column_epsilon is not set, and no cached probabilities have been provided. "
                "Sampler will not privatize frequencies, which may cause privacy leaks."
            )

    def is_discrete_column(column_info):
        return (len(column_info) == 1
                and column_info[0].activation_fn == "softmax")

    n_discrete_columns = sum([
        1 for column_info in output_info if is_discrete_column(column_info)
    ])

    self._discrete_column_matrix_st = np.zeros(n_discrete_columns, dtype="int32")

    # Store the row id for each category in each discrete column.
    # For example, _rid_by_cat_cols[a][b] is a list of all rows whose
    # a-th discrete column equals value b.
    self._rid_by_cat_cols = []

    # Compute _rid_by_cat_cols
    st = 0
    for column_info in output_info:
        if is_discrete_column(column_info):
            span_info = column_info[0]
            ed = st + span_info.dim
            rid_by_cat = []
            for j in range(span_info.dim):
                rid_by_cat.append(np.nonzero(data[:, st + j])[0])
            self._rid_by_cat_cols.append(rid_by_cat)
            st = ed
        else:
            st += sum([span_info.dim for span_info in column_info])
    assert st == data.shape[1]

    # Prepare an interval matrix for efficiently sampling the conditional vector
    max_category = max([
        column_info[0].dim
        for column_info in output_info if is_discrete_column(column_info)
    ], default=0)

    self._discrete_column_cond_st = np.zeros(n_discrete_columns, dtype='int32')
    self._discrete_column_n_category = np.zeros(n_discrete_columns, dtype='int32')
    self._discrete_column_category_prob = np.zeros((n_discrete_columns, max_category))
    self._n_discrete_columns = n_discrete_columns
    self._n_categories = sum([
        column_info[0].dim
        for column_info in output_info if is_discrete_column(column_info)
    ])

    eps_tot = 0.0
    st = 0
    current_id = 0
    current_cond_st = 0
    for column_info in output_info:
        if is_discrete_column(column_info):
            span_info = column_info[0]
            ed = st + span_info.dim
            category_freq = np.sum(data[:, st:ed], axis=0)
            # Privatize the raw category counts with the geometric mechanism
            if self._per_column_scale:
                geom = make_base_geometric(self._per_column_scale)
                category_freq = [geom(int(v)) for v in category_freq]
                eps_tot += self._per_column_epsilon
                category_freq = [1 if v < 1 else v for v in category_freq]
                if np.sum(category_freq) < 100:
                    # not enough data; use uniform distribution
                    category_freq = [1 for _ in category_freq]
                category_freq = np.array(category_freq, dtype='float64')
            if log_frequency:
                category_freq = np.log(category_freq + 1)
            category_prob = category_freq / np.sum(category_freq)
            self._discrete_column_category_prob[current_id, :span_info.dim] = category_prob
            self._discrete_column_cond_st[current_id] = current_cond_st
            self._discrete_column_n_category[current_id] = span_info.dim
            current_cond_st += span_info.dim
            current_id += 1
            st = ed
        else:
            st += sum([span_info.dim for span_info in column_info])

    self.total_spent = eps_tot

    if discrete_column_category_prob is not None:
        assert len(discrete_column_category_prob) == n_discrete_columns
        for i in range(n_discrete_columns):
            self._discrete_column_category_prob[i, :] = discrete_column_category_prob[i]
        self.total_spent = 0.0  # don't have to pay for cached noise
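# --- Hedged sketch (not part of the original source): how the per-column
# geometric scale found in __init__ is applied to raw one-hot category
# counts, with made-up numbers, mirroring the privatization step above.
import numpy as np
from opendp.measurements import make_base_geometric
from opendp.mod import enable_features

enable_features('contrib')

per_column_scale = 2.0                      # assumed scale from binary_search_param
category_freq = np.array([120, 45, 3, 0])   # assumed raw category counts

geom = make_base_geometric(per_column_scale)
noisy = [geom(int(v)) for v in category_freq]   # add two-sided geometric noise
noisy = [1 if v < 1 else v for v in noisy]      # keep every category sampleable
category_prob = np.array(noisy, dtype='float64')
category_prob = category_prob / category_prob.sum()
print(category_prob)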