def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Return a differentially private mean of ``array``.

    When ``axis`` is given or ``keepdims`` is truthy, the work is handed to
    ``_wrap_axis``, which is given this function as its first argument.
    Otherwise the array is flattened, clipped to ``bounds``, averaged, and the
    average is randomised with a ``LaplaceTruncated`` mechanism whose
    sensitivity is ``(upper - lower) / n`` (``n`` = number of flattened
    entries).

    Parameters
    ----------
    array : array_like
        Values to average.
    epsilon : float, default: 1.0
        Privacy parameter passed to the mechanism and to the accountant.
    bounds : tuple, optional
        ``(min, max)`` of the data.  When omitted, the bounds are taken from
        ``array`` itself and a ``PrivacyLeakWarning`` is issued.
    axis : optional
        Forwarded to ``_wrap_axis`` when not ``None``.
    dtype : optional
        Forwarded to ``check_bounds`` and to the NumPy mean function.
    keepdims : bool, default: False
        Forwarded to ``_wrap_axis`` when truthy.
    accountant : optional
        Handed to ``BudgetAccountant.load_default``; the resulting accountant
        is checked before, and charged after, the randomisation.
    nan : bool, default: False
        Use ``np.nanmean`` instead of ``np.mean``.

    Returns
    -------
    The value produced by ``LaplaceTruncated.randomise``, or whatever
    ``_wrap_axis`` returns on the delegated path.
    """
    if bounds is None:
        leak_message = ("Bounds have not been specified and will be calculated on the data provided. This will "
                        "result in additional privacy leakage. To ensure differential privacy and no additional "
                        "privacy leakage, specify bounds for each dimension.")
        warnings.warn(leak_message, PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    # Anything other than a plain "mean of everything" is delegated.
    if not (axis is None and not keepdims):
        return _wrap_axis(_mean, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype,
                          keepdims=keepdims, accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    # Make sure the budget allows this query before touching the data.
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    clipped = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        averager = np.nanmean
    else:
        averager = np.mean
    true_mean = averager(clipped, axis=axis, dtype=dtype, keepdims=keepdims)

    mechanism = LaplaceTruncated(
        epsilon=epsilon,
        delta=0,
        sensitivity=(upper - lower) / clipped.size,
        lower=lower,
        upper=upper,
    )
    noisy_mean = mechanism.randomise(true_mean)

    # Only charge the budget once a result has actually been produced.
    accountant.spend(epsilon, 0)

    return noisy_mean
def dp_contingency_table(data, epsilon):
    """Compute differentially private contingency table of input data.

    The exact table is built with ``contingency_table``; each of its cells is
    then randomised independently with a ``LaplaceTruncated`` mechanism
    bounded to ``[0, maxsize]`` and rounded up.  The result is returned as a
    ``Factor`` carrying the states of the exact table.
    """
    exact_table = contingency_table(data)

    # Removing one record from the data lowers exactly one cell count by 1 and
    # leaves every other cell unchanged, so the sensitivity is 1.
    mechanism = LaplaceTruncated(epsilon=epsilon, lower=0, upper=maxsize, sensitivity=1)

    exact_counts = exact_table.values.flatten()
    noisy_counts = np.zeros_like(exact_counts)

    for position, count in enumerate(exact_counts):
        # Round upwards so that bins whose noisy count lands in [0, 1] are preserved.
        noisy_counts[position] = np.ceil(mechanism.randomise(count))

    return Factor(noisy_counts, states=exact_table.states)
def dp_joint_distribution(data, epsilon):
    """Compute differentially private joint distribution of input data.

    The exact distribution comes from ``joint_distribution``; each of its
    cells is randomised independently with a ``LaplaceTruncated`` mechanism
    bounded to ``[0, maxsize]``.  The noisy cells are passed through
    ``_normalize_distribution`` and returned as a ``JPT`` carrying the states
    of the exact distribution.
    """
    exact_distribution = joint_distribution(data)

    # Removing one record lowers the probability of one cell by 1/n and raises
    # the probability of the remaining cells by 1/n, hence 2/n.
    n_records = data.shape[0]
    mechanism = LaplaceTruncated(epsilon=epsilon, lower=0, upper=maxsize, sensitivity=2 / n_records)

    exact_probs = exact_distribution.values.flatten()
    noisy_probs = np.zeros_like(exact_probs)

    for position, prob in enumerate(exact_probs):
        noisy_probs[position] = mechanism.randomise(prob)

    normalized = _normalize_distribution(noisy_probs)
    return JPT(normalized, states=exact_distribution.states)
def dp_marginal_distribution(data, epsilon):
    """Compute differentially private marginal distribution of input data.

    The exact marginal comes from ``marginal_distribution``; each entry along
    its first axis is randomised independently with a ``LaplaceTruncated``
    mechanism bounded to ``[0, maxsize]``.  The noisy entries are passed
    through ``_normalize_distribution`` and returned as a ``Factor`` carrying
    the states of the exact marginal.
    """
    marginal_ = marginal_distribution(data)

    # Removing one record lowers the probability of one cell by 1/n and raises
    # the probability of the remaining cells by 1/n, hence 2/n.
    sensitivity = 2 / data.shape[0]
    dp_mech = LaplaceTruncated(epsilon=epsilon, lower=0, upper=maxsize, sensitivity=sensitivity)

    exact_values = marginal_.values
    dp_marginal = np.zeros_like(exact_values)

    # Probabilities are perturbed as-is (no rounding); renormalisation below
    # turns the noisy values back into a distribution.
    for i, exact_value in enumerate(exact_values):
        dp_marginal[i] = dp_mech.randomise(exact_value)

    dp_marginal = _normalize_distribution(dp_marginal)
    return Factor(dp_marginal, states=marginal_.states)
def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    """Return a differentially private mean of ``array`` along ``axis``.

    The data is clipped to ``bounds``, averaged with ``np.mean`` (or
    ``np.nanmean`` when ``nan`` is true), and every entry of the result is
    randomised with its own ``LaplaceTruncated`` mechanism.  The privacy
    budget is checked up front and charged exactly once per call.

    Parameters
    ----------
    array : array_like
        Values to average.
    epsilon : float, default: 1.0
        Privacy parameter used for every mechanism and for the accountant.
    bounds : tuple, optional
        ``(min, max)`` of the data.  When omitted, the bounds are computed
        from ``array`` and a ``PrivacyLeakWarning`` is issued.
    axis, dtype, keepdims : optional
        Forwarded to the NumPy reduction functions.
    accountant : optional
        Handed to ``BudgetAccountant.load_default``.
    nan : bool, default: False
        Use ``np.nanmean`` instead of ``np.mean``.

    Returns
    -------
    An array shaped like the non-private mean when that mean is an
    ``np.ndarray``; otherwise the single value returned by the mechanism.
    """
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if nan:
        reducer = np.nanmean
    else:
        reducer = np.mean

    # Run the reduction on dummy data to learn the shape of the output, and on
    # ones to learn how many entries contribute to each output cell.
    template = reducer(np.zeros_like(array), axis=axis, keepdims=keepdims)
    is_vector = np.ndim(template) == 1
    n_datapoints = np.sum(np.ones_like(array), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)

        if np.ndim(template) > 1:
            # Higher-dimensional output: fall back to one global pair of bounds.
            bounds = (np.min(array), np.max(array))
        else:
            bounds = (np.min(array, axis=axis, keepdims=keepdims),
                      np.max(array, axis=axis, keepdims=keepdims))

    if is_vector:
        n_bounds = template.shape[0]
    else:
        n_bounds = 1
    lower, upper = check_bounds(bounds, n_bounds, dtype=dtype or float)

    clipped = np.clip(array, lower, upper)
    true_mean = reducer(clipped, axis=axis, dtype=dtype, keepdims=keepdims)

    def _build_mechanism(low, high):
        # Sensitivity of a mean over n_datapoints values confined to [low, high].
        return LaplaceTruncated().set_epsilon(epsilon).set_sensitivity((high - low) / n_datapoints).\
            set_bounds(low, high)

    if not isinstance(true_mean, np.ndarray):
        mechanism = _build_mechanism(lower[0], upper[0])
        accountant.spend(epsilon, 0)
        return mechanism.randomise(true_mean)

    noisy_mean = np.zeros_like(true_mean)
    cursor = np.nditer(true_mean, flags=['multi_index'])

    for _ in cursor:
        idx = cursor.multi_index

        # Per-cell bounds only exist for vector outputs; otherwise share the
        # single pair of bounds across all cells.
        if is_vector:
            cell_lower, cell_upper = lower[idx], upper[idx]
        else:
            cell_lower, cell_upper = lower[0], upper[0]

        noisy_mean[idx] = _build_mechanism(cell_lower, cell_upper).randomise(true_mean[idx])

    accountant.spend(epsilon, 0)

    return noisy_mean
def setup_method(self, method):
    """Prepare a fresh ``LaplaceTruncated`` mechanism for the test about to run.

    Tests whose name ends in ``"prob"`` additionally get the global seed
    fixed to 314159 before the mechanism is created.
    """
    test_name = method.__name__
    if test_name.endswith("prob"):
        global_seed(314159)

    self.mech = LaplaceTruncated()
class TestLaplaceTruncated(TestCase):
    # Test-suite for the LaplaceTruncated mechanism: parameter validation,
    # type checking, and a few seeded statistical checks (the "*_prob" tests).

    def setup_method(self, method):
        # Fix the global seed for the statistical tests only.
        test_name = method.__name__
        if test_name.endswith("prob"):
            global_seed(314159)

        self.mech = LaplaceTruncated()

    def teardown_method(self, method):
        del self.mech

    def test_not_none(self):
        self.assertIsNotNone(self.mech)

    def test_class(self):
        from diffprivlib.mechanisms import DPMechanism

        is_mechanism = issubclass(LaplaceTruncated, DPMechanism)
        self.assertTrue(is_mechanism)

    # --- missing-parameter checks -------------------------------------------

    def test_no_params(self):
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_sensitivity(self):
        self.mech.set_epsilon(1).set_bounds(0, 1)

        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_epsilon(self):
        self.mech.set_sensitivity(1).set_bounds(0, 1)

        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_inf_epsilon(self):
        self.mech.set_sensitivity(1).set_epsilon(float("inf")).set_bounds(0, 1)

        # With infinite epsilon the input must come back unchanged every time.
        for _ in range(1000):
            self.assertEqual(self.mech.randomise(0.5), 0.5)

    # --- type checks --------------------------------------------------------

    def test_complex_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon(1 + 2j)

    def test_string_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon("Two")

    def test_no_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1)

        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_non_numeric(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)

        with self.assertRaises(TypeError):
            self.mech.randomise("Hello")

    # --- statistical checks -------------------------------------------------

    def test_zero_median_prob(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)

        samples = [self.mech.randomise(0.5) for _ in range(10000)]
        median = float(np.median(samples))

        self.assertAlmostEqual(np.abs(median), 0.5, delta=0.1)

    def test_neighbors_prob(self):
        epsilon = 1
        runs = 10000
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)

        # Count how often each neighbouring input lands in the lower half.
        # The two inputs are sampled alternately, one pair per run.
        low_from_zero = 0
        low_from_one = 0
        for _ in range(runs):
            if self.mech.randomise(0) <= 0.5:
                low_from_zero += 1

            if self.mech.randomise(1) <= 0.5:
                low_from_one += 1

        self.assertGreater(low_from_zero, low_from_one)
        self.assertLessEqual(low_from_zero / runs, np.exp(epsilon) * low_from_one / runs + 0.1)

    def test_within_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)

        samples = np.array([self.mech.randomise(0.5) for _ in range(1000)])

        self.assertTrue(np.all(samples >= 0))
        self.assertTrue(np.all(samples <= 1))
def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None, n_noisy=None):
    """Compute a differentially private online update of Gaussian mean and variance.

    Starting from a sample count, mean and variance, and given a new batch of
    points ``X``, return the mean and variance of the combined set.  Every
    column of ``X`` is treated independently (variance, not covariance), and
    the per-column mean and variance of the new batch are randomised before
    being merged with the previous statistics.

    The merge follows Stanford CS tech report STAN-CS-79-773 by Chan, Golub,
    and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

    Parameters
    ----------
    n_past : int
        Number of samples represented in the old mean and variance.

    mu : array-like, shape (number of Gaussians,)
        Means for Gaussians in the original set.

    var : array-like, shape (number of Gaussians,)
        Variances for Gaussians in the original set.

    X : array-like, shape (n_samples, n_features)
        New batch of points; indexed as ``X[:, feature]``.

    sample_weight : ignored
        Ignored in diffprivlib; a warning is raised through
        ``warn_unused_args`` when it is supplied.

    n_noisy : int, optional
        Noisy count of the given class, satisfying differential privacy.
        When omitted, ``X.shape[0]`` is used and a ``PrivacyLeakWarning`` is
        issued.

    Returns
    -------
    total_mu : array-like, shape (number of Gaussians,)
        Updated mean for each Gaussian over the combined set.

    total_var : array-like, shape (number of Gaussians,)
        Updated variance for each Gaussian over the combined set.
    """
    if n_noisy is None:
        warnings.warn(
            "Noisy class count has not been specified and will be read from the data. To use this "
            "method correctly, make sure it is run by the parent GaussianNB class.",
            PrivacyLeakWarning)
        n_noisy = X.shape[0]

    # Nothing new to fold in: hand back the previous statistics untouched.
    if not n_noisy:
        return mu, var

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # The budget is split evenly across features, with a third of the total
    # going to each of the mean and the variance.
    n_features = X.shape[1]
    local_epsilon = self.epsilon / 3 / n_features

    new_mu = np.zeros((n_features, ))
    new_var = np.zeros((n_features, ))

    for feature in range(n_features):
        column = X[:, feature]
        lower, upper = self.bounds[0][feature], self.bounds[1][feature]
        local_diameter = upper - lower

        # Randomise the column sum, then scale down to a mean.
        mean_mechanism = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter,
                                          lower=lower * n_noisy, upper=upper * n_noisy)
        noisy_mean = mean_mechanism.randomise(column.sum()) / n_noisy
        new_mu[feature] = noisy_mean

        # Largest squared deviation any point inside the bounds can have from the noisy mean.
        local_sq_sens = max(noisy_mean - lower, upper - noisy_mean)**2

        # Randomise the sum of squared deviations, then scale down to a variance.
        var_mechanism = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens,
                                             lower=0, upper=local_sq_sens * n_noisy)
        squared_deviations = ((column - noisy_mean)**2).sum()
        new_var[feature] = var_mechanism.randomise(squared_deviations) / n_noisy

    if n_past == 0:
        return new_mu, new_var

    n_total = float(n_past + n_noisy)

    # Count-weighted combination of the old and new means.
    total_mu = (n_noisy * new_mu + n_past * mu) / n_total

    # Variances are combined through their sums of squared differences (ssd),
    # plus a correction term for the shift between the two means.
    old_ssd = n_past * var
    new_ssd = n_noisy * new_var
    mean_shift_term = (n_past / float(n_noisy * n_total)) * (n_noisy * mu - n_noisy * new_mu)**2
    total_ssd = old_ssd + new_ssd + mean_shift_term
    total_var = total_ssd / n_total

    return total_mu, total_var
def execute(self):
    r"""Apply the configured randomisation to every registered column of ``self.sdf``.

    For each entry of ``self.__columns``:

    * ``'numeric'`` columns are cast to ``DoubleType`` and passed, cell by
      cell, through a ``LaplaceTruncated`` mechanism configured from the
      entry's ``epsilon``, ``delta``, ``sensitivity``, ``lower_bound`` and
      ``upper_bound``.  When the entry has a ``'round'`` key, the randomised
      value is rounded to that many digits.
    * ``'boolean'`` columns are cast to ``StringType`` and passed, cell by
      cell, through a ``Binary`` mechanism configured from the entry's
      ``epsilon``, ``delta``, ``label1`` and ``label2``.
    * Entries with any other category are left untouched.

    ``None`` cells are passed through as ``None``.  The result replaces
    ``self.sdf``; nothing is returned.

    Raises:
        ValueError: if self.__columns is empty

        inner methods may raise Exception, only if parameters have not been set correctly
    """
    # Fail fast, before any mechanism is constructed.
    if not self.__columns:
        raise ValueError('No columns added for execution')

    laplace: LaplaceTruncated = LaplaceTruncated()
    binary: Binary = Binary()

    for column_name, details in self.__columns.items():
        # Compare by value: identity (`is`) on string literals only works by
        # accident of interning and warns on Python 3.8+.
        category = details['category']

        if category == 'numeric':
            self.sdf = self.sdf.withColumn(colName=column_name,
                                           col=self.sdf[column_name].cast(DoubleType()))

            laplace.set_epsilon_delta(epsilon=details['epsilon'],
                                      delta=details['delta'])
            laplace.set_sensitivity(details['sensitivity'])
            laplace.set_bounds(lower=details['lower_bound'],
                               upper=details['upper_bound'])

            if 'round' in details:
                def round_randomise(cell):
                    return float(
                        round(laplace.randomise(cell), details['round'])
                    ) if cell is not None else None

                round_randomise_udf = udf(f=round_randomise,
                                          returnType=DoubleType())
                self.sdf = self.sdf.withColumn(
                    colName=column_name,
                    col=round_randomise_udf(column_name))
            else:
                def randomise(cell):
                    return float(laplace.randomise(
                        cell)) if cell is not None else None

                randomise_udf = udf(f=randomise, returnType=DoubleType())
                self.sdf = self.sdf.withColumn(
                    colName=column_name,
                    col=randomise_udf(column_name))

        elif category == 'boolean':
            self.sdf = self.sdf.withColumn(colName=column_name,
                                           col=self.sdf[column_name].cast(StringType()))

            binary.set_epsilon_delta(epsilon=details['epsilon'],
                                     delta=details['delta'])
            binary.set_labels(value0=details['label1'],
                              value1=details['label2'])

            def randomise(cell):
                return binary.randomise(cell) if cell is not None else None

            randomise_udf = udf(f=randomise, returnType=StringType())
            self.sdf = self.sdf.withColumn(colName=column_name,
                                           col=randomise_udf(column_name))