def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None, copy=True, check_input=True,
                     **unused_args):
    """Clip and centre the training data ahead of fitting a linear model.

    Parameters
    ----------
    X : array-like
        Feature matrix. Validated with `check_array` when `check_input` is True, otherwise it must already expose
        `.copy`, `.shape` and `.dtype`.
    y : array-like
        Targets; converted to an array of the same dtype as `X`.
    fit_intercept : bool
        When True, `X` and `y` are clipped to their bounds and centred on differentially private means. When False,
        the data is returned uncentred and the offsets are zeros.
    epsilon : float, default: 1.0
        Privacy parameter handed to each of the two `mean` calls.
    bounds_X, bounds_y : tuple, optional
        Bounds of the features and the targets, validated with `check_bounds` (only used when `fit_intercept`).
    copy : bool, default: True
        Whether `X` is copied before being modified.
    check_input : bool, default: True
        Whether `X` is validated with `check_array`.
    **unused_args
        Forwarded to `warn_unused_args`.

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
        The processed data, the offsets subtracted from it, and an all-ones scale vector with one entry per feature.
    """
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    n_features = X.shape[1]
    X_scale = np.ones(n_features, dtype=X.dtype)

    if not fit_intercept:
        # Nothing to centre: report zero offsets shaped like the would-be means.
        X_offset = np.zeros(n_features, dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_scale

    n_targets = y.shape[1] if y.ndim > 1 else 1
    bounds_X = check_bounds(bounds_X, n_features)
    bounds_y = check_bounds(bounds_y, n_targets)

    X = clip_to_bounds(X, bounds_X)
    y = clip_to_bounds(y, bounds_y)

    # Each mean is computed against its own freshly created accountant.
    X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon, accountant=BudgetAccountant())
    X -= X_offset
    y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon, accountant=BudgetAccountant())
    y = y - y_offset

    return X, y, X_offset, y_offset, X_scale
def test_incorrect_parameterisation(self):
    """clip_to_bounds must reject a list as the array, a list as the bounds, and string-valued bounds."""
    column_of_ones = np.ones((5, 1))
    bad_calls = (
        (TypeError, [1, 2, 3], (0, 5)),
        (TypeError, column_of_ones, [1, 2]),
        (Exception, column_of_ones, ("One", "Two")),
    )

    for expected_error, array, bounds in bad_calls:
        with self.assertRaises(expected_error):
            clip_to_bounds(array, bounds)
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=False, nan=False):
    """Return a noisy sum of `array`, optionally ignoring NaNs.

    Parameters
    ----------
    array : array_like
        Values to sum; flattened and clipped to `bounds` before summing.
    epsilon : float, default: 1.0
        Privacy parameter given to the mechanism and charged to the accountant.
    bounds : tuple, optional
        (min, max) of the values. When omitted, they are taken from the data and a `PrivacyLeakWarning` is issued.
    accountant : BudgetAccountant, optional
        Resolved through `BudgetAccountant.load_default`.
    axis, keepdims
        When `axis` is given or `keepdims` is truthy, the call is delegated to `_wrap_axis`.
    dtype : type, optional
        Passed to the NumPy reduction. A subclass of `Integral` selects `GeometricTruncated`; anything else
        (including None) selects `LaplaceTruncated`.
    nan : bool, default: False
        Use `np.nansum` instead of `np.sum`.

    Returns
    -------
    The randomised sum.
    """
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_sum, array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis,
                          dtype=dtype, keepdims=keepdims, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Work on a flat, clipped view of the data.
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        reducer = np.nansum
    else:
        reducer = np.sum
    true_sum = reducer(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    wants_integers = dtype is not None and issubclass(dtype, Integral)
    mechanism_cls = GeometricTruncated if wants_integers else LaplaceTruncated
    # The output range is the per-value range scaled by the number of values.
    mechanism = mechanism_cls(epsilon=epsilon, sensitivity=upper - lower, lower=lower * flat.size,
                              upper=upper * flat.size)
    noisy_sum = mechanism.randomise(true_sum)

    accountant.spend(epsilon, 0)

    return noisy_sum
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Return a noisy variance of `array`, optionally ignoring NaNs.

    Parameters
    ----------
    array : array_like
        Values whose variance is sought; flattened and clipped to `bounds` first.
    epsilon : float, default: 1.0
        Privacy parameter given to the mechanism and charged to the accountant.
    bounds : tuple, optional
        (min, max) of the values. When omitted, they are taken from the data and a `PrivacyLeakWarning` is issued.
    axis, keepdims
        When `axis` is given or `keepdims` is truthy, the call is delegated to `_wrap_axis`.
    dtype : type, optional
        Passed to `check_bounds` and to the NumPy reduction.
    accountant : BudgetAccountant, optional
        Resolved through `BudgetAccountant.load_default`.
    nan : bool, default: False
        Use `np.nanvar` instead of `np.var`.

    Returns
    -------
    The randomised variance, capped at ``(upper - lower) ** 2``.
    """
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Work on a flat, clipped view of the data.
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        reducer = np.nanvar
    else:
        reducer = np.var
    true_var = reducer(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    n_values = flat.size
    sensitivity = ((upper - lower) / n_values)**2 * (n_values - 1)
    mechanism = LaplaceBoundedDomain(epsilon=epsilon, delta=0, sensitivity=sensitivity, lower=0,
                                     upper=float("inf"))

    # The mechanism has no upper limit of its own, so cap the result at the squared width of the bounds.
    noisy_var = np.minimum(mechanism.randomise(true_var), (upper - lower)**2)

    accountant.spend(epsilon, 0)

    return noisy_var
def test_iris(self):
    """Clipping iris to an upper bound between two samples' maxima caps one sample and leaves the other alone.

    The bound is the midpoint of the largest value in the first sample and the largest value in the second
    sample. After clipping, neither sample's maximum may have grown, and at least one of them must sit exactly
    on the bound.
    """
    from sklearn import datasets
    dataset = datasets.load_iris()

    X_train = dataset.data

    # Per-sample maxima (one value per row).
    maxes = np.max(X_train, axis=1)
    clip_max = (maxes[0] + maxes[1]) / 2

    X_clipped = clip_to_bounds(X_train, (np.min(X_train), clip_max))

    # Reduce along the same axis as `maxes`, so that index i refers to the same sample before and after clipping.
    clipped_maxes = np.max(X_clipped, axis=1)

    self.assertLessEqual(clipped_maxes[0], maxes[0])
    self.assertLessEqual(clipped_maxes[1], maxes[1])
    self.assertTrue(np.isclose(clipped_maxes[0], clip_max) or np.isclose(clipped_maxes[1], clip_max))
def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Return a noisy mean of `array`, optionally ignoring NaNs.

    Parameters
    ----------
    array : array_like
        Values whose mean is sought; flattened and clipped to `bounds` first.
    epsilon : float, default: 1.0
        Privacy parameter given to the mechanism and charged to the accountant.
    bounds : tuple, optional
        (min, max) of the values. When omitted, they are taken from the data and a `PrivacyLeakWarning` is issued.
    axis, keepdims
        When `axis` is given or `keepdims` is truthy, the call is delegated to `_wrap_axis`.
    dtype : type, optional
        Passed to `check_bounds` and to the NumPy reduction.
    accountant : BudgetAccountant, optional
        Resolved through `BudgetAccountant.load_default`.
    nan : bool, default: False
        Use `np.nanmean` instead of `np.mean`.

    Returns
    -------
    The randomised mean, produced by a `LaplaceTruncated` mechanism limited to ``[lower, upper]``.
    """
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_mean, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Work on a flat, clipped view of the data.
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        reducer = np.nanmean
    else:
        reducer = np.mean
    true_mean = reducer(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    mechanism = LaplaceTruncated(epsilon=epsilon, delta=0, sensitivity=(upper - lower) / flat.size, lower=lower,
                                 upper=upper)
    noisy_mean = mechanism.randomise(true_mean)

    accountant.spend(epsilon, 0)

    return noisy_mean
def fit(self, X, y=None, sample_weight=None):
    """Computes k-means clustering with differential privacy.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : ignored
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self : class
        The fitted estimator, with `cluster_centers_`, `labels_`, `inertia_` and `n_iter_` set.

    Raises
    ------
    ValueError
        If there are fewer samples than `n_clusters`.

    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    del y

    X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
    n_samples, n_dims = X.shape

    if n_samples < self.n_clusters:
        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters))

    iters = self._calc_iters(n_dims, n_samples)

    if self.bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
    X = clip_to_bounds(X, self.bounds)

    centers = self._init_centers(n_dims)
    labels = None
    distances = None

    # The range starts at -1 so that the first pass only assigns labels to the initial centers. Every later
    # pass updates the centers first and then re-assigns, so the final `labels` always match the final
    # `centers` (convergence is unlikely, so the loop is not expected to stop early).
    for _ in range(-1, iters):
        have_assignment = labels is not None
        if have_assignment:
            centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

        distances, labels = self._distances_labels(X, centers)

    # Inertia: sum, over samples, of the distance entry for the cluster each sample was assigned to.
    sample_index = np.arange(len(labels))
    assigned_distances = distances[sample_index, labels]

    self.cluster_centers_ = centers
    self.labels_ = labels
    self.inertia_ = assigned_distances.sum()
    self.n_iter_ = iters

    self.accountant.spend(self.epsilon, 0)

    return self
def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        it is flattened and one output is returned per quantile.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon
        split evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).  When omitted, they are taken from the data and
        a ``PrivacyLeakWarning`` is issued.

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  The default, axis=None, computes the quantile over the
        flattened array.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.  If the clipped data contains NaN, ``np.nan`` is
        returned and no budget is charged to the accountant.

    Raises
    ------
    ValueError
        If any requested quantile lies outside [0, 1].

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    n_quantiles = len(quant)
    if n_quantiles > 1:
        # One recursive call per quantile, each with an equal share of epsilon.
        results = []
        for q_i in quant:
            results.append(quantile(array, q_i, epsilon=epsilon / n_quantiles, bounds=bounds, axis=axis,
                                    keepdims=keepdims, accountant=accountant))
        return np.array(results)

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis,
                          keepdims=keepdims, accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Flatten and clip, then add both bounds as end points of the sorted values.
    array = clip_to_bounds(np.ravel(array), bounds)
    k = array.size
    array = np.append(array, list(bounds))
    array.sort()

    # Width of each gap between consecutive sorted values; used as the measure of each candidate interval.
    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    # Interval i scores higher the closer its rank i is to the target rank quant * k.
    ranks = np.arange(0, k + 1)
    utility = list(-np.abs(ranks - quant * k))

    mech = Exponential(epsilon=epsilon, sensitivity=1, utility=utility, measure=list(interval_sizes))
    idx = mech.randomise()

    # Draw a point uniformly from the selected interval, using the mechanism's own random generator.
    interval_start = array[idx]
    output = mech._rng.random() * (array[idx + 1] - interval_start) + interval_start

    accountant.spend(epsilon, 0)

    return output
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """Update the per-class means, variances, counts and priors with one batch of data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data; validated with `check_X_y` and clipped to `self.bounds`.
    y : array-like, shape (n_samples,)
        Target labels. Every label must already be among `self.classes_`.
    classes : array-like, optional
        Passed to `_check_partial_fit_first_call`.
    _refit : bool, default: False
        When True, `self.classes_` is reset to None before the first-call check.
    sample_weight : ignored
        Triggers `warn_unused_args` when given.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the priors are invalid, the number of features differs from earlier data, or `y` contains a label
        that is not in the initial classes.
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y)

    if self.bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, shape=X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    self.epsilon_ = self.var_smoothing

    if _refit:
        self.classes_ = None

    first_call = _check_partial_fit_first_call(self, classes)
    if first_call:
        # Allocate the per-class statistics.
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_count_ = np.zeros(n_classes, dtype=np.float64)

        if self.priors is None:
            # Initialize the priors to zeros for each class
            self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64)
        else:
            priors = np.asarray(self.priors)
            if len(priors) != n_classes:
                raise ValueError(
                    "Number of priors must match number of classes.")
            if not np.isclose(priors.sum(), 1.0):
                raise ValueError("The sum of the priors should be 1.")
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError(
                "Number of features %d does not match previous data %d."
                % (X.shape[1], self.theta_.shape[1]))
        # Strip the smoothing term that was added at the end of the previous call; it is re-added below.
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = np.unique(y)
    known_label_mask = np.in1d(unique_y, classes)

    if not np.all(known_label_mask):
        raise ValueError(
            "The target label(s) %s in y do not exist in the initial classes %s"
            % (unique_y[~known_label_mask], classes))

    noisy_class_counts = self._noisy_class_counts(y)

    for position, label in enumerate(unique_y):
        # `position` indexes the noisy counts; `row` is the label's row in the stored statistics.
        row = classes.searchsorted(label)
        class_samples = X[y == label, :]
        noisy_count = noisy_class_counts[position]

        updated_theta, updated_sigma = self._update_mean_variance(
            self.class_count_[row], self.theta_[row, :], self.sigma_[row, :], class_samples, n_noisy=noisy_count)

        self.theta_[row, :] = updated_theta
        self.sigma_[row, :] = updated_sigma
        self.class_count_[row] += noisy_count

    self.sigma_[:, :] += self.epsilon_

    # Priors are only re-estimated from the class counts when the user did not supply any.
    if self.priors is None:
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    self.accountant.spend(self.epsilon, 0)

    return self
def partial_fit(self, X, y=None, sample_weight=None):
    """Online computation of mean and std with differential privacy on X for later scaling.

    All of X is processed as a single batch. This is intended for cases when `fit` is not feasible due to very
    large number of `n_samples` or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub,
    and Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247.

    Parameters
    ----------
    X : {array-like}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation used for later scaling along the features axis.

    y
        Ignored

    sample_weight
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self

    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # When the variance is computed as well, the mean and the variance each get half of the budget.
    if self.with_std:
        epsilon_0 = self.epsilon / 2
    else:
        epsilon_0 = self.epsilon

    X = check_array(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite='allow-nan')
    # Hotfix for sklearn v 0.23
    self.n_features_in_ = X.shape[1]

    if self.bounds is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    # Even in the case of `with_mean=False`, the mean is updated anyway, because the incremental computation
    # of the variance needs it.

    # `n_samples_seen_` must be an array of shape (n_features,) for the incremental update: create it on the
    # first call, and expand it when an earlier call reduced it to a single integer (i.e. no missing values).
    if not hasattr(self, 'n_samples_seen_'):
        self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)
    elif isinstance(self.n_samples_seen_, (int, np.integer)):
        self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]).astype(np.int64)

    # First pass
    if not hasattr(self, 'scale_'):
        self.mean_ = .0
        self.var_ = .0 if self.with_std else None

    nothing_to_estimate = not self.with_mean and not self.with_std
    if nothing_to_estimate:
        self.mean_ = None
        self.var_ = None
        # Only count the non-NaN entries of each feature.
        self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
    else:
        self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
            X, epsilon_0, self.bounds, self.mean_, self.var_, self.n_samples_seen_)

    # for backward-compatibility, reduce n_samples_seen_ to an integer
    # if the number of samples is the same for each feature (i.e. no
    # missing values)
    if np.ptp(self.n_samples_seen_) == 0:
        self.n_samples_seen_ = self.n_samples_seen_[0]

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    self.accountant.spend(self.epsilon, 0)

    return self
def test_different_bounds(self):
    """Per-column upper bounds are applied to their own column of an all-ones array."""
    data = np.ones((10, 2))
    lower, upper = [0, 0], [0.5, 1]

    clipped = clip_to_bounds(data, (lower, upper))
    first_column = clipped[:, 0]
    second_column = clipped[:, 1]

    # Column 0 is capped at 0.5; column 1 has an upper bound of 1, so its ones are unchanged.
    self.assertTrue(np.all(first_column == 0.5))
    self.assertTrue(np.all(second_column == 1))