def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
             warm_start=False, n_jobs=None, accountant=None, **unused_args):
    """Initialise the differentially private estimator.

    The sklearn parent is pinned to an l2-penalised, 'lbfgs', one-vs-rest configuration; only the parameters
    exposed in this signature are caller-tunable.  Unknown keyword arguments trigger a compatibility warning.
    """
    super().__init__(
        penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0,
        class_weight=None, random_state=None, solver='lbfgs', max_iter=max_iter, multi_class='ovr',
        verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)

    # Differential-privacy state layered on top of the sklearn estimator.
    self.epsilon = epsilon
    self.data_norm = data_norm
    self.classes_ = None
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """Delegate to the parent's partial fit, recording the batch size and inferring bounds if absent.

    Inferring bounds from the data leaks privacy, so doing it raises a :class:`.PrivacyLeakWarning`.
    """
    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Stash the size of this batch so the differential privacy applied downstream can scale correctly.
    self.new_n_samples = X.shape[0]

    if self.bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        lower, upper = np.min(X, axis=0), np.max(X, axis=0)
        self.bounds = list(zip(lower, upper))

    self.bounds = _check_bounds(self.bounds, X.shape[1])

    super()._partial_fit(X, y, classes, _refit, sample_weight=None)

    del self.new_n_samples
    return self
def percentile(array, percent, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""Compute the differentially private percentile of the array.

    Thin wrapper around :obj:`.quantile`, with ``quantile = percentile / 100``.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose percentile is sought.  If `array` is not an array, a conversion is
        attempted.
    percent : float or array-like
        Percentile or list of percentiles sought.  Each percentile must be in [0, 100].  If array-like,
        percentiles are returned over the flattened array.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`, split evenly between each output value.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    axis : None or int or tuple of ints, optional
        Axis or axes along which the percentile is computed.  The default, axis=None, uses the flattened array.
    keepdims : bool, default: False
        If True, the reduced axes are left in the result as dimensions with size one, so the result broadcasts
        correctly against the input array.
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the percentile values.

    See Also
    --------
    numpy.percentile : Equivalent non-private method.

    quantile, median

    """
    warn_unused_args(unused_args)

    fractions = np.asarray(percent) / 100

    # quantile expects values in [0, 1]; reject anything outside [0, 100] percent.
    if np.any((fractions < 0) | (fractions > 1)):
        raise ValueError("Percentiles must be between 0 and 100 inclusive")

    return quantile(array, fractions, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims,
                    accountant=accountant)
def fit(self, X, y=None, sample_weight=None):
    """Computes k-means clustering with differential privacy.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : Ignored
        Not used in diffprivlib, present here for consistency with :obj:`sklearn.cluster.KMeans`.  Specifying
        this parameter will result in a :class:`.DiffprivlibCompatibilityWarning`.

    Returns
    -------
    self : class

    """
    if sample_weight is not None:
        warn_unused_args("sample_weight")

    del y

    if X.ndim != 2:
        # Bug fix: the two concatenated fragments previously joined as "…reshape(-1, 1),or array…" with no
        # separating space; the message now reads correctly.
        raise ValueError(
            "Expected 2D array, got array with %d dimensions instead. Reshape your data using "
            "array.reshape(-1, 1), or array.reshape(1, -1) if your data contains only one sample." % X.ndim)

    n_samples, n_dims = X.shape

    iters = self._calc_iters(n_dims, n_samples)

    if self.bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
        self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

    self.bounds = _check_bounds(self.bounds, n_dims)

    centers = self._init_centers(n_dims)
    labels = None
    distances = None

    # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely.
    # range(-1, iters) gives one extra pass: the first iteration only assigns labels to the initial centers.
    for _ in range(-1, iters):
        if labels is not None:
            centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

        distances, labels = self._distances_labels(X, centers)

    self.cluster_centers_ = centers
    self.labels_ = labels
    self.inertia_ = distances[np.arange(len(labels)), labels].sum()
    self.n_iter_ = iters

    return self
def nanmean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None,
            **unused_args):
    r"""Compute the differentially private arithmetic mean along the specified axis, ignoring NaNs.

    Behaves like :obj:`numpy.mean` over the flattened array (or the given axis), with noise added using
    :class:`.Laplace` to satisfy differential privacy; sensitivity is calculated using `bounds`.  For all-NaN
    slices, NaN is returned and a `RuntimeWarning` is raised.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose mean is desired.  If `array` is not an array, a conversion is attempted.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    axis : int or tuple of ints, optional
        Axis or axes along which the means are computed.  The default is the flattened array.
    dtype : data-type, optional
        Type to use in computing the mean.  For integer inputs, the default is `float64`; for floating point
        inputs, it is the same as the input dtype.
    keepdims : bool, optional
        If True, the reduced axes are left in the result as dimensions with size one, so the result broadcasts
        correctly against the input array.  A non-default value is forwarded to the `mean` method of `ndarray`
        sub-classes.
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray, see dtype parameter above
        Returns a new array containing the mean values.

    See Also
    --------
    std, var, mean

    """
    warn_unused_args(unused_args)

    # Shared private implementation with NaN handling switched on.
    return _mean(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                 accountant=accountant, nan=True)
def std(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None,
        **unused_args):
    r"""Compute the standard deviation along the specified axis, with differential privacy.

    Computed over the flattened array by default, otherwise over the specified axis.  Noise is added using
    :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is calculated using
    `bounds`.  Behaviour closely follows :obj:`numpy.std`.

    Parameters
    ----------
    array : array_like
        Calculate the standard deviation of these values.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    axis : int or tuple of ints, optional
        Axis or axes along which the standard deviation is computed.  The default is the flattened array.
    dtype : dtype, optional
        Type to use in computing the standard deviation.  For arrays of integer type the default is float64,
        for arrays of float types it is the same as the array type.
    keepdims : bool, optional
        If True, the reduced axes are left in the result as dimensions with size one, so the result broadcasts
        correctly against the input array.  A non-default value is forwarded to the `std` method of `ndarray`
        sub-classes.
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    standard_deviation : ndarray, see dtype parameter above.
        Return a new array containing the standard deviation.

    See Also
    --------
    var, mean, nanstd

    """
    warn_unused_args(unused_args)

    # Shared private implementation; NaNs are not treated specially here (see nanstd).
    return _std(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                accountant=accountant, nan=False)
def nanvar(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, **unused_args):
    r"""Compute the differentially private variance along the specified axis, ignoring NaNs.

    Returns the variance of the array elements, a measure of the spread of a distribution, with differential
    privacy.  The variance is computed for the flattened array by default, otherwise over the specified axis.
    Noise is added using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is
    calculated using `bounds`.  Users are advised to consult the documentation of :obj:`numpy.var` for further
    details, as the behaviour of `var` closely follows its Numpy variant.

    For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose variance is desired.  If `array` is not an array, a conversion is
        attempted.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    axis : int or tuple of ints, optional
        Axis or axes along which the variance is computed.  The default is to compute the variance of the
        flattened array.  If this is a tuple of ints, a variance is performed over multiple axes, instead of a
        single axis or all the axes as before.
    dtype : data-type, optional
        Type to use in computing the variance.  For arrays of integer type the default is `float32`; for arrays
        of float types it is the same as the array type.
    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.
        With this option, the result will broadcast correctly against the input array.
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    variance : ndarray, see dtype parameter above
        Returns a new array containing the variance.

    See Also
    --------
    std, mean, var

    """
    # Docstring fixes only: "computer" -> "computed", removed stale "out=None" wording (there is no `out`
    # parameter), and tidied the See Also list.  Code is unchanged.
    warn_unused_args(unused_args)

    return _var(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                accountant=accountant, nan=True)
def nansum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue,
           **unused_args):
    r"""Sum of array elements over a given axis with differential privacy, ignoring NaNs.

    Parameters
    ----------
    array : array_like
        Elements to sum.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.
    axis : None or int or tuple of ints, optional
        Axis or axes along which a sum is performed.  The default, axis=None, sums all elements of the input
        array.  A negative axis counts from the last to the first axis; a tuple of ints sums over all the axes
        specified.
    dtype : dtype, optional
        The type of the returned array and of the accumulator in which the elements are summed.  The dtype of
        `array` is used by default unless `array` has an integer dtype of less precision than the default
        platform integer.
    keepdims : bool, optional
        If True, the reduced axes are left in the result as dimensions with size one, so the result broadcasts
        correctly against the input array.  A non-default value is forwarded to the `sum` method of `ndarray`
        sub-classes.

    Returns
    -------
    sum_along_axis : ndarray
        An array with the same shape as `array`, with the specified axis removed.  If `array` is a 0-d array,
        or if `axis` is None, a scalar is returned.

    See Also
    --------
    ndarray.sum : Equivalent non-private method.

    mean, sum

    """
    warn_unused_args(unused_args)

    # Shared private implementation with NaN handling switched on.
    return _sum(array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis, dtype=dtype,
                keepdims=keepdims, nan=True)
def __init__(self, epsilon=1.0, data_norm=None, range_X=None, range_y=None, fit_intercept=True, copy_X=True,
             **unused_args):
    """Initialise the differentially private linear regression.

    The sklearn parent is pinned to single-job, non-normalising mode; unknown keyword arguments trigger a
    compatibility warning.
    """
    super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)

    # Privacy parameter and data-domain metadata used when fitting.
    self.epsilon = epsilon
    self.data_norm = data_norm
    self.range_X = range_X
    self.range_y = range_y

    warn_unused_args(unused_args)
def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=None, bounds=None, copy=True,
             whiten=False, random_state=None, accountant=None, **unused_args):
    """Initialise the differentially private PCA.

    The sklearn parent is pinned to the full SVD solver; unknown keyword arguments trigger a compatibility
    warning.
    """
    super().__init__(
        n_components=n_components, copy=copy, whiten=whiten, svd_solver='full', tol=0.0,
        iterated_power='auto', random_state=random_state)

    # Privacy-specific state layered on top of the sklearn estimator.
    self.centered = centered
    self.epsilon = epsilon
    self.data_norm = data_norm
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)
def median(array, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""Compute the differentially private median of the array.

    The median is calculated over each axis, or the flattened array if an axis is not provided, by calling
    :obj:`.quantile` for the 0.5 quantile.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose median is sought.  If `array` is not an array, a conversion is attempted.
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`, split evenly between each output value.
    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).
    axis : None or int or tuple of ints, optional
        Axis or axes along which the median is computed.  The default, axis=None, uses the flattened array.
    keepdims : bool, default: False
        If True, the reduced axes are left in the result as dimensions with size one, so the result broadcasts
        correctly against the input array.
    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the median values.

    See Also
    --------
    numpy.median : Equivalent non-private method.

    quantile, percentile

    """
    warn_unused_args(unused_args)

    # The median is the 0.5 quantile.
    return quantile(array, 0.5, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims,
                    accountant=accountant)
def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, **unused_args):
    """Initialise the differentially private k-means estimator.

    Unknown keyword arguments trigger a compatibility warning.  Fitted attributes are reset to None.
    """
    super().__init__(n_clusters=n_clusters)

    self.epsilon = epsilon
    self.bounds = bounds

    warn_unused_args(unused_args)

    # Fitted-state attributes, populated by fit().
    self.cluster_centers_ = None
    self.bounds_processed = None
    self.labels_ = None
    self.inertia_ = None
    self.n_iter_ = None
def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
    """Initialise the differentially private k-means estimator.

    Unknown keyword arguments trigger a compatibility warning.  Fitted attributes are reset to None, and the
    estimator is forced to single-threaded operation.
    """
    super().__init__(n_clusters=n_clusters)

    self.epsilon = epsilon
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)

    # Fitted-state attributes, populated by fit().
    self.cluster_centers_ = None
    self.bounds_processed = None
    self.labels_ = None
    self.inertia_ = None
    self.n_iter_ = None
    self._n_threads = 1
def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None, copy=True, check_input=True,
                     **unused_args):
    """Centre ``X`` and ``y`` for regression using differentially private column means.

    When ``fit_intercept`` is True, the data is clipped to the given bounds and centred by DP means (each mean
    call is given a fresh :class:`BudgetAccountant`); otherwise zero offsets are returned.  No scaling is
    applied, so ``X_scale`` is all ones.  Returns ``X, y, X_offset, y_offset, X_scale``.
    """
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if not fit_intercept:
        # No centring requested: zero offsets of the appropriate shape and dtype.
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        y_offset = X.dtype.type(0) if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_scale

    bounds_X = check_bounds(bounds_X, X.shape[1])
    bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

    # Clip before computing DP means so the stated bounds hold on the data.
    X = clip_to_bounds(X, bounds_X)
    y = clip_to_bounds(y, bounds_y)

    X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon, accountant=BudgetAccountant())
    X -= X_offset

    y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon, accountant=BudgetAccountant())
    y = y - y_offset

    return X, y, X_offset, y_offset, X_scale
def __init__(self, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
             **unused_args):
    """Initialise the differentially private linear regression.

    The sklearn parent is pinned to single-job, non-normalising mode; unknown keyword arguments trigger a
    compatibility warning.
    """
    super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)

    self.epsilon = epsilon
    self.bounds_X = bounds_X
    self.bounds_y = bounds_y
    self.accountant = BudgetAccountant.load_default(accountant)

    # Removed a stray `self.__repr__()` call: its return value was discarded, making it a no-op.
    warn_unused_args(unused_args)
def fit(self, X, y, sample_weight=None):
    """Fit linear model with differential privacy.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data.

    y : array_like, shape (n_samples, n_targets)
        Target values.  Will be cast to X's dtype if necessary.

    sample_weight : ignored
        Ignored by diffprivlib.  Present for consistency with sklearn API.

    Returns
    -------
    self : returns an instance of self.
    """
    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Largest row 2-norm of the raw data; used to validate (or, leakily, set) the sensitivity bound.
    max_norm = np.linalg.norm(X, axis=1).max()

    if self.data_norm is None:
        warnings.warn("Data norm has not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = max_norm

    if max_norm > self.data_norm:
        # Data exceeds the declared norm bound, so the DP guarantee does not cover it as-is.
        warnings.warn("Differential privacy is only guaranteed for data whose rows have a 2-norm of at most %g. "
                      "Got %f\n"
                      "Translate and/or scale the data accordingly to ensure differential privacy is achieved."
                      % (self.data_norm, max_norm), PrivacyLeakWarning)

    if self.fit_intercept and (self.range_X is None or self.range_y is None):
        warnings.warn("Range parameters haven't been specified, so falling back to determining range from the "
                      "data.\n"
                      "This will result in additional privacy leakage. To ensure differential privacy with no "
                      "additional privacy loss, specify `range_X` and `range_y`.", PrivacyLeakWarning)

        # Floor the ranges at 1e-5 to avoid zero-width (degenerate) dimensions.
        if self.range_X is None:
            self.range_X = np.maximum(np.ptp(X, axis=0), 1e-5)
        if self.range_y is None:
            self.range_y = np.maximum(np.ptp(y, axis=0), 1e-5)

    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

    n_features = X.shape[1]

    # When fitting an intercept, 1/(n_features+1) of the budget goes to centring in _preprocess_data.
    epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(X, y, fit_intercept=self.fit_intercept,
                                                              range_X=self.range_X, range_y=self.range_y,
                                                              epsilon=self.epsilon * epsilon_intercept_scale,
                                                              copy=self.copy_X)

    # Stack [X | y] and perturb the Gram matrix A^T A with the Wishart mechanism (remaining budget),
    # scaled by the declared data norm.
    A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
    AtA = np.dot(A.T, A)

    mech = Wishart().set_epsilon(self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(self.data_norm)
    noisy_AtA = mech.randomise(AtA)

    # Extract the noisy X^T X and X^T y blocks and solve the normal equations by least squares.
    noisy_AtA = noisy_AtA[:n_features, :]
    XtX = noisy_AtA[:, :n_features]
    Xty = noisy_AtA[:, n_features:]

    self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(XtX, Xty, rcond=-1)
    self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)

    return self
def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None):
    """Compute online update of Gaussian mean and variance.

    Per-feature mean and variance of the new batch ``X`` are randomised via ``self._randomise`` (using
    ``self.new_n_samples``) before being merged with the running statistics, following the incremental
    formulae of Chan, Golub and LeVeque (Stanford CS tech report STAN-CS-79-773,
    http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf).  Each column of ``X`` is treated
    independently (variances, not covariances).

    Parameters
    ----------
    n_past : int
        Number of samples represented in old mean and variance.

    mu : array-like, shape (number of Gaussians,)
        Means for Gaussians in original set.

    var : array-like, shape (number of Gaussians,)
        Variances for Gaussians in original set.

    sample_weight : ignored
        Ignored in diffprivlib.

    Returns
    -------
    total_mu : array-like, shape (number of Gaussians,)
        Updated mean for each Gaussian over the combined set.

    total_var : array-like, shape (number of Gaussians,)
        Updated variance for each Gaussian over the combined set.
    """
    if X.shape[0] == 0:
        return mu, var

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    batch_size = X.shape[0]
    batch_mu = np.mean(X, axis=0)
    batch_var = np.var(X, axis=0)

    # Apply differential privacy to the batch statistics before merging them in.
    batch_mu, batch_var = self._randomise(batch_mu, batch_var, self.new_n_samples)

    if n_past == 0:
        # No prior statistics: the (noisy) batch statistics stand alone.
        return batch_mu, batch_var

    n_total = float(n_past + batch_size)

    # Weighted combination of old and new means.
    total_mu = (batch_size * batch_mu + n_past * mu) / n_total

    # Combine variances via sums of squared differences (ssd), per Chan et al.
    old_ssd = n_past * var
    new_ssd = batch_size * batch_var
    total_ssd = old_ssd + new_ssd \
        + (n_past / float(batch_size * n_total)) * (batch_size * mu - batch_size * batch_mu) ** 2
    total_var = total_ssd / n_total

    return total_mu, total_var
def fit(self, X, y, sample_weight=None):
    """Fit linear model with differential privacy.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data.

    y : array_like, shape (n_samples, n_targets)
        Target values.  Will be cast to X's dtype if necessary.

    sample_weight : ignored
        Ignored by diffprivlib.  Present for consistency with sklearn API.

    Returns
    -------
    self : returns an instance of self.
    """
    # Verify the budget is available up-front; it is only spent once fitting succeeds.
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

    if self.bounds_X is None or self.bounds_y is None:
        warnings.warn(
            "Bounds parameters haven't been specified, so falling back to determining bounds from the "
            "data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `bounds_X` and `bounds_y`.", PrivacyLeakWarning)

        if self.bounds_X is None:
            self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
        if self.bounds_y is None:
            self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

    self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
    self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

    n_features = X.shape[1]
    n_targets = y.shape[1] if y.ndim > 1 else 1

    # When fitting an intercept, 1/(n_features+1) of the budget goes to centring in _preprocess_data.
    epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, bounds_X=self.bounds_X, bounds_y=self.bounds_y,
        epsilon=self.epsilon * epsilon_intercept_scale, copy=self.copy_X)

    # Shift the bounds to match the centred data.
    bounds_X = (self.bounds_X[0] - X_offset, self.bounds_X[1] - X_offset)
    bounds_y = (self.bounds_y[0] - y_offset, self.bounds_y[1] - y_offset)

    # Build one perturbed least-squares objective per target using the remaining budget (no regularisation).
    objs, obj_coefs = _construct_regression_obj(
        X, y, bounds_X, bounds_y, epsilon=self.epsilon * (1 - epsilon_intercept_scale), alpha=0)
    coef = np.zeros((n_features, n_targets))
    residues = []

    # Minimise each objective independently; jac=True means each obj returns (value, gradient).
    for i, obj in enumerate(objs):
        opt_result = minimize(obj, np.zeros(n_features), jac=True)
        coef[:, i] = opt_result.x
        residues += [opt_result.fun]

    self.coef_ = coef.T
    self._residues = residues
    self._obj_coefs = obj_coefs

    if y.ndim == 1:
        # Single-target: flatten to the 1-d shapes sklearn expects.
        self.coef_ = np.ravel(self.coef_)
        self._residues = self._residues[0]
    self._set_intercept(X_offset, y_offset, X_scale)

    self.accountant.spend(self.epsilon, 0)

    return self
def partial_fit(self, X, y=None, sample_weight=None):
    """Online computation of mean and std with differential privacy on X for later scaling.

    All of X is processed as a single batch.  This is intended for cases when `fit` is not feasible due to
    very large number of `n_samples` or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub,
    and Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247:

    Parameters
    ----------
    X : {array-like}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation used for later scaling along the features
        axis.

    y
        Ignored

    sample_weight
        Ignored by diffprivlib.  Present for consistency with sklearn API.
    """
    # Verify the budget is available up-front; it is only spent once the update succeeds.
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Budget is halved when both mean and std are computed; otherwise the mean gets it all.
    epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

    X = check_array(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite='allow-nan')

    # Hotfix for sklearn v 0.23
    self.n_features_in_ = X.shape[1]

    if self.bounds is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    # Even in the case of `with_mean=False`, we update the mean anyway.  This is needed for the incremental
    # computation of the var.  See incr_mean_variance_axis and _incremental_mean_variance_axis.

    # If n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of
    # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis.
    if hasattr(self, 'n_samples_seen_') and isinstance(self.n_samples_seen_, (int, np.integer)):
        self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]).astype(np.int64)

    if not hasattr(self, 'n_samples_seen_'):
        self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)

    # First pass: initialise the running statistics.
    if not hasattr(self, 'scale_'):
        self.mean_ = .0
        if self.with_std:
            self.var_ = .0
        else:
            self.var_ = None

    if not self.with_mean and not self.with_std:
        # Nothing to estimate: only track the per-feature (non-NaN) sample counts.
        self.mean_ = None
        self.var_ = None
        self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
    else:
        self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
            X, epsilon_0, self.bounds, self.mean_, self.var_, self.n_samples_seen_)

    # for backward-compatibility, reduce n_samples_seen_ to an integer
    # if the number of samples is the same for each feature (i.e. no
    # missing values)
    if np.ptp(self.n_samples_seen_) == 0:
        self.n_samples_seen_ = self.n_samples_seen_[0]

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    self.accountant.spend(self.epsilon, 0)

    return self
def fit(self, X, y, sample_weight=None):
    """Fit linear model with differential privacy.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data.

    y : array_like, shape (n_samples, n_targets)
        Target values.  Will be cast to X's dtype if necessary.

    sample_weight : ignored
        Ignored by diffprivlib.  Present for consistency with sklearn API.

    Returns
    -------
    self : returns an instance of self.
    """
    # Verify the budget is available up-front; it is only spent once fitting succeeds.
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

    if self.fit_intercept:
        # Bounds are only needed for DP centring, i.e. when an intercept is fitted.
        if self.bounds_X is None or self.bounds_y is None:
            warnings.warn(
                "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                "data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `bounds_X` and `bounds_y`.", PrivacyLeakWarning)

            if self.bounds_X is None:
                self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
            if self.bounds_y is None:
                self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

        self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
        self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

    n_features = X.shape[1]

    # When fitting an intercept, 1/(n_features+1) of the budget goes to centring in _preprocess_data.
    epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, bounds_X=self.bounds_X, bounds_y=self.bounds_y,
        epsilon=self.epsilon * epsilon_intercept_scale, copy=self.copy_X)

    if self.data_norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    # Clip rows to the declared norm so the mechanism's sensitivity bound holds.
    X = clip_to_norm(X, self.data_norm)

    # Stack [X | y] and perturb the Gram matrix A^T A with the Wishart mechanism (remaining budget).
    A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
    AtA = np.dot(A.T, A)

    mech = Wishart().set_epsilon(
        self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(
        self.data_norm)
    noisy_AtA = mech.randomise(AtA)

    # Extract the noisy X^T X and X^T y blocks and solve the normal equations by least squares.
    noisy_AtA = noisy_AtA[:n_features, :]
    XtX = noisy_AtA[:, :n_features]
    Xty = noisy_AtA[:, n_features:]

    self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(
        XtX, Xty, rcond=-1)
    self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)

    self.accountant.spend(self.epsilon, 0)

    return self
def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None, n_noisy=None):
    """Compute online update of Gaussian mean and variance, with differential privacy.

    Given starting sample count, mean, and variance, a new set of points X return the updated mean and variance.
    (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance).

    Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of
    independent Gaussians.

    See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
    http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

    Parameters
    ----------
    n_past : int
        Number of samples represented in old mean and variance.  If sample weights were given, this should contain
        the sum of sample weights represented in old mean and variance.

    mu : array-like, shape (number of Gaussians,)
        Means for Gaussians in original set.

    var : array-like, shape (number of Gaussians,)
        Variances for Gaussians in original set.

    X : array-like, shape (n_samples, n_features)
        New batch of points to fold into the running statistics.

    sample_weight : ignored
        Ignored in diffprivlib.

    n_noisy : int, optional
        Noisy count of the given class, satisfying differential privacy.

    Returns
    -------
    total_mu : array-like, shape (number of Gaussians,)
        Updated mean for each Gaussian over the combined set.

    total_var : array-like, shape (number of Gaussians,)
        Updated variance for each Gaussian over the combined set.

    """
    if n_noisy is None:
        warnings.warn("Noisy class count has not been specified and will be read from the data. To use this "
                      "method correctly, make sure it is run by the parent GaussianNB class.", PrivacyLeakWarning)
        n_noisy = X.shape[0]

    if not n_noisy:
        return mu, var

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Split epsilon between each feature, using 1/3 of total budget for each of mean and variance
    n_features = X.shape[1]
    local_epsilon = self.epsilon / 3 / n_features

    new_mu = np.zeros((n_features, ))
    new_var = np.zeros((n_features, ))

    for feature in range(n_features):
        _X = X[:, feature]
        lower, upper = self.bounds[0][feature], self.bounds[1][feature]
        local_diameter = upper - lower

        # Noisy sum divided by noisy count; sum sensitivity is the feature diameter.
        mech_mu = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter,
                                   lower=lower * n_noisy, upper=upper * n_noisy)
        _mu = mech_mu.randomise(_X.sum()) / n_noisy

        # Squared-deviation sensitivity depends on the (noisy) mean's distance to the farther bound.
        local_sq_sens = max(_mu - lower, upper - _mu) ** 2
        mech_var = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens, lower=0,
                                        upper=local_sq_sens * n_noisy)
        _var = mech_var.randomise(((_X - _mu) ** 2).sum()) / n_noisy

        new_mu[feature] = _mu
        new_var[feature] = _var

    if n_past == 0:
        return new_mu, new_var

    n_total = float(n_past + n_noisy)

    # Combine mean of old and new data, taking into consideration
    # (weighted) number of observations
    total_mu = (n_noisy * new_mu + n_past * mu) / n_total

    # Combine variance of old and new data, taking into consideration
    # (weighted) number of observations. This is achieved by combining
    # the sum-of-squared-differences (ssd)
    old_ssd = n_past * var
    new_ssd = n_noisy * new_var
    total_ssd = old_ssd + new_ssd + (n_past / float(n_noisy * n_total)) * (n_noisy * mu - n_noisy * new_mu) ** 2
    total_var = total_ssd / n_total

    return total_mu, total_var
def fit(self, X, y=None, sample_weight=None): """Computes k-means clustering with differential privacy. Parameters ---------- X : array-like, shape=(n_samples, n_features) Training instances to cluster. y : Ignored not used, present here for API consistency by convention. sample_weight : ignored Ignored by diffprivlib. Present for consistency with sklearn API. Returns ------- self : class """ self.accountant.check(self.epsilon, 0) if sample_weight is not None: warn_unused_args("sample_weight") del y X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32]) n_samples, n_dims = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters)) iters = self._calc_iters(n_dims, n_samples) if self.bounds is None: warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " "result in additional privacy leakage. To ensure differential privacy and no additional " "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning) self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5) X = clip_to_bounds(X, self.bounds) centers = self._init_centers(n_dims) labels = None distances = None # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely for _ in range(-1, iters): if labels is not None: centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters) distances, labels = self._distances_labels(X, centers) self.cluster_centers_ = centers self.labels_ = labels self.inertia_ = distances[np.arange(len(labels)), labels].sum() self.n_iter_ = iters self.accountant.spend(self.epsilon, 0) return self
def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.

    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        quantiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon
        split evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  The default, axis=None, computes over the flattened
        array.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.
        With this option, the result will broadcast correctly against the input array.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    # Multiple quantiles: recurse with the budget split evenly between outputs.
    if len(quant) > 1:
        return np.array([quantile(array, q_i, epsilon=epsilon / len(quant), bounds=bounds, axis=axis,
                                  keepdims=keepdims, accountant=accountant) for q_i in quant])

    # Dealing with a single quant from now on
    quant = quant.item()

    # Axis-wise computation is delegated to _wrap_axis, which calls back into this function per-slice.
    if axis is not None or keepdims:
        return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis,
                          keepdims=keepdims, accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    k = array.size
    # Append the bounds so the candidate intervals cover the whole domain, then sort.
    array = np.append(array, list(bounds))
    array.sort()

    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    # Exponential mechanism over the k+1 inter-point intervals: utility favours intervals whose rank
    # is close to the target quantile; measure weights intervals by their width.
    mech = Exponential(epsilon=epsilon, sensitivity=1, utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
                       measure=list(interval_sizes))
    idx = mech.randomise()
    # Sample uniformly within the chosen interval (reuses the mechanism's RNG).
    output = mech._rng.random() * (array[idx + 1] - array[idx]) + array[idx]

    accountant.spend(epsilon, 0)

    return output
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """Incremental fit on a batch of samples, with differential privacy.

    Updates the per-class Gaussian statistics (``theta_``, ``sigma_``) and class counts using noisy class counts
    and the privacy-preserving mean/variance update in ``_update_mean_variance``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vectors.

    y : array-like, shape (n_samples,)
        Target values.

    classes : array-like, optional
        List of all possible target classes.  Must be provided on the first call.

    _refit : bool, default: False
        If True, discard previously-fitted classes and start afresh.

    sample_weight : ignored
        Ignored by diffprivlib.  Present for consistency with sklearn API.

    Returns
    -------
    self : class

    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y)

    if self.bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, shape=X.shape[1])
    # Clipping bounds each sample's influence on the noisy statistics.
    X = clip_to_bounds(X, self.bounds)

    # Variance-smoothing term added to sigma_ while predictions are made, removed before each update.
    # Deliberately data-independent (cf. sklearn, which scales by the data variance) to avoid leakage.
    self.epsilon_ = self.var_smoothing

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        # First call: allocate per-class statistics.
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))

        self.class_count_ = np.zeros(n_classes, dtype=np.float64)

        if self.priors is not None:
            priors = np.asarray(self.priors)
            if len(priors) != n_classes:
                raise ValueError("Number of priors must match number of classes.")
            if not np.isclose(priors.sum(), 1.0):
                raise ValueError("The sum of the priors should be 1.")
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError("Number of features %d does not match previous data %d." %
                             (X.shape[1], self.theta_.shape[1]))
        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = np.unique(y)
    # np.isin replaces the deprecated np.in1d (deprecated since NumPy 1.25); behaviour is identical here.
    unique_y_in_classes = np.isin(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError("The target label(s) %s in y do not exist in the initial classes %s" %
                         (unique_y[~unique_y_in_classes], classes))

    noisy_class_counts = self._noisy_class_counts(y)

    for _i, y_i in enumerate(unique_y):
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]
        n_i = noisy_class_counts[_i]

        new_theta, new_sigma = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
                                                          self.sigma_[i, :], X_i, n_noisy=n_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += n_i

    self.sigma_[:, :] += self.epsilon_

    # Update if only no priors is provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    self.accountant.spend(self.epsilon, 0)

    return self
def histogram2d(array_x, array_y, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None,
                **unused_args):
    r"""
    Compute the differentially private bi-dimensional histogram of two data samples.

    This is a thin wrapper over :obj:`.histogramdd` for the two-dimensional case; all differential privacy
    guarantees come from that function.

    Parameters
    ----------
    array_x : array_like, shape (N,)
        An array containing the x coordinates of the points to be histogrammed.

    array_y : array_like, shape (N,)
        An array containing the y coordinates of the points to be histogrammed.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or array_like or [int, int] or [array, array], default: 10
        The bin specification: a single int or array applies to both dimensions; a pair [int, int],
        [array, array] or mixed [int, array] specifies each dimension separately.

    range : array_like, shape(2,2), optional
        The leftmost and rightmost edges of the bins along each dimension (if not specified explicitly in the
        `bins` parameters): ``[[xmin, xmax], [ymin, ymax]]``.  All values outside of this range are considered
        outliers and not tallied.

    density : bool, optional
        If False, the default, returns the number of samples in each bin.  If True, returns the probability
        *density* function at the bin, ``bin_count / sample_count / bin_area``.

    weights : array_like, shape(N,), optional
        An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray, shape(nx, ny)
        The bi-dimensional histogram of samples `x` and `y`.  `x` values are histogrammed along the first
        dimension, `y` values along the second.

    xedges : ndarray, shape(nx+1,)
        The bin edges along the first dimension.

    yedges : ndarray, shape(ny+1,)
        The bin edges along the second dimension.

    See Also
    --------
    histogram : 1D differentially private histogram
    histogramdd : Differentially private Multidimensional histogram

    """
    warn_unused_args(unused_args)

    # Determine how many dimensions the bin specification covers (mirrors numpy.histogram2d).
    try:
        spec_len = len(bins)
    except TypeError:
        spec_len = 1

    # Anything that is neither a scalar spec nor a per-dimension pair is a single
    # array of edges shared by both dimensions.
    if spec_len not in (1, 2):
        shared_edges = np.asarray(bins)
        bins = [shared_edges, shared_edges]

    hist, edges = histogramdd([array_x, array_y], epsilon=epsilon, bins=bins, range=range, weights=weights,
                              density=density, accountant=accountant)

    xedges, yedges = edges[0], edges[1]
    return hist, xedges, yedges
def histogramdd(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None,
                **unused_args):
    r"""
    Compute the differentially private multidimensional histogram of some data.

    The histogram is computed using :obj:`numpy.histogramdd`, and noise added using :class:`.GeometricTruncated`
    to satisfy differential privacy.  If the `range` parameter is not specified correctly, a
    :class:`.PrivacyLeakWarning` is thrown.  Users are referred to :obj:`numpy.histogramdd` for more usage notes.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed.  When an array, each row is a coordinate in a D-dimensional space; when an
        array_like, each element is the list of values for a single coordinate.  The first form is preferred.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : sequence or int, default: 10
        The bin specification: a sequence of edge arrays per dimension, a sequence of bin counts per dimension,
        or a single count for all dimensions.

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the
        edges are not given explicitly in `bins`.  An entry of None results in the minimum and maximum values being
        used for the corresponding dimension (with a privacy warning).

    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.

    density : bool, optional
        If False, the default, returns the number of samples in each bin.  If True, returns the probability
        *density* function at the bin, ``bin_count / sample_count / bin_volume``.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of sample x.

    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D differentially private histogram
    histogram2d: 2-D differentially private histogram

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Range only required if bin edges not specified
    if np.array(bins, dtype=object).ndim == 0 or not np.all([np.ndim(_bin) for _bin in bins]):
        if range is None or (isinstance(range, list) and None in range):
            warnings.warn("Range parameter has not been specified (or has missing elements). Falling back to taking "
                          "range from the data.\n "
                          "To ensure differential privacy, and no additional privacy leakage, the range must be "
                          "specified for each dimension independently of the data (i.e., using domain knowledge).",
                          PrivacyLeakWarning)

    # FIX: the deprecated `normed` keyword (previously passed as normed=None) was removed from
    # numpy.histogramdd in NumPy 1.24 and would raise a TypeError; it defaulted to None, so
    # omitting it preserves behaviour on all NumPy versions.
    hist, bin_edges = np.histogramdd(sample, bins=bins, range=range, weights=weights, density=None)

    # Each count has sensitivity 1; truncation keeps the noisy counts non-negative.
    dp_mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    dp_hist = np.zeros_like(hist)
    iterator = np.nditer(hist, flags=['multi_index'])

    while not iterator.finished:
        dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0]))
        iterator.iternext()

    dp_hist = dp_hist.astype(float, casting='safe')

    if density:
        # calculate the probability density function
        dims = len(dp_hist.shape)
        dp_hist_sum = dp_hist.sum()
        for i in np.arange(dims):
            shape = np.ones(dims, int)
            shape[i] = dp_hist.shape[i]
            # noinspection PyUnresolvedReferences
            dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape)

        if dp_hist_sum > 0:
            dp_hist /= dp_hist_sum

    accountant.spend(epsilon, 0)

    return dp_hist, bin_edges
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data, with differential privacy.

    One-vs-rest fitting: the privacy budget is split evenly across the per-class sub-problems, each solved by
    ``_logistic_regression_path``.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X.

    sample_weight : ignored
        Ignored by diffprivlib.  Present for consistency with sklearn API.

    Returns
    -------
    self : class

    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    if not isinstance(self.C, numbers.Real) or self.C < 0:
        raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
    if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:
        raise ValueError("Maximum number of iteration must be positive; got (max_iter=%r)" % self.max_iter)
    if not isinstance(self.tol, numbers.Real) or self.tol < 0:
        raise ValueError("Tolerance for stopping criteria must be positive; got (tol=%r)" % self.tol)

    solver = _check_solver(self.solver, self.penalty, self.dual)

    X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C",
                     accept_large_sparse=solver != 'liblinear')
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    _, n_features = X.shape

    if self.data_norm is None:
        warnings.warn("Data norm has not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    # Clipping each row to data_norm bounds the sensitivity of the loss gradient.
    X = clip_to_norm(X, self.data_norm)

    self.multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))

    n_classes = len(self.classes_)
    classes_ = self.classes_

    if n_classes < 2:
        raise ValueError("This solver needs samples of at least 2 classes in the data, but the data contains only "
                         "one class: %r" % classes_[0])

    # Binary problem needs only one OvR fit (against the positive class).
    if len(self.classes_) == 2:
        n_classes = 1
        classes_ = classes_[1:]

    if self.warm_start:
        warm_start_coef = getattr(self, 'coef_', None)
    else:
        warm_start_coef = None
    if warm_start_coef is not None and self.fit_intercept:
        # Fold the previous intercept into the warm-start coefficients (intercept is the last column).
        warm_start_coef = np.append(warm_start_coef, self.intercept_[:, np.newaxis], axis=1)

    self.coef_ = list()
    self.intercept_ = np.zeros(n_classes)

    if warm_start_coef is None:
        warm_start_coef = [None] * n_classes

    path_func = delayed(_logistic_regression_path)

    # One job per class; epsilon is split evenly so total privacy spend is self.epsilon.
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='processes'))(
        path_func(X, y, epsilon=self.epsilon / n_classes, data_norm=self.data_norm, pos_class=class_, Cs=[self.C],
                  fit_intercept=self.fit_intercept, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose,
                  coef=warm_start_coef_, check_input=False)
        for class_, warm_start_coef_ in zip(classes_, warm_start_coef))

    fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
    self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]

    self.coef_ = np.asarray(fold_coefs_)
    self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept))

    if self.fit_intercept:
        # Split the fitted intercept back out of the last column.
        self.intercept_ = self.coef_[:, -1]
        self.coef_ = self.coef_[:, :-1]

    self.accountant.spend(self.epsilon, 0)

    return self
def histogram(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None, **unused_args):
    r"""
    Compute the differentially private histogram of a set of data.

    The true histogram is computed with :obj:`numpy.histogram`, then each bin count is randomised with the
    :class:`.GeometricTruncated` mechanism (sensitivity 1, truncated below at 0) to satisfy differential privacy.
    If the `range` parameter is not specified, a :class:`.PrivacyLeakWarning` is thrown, since deriving the range
    from the data leaks information.  Users are referred to :obj:`numpy.histogram` for more usage notes.

    Parameters
    ----------
    sample : array_like
        Input data.  The histogram is computed over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or sequence of scalars or str, default: 10
        Number of equal-width bins, an explicit monotonically increasing array of bin edges, or a bin-width
        estimation method name, as accepted by :obj:`numpy.histogram`.

    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, falls back to ``(a.min(), a.max())`` with a
        privacy warning.  Values outside the range are ignored.

    weights : array_like, optional
        An array of weights, of the same shape as `a`, each value contributing its weight to its bin's count.

    density : bool, optional
        If ``False``, the result contains the (noisy) number of samples in each bin.  If ``True``, the result is
        the value of the probability *density* function at each bin, normalised so the integral over the range
        is 1.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    hist : array
        The values of the histogram.  See `density` and `weights` for a description of the possible semantics.

    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    See Also
    --------
    histogramdd, histogram2d

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if range is None:
        warnings.warn("Range parameter has not been specified. Falling back to taking range from the data.\n"
                      "To ensure differential privacy, and no additional privacy leakage, the range must be "
                      "specified independently of the data (i.e., using domain knowledge).", PrivacyLeakWarning)

    # Always compute raw counts here; density normalisation is applied to the noisy counts below.
    hist, bin_edges = np.histogram(sample, bins=bins, range=range, weights=weights, density=None)

    mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    dp_hist = np.zeros_like(hist)
    for idx, count in enumerate(hist):
        dp_hist[idx] = mech.randomise(int(count))

    accountant.spend(epsilon, 0)

    if density:
        widths = np.array(np.diff(bin_edges), float)
        total = dp_hist.sum()
        # Guard against a zero total to avoid division by zero on an all-zero noisy histogram.
        return dp_hist / widths / (total if total else 1), bin_edges

    return dp_hist, bin_edges
def _logistic_regression_path(X, y, epsilon, data_norm, pos_class=None, Cs=10, fit_intercept=True, max_iter=100,
                              tol=1e-4, verbose=0, coef=None, check_input=True, **unused_args):
    """Compute a Logistic Regression model with differential privacy for a list of regularization parameters.

    Takes inspiration from ``_logistic_regression_path`` in scikit-learn, specified to the LBFGS solver and
    one-vs-rest multi class fitting.  Privacy is obtained by perturbing the objective function with the
    :class:`.Vector` mechanism before optimisation.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Input data.

    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Input data, target values.

    epsilon : float
        Privacy parameter for differential privacy.

    data_norm : float
        Max norm of the data for which differential privacy is satisfied.

    pos_class : int, optional
        The class with respect to which we perform a one-vs-all fit.  If None, then it is assumed that the given
        problem is binary.

    Cs : int | array-like, shape (n_cs,), default: 10
        List of values for the regularization parameter or integer specifying the number of regularization
        parameters that should be used.  In this case, the parameters will be chosen in a logarithmic scale
        between 1e-4 and 1e4.

    fit_intercept : bool, default: True
        Whether to fit an intercept for the model.  In this case the shape of the returned array is
        (n_cs, n_features + 1).

    max_iter : int, default: 100
        Maximum number of iterations for the solver.

    tol : float, default: 1e-4
        Stopping criterion.  The iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is
        the i-th component of the gradient.

    verbose : int, default: 0
        Set verbose to any positive number for verbosity.

    coef : array-like, shape (n_features,), optional
        Initialization value for coefficients of logistic regression.

    check_input : bool, default: True
        If False, the input arrays X and y will not be checked.

    Returns
    -------
    coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
        List of coefficients for the Logistic Regression model.  If fit_intercept is set to True then the second
        dimension will be n_features + 1, where the last item represents the intercept.

    Cs : ndarray
        Grid of Cs used for cross-validation.

    n_iter : array, shape (n_cs,)
        Actual number of iteration for each Cs.

    """
    warn_unused_args(unused_args)

    if isinstance(Cs, numbers.Integral):
        Cs = np.logspace(-4, 4, int(Cs))

    solver = 'lbfgs'

    # Data norm increases if intercept is included
    if fit_intercept:
        data_norm = np.sqrt(data_norm ** 2 + 1)

    # Pre-processing.
    if check_input:
        X = check_array(X, accept_sparse='csr', dtype=np.float64, accept_large_sparse=solver != 'liblinear')
        y = check_array(y, ensure_2d=False, dtype=None)
        check_consistent_length(X, y)
    _, n_features = X.shape

    classes = np.unique(y)

    if pos_class is None:
        if classes.size > 2:
            raise ValueError('To fit OvR, use the pos_class argument')
        # np.unique(y) gives labels in sorted order.
        pos_class = classes[1]

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # For doing a ovr, we need to mask the labels first.
    output_vec = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
    mask = (y == pos_class)
    y_bin = np.ones(y.shape, dtype=X.dtype)
    y_bin[~mask] = -1.
    # for compute_class_weight

    if coef is not None:
        # it must work both giving the bias term and not
        if coef.size not in (n_features, output_vec.size):
            raise ValueError('Initialization coef is of shape %d, expected shape %d or %d' %
                             (coef.size, n_features, output_vec.size))
        output_vec[:coef.size] = coef

    target = y_bin

    coefs = list()
    n_iter = np.zeros(len(Cs), dtype=np.int32)
    for i, C in enumerate(Cs):
        # Perturb the loss function itself (objective perturbation) so its minimiser is private.
        vector_mech = Vector(epsilon=epsilon, dimension=n_features + int(fit_intercept), alpha=1. / C,
                             function_sensitivity=0.25, data_sensitivity=data_norm)
        noisy_logistic_loss = vector_mech.randomise(_logistic_loss_and_grad)

        # Map verbose level onto scipy's iprint codes.
        iprint = [-1, 50, 1, 100, 101][np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
        # NOTE: output_vec warm-starts each successive C with the previous solution.
        output_vec, _, info = optimize.fmin_l_bfgs_b(noisy_logistic_loss, output_vec, fprime=None,
                                                     args=(X, target, 1. / C, sample_weight), iprint=iprint,
                                                     pgtol=tol, maxiter=max_iter)
        if info["warnflag"] == 1:
            warnings.warn("lbfgs failed to converge. Increase the number of iterations.", ConvergenceWarning)

        coefs.append(output_vec.copy())

        n_iter[i] = info['nit']

    return np.array(coefs), np.array(Cs), n_iter