Example #1
    def test_scalar_output(self):
        bounds = check_bounds((1, 2), shape=0)
        self.assertIsInstance(bounds[0], Real)
        self.assertIsInstance(bounds[1], Real)

        bounds = check_bounds((1, 2), shape=0, dtype=int)
        self.assertIsInstance(bounds[0], int)
        self.assertIsInstance(bounds[1], int)

        bounds = check_bounds((1, 2), shape=0, dtype=float)
        self.assertIsInstance(bounds[0], float)
        self.assertIsInstance(bounds[1], float)
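A minimal sketch of the behaviour exercised above, assuming `check_bounds` is imported from `diffprivlib.validation` (its location in recent diffprivlib releases): with `shape=0` the bounds come back as two scalars, coerced to the requested `dtype`.

from diffprivlib.validation import check_bounds  # assumed import path

lower, upper = check_bounds((1, 2), shape=0, dtype=float)
assert isinstance(lower, float) and isinstance(upper, float)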
Example #2
    def test_min_separation(self):
        bounds = check_bounds((1, 1), min_separation=2)
        self.assertEqual(0, bounds[0])
        self.assertEqual(2, bounds[1])

        bounds = check_bounds((1., 1.), min_separation=1)
        self.assertEqual(0.5, bounds[0])
        self.assertEqual(1.5, bounds[1])

        bounds = check_bounds((0.9, 1.1), min_separation=1)
        self.assertEqual(0.5, bounds[0])
        self.assertEqual(1.5, bounds[1])
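As the test shows, bounds narrower than `min_separation` are widened symmetrically about their midpoint. A hedged sketch of the same behaviour:

from diffprivlib.validation import check_bounds  # assumed import path

# midpoint 1.0 is preserved; the interval is widened to (0.5, 1.5)
lower, upper = check_bounds((0.9, 1.1), min_separation=1)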
Example #3
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanvar if nan else np.var
    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    dp_mech = LaplaceBoundedDomain(epsilon=epsilon, delta=0,
                                   sensitivity=((upper - lower) / array.size) ** 2 * (array.size - 1), lower=0,
                                   upper=((upper - lower) ** 2) / 4)
    output = dp_mech.randomise(actual_var)

    accountant.spend(epsilon, 0)

    return output
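A usage sketch of the public wrapper presumably built on this helper, assuming it is `diffprivlib.tools.var`; supplying `bounds` up front avoids the PrivacyLeakWarning and the extra leakage it flags.

import numpy as np
from diffprivlib import tools as dp  # assumed public API of IBM's diffprivlib

data = np.random.random(1000)
private_variance = dp.var(data, epsilon=0.5, bounds=(0, 1))  # no PrivacyLeakWarning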
Example #4
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=False, nan=False):
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_sum, array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis, dtype=dtype,
                          keepdims=keepdims, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nansum if nan else np.sum
    actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    mech = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated
    mech = mech(epsilon=epsilon, sensitivity=upper - lower, lower=lower * array.size, upper=upper * array.size)
    output = mech.randomise(actual_sum)

    accountant.spend(epsilon, 0)

    return output
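A matching sketch for the public sum wrapper (assumed to be `diffprivlib.tools.sum`); with an integral `dtype`, the code above picks GeometricTruncated, so the noisy total stays an integer.

import numpy as np
from diffprivlib import tools as dp  # assumed public API

counts = np.random.randint(0, 10, size=500)
private_total = dp.sum(counts, epsilon=1.0, bounds=(0, 10), dtype=int)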
Example #5
def _preprocess_data(X,
                     y,
                     fit_intercept,
                     epsilon=1.0,
                     bounds_X=None,
                     bounds_y=None,
                     copy=True,
                     check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        X_offset = mean(X,
                        axis=0,
                        bounds=bounds_X,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y,
                        axis=0,
                        bounds=bounds_y,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
Example #6
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue, nan=False):
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nansum if nan else np.sum
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    dp_mech = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated

    if isinstance(actual_sum, np.ndarray):
        dp_sum = np.zeros_like(actual_sum, dtype=dtype)
        iterator = np.nditer(actual_sum, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            _lower, _upper = (lower[idx], upper[idx]) if vector_out else (lower[0], upper[0])
            local_diam = _upper - _lower
            mech = dp_mech().set_epsilon(epsilon).set_sensitivity(local_diam).\
                set_bounds(_lower * n_datapoints, _upper * n_datapoints)

            dp_sum[idx] = mech.randomise(actual_sum[idx])
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_sum

    local_diam = upper[0] - lower[0]
    mech = dp_mech().set_epsilon(epsilon).set_sensitivity(local_diam).set_bounds(lower[0] * n_datapoints,
                                                                                 upper[0] * n_datapoints)

    accountant.spend(epsilon, 0)

    return mech.randomise(actual_sum)
Example #7
    def test_incorrect_entries(self):
        with self.assertRaises(ValueError):
            check_bounds(([1, 2], 1))

        with self.assertRaises(ValueError):
            check_bounds(([1, 2], [1, 2, 3]))

        with self.assertRaises(ValueError):
            check_bounds(([1, 2], [1, 2], [1, 2]))
Example #8
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nanvar if nan else np.var
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    if isinstance(actual_var, np.ndarray):
        dp_var = np.zeros_like(actual_var)
        iterator = np.nditer(actual_var, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            local_diam = upper[idx] - lower[idx] if vector_out else upper[0] - lower[0]
            dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
                .set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

            dp_var[iterator.multi_index] = np.minimum(dp_mech.randomise(actual_var[idx]), local_diam ** 2)
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_var

    local_diam = upper[0] - lower[0]
    dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")). \
        set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

    accountant.spend(epsilon, 0)

    return np.minimum(dp_mech.randomise(actual_var), local_diam ** 2)
Example #9
def _wrap_axis(func, array, *, axis, keepdims, epsilon, bounds, **kwargs):
    """Wrapper for functions with axis and keepdims parameters to ensure the function only needs to be evaluated on
    scalar outputs.

    """
    dummy = np.zeros_like(array).sum(axis=axis, keepdims=keepdims)
    array = np.asarray(array)
    ndim = array.ndim
    bounds = check_bounds(bounds, np.size(dummy) if np.ndim(dummy) == 1 else 0)

    if isinstance(axis, int):
        axis = (axis, )
    elif axis is None:
        axis = tuple(range(ndim))

    # Ensure all axes are non-negative
    axis = tuple(ndim + ax if ax < 0 else ax for ax in axis)

    if isinstance(dummy, np.ndarray):
        iterator = np.nditer(dummy, flags=['multi_index'])

        while not iterator.finished:
            idx = list(iterator.multi_index)  # Multi index on 'dummy'
            _bounds = (bounds[0][idx],
                       bounds[1][idx]) if np.ndim(dummy) == 1 else bounds

            # Construct slicing tuple on 'array'
            if len(idx) + len(axis) > ndim:
                full_slice = tuple(
                    slice(None) if ax in axis else idx[ax]
                    for ax in range(ndim))
            else:
                idx.reverse()
                full_slice = tuple(
                    slice(None) if ax in axis else idx.pop()
                    for ax in range(ndim))

            dummy[iterator.multi_index] = func(array[full_slice],
                                               epsilon=epsilon / dummy.size,
                                               bounds=_bounds,
                                               **kwargs)
            iterator.iternext()

        return dummy

    return func(array, bounds=bounds, epsilon=epsilon, **kwargs)
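A sketch of what `_wrap_axis` enables at the public level, assuming `diffprivlib.tools.mean`: one private output per slice along the axis, each computed with an even share `epsilon / dummy.size` of the budget.

import numpy as np
from diffprivlib import tools as dp  # assumed public API

X = np.random.random((100, 3))
# One private mean per column; the budget is split evenly across the 3 outputs
col_means = dp.mean(X, epsilon=1.0, bounds=(np.zeros(3), np.ones(3)), axis=0)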
Example #10
    def test_wrong_order(self):
        with self.assertRaises(ValueError):
            check_bounds((2, 1))
Example #11
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values.  Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X,
                         y,
                         accept_sparse=False,
                         y_numeric=True,
                         multi_output=True)

        if self.bounds_X is None or self.bounds_y is None:
            warnings.warn(
                "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                "data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `bounds_X` and `bounds_y`.",
                PrivacyLeakWarning)

            if self.bounds_X is None:
                self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
            if self.bounds_y is None:
                self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

        self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
        self.bounds_y = check_bounds(self.bounds_y,
                                     y.shape[1] if y.ndim > 1 else 1)

        n_features = X.shape[1]
        n_targets = y.shape[1] if y.ndim > 1 else 1
        epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            bounds_X=self.bounds_X,
            bounds_y=self.bounds_y,
            epsilon=self.epsilon * epsilon_intercept_scale,
            copy=self.copy_X)

        bounds_X = (self.bounds_X[0] - X_offset, self.bounds_X[1] - X_offset)
        bounds_y = (self.bounds_y[0] - y_offset, self.bounds_y[1] - y_offset)

        objs, obj_coefs = _construct_regression_obj(
            X,
            y,
            bounds_X,
            bounds_y,
            epsilon=self.epsilon * (1 - epsilon_intercept_scale),
            alpha=0)
        coef = np.zeros((n_features, n_targets))
        residues = []

        for i, obj in enumerate(objs):
            opt_result = minimize(obj, np.zeros(n_features), jac=True)
            coef[:, i] = opt_result.x
            residues += [opt_result.fun]

        self.coef_ = coef.T
        self._residues = residues
        self._obj_coefs = obj_coefs

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
            self._residues = self._residues[0]
        self._set_intercept(X_offset, y_offset, X_scale)

        self.accountant.spend(self.epsilon, 0)

        return self
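A hedged usage sketch for this estimator, assuming it is `diffprivlib.models.LinearRegression` with the `epsilon`, `bounds_X` and `bounds_y` parameters used above:

import numpy as np
from diffprivlib.models import LinearRegression  # assumed import path

X = np.random.random((200, 2))
y = X @ np.array([1.0, -2.0])
model = LinearRegression(epsilon=1.0, bounds_X=(np.zeros(2), np.ones(2)), bounds_y=(-2.0, 1.0))
model.fit(X, y)  # bounds supplied, so no PrivacyLeakWarning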
Example #12
    def _fit_full(self, X, n_components):
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `range` for each valued returned by np.mean().",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = check_bounds(self.bounds, n_features)
            self.mean_ = mean(X,
                              epsilon=self.epsilon / 2,
                              bounds=self.bounds,
                              axis=0,
                              accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        XtX = np.dot(X.T, X)

        mech = Wishart().set_epsilon(self.epsilon if self.centered else self.epsilon / 2).\
            set_sensitivity(self.data_norm)
        noisy_input = mech.randomise(XtX)

        u, s, v = np.linalg.svd(noisy_input)
        u, v = svd_flip(u, v)
        s = np.sqrt(s)

        components_ = v

        # Get variance explained by singular values
        explained_variance_ = (s**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = s.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            try:
                n_components = sk_pca._infer_dimension(explained_variance_,
                                                       n_samples)
            except AttributeError:
                n_components = sk_pca._infer_dimension_(
                    explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u, s, v
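A usage sketch for this PCA variant, assuming it is `diffprivlib.models.PCA` with the `epsilon`, `bounds` and `data_norm` parameters consumed by `_fit_full`:

import numpy as np
from diffprivlib.models import PCA  # assumed import path

X = np.random.random((100, 4))
# Rows of X lie in [0, 1]^4, so their L2 norm is at most 2
pca = PCA(n_components=2, epsilon=1.0, bounds=(np.zeros(4), np.ones(4)), data_norm=2.0)
pca.fit(X)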
Example #13
    def test_non_numeric(self):
        with self.assertRaises(ValueError):
            check_bounds(("One", "Two"))
Example #14
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values.  Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X,
                         y,
                         accept_sparse=False,
                         y_numeric=True,
                         multi_output=True)

        if self.fit_intercept:
            if self.bounds_X is None or self.bounds_y is None:
                warnings.warn(
                    "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                    "data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `bounds_X` and `bounds_y`.",
                    PrivacyLeakWarning)

                if self.bounds_X is None:
                    self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
                if self.bounds_y is None:
                    self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

            self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
            self.bounds_y = check_bounds(self.bounds_y,
                                         y.shape[1] if y.ndim > 1 else 1)

        n_features = X.shape[1]
        epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            bounds_X=self.bounds_X,
            bounds_y=self.bounds_y,
            epsilon=self.epsilon * epsilon_intercept_scale,
            copy=self.copy_X)

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
        AtA = np.dot(A.T, A)

        mech = Wishart().set_epsilon(
            self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(
                self.data_norm)
        noisy_AtA = mech.randomise(AtA)

        noisy_AtA = noisy_AtA[:n_features, :]
        XtX = noisy_AtA[:, :n_features]
        Xty = noisy_AtA[:, n_features:]

        self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(
            XtX, Xty, rcond=-1)
        self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)

        self.accountant.spend(self.epsilon, 0)

        return self
Example #15
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            not used, present here for API consistency by convention.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : class

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        del y

        X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
        n_samples, n_dims = X.shape

        if n_samples < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters))

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
        X = clip_to_bounds(X, self.bounds)

        centers = self._init_centers(n_dims)
        labels = None
        distances = None

        # The loop runs iters + 1 times, computing labels last, so the returned `labels` stay consistent with the
        # final `centers` (convergence within the budgeted iterations is unlikely)
        for _ in range(-1, iters):
            if labels is not None:
                centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        self.accountant.spend(self.epsilon, 0)

        return self
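A usage sketch, assuming this is `diffprivlib.models.KMeans` with the `epsilon` and `bounds` parameters that `fit` relies on:

import numpy as np
from diffprivlib.models import KMeans  # assumed import path

X = np.random.random((500, 2))
kmeans = KMeans(n_clusters=3, epsilon=1.0, bounds=(np.zeros(2), np.ones(2)))
kmeans.fit(X)
print(kmeans.cluster_centers_)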
Example #16
    def test_complex(self):
        with self.assertRaises(TypeError):
            check_bounds((1.0, 1 + 2j), dtype=complex)
Example #17
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y)

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        self.epsilon_ = self.var_smoothing

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            if self.priors is not None:
                priors = np.asarray(self.priors)

                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError("The sum of the priors should be 1.")
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features %d does not match previous data %d." %
                    (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError(
                "The target label(s) %s in y do not exist in the initial classes %s"
                % (unique_y[~unique_y_in_classes], classes))

        noisy_class_counts = self._noisy_class_counts(y)

        for _i, y_i in enumerate(unique_y):
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            n_i = noisy_class_counts[_i]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i],
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                n_noisy=n_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += n_i

        self.sigma_[:, :] += self.epsilon_

        # Update the empirical prior only if no priors were provided
        if self.priors is None:
            # Empirical prior, estimated from the noisy class counts
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        self.accountant.spend(self.epsilon, 0)

        return self
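A usage sketch, assuming this `_partial_fit` belongs to `diffprivlib.models.GaussianNB`, whose public `fit`/`partial_fit` route through it:

import numpy as np
from diffprivlib.models import GaussianNB  # assumed import path

X = np.random.random((150, 4))
y = np.random.randint(0, 3, size=150)
clf = GaussianNB(epsilon=1.0, bounds=(np.zeros(4), np.ones(4)))
clf.fit(X, y)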
Example #18
def quantile(array,
             quant,
             epsilon=1.0,
             bounds=None,
             axis=None,
             keepdims=False,
             accountant=None,
             **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        quantiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon split
        evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  The default, axis=None, computes the quantile over all of
        the elements of the input array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, the quantile is computed over all of the axes specified in the tuple instead of a
        single axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `quantile` method of
        sub-classes of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement
        `keepdims`, any exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    if len(quant) > 1:
        return np.array([
            quantile(array,
                     q_i,
                     epsilon=epsilon / len(quant),
                     bounds=bounds,
                     axis=axis,
                     keepdims=keepdims,
                     accountant=accountant) for q_i in quant
        ])

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile,
                          array,
                          quant=quant,
                          epsilon=epsilon,
                          bounds=bounds,
                          axis=axis,
                          keepdims=keepdims,
                          accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    k = array.size
    array = np.append(array, list(bounds))
    array.sort()

    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    mech = Exponential(epsilon=epsilon,
                       sensitivity=1,
                       utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
                       measure=list(interval_sizes))
    idx = mech.randomise()
    output = mech._rng.random() * (array[idx + 1] - array[idx]) + array[idx]

    accountant.spend(epsilon, 0)

    return output
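A usage sketch for `quantile` (assuming it is exposed as `diffprivlib.tools.quantile`); with an array-like `quant`, epsilon is split evenly across the outputs, as the docstring describes.

import numpy as np
from diffprivlib.tools import quantile  # assumed import path

data = np.random.random(1000)
# Each of the three quantiles is computed with epsilon / 3
q25, q50, q75 = quantile(data, (0.25, 0.5, 0.75), epsilon=1.0, bounds=(0, 1))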
Example #19
    def test_non_tuple(self):
        with self.assertRaises(TypeError):
            check_bounds([1, 2, 3])
Example #20
    def partial_fit(self, X, y=None, sample_weight=None):
        """Online computation of mean and std with differential privacy on X for later scaling.  All of X is processed
        as a single batch.  This is intended for cases when `fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and
        Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations." The American
        Statistician 37.3 (1983): 242-247:

        Parameters
        ----------
        X : {array-like}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation used for later scaling along the features axis.

        y
            Ignored

        sample_weight
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

        X = check_array(X,
                        accept_sparse=False,
                        copy=self.copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES,
                        force_all_finite='allow-nan')
        # Hotfix for sklearn v 0.23
        self.n_features_in_ = X.shape[1]

        if self.bounds is None:
            warnings.warn(
                "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
                "This will result in additional privacy leakage.  To ensure differential privacy with no "
                "additional privacy loss, specify `range` for each valued returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        # Even in the case of `with_mean=False`, we update the mean anyway. This is needed for the incremental
        # computation of the var.  See incr_mean_variance_axis and _incremental_mean_variance_axis

        # if n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of
        # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis
        if hasattr(self, 'n_samples_seen_') and isinstance(
                self.n_samples_seen_, (int, np.integer)):
            self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
                                             X.shape[1]).astype(np.int64)

        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)

        # First pass
        if not hasattr(self, 'scale_'):
            self.mean_ = .0
            if self.with_std:
                self.var_ = .0
            else:
                self.var_ = None

        if not self.with_mean and not self.with_std:
            self.mean_ = None
            self.var_ = None
            self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
        else:
            self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
                X, epsilon_0, self.bounds, self.mean_, self.var_,
                self.n_samples_seen_)

        # for backward-compatibility, reduce n_samples_seen_ to an integer
        # if the number of samples is the same for each feature (i.e. no
        # missing values)
        if np.ptp(self.n_samples_seen_) == 0:
            self.n_samples_seen_ = self.n_samples_seen_[0]

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        self.accountant.spend(self.epsilon, 0)

        return self
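A usage sketch, assuming this `partial_fit` belongs to `diffprivlib.models.StandardScaler`, a differentially private drop-in for sklearn's scaler:

import numpy as np
from diffprivlib.models import StandardScaler  # assumed import path

X = np.random.random((300, 5))
scaler = StandardScaler(epsilon=1.0, bounds=(np.zeros(5), np.ones(5)))
X_scaled = scaler.fit_transform(X)  # fit_transform inherited from sklearn's TransformerMixin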
Example #21
    def test_non_numeric(self):
        with self.assertRaises(Exception):
            check_bounds(("One", "Two"))
Example #22
    def test_bad_shape(self):
        with self.assertRaises(ValueError):
            check_bounds(([1, 1], [2, 2]), shape=-2)

        with self.assertRaises(TypeError):
            check_bounds(([1, 1], [2, 2]), shape=2.0)
Example #23
    def test_wrong_dims(self):
        with self.assertRaises(ValueError):
            check_bounds(([1, 1], [2, 2]), shape=3)
Example #24
    def test_consistency(self):
        bounds = check_bounds(([1, 1], [2, 2]), shape=2)
        bounds2 = check_bounds(bounds, shape=2)
        self.assertTrue(np.all(bounds[0] == bounds2[0]))
        self.assertTrue(np.all(bounds[1] == bounds2[1]))
Example #25
    def test_array_output(self):
        bounds = check_bounds(([1, 1], [2, 2]), shape=2)
        self.assertIsInstance(bounds[0], np.ndarray)
        self.assertIsInstance(bounds[1], np.ndarray)