Example 1
    def _update_centers(self, X, centers, labels, dims, total_iters):
        """Updates the centers of the KMeans algorithm for the current iteration, while satisfying differential
        privacy.

        Differential privacy is satisfied by adding (integer-valued, using :class:`.GeometricFolded`) random noise to
        the count of nearest neighbours to the previous cluster centers, and adding (real-valued, using
        :class:`.LaplaceBoundedDomain`) random noise to the sum of values per dimension.

        """
        epsilon_0, epsilon_i = self._split_epsilon(dims, total_iters)
        geometric_mech = GeometricFolded().set_sensitivity(1).set_bounds(0.5, float("inf")).set_epsilon(epsilon_0)
        laplace_mech = LaplaceBoundedDomain().set_epsilon(epsilon_i)

        for cluster in range(self.n_clusters):
            if cluster not in labels:
                continue

            cluster_count = sum(labels == cluster)
            noisy_count = geometric_mech.randomise(cluster_count)

            cluster_sum = np.sum(X[labels == cluster], axis=0)
            noisy_sum = np.zeros_like(cluster_sum)

            for i in range(dims):
                laplace_mech.set_sensitivity(self.bounds[1][i] - self.bounds[0][i]) \
                    .set_bounds(noisy_count * self.bounds[0][i], noisy_count * self.bounds[1][i])
                noisy_sum[i] = laplace_mech.randomise(cluster_sum[i])

            centers[cluster, :] = noisy_sum / noisy_count

        return centers
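For orientation, here is a minimal standalone sketch of the same pattern: integer-valued geometric noise on the cluster count, bounded Laplace noise on each per-dimension sum. It uses the constructor-style API seen in the later examples (older releases use the set_epsilon/set_bounds chain shown above), and the budget values are made-up stand-ins for the _split_epsilon schedule used in the method.

import numpy as np
from diffprivlib.mechanisms import GeometricFolded, LaplaceBoundedDomain

# Hypothetical data: 100 points in [0, 1]^2 assigned to a single cluster,
# with an assumed budget of 0.5 for the count and 0.25 per dimension.
rng = np.random.default_rng(42)
cluster = rng.uniform(0, 1, size=(100, 2))
eps_count, eps_dim = 0.5, 0.25

# Integer-valued noise on the count, folded at 0.5 so it can never reach 0.
count_mech = GeometricFolded(epsilon=eps_count, sensitivity=1,
                             lower=0.5, upper=float("inf"))
noisy_count = count_mech.randomise(cluster.shape[0])

# Real-valued noise on each per-dimension sum, bounded to the range the sum
# can actually take given the noisy count and the [0, 1] coordinate bounds.
noisy_sum = np.zeros(2)
for dim in range(2):
    sum_mech = LaplaceBoundedDomain(epsilon=eps_dim, delta=0, sensitivity=1.0,
                                    lower=0.0, upper=float(noisy_count))
    noisy_sum[dim] = sum_mech.randomise(cluster[:, dim].sum())

print("DP centre estimate:", noisy_sum / noisy_count)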
Example 2
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanvar if nan else np.var
    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    dp_mech = LaplaceBoundedDomain(epsilon=epsilon, delta=0,
                                   sensitivity=((upper - lower) / array.size) ** 2 * (array.size - 1), lower=0,
                                   upper=((upper - lower) ** 2) / 4)
    output = dp_mech.randomise(actual_var)

    accountant.spend(epsilon, 0)

    return output
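For context, a hedged sketch of how a private helper like this is normally reached through the public API. The assumption here is that the wrapper is exposed as diffprivlib.tools.var with a bounds keyword matching the private signature above; the exact signature may differ between versions.

import numpy as np
from diffprivlib import tools as dp_tools

data = np.random.default_rng(0).uniform(10, 20, size=1000)

# Supplying bounds up front avoids the PrivacyLeakWarning branch above and
# keeps the clipping step independent of the data.
print(dp_tools.var(data, epsilon=0.5, bounds=(10, 20)))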
Example 3
    def _randomise(self, mean, var, n_samples):
        """Randomises the learned means and variances subject to differential privacy."""
        features = var.shape[0]

        local_epsilon = self.epsilon / 2
        local_epsilon /= features

        if len(self.bounds) != features:
            raise ValueError(
                "Bounds must be specified for each feature dimension")

        new_mu = np.zeros_like(mean)
        new_var = np.zeros_like(var)

        for feature in range(features):
            local_diameter = self.bounds[feature][1] - self.bounds[feature][0]
            mech_mu = Laplace().set_sensitivity(
                local_diameter / n_samples).set_epsilon(local_epsilon)
            mech_var = LaplaceBoundedDomain().set_sensitivity((n_samples - 1) * local_diameter ** 2 / n_samples ** 2)\
                .set_epsilon(local_epsilon).set_bounds(0, float("inf"))

            new_mu[feature] = mech_mu.randomise(mean[feature])
            new_var[feature] = mech_var.randomise(var[feature])

        return new_mu, new_var
Example 4
    def _randomise(self, mu, var, n_samples):
        """Randomises the learned means and variances subject to differential privacy."""
        features = var.shape[0]

        local_epsilon = self.epsilon / 2
        local_epsilon /= features

        if len(self.bounds) != features:
            raise ValueError("Bounds must be specified for each feature dimension")

        # Extra np.array() a temporary fix for PyLint bug: https://github.com/PyCQA/pylint/issues/2747
        new_mu = np.array(np.zeros_like(mu))
        new_var = np.array(np.zeros_like(var))

        for feature in range(features):
            local_diameter = self.bounds[feature][1] - self.bounds[feature][0]
            mech_mu = Laplace().set_sensitivity(local_diameter / n_samples).set_epsilon(local_epsilon)
            mech_var = LaplaceBoundedDomain().set_sensitivity((n_samples - 1) * local_diameter ** 2 / n_samples ** 2)\
                .set_epsilon(local_epsilon).set_bounds(0, float("inf"))

            new_mu[feature] = mech_mu.randomise(mu[feature])
            new_var[feature] = mech_var.randomise(var[feature])

        return new_mu, new_var
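In both variants the privacy accounting is simple sequential composition: half of the total epsilon goes to the means, half to the variances, each split evenly across features. A one-screen check of that arithmetic (plain Python, no library assumptions):

# Each feature spends local_epsilon twice: once on its mean (Laplace) and
# once on its variance (LaplaceBoundedDomain).
epsilon, features = 1.0, 4
local_epsilon = epsilon / 2 / features

total_spent = features * 2 * local_epsilon
assert abs(total_spent - epsilon) < 1e-12
print(f"per-mechanism epsilon = {local_epsilon}, total spent = {total_spent}")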
Example 5
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nanvar if nan else np.var
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    if isinstance(actual_var, np.ndarray):
        dp_var = np.zeros_like(actual_var)
        iterator = np.nditer(actual_var, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            local_diam = upper[idx] - lower[idx] if vector_out else upper[0] - lower[0]
            dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
                .set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

            dp_var[iterator.multi_index] = np.minimum(dp_mech.randomise(actual_var[idx]), local_diam ** 2)
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_var

    local_diam = upper[0] - lower[0]
    dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")). \
        set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

    accountant.spend(epsilon, 0)

    return np.minimum(dp_mech.randomise(actual_var), local_diam ** 2)
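The nditer loop above is the piece worth isolating: it visits each cell of the axis-reduced output and randomises it independently, preserving keepdims shapes for free. A stripped-down sketch of just that pattern, with a plain numpy Laplace draw standing in for the mechanism:

import numpy as np

actual_var = np.var(np.arange(12.0).reshape(3, 4), axis=0, keepdims=True)
dp_var = np.zeros_like(actual_var)

rng = np.random.default_rng(1)
it = np.nditer(actual_var, flags=["multi_index"])
while not it.finished:
    # One independent noise draw per output cell (stand-in for randomise()).
    dp_var[it.multi_index] = actual_var[it.multi_index] + rng.laplace(scale=0.1)
    it.iternext()

print(dp_var.shape)  # (1, 4): the keepdims shape survives the loop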
Example 6
def covariance_eig(array,
                   epsilon=1.0,
                   norm=None,
                   dims=None,
                   eigvals_only=False):
    r"""
    Return the eigenvalues and eigenvectors of the covariance matrix of `array`, satisfying differential privacy.

    Paper link: http://papers.nips.cc/paper/9567-differentially-private-covariance-estimation.pdf

    Parameters
    ----------
    array : array-like, shape (n_samples, n_features)
        Matrix for which the covariance matrix is sought.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    norm : float, optional
        The max l2 norm of any row of the input array.  This defines the spread of data that will be protected by
        differential privacy.

        If not specified, the max norm is taken from the data, but will result in a :class:`.PrivacyLeakWarning`, as it
        reveals information about the data.  To preserve differential privacy fully, `norm` should be selected
        independently of the data, i.e. with domain knowledge.

    dims : int, optional
        Number of eigenvectors to return.  If `None`, return all eigenvectors.

    eigvals_only : bool, default: False
        Only return the eigenvalue estimates.  If True, all the privacy budget is spent on estimating the eigenvalues.

    Returns
    -------
    w : (n_features) array
        The eigenvalues, each repeated according to its multiplicity.

    v : (n_features, dims) array
        The normalized (unit "length") eigenvectors, such that the column ``v[:,i]`` is the eigenvector corresponding to
        the eigenvalue ``w[i]``.

    """

    n_features = array.shape[1]
    dims = n_features if dims is None else min(dims, n_features)
    if not isinstance(dims, Integral):
        raise TypeError(
            "Number of requested dimensions must be integer-valued, got %s" %
            type(dims))
    if dims < 0:
        raise ValueError(
            "Number of requested dimensions must be non-negative, got %d" %
            dims)

    max_norm = np.linalg.norm(array, axis=1).max()
    if norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the data provided.  This will result "
            "in additional privacy leakage. To ensure differential privacy and no additional privacy "
            "leakage, specify `data_norm` at initialisation.",
            PrivacyLeakWarning)
        norm = max_norm
    elif max_norm > norm and not np.isclose(max_norm, norm):
        raise ValueError(
            "Rows of input array must have l2 norm of at most %f, got %f" %
            (norm, max_norm))

    cov = array.T.dot(array) / (norm**2)
    eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1]
    epsilon_0 = epsilon if eigvals_only else epsilon / (dims + (dims != n_features))

    mech_eigvals = LaplaceBoundedDomain(epsilon=epsilon_0,
                                        lower=0,
                                        upper=float("inf"),
                                        sensitivity=2)
    noisy_eigvals = np.array(
        [mech_eigvals.randomise(eigval) for eigval in eigvals]) * (norm**2)

    if eigvals_only:
        return noisy_eigvals

    # When estimating all eigenvectors, we don't need to spend budget for the dth vector
    epsilon_i = epsilon / (dims + (dims != n_features))
    cov_i = cov
    proj_i = np.eye(n_features)

    theta = np.zeros((0, n_features))
    mech_cov = Bingham(epsilon=epsilon_i)

    for _ in range(dims):
        if cov_i.size > 1:
            u_i = mech_cov.randomise(cov_i)
        else:
            u_i = np.ones((1, ))

        theta_i = proj_i.T.dot(u_i)
        theta = np.vstack((theta, theta_i))

        if cov_i.size > 1:
            proj_i = null_space(theta).T
            cov_i = proj_i.dot(cov).dot(proj_i.T)

    return noisy_eigvals, theta.T
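A hedged end-to-end sketch of calling covariance_eig as defined above. The function lives inside diffprivlib's PCA model code, so the exact import path is an assumption and is left commented out.

import numpy as np
# Assumed import path; covariance_eig is the function defined directly above.
# from diffprivlib.models.pca import covariance_eig

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
X /= np.linalg.norm(X, axis=1, keepdims=True)  # every row now has l2 norm 1

# norm=1 is supplied up front, so the PrivacyLeakWarning branch is skipped.
w, v = covariance_eig(X, epsilon=2.0, norm=1.0, dims=2)
print(w.shape, v.shape)  # (3,) eigenvalues, (3, 2) eigenvectors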
Example 7
class TestLaplaceBoundedDomain(TestCase):
    def setup_method(self, method):
        if method.__name__.endswith("prob"):
            global_seed(314159)

        self.mech = LaplaceBoundedDomain()

    def teardown_method(self, method):
        del self.mech

    def test_not_none(self):
        self.assertIsNotNone(self.mech)

    def test_class(self):
        from diffprivlib.mechanisms import DPMechanism
        self.assertTrue(issubclass(LaplaceBoundedDomain, DPMechanism))

    def test_no_params(self):
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_sensitivity(self):
        self.mech.set_epsilon(1).set_bounds(0, 1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_epsilon(self):
        self.mech.set_sensitivity(1).set_bounds(0, 1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_complex_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon(1 + 2j)

    def test_string_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon("Two")

    def test_inf_epsilon(self):
        self.mech.set_sensitivity(1).set_epsilon(float("inf")).set_bounds(
            0, 10)

        for i in range(1000):
            self.assertEqual(self.mech.randomise(1), 1)

    def test_no_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_non_numeric(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        with self.assertRaises(TypeError):
            self.mech.randomise("Hello")

    def test_zero_median_prob(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        vals = []

        for i in range(10000):
            vals.append(self.mech.randomise(0.5))

        median = float(np.median(vals))
        self.assertAlmostEqual(np.abs(median), 0.5, delta=0.1)

    def test_neighbors_prob(self):
        epsilon = 1
        runs = 10000
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        count = [0, 0]

        for i in range(runs):
            val0 = self.mech.randomise(0)
            if val0 <= 0.5:
                count[0] += 1

            val1 = self.mech.randomise(1)
            if val1 <= 0.5:
                count[1] += 1

        self.assertGreater(count[0], count[1])
        self.assertLessEqual(count[0] / runs,
                             np.exp(epsilon) * count[1] / runs + 0.1)

    def test_within_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        vals = []

        for i in range(1000):
            vals.append(self.mech.randomise(0.5))

        vals = np.array(vals)

        self.assertTrue(np.all(vals >= 0))
        self.assertTrue(np.all(vals <= 1))

    def test_semi_inf_domain_inf_epsilon(self):
        self.mech.set_epsilon(float("inf")).set_sensitivity(1).set_bounds(
            0.0, float("inf"))

        with pytest.warns(None) as w:
            self.assertIsNotNone(self.mech.randomise(0))

        self.assertFalse(w, "Warning thrown for LaplaceBoundedDomain")
Example 8
    def setup_method(self, method):
        if method.__name__.endswith("prob"):
            global_seed(314159)

        self.mech = LaplaceBoundedDomain()
Example 9
    def _update_mean_variance(self,
                              n_past,
                              mu,
                              var,
                              X,
                              sample_weight=None,
                              n_noisy=None):
        """Compute online update of Gaussian mean and variance.

        Given a starting sample count, mean, and variance, and a new set of points X, return the updated mean and
        variance.  (NB: each dimension (column) in X is treated as independent, so you get variances, not a
        covariance matrix.)

        Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of
        independent Gaussians.

        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

        Parameters
        ----------
        n_past : int
            Number of samples represented in old mean and variance.  If sample weights were given, this should contain
            the sum of sample weights represented in old mean and variance.

        mu : array-like, shape (number of Gaussians,)
            Means for Gaussians in original set.

        var : array-like, shape (number of Gaussians,)
            Variances for Gaussians in original set.

        X : array-like, shape (n_samples, n_features)
            New data points on which to compute the update.

        sample_weight : ignored
            Ignored in diffprivlib.

        n_noisy : int, optional
            Noisy count of the given class, satisfying differential privacy.

        Returns
        -------
        total_mu : array-like, shape (number of Gaussians,)
            Updated mean for each Gaussian over the combined set.

        total_var : array-like, shape (number of Gaussians,)
            Updated variance for each Gaussian over the combined set.
        """
        if n_noisy is None:
            warnings.warn(
                "Noisy class count has not been specified and will be read from the data. To use this "
                "method correctly, make sure it is run by the parent GaussianNB class.",
                PrivacyLeakWarning)
            n_noisy = X.shape[0]

        if not n_noisy:
            return mu, var

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        # Split epsilon between each feature, using 1/3 of total budget for each of mean and variance
        n_features = X.shape[1]
        local_epsilon = self.epsilon / 3 / n_features

        new_mu = np.zeros((n_features, ))
        new_var = np.zeros((n_features, ))

        for feature in range(n_features):
            _X = X[:, feature]
            lower, upper = self.bounds[0][feature], self.bounds[1][feature]
            local_diameter = upper - lower

            mech_mu = LaplaceTruncated(epsilon=local_epsilon,
                                       delta=0,
                                       sensitivity=local_diameter,
                                       lower=lower * n_noisy,
                                       upper=upper * n_noisy)
            _mu = mech_mu.randomise(_X.sum()) / n_noisy

            local_sq_sens = max(_mu - lower, upper - _mu)**2
            mech_var = LaplaceBoundedDomain(epsilon=local_epsilon,
                                            delta=0,
                                            sensitivity=local_sq_sens,
                                            lower=0,
                                            upper=local_sq_sens * n_noisy)
            _var = mech_var.randomise(((_X - _mu)**2).sum()) / n_noisy

            new_mu[feature] = _mu
            new_var[feature] = _var

        if n_past == 0:
            return new_mu, new_var

        n_total = float(n_past + n_noisy)

        # Combine mean of old and new data, taking into consideration
        # (weighted) number of observations
        total_mu = (n_noisy * new_mu + n_past * mu) / n_total

        # Combine variance of old and new data, taking into consideration
        # (weighted) number of observations. This is achieved by combining
        # the sum-of-squared-differences (ssd)
        old_ssd = n_past * var
        new_ssd = n_noisy * new_var
        total_ssd = old_ssd + new_ssd + (n_past / float(n_noisy * n_total)) * (
            n_noisy * mu - n_noisy * new_mu)**2
        total_var = total_ssd / n_total

        return total_mu, total_var
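The non-private core of this update is the Chan, Golub and LeVeque recurrence cited in the docstring: combining the two partitions' counts, means and variances must reproduce the statistics of the concatenated data. A minimal sketch verifying that identity with plain numpy (no noise):

import numpy as np

rng = np.random.default_rng(7)
old, new = rng.normal(size=50), rng.normal(size=30)
n_past, n_new = old.size, new.size
mu, var = old.mean(), old.var()
new_mu, new_var = new.mean(), new.var()

n_total = n_past + n_new
total_mu = (n_new * new_mu + n_past * mu) / n_total

# Combine sums of squared differences exactly as in the method above.
total_ssd = (n_past * var + n_new * new_var
             + (n_past / (n_new * n_total)) * (n_new * mu - n_new * new_mu) ** 2)
total_var = total_ssd / n_total

both = np.concatenate([old, new])
assert np.isclose(total_mu, both.mean()) and np.isclose(total_var, both.var())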
Example 10
def var(a,
        epsilon=1.0,
        range=None,
        axis=None,
        dtype=None,
        out=None,
        ddof=0,
        keepdims=np._NoValue):
    r"""
    Compute the differentially private variance along the specified axis.

    Returns the variance of the array elements, a measure of the spread of a distribution, with differential privacy.
    The variance is computed over the flattened array by default, otherwise over the specified axis. Noise is added
    using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is calculated using
    `range`. Users are advised to consult the documentation of :obj:`numpy.var` for further details, as the behaviour
    of `var` closely follows its Numpy variant.

    Parameters
    ----------
    a : array_like
        Array containing numbers whose variance is desired.  If `a` is not an array, a conversion is attempted.

    epsilon : float
        Privacy parameter :math:`\epsilon`.

    range : array_like
        Range of each value returned by ``np.var(a)``; must have the same shape as that output.

    axis : None or int or tuple of ints, optional
        Axis or axes along which the variance is computed.  The default is to compute the variance of the flattened
        array.

        If this is a tuple of ints, a variance is performed over multiple axes, instead of a single axis or all the axes
        as before.

    dtype : data-type, optional
        Type to use in computing the variance.  For arrays of integer type the default is `float32`; for arrays of float
        types it is the same as the array type.

    out : ndarray, optional
        Alternate output array in which to place the result.  It must have the same shape as the expected output, but
        the type is cast if necessary.

    ddof : int, optional
        "Delta Degrees of Freedom": the divisor used in the calculation is ``N - ddof``, where ``N`` represents the
        number of elements. By default `ddof` is zero.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this
        option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `var` method of sub-classes of
        `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    Returns
    -------
    variance : ndarray, see dtype parameter above
        If ``out=None``, returns a new array containing the variance; otherwise, a reference to the output array is
        returned.

    See Also
    --------
    std , mean

    """
    if isinstance(axis, tuple):
        temp_axis = axis
    elif axis is not None:
        try:
            temp_axis = tuple(axis)
        except TypeError:
            temp_axis = (axis, )
    else:
        temp_axis = tuple(_range(len(a.shape)))

    num_datapoints = 1
    for i in temp_axis:
        num_datapoints *= a.shape[i]

    actual_var = np.var(a,
                        axis=axis,
                        dtype=dtype,
                        out=out,
                        ddof=ddof,
                        keepdims=keepdims)

    if range is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)

        ranges = np.maximum(np.max(a, axis=axis) - np.min(a, axis=axis), 1e-5)
    elif isinstance(range, Real):
        ranges = np.ones_like(actual_var) * range
    else:
        ranges = np.array(range)

    if not (ranges > 0).all():
        raise ValueError(
            "Ranges must be specified for each value returned by np.var(), and must be positive"
        )
    if ranges.shape != actual_var.shape:
        raise ValueError("Shape of range must be same as shape of np.var()")

    if isinstance(actual_var, np.ndarray):
        # Extra np.array() a temporary fix for PyLint bug: https://github.com/PyCQA/pylint/issues/2747
        dp_var = np.array(np.zeros_like(actual_var))
        iterator = np.nditer(actual_var, flags=['multi_index'])

        while not iterator.finished:
            dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf"))\
                .set_sensitivity((ranges[iterator.multi_index] / num_datapoints) ** 2 * (num_datapoints - 1))

            dp_var[iterator.multi_index] = dp_mech.randomise(float(
                iterator[0]))
            iterator.iternext()

        return dp_var

    range = np.ravel(ranges)[0]
    dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")).\
        set_sensitivity(range ** 2 / num_datapoints)

    return dp_mech.randomise(actual_var)
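A usage sketch for this range-based signature (the values are hypothetical; note that the `range` keyword shadows the builtin, which is why the function uses the `_range` alias internally):

import numpy as np

a = np.random.default_rng(3).uniform(0, 10, size=(100, 4))

# A scalar range of 10 is broadcast to every output value; with axis=0 the
# output matches np.var(a, axis=0) in shape.
dp_per_column = var(a, epsilon=1.0, range=10.0, axis=0)
dp_overall = var(a, epsilon=1.0, range=10.0)
print(dp_per_column.shape, float(dp_overall))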
Example 11
def _var(a,
         epsilon=1.0,
         range=None,
         axis=None,
         dtype=None,
         out=None,
         ddof=0,
         keepdims=np._NoValue,
         nan=False):
    if isinstance(axis, tuple):
        temp_axis = axis
    elif axis is not None:
        try:
            temp_axis = tuple(axis)
        except TypeError:
            temp_axis = (axis, )
    else:
        temp_axis = tuple(_range(len(a.shape)))

    num_datapoints = 1
    for i in temp_axis:
        num_datapoints *= a.shape[i]

    if nan:
        actual_var = np.nanvar(a,
                               axis=axis,
                               dtype=dtype,
                               out=out,
                               ddof=ddof,
                               keepdims=keepdims)
    else:
        actual_var = np.var(a,
                            axis=axis,
                            dtype=dtype,
                            out=out,
                            ddof=ddof,
                            keepdims=keepdims)

    if range is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)

        ranges = np.maximum(np.ptp(a, axis=axis), 1e-5)
    elif isinstance(range, Real):
        ranges = np.ones_like(actual_var) * range
    else:
        ranges = np.array(range)

    if not (ranges > 0).all():
        raise ValueError(
            "Ranges must be specified for each value returned by np.var(), and must be positive"
        )
    if ranges.shape != actual_var.shape:
        raise ValueError("Shape of range must be same as shape of np.var()")

    if isinstance(actual_var, np.ndarray):
        dp_var = np.zeros_like(actual_var)
        iterator = np.nditer(actual_var, flags=['multi_index'])

        while not iterator.finished:
            dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
                .set_sensitivity((ranges[iterator.multi_index] / num_datapoints) ** 2 * (num_datapoints - 1))

            dp_var[iterator.multi_index] = dp_mech.randomise(float(
                iterator[0]))
            iterator.iternext()

        return dp_var

    range = np.ravel(ranges)[0]
    dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")). \
        set_sensitivity(range ** 2 / num_datapoints)

    return dp_mech.randomise(actual_var)
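The only behavioural difference from the previous example's core is the `nan` flag, which routes through np.nanvar. A quick sketch, calling the private helper directly (which is an assumption about its intended use):

import numpy as np

a = np.array([1.0, 2.0, np.nan, 4.0])

# nan=True ignores the NaN when computing the true variance, instead of
# propagating it into the noisy output.
print(_var(a, epsilon=1.0, range=3.0, nan=True))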