Code example #1
import numpy as np
import pandas as pd
from diffprivlib.mechanisms import Laplace  # older chainable diffprivlib API

# `utils` below is the source project's own helper module (it provides the
# non-private joint_distribution() used as the starting point).


def dp_joint_distribution(X, epsilon=1.0, range=None):
    """Represent data as a differentially private joint distribution of all attributes in input X"""
    # if range is None:
    #     warnings.warn("Range parameter has not been specified. Falling back to taking range from the data.\n"
    #                   "To ensure differential privacy, and no additional privacy leakage, the range must be "
    #                   "specified independently of the data (i.e., using domain knowledge).", PrivacyLeakWarning)
    # todo evaluate range privacy leakage

    joint_distribution_ = utils.joint_distribution(X)

    # removing one record from X decreases the probability in one cell of the
    # joint distribution by ~1/n and increases the remaining cells by a total
    # of ~1/n, giving an L1 sensitivity of 2/n
    sensitivity = 2 / X.shape[0]

    dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(sensitivity)

    dp_joint_distribution_ = np.zeros_like(joint_distribution_.values)

    for i in np.arange(dp_joint_distribution_.shape[0]):
        dp_joint_distribution_[i] = dp_mech.randomise(
            joint_distribution_.values[i])

    # laplacian_noise = np.random.laplace(0, scale=sensitivity / epsilon, size=joint_distribution_.shape[0])
    # dp_joint_distribution = joint_distribution_ + laplacian_noise

    # noise can result in negative probabilities, so clip at 0 and re-normalise
    dp_joint_distribution_[dp_joint_distribution_ < 0] = 0
    # dp_joint_distribution_ = np.clip(dp_joint_distribution_, a_min=0, a_max=None)
    dp_joint_distribution_ = dp_joint_distribution_ / dp_joint_distribution_.sum()
    return pd.Series(dp_joint_distribution_, index=joint_distribution_.index)
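
A minimal self-contained sketch of the same mechanism, using the direct numpy route that the commented-out lines above hint at (the data and names here are illustrative, not from the source project):

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "age_group": ["young", "old", "young", "old"],
    "smoker":    ["yes", "no", "no", "no"],
})
epsilon = 0.5

# joint distribution over all attribute combinations, one probability per cell
joint = X.value_counts(normalize=True)

# Laplace noise with scale = sensitivity / epsilon, where sensitivity = 2 / n
noisy = joint + np.random.laplace(0, (2 / len(X)) / epsilon, size=len(joint))

# clip negatives and re-normalise into a valid probability distribution
noisy = noisy.clip(lower=0)
noisy /= noisy.sum()
print(noisy)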
Code example #2
def dp_contingency_table(X, epsilon=1.0, range=None):
    """Represent data as a differentiall private contingency table of all attributes"""
    # if range is None:
    #     warnings.warn("Range parameter has not been specified. Falling back to taking range from the data.\n"
    #                   "To ensure differential privacy, and no additional privacy leakage, the range must be "
    #                   "specified independently of the data (i.e., using domain knowledge).", PrivacyLeakWarning)
    # todo evaluate range privacy leakage

    contingency_table_ = utils.contingency_table(X)

    # sensitivity is as for a histogram: removing one record from X decreases
    # the count in exactly one cell by 1
    sensitivity = 1
    # todo evaluate set_bound and geometric mechanism
    # dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(1).set_bounds(0, maxsize)
    dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(sensitivity)

    dp_contingency_table = np.zeros_like(contingency_table_.values)

    for i in np.arange(dp_contingency_table.shape[0]):
        # round counts upwards so bins with a small positive noisy count
        # survive as 1 instead of rounding down to 0
        dp_contingency_table[i] = np.ceil(
            dp_mech.randomise(contingency_table_.values[i]))

    # noise can result in negative counts, so clip at 0
    # dp_contingency_table[dp_contingency_table < 0] = 0
    dp_contingency_table = np.clip(dp_contingency_table, a_min=0, a_max=None)
    return pd.Series(dp_contingency_table, index=contingency_table_.index)
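
The same pattern with raw counts instead of probabilities, again as a self-contained illustrative sketch: per-cell sensitivity is 1, the noisy counts are rounded up with np.ceil and clipped at zero, and no re-normalisation is applied since these are counts, not probabilities.

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "age_group": ["young", "old", "young", "old"],
    "smoker":    ["yes", "no", "no", "no"],
})
epsilon = 1.0

counts = X.value_counts()  # contingency table as a Series, one count per cell
noisy = np.ceil(counts + np.random.laplace(0, 1 / epsilon, size=len(counts)))
noisy = noisy.clip(lower=0)
print(noisy)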
Code example #3
File: naive_bayes.py (project: Cowboycommit/SIT1719)
    def _randomise(self, mean, var, n_samples):
        """Randomises the learned means and variances subject to differential privacy."""
        features = var.shape[0]

        local_epsilon = self.epsilon / 2
        local_epsilon /= features

        if len(self.bounds) != features:
            raise ValueError(
                "Bounds must be specified for each feature dimension")

        new_mu = np.zeros_like(mean)
        new_var = np.zeros_like(var)

        for feature in range(features):
            local_diameter = self.bounds[feature][1] - self.bounds[feature][0]
            mech_mu = Laplace().set_sensitivity(
                local_diameter / n_samples).set_epsilon(local_epsilon)
            mech_var = LaplaceBoundedDomain().set_sensitivity((n_samples - 1) * local_diameter ** 2 / n_samples ** 2)\
                .set_epsilon(local_epsilon).set_bounds(0, float("inf"))

            new_mu[feature] = mech_mu.randomise(mean[feature])
            new_var[feature] = mech_var.randomise(var[feature])

        return new_mu, new_var
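
The two divisions of self.epsilon implement a straightforward sequential-composition budget split: half of the total budget goes to the means and half to the variances, with each half shared equally across the features. A standalone arithmetic check with illustrative values:

epsilon, n_features = 1.0, 4

local_epsilon = epsilon / 2 / n_features    # budget per mechanism: 0.125
n_mechanisms = 2 * n_features               # one mean + one variance per feature
total_spend = local_epsilon * n_mechanisms  # sums back to epsilon
assert abs(total_spend - epsilon) < 1e-12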
Code example #4
def dp_marginal_distribution(X, epsilon=1.0, range=None):
    """Represent data as a differentially private 1-way marginal distribution."""
    assert len(X.shape) == 1, 'can only do 1-way marginal distribution; see contingency table or ' \
                              'joint distribution for higher dimensions'
    marginal = X.value_counts(normalize=True, dropna=False)

    # removing one record from X decreases the probability in one cell of the
    # marginal distribution by ~1/n and increases the remaining cells by a
    # total of ~1/n, giving an L1 sensitivity of 2/n
    sensitivity = 2 / X.shape[0]

    dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(sensitivity)
    dp_marginal = np.zeros_like(marginal.values)

    for i in np.arange(dp_marginal.shape[0]):
        # add Laplace noise to each cell of the marginal distribution
        dp_marginal[i] = dp_mech.randomise(marginal.values[i])

    # noise can result in negative probabilities, so clip at 0 and re-normalise
    dp_marginal = np.clip(dp_marginal, a_min=0, a_max=None)
    dp_marginal = dp_marginal / dp_marginal.sum()

    return pd.Series(dp_marginal, index=marginal.index)
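
A usage sketch for the function above, assuming import numpy as np, import pandas as pd, and the older chainable diffprivlib Laplace API (set_epsilon / set_sensitivity) used throughout these examples:

col = pd.Series(["a", "b", "a", "a", "c", "b", "a"])
dp_marg = dp_marginal_distribution(col, epsilon=0.5)
print(dp_marg)        # noisy, clipped, re-normalised probability per category
print(dp_marg.sum())  # exactly 1.0 after re-normalisation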
Code example #5
    def _randomise(self, mu, var, n_samples):
        """Randomises the learned means and variances subject to differential privacy."""
        features = var.shape[0]

        local_epsilon = self.epsilon / 2
        local_epsilon /= features

        if len(self.bounds) != features:
            raise ValueError("Bounds must be specified for each feature dimension")

        # Extra np.array() a temporary fix for PyLint bug: https://github.com/PyCQA/pylint/issues/2747
        new_mu = np.array(np.zeros_like(mu))
        new_var = np.array(np.zeros_like(var))

        for feature in range(features):
            local_diameter = self.bounds[feature][1] - self.bounds[feature][0]
            mech_mu = Laplace().set_sensitivity(local_diameter / n_samples).set_epsilon(local_epsilon)
            mech_var = LaplaceBoundedDomain().set_sensitivity((n_samples - 1) * local_diameter ** 2 / n_samples ** 2)\
                .set_epsilon(local_epsilon).set_bounds(0, float("inf"))

            new_mu[feature] = mech_mu.randomise(mu[feature])
            new_var[feature] = mech_var.randomise(var[feature])

        return new_mu, new_var
Code example #6
class TestLaplace(TestCase):
    def setup_method(self, method):
        if method.__name__.endswith("prob"):
            global_seed(314159)

        self.mech = Laplace()

    def teardown_method(self, method):
        del self.mech

    def test_not_none(self):
        self.assertIsNotNone(self.mech)

    def test_class(self):
        from diffprivlib.mechanisms import DPMechanism
        self.assertTrue(issubclass(Laplace, DPMechanism))

    def test_no_params(self):
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_sensitivity(self):
        self.mech.set_epsilon(1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_neg_sensitivity(self):
        self.mech.set_epsilon(1)

        with self.assertRaises(ValueError):
            self.mech.set_sensitivity(-1)

    def test_str_sensitivity(self):
        self.mech.set_epsilon(1)

        with self.assertRaises(TypeError):
            self.mech.set_sensitivity("1")

    def test_zero_sensitivity(self):
        self.mech.set_sensitivity(0).set_epsilon(1)

        for i in range(1000):
            self.assertAlmostEqual(self.mech.randomise(1), 1)

    def test_no_epsilon(self):
        self.mech.set_sensitivity(1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_neg_epsilon(self):
        self.mech.set_sensitivity(1)
        with self.assertRaises(ValueError):
            self.mech.set_epsilon(-1)

    def test_inf_epsilon(self):
        self.mech.set_sensitivity(1).set_epsilon(float("inf"))

        for i in range(1000):
            self.assertAlmostEqual(self.mech.randomise(1), 1)

    def test_complex_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon(1 + 2j)

    def test_string_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon("Two")

    def test_repr(self):
        repr_ = repr(self.mech.set_epsilon(1).set_sensitivity(1))
        self.assertIn(".Laplace(", repr_)

    def test_zero_epsilon_with_delta(self):
        self.mech.set_sensitivity(1).set_epsilon_delta(0, 0.5)
        self.assertIsNotNone(self.mech.randomise(1))

    def test_epsilon_delta(self):
        self.mech.set_sensitivity(1).set_epsilon_delta(1, 0.01)
        self.assertIsNotNone(self.mech.randomise(1))

    def test_non_numeric(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        with self.assertRaises(TypeError):
            self.mech.randomise("Hello")

    def test_zero_median_prob(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        vals = []

        for i in range(10000):
            vals.append(self.mech.randomise(0))

        median = float(np.median(vals))
        self.assertAlmostEqual(np.abs(median), 0.0, delta=0.1)

    def test_neighbours_prob(self):
        epsilon = 1
        runs = 10000
        self.mech.set_sensitivity(1).set_epsilon(epsilon)
        count = [0, 0]

        for i in range(runs):
            val0 = self.mech.randomise(0)
            if val0 <= 0:
                count[0] += 1

            val1 = self.mech.randomise(1)
            if val1 <= 0:
                count[1] += 1

        self.assertGreater(count[0], count[1])
        self.assertLessEqual(count[0] / runs,
                             np.exp(epsilon) * count[1] / runs + 0.1)

    def test_bias(self):
        self.assertEqual(0.0, self.mech.get_bias(0))

    def test_variance(self):
        with self.assertRaises(ValueError):
            self.mech.get_variance(1)

        self.mech.set_epsilon(1).set_sensitivity(1)
        self.assertEqual(2.0, self.mech.get_variance(0))
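
test_neighbours_prob above is an empirical check of the defining inequality of epsilon-differential privacy, Pr[M(0) in S] <= exp(epsilon) * Pr[M(1) in S], for the neighbouring inputs 0 and 1 and the set S = (-inf, 0]. A standalone version of the same check in plain numpy, without diffprivlib:

import numpy as np

rng = np.random.default_rng(314159)
epsilon, sensitivity, runs = 1.0, 1.0, 10000
scale = sensitivity / epsilon

p0 = np.mean(rng.laplace(0, scale, runs) <= 0)  # Pr[M(0) <= 0]
p1 = np.mean(rng.laplace(1, scale, runs) <= 0)  # Pr[M(1) <= 0]
assert p0 <= np.exp(epsilon) * p1 + 0.1         # same slack as the unit test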
Code example #7
def mean(a,
         epsilon=1.0,
         range=None,
         axis=None,
         dtype=None,
         out=None,
         keepdims=np._NoValue):
    r"""
    Compute the differentially private arithmetic mean along the specified axis.

    Returns the average of the array elements with differential privacy.  The average is taken over the flattened array
    by default, otherwise over the specified axis. Noise is added using :class:`.Laplace` to satisfy differential
    privacy, where sensitivity is calculated using `range`.  Users are advised to consult the documentation of
    :obj:`numpy.mean` for further details, as the behaviour of `mean` closely follows its NumPy variant.

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an array, a conversion is attempted.

    epsilon : float
        Privacy parameter :math:`\epsilon`.

    range : array_like
        Range of each dimension of the returned mean. Same shape as the output of `np.mean(a)`.

    axis : None or int or tuple of ints, optional
        Axis or axes along which the means are computed. The default is to compute the mean of the flattened array.

        If this is a tuple of ints, a mean is performed over multiple axes, instead of a single axis or all the axes as
        before.

    dtype : data-type, optional
        Type to use in computing the mean.  For integer inputs, the default is `float64`; for floating point inputs, it
        is the same as the input dtype.

    out : ndarray, optional
        Alternate output array in which to place the result.  The default is ``None``; if provided, it must have the
        same shape as the expected output, but the type will be cast if necessary.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this
        option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `mean` method of sub-classes
        of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    Returns
    -------
    m : ndarray, see dtype parameter above
        If `out=None`, returns a new array containing the mean values, otherwise a reference to the output array is
        returned.

    See Also
    --------
    std, var

    """
    if isinstance(axis, tuple):
        temp_axis = axis
    elif axis is not None:
        try:
            temp_axis = tuple(axis)
        except TypeError:
            temp_axis = (axis, )
    else:
        temp_axis = tuple(_range(len(a.shape)))

    num_datapoints = 1
    for i in temp_axis:
        num_datapoints *= a.shape[i]

    actual_mean = np.mean(a,
                          axis=axis,
                          dtype=dtype,
                          out=out,
                          keepdims=keepdims)

    if range is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)

        ranges = np.maximum(np.max(a, axis=axis) - np.min(a, axis=axis), 1e-5)
    elif isinstance(range, Real):
        ranges = np.ones_like(actual_mean) * range
    else:
        ranges = np.array(range)

    if not (ranges > 0).all():
        raise ValueError(
            "Ranges must be specified for each value returned by np.mean(), and must be positive"
        )
    if ranges.shape != actual_mean.shape:
        raise ValueError("Shape of range must match the shape of the output of np.mean()")

    if isinstance(actual_mean, np.ndarray):
        # Extra np.array() a temporary fix for PyLint bug: https://github.com/PyCQA/pylint/issues/2747
        dp_mean = np.array(np.zeros_like(actual_mean))
        iterator = np.nditer(actual_mean, flags=['multi_index'])

        while not iterator.finished:
            dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(
                ranges[iterator.multi_index] / num_datapoints)

            dp_mean[iterator.multi_index] = dp_mech.randomise(
                float(iterator[0]))
            iterator.iternext()

        return dp_mean

    range = np.ravel(ranges)[0]
    dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(range /
                                                             num_datapoints)

    return dp_mech.randomise(actual_mean)
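
A usage sketch, assuming the function above and its imports are in scope. Supplying range explicitly keeps the sensitivity (range / n) independent of the data, which avoids the PrivacyLeakWarning fallback:

import numpy as np

a = np.array([[1.0, 5.0], [3.0, 9.0]])
overall = mean(a, epsilon=1.0, range=10)                    # scalar DP mean of the flattened array
per_col = mean(a, epsilon=1.0, range=(10.0, 10.0), axis=0)  # one DP mean per column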
Code example #8
File: utils.py (project: Cowboycommit/SIT1719)
def _mean(a,
          epsilon=1.0,
          range=None,
          axis=None,
          dtype=None,
          out=None,
          keepdims=np._NoValue,
          nan=False):
    if isinstance(axis, tuple):
        temp_axis = axis
    elif axis is not None:
        try:
            temp_axis = tuple(axis)
        except TypeError:
            temp_axis = (axis, )
    else:
        temp_axis = tuple(_range(len(a.shape)))

    num_datapoints = 1
    for i in temp_axis:
        num_datapoints *= a.shape[i]

    if nan:
        actual_mean = np.nanmean(a,
                                 axis=axis,
                                 dtype=dtype,
                                 out=out,
                                 keepdims=keepdims)
    else:
        actual_mean = np.mean(a,
                              axis=axis,
                              dtype=dtype,
                              out=out,
                              keepdims=keepdims)

    if range is None:
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)

        ranges = np.maximum(np.ptp(a, axis=axis), 1e-5)
    elif isinstance(range, Real):
        ranges = np.ones_like(actual_mean) * range
    else:
        ranges = np.array(range)

    if not (ranges > 0).all():
        raise ValueError(
            "Ranges must be specified for each value returned by np.mean(), and must be positive"
        )
    if ranges.shape != actual_mean.shape:
        raise ValueError("Shape of range must match the shape of the output of np.mean()")

    if isinstance(actual_mean, np.ndarray):
        dp_mean = np.zeros_like(actual_mean)
        iterator = np.nditer(actual_mean, flags=['multi_index'])

        while not iterator.finished:
            dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(
                ranges[iterator.multi_index] / num_datapoints)

            dp_mean[iterator.multi_index] = dp_mech.randomise(
                float(iterator[0]))
            iterator.iternext()

        return dp_mean

    range = np.ravel(ranges)[0]
    dp_mech = Laplace().set_epsilon(epsilon).set_sensitivity(range /
                                                             num_datapoints)

    return dp_mech.randomise(actual_mean)
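
A usage sketch for the nan variant (again assuming the function above and its imports are in scope): nan=True switches the underlying aggregation to np.nanmean, so missing values are ignored before the Laplace noise is added. Note that num_datapoints is still computed from the array's shape, so the NaN entry counts toward n in the sensitivity.

import numpy as np

a = np.array([1.0, np.nan, 3.0])
dp = _mean(a, epsilon=1.0, range=5.0, nan=True)  # noisy mean of [1.0, 3.0]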