Beispiel #1
0
def _dp_histograms(hist_dict, feature_list, epsilon):
    """Return a differentially private copy of a set of per-feature histograms.

    The ``n_pos`` and ``n_neg`` counts of every bin belonging to a feature in `feature_list` are replaced by
    the output of a ``GeometricTruncated`` mechanism (sensitivity 1, bounds ``[0, maxsize]``).  Bins whose two
    noisy counts are both zero are then removed.  `hist_dict` is left untouched: all writes go to a deep copy.

    Parameters
    ----------
    hist_dict : dict
        Maps ``f"{f_idx}"`` to a sequence of bins; each bin is a mapping holding ``"n_pos"`` and ``"n_neg"``.

    feature_list : iterable
        Identifiers of the features whose histograms are randomised.  Entries of `hist_dict` that are not
        listed here are copied through unchanged.

    epsilon : float
        Privacy parameter handed to the mechanism.  It is not validated here; the caller is expected to supply
        a valid value (>= 0).

    Returns
    -------
    dict
        The deep copy of `hist_dict` carrying the noisy, filtered histograms.

    """
    private_hists = copy.deepcopy(hist_dict)

    mechanism = (
        GeometricTruncated()
        .set_epsilon(epsilon)
        .set_sensitivity(1)
        .set_bounds(0, maxsize)
    )

    for f_idx in feature_list:
        key = f"{f_idx}"
        noisy_bins = private_hists[key]

        # Read the true counts from the untouched input and write the noisy ones into the copy
        for position, true_bin in enumerate(hist_dict[key]):
            noisy_bins[position]["n_pos"] = mechanism.randomise(int(true_bin["n_pos"]))
            noisy_bins[position]["n_neg"] = mechanism.randomise(int(true_bin["n_neg"]))

        # Keep only the bins that still hold at least one non-zero count
        private_hists[key] = [
            noisy_bin for noisy_bin in noisy_bins
            if noisy_bin["n_pos"] != 0 or noisy_bin["n_neg"] != 0
        ]

    return private_hists
Beispiel #2
0
    def _noisy_class_counts(self, y):
        """Return noisy per-class counts that add up to the number of samples.

        Each class count is randomised with a ``GeometricTruncated`` mechanism (bounds ``[1, n_samples]``) that
        uses one third of ``self.epsilon``.  The noisy counts are then nudged one unit at a time, each staying
        within ``[1, n_samples]``, until their sum equals ``y.shape[0]``.

        Parameters
        ----------
        y : array
            Class labels; must support ``.shape`` and element-wise ``==``.

        Returns
        -------
        numpy.ndarray
            One count per entry of ``np.unique(y)``, in that order.

        """
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples = y.shape[0]

        # One third of the total epsilon budget is spent on the class counts
        mechanism = GeometricTruncated(epsilon=self.epsilon / 3, sensitivity=1, lower=1, upper=n_samples)
        noisy_counts = np.array([mechanism.randomise((y == label).sum()) for label in classes])

        # The ascending order is computed once, on the counts as first drawn, and not refreshed afterwards
        ascending = np.argsort(noisy_counts)

        # Too many: start shrinking at the smallest count.  Too few: start growing at the largest.
        if noisy_counts.sum() > n_samples:
            cursor = 0
        else:
            cursor = n_classes - 1

        while True:
            deficit = n_samples - noisy_counts.sum()
            if deficit == 0:
                break

            step = np.sign(deficit)
            target = ascending[cursor]
            noisy_counts[target] = np.clip(noisy_counts[target] + step, 1, n_samples)

            # Walk upwards through the ordering when shrinking, downwards when growing (wrapping around)
            cursor = (cursor - step) % n_classes

        return noisy_counts
Beispiel #3
0
def histogram(sample,
              epsilon=1.0,
              bins=10,
              range=None,
              weights=None,
              density=None,
              accountant=None,
              **unused_args):
    r"""
    Compute a differentially private histogram of a set of data.

    The true histogram is obtained from :obj:`numpy.histogram`; every bin value is then converted to an integer
    and randomised with :class:`.GeometricTruncated` (sensitivity 1, bounds ``[0, maxsize]``).  When `range` is
    ``None`` a :class:`.PrivacyLeakWarning` is issued, because the bin range is then taken from the data.
    See :obj:`numpy.histogram` for further usage notes on the shared parameters.

    Parameters
    ----------
    sample : array_like
        Input data, passed to :obj:`numpy.histogram`.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or sequence of scalars or str, default: 10
        Bin specification, forwarded unchanged to :obj:`numpy.histogram` (number of equal-width bins, explicit
        monotonically increasing bin edges, or the name of a bin-width estimation method).

    range : (float, float), optional
        The lower and upper range of the bins, forwarded unchanged to :obj:`numpy.histogram`.  It should be chosen
        independently of the data (i.e., using domain knowledge); leaving it as ``None`` triggers a
        :class:`.PrivacyLeakWarning`.

    weights : array_like, optional
        Per-sample weights, forwarded unchanged to :obj:`numpy.histogram`.  Note that every bin value is
        converted with ``int()`` before noise is added.

    density : bool, optional
        If falsy, the noisy bin values are returned as they are.  If truthy, the noisy values are divided by the
        bin widths and by their own sum (or by 1 if that sum is zero).  The normalisation is applied after the
        noise has been added; :obj:`numpy.histogram` itself is always called with ``density=None``.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    hist : array
        The noisy values of the histogram.  See `density` and `weights` for a description of the possible
        semantics.
    bin_edges : array
        The bin edges as returned by :obj:`numpy.histogram` (``length(hist)+1`` values).

    See Also
    --------
    histogramdd, histogram2d

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if range is None:
        warnings.warn(
            "Range parameter has not been specified. Falling back to taking range from the data.\n"
            "To ensure differential privacy, and no additional privacy leakage, the range must be "
            "specified independently of the data (i.e., using domain knowledge).",
            PrivacyLeakWarning)

    # Always ask numpy for raw bin values; any density normalisation happens after the noise is added
    true_hist, bin_edges = np.histogram(sample, bins=bins, range=range, weights=weights, density=None)

    mechanism = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    noisy_hist = np.zeros_like(true_hist)
    for position, bin_value in enumerate(true_hist):
        noisy_hist[position] = mechanism.randomise(int(bin_value))

    accountant.spend(epsilon, 0)

    if not density:
        return noisy_hist, bin_edges

    bin_widths = np.array(np.diff(bin_edges), float)
    noisy_total = noisy_hist.sum()
    # Guard against an all-zero noisy histogram, which would otherwise divide by zero
    normaliser = noisy_total if noisy_total else 1

    return noisy_hist / bin_widths / normaliser, bin_edges
Beispiel #4
0
def histogramdd(sample,
                epsilon=1.0,
                bins=10,
                range=None,
                weights=None,
                density=None,
                accountant=None,
                **unused_args):
    r"""
    Compute the differentially private multidimensional histogram of some data.

    The true histogram is obtained from :obj:`numpy.histogramdd`; every bin value is then converted to an integer
    and randomised with :class:`.GeometricTruncated` (sensitivity 1, bounds ``[0, maxsize]``).  If bin edges are
    not given explicitly and `range` is missing (or has missing entries), a :class:`.PrivacyLeakWarning` is issued.
    Users are referred to :obj:`numpy.histogramdd` for more usage notes.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed, forwarded unchanged to :obj:`numpy.histogramdd`.

        Note the unusual interpretation of sample when an array_like:

        * When an array, each row is a coordinate in a D-dimensional space - such as
          ``histogramdd(np.array([p1, p2, p3]))``.
        * When an array_like, each element is the list of values for single coordinate - such as
          ``histogramdd((X, Y, Z))``.

        The first form should be preferred.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : sequence or int, default: 10
        The bin specification:

        * A sequence of arrays describing the monotonically increasing bin edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges
        are not given explicitly in `bins`.
        An entry of None in the sequence results in the minimum and maximum values being used for the corresponding
        dimension, which leaks information about the data and therefore triggers a :class:`.PrivacyLeakWarning`.
        The default, None, is equivalent to passing a tuple of D None values.

    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`, forwarded unchanged to
        :obj:`numpy.histogramdd`.  Note that every bin value is converted with ``int()`` before noise is added.

    density : bool, optional
        If falsy, the default, the noisy bin values are returned as they are.  If truthy, the noisy values are
        divided by the bin volume and by the sum of the noisy values (the latter only if that sum is positive).  The
        normalisation is applied after the noise has been added.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray
        The noisy multidimensional histogram of `sample`, as floats.  See `density` and `weights` for the different
        possible semantics.
    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D differentially private histogram
    histogram2d: 2-D differentially private histogram

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Range only required if bin edges not specified
    if np.array(bins, dtype=object).ndim == 0 or not np.all(
            [np.ndim(_bin) for _bin in bins]):
        # Identity check (rather than `None in range`) so that tuples are covered as well as lists, and so that
        # array-valued entries are never compared against None with `==`.
        range_incomplete = range is None or (
            isinstance(range, (list, tuple)) and any(entry is None for entry in range))

        if range_incomplete:
            warnings.warn(
                "Range parameter has not been specified (or has missing elements). Falling back to taking "
                "range from the data.\n "
                "To ensure differential privacy, and no additional privacy leakage, the range must be "
                "specified for each dimension independently of the data (i.e., using domain knowledge).",
                PrivacyLeakWarning)

    # `normed` is deliberately not passed: the keyword no longer exists in recent NumPy releases, and its old
    # default (None) is what was being passed anyway.  Density normalisation is applied below, after the noise.
    hist, bin_edges = np.histogramdd(sample,
                                     bins=bins,
                                     range=range,
                                     weights=weights,
                                     density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon,
                                 sensitivity=1,
                                 lower=0,
                                 upper=maxsize)

    dp_hist = np.zeros_like(hist)
    iterator = np.nditer(hist, flags=['multi_index'])

    # Randomise every cell of the N-dimensional histogram independently
    while not iterator.finished:
        dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0]))
        iterator.iternext()

    dp_hist = dp_hist.astype(float, casting='safe')

    if density:
        # calculate the probability density function
        dims = len(dp_hist.shape)
        # Total is taken before the per-axis division so that it still reflects the noisy counts
        dp_hist_sum = dp_hist.sum()
        for i in np.arange(dims):
            # Broadcast the bin widths of axis `i` along that axis only
            shape = np.ones(dims, int)
            shape[i] = dp_hist.shape[i]
            # noinspection PyUnresolvedReferences
            dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape)

        if dp_hist_sum > 0:
            dp_hist /= dp_hist_sum

    accountant.spend(epsilon, 0)

    return dp_hist, bin_edges
Beispiel #5
0
    def setup_method(self, method):
        # Tests whose name ends in "prob" get a fixed global seed (presumably to keep them reproducible)
        is_probabilistic = method.__name__.endswith("prob")
        if is_probabilistic:
            global_seed(314159)

        # Every test starts from a fresh, unconfigured mechanism
        self.mech = GeometricTruncated()
Beispiel #6
0
class TestGeometricTruncated(TestCase):
    """Tests for parameter validation and sampling behaviour of ``GeometricTruncated``."""

    def setup_method(self, method):
        # Tests whose name ends in "prob" get a fixed global seed (presumably to keep them reproducible)
        is_probabilistic = method.__name__.endswith("prob")
        if is_probabilistic:
            global_seed(314159)

        # Every test starts from a fresh, unconfigured mechanism
        self.mech = GeometricTruncated()

    def teardown_method(self, method):
        del self.mech

    def test_not_none(self):
        self.assertIsNotNone(self.mech)

    def test_class(self):
        from diffprivlib.mechanisms import DPMechanism
        is_mechanism = issubclass(GeometricTruncated, DPMechanism)
        self.assertTrue(is_mechanism)

    # --- randomise() must refuse to run until epsilon, sensitivity and bounds are all set ---

    def test_no_params(self):
        self.assertRaises(ValueError, self.mech.randomise, 1)

    def test_no_sensitivity(self):
        self.mech.set_epsilon(1).set_bounds(0, 10)
        self.assertRaises(ValueError, self.mech.randomise, 1)

    def test_non_integer_sensitivity(self):
        self.mech.set_epsilon(1).set_bounds(0, 10)
        self.assertRaises(TypeError, self.mech.set_sensitivity, 0.5)

    def test_no_epsilon(self):
        self.mech.set_sensitivity(1).set_bounds(0, 10)
        self.assertRaises(ValueError, self.mech.randomise, 1)

    # --- validation of the privacy parameters ---

    def test_non_zero_delta(self):
        self.mech.set_sensitivity(1).set_bounds(0, 10)
        self.assertRaises(ValueError, self.mech.set_epsilon_delta, 1, 0.5)

    def test_neg_epsilon(self):
        self.mech.set_sensitivity(1).set_bounds(0, 10)
        self.assertRaises(ValueError, self.mech.set_epsilon, -1)

    def test_inf_epsilon(self):
        self.mech.set_sensitivity(1).set_epsilon(float("inf")).set_bounds(0, 10)

        # With infinite epsilon the input is expected back unchanged on every draw
        for _ in range(1000):
            self.assertEqual(self.mech.randomise(1), 1)

    def test_complex_epsilon(self):
        self.assertRaises(TypeError, self.mech.set_epsilon, 1+2j)

    def test_string_epsilon(self):
        self.assertRaises(TypeError, self.mech.set_epsilon, "Two")

    # --- validation of the bounds ---

    def test_no_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        self.assertRaises(ValueError, self.mech.randomise, 1)

    def test_non_integer_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        self.assertRaises(TypeError, self.mech.set_bounds, 1, 2.2)

    # --- validation of the value being randomised ---

    def test_non_numeric(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 10)
        self.assertRaises(TypeError, self.mech.randomise, "Hello")

    def test_non_integer(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 10)
        self.assertRaises(TypeError, self.mech.randomise, 1.0)

    # --- statistical checks (seeded through setup_method) ---

    def test_zero_median_prob(self):
        self.mech.set_sensitivity(1).set_bounds(0, 4).set_epsilon(1)

        samples = [self.mech.randomise(2) for _ in range(10000)]

        # The median of the noisy outputs should sit at the true input value
        sample_median = float(np.median(samples))
        self.assertAlmostEqual(np.abs(sample_median), 2.0, delta=0.1)

    def test_neighbors_prob(self):
        epsilon = 1
        runs = 10000
        self.mech.set_sensitivity(1).set_epsilon(epsilon).set_bounds(0, 4)

        hits_from_one = 0
        hits_from_two = 0

        # Draws for the two neighbouring inputs are interleaved, one of each per run
        for _ in range(runs):
            if self.mech.randomise(1) <= 1:
                hits_from_one += 1

            if self.mech.randomise(2) <= 1:
                hits_from_two += 1

        self.assertGreater(hits_from_one, hits_from_two)
        # Empirical frequencies must respect the e^epsilon ratio, with 0.1 of slack
        self.assertLessEqual(hits_from_one / runs, np.exp(epsilon) * hits_from_two / runs + 0.1)