Beispiel #1
0
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=False, nan=False):
    """Compute a noisy sum of `array` after clipping its values to `bounds`.

    Parameters
    ----------
    array : array_like
        Values to sum.  On the scalar path the array is flattened and clipped to `bounds`.

    epsilon : float, default: 1.0
        Privacy parameter given to the mechanism and charged to the accountant.

    bounds : tuple, optional
        ``(lower, upper)`` bounds of the data.  When omitted, they are read from `array` itself and a
        ``PrivacyLeakWarning`` is issued.

    accountant : BudgetAccountant, optional
        Resolved through ``BudgetAccountant.load_default``; checked before the computation and charged after it.

    axis, keepdims
        If `axis` is not None or `keepdims` is truthy, the whole call is delegated to ``_wrap_axis``.

    dtype : type, optional
        Forwarded to ``check_bounds`` and to the NumPy sum.  A subclass of ``Integral`` selects
        ``GeometricTruncated``; anything else (including None) selects ``LaplaceTruncated``.

    nan : bool, default: False
        Use ``np.nansum`` rather than ``np.sum``.

    Returns
    -------
    The output of the mechanism's ``randomise`` (or of ``_wrap_axis`` on the delegated path).

    """
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    # Anything other than a plain scalar reduction is handled by the axis wrapper.
    if axis is not None or keepdims:
        return _wrap_axis(_sum, array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis, dtype=dtype,
                          keepdims=keepdims, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    # Work on a flat, clipped copy of the data
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        true_sum = np.nansum(flat, axis=axis, dtype=dtype, keepdims=keepdims)
    else:
        true_sum = np.sum(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    wants_integers = dtype is not None and issubclass(dtype, Integral)
    mech_class = GeometricTruncated if wants_integers else LaplaceTruncated

    # The sum of `flat.size` values in [lower, upper] lies in [lower * size, upper * size].
    mech = mech_class(epsilon=epsilon, sensitivity=upper - lower, lower=lower * flat.size, upper=upper * flat.size)
    noisy_sum = mech.randomise(true_sum)

    budget.spend(epsilon, 0)

    return noisy_sum
    def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=None, bounds=None, copy=True,
                 whiten=False, random_state=None, accountant=None, **unused_args):
        """Initialise the instance.

        `n_components`, `copy`, `whiten` and `random_state` are forwarded to the parent constructor, together with
        the fixed settings ``svd_solver='full'``, ``tol=0.0`` and ``iterated_power='auto'``.  `centered`, `epsilon`,
        `data_norm` and `bounds` are stored unchanged as attributes of the same name.

        `accountant` is resolved through ``BudgetAccountant.load_default`` and the result is stored as
        ``self.accountant``.  Any extra keyword arguments are passed to ``warn_unused_args``.
        """
        # Parent settings that callers of this constructor cannot override.
        pinned = {'svd_solver': 'full', 'tol': 0.0, 'iterated_power': 'auto'}
        super().__init__(n_components=n_components, copy=copy, whiten=whiten, random_state=random_state, **pinned)

        self.centered = centered
        self.epsilon = epsilon
        self.data_norm = data_norm
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)
Beispiel #3
0
    def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
                 warm_start=False, n_jobs=None, accountant=None, **unused_args):
        """Initialise the instance.

        `tol`, `C`, `fit_intercept`, `max_iter`, `verbose`, `warm_start` and `n_jobs` are forwarded to the parent
        constructor.  The remaining parent options are fixed here: ``penalty='l2'``, ``dual=False``,
        ``intercept_scaling=1.0``, ``class_weight=None``, ``random_state=None``, ``solver='lbfgs'`` and
        ``multi_class='ovr'``.

        `epsilon` and `data_norm` are stored unchanged, ``classes_`` is initialised to None, and `accountant` is
        resolved through ``BudgetAccountant.load_default``.  Any extra keyword arguments are passed to
        ``warn_unused_args``.
        """
        # Parent settings that callers of this constructor cannot override.
        pinned = {
            'penalty': 'l2',
            'dual': False,
            'intercept_scaling': 1.0,
            'class_weight': None,
            'random_state': None,
            'solver': 'lbfgs',
            'multi_class': 'ovr',
        }
        super().__init__(tol=tol, C=C, fit_intercept=fit_intercept, max_iter=max_iter, verbose=verbose,
                         warm_start=warm_start, n_jobs=n_jobs, **pinned)

        self.epsilon = epsilon
        self.data_norm = data_norm
        self.classes_ = None
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)
Beispiel #4
0
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Compute a noisy variance of `array` after clipping its values to `bounds`.

    Parameters
    ----------
    array : array_like
        Values whose variance is sought.  On the scalar path the array is flattened and clipped to `bounds`.

    epsilon : float, default: 1.0
        Privacy parameter given to the mechanism and charged to the accountant.

    bounds : tuple, optional
        ``(lower, upper)`` bounds of the data.  When omitted, they are read from `array` itself and a
        ``PrivacyLeakWarning`` is issued.

    axis, keepdims
        If `axis` is not None or `keepdims` is truthy, the whole call is delegated to ``_wrap_axis``.

    dtype : type, optional
        Forwarded to ``check_bounds`` and to the NumPy variance.

    accountant : BudgetAccountant, optional
        Resolved through ``BudgetAccountant.load_default``; checked before the computation and charged after it.

    nan : bool, default: False
        Use ``np.nanvar`` rather than ``np.var``.

    Returns
    -------
    The output of the mechanism's ``randomise`` (or of ``_wrap_axis`` on the delegated path).

    """
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    # Anything other than a plain scalar reduction is handled by the axis wrapper.
    if axis is not None or keepdims:
        return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    # Work on a flat, clipped copy of the data
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        true_var = np.nanvar(flat, axis=axis, dtype=dtype, keepdims=keepdims)
    else:
        true_var = np.var(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    # The mechanism's output domain is [0, (upper - lower)^2 / 4].
    mech = LaplaceBoundedDomain(epsilon=epsilon, delta=0,
                                sensitivity=((upper - lower) / flat.size) ** 2 * (flat.size - 1), lower=0,
                                upper=((upper - lower) ** 2) / 4)
    noisy_var = mech.randomise(true_var)

    budget.spend(epsilon, 0)

    return noisy_var
    def __init__(self, n_estimators=10, *, epsilon=1.0, cat_feature_threshold=10, n_jobs=1, verbose=0,
                 accountant=None, max_depth=15, random_state=None, feature_domains=None, **unused_args):
        """Initialise the instance.

        The parent constructor receives a fresh ``DecisionTreeClassifier()`` as `base_estimator`, the names of the
        parameters propagated to each sub-estimator (`estimator_params`), and the caller's `n_estimators`, `n_jobs`,
        `random_state` and `verbose`.

        `epsilon`, `cat_feature_threshold`, `max_depth` and `feature_domains` are stored unchanged, and `accountant`
        is resolved through ``BudgetAccountant.load_default``.  Any extra keyword arguments are passed to
        ``self._warn_unused_args``.

        If `random_state` is not None, NumPy's global random generator is seeded with it.
        """
        propagated_params = ("cat_feature_threshold", "max_depth", "epsilon", "random_state")
        super().__init__(base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators,
                         estimator_params=propagated_params, n_jobs=n_jobs, random_state=random_state,
                         verbose=verbose)

        self.epsilon = epsilon
        self.cat_feature_threshold = cat_feature_threshold
        self.max_depth = max_depth
        self.accountant = BudgetAccountant.load_default(accountant)
        self.feature_domains = feature_domains

        # NOTE(review): this seeds the process-wide NumPy RNG as a side effect of construction.
        if random_state is not None:
            np.random.seed(random_state)

        self._warn_unused_args(unused_args)
Beispiel #6
0
    def __init__(self, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
        """Initialise the instance.

        `priors` and `var_smoothing` are forwarded to the parent constructor.  `epsilon` and `bounds` are stored
        unchanged, and `accountant` is resolved through ``BudgetAccountant.load_default`` before being stored as
        ``self.accountant``.
        """
        super().__init__(priors=priors, var_smoothing=var_smoothing)

        # Privacy-related settings kept on the instance.
        self.epsilon = epsilon
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)
 def __init__(self, epsilon=1.0, bounds=None, copy=True, with_mean=True, with_std=True, accountant=None):
     """Initialise the instance.

     `copy`, `with_mean` and `with_std` are forwarded to the parent constructor.  `epsilon` and `bounds` are stored
     unchanged, and `accountant` is resolved through ``BudgetAccountant.load_default`` before being stored as
     ``self.accountant``.
     """
     super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)

     # Privacy-related settings kept on the instance.
     self.epsilon = epsilon
     self.bounds = bounds
     self.accountant = BudgetAccountant.load_default(accountant)
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue, nan=False):
    """Compute a noisy sum of `array` along `axis`, adding noise to each output entry separately.

    Parameters
    ----------
    array : array_like
        Values to sum.  They are clipped to the checked bounds before summing.

    epsilon : float, default: 1.0
        Privacy parameter set on every mechanism and charged once to the accountant.

    bounds : tuple, optional
        ``(lower, upper)`` bounds of the data.  When omitted, they are read from `array` itself and a
        ``PrivacyLeakWarning`` is issued.

    accountant : BudgetAccountant, optional
        Resolved through ``BudgetAccountant.load_default``; checked on entry and charged before returning.

    axis, keepdims
        Forwarded to the NumPy reductions.

    dtype : type, optional
        Forwarded to the NumPy sum; ``check_bounds`` receives ``dtype or float``.  A subclass of ``Integral``
        selects ``GeometricTruncated``; anything else (including None) selects ``LaplaceTruncated``.

    nan : bool, default: False
        Use ``np.nansum`` rather than ``np.sum``.

    Returns
    -------
    An array shaped like the NumPy sum when that sum is an ``ndarray``; otherwise the single value returned by the
    mechanism's ``randomise``.

    """
    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    reducer = np.nansum if nan else np.sum

    # Reduce dummy arrays to learn the output layout and how many inputs feed each output entry.
    output_form = reducer(np.zeros_like(array), axis=axis, keepdims=keepdims)
    out_ndim = np.ndim(output_form)
    vector_out = (out_ndim == 1)
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if out_ndim <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    n_bounds = output_form.shape[0] if vector_out else 1
    lower, upper = check_bounds(bounds, n_bounds, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_sum = reducer(array, axis=axis, dtype=dtype, keepdims=keepdims)

    mech_class = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated

    def _make_mech(low, high):
        # Sensitivity is the width of the bounds; the output range is that of a sum of `n_datapoints` values.
        return mech_class().set_epsilon(epsilon).set_sensitivity(high - low).\
            set_bounds(low * n_datapoints, high * n_datapoints)

    if isinstance(actual_sum, np.ndarray):
        noisy = np.zeros_like(actual_sum, dtype=dtype)
        cells = np.nditer(actual_sum, flags=['multi_index'])

        for _ in cells:
            idx = cells.multi_index
            if vector_out:
                low, high = lower[idx], upper[idx]
            else:
                low, high = lower[0], upper[0]

            noisy[idx] = _make_mech(low, high).randomise(actual_sum[idx])

        budget.spend(epsilon, 0)

        return noisy

    mech = _make_mech(lower[0], upper[0])

    budget.spend(epsilon, 0)

    return mech.randomise(actual_sum)
    def test_load_wrong_type(self):
        # An int, a list and a string must each be rejected by load_default with a TypeError.
        for bad_accountant in (0, [1, 2, 3], "BudgetAccountant"):
            with self.assertRaises(TypeError):
                BudgetAccountant.load_default(bad_accountant)
Beispiel #10
0
    def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
        """Initialise the instance.

        `n_clusters` is forwarded to the parent constructor.  `epsilon` and `bounds` are stored unchanged, and
        `accountant` is resolved through ``BudgetAccountant.load_default``.  Any extra keyword arguments are passed
        to ``warn_unused_args``.

        The attributes ``cluster_centers_``, ``bounds_processed``, ``labels_``, ``inertia_`` and ``n_iter_`` are
        initialised to None, and ``_n_threads`` is set to 1.
        """
        super().__init__(n_clusters=n_clusters)

        # Privacy-related settings kept on the instance.
        self.epsilon = epsilon
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)

        # Placeholders, all empty at construction time.
        for attr_name in ("cluster_centers_", "bounds_processed", "labels_", "inertia_", "n_iter_"):
            setattr(self, attr_name, None)
        self._n_threads = 1
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    """Compute a noisy variance of `array` along `axis`, adding noise to each output entry separately.

    Parameters
    ----------
    array : array_like
        Values whose variance is sought.  They are clipped to the checked bounds first.

    epsilon : float, default: 1.0
        Privacy parameter set on every mechanism and charged once to the accountant.

    bounds : tuple, optional
        ``(lower, upper)`` bounds of the data.  When omitted, they are read from `array` itself and a
        ``PrivacyLeakWarning`` is issued.

    axis, keepdims
        Forwarded to the NumPy reductions.

    dtype : type, optional
        Forwarded to the NumPy variance; ``check_bounds`` receives ``dtype or float``.

    accountant : BudgetAccountant, optional
        Resolved through ``BudgetAccountant.load_default``; checked on entry and charged before returning.

    nan : bool, default: False
        Use ``np.nanvar`` rather than ``np.var``.

    Returns
    -------
    An array shaped like the NumPy variance when that variance is an ``ndarray``; otherwise a single value.  Every
    noisy value is capped at the squared width of the applicable bounds.

    """
    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    reducer = np.nanvar if nan else np.var

    # Reduce dummy arrays to learn the output layout and how many inputs feed each output entry.
    output_form = reducer(np.zeros_like(array), axis=axis, keepdims=keepdims)
    out_ndim = np.ndim(output_form)
    vector_out = (out_ndim == 1)
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if out_ndim <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    n_bounds = output_form.shape[0] if vector_out else 1
    lower, upper = check_bounds(bounds, n_bounds, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_var = reducer(array, axis=axis, dtype=dtype, keepdims=keepdims)

    def _make_mech(diam):
        # Output domain is [0, inf); the result is capped by the caller afterwards.
        return LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
            .set_sensitivity((diam / n_datapoints) ** 2 * (n_datapoints - 1))

    if isinstance(actual_var, np.ndarray):
        noisy = np.zeros_like(actual_var)
        cells = np.nditer(actual_var, flags=['multi_index'])

        for _ in cells:
            idx = cells.multi_index
            if vector_out:
                diam = upper[idx] - lower[idx]
            else:
                diam = upper[0] - lower[0]

            noisy[idx] = np.minimum(_make_mech(diam).randomise(actual_var[idx]), diam ** 2)

        budget.spend(epsilon, 0)

        return noisy

    diam = upper[0] - lower[0]
    mech = _make_mech(diam)

    budget.spend(epsilon, 0)

    return np.minimum(mech.randomise(actual_var), diam ** 2)
    def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True,
                 accountant=None, **unused_args):
        """Initialise the instance.  All arguments are keyword-only.

        `fit_intercept` and `copy_X` are forwarded to the parent constructor, which is also given the fixed setting
        ``n_jobs=None``.  `epsilon`, `bounds_X` and `bounds_y` are stored unchanged, and `accountant` is resolved
        through ``BudgetAccountant.load_default``.  Any extra keyword arguments are passed to
        ``self._warn_unused_args``.
        """
        super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None)

        # Privacy-related settings kept on the instance.
        self.epsilon = epsilon
        self.bounds_X = bounds_X
        self.bounds_y = bounds_y
        self.accountant = BudgetAccountant.load_default(accountant)

        self._warn_unused_args(unused_args)
Beispiel #13
0
def histogram(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None, **unused_args):
    r"""
    Compute the differentially private histogram of a set of data.

    The counts come from :obj:`numpy.histogram`; each count is then randomised with :class:`.GeometricTruncated`
    (sensitivity 1, lower bound 0).  A :class:`.PrivacyLeakWarning` is issued when `range` is not given.  See
    :obj:`numpy.histogram` for further usage notes.

    Parameters
    ----------
    sample : array_like
        Input data.  The histogram is computed over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or sequence of scalars or str, default: 10
        An int gives the number of equal-width bins in the given range.  A sequence gives the monotonically
        increasing bin edges, including the rightmost edge, which allows non-uniform bin widths.  A string names the
        method used to calculate the optimal bin width, as defined by `histogram_bin_edges`.

    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, the range is ``(sample.min(), sample.max())`` and a
        warning is issued.  Values outside the range are ignored.  The first element must be less than or equal to
        the second.  `range` also affects automatic bin computation: the bin count fills the entire range, including
        portions containing no data.

    weights : array_like, optional
        An array of weights, of the same shape as `sample`, forwarded to :obj:`numpy.histogram`.  Each value in
        `sample` contributes its associated weight towards the bin count (instead of 1).

    density : bool, optional
        If falsy, the noisy per-bin counts are returned.  If truthy, the noisy counts are divided by the bin widths
        and by their total (when that total is non-zero), giving a probability *density* whose integral over the
        range is 1.  It is not a probability *mass* function.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    hist : array
        The values of the histogram.  See `density` and `weights` for a description of the possible semantics.
    bin_edges : array of dtype float
        The bin edges ``(length(hist)+1)``.

    See Also
    --------
    histogramdd, histogram2d

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if `bins` is::

      [1, 2, 3, 4]

    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``.  The last bin, however,
    is ``[3, 4]``, which *includes* 4.

    """
    warn_unused_args(unused_args)

    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    if range is None:
        warnings.warn(
            "Range parameter has not been specified. Falling back to taking range from the data.\n"
            "To ensure differential privacy, and no additional privacy leakage, the range must be "
            "specified independently of the data (i.e., using domain knowledge).",
            PrivacyLeakWarning)

    # Normalisation is applied to the noisy counts below, so NumPy is always asked for raw counts.
    hist, bin_edges = np.histogram(sample, bins=bins, range=range, weights=weights, density=None)

    mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    dp_hist = np.zeros_like(hist)
    for position, count in enumerate(hist):
        dp_hist[position] = mech.randomise(int(count))

    budget.spend(epsilon, 0)

    if not density:
        return dp_hist, bin_edges

    bin_sizes = np.array(np.diff(bin_edges), float)
    total = dp_hist.sum()
    # Guard against dividing by zero when every noisy count is zero.
    normaliser = total if total else 1

    return dp_hist / bin_sizes / normaliser, bin_edges
Beispiel #14
0
def histogramdd(sample,
                epsilon=1.0,
                bins=10,
                range=None,
                weights=None,
                density=None,
                accountant=None,
                **unused_args):
    r"""
    Compute the differentially private multidimensional histogram of some data.

    The histogram is computed using :obj:`numpy.histogramdd`, and noise added using :class:`.GeometricTruncated` to
    satisfy differential privacy.  If bin edges are not given explicitly and the `range` parameter is missing (or is
    a list with missing entries), a :class:`.PrivacyLeakWarning` is issued.  Users are referred to
    :obj:`numpy.histogramdd` for more usage notes.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed.

        Note the unusual interpretation of sample when an array_like:

        * When an array, each row is a coordinate in a D-dimensional space - such as
          ``histogramdd(np.array([p1, p2, p3]))``.
        * When an array_like, each element is the list of values for single coordinate - such as
          ``histogramdd((X, Y, Z))``.

        The first form should be preferred.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : sequence or int, default: 10
        The bin specification:

        * A sequence of arrays describing the monotonically increasing bin edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges
        are not given explicitly in `bins`.
        An entry of None in the sequence results in the minimum and maximum values being used for the corresponding
        dimension.
        The default, None, is equivalent to passing a tuple of D None values.

    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`, forwarded to :obj:`numpy.histogramdd`.

    density : bool, optional
        If falsy, the default, returns the noisy number of samples in each bin.  If truthy, the noisy counts are
        divided by the bin widths along every dimension and by the total noisy count (when that total is positive).

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of `sample`, as floats.  See `density` and `weights` for the different
        possible semantics.
    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D differentially private histogram
    histogram2d: 2-D differentially private histogram

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Range only required if bin edges not specified
    if np.array(bins, dtype=object).ndim == 0 or not np.all(
        [np.ndim(_bin) for _bin in bins]):
        if range is None or (isinstance(range, list) and None in range):
            warnings.warn(
                "Range parameter has not been specified (or has missing elements). Falling back to taking "
                "range from the data.\n "
                "To ensure differential privacy, and no additional privacy leakage, the range must be "
                "specified for each dimension independently of the data (i.e., using domain knowledge).",
                PrivacyLeakWarning)

    # The deprecated `normed` keyword is deliberately not passed: NumPy 1.24 removed it, so supplying it (even as
    # None) raises TypeError there.  On older NumPy it defaulted to None, so omitting it changes nothing.
    # Normalisation is applied to the noisy counts below, hence density=None here.
    hist, bin_edges = np.histogramdd(sample,
                                     bins=bins,
                                     range=range,
                                     weights=weights,
                                     density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon,
                                 sensitivity=1,
                                 lower=0,
                                 upper=maxsize)

    dp_hist = np.zeros_like(hist)
    iterator = np.nditer(hist, flags=['multi_index'])

    # Randomise every bin count independently with the same mechanism.
    while not iterator.finished:
        dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0]))
        iterator.iternext()

    dp_hist = dp_hist.astype(float, casting='safe')

    if density:
        # calculate the probability density function
        dims = len(dp_hist.shape)
        # Total taken before dividing by bin widths, so it is the total noisy count.
        dp_hist_sum = dp_hist.sum()
        for i in np.arange(dims):
            # Reshape the widths of dimension i so they broadcast along that axis only.
            shape = np.ones(dims, int)
            shape[i] = dp_hist.shape[i]
            # noinspection PyUnresolvedReferences
            dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape)

        if dp_hist_sum > 0:
            dp_hist /= dp_hist_sum

    accountant.spend(epsilon, 0)

    return dp_hist, bin_edges
def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  With the default `axis` and `keepdims`, the quantile
    is calculated over the flattened array.  Differential privacy is achieved with the :class:`.Exponential`
    mechanism, using the method first proposed by Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        it is flattened and one result is returned per quantile.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  When several quantiles are requested, epsilon is split evenly between
        them.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).  When omitted, they are read from `array` itself
        and a :class:`.PrivacyLeakWarning` is issued.

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  The default, axis=None, computes the quantile over the
        flattened array.  Any other value delegates the computation to ``_wrap_axis``.

    keepdims : bool, default: False
        A truthy value delegates the computation to ``_wrap_axis``, to which it is forwarded.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        The quantile value(s).  ``np.nan`` is returned, without spending any budget, when the clipped data contains
        NaN.

    Raises
    ------
    ValueError
        If any requested quantile lies outside [0, 1].

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    n_quants = len(quant)
    if n_quants > 1:
        # One recursive call per quantile, each with an equal share of epsilon.
        results = []
        for q_i in quant:
            results.append(quantile(array, q_i, epsilon=epsilon / n_quants, bounds=bounds, axis=axis,
                                    keepdims=keepdims, accountant=accountant))
        return np.array(results)

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis,
                          keepdims=keepdims, accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    budget = BudgetAccountant.load_default(accountant)
    budget.check(epsilon, 0)

    # Flatten and clip the data, then add both bounds as the outermost points of the sorted array.
    clipped = clip_to_bounds(np.ravel(array), bounds)
    k = clipped.size
    points = np.sort(np.append(clipped, list(bounds)))

    interval_sizes = np.diff(points)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    # Each of the k + 1 intervals is scored by how far its rank is from the target rank quant * k.
    ranks = np.arange(0, k + 1)
    mech = Exponential(epsilon=epsilon,
                       sensitivity=1,
                       utility=list(-np.abs(ranks - quant * k)),
                       measure=list(interval_sizes))
    chosen = mech.randomise()

    # Draw a point uniformly from within the selected interval.
    left, right = points[chosen], points[chosen + 1]
    output = mech._rng.random() * (right - left) + left

    budget.spend(epsilon, 0)

    return output
    def sample_model2(epsilon=1.0, accountant=None):
        """Resolve the accountant, then check and spend ``(epsilon, 0.0)`` on it."""
        budget = BudgetAccountant.load_default(accountant)

        budget.check(epsilon, 0.0)
        budget.spend(epsilon, 0.0)