コード例 #1
0
def get_private_average(nonprivate_points: np.ndarray, private_count: int,
                        clustering_param: clustering_params.ClusteringParam,
                        dim: int) -> np.ndarray:
    """Returns a differentially private average of the given data points.

    The points are summed along axis 0, Gaussian noise is added to the sum
    (calibrated with clustering_param.radius as the sensitivity), and the
    result is divided by private_count. The sum uses the fraction
    privacy_budget_split.frac_sum of the total epsilon; when that budget is
    infinite no noise is added.

    Args:
      nonprivate_points: data points to be averaged, may be empty.
      private_count: differentially private count of the number of data
        points. This is provided to save privacy budget since, in our
        applications, it is often already computed elsewhere. Required to be
        >= 1.
      clustering_param: parameters of the clustering algorithm.
      dim: dimension of the data points.

    Returns:
      A differentially private average of the given data points, as an array
      of length dim.

    Raises:
      ValueError: if private_count < 1.
    """
    if private_count < 1:
        raise ValueError(
            f"get_private_average() called with private_count={private_count}")

    if np.size(nonprivate_points) == 0:
        # Summing an empty 1-D input along axis 0 collapses to a scalar, which
        # would leak out of the non-private branch below with the wrong shape.
        sum_points = np.zeros(dim)
    else:
        sum_points = np.sum(nonprivate_points, axis=0)

    epsilon_sum = (clustering_param.privacy_budget_split.frac_sum *
                   clustering_param.privacy_param.epsilon)

    if epsilon_sum == np.inf:
        # Infinite budget: the exact sum can be used as is.
        return sum_points / private_count

    gaussian_standard_deviation = accountant.get_smallest_gaussian_noise(
        common.DifferentialPrivacyParameters(
            epsilon_sum, clustering_param.privacy_param.delta),
        num_queries=1,
        sensitivity=clustering_param.radius)
    noise = np.random.normal(scale=gaussian_standard_deviation, size=dim)
    # Add out of place: an in-place += fails when the points (and hence the
    # sum) have an integer dtype, since float noise cannot be cast into it.
    return (sum_points + noise) / private_count
    def __init__(self,
                 f,
                 delta_f,
                 epsilon,
                 delta,
                 num_queries=1,
                 random_state=None):
        """Sets up a Gaussian mechanism around the query function f.

        The noise standard deviation is obtained once, at construction time,
        from accountant.get_smallest_gaussian_noise for the given privacy
        parameters, number of queries and sensitivity.

        Args:
          f: A function which takes as input a database and which returns as
            output a numpy array.
          delta_f: The sensitivity parameter, e.g., the maximum value by which
            the function can change for two databases that differ by only one
            row.
          epsilon: Differential privacy parameter.
          delta: Differential privacy parameter.
          num_queries: The number of queries for which the mechanism is used.
            Note that the constructed mechanism will be (epsilon,
            delta)-differentially private when answering (no more than)
            num_queries queries.
          random_state: Optional instance of numpy.random.RandomState that is
            used to seed the random number generator. A fresh RandomState is
            created when none is supplied.
        """
        self._func = f
        self._delta_f = delta_f

        privacy_parameters = common.DifferentialPrivacyParameters(
            epsilon, delta)
        noise_std = accountant.get_smallest_gaussian_noise(
            privacy_parameters, num_queries, sensitivity=delta_f)
        self._sigma = noise_std

        # Fall back to an unseeded generator when the caller passes nothing.
        self._random_state = random_state or np.random.RandomState()
コード例 #3
0
def default_tree_param(
    k: int, data: clustering_params.Data,
    privacy_param: clustering_params.DifferentialPrivacyParam,
    privacy_budget_split: clustering_params.PrivacyBudgetSplit
) -> typing.Tuple[clustering_params.TreeParam, PrivateCount]:
    """Builds a heuristic TreeParam from the data and the number of clusters.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for.
      privacy_param: privacy parameters for the algorithm.
      privacy_budget_split: budget split between different computations.

    Returns:
      (default TreeParam, private count). The private count is returned
      alongside the TreeParam so that callers do not have to compute it again.
    """
    # The depth feeds into the private count computation below, so it must be
    # fixed independently of the count. Value chosen experimentally over
    # multiple datasets.
    max_depth = 20

    # Standard deviation of the sum noise for a sensitivity of 1; an infinite
    # epsilon means no noise at all.
    sum_sigma = 0
    if privacy_param.epsilon != np.inf:
        sum_privacy = common.DifferentialPrivacyParameters(
            privacy_param.epsilon * privacy_budget_split.frac_sum,
            privacy_param.delta)
        sum_sigma = accountant.get_smallest_gaussian_noise(
            sum_privacy, num_queries=1, sensitivity=1.0)

    count_param = central_privacy_utils.PrivateCountParam(
        privacy_param, privacy_budget_split, max_depth)
    private_count = central_privacy_utils.get_private_count(
        data.num_points, count_param)

    # Think of the noise as being spread over the points being summed. Its
    # l2-norm is roughly sqrt(dimension) * sum_sigma * radius, so spreading it
    # over 10 * sqrt(dimension) * sum_sigma points leaves roughly 0.1 * radius
    # of noise per point.
    low_noise_threshold = int(10 * np.sqrt(data.dim) * sum_sigma)

    # Still allow for one node per cluster, even when that means accepting
    # more noise than the low-noise threshold would.
    per_cluster_threshold = private_count // (2 * k)

    # The private count may be negative, so clamping to at least 1 has to be
    # the outermost step.
    min_num_points_in_node = max(
        1, min(low_noise_threshold, per_cluster_threshold))

    tree_param = clustering_params.TreeParam(
        min_num_points_in_branching_node=3 * min_num_points_in_node,
        min_num_points_in_node=min_num_points_in_node,
        max_depth=max_depth)
    return (tree_param, private_count)
コード例 #4
0
 def test_get_smallest_gaussian_noise(self, epsilon, delta, num_queries,
                                      sensitivity, expected_std):
   # The accountant's noise standard deviation for the given privacy
   # parameters, query count and sensitivity must match the expected value.
   actual_std = accountant.get_smallest_gaussian_noise(
       common.DifferentialPrivacyParameters(epsilon, delta),
       num_queries,
       sensitivity=sensitivity)
   self.assertAlmostEqual(expected_std, actual_std)