def get_private_average(nonprivate_points: np.ndarray, private_count: int,
                        clustering_param: clustering_params.ClusteringParam,
                        dim: int) -> np.ndarray:
    """Returns a differentially private average of the given data points.

    The points are summed coordinate-wise, Gaussian noise is added to the sum
    (unless the sum's share of epsilon is infinite), and the result is divided
    by the already-private count.

    Args:
      nonprivate_points: data points to be averaged, may be empty.
      private_count: differentially private count of the number of data
        points. This is provided to save privacy budget since, in our
        applications, it is often already computed elsewhere. Required to be
        >= 1.
      clustering_param: parameters of the clustering algorithm. Its
        privacy_budget_split.frac_sum, privacy_param.epsilon,
        privacy_param.delta and radius are read here; radius is used as the
        sensitivity of the sum.
      dim: dimension of the data points.

    Returns:
      A differentially private average of the given data points, with `dim`
      entries (also when no points are given).

    Raises:
      ValueError: if private_count < 1.
    """
    if private_count < 1:
        raise ValueError(
            f"get_private_average() called with private_count={private_count}")

    points = np.asarray(nonprivate_points)
    if points.size == 0:
        # Summing an empty 1-D input (e.g. []) along axis 0 collapses to a
        # scalar, which would make the non-private path return a scalar
        # instead of a dim-length vector. Use an explicit zero vector instead.
        sum_points = np.zeros(dim)
    else:
        sum_points = np.sum(points, axis=0)

    epsilon_sum = (clustering_param.privacy_budget_split.frac_sum *
                   clustering_param.privacy_param.epsilon)
    if epsilon_sum == np.inf:
        # Infinite budget for the sum: no noise is added.
        return sum_points / private_count

    gaussian_standard_deviation = accountant.get_smallest_gaussian_noise(
        common.DifferentialPrivacyParameters(
            epsilon_sum, clustering_param.privacy_param.delta),
        num_queries=1,
        sensitivity=clustering_param.radius)
    noise = np.random.normal(scale=gaussian_standard_deviation, size=dim)
    # Add out-of-place: an in-place `+=` cannot cast the float noise into an
    # integer-typed sum and raises for integer-valued points.
    return (sum_points + noise) / private_count
def __init__(self, f, delta_f, epsilon, delta, num_queries=1, random_state=None):
    """Builds a Gaussian mechanism around the query function `f`.

    Args:
      f: A function which takes as input a database and which returns as
        output a numpy array.
      delta_f: The sensitivity parameter, e.g., the maximum value by which
        the function can change for two databases that differ by only one
        row.
      epsilon: Differential privacy parameter.
      delta: Differential privacy parameter.
      num_queries: The number of queries for which the mechanism is used. The
        constructed mechanism is (epsilon, delta)-differentially private
        when answering no more than num_queries queries.
      random_state: Optional instance of numpy.random.RandomState used to
        seed the random number generator; a fresh one is created when this
        is not supplied.
    """
    self._func = f
    self._delta_f = delta_f

    # Noise level is obtained from the accountant for the requested privacy
    # parameters, number of queries and sensitivity.
    privacy_parameters = common.DifferentialPrivacyParameters(epsilon, delta)
    self._sigma = accountant.get_smallest_gaussian_noise(
        privacy_parameters, num_queries, sensitivity=delta_f)

    if not random_state:
        random_state = np.random.RandomState()
    self._random_state = random_state
def default_tree_param(
    k: int, data: clustering_params.Data,
    privacy_param: clustering_params.DifferentialPrivacyParam,
    privacy_budget_split: clustering_params.PrivacyBudgetSplit
) -> typing.Tuple[clustering_params.TreeParam, PrivateCount]:
    """Picks a heuristic TreeParam from the data and the number of clusters.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for.
      privacy_param: privacy parameters for the algorithm.
      privacy_budget_split: budget split between different computations.

    Returns:
      (default TreeParam, private count). The private count is returned so
      that callers do not need to compute it again.
    """
    # The depth feeds into the private count calculation, so it must not
    # depend on the count. Value chosen experimentally over multiple datasets.
    depth_limit = 20

    # Standard deviation of the sum noise, computed for a sensitivity of 1.
    if privacy_param.epsilon == np.inf:
        noise_std = 0
    else:
        sum_privacy = common.DifferentialPrivacyParameters(
            privacy_param.epsilon * privacy_budget_split.frac_sum,
            privacy_param.delta)
        noise_std = accountant.get_smallest_gaussian_noise(
            sum_privacy, num_queries=1, sensitivity=1.0)

    private_count = central_privacy_utils.get_private_count(
        data.num_points,
        central_privacy_utils.PrivateCountParam(
            privacy_param, privacy_budget_split, depth_limit))

    # Think of the noise as spread over the points being summed. Its l2-norm
    # is roughly sqrt(dimension) * noise_std * radius, so spreading it over
    # 10 * sqrt(dimension) * noise_std points leaves each point with noise of
    # roughly 0.1 * radius.
    low_noise_size = int(10 * np.sqrt(data.dim) * noise_std)

    # Keep the ability to consider a node per cluster, even if that means more
    # noise than we would like.
    per_cluster_size = private_count // (2 * k)

    # The private count may be negative, so the lower bound of 1 is applied
    # last.
    leaf_size = max(1, min(low_noise_size, per_cluster_size))

    tree_param = clustering_params.TreeParam(
        min_num_points_in_branching_node=3 * leaf_size,
        min_num_points_in_node=leaf_size,
        max_depth=depth_limit)
    return (tree_param, private_count)
def test_get_smallest_gaussian_noise(self, epsilon, delta, num_queries,
                                     sensitivity, expected_std):
    """Checks get_smallest_gaussian_noise against the expected value."""
    params = common.DifferentialPrivacyParameters(epsilon, delta)
    actual_std = accountant.get_smallest_gaussian_noise(
        params, num_queries, sensitivity=sensitivity)
    self.assertAlmostEqual(expected_std, actual_std)