def test_error_tree_param(self):
  """TreeParam construction raises ValueError for invalid node counts."""
  # 0 and -2 are non-positive; 20 exceeds min_num_points_in_branching_node.
  for bad_min_num_points_in_node in (0, -2, 20):
    with self.assertRaises(ValueError):
      clustering_params.TreeParam(
          min_num_points_in_branching_node=4,
          min_num_points_in_node=bad_min_num_points_in_node,
          max_depth=5)
def get_test_clustering_param(epsilon=1.0,
                              delta=1e-2,
                              frac_sum=0.2,
                              frac_group_count=0.8,
                              min_num_points_in_branching_node=4,
                              min_num_points_in_node=2,
                              max_depth=4,
                              radius=1):
  # pylint: disable=g-doc-args
  """Returns clustering_param with defaults for params not needed for testing.

  Usage: Explicitly pass in parameters that are relied on in the test.
  """
  # Build the nested parameter objects inline; each argument of this helper
  # maps directly onto one field of the corresponding params dataclass.
  return clustering_params.ClusteringParam(
      privacy_param=clustering_params.DifferentialPrivacyParam(
          epsilon=epsilon, delta=delta),
      privacy_budget_split=clustering_params.PrivacyBudgetSplit(
          frac_sum=frac_sum, frac_group_count=frac_group_count),
      tree_param=clustering_params.TreeParam(
          min_num_points_in_branching_node=min_num_points_in_branching_node,
          min_num_points_in_node=min_num_points_in_node,
          max_depth=max_depth),
      short_description='TestClusteringParam',
      radius=radius)
def default_tree_param(
    k: int,
    data: clustering_params.Data,
    privacy_param: clustering_params.DifferentialPrivacyParam,
    privacy_budget_split: clustering_params.PrivacyBudgetSplit
) -> typing.Tuple[clustering_params.TreeParam, PrivateCount]:
  """Heuristic tree param based on the data and number of clusters.

  Args:
    k: Number of clusters to divide the data into.
    data: Data to find centers for.
    privacy_param: privacy parameters for the algorithm.
    privacy_budget_split: budget split between different computations.

  Returns:
    (default TreeParam, private count). The private count is provided so that
    it doesn't need to be re-computed.
  """
  # max_depth feeds into the private count calculation, so it must be fixed
  # up front instead of being derived from the count.
  # Chosen experimentally over multiple datasets.
  max_depth = 20

  # Standard deviation of the sum noise for sensitivity 1; zero when the
  # privacy budget is unlimited.
  if privacy_param.epsilon == np.inf:
    sum_sigma = 0
  else:
    sum_sigma = accountant.get_smallest_gaussian_noise(
        common.DifferentialPrivacyParameters(
            privacy_param.epsilon * privacy_budget_split.frac_sum,
            privacy_param.delta),
        num_queries=1,
        sensitivity=1.0)

  private_count = central_privacy_utils.get_private_count(
      data.num_points,
      central_privacy_utils.PrivateCountParam(privacy_param,
                                              privacy_budget_split, max_depth))

  # Treat the sum noise (l2-norm roughly sqrt(dimension) * sum_sigma * radius)
  # as spread over the summed points: with 10 * sqrt(dimension) * sum_sigma
  # points per node, each point carries noise of roughly 0.1 * radius.
  low_noise_node_size = int(10 * np.sqrt(data.dim) * sum_sigma)

  # Keep nodes small enough that at least one node per cluster is possible,
  # even if the noise then exceeds what we'd like.
  min_num_points_in_node = min(low_noise_node_size, private_count // (2 * k))

  # Enforce the floor of 1 last: private_count is noisy and may be negative,
  # which would otherwise leak through the min() above.
  min_num_points_in_node = max(1, min_num_points_in_node)

  min_num_points_in_branching_node = 3 * min_num_points_in_node
  return (clustering_params.TreeParam(
      min_num_points_in_branching_node=min_num_points_in_branching_node,
      min_num_points_in_node=min_num_points_in_node,
      max_depth=max_depth), private_count)
def test_tree_param(self):
  """TreeParam keeps the exact values it was constructed with."""
  param = clustering_params.TreeParam(
      min_num_points_in_branching_node=4,
      min_num_points_in_node=2,
      max_depth=5)
  self.assertEqual(param.min_num_points_in_branching_node, 4)
  self.assertEqual(param.min_num_points_in_node, 2)
  self.assertEqual(param.max_depth, 5)
def test_clustering_param(self):
  """ClusteringParam stores each component it was constructed with."""
  privacy = clustering_params.DifferentialPrivacyParam()
  budget_split = clustering_params.PrivacyBudgetSplit()
  tree = clustering_params.TreeParam(
      min_num_points_in_branching_node=4,
      min_num_points_in_node=2,
      max_depth=5)

  param = clustering_params.ClusteringParam(
      privacy_param=privacy,
      privacy_budget_split=budget_split,
      tree_param=tree,
      short_description="TestClusteringParam",
      radius=20)

  self.assertEqual(param.privacy_param, privacy)
  self.assertEqual(param.privacy_budget_split, budget_split)
  self.assertEqual(param.tree_param, tree)
  self.assertEqual(param.short_description, "TestClusteringParam")
  self.assertEqual(param.radius, 20)
def test_clipped_data_used_for_clustering_and_not_result_calculation(self):
  """Clipping affects center computation but not the reported loss."""
  # With radius=1 the points clip to [[0.3, 0.2], [0.6, 0.8], [0.6, 0.8]].
  datapoints = np.array([[0.3, 0.2], [3, 4], [6, 8]])
  # The tiny radius forces clipping during center calculation.
  data = clustering_params.Data(datapoints=datapoints, radius=1)
  # Infinite epsilon: no noise anywhere, so the result is deterministic.
  privacy_param = clustering_params.DifferentialPrivacyParam(np.inf)
  # max_depth=0 disables branching; the coreset is just the point average.
  tree_param = clustering_params.TreeParam(1, 1, 0)

  result = clustering_algorithm.private_lsh_clustering(
      3, data, privacy_param, tree_param=tree_param)

  # The single center must be the mean of the *clipped* points.
  self.assertLen(result.centers, 1)
  self.assertSequenceAlmostEqual(result.centers[0], np.array([0.5, 0.6]))
  self.assertListEqual(list(result.labels), [0, 0, 0])
  # Loss calculation should still be relative to the original points.
  self.assertAlmostEqual(result.loss, 103.02)