Exemple #1
0
 def test_error_tree_param(self):
   """Checks that TreeParam raises ValueError for bad min_num_points_in_node.

   Every case uses min_num_points_in_branching_node=4 and max_depth=5 and
   varies only min_num_points_in_node.
   """
   # 0 and -2 are non-positive; 20 is larger than the branching-node minimum
   # of 4 (presumably why it is rejected -- confirm against TreeParam).
   for invalid_min_points in (0, -2, 20):
     with self.assertRaises(ValueError):
       clustering_params.TreeParam(
           min_num_points_in_branching_node=4,
           min_num_points_in_node=invalid_min_points,
           max_depth=5)
def get_test_clustering_param(epsilon=1.0,
                              delta=1e-2,
                              frac_sum=0.2,
                              frac_group_count=0.8,
                              min_num_points_in_branching_node=4,
                              min_num_points_in_node=2,
                              max_depth=4,
                              radius=1):
  # pylint: disable=g-doc-args
  """Builds a ClusteringParam for tests, defaulting every unspecified field.

  Usage: Explicitly pass in parameters that are relied on in the test.

  Returns:
    A clustering_params.ClusteringParam assembled from a
    DifferentialPrivacyParam (epsilon, delta), a PrivacyBudgetSplit (frac_sum,
    frac_group_count), a TreeParam (min_num_points_in_branching_node,
    min_num_points_in_node, max_depth), the fixed short description
    'TestClusteringParam', and the given radius.
  """
  # The nested parameter objects are created in the same order as the keyword
  # arguments below: privacy, budget split, then tree.
  return clustering_params.ClusteringParam(
      privacy_param=clustering_params.DifferentialPrivacyParam(
          epsilon=epsilon, delta=delta),
      privacy_budget_split=clustering_params.PrivacyBudgetSplit(
          frac_sum=frac_sum, frac_group_count=frac_group_count),
      tree_param=clustering_params.TreeParam(
          min_num_points_in_branching_node=min_num_points_in_branching_node,
          min_num_points_in_node=min_num_points_in_node,
          max_depth=max_depth),
      short_description='TestClusteringParam',
      radius=radius)
def default_tree_param(
    k: int, data: clustering_params.Data,
    privacy_param: clustering_params.DifferentialPrivacyParam,
    privacy_budget_split: clustering_params.PrivacyBudgetSplit
) -> typing.Tuple[clustering_params.TreeParam, PrivateCount]:
    """Picks a heuristic TreeParam from the data and the number of clusters.

    Args:
      k: Number of clusters to divide the data into. It is used as a divisor
        (2 * k), so it must be non-zero.
      data: Data to find centers for; its num_points and dim are read.
      privacy_param: privacy parameters for the algorithm.
      privacy_budget_split: budget split between different computations.

    Returns:
      (default TreeParam, private count). The private count is returned as
      well so that callers do not need to compute it again.
    """
    # The depth feeds into the private count computation below, so it has to
    # be fixed up front rather than derived from the count. The value was
    # chosen experimentally over multiple datasets.
    depth_limit = 20

    # Standard deviation of the noise added to sums, for a sensitivity of 1.
    # An infinite epsilon means no noise at all.
    if privacy_param.epsilon != np.inf:
        sum_budget = common.DifferentialPrivacyParameters(
            privacy_param.epsilon * privacy_budget_split.frac_sum,
            privacy_param.delta)
        noise_std = accountant.get_smallest_gaussian_noise(
            sum_budget, num_queries=1, sensitivity=1.0)
    else:
        noise_std = 0

    count_param = central_privacy_utils.PrivateCountParam(
        privacy_param, privacy_budget_split, depth_limit)
    private_count = central_privacy_utils.get_private_count(
        data.num_points, count_param)

    # Think of the sum noise as being shared by the points in a node. Its
    # l2-norm is roughly sqrt(dimension) * noise_std * radius, so spreading it
    # over 10 * sqrt(dimension) * noise_std points leaves each point with
    # noise of roughly 0.1 * radius.
    low_noise_node_size = int(10 * np.sqrt(data.dim) * noise_std)

    # Cap the node size so that there can still be a node per cluster, even
    # if that means more noise than desired.
    per_cluster_cap = private_count // (2 * k)

    # The lower bound of 1 is applied last on purpose: the private count (and
    # therefore the cap) can be negative.
    node_size = max(1, min(low_noise_node_size, per_cluster_cap))

    tree_param = clustering_params.TreeParam(
        min_num_points_in_branching_node=3 * node_size,
        min_num_points_in_node=node_size,
        max_depth=depth_limit)
    return (tree_param, private_count)
Exemple #4
0
 def test_tree_param(self):
   """Checks that TreeParam stores each constructor argument unchanged."""
   expected_fields = {
       "min_num_points_in_branching_node": 4,
       "min_num_points_in_node": 2,
       "max_depth": 5,
   }
   tree_param = clustering_params.TreeParam(**expected_fields)
   # Dict order is insertion order, so the attributes are checked in the same
   # order they are listed above.
   for field_name, expected_value in expected_fields.items():
     self.assertEqual(getattr(tree_param, field_name), expected_value)
Exemple #5
0
 def test_clustering_param(self):
   """Checks that ClusteringParam exposes every constructor argument."""
   description = "TestClusteringParam"
   dp_param = clustering_params.DifferentialPrivacyParam()
   budget_split = clustering_params.PrivacyBudgetSplit()
   tree = clustering_params.TreeParam(
       min_num_points_in_branching_node=4,
       min_num_points_in_node=2,
       max_depth=5)

   param = clustering_params.ClusteringParam(
       privacy_param=dp_param,
       privacy_budget_split=budget_split,
       tree_param=tree,
       short_description=description,
       radius=20)

   # (attribute name, value passed to the constructor) pairs.
   expected_fields = (
       ("privacy_param", dp_param),
       ("privacy_budget_split", budget_split),
       ("tree_param", tree),
       ("short_description", description),
       ("radius", 20),
   )
   for field_name, expected_value in expected_fields:
     self.assertEqual(getattr(param, field_name), expected_value)
Exemple #6
0
    def test_clipped_data_used_for_clustering_and_not_result_calculation(self):
        """Centers come from clipped points; loss uses the original points."""
        # With radius=1 the points far from the origin get clipped for the
        # center calculation. Clipped datapoints:
        # [[0.3, 0.2], [0.6, 0.8], [0.6, 0.8]]
        raw_points = np.array([[0.3, 0.2], [3, 4], [6, 8]])
        small_radius_data = clustering_params.Data(
            datapoints=raw_points, radius=1)
        # Infinite epsilon: no noise is added.
        noiseless_privacy = clustering_params.DifferentialPrivacyParam(np.inf)
        # No branching, so the coreset is just the average of the points.
        single_node_tree = clustering_params.TreeParam(1, 1, 0)

        result = clustering_algorithm.private_lsh_clustering(
            3, small_radius_data, noiseless_privacy,
            tree_param=single_node_tree)

        # Mean of the clipped points:
        # ((0.3 + 0.6 + 0.6) / 3, (0.2 + 0.8 + 0.8) / 3) = (0.5, 0.6).
        clipped_mean = np.array([0.5, 0.6])
        self.assertLen(result.centers, 1)
        self.assertSequenceAlmostEqual(result.centers[0], clipped_mean)

        # Every point is assigned to the single center.
        self.assertListEqual(list(result.labels), [0, 0, 0])

        # The loss is measured against the unclipped points. The expected
        # value is consistent with the sum of squared distances to the
        # center: 0.2 + 17.81 + 85.01 = 103.02.
        self.assertAlmostEqual(result.loss, 103.02)