예제 #1
0
def get_test_clustering_param(epsilon=1.0,
                              delta=1e-2,
                              frac_sum=0.2,
                              frac_group_count=0.8,
                              min_num_points_in_branching_node=4,
                              min_num_points_in_node=2,
                              max_depth=4,
                              radius=1):
  # pylint: disable=g-doc-args
  """Builds a ClusteringParam for tests, using defaults for omitted values.

  Usage: a test should explicitly pass every parameter whose value it relies
  on; the defaults only fill in values the test does not care about.

  Returns:
    A clustering_params.ClusteringParam assembled from the given privacy,
    budget-split and tree values, with short_description
    'TestClusteringParam' and the given radius.
  """
  # Keyword arguments are evaluated left to right, so the component objects
  # are constructed in the order: privacy param, budget split, tree param.
  return clustering_params.ClusteringParam(
      privacy_param=clustering_params.DifferentialPrivacyParam(
          epsilon=epsilon, delta=delta),
      privacy_budget_split=clustering_params.PrivacyBudgetSplit(
          frac_sum=frac_sum, frac_group_count=frac_group_count),
      tree_param=clustering_params.TreeParam(
          min_num_points_in_branching_node=min_num_points_in_branching_node,
          min_num_points_in_node=min_num_points_in_node,
          max_depth=max_depth),
      short_description='TestClusteringParam',
      radius=radius)
예제 #2
0
  def test_default_tree_param(self, points, returned_private_count, k, epsilon,
                              expected_min_num_points_in_branching_node,
                              expected_min_num_points_in_node,
                              expected_max_depth, mock_gaussian_noise,
                              mock_private_count):
    """Checks default_tree_param's results and the calls it makes.

    The data is `points` all-ones rows of dimension 10 with radius 1.0, and the
    budget split is frac_sum=0.8 / frac_group_count=0.2. The two mock arguments
    are presumably injected by patch decorators that are not visible here.
    """
    delta = 1e-2
    frac_sum, frac_group_count = 0.8, 0.2
    mock_private_count.return_value = returned_private_count

    dataset = clustering_params.Data(np.ones(shape=(points, 10)), radius=1.0)
    dp_param = clustering_params.DifferentialPrivacyParam(
        epsilon=epsilon, delta=delta)
    split = clustering_params.PrivacyBudgetSplit(
        frac_sum=frac_sum, frac_group_count=frac_group_count)

    tree_param, private_count = default_clustering_params.default_tree_param(
        k, dataset, dp_param, split)

    self.assertEqual(tree_param.max_depth, expected_max_depth)
    # With infinite epsilon no Gaussian noise lookup is expected; otherwise it
    # must be queried exactly once with the sum share of the budget.
    if epsilon != np.inf:
      mock_gaussian_noise.assert_called_once_with(
          common.DifferentialPrivacyParameters(frac_sum * epsilon, delta),
          1, 1.0)
    else:
      mock_gaussian_noise.assert_not_called()

    # The count share of epsilon is divided by (max_depth + 1).
    expected_count_param = central_privacy_utils.CountPrivacyParam(
        epsilon=frac_group_count * epsilon / (tree_param.max_depth + 1),
        delta=delta)
    mock_private_count.assert_called_once_with(
        nonprivate_count=points, count_privacy_param=expected_count_param)
    self.assertEqual(private_count, returned_private_count)
    self.assertEqual(tree_param.min_num_points_in_node,
                     expected_min_num_points_in_node)
    self.assertEqual(tree_param.min_num_points_in_branching_node,
                     expected_min_num_points_in_branching_node)
예제 #3
0
def private_lsh_clustering(
        k: int,
        data: clustering_params.Data,
        privacy_param: clustering_params.DifferentialPrivacyParam,
        privacy_budget_split: typing.Optional[
            clustering_params.PrivacyBudgetSplit] = None,
        tree_param: typing.Optional[clustering_params.TreeParam] = None,
        short_description: str = "ClusteringParam") -> ClusteringResult:
    """Clusters data into k clusters.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for. Centering the data around the origin
        beforehand may provide performance improvements.
      privacy_param: Differential privacy parameters.
      privacy_budget_split: Optional privacy budget split between operations
        in the clustering algorithm for fine-tuning. When None, a default
        PrivacyBudgetSplit() is used.
      tree_param: Optional tree parameters for generating the LSH net tree for
        fine-tuning. When None, they are derived via
        default_clustering_params.default_tree_param.
      short_description: Optional description to identify this parameter
        configuration.

    Returns:
      ClusteringResult with differentially private centers. The rest of
      ClusteringResult is nonprivate, and only provided for convenience.
    """
    # Fill in any parameters the caller left unspecified.
    budget_split = privacy_budget_split
    if budget_split is None:
        budget_split = clustering_params.PrivacyBudgetSplit()

    root_private_count = None
    if tree_param is None:
        # The default computation also yields a private count, which is kept
        # so it can be re-used for the root node of the tree.
        defaults = default_clustering_params.default_tree_param(
            k, data, privacy_param, budget_split)
        tree_param, root_private_count = defaults

    clustering_param = clustering_params.ClusteringParam(
        privacy_param, budget_split, tree_param, short_description,
        data.radius)
    logging.debug("clustering_param: %s", clustering_param)

    # To guarantee privacy, enforce the radius provided.
    clipped_points = data.clip_by_radius()
    clipped_data = clustering_params.Data(clipped_points, data.radius,
                                          data.labels)

    coreset: private_outputs.PrivateWeightedData = get_private_coreset(
        clipped_data, clustering_param, root_private_count)

    # The coreset may contain fewer points than the requested cluster count.
    num_clusters = min(k, len(coreset.datapoints))
    logging.debug(
        "Starting k-means++ computation on private coreset with k=%d. This may "
        "be less than the original if generated coreset data ended up with "
        "less than k unique points.", num_clusters)
    estimator = sklearn.cluster.KMeans(n_clusters=num_clusters,
                                       init="k-means++")
    fitted = estimator.fit(coreset.datapoints, sample_weight=coreset.weights)

    # Calculate the result relative to the original data.
    # Note: the calculations besides the centers are nonprivate.
    return ClusteringResult(data, fitted.cluster_centers_)
 def test_private_count_param(self):
     """Checks the group-count privacy parameters for a sample setup.

     For epsilon=10, frac_group_count=0.8 and a max tree depth of 3, the
     resulting count epsilon is expected to be 2.0 and delta to stay 1e-2.
     """
     delta = 1e-2
     tree_depth = 3
     dp_param = clustering_params.DifferentialPrivacyParam(epsilon=10,
                                                           delta=delta)
     split = clustering_params.PrivacyBudgetSplit(frac_sum=0.2,
                                                  frac_group_count=0.8)

     result = CountPrivacyParam.compute_group_count_privacy_param(
         dp_param, split, tree_depth)

     self.assertEqual(result.epsilon, 2.0)
     self.assertEqual(result.delta, delta)
예제 #5
0
 def test_private_count_param(self):
     """Checks that CentralPrivateCountParam exposes its constructor inputs."""
     tree_depth = 10
     dp_param = clustering_params.DifferentialPrivacyParam(epsilon=2.0,
                                                           delta=1e-3)
     split = clustering_params.PrivacyBudgetSplit(frac_sum=0.7,
                                                  frac_group_count=0.3)

     count_param = CentralPrivateCountParam(dp_param, split, tree_depth)

     # Each attribute should hold the value passed at construction.
     self.assertEqual(count_param.privacy_param, dp_param)
     self.assertEqual(count_param.privacy_budget_split, split)
     self.assertEqual(count_param.max_tree_depth, tree_depth)
예제 #6
0
 def test_clustering_param(self):
   """Checks that ClusteringParam exposes the values it was built with."""
   description = "TestClusteringParam"
   radius = 20
   dp_param = clustering_params.DifferentialPrivacyParam()
   split = clustering_params.PrivacyBudgetSplit()
   tree = clustering_params.TreeParam(min_num_points_in_branching_node=4,
                                      min_num_points_in_node=2,
                                      max_depth=5)

   param = clustering_params.ClusteringParam(privacy_param=dp_param,
                                             privacy_budget_split=split,
                                             tree_param=tree,
                                             short_description=description,
                                             radius=radius)

   # Every field should hold the value passed at construction.
   self.assertEqual(param.privacy_param, dp_param)
   self.assertEqual(param.privacy_budget_split, split)
   self.assertEqual(param.tree_param, tree)
   self.assertEqual(param.short_description, description)
   self.assertEqual(param.radius, radius)
예제 #7
0
 def test_privacy_budget_split_invalid(self):
   """Checks that a budget split whose fractions exceed 1.0 is rejected."""
   # In the context-manager form of assertRaises, `msg` is only the failure
   # message reported when no ValueError is raised; it is not matched against
   # the exception text. The fractions below sum to 1.5, so the message
   # states 1.5.
   with self.assertRaises(
       ValueError,
       msg="The provided privacy budget split (1.5) was greater than 1.0."):
     clustering_params.PrivacyBudgetSplit(frac_sum=0.7, frac_group_count=0.8)
예제 #8
0
 def test_privacy_budget_split_defaults(self):
   """Checks the default fractions of a PrivacyBudgetSplit."""
   default_split = clustering_params.PrivacyBudgetSplit()
   # Expected defaults: 0.8 of the budget for sums, 0.2 for group counts.
   self.assertEqual(default_split.frac_sum, 0.8)
   self.assertEqual(default_split.frac_group_count, 0.2)