def test_count_returns_sensible_result(self):
    """End-to-end health check: a private Beam Count over 30 records in one
    partition should come out close to 30."""
    with TestPipeline() as pipeline:
        # Arrange: 30 distinct privacy units, all contributing to "pk1".
        input_rows = [(user, "pk1") for user in range(30)]
        input_pcol = pipeline | 'Create produce' >> beam.Create(input_rows)
        # A huge epsilon/delta budget keeps the injected noise tiny,
        # which keeps this sanity-check test from being flaky.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        private_pcol = (
            input_pcol
            | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda row: row[0]))
        params = aggregate_params.CountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            budget_weight=1,
            partition_extractor=lambda row: row[1])

        # Act
        result = private_pcol | private_beam.Count(count_params=params)
        accountant.compute_budgets()

        # Assert: sanity check only, so the tolerance is deliberately wide
        # to keep the test stable under noise.
        beam_util.assert_that(
            result,
            beam_util.equal_to(
                [("pk1", 30.0)],
                equals_fn=lambda expected, actual: PrivateBeamTest.
                value_per_key_within_tolerance(expected, actual, 5.0)))
def test_count_calls_with_public_partitions_returns_sensible_result(self):
    """End-to-end health check: Spark Count with public partitions keeps the
    public keys (including an empty one) and drops the private key."""
    # Arrange: 30 contributions to a public partition and 30 to a
    # non-public one.
    rows = [(user, "pubK1") for user in range(30)]
    rows += [(user, "privK1") for user in range(30)]
    rdd = PrivateRDDTest.sc.parallelize(rows)
    # A huge epsilon/delta budget keeps the injected noise tiny,
    # which keeps this sanity-check test from being flaky.
    accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=800, total_delta=0.999)

    def privacy_id_extractor(row):
        return row[0]

    prdd = private_spark.make_private(rdd, accountant, privacy_id_extractor)
    params = agg.CountParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        budget_weight=1,
        public_partitions=["pubK1", "pubK2"],
        partition_extractor=lambda row: row[1])

    # Act
    actual = prdd.count(params)
    accountant.compute_budgets()

    # Assert: "privK1" is not public so it must not appear; "pubK2" has no
    # data so its count should be near zero. Sanity check only, hence the
    # deliberately wide tolerance.
    expected = {"pubK2": 0.0, "pubK1": 30.0}
    for partition, count in self.to_dict(actual.collect()).items():
        self.assertTrue(
            self.value_per_key_within_tolerance(count, expected[partition],
                                                5.0))
def test_count_calls_aggregate_with_correct_params(self, mock_aggregate):
    """Verifies that Spark count() forwards a correctly keyed RDD and
    equivalent AggregateParams to the mocked DP-engine aggregate call."""
    # Arrange: two privacy units, both contributing to partition "pk1".
    data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk1")])
    mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([(2, ["pk1"])
                                                               ])
    accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

    def extract_privacy_id(row):
        return row[0]

    prdd = private_spark.make_private(data, accountant, extract_privacy_id)
    count_params = agg.CountParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        budget_weight=1,
        public_partitions=None,
        partition_extractor=lambda row: row[1])

    # Act
    actual = prdd.count(count_params)

    # Assert
    mock_aggregate.assert_called_once()
    call_args = mock_aggregate.call_args[0]
    # First positional arg: the input RDD keyed by privacy id.
    keyed = data.map(lambda row: (extract_privacy_id(row), row))
    self.assertListEqual(call_args[0].collect(), keyed.collect())
    # Second positional arg: AggregateParams derived from the CountParams.
    expected_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=count_params.max_partitions_contributed,
        max_contributions_per_partition=count_params.
        max_contributions_per_partition,
        public_partitions=count_params.public_partitions)
    self.assertEqual(call_args[1], expected_params)
    # The mocked aggregate output (2, ["pk1"]) comes back as (2, "pk1") —
    # count() evidently unwraps the single-element value list.
    self.assertEqual(actual.collect(), [(2, "pk1")])
def test_count_calls_aggregate_with_params(self, mock_aggregate):
    """Verifies that the Beam Count transform records its budget accountant
    and forwards equivalent AggregateParams to the mocked aggregate call."""
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        input_pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_pcol = (
            input_pcol
            | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))
        count_params = aggregate_params.CountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            budget_weight=1,
            partition_extractor=lambda x: f"pk:{x // 10}")

        # Act
        count_transform = private_beam.Count(count_params=count_params)
        private_pcol | count_transform

        # Assert
        self.assertEqual(count_transform._budget_accountant, accountant)
        mock_aggregate.assert_called_once()
        call_args = mock_aggregate.call_args[0]
        # Second positional arg: AggregateParams derived from CountParams.
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=count_params.
            max_partitions_contributed,
            max_contributions_per_partition=count_params.
            max_contributions_per_partition,
            public_partitions=count_params.public_partitions)
        self.assertEqual(call_args[1], expected_params)