def test_select_partitions_calls_select_partitions_with_params(
        self, mock_select_partitions):
    """Checks that SelectPartitions hands its params to the mocked call.

    `mock_select_partitions` is injected from outside this method
    (presumably by a patch decorator - confirm against the class); the test
    only inspects how it was called.
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.5)
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        private_numbers = (
            numbers | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformer = private_beam.SelectPartitions(
            select_partitions_params=params,
            partition_extractor=lambda x: f"pk:{x // 10}",
            label="Test select partitions")
        private_numbers | transformer

        # Assert
        # The transform must have picked up the collection's accountant.
        self.assertEqual(transformer._budget_accountant, accountant)
        mock_select_partitions.assert_called_once()
        # Second positional argument of the mocked call is the params object.
        positional_args = mock_select_partitions.call_args[0]
        self.assertEqual(positional_args[1], params)
def test_map_returns_correct_results_and_accountant(self):
    """Map transforms each value, keeps its privacy id and the accountant.

    The result must still be a PrivatePCollection whose underlying
    collection holds (privacy_id, mapped_value) pairs.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol_input = [(1, 2), (2, 3), (3, 4), (4, 5)]
        pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_collection | private_beam.Map(
            fn=lambda x: x[1]**2)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        # Build the expectation as a concrete list. A lazy `map` object can
        # be iterated only once, so the matcher would see an empty
        # expectation if it ever walked it a second time.
        expected = [(PrivateBeamTest.privacy_id_extractor(x), x[1]**2)
                    for x in pcol_input]
        beam_util.assert_that(transformed._pcol, beam_util.equal_to(expected))
        self.assertEqual(transformed._budget_accountant, budget_accountant)
def test_combine_per_returns_sensible_result(self):
    """Health check for CombinePerKey with SumCombineFn on partition "pk1".

    The output is compared against ("pk1", 0.0) with a tolerance of 10.0.
    """
    with TestPipeline() as pipeline:
        # Arrange
        # Privacy ids "0".."29" carry 100.0, ids "30".."59" carry -100.0.
        rows = [(f"{u}", "pk1", 100.0 if u < 30 else -100.0)
                for u in range(60)]
        # Very high epsilon and delta keep the noise small and the test
        # stable.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        source = pipeline | 'Create produce' >> beam.Create(rows)
        private_rows = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0]))
        # Reshape to (partition_key, value) pairs for the combiner.
        keyed_values = private_rows | private_beam.Map(
            lambda x: (x[1], x[2]))

        # Act
        combine_fn = SumCombineFn()
        combine_params = private_beam.CombinePerKeyParams(
            max_partitions_contributed=2, max_contributions_per_partition=1)
        result = keyed_values | private_beam.CombinePerKey(
            combine_fn, combine_params)
        accountant.compute_budgets()

        # Assert
        # Only a sanity check, hence the generous tolerance.
        def close_enough(expected, actual):
            return PrivateBeamTest.value_per_key_within_tolerance(
                expected, actual, 10.0)

        beam_util.assert_that(
            result,
            beam_util.equal_to([("pk1", 0.0)], equals_fn=close_enough))
def test_select_private_partitions_returns_sensible_result(self):
    """Health check: SelectPartitions releases both well-populated keys.

    Fifty distinct privacy ids contribute to "pk1" and fifty others to
    "pk2"; the output is expected to be exactly those two keys.
    """
    with TestPipeline() as pipeline:
        # Arrange
        # Privacy ids 0..49 belong to "pk1", ids 50..99 to "pk2".
        rows = [(u, "pk1" if u < 50 else "pk2") for u in range(100)]
        # Very high epsilon and delta keep the selection effectively
        # deterministic and the test stable.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.9)
        source = pipeline | 'Create produce' >> beam.Create(rows)
        private_rows = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0]))

        # Act
        result = private_rows | private_beam.SelectPartitions(
            select_partitions_params=params,
            partition_extractor=lambda x: x[1],
            label="Test select partitions")
        accountant.compute_budgets()

        # Assert
        # Sanity check only: both partitions must be present in the output.
        beam_util.assert_that(result, beam_util.equal_to(["pk1", "pk2"]))
def test_privacy_id_count_returns_sensible_result(self):
    """Health check: privacy_id_count on 30 ids in "pk1" is close to 30."""
    # Arrange
    col = [(u, "pk1") for u in range(30)]
    dist_data = PrivateRDDTest.sc.parallelize(col)
    # Use very high epsilon and delta to minimize noise and test flakiness.
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=800, total_delta=0.999)

    def privacy_id_extractor(x):
        return x[0]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    privacy_id_count_params = agg.PrivacyIdCountParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        budget_weight=1,
        partition_extractor=lambda x: x[1])

    # Act
    actual_result = prdd.privacy_id_count(privacy_id_count_params)
    budget_accountant.compute_budgets()

    # Assert
    # This is a health check to validate that the result is sensible.
    # Hence, we use a very large tolerance to reduce test flakiness.
    expected_result_dict = {"pk1": 30.0}
    actual_result_dict = self.to_dict(actual_result.collect())
    # Compare the key sets first: the loop below iterates the actual
    # results, so on its own it would pass vacuously for an empty output.
    self.assertSetEqual(set(actual_result_dict), set(expected_result_dict))
    for pk, count in actual_result_dict.items():
        self.assertTrue(
            self.value_per_key_within_tolerance(count,
                                                expected_result_dict[pk],
                                                5.0),
            msg=f"{pk}: got {count}, expected {expected_result_dict[pk]}")
def test_select_partitions_returns_sensible_result(self):
    """Health check: select_partitions releases both well-populated keys.

    Fifty distinct privacy ids contribute to "pk1" and fifty others to
    "pk2"; the sorted output is expected to be exactly those two keys.
    """
    # Arrange
    # Privacy ids 0..49 belong to "pk1", ids 50..99 to "pk2".
    rows = [(u, "pk1" if u < 50 else "pk2") for u in range(100)]
    rdd = PrivateRDDTest.sc.parallelize(rows)
    # Very high epsilon and delta keep the selection effectively
    # deterministic and the test stable.
    accountant = budget_accounting.NaiveBudgetAccountant(total_epsilon=800,
                                                         total_delta=0.999)
    partitions_per_id = 2

    def extract_privacy_id(row):
        return row[0]

    def extract_partition(row):
        return row[1]

    # Act
    prdd = private_spark.make_private(rdd, accountant, extract_privacy_id)
    params = agg.SelectPartitionsParams(
        max_partitions_contributed=partitions_per_id)
    selected = prdd.select_partitions(params, extract_partition)
    accountant.compute_budgets()

    # Assert
    # Sanity check only: both partitions must be present in the output.
    released = sorted(selected.collect())
    self.assertEqual(released, ["pk1", "pk2"])
def test_privacy_id_count_returns_sensible_result(self):
    """Health check: PrivacyIdCount on 30 ids in "pk1" is close to 30.

    The output is compared against ("pk1", 30) with a tolerance of 5.
    """
    with TestPipeline() as pipeline:
        # Arrange
        rows = [(u, "pk1") for u in range(30)]
        # Very high epsilon and delta keep the noise small and the test
        # stable.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        source = pipeline | 'Create produce' >> beam.Create(rows)
        private_rows = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0]))
        count_params = aggregate_params.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: x[1])

        # Act
        result = private_rows | private_beam.PrivacyIdCount(
            privacy_id_count_params=count_params)
        accountant.compute_budgets()

        # Assert
        # Only a sanity check, hence the generous tolerance.
        def close_enough(expected, actual):
            return PrivateBeamTest.value_per_key_within_tolerance(
                expected, actual, 5)

        beam_util.assert_that(
            result,
            beam_util.equal_to([("pk1", 30)], equals_fn=close_enough))
def test_flatmap_returns_correct_results_and_accountant(self):
    """FlatMap pairs every produced element with its source's privacy id.

    Also checks that the result stays a PrivatePCollection that carries
    the original budget accountant.
    """

    def expand(x):
        # Two outputs per input: second field unchanged, then incremented.
        return [(x[0], x[1] + offset) for offset in (0, 1)]

    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        source = pipeline | 'Create produce' >> beam.Create([(1, 2), (2, 3),
                                                             (3, 4)])
        private_rows = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_rows | private_beam.FlatMap(expand)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        expected_pairs = [
            ('pid:(1, 2)', (1, 2)),
            ('pid:(1, 2)', (1, 3)),
            ('pid:(2, 3)', (2, 3)),
            ('pid:(2, 3)', (2, 4)),
            ('pid:(3, 4)', (3, 4)),
            ('pid:(3, 4)', (3, 5)),
        ]
        beam_util.assert_that(transformed._pcol,
                              beam_util.equal_to(expected_pairs))
        self.assertEqual(transformed._budget_accountant, accountant)
def test_variance_calls_aggregate_with_correct_params(self, mock_aggregate):
    """variance() passes VARIANCE AggregateParams and unwraps the metric.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class).
    """
    # Arrange
    spark_context = PrivateRDDTest.sc
    input_rdd = spark_context.parallelize([(1, 0.0, "pk1"),
                                           (2, 10.0, "pk1")])
    MetricsTuple = collections.namedtuple('MetricsTuple', ['variance'])
    mock_aggregate.return_value = spark_context.parallelize([
        ("pk1", MetricsTuple(variance=25.0))
    ])
    accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

    def extract_privacy_id(row):
        return row[1]

    prdd = private_spark.make_private(input_rdd, accountant,
                                      extract_privacy_id)
    variance_params = agg.VarianceParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        min_value=1.5,
        max_value=5.78,
        budget_weight=1.1,
        partition_extractor=lambda x: x[0],
        value_extractor=lambda x: x)

    # Act
    actual_result = prdd.variance(variance_params)

    # Assert
    mock_aggregate.assert_called_once()
    passed_rdd, passed_params = mock_aggregate.call_args[0][:2]
    # The RDD handed to aggregate holds (privacy_id, original_row) pairs.
    expected_rdd = input_rdd.map(lambda x: (extract_privacy_id(x), x))
    self.assertListEqual(passed_rdd.collect(), expected_rdd.collect())

    expected_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.VARIANCE],
        max_partitions_contributed=(
            variance_params.max_partitions_contributed),
        max_contributions_per_partition=(
            variance_params.max_contributions_per_partition),
        min_value=variance_params.min_value,
        max_value=variance_params.max_value,
        budget_weight=variance_params.budget_weight,
        public_partitions=variance_params.public_partitions)
    self.assertEqual(passed_params, expected_params)
    self.assertEqual(actual_result.collect(), [("pk1", 25.0)])
def test_utility_analysis_params(self):
    """UtilityAnalysisEngine.aggregate rejects unsupported parameter sets.

    Each case's "desc" doubles as the regex that the raised exception's
    message has to match.
    """
    default_extractors = self._get_default_extractors()
    default_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        metrics=[pipeline_dp.Metrics.COUNT])
    params_with_custom_combiners = copy.copy(default_params)
    params_with_custom_combiners.custom_combiners = sum
    params_with_unsupported_metric = copy.copy(default_params)
    params_with_unsupported_metric.metrics = [pipeline_dp.Metrics.MEAN]
    # Copy here as well: aliasing `default_params` would mutate the shared
    # baseline and silently leak the flag into any later copy of it.
    params_with_bounds_enforced = copy.copy(default_params)
    params_with_bounds_enforced.contribution_bounds_already_enforced = True

    test_cases = [
        {
            "desc": "custom combiners",
            "params": params_with_custom_combiners,
            "data_extractor": default_extractors,
            "public_partitions": [1]
        },
        {
            "desc": "unsupported metric in metrics",
            "params": params_with_unsupported_metric,
            "data_extractor": default_extractors,
            "public_partitions": [1]
        },
        {
            "desc": "contribution bounds are already enforced",
            "params": params_with_bounds_enforced,
            "data_extractor": default_extractors,
            "public_partitions": [1]
        },
    ]

    for test_case in test_cases:
        # subTest reports every failing case instead of stopping at the
        # first one.
        with self.subTest(test_case["desc"]):
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=1e-10)
            engine = dp_engine.UtilityAnalysisEngine(
                budget_accountant=budget_accountant,
                backend=pipeline_dp.LocalBackend())
            col = [0, 1, 2]
            # Only the call under test sits inside the assertion, so an
            # unrelated setup error cannot satisfy it by accident.
            with self.assertRaisesRegex(Exception,
                                        expected_regex=test_case["desc"]):
                engine.aggregate(
                    col,
                    test_case["params"],
                    test_case["data_extractor"],
                    public_partitions=test_case["public_partitions"])
def test_map(self):
    """map() transforms values, keeps privacy ids and the accountant."""
    accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)
    rows = [(1, 11), (2, 12)]
    rdd = PrivateRDDTest.sc.parallelize(rows)

    def extract_privacy_id(row):
        return row[0]

    prdd = private_spark.PrivateRDD(rdd, accountant, extract_privacy_id)

    doubled = prdd.map(lambda x: (x[0], x[1] * 2))

    # Each mapped element stays paired with its privacy id.
    expected_pairs = [(1, (1, 22)), (2, (2, 24))]
    self.assertEqual(doubled._rdd.collect(), expected_pairs)
    self.assertEqual(doubled._budget_accountant, prdd._budget_accountant)
def test_mean_calls_aggregate_with_correct_params(self, mock_aggregate):
    """mean() passes MEAN AggregateParams and unwraps the single metric.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class).
    """
    # Arrange
    dist_data = PrivateRDDTest.sc.parallelize([(1, 2.0, "pk1"),
                                               (2, 2.0, "pk1")])
    # Mocked aggregate output is keyed by partition with the metrics as the
    # value. The earlier fixture had key and value swapped, i.e.
    # (2.0, ["pk1"]), which made the test assert a mean of "pk1".
    mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([("pk1",
                                                                  [2.0])])
    budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

    def privacy_id_extractor(x):
        return x[1]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    mean_params = agg.MeanParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                                 max_partitions_contributed=2,
                                 max_contributions_per_partition=3,
                                 min_value=1.5,
                                 max_value=5.78,
                                 budget_weight=1.1,
                                 public_partitions=None,
                                 partition_extractor=lambda x: x[0],
                                 value_extractor=lambda x: x)

    # Act
    actual_result = prdd.mean(mean_params)

    # Assert
    mock_aggregate.assert_called_once()
    args = mock_aggregate.call_args[0]
    # The RDD handed to aggregate holds (privacy_id, original_row) pairs.
    rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
    self.assertListEqual(args[0].collect(), rdd.collect())

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.MEAN],
        max_partitions_contributed=mean_params.max_partitions_contributed,
        max_contributions_per_partition=(
            mean_params.max_contributions_per_partition),
        min_value=mean_params.min_value,
        max_value=mean_params.max_value,
        budget_weight=mean_params.budget_weight,
        public_partitions=mean_params.public_partitions)
    self.assertEqual(args[1], params)
    self.assertEqual(actual_result.collect(), [("pk1", 2.0)])
def test_sum_calls_aggregate_with_params(self, mock_aggregate):
    """Sum passes SUM AggregateParams built from SumParams to aggregate.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class).
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        values = (float(i) for i in range(1, 7))
        source = pipeline | 'Create produce' >> beam.Create(values)
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_values = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))
        sum_params = aggregate_params.SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1,
            max_value=5,
            budget_weight=1,
            public_partitions=[],
            partition_extractor=lambda x: f"pk:{x // 10}",
            value_extractor=lambda x: x)

        # Act
        transformer = private_beam.Sum(sum_params=sum_params)
        private_values | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, accountant)
        mock_aggregate.assert_called_once()
        # Second positional argument of the mocked call is the params.
        passed_params = mock_aggregate.call_args[0][1]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=(
                sum_params.max_partitions_contributed),
            max_contributions_per_partition=(
                sum_params.max_contributions_per_partition),
            min_value=sum_params.min_value,
            max_value=sum_params.max_value,
            public_partitions=sum_params.public_partitions)
        self.assertEqual(expected_params, passed_params)
def test_variance_with_public_partitions_returns_sensible_result(self):
    """Health check: variance over public partitions is close to expected.

    Only "pubK1" and "pubK2" are passed as public partitions; "privK1" is
    present in the data but is not expected in the output.
    """
    # Arrange
    col = [(u, "pubK1", -100) for u in range(30)]
    col += [(u + 30, "pubK1", 100) for u in range(10)]
    col += [(u + 40, "privK1", 100) for u in range(30)]
    dist_data = PrivateRDDTest.sc.parallelize(col)
    # Use very high epsilon and delta to minimize noise and test
    # flakiness.
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=8000, total_delta=0.9999999)

    def privacy_id_extractor(x):
        return x[0]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    variance_params = agg.VarianceParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        min_value=1.55,  # -100 should be clipped to this value
        max_value=2.7889,  # 100 should be clipped to this value
        budget_weight=1,
        partition_extractor=lambda x: x[1],
        value_extractor=lambda x: x[2])

    # Act
    actual_result = prdd.variance(variance_params,
                                  public_partitions=["pubK1", "pubK2"])
    budget_accountant.compute_budgets()

    # Assert
    # This is a health check to validate that the result is sensible.
    # Hence, we use a very large tolerance to reduce test flakiness.
    expected_result_dict = {"pubK1": 0.288, "pubK2": 0.0}
    actual_result_dict = self.to_dict(actual_result.collect())
    # Compare the key sets first: the loop below iterates the actual
    # results, so on its own it would pass vacuously for an empty output
    # and would not notice a missing public partition.
    self.assertSetEqual(set(actual_result_dict), set(expected_result_dict))
    for pk, variance in actual_result_dict.items():
        self.assertTrue(
            self.value_per_key_within_tolerance(variance,
                                                expected_result_dict[pk],
                                                0.1),
            msg=f"{pk}: got {variance}, "
            f"expected {expected_result_dict[pk]}")
def test_transform_with_return_anonymized_enabled_returns_pcollection(self):
    """A private transform with return_anonymized=True gives a PCollection."""
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        private_numbers = (
            numbers | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        anonymizing_transform = SimplePrivatePTransform(
            return_anonymized=True)
        output = private_numbers | anonymizing_transform

        # Assert
        self.assertIsInstance(output, pvalue.PCollection)
def test_make_private_transform_succeeds(self):
    """MakePrivate yields a PrivatePCollection holding the accountant."""
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])

        # Act
        make_private = private_beam.MakePrivate(
            budget_accountant=accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_numbers = (numbers |
                           'Create private collection' >> make_private)

        # Assert
        self.assertIsInstance(private_numbers,
                              private_beam.PrivatePCollection)
        self.assertEqual(private_numbers._budget_accountant, accountant)
def test_sum_returns_sensible_result(self):
    """Health check: sum() with clipping bounds is close to expected.

    Thirty ids contribute 100.0 and thirty contribute -100.0 to "pk1"; the
    result is compared against 130.167 with a tolerance of 5.0.
    """
    # Arrange
    col = [(f"{u}", "pk1", 100.0) for u in range(30)]
    col += [(f"{u + 30}", "pk1", -100.0) for u in range(30)]
    dist_data = PrivateRDDTest.sc.parallelize(col)
    # Use very high epsilon and delta to minimize noise and test
    # flakiness.
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=800, total_delta=0.999)

    def privacy_id_extractor(x):
        return x[0]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    sum_params = agg.SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                               max_partitions_contributed=2,
                               max_contributions_per_partition=3,
                               min_value=1.55,
                               max_value=2.7889,
                               budget_weight=1,
                               public_partitions=None,
                               partition_extractor=lambda x: x[1],
                               value_extractor=lambda x: x[2])

    # Act
    actual_result = prdd.sum(sum_params)
    budget_accountant.compute_budgets()

    # Assert
    # This is a health check to validate that the result is sensible.
    # Hence, we use a very large tolerance to reduce test flakiness.
    expected_result_dict = {"pk1": 130.167}
    actual_result_dict = self.to_dict(actual_result.collect())
    # Compare the key sets first: the loop below iterates the actual
    # results, so on its own it would pass vacuously for an empty output.
    self.assertSetEqual(set(actual_result_dict), set(expected_result_dict))
    # `dp_sum` rather than `sum`, so the builtin is not shadowed.
    for pk, dp_sum in actual_result_dict.items():
        self.assertTrue(
            self.value_per_key_within_tolerance(dp_sum,
                                                expected_result_dict[pk],
                                                5.0),
            msg=f"{pk}: got {dp_sum}, expected {expected_result_dict[pk]}")
def test_variance_with_public_partitions_returns_sensible_result(self):
    """Health check: Variance over public partitions is close to expected.

    Only "pubK1" and "pubK2" are passed as public partitions; the output
    is compared against 0.288 and 0.0 respectively with a tolerance of 0.1.
    """
    with TestPipeline() as pipeline:
        # Arrange
        # ids "0".."29": pubK1/-100.0, "30".."39": pubK1/100.0,
        # "40".."69": privK1/100.0.
        rows = ([(f"{u}", "pubK1", -100.0) for u in range(30)] +
                [(f"{u + 30}", "pubK1", 100.0) for u in range(10)] +
                [(f"{u + 40}", "privK1", 100.0) for u in range(30)])
        source = pipeline | 'Create produce' >> beam.Create(rows)
        # Very high epsilon and delta keep the noise small and the test
        # stable.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=8000, total_delta=0.9999999)
        private_rows = (
            source | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0]))
        variance_params = aggregate_params.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            min_value=1.55,  # lower clipping bound; -100 falls below it
            max_value=2.7889,  # upper clipping bound; 100 lies above it
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2])

        # Act
        result = private_rows | private_beam.Variance(
            variance_params=variance_params,
            public_partitions=["pubK1", "pubK2"])
        accountant.compute_budgets()

        # Assert
        # Only a sanity check, hence the generous tolerance. "pubK2" gets
        # no data points; the value expected for it here is 0.0.
        def close_enough(expected, actual):
            return PrivateBeamTest.value_per_key_within_tolerance(
                expected, actual, 0.1)

        expected_pairs = [("pubK1", 0.288), ("pubK2", 0.0)]
        beam_util.assert_that(
            result, beam_util.equal_to(expected_pairs,
                                       equals_fn=close_enough))
def test_privacy_id_count_calls_aggregate_with_correct_params(
        self, mock_aggregate):
    """privacy_id_count() passes PRIVACY_ID_COUNT params and unwraps them.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class).
    """
    # Arrange
    spark_context = PrivateRDDTest.sc
    input_rdd = spark_context.parallelize([(1, "pk1"), (2, "pk1")])
    MetricsTuple = collections.namedtuple('MetricsTuple',
                                          ['privacy_id_count'])
    mock_aggregate.return_value = spark_context.parallelize([
        ("pk1", MetricsTuple(privacy_id_count=2))
    ])
    accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

    def extract_privacy_id(row):
        return row[0]

    prdd = private_spark.make_private(input_rdd, accountant,
                                      extract_privacy_id)
    count_params = agg.PrivacyIdCountParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        budget_weight=1,
        partition_extractor=lambda x: x[1])

    # Act
    actual_result = prdd.privacy_id_count(count_params)

    # Assert
    mock_aggregate.assert_called_once()
    passed_rdd, passed_params = mock_aggregate.call_args[0][:2]
    # The RDD handed to aggregate holds (privacy_id, original_row) pairs.
    expected_rdd = input_rdd.map(lambda x: (extract_privacy_id(x), x))
    self.assertListEqual(passed_rdd.collect(), expected_rdd.collect())

    expected_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=count_params.max_partitions_contributed,
        max_contributions_per_partition=1)
    self.assertEqual(passed_params, expected_params)
    self.assertEqual([("pk1", 2)], actual_result.collect())
def test_privacy_id_count_calls_aggregate_with_params(self, mock_aggregate):
    """PrivacyIdCount passes PRIVACY_ID_COUNT AggregateParams to aggregate.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class).
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        private_numbers = (
            numbers | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))
        count_params = aggregate_params.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: f"pk:{x // 10}")

        # Act
        transformer = private_beam.PrivacyIdCount(
            privacy_id_count_params=count_params)
        private_numbers | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, accountant)
        mock_aggregate.assert_called_once()
        # Second positional argument of the mocked call is the params.
        passed_params = mock_aggregate.call_args[0][1]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=(
                count_params.max_partitions_contributed),
            max_contributions_per_partition=1,
            public_partitions=count_params.public_partitions)
        self.assertEqual(passed_params, expected_params)
def test_private_collection_with_non_private_transform_throws_error(self):
    """Applying a plain Beam transform to a private collection fails.

    The attempt must raise TypeError with a message naming the expected
    PrivatePTransform type, and must leave the collection private.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act and Assert
        with self.assertRaises(TypeError) as context:
            (private_collection | 'Non private transform on '
             'PrivatePCollection' >> beam.Map(lambda x: x))
        # These checks sit after the `with` block on purpose: anything
        # placed after the raising expression inside it would never run.
        self.assertIsInstance(private_collection,
                              private_beam.PrivatePCollection)
        # assertIn prints both strings on failure, unlike assertTrue(a in b).
        self.assertIn(
            "private_transform should be of type "
            "PrivatePTransform but is ", str(context.exception))
def test_select_partitions_calls_select_partitions_with_correct_params(
        self, mock_aggregate):
    """select_partitions() forwards the keyed RDD and its params.

    `mock_aggregate` is injected from outside this method (presumably by a
    patch decorator - confirm against the class); its return value stands
    in for the selected partitions.
    """
    # Arrange
    dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk2")])
    expected_result_partitions = ["pk1", "pk2"]
    mock_aggregate.return_value = PrivateRDDTest.sc.parallelize(
        expected_result_partitions)
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=1, total_delta=0.01)
    max_partitions_contributed = 2

    def privacy_id_extractor(x):
        return x[0]

    def partition_extractor(x):
        # Return the partition key itself. The earlier `{x[1]}` wrapped it
        # in a set, which is unhashable and so unusable as a key.
        return x[1]

    # Act
    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    select_partitions_params = agg.SelectPartitionsParams(
        max_partitions_contributed=max_partitions_contributed)
    actual_result = prdd.select_partitions(select_partitions_params,
                                           partition_extractor)

    # Assert
    mock_aggregate.assert_called_once()
    actual_args = mock_aggregate.call_args[0]
    actual_rdd = actual_args[0].collect()
    actual_select_partition_params = actual_args[1]
    # The RDD handed to the mocked call holds (privacy_id, row) pairs.
    self.assertListEqual(actual_rdd, [(1, (1, "pk1")), (2, (2, "pk2"))])
    self.assertEqual(
        actual_select_partition_params.max_partitions_contributed,
        max_partitions_contributed)
    self.assertEqual(actual_result.collect(), expected_result_partitions)