def test_validation(self):
    pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10)  # No exception.
    pipeline_dp.BudgetAccountant(epsilon=1, delta=0)  # No exception.
    with self.assertRaises(ValueError):
        pipeline_dp.BudgetAccountant(epsilon=0,
                                     delta=1e-10)  # Epsilon must be positive.
    with self.assertRaises(ValueError):
        pipeline_dp.BudgetAccountant(epsilon=0.5,
                                     delta=-1e-10)  # Delta must be non-negative.
def calc_dp_rating_metrics(movie_views, ops, public_partitions):
    """Computes DP metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, ops)

    # Specify which DP aggregated metrics to compute.
    params = pipeline_dp.AggregateParams(
        metrics=[
            pipeline_dp.Metrics.COUNT,
        ],
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        low=1,
        high=5,
        public_partitions=public_partitions)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    return dp_result
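# --- Usage sketch (not part of the original code) ---
# A minimal, hypothetical example of how calc_dp_rating_metrics above might be
# invoked with the local backend. MovieView and the sample rows are assumed
# stand-ins for the real movie views collection; public_partitions lists the
# movie_ids treated as publicly known partitions.
import collections

import pipeline_dp

MovieView = collections.namedtuple('MovieView', ['user_id', 'movie_id', 'rating'])

movie_views = [
    MovieView(user_id=1, movie_id='m1', rating=4),
    MovieView(user_id=1, movie_id='m2', rating=5),
    MovieView(user_id=2, movie_id='m1', rating=3),
]
public_partitions = ['m1', 'm2']
ops = pipeline_dp.LocalPipelineOperations()

# With the local backend the result is a lazy iterable; list() materializes it.
dp_result = list(calc_dp_rating_metrics(movie_views, ops, public_partitions))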
def test_request_budget(self):
    budget_accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=0)
    budget = budget_accountant.request_budget(1, use_eps=False, use_delta=False)
    self.assertTrue(budget)  # An object must be returned.

    with self.assertRaises(AssertionError):
        print(budget.eps)  # The privacy budget is not calculated yet.

    with self.assertRaises(AssertionError):
        print(budget.delta)  # The privacy budget is not calculated yet.
def test_compute_budgets(self):
    budget_accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)
    budget1 = budget_accountant.request_budget(1, use_eps=True, use_delta=False)
    budget2 = budget_accountant.request_budget(3, use_eps=True, use_delta=True)
    budget_accountant.compute_budgets()

    # Epsilon is split proportionally to the requested weights (1:3),
    # so budget1 gets 1/4 and budget2 gets 3/4 of the total epsilon.
    self.assertEqual(budget1.eps, 0.25)
    self.assertEqual(budget1.delta, 0)  # Delta should be 0 if use_delta is False.
    self.assertEqual(budget2.eps, 0.75)
    # budget2 is the only request that uses delta, so it receives all of it.
    self.assertEqual(budget2.delta, 1e-6)
def test_contribution_bounding_empty_col(self):
    input_col = []
    max_partitions_contributed = 2
    max_contributions_per_partition = 2
    dp_engine = pipeline_dp.DPEngine(
        pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
        pipeline_dp.LocalPipelineOperations())
    bound_result = list(
        dp_engine._bound_contributions(
            input_col,
            max_partitions_contributed=max_partitions_contributed,
            max_contributions_per_partition=max_contributions_per_partition,
            aggregator_fn=dp_engineTest.aggregator_fn))

    self.assertFalse(bound_result)
def test_select_private_partitions(self):
    input_col = [("pid1", ('pk1', 1)), ("pid1", ('pk1', 2)), ("pid1", ('pk2', 3)),
                 ("pid1", ('pk2', 4)), ("pid1", ('pk2', 5)), ("pid1", ('pk3', 6)),
                 ("pid1", ('pk4', 7)), ("pid2", ('pk4', 8))]
    max_partitions_contributed = 3

    engine = pipeline_dp.DPEngine(pipeline_dp.BudgetAccountant(1, 1e-10),
                                  pipeline_dp.LocalPipelineOperations())
    groups = engine._ops.group_by_key(input_col, None)
    groups = engine._ops.map_values(groups,
                                    lambda group: _MockAccumulator(group))
    groups = list(groups)

    expected_data_filtered = [("pid1",
                               _MockAccumulator([
                                   ('pk1', 1),
                                   ('pk1', 2),
                                   ('pk2', 3),
                                   ('pk2', 4),
                                   ('pk2', 5),
                                   ('pk3', 6),
                                   ('pk4', 7),
                               ])),
                              ("pid2", _MockAccumulator([('pk4', 8)]))]
    self._mock_and_assert_private_partitions(engine, groups, 0,
                                             expected_data_filtered,
                                             max_partitions_contributed)

    expected_data_filtered = [
        ("pid1",
         _MockAccumulator([
             ('pk1', 1),
             ('pk1', 2),
             ('pk2', 3),
             ('pk2', 4),
             ('pk2', 5),
             ('pk3', 6),
             ('pk4', 7),
         ])),
    ]
    self._mock_and_assert_private_partitions(engine, groups, 3,
                                             expected_data_filtered,
                                             max_partitions_contributed)

    expected_data_filtered = []
    self._mock_and_assert_private_partitions(engine, groups, 100,
                                             expected_data_filtered,
                                             max_partitions_contributed)
def test_contribution_bounding_bound_input_nothing_dropped(self):
    input_col = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2), ("pid1", 'pk2', 3),
                 ("pid1", 'pk2', 4)]
    max_partitions_contributed = 2
    max_contributions_per_partition = 2
    dp_engine = pipeline_dp.DPEngine(
        pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
        pipeline_dp.LocalPipelineOperations())
    bound_result = list(
        dp_engine._bound_contributions(
            input_col,
            max_partitions_contributed=max_partitions_contributed,
            max_contributions_per_partition=max_contributions_per_partition,
            aggregator_fn=dp_engineTest.aggregator_fn))

    expected_result = [(('pid1', 'pk2'), (2, 7, 25)),
                       (('pid1', 'pk1'), (2, 3, 5))]
    self.assertEqual(set(expected_result), set(bound_result))
def test_accumulator_factory(self, mock_create_accumulator_params_function):
    aggregate_params = pipeline_dp.AggregateParams([agg.Metrics.MEAN], 5, 3)
    budget_accountant = pipeline_dp.BudgetAccountant(1, 0.01)
    values = [10]
    mock_create_accumulator_params_function.return_value = [
        accumulator.AccumulatorParams(MeanAccumulator, None)
    ]

    accumulator_factory = accumulator.AccumulatorFactory(aggregate_params,
                                                         budget_accountant)
    accumulator_factory.initialize()
    created_accumulator = accumulator_factory.create(values)

    self.assertTrue(isinstance(created_accumulator, MeanAccumulator))
    self.assertEqual(created_accumulator.compute_metrics(), 10)
    mock_create_accumulator_params_function.assert_called_with(
        aggregate_params, budget_accountant)
def test_contribution_bounding_per_partition_bounding_applied(self):
    input_col = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2), ("pid1", 'pk2', 3),
                 ("pid1", 'pk2', 4), ("pid1", 'pk2', 5), ("pid2", 'pk2', 6)]
    max_partitions_contributed = 5
    max_contributions_per_partition = 2
    dp_engine = pipeline_dp.DPEngine(
        pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
        pipeline_dp.LocalPipelineOperations())
    bound_result = list(
        dp_engine._bound_contributions(
            input_col,
            max_partitions_contributed=max_partitions_contributed,
            max_contributions_per_partition=max_contributions_per_partition,
            aggregator_fn=dp_engineTest.aggregator_fn))

    self.assertEqual(len(bound_result), 3)
    # Check contributions per partition.
    self.assertTrue(
        all(
            map(lambda op_val: op_val[1][0] <= max_contributions_per_partition,
                bound_result)))
def test_contribution_bounding_cross_partition_bounding_applied(self):
    input_col = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2), ("pid1", 'pk2', 3),
                 ("pid1", 'pk2', 4), ("pid1", 'pk2', 5), ("pid1", 'pk3', 6),
                 ("pid1", 'pk4', 7), ("pid2", 'pk4', 8)]
    max_partitions_contributed = 3
    max_contributions_per_partition = 5
    dp_engine = pipeline_dp.DPEngine(
        pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
        pipeline_dp.LocalPipelineOperations())
    bound_result = list(
        dp_engine._bound_contributions(
            input_col,
            max_partitions_contributed=max_partitions_contributed,
            max_contributions_per_partition=max_contributions_per_partition,
            aggregator_fn=dp_engineTest.aggregator_fn))

    self.assertEqual(len(bound_result), 4)
    # Check contributions per partition.
    self.assertTrue(
        all(
            map(lambda op_val: op_val[1][0] <= max_contributions_per_partition,
                bound_result)))
    # Check cross-partition contributions.
    dict_of_pid_to_pk = collections.defaultdict(lambda: [])
    for key, _ in bound_result:
        dict_of_pid_to_pk[key[0]].append(key[1])
    self.assertEqual(len(dict_of_pid_to_pk), 2)
    self.assertTrue(
        all(
            map(
                lambda key: len(dict_of_pid_to_pk[key]) <=
                max_partitions_contributed, dict_of_pid_to_pk)))