Beispiel #1
0
    def test_validation(self):
        """Checks which (epsilon, delta) pairs BudgetAccountant accepts.

        A positive epsilon with a zero or positive delta must construct
        without error; a zero epsilon or a negative delta must raise
        ValueError.
        """
        # Accepted budgets: construction alone must not raise.
        for accepted_delta in (1e-10, 0):
            pipeline_dp.BudgetAccountant(epsilon=1, delta=accepted_delta)

        rejected_budgets = (
            # Epsilon must be positive.
            dict(epsilon=0, delta=1e-10),
            # Delta must be non-negative.
            dict(epsilon=0.5, delta=-1e-10),
        )
        for budget_kwargs in rejected_budgets:
            with self.assertRaises(ValueError):
                pipeline_dp.BudgetAccountant(**budget_kwargs)
Beispiel #2
0
def calc_dp_rating_metrics(movie_views, ops, public_partitions):
    """Sets up a differentially private COUNT aggregation over movie views.

    Args:
        movie_views: collection of movie-view records. Each record is read
            through its `movie_id`, `user_id` and `rating` attributes.
        ops: pipeline operations object handed to `pipeline_dp.DPEngine`.
        public_partitions: forwarded unchanged as the `public_partitions`
            argument of `pipeline_dp.AggregateParams`.

    Returns:
        The value returned by `DPEngine.aggregate` for this input.
    """
    # The whole computation shares one privacy budget.
    accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, ops)

    # How to read the partition key, privacy id and value from one record.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda view: view.movie_id,
        privacy_id_extractor=lambda view: view.user_id,
        value_extractor=lambda view: view.rating)

    # Which metrics to compute and the bounds passed along with them.
    aggregate_params = pipeline_dp.AggregateParams(
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        low=1,
        high=5,
        public_partitions=public_partitions)

    result = engine.aggregate(movie_views, aggregate_params, extractors)

    # compute_budgets() is called once the aggregation has been requested.
    accountant.compute_budgets()
    return result
Beispiel #3
0
    def test_request_budget(self):
        """A requested budget object exists before its values are computed.

        `request_budget` must return a truthy object, while reading `eps`
        or `delta` from it before `compute_budgets()` has been called is
        expected to raise AssertionError.
        """
        accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=0)
        requested = accountant.request_budget(
            1, use_eps=False, use_delta=False)

        # An object must be returned.
        self.assertTrue(requested)

        # The privacy budget is not calculated yet, so neither field is
        # readable.
        for field_name in ("eps", "delta"):
            with self.assertRaises(AssertionError):
                print(getattr(requested, field_name))
Beispiel #4
0
    def test_compute_budgets(self):
        """Checks the values assigned to two budgets by compute_budgets().

        Two budgets are requested with first arguments 1 and 3 from an
        accountant holding epsilon=1 and delta=1e-6. After
        `compute_budgets()` the test expects eps 0.25 / 0.75, delta 0 for
        the request made with use_delta=False, and delta 1e-6 for the other.
        """
        accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)
        eps_only = accountant.request_budget(
            1, use_eps=True, use_delta=False)
        eps_and_delta = accountant.request_budget(
            3, use_eps=True, use_delta=True)

        accountant.compute_budgets()

        self.assertEqual(eps_only.eps, 0.25)
        # Delta should be 0 if use_delta is False.
        self.assertEqual(eps_only.delta, 0)

        self.assertEqual(eps_and_delta.eps, 0.75)
        self.assertEqual(eps_and_delta.delta, 1e-6)
Beispiel #5
0
    def test_contribution_bounding_empty_col(self):
        """Bounding the contributions of an empty collection yields nothing."""
        engine = pipeline_dp.DPEngine(
            pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
            pipeline_dp.LocalPipelineOperations())

        bounded = engine._bound_contributions(
            [],
            max_partitions_contributed=2,
            max_contributions_per_partition=2,
            aggregator_fn=dp_engineTest.aggregator_fn)

        # Materialise the result first so an empty iterable is seen as falsy.
        self.assertFalse(list(bounded))
Beispiel #6
0
 def test_select_private_partitions(self):
     """Runs the private-partition helper against three scenarios.

     The input is grouped by key and each group is wrapped in a
     `_MockAccumulator`. The grouped data is then handed to
     `_mock_and_assert_private_partitions` with third arguments 0, 3 and
     100, expecting respectively both groups, only the "pid1" group, and
     no group at all. The meaning of that third argument is defined by the
     helper, which is not visible here.
     """
     pid1_values = [('pk1', 1), ('pk1', 2), ('pk2', 3), ('pk2', 4),
                    ('pk2', 5), ('pk3', 6), ('pk4', 7)]
     pid2_values = [('pk4', 8)]
     input_col = ([("pid1", value) for value in pid1_values] +
                  [("pid2", value) for value in pid2_values])
     max_partitions_contributed = 3

     engine = pipeline_dp.DPEngine(pipeline_dp.BudgetAccountant(1, 1e-10),
                                   pipeline_dp.LocalPipelineOperations())
     grouped = engine._ops.group_by_key(input_col, None)
     grouped = engine._ops.map_values(
         grouped, lambda values: _MockAccumulator(values))
     grouped = list(grouped)

     # Each expectation gets freshly built accumulators over copied lists,
     # so no state is shared between the three scenarios.
     def pid1_group():
         return ("pid1", _MockAccumulator(list(pid1_values)))

     def pid2_group():
         return ("pid2", _MockAccumulator(list(pid2_values)))

     # Scenario 1: both groups are expected to remain.
     self._mock_and_assert_private_partitions(
         engine, grouped, 0, [pid1_group(), pid2_group()],
         max_partitions_contributed)

     # Scenario 2: only the "pid1" group is expected to remain.
     self._mock_and_assert_private_partitions(
         engine, grouped, 3, [pid1_group()], max_partitions_contributed)

     # Scenario 3: nothing is expected to remain.
     self._mock_and_assert_private_partitions(
         engine, grouped, 100, [], max_partitions_contributed)
Beispiel #7
0
    def test_contribution_bounding_bound_input_nothing_dropped(self):
        """Bounds equal to the actual contributions must drop nothing.

        One privacy id contributes two values to each of two partitions and
        both bounds are 2, so every value is expected to reach the
        aggregator. The expected tuples look like (count, sum, sum of
        squares); the aggregator itself is defined elsewhere.
        """
        records = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2),
                   ("pid1", 'pk2', 3), ("pid1", 'pk2', 4)]

        engine = pipeline_dp.DPEngine(
            pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bounded = engine._bound_contributions(
            records,
            max_partitions_contributed=2,
            max_contributions_per_partition=2,
            aggregator_fn=dp_engineTest.aggregator_fn)

        # Compared as sets because the output order is not checked.
        expected = {
            (('pid1', 'pk2'), (2, 7, 25)),
            (('pid1', 'pk1'), (2, 3, 5)),
        }
        self.assertEqual(expected, set(bounded))
Beispiel #8
0
    def test_accumulator_factory(self,
                                 mock_create_accumulator_params_function):
        """Checks that AccumulatorFactory builds the configured accumulator.

        Args:
            mock_create_accumulator_params_function: mock whose return value
                is set here to a single `AccumulatorParams` for
                `MeanAccumulator`. It is presumably injected by a patch
                decorator that is outside the visible part of the file.
        """
        aggregate_params = pipeline_dp.AggregateParams([agg.Metrics.MEAN], 5,
                                                       3)
        budget_accountant = pipeline_dp.BudgetAccountant(1, 0.01)

        mock_create_accumulator_params_function.return_value = [
            accumulator.AccumulatorParams(MeanAccumulator, None)
        ]

        accumulator_factory = accumulator.AccumulatorFactory(
            aggregate_params, budget_accountant)
        accumulator_factory.initialize()
        created_accumulator = accumulator_factory.create([10])

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)), which only reports "False is not true".
        self.assertIsInstance(created_accumulator, MeanAccumulator)
        self.assertEqual(created_accumulator.compute_metrics(), 10)
        # The factory must have passed on exactly the objects it was given.
        mock_create_accumulator_params_function.assert_called_with(
            aggregate_params, budget_accountant)
Beispiel #9
0
    def test_contribution_bounding_per_partition_bounding_applied(self):
        """Per-partition bounding caps the contributions in each result.

        "pid1" contributes three values to 'pk2' while at most two
        contributions per partition are allowed. The cross-partition bound
        (5) is larger than the number of partitions in the input.
        """
        records = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2),
                   ("pid1", 'pk2', 3), ("pid1", 'pk2', 4),
                   ("pid1", 'pk2', 5), ("pid2", 'pk2', 6)]
        per_partition_cap = 2

        engine = pipeline_dp.DPEngine(
            pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bounded = list(
            engine._bound_contributions(
                records,
                max_partitions_contributed=5,
                max_contributions_per_partition=per_partition_cap,
                aggregator_fn=dp_engineTest.aggregator_fn))

        # One result is expected per (privacy id, partition key) pair.
        self.assertEqual(len(bounded), 3)
        # Check contributions per partition: the first field of each
        # aggregated value must not exceed the cap.
        self.assertTrue(
            all(entry[1][0] <= per_partition_cap for entry in bounded))
Beispiel #10
0
    def test_contribution_bounding_cross_partition_bounding_applied(self):
        """Cross-partition bounding limits the partitions kept per privacy id.

        "pid1" contributes to four partitions while at most three are
        allowed. The per-partition bound (5) is larger than any
        per-partition contribution count in the input.
        """
        records = [
            ("pid1", 'pk1', 1), ("pid1", 'pk1', 2), ("pid1", 'pk2', 3),
            ("pid1", 'pk2', 4), ("pid1", 'pk2', 5), ("pid1", 'pk3', 6),
            ("pid1", 'pk4', 7), ("pid2", 'pk4', 8)
        ]
        partition_cap = 3
        per_partition_cap = 5

        engine = pipeline_dp.DPEngine(
            pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bounded = list(
            engine._bound_contributions(
                records,
                max_partitions_contributed=partition_cap,
                max_contributions_per_partition=per_partition_cap,
                aggregator_fn=dp_engineTest.aggregator_fn))

        self.assertEqual(len(bounded), 4)
        # Check contributions per partition.
        self.assertTrue(
            all(entry[1][0] <= per_partition_cap for entry in bounded))

        # Check cross-partition contributions: collect the partition keys
        # that survived for each privacy id.
        partitions_by_pid = collections.defaultdict(list)
        for key, _ in bounded:
            partitions_by_pid[key[0]].append(key[1])

        self.assertEqual(len(partitions_by_pid), 2)
        self.assertTrue(
            all(
                len(partition_keys) <= partition_cap
                for partition_keys in partitions_by_pid.values()))