Example #1
0
    def test_annotate_call(self, mock_annotate_fn):
        """Checks that every engine operation produces one annotate call.

        One `select_partitions` and two `aggregate` calls are issued against an
        accountant created with `num_aggregations=3`. Each recorded call of
        `mock_annotate_fn` must carry a `budget` keyword argument holding one
        third of the total epsilon and delta.

        Args:
            mock_annotate_fn: mock recording the annotate calls (presumably
              injected by a patch decorator that is not visible here).
        """
        # Arrange
        total_epsilon = 3
        total_delta = 0.0001
        expected_calls = 3
        accountant = NaiveBudgetAccountant(total_epsilon,
                                           total_delta,
                                           num_aggregations=3)
        engine = self._create_dp_engine_default(accountant)
        aggregate_params, public_partitions = self._create_params_default()
        select_partition_params = SelectPartitionsParams(2)
        extractors = self._get_default_extractors()
        data = [1, 2, 3]

        # Act: one partition selection followed by two aggregations.
        engine.select_partitions(data, select_partition_params, extractors)
        for _ in range(2):
            engine.aggregate(data, aggregate_params, extractors,
                             public_partitions)
        accountant.compute_budgets()

        # Assert
        self.assertEqual(expected_calls, mock_annotate_fn.call_count)
        for recorded_call in mock_annotate_fn.call_args_list:
            # Index 1 of a recorded call holds its keyword arguments.
            budget = recorded_call[1]['budget']
            self.assertEqual(total_epsilon / 3, budget.epsilon)
            self.assertEqual(total_delta / 3, budget.delta)
Example #2
0
    def test_with_noise(self):
        """Checks a noisy COUNT stays within 4 of the expected count.

        The accumulator starts from five values and receives three further
        `add_value` calls; after each step the computed metric is compared
        with the expected count using a tolerance of 4.
        """
        accountant = NaiveBudgetAccountant(total_epsilon=10, total_delta=1e-5)
        budget = accountant.request_budget(pipeline_dp.MechanismType.GAUSSIAN)
        accountant.compute_budgets()

        params = pipeline_dp.AggregateParams(
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            noise_kind=NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT])
        count_params = accumulator.CountParams(budget, params)
        count_accumulator = accumulator.CountAccumulator(
            count_params, list(range(5)))
        tolerance = 4

        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=5,
                               delta=tolerance)

        # Each added value, whatever its type, is expected to raise the count
        # by exactly one.
        steps = [
            (50, 6),
            (list(range(49)), 7),
            ('*' * 100, 8),
        ]
        for added_value, expected_count in steps:
            count_accumulator.add_value(added_value)
            self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                                   second=expected_count,
                                   delta=tolerance)
Example #3
0
    def run_e2e_private_partition_selection_large_budget(col, backend):
        """Runs a Laplace COUNT/SUM aggregation with a very large budget.

        Args:
            col: input collection; each element serves as its own privacy id
              and value and is mapped to the partition key f"pk{x//2}".
            backend: pipeline backend handed to `pipeline_dp.DPEngine`.

        Returns:
            The collection produced by `DPEngine.aggregate`.
        """
        # Arrange
        metrics = [agg.Metrics.COUNT, agg.Metrics.SUM]
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.LAPLACE,
            metrics=metrics,
            min_value=1,
            max_value=10,
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        # A large budget keeps the noise small and retains all partition keys.
        accountant = NaiveBudgetAccountant(total_epsilon=100000, total_delta=1)

        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x//2}",
            value_extractor=lambda x: x)

        dp_engine = pipeline_dp.DPEngine(accountant, backend)

        aggregated = dp_engine.aggregate(col=col,
                                         params=aggregator_params,
                                         data_extractors=extractors)
        accountant.compute_budgets()

        return aggregated
Example #4
0
    def test_without_noise(self):
        """Checks exact COUNT results when the budget is very large.

        Covers construction from a list and from a string, `add_value`, and
        merging two accumulators through `add_accumulator`.
        """
        accountant = NaiveBudgetAccountant(total_epsilon=1000000,
                                           total_delta=0.9999999)
        budget = accountant.request_budget(pipeline_dp.MechanismType.GAUSSIAN)
        accountant.compute_budgets()
        no_noise = pipeline_dp.AggregateParams(
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            noise_kind=NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT])

        def new_accumulator(values):
            # Every accumulator receives its own freshly built CountParams.
            return accumulator.CountAccumulator(
                accumulator.CountParams(budget, no_noise), values)

        # Construction from a list of 5 values.
        from_list = new_accumulator(list(range(5)))
        self.assertEqual(from_list.compute_metrics(), 5)

        # Construction from a string of 50 characters.
        from_string = new_accumulator('a' * 50)
        self.assertEqual(from_string.compute_metrics(), 50)

        # One extra value on top of 50 initial ones.
        with_extra_value = new_accumulator(list(range(50)))
        with_extra_value.add_value(49)
        self.assertEqual(with_extra_value.compute_metrics(), 51)

        # Merging two accumulators of 50 values each.
        merge_target = new_accumulator(list(range(50)))
        merge_source = new_accumulator('a' * 50)
        merge_target.add_accumulator(merge_source)
        self.assertEqual(merge_target.compute_metrics(), 100)
Example #5
0
    def test_aggregate_report(self):
        """Checks the exact report text produced for three engine operations.

        Two aggregations (the second one with public partitions) and one
        partition selection are registered on the same engine; after the
        budgets are computed, each report generator must return the expected
        explanation string.
        """
        col = [[1], [2], [3], [3]]
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)
        private_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        public_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            min_value=2,
            max_value=10,
            metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
            public_partitions=list(range(1, 40)),
        )
        selection_params = SelectPartitionsParams(max_partitions_contributed=2)

        expected_private_report = (
            "Differentially private: Computing <Metrics: ['privacy_id_count', 'count', 'mean']>"
            "\n1. Per-partition contribution bounding: randomly selected not more than 2 contributions"
            "\n2. Cross-partition contribution bounding: randomly selected not more than 3 partitions per user"
            "\n3. Private Partition selection: using Truncated Geometric method with (eps= 0.1111111111111111, delta = 1.1111111111111111e-11)"
        )
        expected_public_report = (
            "Differentially private: Computing <Metrics: ['sum', 'mean']>"
            "\n1. Public partition selection: dropped non public partitions"
            "\n2. Per-partition contribution bounding: randomly selected not more than 3 contributions"
            "\n3. Cross-partition contribution bounding: randomly selected not more than 1 partitions per user"
            "\n4. Adding empty partitions to public partitions that are missing in data"
        )
        expected_selection_report = (
            "Differentially private: Computing <Private Partitions>"
            "\n1. Private Partition selection: using Truncated Geometric method with (eps= 0.3333333333333333, delta = 3.3333333333333335e-11)"
        )

        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())
        engine.aggregate(col, private_params, extractors)
        engine.aggregate(col, public_params, extractors)
        engine.select_partitions(col, selection_params, extractors)
        self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access

        # Reports are only requested once the budgets have been computed.
        accountant.compute_budgets()
        generators = engine._report_generators  # pylint: disable=protected-access
        self.assertEqual(generators[0].report(), expected_private_report)
        self.assertEqual(generators[1].report(), expected_public_report)
        self.assertEqual(generators[2].report(), expected_selection_report)
 def test_two_calls_compute_budgets_raise_exception(self):
     """Checks that a second `compute_budgets` call raises an exception."""
     accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
     accountant.request_budget(mechanism_type=MechanismType.LAPLACE)
     accountant.compute_budgets()

     # Budget can be computed only once.
     self.assertRaises(Exception, accountant.compute_budgets)
 def test_request_after_compute_raise_exception(self):
     """Checks that requesting a budget after computation raises."""
     accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
     accountant.request_budget(mechanism_type=MechanismType.LAPLACE)
     accountant.compute_budgets()

     # Budget can not be requested after it has been already computed.
     self.assertRaises(Exception,
                       accountant.request_budget,
                       mechanism_type=MechanismType.LAPLACE)
    def test_num_aggregations(self, num_aggregations):
        """Checks the budget is split evenly across `num_aggregations`.

        Args:
            num_aggregations: number of aggregations the accountant is
              configured for (presumably supplied by a parameterized decorator
              that is not visible here).
        """
        total_epsilon = 1
        total_delta = 1e-6
        accountant = NaiveBudgetAccountant(total_epsilon=total_epsilon,
                                           total_delta=total_delta,
                                           num_aggregations=num_aggregations)

        for _ in range(num_aggregations):
            # Every aggregation is requested with weight 1.
            aggregation_budget = accountant._compute_budget_for_aggregation(1)
            self.assertAlmostEqual(total_epsilon / num_aggregations,
                                   aggregation_budget.epsilon)
            self.assertAlmostEqual(total_delta / num_aggregations,
                                   aggregation_budget.delta)

        accountant.compute_budgets()
    def test_compute_budgets(self):
        """Checks epsilon and delta for a Laplace and a Gaussian request.

        The Laplace request uses the default weight and the Gaussian request
        weight 3; epsilon is expected to be split 0.25 / 0.75 and the whole
        delta is expected to go to the Gaussian request.
        """
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
        laplace_budget = accountant.request_budget(
            noise_kind=NoiseKind.LAPLACE)
        gaussian_budget = accountant.request_budget(
            noise_kind=NoiseKind.GAUSSIAN, weight=3)
        accountant.compute_budgets()

        self.assertEqual(laplace_budget.eps, 0.25)
        # Delta should be 0 if mechanism is Laplace.
        self.assertEqual(laplace_budget.delta, 0)

        self.assertEqual(gaussian_budget.eps, 0.75)
        self.assertEqual(gaussian_budget.delta, 1e-6)
Example #10
0
    def test_select_partitions(self):
        """Checks that only the partition with enough unique users is kept."""
        # This test is probabilistic, but the parameters were chosen to ensure
        # the test has passed at least 10000 runs.

        # Arrange
        params = SelectPartitionsParams(max_partitions_contributed=1)
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-5)

        # The dataset is a list of (user, partition_key) tuples. These
        # partitions are generated to reflect several scenarios.

        # A partition with sufficient amount of users.
        dataset = [(u, "pk-many-contribs") for u in range(25)]

        # A partition with many contributions, but only a few unique users.
        dataset.extend((100 + u // 10, "pk-many-contribs-few-users")
                       for u in range(30))

        # A partition with few contributions.
        dataset.extend((200 + u, "pk-few-contribs") for u in range(3))

        # 30 partitions, each with the same group of 25 users. 25 users is
        # sufficient to keep a partition, but because of contribution
        # bounding far fewer users per partition will be kept.
        dataset.extend((500 + u, f"few-contribs-after-bound{i}")
                       for i in range(30)
                       for u in range(25))

        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x[0],
            partition_extractor=lambda x: x[1])

        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())

        # Act
        selected = engine.select_partitions(col=dataset,
                                            params=params,
                                            data_extractors=extractors)
        accountant.compute_budgets()

        selected = list(selected)

        # Assert
        # Only one partition is retained: the one that still has many unique
        # users _after_ applying the "max_partitions_contributed" bound.
        self.assertEqual(["pk-many-contribs"], selected)
    def test_compute_budgets(self):
        """Checks epsilon and delta for a Laplace and a Gaussian request.

        The Laplace request uses the default weight and the Gaussian request
        weight 3; epsilon is expected to be split 0.25 / 0.75 and the whole
        delta is expected to go to the Gaussian request.
        """
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
        laplace_budget = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE)
        gaussian_budget = accountant.request_budget(
            mechanism_type=MechanismType.GAUSSIAN, weight=3)
        accountant.compute_budgets()

        self.assertEqual(laplace_budget.eps, 0.25)
        # Delta should be 0 if mechanism is Laplace.
        self.assertEqual(laplace_budget.delta, 0)

        self.assertEqual(gaussian_budget.eps, 0.75)
        self.assertEqual(gaussian_budget.delta, 1e-6)
    def test_aggregation_weights(self):
        """Checks the budget is split proportionally to aggregation weights."""
        total_epsilon = 1
        total_delta = 1e-6
        weights = [1, 2, 5]
        weight_sum = sum(weights)
        accountant = NaiveBudgetAccountant(total_epsilon=total_epsilon,
                                           total_delta=total_delta,
                                           aggregation_weights=weights)

        for weight in weights:
            aggregation_budget = accountant._compute_budget_for_aggregation(
                weight)
            # Each aggregation should receive its weight's share of the total.
            self.assertAlmostEqual(total_epsilon * weight / weight_sum,
                                   aggregation_budget.epsilon)
            self.assertAlmostEqual(total_delta * weight / weight_sum,
                                   aggregation_budget.delta)

        accountant.compute_budgets()
    def test_not_enough_aggregations(self, use_num_aggregations):
        """Checks `compute_budgets` raises when an aggregation is missing.

        Args:
            use_num_aggregations: when truthy the accountant is configured via
              `num_aggregations=2`, otherwise via `aggregation_weights=[1, 1]`
              (presumably supplied by a parameterized decorator that is not
              visible here).
        """
        # Both configurations announce 2 aggregations.
        num_aggregations = 2 if use_num_aggregations else None
        weights = None if use_num_aggregations else [1, 1]
        accountant = NaiveBudgetAccountant(total_epsilon=1,
                                           total_delta=1e-6,
                                           num_aggregations=num_aggregations,
                                           aggregation_weights=weights)

        accountant._compute_budget_for_aggregation(1)

        # num_aggregations = 2, but only 1 aggregation_scope was created
        self.assertRaises(ValueError, accountant.compute_budgets)
    def test_budget_scopes_no_parentscope(self):
        """Checks the split between a top-level request and a scoped one.

        The first request is made outside of any scope without a weight; the
        second is made inside a scope of weight 0.5. Epsilon is expected to be
        divided as 1.0 : 0.5.
        """
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)

        # Allocated in the top-level scope with no weight specified
        top_level_budget = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE)

        with accountant.scope(weight=0.5):
            scoped_budget = accountant.request_budget(
                mechanism_type=MechanismType.LAPLACE)

        accountant.compute_budgets()

        expected_by_budget = (
            (top_level_budget, 1.0 / (1.0 + 0.5)),
            (scoped_budget, 0.5 / (1.0 + 0.5)),
        )
        for budget, expected_eps in expected_by_budget:
            self.assertEqual(budget.eps, expected_eps)
Example #15
0
    def test_aggregate_public_partitions_add_empty_public_partitions(self):
        """Checks that public partitions missing from the data are added.

        The data contains partitions "pk0".."pk9" while the public partitions
        are "pk0", "pk10" and "pk11"; the output is expected to hold exactly
        the public partitions, with metrics close to 0 for "pk10".
        """
        # Arrange
        metrics = [
            agg.Metrics.COUNT, agg.Metrics.SUM, agg.Metrics.PRIVACY_ID_COUNT
        ]
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=metrics,
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            public_partitions=["pk0", "pk10", "pk11"])

        # Set a high budget to add close to 0 noise.
        accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                           total_delta=1 - 1e-10)

        # Input collection has 10 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        data = list(range(10))
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: 1)

        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())

        # Act
        result = engine.aggregate(col=data,
                                  params=aggregator_params,
                                  data_extractors=extractors)
        accountant.compute_budgets()

        result = list(result)
        partition_keys = [row[0] for row in result]

        # Assert
        # Only public partitions ("pk0") should be kept and empty public
        # partitions ("pk10", "pk11") should be added.
        self.assertEqual(["pk0", "pk10", "pk11"], partition_keys)

        # Row 0 is "pk0" (every metric ≈ 1) and row 1 is "pk10" (every metric
        # ≈ 0); metric positions are COUNT, SUM, PRIVACY_ID_COUNT.
        for row_index, expected_value in ((0, 1), (1, 0)):
            for metric_index in range(3):
                self.assertAlmostEqual(expected_value,
                                       result[row_index][1][metric_index])
Example #16
0
    def test_aggregate_public_partitions_drop_non_public(self):
        """Checks that partitions outside the public set are dropped.

        The data contains partitions "pk0".."pk9" while the public partitions
        are "pk0", "pk1" and "pk10"; only the public partition keys are
        expected in the output.
        """
        # Arrange
        metrics = [
            agg.Metrics.COUNT, agg.Metrics.SUM, agg.Metrics.PRIVACY_ID_COUNT
        ]
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=metrics,
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            public_partitions=["pk0", "pk1", "pk10"])

        # Set an arbitrary budget, we are not interested in the DP outputs, only
        # the partition keys.
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)

        # Input collection has 10 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        data = list(range(10))
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)

        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())

        # Act
        result = engine.aggregate(col=data,
                                  params=aggregator_params,
                                  data_extractors=extractors)
        accountant.compute_budgets()

        partition_keys = [row[0] for row in list(result)]

        # Assert
        # Only the public partitions ("pk0", "pk1", "pk10") should be present;
        # the remaining partitions of the data should be dropped.
        self.assertEqual(["pk0", "pk1", "pk10"], partition_keys)
    def test_budget_scopes(self):
        """Checks epsilon is split by scope weight, then by request weight.

        Two scopes with weights 0.4 and 0.6 each hold two Laplace requests;
        inside a scope epsilon is expected to be divided proportionally to the
        request weights (1:3 in the first scope, 1:4 in the second).
        """
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)

        with accountant.scope(weight=0.4):
            first_default = accountant.request_budget(
                mechanism_type=MechanismType.LAPLACE)
            first_weighted = accountant.request_budget(
                mechanism_type=MechanismType.LAPLACE, weight=3)

        with accountant.scope(weight=0.6):
            second_default = accountant.request_budget(
                mechanism_type=MechanismType.LAPLACE)
            second_weighted = accountant.request_budget(
                mechanism_type=MechanismType.LAPLACE, weight=4)

        accountant.compute_budgets()

        expected_by_budget = (
            (first_default, 0.4 * (1 / 4)),
            (first_weighted, 0.4 * (3 / 4)),
            (second_default, 0.6 * (1 / 5)),
            (second_weighted, 0.6 * (4 / 5)),
        )
        for budget, expected_eps in expected_by_budget:
            self.assertEqual(budget.eps, expected_eps)
Example #18
0
    def test_aggregate_private_partition_selection_drop_many(self):
        """Checks that a small budget drops nearly all one-user partitions."""
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        # Set a small budget for dropping most partition keys.
        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)

        # Input collection has 100 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        data = list(range(100))
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: None)

        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())

        # Act
        result = engine.aggregate(col=data,
                                  params=aggregator_params,
                                  data_extractors=extractors)
        accountant.compute_budgets()

        kept_partitions = list(result)

        # Assert
        # Most partitions should be dropped by private partition selection.
        # This test is non-deterministic, but it should pass with probability
        # very close to 1.
        self.assertLess(len(kept_partitions), 5)
Example #19
0
    def test_aggregate_private_partition_selection_keep_everything(self):
        """Checks that a large budget keeps both partitions with near-exact counts."""
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)
        # Set a large budget for having the small noise and keeping all
        # partition keys.
        accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                           total_delta=1e-10)

        # 10 elements fall into "pk0" and 20 elements into "pk1".
        data = list(range(10)) + list(range(100, 120))
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x//100}",
            value_extractor=lambda x: None)

        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())

        # Act
        result = engine.aggregate(col=data,
                                  params=aggregator_params,
                                  data_extractors=extractors)
        accountant.compute_budgets()

        result = list(result)

        # Assert
        approximate_expected = {"pk0": 10, "pk1": 20}
        self.assertEqual(2, len(result))  # all partition keys are kept.
        for partition_key, metrics_tuple in result:
            self.assertAlmostEqual(approximate_expected[partition_key],
                                   metrics_tuple.count,
                                   delta=1e-3)
Example #20
0
    def test_aggregate_report(self):
        """Checks that each generated report contains the expected fragments.

        Two aggregations (the second one with public partitions passed as a
        positional argument) and one partition selection are registered on the
        same engine; after the budgets are computed, every report is checked
        for a list of substrings via `_check_string_contains_strings`.
        """
        col = [[1], [2], [3], [3]]
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)
        private_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT,
                pipeline_dp.Metrics.MEAN
            ],
        )
        public_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            min_value=2,
            max_value=10,
            metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
        )
        selection_params = SelectPartitionsParams(max_partitions_contributed=2)

        expected_private_fragments = [
            "DPEngine method: aggregate",
            "metrics=['privacy_id_count', 'count', 'mean']",
            " noise_kind=gaussian",
            "max_value=5",
            "Partition selection: private partitions",
            "Cross-partition contribution bounding: for each privacy id randomly select max(actual_partition_contributed, 3)",
            "Private Partition selection: using Truncated Geometric method with (eps=",
        ]
        # NOTE(review): public_params sets max_value=10, yet "max_value=5" is
        # expected in its report — confirm against the report format.
        expected_public_fragments = [
            "metrics=['sum', 'mean']",
            " noise_kind=gaussian",
            "max_value=5",
            "Partition selection: public partitions",
            "Per-partition contribution bounding: for each privacy_id and eachpartition, randomly select max(actual_contributions_per_partition, 3)",
            "Adding empty partitions for public partitions that are missing in data",
        ]
        expected_selection_fragments = [
            "DPEngine method: select_partitions",
            " budget_weight=1",
            "max_partitions_contributed=2",
            "Private Partition selection: using Truncated Geometric method with",
        ]

        accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
        engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                      backend=pipeline_dp.LocalBackend())
        engine.aggregate(col, private_params, extractors)
        engine.aggregate(col, public_params, extractors, list(range(1, 40)))
        engine.select_partitions(col, selection_params, extractors)
        self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access

        # Reports are only requested once the budgets have been computed.
        accountant.compute_budgets()
        generators = engine._report_generators  # pylint: disable=protected-access
        self._check_string_contains_strings(generators[0].report(),
                                            expected_private_fragments)
        self._check_string_contains_strings(generators[1].report(),
                                            expected_public_fragments)
        self._check_string_contains_strings(generators[2].report(),
                                            expected_selection_fragments)