Example #1
0
    def test_count_returns_sensible_result(self):
        with TestPipeline() as pipeline:
            # Arrange
            col = [(u, "pk1") for u in range(30)]
            pcol = pipeline | 'Create produce' >> beam.Create(col)
            # Use very high epsilon and delta to minimize noise and test
            # flakiness.
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=lambda x: x[0]))

            count_params = aggregate_params.CountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                budget_weight=1,
                partition_extractor=lambda x: x[1])

            # Act
            result = private_collection | private_beam.Count(
                count_params=count_params)
            budget_accountant.compute_budgets()

            # Assert
            # This is a health check to validate that the result is sensible.
            # Hence, we use a very large tolerance to reduce test flakiness.
            beam_util.assert_that(
                result,
                beam_util.equal_to([("pk1", 30.0)],
                                   equals_fn=lambda e, a: PrivateBeamTest.
                                   value_per_key_within_tolerance(e, a, 5.0)))
Example #2
0
    def test_count_calls_with_public_partitions_returns_sensible_result(self):
        # Arrange
        col = [(u, "pubK1") for u in range(30)]
        col += [(u, "privK1") for u in range(30)]
        dist_data = PrivateRDDTest.sc.parallelize(col)

        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        count_params = agg.CountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            budget_weight=1,
            public_partitions=["pubK1", "pubK2"],
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.count(count_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pubK2": 0.0, "pubK1": 30.0}
        actual_result_dict = self.to_dict(actual_result.collect())

        for pk, count in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(count,
                                                    expected_result_dict[pk],
                                                    5.0))
Example #3
0
    def test_count_calls_aggregate_with_correct_params(self, mock_aggregate):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk1")])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([(2,
                                                                      ["pk1"])])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        count_params = agg.CountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            budget_weight=1,
            public_partitions=None,
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.count(count_params)

        # Assert
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]
        rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(args[0].collect(), rdd.collect())

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=count_params.max_partitions_contributed,
            max_contributions_per_partition=count_params.
            max_contributions_per_partition,
            public_partitions=count_params.public_partitions)
        self.assertEqual(args[1], params)

        self.assertEqual(actual_result.collect(), [(2, "pk1")])
Example #4
0
    def test_count_calls_aggregate_with_params(self, mock_aggregate):
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            count_params = aggregate_params.CountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                budget_weight=1,
                partition_extractor=lambda x: f"pk:{x // 10}")

            # Act
            transformer = private_beam.Count(count_params=count_params)
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, budget_accountant)
            mock_aggregate.assert_called_once()

            args = mock_aggregate.call_args[0]

            params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.COUNT],
                max_partitions_contributed=count_params.
                max_partitions_contributed,
                max_contributions_per_partition=count_params.
                max_contributions_per_partition,
                public_partitions=count_params.public_partitions)
            self.assertEqual(args[1], params)