Example no. 1
0
    def test_mean_calls_aggregate_with_correct_params(self, mock_aggregate):
        """Verifies that mean() forwards the keyed data and the translated
        AggregateParams to the underlying aggregate() call."""
        # Arrange: a tiny two-record RDD and a stubbed aggregate() result.
        input_rdd = PrivateRDDTest.sc.parallelize([(1, 2.0, "pk1"),
                                                   (2, 2.0, "pk1")])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize(
            [(2.0, ["pk1"])])
        accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        extract_privacy_id = lambda x: x[1]

        prdd = private_spark.make_private(input_rdd, accountant,
                                          extract_privacy_id)
        mean_params = agg.MeanParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.5,
            max_value=5.78,
            budget_weight=1.1,
            public_partitions=None,
            partition_extractor=lambda x: x[0],
            value_extractor=lambda x: x)

        # Act
        actual_result = prdd.mean(mean_params)

        # Assert: aggregate() was invoked exactly once with
        # (privacy_id, record) pairs as its first argument ...
        mock_aggregate.assert_called_once()
        call_args = mock_aggregate.call_args[0]

        keyed_rdd = input_rdd.map(lambda x: (extract_privacy_id(x), x))
        self.assertListEqual(call_args[0].collect(), keyed_rdd.collect())

        # ... and with AggregateParams derived from the MeanParams above.
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=mean_params.max_partitions_contributed,
            max_contributions_per_partition=(
                mean_params.max_contributions_per_partition),
            min_value=mean_params.min_value,
            max_value=mean_params.max_value,
            budget_weight=mean_params.budget_weight,
            public_partitions=mean_params.public_partitions)
        self.assertEqual(call_args[1], expected_params)

        # The stubbed aggregate() output is passed through unchanged
        # (apart from unpacking the single-element partition-key list).
        self.assertEqual(actual_result.collect(), [(2.0, "pk1")])
Example no. 2
0
    def test_mean_calls_aggregate_with_params(self, mock_aggregate):
        """Checks that the Beam Mean transform delegates to aggregate()
        with AggregateParams built from the supplied MeanParams."""
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange: a plain PCollection wrapped into a private collection.
            input_pcol = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_pcol = (
                input_pcol
                | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            mean_params = aggregate_params.MeanParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                min_value=1,
                max_value=5,
                budget_weight=1,
                public_partitions=[],
                partition_extractor=lambda x: f"pk:{x // 10}",
                value_extractor=lambda x: x)

            # Act
            transformer = private_beam.Mean(mean_params=mean_params)
            private_pcol | transformer

            # Assert: the transform picked up the accountant and invoked
            # aggregate() exactly once.
            self.assertEqual(transformer._budget_accountant, budget_accountant)
            mock_aggregate.assert_called_once()

            call_args = mock_aggregate.call_args[0]

            # aggregate() must receive AggregateParams mirroring MeanParams.
            expected_params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.MEAN],
                max_partitions_contributed=(
                    mean_params.max_partitions_contributed),
                max_contributions_per_partition=(
                    mean_params.max_contributions_per_partition),
                min_value=mean_params.min_value,
                max_value=mean_params.max_value,
                public_partitions=mean_params.public_partitions)
            self.assertEqual(expected_params, call_args[1])
Example no. 3
0
    def test_mean_with_public_partitions_returns_sensible_result(self):
        """End-to-end health check: DP mean over public partitions should
        land close to the clipped-value expectation."""
        # Arrange: 30 users with -100, 10 users with 100 in the public
        # partition, plus 30 users in a private (dropped) partition.
        records = ([(u, "pubK1", -100) for u in range(30)] +
                   [(u + 30, "pubK1", 100) for u in range(10)] +
                   [(u + 40, "privK1", 100) for u in range(30)])

        input_rdd = PrivateRDDTest.sc.parallelize(records)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        extract_privacy_id = lambda x: x[0]

        prdd = private_spark.make_private(input_rdd, budget_accountant,
                                          extract_privacy_id)
        mean_params = agg.MeanParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            # Values are clipped into [1.55, 2.789] before averaging.
            min_value=1.55,
            max_value=2.789,
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2],
            public_partitions=["pubK1", "pubK2"])

        # Act
        actual_result = prdd.mean(mean_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_by_key = {"pubK1": 1.859, "pubK2": 2.1695}
        actual_by_key = self.to_dict(actual_result.collect())

        for partition_key, mean_value in actual_by_key.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(
                    mean_value, expected_by_key[partition_key], 5.0))
Example no. 4
0
    def test_mean_with_public_partitions_returns_sensible_result(self):
        """End-to-end health check for the Beam DP mean with public
        partitions, including an empty public partition."""
        with TestPipeline() as pipeline:
            # Arrange: two value clusters in the public partition plus a
            # private partition that should be dropped.
            records = ([(f"{u}", "pubK1", -100.0) for u in range(30)] +
                       [(f"{u + 30}", "pubK1", 100.0) for u in range(10)] +
                       [(f"{u + 40}", "privK1", 100.0) for u in range(30)])
            input_pcol = pipeline | 'Create produce' >> beam.Create(records)
            # Use very high epsilon and delta to minimize noise and test
            # flakiness.
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            private_pcol = (
                input_pcol
                | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=lambda x: x[0]))

            mean_params = aggregate_params.MeanParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=1,
                max_contributions_per_partition=1,
                min_value=1.55,  # -100 should be clipped to this value
                max_value=2.7889,  # 100 should be clipped to this value
                budget_weight=1,
                partition_extractor=lambda x: x[1],
                value_extractor=lambda x: x[2])

            # Act
            result = private_pcol | private_beam.Mean(
                mean_params=mean_params, public_partitions=["pubK1", "pubK2"])
            budget_accountant.compute_budgets()

            # Assert
            # This is a health check to validate that the result is sensible.
            # Hence, we use a very large tolerance to reduce test flakiness.
            within_tolerance = (
                lambda e, a: PrivateBeamTest.value_per_key_within_tolerance(
                    e, a, 0.1))
            beam_util.assert_that(
                result,
                # pubK2 has no data points therefore the dataset is assumed to be {min_value, max_value}
                beam_util.equal_to([("pubK1", 1.859), ("pubK2", 2.1695)],
                                   equals_fn=within_tolerance))