Example #1
0
    def test_select_partitions_calls_select_partitions_with_params(
            self, mock_select_partitions):
        """SelectPartitions passes its params to the mocked select_partitions.

        Also checks that the transform ends up holding the budget accountant
        of the private collection it was applied to.
        """
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            numbers = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            private_collection = (
                numbers | 'Create private collection' >> make_private)
            params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.5)

            # Act
            transformer = private_beam.SelectPartitions(
                select_partitions_params=params,
                partition_extractor=lambda x: f"pk:{x // 10}",
                label="Test select partitions")
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_select_partitions.assert_called_once()

            # The params are the second positional argument of the call.
            positional_args = mock_select_partitions.call_args[0]
            self.assertEqual(positional_args[1], params)
Example #2
0
    def test_map_returns_correct_results_and_accountant(self):
        """Map transforms each value and keeps the privacy id and accountant.

        The expected output pairs the privacy id of every input element with
        the squared second component of that element.
        """
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol_input = [(1, 2), (2, 3), (3, 4), (4, 5)]
            pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            # Act
            transformed = private_collection | private_beam.Map(
                fn=lambda x: x[1]**2)

            # Assert
            self.assertIsInstance(transformed, private_beam.PrivatePCollection)
            # Build the expectation as a concrete list: a bare map() object is
            # a single-use iterator and would be empty if the matcher consumed
            # it more than once.
            expected = [(PrivateBeamTest.privacy_id_extractor(x), x[1]**2)
                        for x in pcol_input]
            beam_util.assert_that(transformed._pcol,
                                  beam_util.equal_to(expected))
            self.assertEqual(transformed._budget_accountant, budget_accountant)
Example #3
0
    def test_combine_per_returns_sensible_result(self):
        """Health check: CombinePerKey with SumCombineFn yields roughly 0."""
        with TestPipeline() as pipeline:
            # Arrange: 30 ids contribute +100.0 and 30 other ids contribute
            # -100.0, all to the single partition "pk1".
            positives = [(f"{u}", "pk1", 100.0) for u in range(30)]
            negatives = [(f"{u + 30}", "pk1", -100.0) for u in range(30)]
            pcol = pipeline | 'Create produce' >> beam.Create(
                positives + negatives)
            # Very high epsilon and delta minimize noise and test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0])
            private_collection = (
                pcol | 'Create private collection' >> make_private)

            # Drop the privacy id column, leaving (partition key, value).
            private_collection = private_collection | private_beam.Map(
                lambda x: (x[1], x[2]))

            # Act
            combine_params = private_beam.CombinePerKeyParams(
                max_partitions_contributed=2,
                max_contributions_per_partition=1)
            result = private_collection | private_beam.CombinePerKey(
                SumCombineFn(), combine_params)
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is kept very
            # large to avoid flaky failures.
            def within_tolerance(expected, actual):
                return PrivateBeamTest.value_per_key_within_tolerance(
                    expected, actual, 10.0)

            beam_util.assert_that(
                result,
                beam_util.equal_to([("pk1", 0.0)], equals_fn=within_tolerance))
Example #4
0
    def test_select_private_partitions_returns_sensible_result(self):
        """Health check: SelectPartitions outputs both populated partitions."""
        with TestPipeline() as pipeline:
            # Arrange: two partitions, each with 50 distinct privacy ids.
            rows = [(u, "pk1") for u in range(50)]
            rows.extend((50 + u, "pk2") for u in range(50))
            pcol = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta minimize noise and test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0])
            private_collection = (
                pcol | 'Create private collection' >> make_private)
            params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.9)

            # Act
            result = private_collection | private_beam.SelectPartitions(
                select_partitions_params=params,
                partition_extractor=lambda x: x[1],
                label="Test select partitions")
            accountant.compute_budgets()

            # Assert
            # This is only a health check that the output is sensible: both
            # partition keys are expected to be present.
            expected_partitions = ["pk1", "pk2"]
            beam_util.assert_that(result,
                                  beam_util.equal_to(expected_partitions))
Example #5
0
    def test_privacy_id_count_returns_sensible_result(self):
        """Health check: privacy_id_count for 30 ids in "pk1" is close to 30."""
        # Arrange
        col = [(u, "pk1") for u in range(30)]
        dist_data = PrivateRDDTest.sc.parallelize(col)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        privacy_id_count_params = agg.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.privacy_id_count(privacy_id_count_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pk1": 30.0}
        actual_result_dict = self.to_dict(actual_result.collect())

        # Compare the key sets first: otherwise an empty result would skip the
        # loop below and the test would pass without checking anything.
        self.assertCountEqual(actual_result_dict.keys(),
                              expected_result_dict.keys())
        for pk, count in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(count,
                                                    expected_result_dict[pk],
                                                    5.0))
Example #6
0
    def test_select_partitions_returns_sensible_result(self):
        """Health check: select_partitions returns both populated partitions."""
        # Arrange: two partitions, each with 50 distinct privacy ids.
        rows = [(u, "pk1") for u in range(50)]
        rows.extend((50 + u, "pk2") for u in range(50))
        dist_data = PrivateRDDTest.sc.parallelize(rows)

        def privacy_id_extractor(row):
            return row[0]

        def partition_extractor(row):
            return row[1]

        # Very high epsilon and delta minimize noise and test flakiness.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        params = agg.SelectPartitionsParams(max_partitions_contributed=2)

        # Act
        prdd = private_spark.make_private(dist_data, accountant,
                                          privacy_id_extractor)
        selected = prdd.select_partitions(params, partition_extractor)
        accountant.compute_budgets()

        # Assert
        # This is only a health check that the output is sensible: both
        # partition keys are expected to be present. Sorting makes the
        # comparison independent of the collect() order.
        self.assertEqual(sorted(selected.collect()), ["pk1", "pk2"])
Example #7
0
    def test_privacy_id_count_returns_sensible_result(self):
        """Health check: PrivacyIdCount for 30 ids in "pk1" is close to 30."""
        with TestPipeline() as pipeline:
            # Arrange: 30 distinct privacy ids, all in partition "pk1".
            rows = [(u, "pk1") for u in range(30)]
            pcol = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta minimize noise and test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0])
            private_collection = (
                pcol | 'Create private collection' >> make_private)
            count_params = aggregate_params.PrivacyIdCountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                budget_weight=1,
                partition_extractor=lambda x: x[1])

            # Act
            result = private_collection | private_beam.PrivacyIdCount(
                privacy_id_count_params=count_params)
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is kept very
            # large to avoid flaky failures.
            def within_tolerance(expected, actual):
                return PrivateBeamTest.value_per_key_within_tolerance(
                    expected, actual, 5)

            beam_util.assert_that(
                result,
                beam_util.equal_to([("pk1", 30)], equals_fn=within_tolerance))
Example #8
0
    def test_flatmap_returns_correct_results_and_accountant(self):
        """FlatMap expands each element and keeps privacy id and accountant."""

        def flat_map_fn(element):
            # Emit two tuples per input: second component + 0 and + 1.
            first, second = element[0], element[1]
            return [(first, second + offset) for offset in range(2)]

        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            pcol = pipeline | 'Create produce' >> beam.Create([(1, 2), (2, 3),
                                                               (3, 4)])
            private_collection = (
                pcol | 'Create private collection' >> make_private)

            # Act
            transformed = private_collection | private_beam.FlatMap(
                flat_map_fn)

            # Assert
            self.assertIsInstance(transformed, private_beam.PrivatePCollection)
            # Every output of flat_map_fn is paired with the privacy id of
            # the input element it came from.
            expected = [
                ('pid:(1, 2)', (1, 2)),
                ('pid:(1, 2)', (1, 3)),
                ('pid:(2, 3)', (2, 3)),
                ('pid:(2, 3)', (2, 4)),
                ('pid:(3, 4)', (3, 4)),
                ('pid:(3, 4)', (3, 5)),
            ]
            beam_util.assert_that(transformed._pcol,
                                  beam_util.equal_to(expected))
            self.assertEqual(transformed._budget_accountant, accountant)
Example #9
0
    def test_variance_calls_aggregate_with_correct_params(
            self, mock_aggregate):
        """variance() calls the mocked aggregate with the expected arguments.

        The mock returns one namedtuple per partition; the test expects
        variance() to return its `variance` field as the partition value.
        """
        # Arrange
        spark_context = PrivateRDDTest.sc
        dist_data = spark_context.parallelize([(1, 0.0, "pk1"),
                                               (2, 10.0, "pk1")])
        MetricsTuple = collections.namedtuple('MetricsTuple', ['variance'])
        mock_aggregate.return_value = spark_context.parallelize(
            [("pk1", MetricsTuple(variance=25.0))])
        accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(row):
            return row[1]

        prdd = private_spark.make_private(dist_data, accountant,
                                          privacy_id_extractor)
        variance_params = agg.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.5,
            max_value=5.78,
            budget_weight=1.1,
            partition_extractor=lambda x: x[0],
            value_extractor=lambda x: x)

        # Act
        actual_result = prdd.variance(variance_params)

        # Assert
        mock_aggregate.assert_called_once()
        call_positional_args = mock_aggregate.call_args[0]

        # First argument: the input keyed by privacy id.
        keyed_by_privacy_id = dist_data.map(
            lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(call_positional_args[0].collect(),
                             keyed_by_privacy_id.collect())

        # Second argument: AggregateParams mirroring variance_params.
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.VARIANCE],
            max_partitions_contributed=(
                variance_params.max_partitions_contributed),
            max_contributions_per_partition=(
                variance_params.max_contributions_per_partition),
            min_value=variance_params.min_value,
            max_value=variance_params.max_value,
            budget_weight=variance_params.budget_weight,
            public_partitions=variance_params.public_partitions)
        self.assertEqual(call_positional_args[1], expected_params)

        self.assertEqual(actual_result.collect(), [("pk1", 25.0)])
Example #10
0
    def test_utility_analysis_params(self):
        """UtilityAnalysisEngine.aggregate raises for unsupported params.

        Each case changes one field of otherwise default AggregateParams and
        expects an exception whose message matches the case description.
        """
        default_extractors = self._get_default_extractors()
        default_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            metrics=[pipeline_dp.Metrics.COUNT])
        params_with_custom_combiners = copy.copy(default_params)
        params_with_custom_combiners.custom_combiners = sum
        params_with_unsupported_metric = copy.copy(default_params)
        params_with_unsupported_metric.metrics = [pipeline_dp.Metrics.MEAN]
        # Copy here as well, so that the shared defaults are never mutated
        # and every case differs from them in exactly one field.
        params_with_contribution_bounds_already_enforced = copy.copy(
            default_params)
        params_with_contribution_bounds_already_enforced.contribution_bounds_already_enforced = True

        test_cases = [
            {
                "desc": "custom combiners",
                "params": params_with_custom_combiners,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "unsupported metric in metrics",
                "params": params_with_unsupported_metric,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "contribution bounds are already enforced",
                "params": params_with_contribution_bounds_already_enforced,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
        ]

        for test_case in test_cases:
            # subTest reports each case separately instead of stopping at the
            # first failure.
            with self.subTest(test_case["desc"]):
                budget_accountant = budget_accounting.NaiveBudgetAccountant(
                    total_epsilon=1, total_delta=1e-10)
                engine = dp_engine.UtilityAnalysisEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                col = [0, 1, 2]
                # Only the call under test sits inside the assertion, so a
                # failure during setup cannot be mistaken for the expected
                # error.
                with self.assertRaisesRegex(Exception,
                                            expected_regex=test_case["desc"]):
                    engine.aggregate(
                        col,
                        test_case["params"],
                        test_case["data_extractor"],
                        public_partitions=test_case["public_partitions"])
Example #11
0
    def test_map(self):
        """PrivateRDD.map transforms values, keeping privacy ids and accountant."""
        rows = [(1, 11), (2, 12)]
        accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(row):
            return row[0]

        prdd = private_spark.PrivateRDD(PrivateRDDTest.sc.parallelize(rows),
                                        accountant, privacy_id_extractor)

        doubled = prdd.map(lambda x: (x[0], x[1] * 2))

        # The underlying RDD holds (privacy id, mapped element) pairs.
        expected_rdd_content = [(1, (1, 22)), (2, (2, 24))]
        self.assertEqual(doubled._rdd.collect(), expected_rdd_content)
        self.assertEqual(doubled._budget_accountant, prdd._budget_accountant)
Example #12
0
    def test_mean_calls_aggregate_with_correct_params(self, mock_aggregate):
        """mean() calls the mocked aggregate with the expected arguments."""
        # Arrange
        spark_context = PrivateRDDTest.sc
        dist_data = spark_context.parallelize([(1, 2.0, "pk1"),
                                               (2, 2.0, "pk1")])
        mock_aggregate.return_value = spark_context.parallelize(
            [(2.0, ["pk1"])])
        accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(row):
            return row[1]

        prdd = private_spark.make_private(dist_data, accountant,
                                          privacy_id_extractor)
        mean_params = agg.MeanParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.5,
            max_value=5.78,
            budget_weight=1.1,
            public_partitions=None,
            partition_extractor=lambda x: x[0],
            value_extractor=lambda x: x)

        # Act
        actual_result = prdd.mean(mean_params)

        # Assert
        mock_aggregate.assert_called_once()
        call_positional_args = mock_aggregate.call_args[0]

        # First argument: the input keyed by privacy id.
        keyed_by_privacy_id = dist_data.map(
            lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(call_positional_args[0].collect(),
                             keyed_by_privacy_id.collect())

        # Second argument: AggregateParams mirroring mean_params.
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=mean_params.max_partitions_contributed,
            max_contributions_per_partition=(
                mean_params.max_contributions_per_partition),
            min_value=mean_params.min_value,
            max_value=mean_params.max_value,
            budget_weight=mean_params.budget_weight,
            public_partitions=mean_params.public_partitions)
        self.assertEqual(call_positional_args[1], expected_params)

        self.assertEqual(actual_result.collect(), [(2.0, "pk1")])
Example #13
0
    def test_sum_calls_aggregate_with_params(self, mock_aggregate):
        """Sum calls the mocked aggregate with the expected AggregateParams."""
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            values = [float(i) for i in range(1, 7)]
            pcol = pipeline | 'Create produce' >> beam.Create(values)
            private_collection = (
                pcol | 'Create private collection' >> make_private)

            sum_params = aggregate_params.SumParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                min_value=1,
                max_value=5,
                budget_weight=1,
                public_partitions=[],
                partition_extractor=lambda x: f"pk:{x // 10}",
                value_extractor=lambda x: x)

            # Act
            transformer = private_beam.Sum(sum_params=sum_params)
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_aggregate.assert_called_once()

            # The params are the second positional argument of the call.
            call_positional_args = mock_aggregate.call_args[0]
            expected_params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.SUM],
                max_partitions_contributed=(
                    sum_params.max_partitions_contributed),
                max_contributions_per_partition=(
                    sum_params.max_contributions_per_partition),
                min_value=sum_params.min_value,
                max_value=sum_params.max_value,
                public_partitions=sum_params.public_partitions)
            self.assertEqual(expected_params, call_positional_args[1])
Example #14
0
    def test_variance_with_public_partitions_returns_sensible_result(self):
        """Health check: variance over public partitions gives sane values.

        The public partitions are "pubK1" (which has data) and "pubK2" (which
        has none); "privK1" has data but is not listed as public.
        """
        # Arrange
        col = [(u, "pubK1", -100) for u in range(30)]
        col += [(u + 30, "pubK1", 100) for u in range(10)]
        col += [(u + 40, "privK1", 100) for u in range(30)]

        dist_data = PrivateRDDTest.sc.parallelize(col)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=8000, total_delta=0.9999999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        variance_params = agg.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.55,  # -100 should be clipped to this value
            max_value=2.7889,  # 100 should be clipped to this value
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2])

        # Act
        actual_result = prdd.variance(variance_params,
                                      public_partitions=["pubK1", "pubK2"])
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pubK1": 0.288, "pubK2": 0.0}
        actual_result_dict = self.to_dict(actual_result.collect())

        # Compare the key sets first: otherwise a missing public partition
        # (or an entirely empty result) would go unnoticed by the loop below.
        self.assertCountEqual(actual_result_dict.keys(),
                              expected_result_dict.keys())
        for pk, variance in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(variance,
                                                    expected_result_dict[pk],
                                                    0.1))
Example #15
0
    def test_transform_with_return_anonymized_enabled_returns_pcollection(
            self):
        """A transform with return_anonymized=True yields a PCollection."""
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            numbers = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            private_collection = (
                numbers | 'Create private collection' >> make_private)

            # Act
            anonymizing_transform = SimplePrivatePTransform(
                return_anonymized=True)
            transformed = private_collection | anonymizing_transform

            # Assert
            self.assertIsInstance(transformed, pvalue.PCollection)
Example #16
0
    def test_make_private_transform_succeeds(self):
        """MakePrivate yields a PrivatePCollection holding the accountant."""
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            numbers = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])

            # Act
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_collection = (
                numbers | 'Create private collection' >> make_private)

            # Assert
            self.assertIsInstance(private_collection,
                                  private_beam.PrivatePCollection)
            self.assertEqual(private_collection._budget_accountant, accountant)
Example #17
0
    def test_sum_returns_sensible_result(self):
        """Health check: sum() with values clipped to [1.55, 2.7889] is sane.

        Expected value: 30 ids clipped to max_value plus 30 ids clipped to
        min_value, i.e. 30 * 2.7889 + 30 * 1.55 = 130.167.
        """
        # Arrange
        col = [(f"{u}", "pk1", 100.0) for u in range(30)]
        col += [(f"{u + 30}", "pk1", -100.0) for u in range(30)]

        dist_data = PrivateRDDTest.sc.parallelize(col)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        sum_params = agg.SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                                   max_partitions_contributed=2,
                                   max_contributions_per_partition=3,
                                   min_value=1.55,
                                   max_value=2.7889,
                                   budget_weight=1,
                                   public_partitions=None,
                                   partition_extractor=lambda x: x[1],
                                   value_extractor=lambda x: x[2])

        # Act
        actual_result = prdd.sum(sum_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pk1": 130.167}
        actual_result_dict = self.to_dict(actual_result.collect())

        # Compare the key sets first: otherwise an empty result would skip the
        # loop below and the test would pass without checking anything.
        self.assertCountEqual(actual_result_dict.keys(),
                              expected_result_dict.keys())
        # `total` rather than `sum`, to avoid shadowing the builtin.
        for pk, total in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(total,
                                                    expected_result_dict[pk],
                                                    5.0))
Example #18
0
    def test_variance_with_public_partitions_returns_sensible_result(self):
        """Health check: Variance over public partitions gives sane values."""
        with TestPipeline() as pipeline:
            # Arrange: "pubK1" gets 30 values of -100.0 and 10 of 100.0;
            # "privK1" gets 30 values of 100.0 but is not a public partition.
            rows = [(f"{u}", "pubK1", -100.0) for u in range(30)]
            rows.extend((f"{u + 30}", "pubK1", 100.0) for u in range(10))
            rows.extend((f"{u + 40}", "privK1", 100.0) for u in range(30))
            pcol = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta minimize noise and test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=8000, total_delta=0.9999999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0])
            private_collection = (
                pcol | 'Create private collection' >> make_private)

            variance_params = aggregate_params.VarianceParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=1,
                max_contributions_per_partition=1,
                min_value=1.55,  # -100 should be clipped to this value
                max_value=2.7889,  # 100 should be clipped to this value
                budget_weight=1,
                partition_extractor=lambda x: x[1],
                value_extractor=lambda x: x[2])

            # Act
            result = private_collection | private_beam.Variance(
                variance_params=variance_params,
                public_partitions=["pubK1", "pubK2"])
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is kept very
            # large to avoid flaky failures.
            def within_tolerance(expected, actual):
                return PrivateBeamTest.value_per_key_within_tolerance(
                    expected, actual, 0.1)

            # pubK2 has no data points therefore the dataset is assumed to be
            # {min_value, max_value}
            expected = [("pubK1", 0.288), ("pubK2", 0.0)]
            beam_util.assert_that(
                result, beam_util.equal_to(expected,
                                           equals_fn=within_tolerance))
Example #19
0
    def test_privacy_id_count_calls_aggregate_with_correct_params(
            self, mock_aggregate):
        """privacy_id_count() calls the mocked aggregate as expected.

        The mock returns one namedtuple per partition; the test expects
        privacy_id_count() to return its `privacy_id_count` field as the
        partition value.
        """
        # Arrange
        spark_context = PrivateRDDTest.sc
        dist_data = spark_context.parallelize([(1, "pk1"), (2, "pk1")])
        MetricsTuple = collections.namedtuple('MetricsTuple',
                                              ['privacy_id_count'])
        mock_aggregate.return_value = spark_context.parallelize(
            [("pk1", MetricsTuple(privacy_id_count=2))])
        accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(row):
            return row[0]

        prdd = private_spark.make_private(dist_data, accountant,
                                          privacy_id_extractor)
        count_params = agg.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.privacy_id_count(count_params)

        # Assert
        mock_aggregate.assert_called_once()
        call_positional_args = mock_aggregate.call_args[0]

        # First argument: the input keyed by privacy id.
        keyed_by_privacy_id = dist_data.map(
            lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(call_positional_args[0].collect(),
                             keyed_by_privacy_id.collect())

        # Second argument: AggregateParams with one contribution per partition.
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=(
                count_params.max_partitions_contributed),
            max_contributions_per_partition=1)
        self.assertEqual(call_positional_args[1], expected_params)

        self.assertEqual([("pk1", 2)], actual_result.collect())
Example #20
0
    def test_privacy_id_count_calls_aggregate_with_params(
            self, mock_aggregate):
        """PrivacyIdCount calls the mocked aggregate with the expected params."""
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            numbers = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            private_collection = (
                numbers | 'Create private collection' >> make_private)

            count_params = aggregate_params.PrivacyIdCountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                budget_weight=1,
                partition_extractor=lambda x: f"pk:{x // 10}")

            # Act
            transformer = private_beam.PrivacyIdCount(
                privacy_id_count_params=count_params)
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_aggregate.assert_called_once()

            # The params are the second positional argument of the call.
            call_positional_args = mock_aggregate.call_args[0]
            expected_params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
                max_partitions_contributed=(
                    count_params.max_partitions_contributed),
                max_contributions_per_partition=1,
                public_partitions=count_params.public_partitions)
            self.assertEqual(call_positional_args[1], expected_params)
Example #21
0
    def test_private_collection_with_non_private_transform_throws_error(self):
        """Applying a plain Beam transform to a PrivatePCollection raises.

        Expects a TypeError whose message mentions the PrivatePTransform type
        requirement, and checks that the collection is still a
        PrivatePCollection afterwards.
        """
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            # Act and Assert
            with self.assertRaises(TypeError) as context:
                (private_collection | 'Non private transform on '
                 'PrivatePCollection' >> beam.Map(lambda x: x))
            self.assertIsInstance(private_collection,
                                  private_beam.PrivatePCollection)
            # assertIn reports both the expected fragment and the actual
            # message on failure, unlike assertTrue(... in ...).
            self.assertIn(
                "private_transform should be of type "
                "PrivatePTransform but is ", str(context.exception))
Example #22
0
    def test_select_partitions_calls_select_partitions_with_correct_params(
            self, mock_aggregate):
        """select_partitions() calls the mocked engine method as expected.

        Checks the RDD and params passed to the mock, and that the mock's
        return value is what select_partitions() returns.
        """
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk2")])
        expected_result_partitions = ["pk1", "pk2"]
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize(
            expected_result_partitions)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        max_partitions_contributed = 2

        def privacy_id_extractor(x):
            return x[0]

        def partition_extractor(x):
            # Return the partition key itself. The previous `{x[1]}` wrapped
            # it in a set, which is unhashable and so not usable as a key.
            return x[1]

        # Act
        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        select_partitions_params = agg.SelectPartitionsParams(
            max_partitions_contributed=max_partitions_contributed)
        actual_result = prdd.select_partitions(select_partitions_params,
                                               partition_extractor)

        # Assert
        mock_aggregate.assert_called_once()
        actual_args = mock_aggregate.call_args[0]
        actual_rdd = actual_args[0].collect()
        actual_select_partition_params = actual_args[1]

        # The RDD handed to the engine is keyed by privacy id.
        self.assertListEqual(actual_rdd, [(1, (1, "pk1")), (2, (2, "pk2"))])

        self.assertEqual(
            actual_select_partition_params.max_partitions_contributed,
            max_partitions_contributed)
        self.assertEqual(actual_result.collect(), expected_result_partitions)