Example #1
    def test_select_partitions_returns_sensible_result(self):
        # Arrange
        col = [(u, "pk1") for u in range(50)]
        col += [(50 + u, "pk2") for u in range(50)]
        dist_data = PrivateRDDTest.sc.parallelize(col)

        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        max_partitions_contributed = 2

        def privacy_id_extractor(x):
            return x[0]

        def partition_extractor(x):
            return x[1]

        # Act
        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        select_partitions_params = agg.SelectPartitionsParams(
            max_partitions_contributed=max_partitions_contributed)
        actual_result = prdd.select_partitions(select_partitions_params,
                                               partition_extractor)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # The very large budget above makes partition selection essentially
        # deterministic, which keeps this check stable.
        self.assertEqual(sorted(actual_result.collect()), ["pk1", "pk2"])
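
For reference, the flow exercised by this test can also be run outside the unittest harness. The sketch below is a minimal, assumed setup: the pipeline_dp import paths and the local SparkContext configuration are not shown in the excerpt above and may need adjusting, but the make_private / select_partitions calls mirror the test exactly.

import pyspark

from pipeline_dp import aggregate_params as agg
from pipeline_dp import budget_accounting
from pipeline_dp import private_spark

# Assumed local Spark setup; reuse an existing SparkContext if you have one.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[1]").setAppName("select_partitions"))

# 100 privacy ids split evenly across two partition keys, as in the test above.
rdd = sc.parallelize([(u, "pk1") for u in range(50)] +
                     [(50 + u, "pk2") for u in range(50)])

# A realistic (small) budget, unlike the deliberately huge one used in the test.
budget_accountant = budget_accounting.NaiveBudgetAccountant(total_epsilon=1.0,
                                                            total_delta=1e-5)

prdd = private_spark.make_private(rdd, budget_accountant, lambda row: row[0])
params = agg.SelectPartitionsParams(max_partitions_contributed=2)
selected = prdd.select_partitions(params, lambda row: row[1])

# Budgets must be finalized before the result is materialized.
budget_accountant.compute_budgets()
print(sorted(selected.collect()))  # with a small budget the output may vary
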
Example #2
    def test_select_private_partitions_returns_sensible_result(self):
        with TestPipeline() as pipeline:
            # Arrange
            col = [(u, "pk1") for u in range(50)]
            col += [(50 + u, "pk2") for u in range(50)]
            pcol = pipeline | 'Create produce' >> beam.Create(col)
            # Use very high epsilon and delta to minimize noise and test
            # flakiness.
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=lambda x: x[0]))

            select_partitions_params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.9)
            partition_extractor = lambda x: x[1]

            # Act
            result = private_collection | private_beam.SelectPartitions(
                select_partitions_params=select_partitions_params,
                partition_extractor=partition_extractor,
                label="Test select partitions")
            budget_accountant.compute_budgets()

            # Assert
            # This is a health check to validate that the result is sensible.
            # The very large budget above makes partition selection essentially
            # deterministic, which keeps this check stable.
            beam_util.assert_that(result, beam_util.equal_to(["pk1", "pk2"]))
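
The same selection can be expressed as a standalone Beam pipeline on the local runner. As above, this is a sketch under assumptions: the pipeline_dp import paths and the local-runner setup are inferred from the API calls in the test, not from additional documentation.

import apache_beam as beam

from pipeline_dp import aggregate_params
from pipeline_dp import budget_accounting
from pipeline_dp import private_beam

with beam.Pipeline() as pipeline:
    # 100 privacy ids split evenly across two partition keys, as in the test.
    col = [(u, "pk1") for u in range(50)] + [(50 + u, "pk2") for u in range(50)]
    pcol = pipeline | 'Create input' >> beam.Create(col)

    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=1.0, total_delta=1e-5)
    private_collection = (
        pcol | 'Make private' >> private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row[0]))

    params = aggregate_params.SelectPartitionsParams(
        max_partitions_contributed=2)
    selected = private_collection | private_beam.SelectPartitions(
        select_partitions_params=params,
        partition_extractor=lambda row: row[1],
        label="Select partitions")

    # Finalize budgets before the pipeline runs (at the end of the with block).
    budget_accountant.compute_budgets()
    selected | 'Print partitions' >> beam.Map(print)
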
Example #3
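    # Note: the mock_select_partitions argument is injected by a mocking
    # decorator (e.g. mock.patch on the underlying select_partitions call)
    # that is not shown in this excerpt.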
    def test_select_partitions_calls_select_partitions_with_params(
            self, mock_select_partitions):
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            select_partitions_params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.5)
            partition_extractor = lambda x: f"pk:{x // 10}"

            # Act
            transformer = private_beam.SelectPartitions(
                select_partitions_params=select_partitions_params,
                partition_extractor=partition_extractor,
                label="Test select partitions")
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, budget_accountant)
            mock_select_partitions.assert_called_once()

            args = mock_select_partitions.call_args[0]
            self.assertEqual(args[1], select_partitions_params)
Example #4
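    # Note: as in Example #3, the mock_aggregate argument is injected by a
    # mocking decorator on the underlying engine call; the decorator is not
    # shown in this excerpt.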
    def test_select_partitions_calls_select_partitions_with_correct_params(
            self, mock_aggregate):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk2")])
        expected_result_partitions = ["pk1", "pk2"]
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize(
            expected_result_partitions)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        max_partitions_contributed = 2

        def privacy_id_extractor(x):
            return x[0]

        def partition_extractor(x):
            return x[1]

        # Act
        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        select_partitions_params = agg.SelectPartitionsParams(
            max_partitions_contributed=max_partitions_contributed)
        actual_result = prdd.select_partitions(select_partitions_params,
                                               partition_extractor)

        # Assert
        mock_aggregate.assert_called_once()
        actual_args = mock_aggregate.call_args[0]
        actual_rdd = actual_args[0].collect()
        actual_select_partition_params = actual_args[1]

        self.assertListEqual(actual_rdd, [(1, (1, "pk1")), (2, (2, "pk2"))])

        self.assertEqual(
            actual_select_partition_params.max_partitions_contributed,
            max_partitions_contributed)
        self.assertEqual(actual_result.collect(), expected_result_partitions)