Example 1: a test that each DPEngine.aggregate() call registers a report generator (this snippet targets an older PipelineDP API with low/high value bounds).
    def test_aggregate_report(self):
        params1 = pipeline_dp.AggregateParams(
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            low=1,
            high=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            low=2,
            high=10,
            metrics=[
                pipeline_dp.Metrics.VAR, pipeline_dp.Metrics.SUM,
                pipeline_dp.Metrics.MEAN
            ],
            public_partitions=list(range(1, 40)),
        )
        engine = pipeline_dp.DPEngine(None, None)
        engine.aggregate(None, params1, None)
        engine.aggregate(None, params2, None)
        self.assertEqual(len(engine._report_generators), 2)  # pylint: disable=protected-access
Example 2: an end-to-end test of the explain-computations reports produced for two aggregations and a private partition selection.
    def test_aggregate_report(self):
        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)
        params1 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            min_value=2,
            max_value=10,
            metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
            public_partitions=list(range(1, 40)),
        )

        select_partitions_params = SelectPartitionsParams(
            max_partitions_contributed=2)

        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)
        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())
        engine.aggregate(col, params1, data_extractor)
        engine.aggregate(col, params2, data_extractor)
        engine.select_partitions(col, select_partitions_params, data_extractor)
        self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access
        budget_accountant.compute_budgets()
        self.assertEqual(
            engine._report_generators[0].report(),
            "Differentially private: Computing <Metrics: ['privacy_id_count', 'count', 'mean']>"
            "\n1. Per-partition contribution bounding: randomly selected not more than 2 contributions"
            "\n2. Cross-partition contribution bounding: randomly selected not more than 3 partitions per user"
            "\n3. Private Partition selection: using Truncated Geometric method with (eps= 0.1111111111111111, delta = 1.1111111111111111e-11)"
        )
        self.assertEqual(
            engine._report_generators[1].report(),
            "Differentially private: Computing <Metrics: ['sum', 'mean']>"
            "\n1. Public partition selection: dropped non public partitions"
            "\n2. Per-partition contribution bounding: randomly selected not more than 3 contributions"
            "\n3. Cross-partition contribution bounding: randomly selected not more than 1 partitions per user"
            "\n4. Adding empty partitions to public partitions that are missing in data"
        )
        self.assertEqual(
            engine._report_generators[2].report(),
            "Differentially private: Computing <Private Partitions>"
            "\n1. Private Partition selection: using Truncated Geometric method with (eps= 0.3333333333333333, delta = 3.3333333333333335e-11)"
        )
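The test above inspects engine._report_generators directly; application code can read the same reports through the public method used in Example 22. A minimal local sketch of that flow (input data and parameter values are illustrative):

import pipeline_dp

budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-10)
engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                              backend=pipeline_dp.LocalBackend())
params = pipeline_dp.AggregateParams(
    noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
    metrics=[pipeline_dp.Metrics.COUNT],
    max_partitions_contributed=1,
    max_contributions_per_partition=1)
extractors = pipeline_dp.DataExtractors(privacy_id_extractor=lambda x: x,
                                        partition_extractor=lambda x: f"pk{x}",
                                        value_extractor=lambda x: x)
dp_result = engine.aggregate([1, 2, 3], params, extractors)
budget_accountant.compute_budgets()
# explain_computations_report() is the public counterpart of the private
# _report_generators attribute checked in the test above (see Example 22).
for report in engine.explain_computations_report():
    print(report)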
Example 3: a Beam PTransform whose expand() runs a custom combiner through DPEngine.aggregate().
    def expand(self, pcol: pvalue.PCollection):
        combiner = _CombineFnCombiner(self._combine_fn)
        aggregate_params = pipeline_dp.AggregateParams(
            metrics=None,
            max_partitions_contributed=self._params.max_partitions_contributed,
            max_contributions_per_partition=self._params.
            max_contributions_per_partition,
            custom_combiners=[combiner])

        backend, dp_engine = self._create_dp_engine()
        # Assumed elements format: (privacy_id, (partition_key, value))
        data_extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x[0],
            partition_extractor=lambda x: x[1][0],
            value_extractor=lambda x: x[1][1])

        dp_result = dp_engine.aggregate(pcol, aggregate_params,
                                        data_extractors)
        # dp_result : (partition_key, [combiner_result])

        # aggregate() returns a tuple with one element per combiner.
        # There is only one combiner here; extract its result from the tuple.
        dp_result = backend.map_values(dp_result, lambda v: v[0],
                                       "Unnest tuple")
        # dp_result : (partition_key, result)

        return dp_result
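The "Assumed elements format" comment above is the contract the extractors rely on. A tiny standalone sketch of how such elements decompose (hypothetical data, no pipeline framework needed):

# Elements in the (privacy_id, (partition_key, value)) format assumed above.
elements = [("user1", ("pk1", 3.0)), ("user2", ("pk1", 1.5))]

privacy_id_extractor = lambda x: x[0]
partition_extractor = lambda x: x[1][0]
value_extractor = lambda x: x[1][1]

for e in elements:
    print(privacy_id_extractor(e), partition_extractor(e), value_extractor(e))
# user1 pk1 3.0
# user2 pk1 1.5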
Example 4: computing a DP count of ratings per movie (an older PipelineDP API: pipeline_dp.BudgetAccountant, the ops argument, and low/high bounds).
def calc_dp_rating_metrics(movie_views, ops, public_partitions):
    """Computes dp metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, ops)

    # Specify which DP aggregated metrics to compute.
    params = pipeline_dp.AggregateParams(metrics=[
        pipeline_dp.Metrics.COUNT,
    ],
                                         max_partitions_contributed=2,
                                         max_contributions_per_partition=1,
                                         low=1,
                                         high=5,
                                         public_partitions=public_partitions)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()
    return dp_result
Example 5: a test that aggregate() builds its computation graph with the configured contribution-bounding parameters (contribution bounding is mocked).
    def test_aggregate_computation_graph_verification(
            self, mock_bound_contributions):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=5,
            max_contributions_per_partition=3)
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)

        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)

        mock_bound_contributions.return_value = [
            [("pid1", "pk1"), (1, [1])],
            [("pid2", "pk2"), (1, [1])],
            [("pid3", "pk3"), (1, [2])],
        ]

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())
        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)

        # Assert
        mock_bound_contributions.assert_called_with(
            unittest.mock.ANY, aggregator_params.max_partitions_contributed,
            aggregator_params.max_contributions_per_partition,
            unittest.mock.ANY)
Example 6: an end-to-end run with a very large budget, so that noise is small and (almost) all partition keys are kept.
    def run_e2e_private_partition_selection_large_budget(col, backend):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.LAPLACE,
            metrics=[agg.Metrics.COUNT, agg.Metrics.SUM],
            min_value=1,
            max_value=10,
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        # Set a large budget so that the noise is small and all partition
        # keys are kept.
        budget_accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                                  total_delta=1)

        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x//2}",
            value_extractor=lambda x: x)

        engine = pipeline_dp.DPEngine(budget_accountant, backend)

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        return col
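A sketch of how this helper might be invoked with the local backend used in other examples here (input values are illustrative; the returned collection is lazy, so list() forces the computation):

import pipeline_dp

col = list(range(10))
dp_result = run_e2e_private_partition_selection_large_budget(
    col, pipeline_dp.LocalBackend())
# With the huge budget above, counts and sums are close to their true values
# and (almost) all partition keys survive partition selection.
print(list(dp_result))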
Example 7: a parameterized test that invalid bounding parameters make aggregate() raise an exception.
    def test_check_invalid_bounding_params(self, error_msg, min_value,
                                           max_value,
                                           max_partitions_contributed,
                                           max_contributions_per_partition,
                                           metrics):
        with self.assertRaises(Exception, msg=error_msg):
            budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-10)
            engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                          backend=pipeline_dp.LocalBackend())
            engine.aggregate(
                [0],
                pipeline_dp.AggregateParams(
                    noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                    max_partitions_contributed=max_partitions_contributed,
                    max_contributions_per_partition=
                    max_contributions_per_partition,
                    min_value=min_value,
                    max_value=max_value,
                    metrics=metrics),
                pipeline_dp.DataExtractors(
                    privacy_id_extractor=lambda x: x,
                    partition_extractor=lambda x: x,
                    value_extractor=lambda x: x,
                ))
Example 8: computing a DP count of ratings per movie with a custom combiner (CountCombiner).
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    # Specify which DP aggregated metrics to compute.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=None,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        public_partitions=public_partitions,
        custom_combiners=[CountCombiner()])

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    return dp_result
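The CountCombiner passed above is not defined in this snippet. Below is a hypothetical reconstruction, assuming a CustomCombiner base class in pipeline_dp.combiners with create/merge/compute hooks; the exact interface (including how a combiner requests budget and adds noise) is an assumption and should be checked against the installed PipelineDP version:

from pipeline_dp import combiners

class CountCombiner(combiners.CustomCombiner):
    """Hypothetical sketch of a custom count combiner (no noise added)."""

    def create_accumulator(self, values):
        # The accumulator is simply the number of contributed values.
        return len(values)

    def merge_accumulators(self, accumulator1, accumulator2):
        return accumulator1 + accumulator2

    def compute_metrics(self, accumulator):
        # A production combiner would add appropriately calibrated noise here.
        return accumulator

    def explain_computation(self):
        return "Custom count combiner (illustrative sketch)"

    def metrics_names(self):
        # Assumed hook for naming the returned metrics.
        return ["count"]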
Example 9: the expand() of a Beam Count transform built on top of DPEngine.
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=self._count_params.
            max_partitions_contributed,
            max_contributions_per_partition=self._count_params.
            max_contributions_per_partition,
            public_partitions=self._count_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # Count calculates the number of elements per partition key and
            # doesn't use the value extractor.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_count])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # There is only one metric here (count); extract it from the namedtuple.
        dp_result = backend.map_values(dp_result, lambda v: v.count,
                                       "Extract count")
        # dp_result : (partition_key, dp_count)

        return dp_result
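For context, a hedged sketch of how a transform like this is typically applied from user code via pipeline_dp.private_beam (MakePrivate and Count follow PipelineDP's Beam API; treat the exact CountParams field set as an assumption):

import apache_beam as beam
import pipeline_dp
from pipeline_dp import aggregate_params, private_beam

budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-6)
with beam.Pipeline() as pipeline:
    views = pipeline | beam.Create([("user1", "movie1"), ("user2", "movie1")])
    private_views = views | private_beam.MakePrivate(
        budget_accountant=budget_accountant,
        privacy_id_extractor=lambda v: v[0])
    dp_counts = private_views | private_beam.Count(
        aggregate_params.CountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=1,
            partition_extractor=lambda v: v[1]))
    budget_accountant.compute_budgets()  # before the pipeline runs on exit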
Example 10: a DP sum over a Spark RDD.
    def sum(self, sum_params: aggregate_params.SumParams) -> RDD:
        """Computes a DP sum.

        Args:
            sum_params: parameters for calculation
        """

        backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=sum_params.noise_kind,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=sum_params.max_partitions_contributed,
            max_contributions_per_partition=sum_params.
            max_contributions_per_partition,
            min_value=sum_params.min_value,
            max_value=sum_params.max_value,
            public_partitions=sum_params.public_partitions,
            budget_weight=sum_params.budget_weight)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: sum_params.partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            value_extractor=lambda x: sum_params.value_extractor(x[1]))

        dp_result = dp_engine.aggregate(self._rdd, params, data_extractors)
        # dp_result : (partition_key, [dp_sum])

        # aggregate() returns a list of metrics for each partition key.
        # There is only one metric here (sum); unwrap it from the list.
        dp_result = backend.map_values(dp_result, lambda v: v[0], "Unnest list")
        # dp_result : (partition_key, dp_sum)

        return dp_result
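A hedged usage sketch for this Spark sum, following the make_private pattern from Example 28 (the data and parameter values are illustrative; the SumParams fields are the ones this snippet itself reads):

import pyspark
import pipeline_dp
from pipeline_dp import aggregate_params, budget_accounting, private_spark

sc = pyspark.SparkContext.getOrCreate()
# Elements: (user_id, (movie_id, rating)); the privacy id is the user id.
data = sc.parallelize([("user1", ("movie1", 4.0)), ("user2", ("movie1", 5.0))])

budget_accountant = budget_accounting.NaiveBudgetAccountant(total_epsilon=1,
                                                            total_delta=1e-6)
prdd = private_spark.make_private(data, budget_accountant, lambda x: x[0])

# The extractors receive the original element, as in the sum() method above.
dp_sums = prdd.sum(
    aggregate_params.SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                               max_partitions_contributed=2,
                               max_contributions_per_partition=1,
                               min_value=1,
                               max_value=5,
                               budget_weight=1,
                               partition_extractor=lambda x: x[1][0],
                               value_extractor=lambda x: x[1][1]))
budget_accountant.compute_budgets()
print(dp_sums.collect())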
Example 11: testing CountAccumulator with Gaussian noise; each added value increments the count by one.
    def test_with_noise(self):
        budget_accountant = NaiveBudgetAccountant(total_epsilon=10,
                                                  total_delta=1e-5)
        budget = budget_accountant.request_budget(
            pipeline_dp.MechanismType.GAUSSIAN)
        budget_accountant.compute_budgets()

        params = pipeline_dp.AggregateParams(
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            noise_kind=NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT])
        count_accumulator = accumulator.CountAccumulator(
            accumulator.CountParams(budget, params), list(range(5)))
        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=5,
                               delta=4)

        count_accumulator.add_value(50)
        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=6,
                               delta=4)

        count_accumulator.add_value(list(range(49)))
        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=7,
                               delta=4)

        count_accumulator.add_value('*' * 100)
        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=8,
                               delta=4)
Example 12: testing CountAccumulator with a budget so large that the noise is negligible.
    def test_without_noise(self):
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1000000,
                                                  total_delta=0.9999999)
        budget = budget_accountant.request_budget(
            pipeline_dp.MechanismType.GAUSSIAN)
        budget_accountant.compute_budgets()
        no_noise = pipeline_dp.AggregateParams(
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            noise_kind=NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT])
        count_accumulator = accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), list(range(5)))
        self.assertEqual(count_accumulator.compute_metrics(), 5)

        count_accumulator = accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), 'a' * 50)
        self.assertEqual(count_accumulator.compute_metrics(), 50)

        count_accumulator = accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), list(range(50)))
        count_accumulator.add_value(49)
        self.assertEqual(count_accumulator.compute_metrics(), 51)

        count_accumulator_1 = accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), list(range(50)))
        count_accumulator_2 = accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), 'a' * 50)
        count_accumulator_1.add_accumulator(count_accumulator_2)
        self.assertEqual(count_accumulator_1.compute_metrics(), 100)
Example 13: the expand() of a Beam PrivacyIdCount transform.
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._privacy_id_count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=self._privacy_id_count_params.
            max_partitions_contributed,
            max_contributions_per_partition=1,
            public_partitions=self._privacy_id_count_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._privacy_id_count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # PrivacyIdCount ignores values.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_privacy_id_count])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # There is only one metric here (privacy_id_count); extract it from
        # the namedtuple.
        dp_result = backend.map_values(dp_result, lambda v: v.privacy_id_count,
                                       "Extract privacy_id_count")
        # dp_result : (partition_key, dp_privacy_id_count)

        return dp_result
Example 14: testing CompoundAccumulatorFactory with two mocked accumulator factories.
    def test_accumulator_factory_multiple_types(
            self, mock_create_accumulator_factories):
        aggregate_params = pipeline_dp.AggregateParams(
            noise_kind=NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.MEAN],
            max_partitions_contributed=5,
            max_contributions_per_partition=3,
            min_value=0,
            max_value=1)
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=0.01)
        values = [10]

        mock_create_accumulator_factories.return_value = [
            MeanAccumulatorFactory(),
            SumOfSquaresAccumulatorFactory()
        ]

        accumulator_factory = accumulator.CompoundAccumulatorFactory(
            aggregate_params, budget_accountant)
        created_accumulator = accumulator_factory.create(values)

        self.assertTrue(
            isinstance(created_accumulator, accumulator.CompoundAccumulator))
        self.assertEqual(created_accumulator.compute_metrics(), [10, 100])
        mock_create_accumulator_factories.assert_called_with(
            aggregate_params, budget_accountant)
Example 15: the expand() of a Beam Mean transform.
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._mean_params.noise_kind,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=self._mean_params.
            max_partitions_contributed,
            max_contributions_per_partition=self._mean_params.
            max_contributions_per_partition,
            min_value=self._mean_params.min_value,
            max_value=self._mean_params.max_value,
            public_partitions=self._mean_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._mean_params.partition_extractor(
                x[1]),
            privacy_id_extractor=lambda x: x[0],
            value_extractor=lambda x: self._mean_params.value_extractor(x[1]))

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_mean])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # There is only one metric here (mean); extract it from the namedtuple.
        dp_result = backend.map_values(dp_result, lambda v: v.mean,
                                       "Extract mean")
        # dp_result : (partition_key, dp_mean)

        return dp_result
Example 16: a complete local run: budget accounting, aggregation, and writing the results to a file.
def main(unused_argv):
    # Here, we use a local backend for computations. It does not depend on any
    # pipeline framework, is implemented in pure Python inside PipelineDP, and
    # keeps all data in memory, so it is not optimized for large datasets.
    # For datasets smaller than ~tens of megabytes, local execution without a
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    movie_views = parse_file(FLAGS.input_file)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        metrics=[
            # We can compute multiple metrics at once.
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.PRIVACY_ID_COUNT
        ],
        # Limits to how much one user can contribute:
        # .. at most two movies rated per user
        max_partitions_contributed=2,
        # .. at most one rating for each movie
        max_contributions_per_partition=1,
        # .. with minimal rating of "1"
        min_value=1,
        # .. and maximum rating of "5"
        max_value=5)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of movie_views.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Create a computational graph for the aggregation.
    # All computations are lazy. dp_result is iterable, but iterating it would
    # fail until budget is computed (below).
    # It’s possible to call DPEngine.aggregate multiple times with different
    # metrics to compute.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    # This is where the lazy iterator actually runs the computations and is
    # materialized into concrete results.
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
Example 17: AggregateParams that bound the per-partition sum directly (min_sum_per_partition/max_sum_per_partition) instead of bounding each value.
    def _create_aggregate_params_per_partition_bound(self):
        return pipeline_dp.AggregateParams(
            min_sum_per_partition=0,
            max_sum_per_partition=3,
            max_contributions_per_partition=1,
            max_partitions_contributed=1,
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM])
Example 18: a helper returning default AggregateParams together with a list of partition keys.
    def _create_params_default(self):
        return (pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT, agg.Metrics.SUM, agg.Metrics.MEAN],
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1), ["pk0", "pk10", "pk11"])
Example 19: a factory for count AggregateParams with a configurable max_value.
def _create_aggregate_params(max_value: float = 1):
    return pipeline_dp.AggregateParams(
        min_value=0,
        max_value=max_value,
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT])
Example 20: a factory for AggregateParams that sets budget_weight=10.0.
    def _create_aggregate_params(self, metrics: list):
        return pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=metrics,
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            budget_weight=10.0)
Example 21: a test that aggregate() rejects invalid col, params, and data_extractor arguments.
    def test_check_aggregate_params(self):
        default_extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: x,
            value_extractor=lambda x: x,
        )
        default_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT])

        test_cases = [
            {
                "desc": "None col",
                "col": None,
                "params": default_params,
                "data_extractor": default_extractors,
            },
            {
                "desc": "empty col",
                "col": [],
                "params": default_params,
                "data_extractor": default_extractors
            },
            {
                "desc": "none params",
                "col": [0],
                "params": None,
                "data_extractor": default_extractors,
            },
            {
                "desc": "None data_extractor",
                "col": [0],
                "params": default_params,
                "data_extractor": None,
            },
            {
                "desc": "data_extractor with an incorrect type",
                "col": [0],
                "params": default_params,
                "data_extractor": 1,
            },
        ]

        for test_case in test_cases:
            with self.assertRaises(Exception, msg=test_case["desc"]):
                budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-10)
                engine = pipeline_dp.DPEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                engine.aggregate(test_case["col"], test_case["params"],
                                 test_case["data_extractor"])
Example 22: computing several DP metrics over movie ratings, optionally as a vector sum, and printing the explain-computations reports.
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[
            pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.MEAN, pipeline_dp.Metrics.VARIANCE
        ] + ([pipeline_dp.Metrics.PRIVACY_ID_COUNT]
             if not FLAGS.contribution_bounds_already_enforced else []),
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        contribution_bounds_already_enforced=FLAGS.
        contribution_bounds_already_enforced)

    value_extractor = lambda mv: mv.rating

    if FLAGS.vector_metrics:
        # Specify which DP aggregated metrics to compute for vector values.
        params.metrics = [pipeline_dp.Metrics.VECTOR_SUM]
        params.vector_size = 5  # Size of ratings vector
        params.vector_max_norm = 1
        value_extractor = lambda mv: encode_one_hot(mv.rating - 1, params.
                                                    vector_size)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=(lambda mv: mv.user_id)
        if not FLAGS.contribution_bounds_already_enforced else None,
        value_extractor=value_extractor)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors,
                                    public_partitions)

    budget_accountant.compute_budgets()

    reports = dp_engine.explain_computations_report()
    for report in reports:
        print(report)

    return dp_result
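encode_one_hot is referenced above but not defined in this snippet. A plausible reconstruction consistent with its usage, encode_one_hot(mv.rating - 1, params.vector_size); treat it as illustrative:

def encode_one_hot(index: int, size: int) -> list:
    """Illustrative sketch: a one-hot vector of length `size` with a 1 at
    `index`, so rating 3 becomes [0, 0, 1, 0, 0] for size 5."""
    vector = [0] * size
    vector[index] = 1
    return vector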
Example 23: a test of _create_accumulator_factories for count parameters.
    def test_create_accumulator_factories_with_count_params(self):
        acc_factories = accumulator._create_accumulator_factories(
            aggregation_params=pipeline_dp.AggregateParams(
                noise_kind=NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.COUNT],
                max_partitions_contributed=1,
                max_contributions_per_partition=1,
                budget_weight=1),
            budget_accountant=NaiveBudgetAccountant(total_epsilon=1,
                                                    total_delta=0.01))
        self.assertEqual(len(acc_factories), 1)
        self.assertIsInstance(acc_factories[0],
                              accumulator.CountAccumulatorFactory)
Example 24: building CombinerParams for a privacy-ID count combiner.
def _create_combiner_params_for_privacy_id_count(
) -> pipeline_dp.combiners.CombinerParams:
    return pipeline_dp.combiners.CombinerParams(
        pipeline_dp.budget_accounting.MechanismSpec(
            mechanism_type=pipeline_dp.MechanismType.GAUSSIAN,
            _eps=1,
            _delta=0.00001),
        pipeline_dp.AggregateParams(
            max_partitions_contributed=2,
            max_contributions_per_partition=2,
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        ))
Example 25: a factory for AggregateParams with vector-norm parameters.
def _create_aggregate_params(max_value: float = 1,
                             vector_size: int = 1,
                             vector_norm_kind=pipeline_dp.NormKind.Linf):
    return pipeline_dp.AggregateParams(
        min_value=0,
        max_value=max_value,
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT],
        vector_norm_kind=vector_norm_kind,
        vector_max_norm=5,
        vector_size=vector_size)
Example 26: a variant of Example 1 with real input data and mocked accumulator params (older API: low/high bounds and LocalPipelineOperations).
    def test_aggregate_report(self, mock_create_accumulator_params_function):
        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: "pid" + str(x),
            partition_extractor=lambda x: "pk" + str(x),
            value_extractor=lambda x: x)
        params1 = pipeline_dp.AggregateParams(
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            low=1,
            high=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            low=2,
            high=10,
            metrics=[
                pipeline_dp.Metrics.VAR, pipeline_dp.Metrics.SUM,
                pipeline_dp.Metrics.MEAN
            ],
            public_partitions=list(range(1, 40)),
        )
        mock_create_accumulator_params_function.return_value = [
            pipeline_dp.accumulator.AccumulatorParams(
                pipeline_dp.accumulator.CountAccumulator, None)
        ]
        engine = pipeline_dp.DPEngine(
            budget_accountant=NaiveBudgetAccountant(total_epsilon=1,
                                                    total_delta=1e-10),
            ops=pipeline_dp.LocalPipelineOperations())
        engine.aggregate(col, params1, data_extractor)
        engine.aggregate(col, params2, data_extractor)
        self.assertEqual(len(engine._report_generators), 2)  # pylint: disable=protected-access
Example 27: a test that UtilityAnalysisEngine rejects custom combiners, unsupported metrics, and pre-enforced contribution bounds.
    def test_utility_analysis_params(self):
        default_extractors = self._get_default_extractors()
        default_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            metrics=[pipeline_dp.Metrics.COUNT])
        params_with_custom_combiners = copy.copy(default_params)
        params_with_custom_combiners.custom_combiners = sum
        params_with_unsupported_metric = copy.copy(default_params)
        params_with_unsupported_metric.metrics = [pipeline_dp.Metrics.MEAN]
        params_with_contribution_bounds_already_enforced = copy.copy(
            default_params)
        params_with_contribution_bounds_already_enforced.contribution_bounds_already_enforced = True

        test_cases = [
            {
                "desc": "custom combiners",
                "params": params_with_custom_combiners,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "unsupported metric in metrics",
                "params": params_with_unsupported_metric,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "contribution bounds are already enforced",
                "params": params_with_contribution_bounds_already_enforced,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
        ]

        for test_case in test_cases:

            with self.assertRaisesRegex(Exception,
                                        expected_regex=test_case["desc"]):
                budget_accountant = budget_accounting.NaiveBudgetAccountant(
                    total_epsilon=1, total_delta=1e-10)
                engine = dp_engine.UtilityAnalysisEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                col = [0, 1, 2]
                engine.aggregate(
                    col,
                    test_case["params"],
                    test_case["data_extractor"],
                    public_partitions=test_case["public_partitions"])
Example 28: a test that PrivateRDD.variance() forwards the expected AggregateParams to DPEngine.aggregate().
    def test_variance_calls_aggregate_with_correct_params(
            self, mock_aggregate):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, 0.0, "pk1"),
                                                   (2, 10.0, "pk1")])
        MetricsTuple = collections.namedtuple('MetricsTuple', ['variance'])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([
            ("pk1", MetricsTuple(variance=25.0))
        ])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(x):
            return x[1]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        variance_params = agg.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.5,
            max_value=5.78,
            budget_weight=1.1,
            partition_extractor=lambda x: x[0],
            value_extractor=lambda x: x)

        # Act
        actual_result = prdd.variance(variance_params)

        # Assert
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(args[0].collect(), rdd.collect())

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.VARIANCE],
            max_partitions_contributed=variance_params.
            max_partitions_contributed,
            max_contributions_per_partition=variance_params.
            max_contributions_per_partition,
            min_value=variance_params.min_value,
            max_value=variance_params.max_value,
            budget_weight=variance_params.budget_weight,
            public_partitions=variance_params.public_partitions)
        self.assertEqual(args[1], params)

        self.assertEqual(actual_result.collect(), [("pk1", 25.0)])
Example 29: a test of create_accumulator_params for count parameters.
    def test_create_accumulator_params_with_count_params(self):
        acc_params = accumulator.create_accumulator_params(
            aggregation_params=pipeline_dp.AggregateParams(
                metrics=[pipeline_dp.Metrics.COUNT],
                max_partitions_contributed=4,
                max_contributions_per_partition=5,
                budget_weight=1),
            budget_accountant=NaiveBudgetAccountant(total_epsilon=1,
                                                    total_delta=0.01))
        self.assertEqual(len(acc_params), 1)
        self.assertEqual(acc_params[0].accumulator_type,
                         accumulator.CountAccumulator)
        self.assertTrue(
            isinstance(acc_params[0].constructor_params,
                       accumulator.CountParams))
Example 30: building CombinerParams for a sum combiner with per-partition sum bounds.
def _create_combiner_params_for_sum(
        min_sum, max_sum) -> pipeline_dp.combiners.CombinerParams:
    return pipeline_dp.combiners.CombinerParams(
        pipeline_dp.budget_accounting.MechanismSpec(
            mechanism_type=pipeline_dp.MechanismType.GAUSSIAN,
            _eps=1,
            _delta=0.00001),
        pipeline_dp.AggregateParams(
            max_partitions_contributed=1,
            max_contributions_per_partition=2,
            min_sum_per_partition=min_sum,
            max_sum_per_partition=max_sum,
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM],
        ))
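Examples 17 and 30 bound the per-partition sum directly (min_sum_per_partition/max_sum_per_partition) instead of bounding each value (min_value/max_value). A small pure-Python sketch of the difference in the clipping the two styles imply (illustrative only, not PipelineDP internals):

def clip(x, lo, hi):
    return max(lo, min(hi, x))

# One user's contributions to a single partition.
user_values_in_partition = [4.0, 4.0, 4.0]

# Per-value bounds (min_value=0, max_value=3): clip each value, then sum.
per_value_bounded = sum(clip(v, 0, 3) for v in user_values_in_partition)  # 9.0

# Per-partition bounds (min_sum_per_partition=0, max_sum_per_partition=3):
# clip the user's total within the partition.
per_partition_bounded = clip(sum(user_values_in_partition), 0, 3)  # 3.0

print(per_value_bounded, per_partition_bounded)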