def get_private_movies(movie_views, backend):
    """Selects the movie ids to release using DP partition selection.

    No metrics are computed: the engine is only asked for partition keys
    (movie ids), with a total budget of epsilon=0.1 and delta=1e-6.

    Args:
        movie_views: collection of movie-view records; each record is read
            through its `movie_id` and `user_id` attributes.
        backend: pipeline backend handed to `pipeline_dp.DPEngine`.

    Returns:
        The result of `DPEngine.select_partitions` for the given input.
    """
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=0.1,
                                                   total_delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, backend)

    # A movie is the partition, a user is the privacy unit.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda view: view.user_id,
        partition_extractor=lambda view: view.movie_id)
    selection_params = pipeline_dp.SelectPartitionsParams(
        max_partitions_contributed=2)

    selected_movies = engine.select_partitions(movie_views,
                                               selection_params,
                                               data_extractors=extractors)
    # Budgets are computed once the selection has been requested.
    accountant.compute_budgets()
    return selected_movies
def test_aggregate_computation_graph_verification(
        self, mock_bound_contributions):
    # Verifies that aggregate() passes both contribution bounds from the
    # params to the (mocked) contribution-bounding step.

    # Arrange
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[agg.Metrics.COUNT],
        max_partitions_contributed=5,
        max_contributions_per_partition=3)
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: f"pid{x}",
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)
    # Canned output of the mocked bounding step.
    bounded_rows = [
        [("pid1", "pk1"), (1, [1])],
        [("pid2", "pk2"), (1, [1])],
        [("pid3", "pk3"), (1, [2])],
    ]
    mock_bound_contributions.return_value = bounded_rows
    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    _ = engine.aggregate(col=rows, params=params, data_extractors=extractors)

    # Assert
    mock_bound_contributions.assert_called_with(
        unittest.mock.ANY,
        params.max_partitions_contributed,
        params.max_contributions_per_partition,
        unittest.mock.ANY)
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Runs a DP aggregation that uses a custom combiner instead of metrics.

    Args:
        movie_views: collection of movie-view records; each record is read
            through its `movie_id`, `user_id` and `rating` attributes.
        backend: pipeline backend handed to `pipeline_dp.DPEngine`.
        public_partitions: forwarded to `AggregateParams.public_partitions`.

    Returns:
        The result of `DPEngine.aggregate` for the given input.
    """
    # Total privacy budget: epsilon=1, delta=1e-6.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, backend)

    # Movie is the partition, user is the privacy unit, rating is the value.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda view: view.user_id,
        partition_extractor=lambda view: view.movie_id,
        value_extractor=lambda view: view.rating)

    # No built-in metrics are requested (metrics=None); the computation is
    # defined by the custom CountCombiner.
    aggregate_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=None,
        custom_combiners=[CountCombiner()],
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        public_partitions=public_partitions)

    aggregated = engine.aggregate(movie_views, aggregate_params, extractors)
    accountant.compute_budgets()
    return aggregated
def sum(self, sum_params: aggregate_params.SumParams) -> RDD:
    """Computes a DP sum over the wrapped RDD.

    Elements of the RDD are read as `(privacy_id, record)` pairs: index 0 is
    the privacy id, index 1 is passed to the extractors from `sum_params`.

    Args:
        sum_params: parameters for the calculation.

    Returns:
        The aggregation output with each value replaced by its first
        element, i.e. `(partition_key, dp_sum)`.
    """
    spark_backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    engine = pipeline_dp.DPEngine(self._budget_accountant, spark_backend)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: sum_params.partition_extractor(x[1]),
        value_extractor=lambda x: sum_params.value_extractor(x[1]))

    engine_params = pipeline_dp.AggregateParams(
        noise_kind=sum_params.noise_kind,
        metrics=[pipeline_dp.Metrics.SUM],
        min_value=sum_params.min_value,
        max_value=sum_params.max_value,
        max_partitions_contributed=sum_params.max_partitions_contributed,
        max_contributions_per_partition=(
            sum_params.max_contributions_per_partition),
        public_partitions=sum_params.public_partitions,
        budget_weight=sum_params.budget_weight)

    # aggregated : (partition_key, [dp_sum])
    aggregated = engine.aggregate(self._rdd, engine_params, extractors)

    # Only one metric (sum) was requested, so take the single element out of
    # the per-partition container.
    # result : (partition_key, dp_sum)
    return spark_backend.map_values(aggregated, lambda v: v[0],
                                    "Unnest list")
def test_contribution_bounds_already_enforced_sensible_result(self):
    # With contribution bounds already enforced no privacy id extractor is
    # supplied; with a very large budget (tiny noise) every public partition
    # should end up with a DP sum of about 1.

    # Arrange.
    budget = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1000,
                                               total_delta=0.999)
    dp_engine = self._create_dp_engine_default(accountant=budget)
    params, public_partitions = self._create_params_default()
    params.contribution_bounds_already_enforced = True
    params.metrics = [pipeline_dp.Metrics.SUM]

    # One (partition_key, value=1) row per public partition.
    rows = [(pk, 1) for pk in public_partitions]
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row[0],
        value_extractor=lambda row: row[1])
    extractors.privacy_id_extractor = None

    # Act.
    output = dp_engine.aggregate(rows, params, extractors, public_partitions)
    budget.compute_budgets()
    output = list(output)

    # Assert.
    expected_sums = [1.0] * len(public_partitions)
    self.assertLen(output, len(public_partitions))
    dp_sums = [entry[1].sum for entry in output]
    self.assertSequenceAlmostEqual(dp_sums, expected_sums)
def test_check_invalid_bounding_params(self, error_msg, min_value, max_value,
                                       max_partitions_contributed,
                                       max_contributions_per_partition,
                                       metrics):
    # Building the params and/or calling aggregate() with the supplied
    # bounding values is expected to raise.
    with self.assertRaises(Exception, msg=error_msg):
        accountant = NaiveBudgetAccountant(total_epsilon=1,
                                           total_delta=1e-10)
        dp_engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                         backend=pipeline_dp.LocalBackend())
        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=max_partitions_contributed,
            max_contributions_per_partition=max_contributions_per_partition,
            min_value=min_value,
            max_value=max_value,
            metrics=metrics)
        identity_extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: x,
            value_extractor=lambda x: x,
        )
        dp_engine.aggregate([0], params, identity_extractors)
def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
    """Builds the DP privacy-id-count computation for `pcol`.

    Elements are read as `(privacy_id, record)` pairs: index 0 is used as the
    privacy id and index 1 is passed to the configured partition extractor.

    Args:
        pcol: input collection.

    Returns:
        The aggregation output with each value replaced by its
        `privacy_id_count` field, i.e. `(partition_key, dp_privacy_id_count)`.
    """
    beam_backend = pipeline_dp.BeamBackend()
    engine = pipeline_dp.DPEngine(self._budget_accountant, beam_backend)

    user_params = self._privacy_id_count_params
    engine_params = pipeline_dp.AggregateParams(
        noise_kind=user_params.noise_kind,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=user_params.max_partitions_contributed,
        # The per-partition bound is fixed to 1 for this metric.
        max_contributions_per_partition=1,
        public_partitions=user_params.public_partitions)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: self._privacy_id_count_params.
        partition_extractor(x[1]),
        # PrivacyIdCount ignores values.
        value_extractor=lambda x: None)

    # aggregated : (partition_key, metrics), where metrics exposes the single
    # requested metric as the attribute `privacy_id_count`.
    aggregated = engine.aggregate(pcol, engine_params, extractors)

    # result : (partition_key, dp_privacy_id_count)
    return beam_backend.map_values(aggregated, lambda v: v.privacy_id_count,
                                   "Extract privacy_id_count")
def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
    """Builds the DP count computation for `pcol`.

    Elements are read as `(privacy_id, record)` pairs: index 0 is used as the
    privacy id and index 1 is passed to the configured partition extractor.

    Args:
        pcol: input collection.

    Returns:
        The aggregation output with each value replaced by its `count`
        field, i.e. `(partition_key, dp_count)`.
    """
    backend = pipeline_dp.BeamBackend()
    dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=self._count_params.noise_kind,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=self._count_params.
        max_partitions_contributed,
        max_contributions_per_partition=self._count_params.
        max_contributions_per_partition,
        public_partitions=self._count_params.public_partitions)

    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: self._count_params.partition_extractor(
            x[1]),
        privacy_id_extractor=lambda x: x[0],
        # Count calculates the number of elements per partition key and
        # doesn't use value extractor.
        value_extractor=lambda x: None)

    dp_result = dp_engine.aggregate(pcol, params, data_extractors)
    # dp_result : (partition_key, metrics), where metrics exposes the single
    # requested metric as the attribute `count`.

    # The stage label used to read "Extract sum" (copied from the sum
    # transform); it now names what the stage actually extracts.
    dp_result = backend.map_values(dp_result, lambda v: v.count,
                                   "Extract count")
    # dp_result : (partition_key, dp_count)

    return dp_result
def expand(self, pcol: pvalue.PCollection):
    """Builds a DP aggregation driven by the wrapped combine function.

    Assumed element format: `(privacy_id, (partition_key, value))`.

    Args:
        pcol: input collection.

    Returns:
        The aggregation output with each value replaced by its first
        element, i.e. `(partition_key, result)`.
    """
    # The combine fn is adapted to a custom combiner; no built-in metrics are
    # requested (metrics=None).
    custom_combiner = _CombineFnCombiner(self._combine_fn)
    bounds = self._params
    engine_params = pipeline_dp.AggregateParams(
        metrics=None,
        custom_combiners=[custom_combiner],
        max_partitions_contributed=bounds.max_partitions_contributed,
        max_contributions_per_partition=(
            bounds.max_contributions_per_partition))

    backend, dp_engine = self._create_dp_engine()

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: x[1][0],
        value_extractor=lambda x: x[1][1])

    # aggregated : (partition_key, (combiner_result,))
    # There is one element per combiner, and only one combiner is used here.
    aggregated = dp_engine.aggregate(pcol, engine_params, extractors)

    # result : (partition_key, combiner_result)
    return backend.map_values(aggregated, lambda v: v[0], "Unnest tuple")
def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
    """Builds the DP mean computation for `pcol`.

    Elements are read as `(privacy_id, record)` pairs: index 0 is used as the
    privacy id and index 1 is passed to the configured partition and value
    extractors.

    Args:
        pcol: input collection.

    Returns:
        The aggregation output with each value replaced by its `mean` field,
        i.e. `(partition_key, dp_mean)`.
    """
    beam_backend = pipeline_dp.BeamBackend()
    engine = pipeline_dp.DPEngine(self._budget_accountant, beam_backend)

    user_params = self._mean_params
    engine_params = pipeline_dp.AggregateParams(
        noise_kind=user_params.noise_kind,
        metrics=[pipeline_dp.Metrics.MEAN],
        min_value=user_params.min_value,
        max_value=user_params.max_value,
        max_partitions_contributed=user_params.max_partitions_contributed,
        max_contributions_per_partition=(
            user_params.max_contributions_per_partition),
        public_partitions=user_params.public_partitions)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: self._mean_params.partition_extractor(
            x[1]),
        value_extractor=lambda x: self._mean_params.value_extractor(x[1]))

    # aggregated : (partition_key, metrics), where metrics exposes the single
    # requested metric as the attribute `mean`.
    aggregated = engine.aggregate(pcol, engine_params, extractors)

    # result : (partition_key, dp_mean)
    return beam_backend.map_values(aggregated, lambda v: v.mean,
                                   "Extract mean")
def calc_dp_rating_metrics(movie_views, ops, public_partitions):
    """Computes a DP count of movie views per movie.

    Args:
        movie_views: collection of movie-view records; each record is read
            through its `movie_id`, `user_id` and `rating` attributes.
        ops: pipeline operations object handed to `pipeline_dp.DPEngine`.
        public_partitions: forwarded to `AggregateParams.public_partitions`.

    Returns:
        The result of `DPEngine.aggregate` for the given input.
    """
    # Total privacy budget: epsilon=1, delta=1e-6.
    accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, ops)

    # How to get privacy_id, partition_key and value out of one movie view.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda view: view.user_id,
        partition_extractor=lambda view: view.movie_id,
        value_extractor=lambda view: view.rating)

    # Only COUNT is requested.
    requested_metrics = [pipeline_dp.Metrics.COUNT]
    aggregate_params = pipeline_dp.AggregateParams(
        metrics=requested_metrics,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        low=1,
        high=5,
        public_partitions=public_partitions)

    aggregated = engine.aggregate(movie_views, aggregate_params, extractors)
    accountant.compute_budgets()
    return aggregated
def run_e2e_private_partition_selection_large_budget(col, backend):
    """Runs COUNT and SUM aggregation with Laplace noise and a huge budget.

    Args:
        col: input collection; each element `x` is its own privacy id and
            value, and is assigned to partition `f"pk{x//2}"`.
        backend: pipeline backend handed to `pipeline_dp.DPEngine`.

    Returns:
        The result of `DPEngine.aggregate` for the given input.
    """
    # Arrange
    # A large budget means small noise and that all partition keys are kept.
    accountant = NaiveBudgetAccountant(total_epsilon=100000, total_delta=1)
    dp_engine = pipeline_dp.DPEngine(accountant, backend)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x//2}",
        value_extractor=lambda x: x)
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[agg.Metrics.COUNT, agg.Metrics.SUM],
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=10)

    aggregated = dp_engine.aggregate(col=col,
                                     params=params,
                                     data_extractors=extractors)
    accountant.compute_budgets()
    return aggregated
def test_aggregate_report(self):
    # Declares two aggregations and one partition selection on the same
    # engine, then checks the explanation report produced for each of them.
    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: f"pid{x}",
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)

    # First aggregation: no public partitions.
    private_partitions_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[
            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.MEAN,
        ],
        min_value=1,
        max_value=5,
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
    )
    # Second aggregation: public partitions supplied.
    public_partitions_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
        min_value=2,
        max_value=10,
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        public_partitions=list(range(1, 40)),
    )
    selection_params = SelectPartitionsParams(max_partitions_contributed=2)

    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    engine.aggregate(rows, private_partitions_params, extractors)
    engine.aggregate(rows, public_partitions_params, extractors)
    engine.select_partitions(rows, selection_params, extractors)

    # Expected report text, in the order the computations were declared.
    expected_reports = [
        ("Differentially private: Computing <Metrics: ['privacy_id_count', 'count', 'mean']>"
         "\n1. Per-partition contribution bounding: randomly selected not more than 2 contributions"
         "\n2. Cross-partition contribution bounding: randomly selected not more than 3 partitions per user"
         "\n3. Private Partition selection: using Truncated Geometric method with (eps= 0.1111111111111111, delta = 1.1111111111111111e-11)"
        ),
        ("Differentially private: Computing <Metrics: ['sum', 'mean']>"
         "\n1. Public partition selection: dropped non public partitions"
         "\n2. Per-partition contribution bounding: randomly selected not more than 3 contributions"
         "\n3. Cross-partition contribution bounding: randomly selected not more than 1 partitions per user"
         "\n4. Adding empty partitions to public partitions that are missing in data"
        ),
        ("Differentially private: Computing <Private Partitions>"
         "\n1. Private Partition selection: using Truncated Geometric method with (eps= 0.3333333333333333, delta = 3.3333333333333335e-11)"
        ),
    ]

    self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access
    accountant.compute_budgets()
    for index, expected_text in enumerate(expected_reports):
        self.assertEqual(engine._report_generators[index].report(),
                         expected_text)
def main(unused_argv):
    """Reads movie views, computes DP metrics locally and writes the result.

    Args:
        unused_argv: ignored.

    Returns:
        0 once the results have been written.
    """
    # Load and parse input data.
    movie_views = parse_file(FLAGS.input_file)

    # The local backend is pure Python, independent of any pipeline
    # framework. It keeps all data in memory and is not optimized for large
    # data; for datasets smaller than ~tens of megabytes it is faster than
    # the local modes of Beam or Spark.
    local_backend = pipeline_dp.LocalBackend()

    # Privacy budget available for the whole computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, local_backend)

    # Several metrics can be computed at once.
    requested_metrics = [
        pipeline_dp.Metrics.COUNT,
        pipeline_dp.Metrics.SUM,
        pipeline_dp.Metrics.PRIVACY_ID_COUNT,
    ]
    aggregate_params = pipeline_dp.AggregateParams(
        metrics=requested_metrics,
        # Limits on how much one user can contribute:
        # .. at most two movies rated per user
        max_partitions_contributed=2,
        # .. at most one rating for each movie
        max_contributions_per_partition=1,
        # .. ratings are between 1 and 5
        min_value=1,
        max_value=5)

    # How to get privacy_id, partition_key and value out of one element of
    # movie_views.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda view: view.user_id,
        partition_extractor=lambda view: view.movie_id,
        value_extractor=lambda view: view.rating)

    # This only builds the computational graph: everything is lazy. The
    # result is iterable, but iterating it fails until the budget has been
    # computed below. DPEngine.aggregate may be called multiple times with
    # different metrics.
    lazy_result = engine.aggregate(movie_views, aggregate_params, extractors)

    accountant.compute_budgets()

    # Materializing the lazy iterator is what triggers the computation.
    materialized = list(lazy_result)

    # Save the results.
    write_to_file(materialized, FLAGS.output_file)

    return 0
def test_check_aggregate_params(self):
    # Each malformed argument combination below must make aggregate() (or
    # the setup leading to it) raise.
    valid_extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: x,
        value_extractor=lambda x: x,
    )
    valid_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    test_cases = [
        {
            "desc": "None col",
            "col": None,
            "params": valid_params,
            "data_extractor": valid_extractors,
        },
        {
            "desc": "empty col",
            "col": [],
            "params": valid_params,
            "data_extractor": valid_extractors,
        },
        {
            "desc": "none params",
            "col": [0],
            "params": None,
            "data_extractor": valid_extractors,
        },
        {
            "desc": "None data_extractor",
            "col": [0],
            "params": valid_params,
            "data_extractor": None,
        },
        {
            "desc": "data_extractor with an incorrect type",
            "col": [0],
            "params": valid_params,
            "data_extractor": 1,
        },
    ]

    for case in test_cases:
        with self.assertRaises(Exception, msg=case["desc"]):
            # A fresh accountant and engine per case, created inside the
            # assertRaises scope.
            accountant = NaiveBudgetAccountant(total_epsilon=1,
                                               total_delta=1e-10)
            dp_engine = pipeline_dp.DPEngine(
                budget_accountant=accountant,
                backend=pipeline_dp.LocalBackend())
            dp_engine.aggregate(case["col"], case["params"],
                                case["data_extractor"])
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP rating metrics, configured by command-line flags.

    `FLAGS.contribution_bounds_already_enforced` drops PRIVACY_ID_COUNT and
    the privacy id extractor; `FLAGS.vector_metrics` switches the computation
    to VECTOR_SUM over one-hot encoded ratings. The explanation reports of
    the engine are printed before returning.

    Args:
        movie_views: collection of movie-view records; each record is read
            through its `movie_id`, `user_id` and `rating` attributes.
        backend: pipeline backend handed to `pipeline_dp.DPEngine`.
        public_partitions: forwarded to `DPEngine.aggregate`.

    Returns:
        The result of `DPEngine.aggregate` for the given input.
    """
    bounds_enforced = FLAGS.contribution_bounds_already_enforced

    # Total privacy budget: epsilon=1, delta=1e-6.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, backend)

    scalar_metrics = [
        pipeline_dp.Metrics.COUNT,
        pipeline_dp.Metrics.SUM,
        pipeline_dp.Metrics.MEAN,
        pipeline_dp.Metrics.VARIANCE,
    ]
    if not bounds_enforced:
        # Requested only when contribution bounds are not already enforced.
        scalar_metrics.append(pipeline_dp.Metrics.PRIVACY_ID_COUNT)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=scalar_metrics,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        contribution_bounds_already_enforced=bounds_enforced)

    value_extractor = lambda mv: mv.rating

    if FLAGS.vector_metrics:
        # Vector mode: replace the metrics by a sum of one-hot rating vectors.
        params.metrics = [pipeline_dp.Metrics.VECTOR_SUM]
        params.vector_size = 5  # Size of ratings vector
        params.vector_max_norm = 1
        value_extractor = lambda mv: encode_one_hot(mv.rating - 1,
                                                    params.vector_size)

    # No privacy id extractor is supplied when bounds are already enforced.
    if bounds_enforced:
        privacy_id_extractor = None
    else:
        privacy_id_extractor = lambda mv: mv.user_id

    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=privacy_id_extractor,
        value_extractor=value_extractor)

    aggregated = engine.aggregate(movie_views, params, extractors,
                                  public_partitions)

    accountant.compute_budgets()

    for report in engine.explain_computations_report():
        print(report)

    return aggregated
def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
    """Builds the DP partition selection for `pcol`.

    Elements are read as `(privacy_id, record)` pairs: index 0 is used as the
    privacy id and index 1 is passed to the configured partition extractor.

    Args:
        pcol: input collection.

    Returns:
        The result of `DPEngine.select_partitions`.
    """
    engine = pipeline_dp.DPEngine(self._budget_accountant,
                                  pipeline_dp.BeamBackend())
    # Partition selection needs no value extractor.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: self._partition_extractor(x[1]))
    return engine.select_partitions(pcol, self._select_partitions_params,
                                    extractors)
def test_select_partitions(self):
    # This test is probabilistic, but the parameters were chosen to ensure
    # the test has passed at least 10000 runs.

    # Arrange
    selection_params = SelectPartitionsParams(max_partitions_contributed=1)
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-5)

    # The dataset is a list of (user, partition_key) tuples; the partitions
    # are generated to reflect several scenarios.
    # A partition with a sufficient number of users.
    rows = [(u, "pk-many-contribs") for u in range(25)]
    # A partition with many contributions, but only a few unique users.
    rows.extend(
        (100 + u // 10, "pk-many-contribs-few-users") for u in range(30))
    # A partition with few contributions.
    rows.extend((200 + u, "pk-few-contribs") for u in range(3))
    # 30 partitions, each with the same group of 25 users. 25 users would be
    # enough to keep a partition, but because of contribution bounding far
    # fewer users per partition remain.
    for i in range(30):
        rows.extend(
            (500 + u, f"few-contribs-after-bound{i}") for u in range(25))

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: x[1])
    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    selected = engine.select_partitions(col=rows,
                                        params=selection_params,
                                        data_extractors=extractors)
    accountant.compute_budgets()
    selected = list(selected)

    # Assert
    # Only one partition is retained: the one that still has many unique
    # users _after_ the "max_partitions_contributed" bound is applied.
    self.assertEqual(["pk-many-contribs"], selected)
def test_aggregate_public_partitions_add_empty_public_partitions(self):
    # Arrange
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[
            agg.Metrics.COUNT,
            agg.Metrics.SUM,
            agg.Metrics.PRIVACY_ID_COUNT,
        ],
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        min_value=0,
        max_value=1,
        public_partitions=["pk0", "pk10", "pk11"])

    # A high budget makes the added noise close to 0.
    accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                       total_delta=1 - 1e-10)

    # 10 input elements: each privacy id contributes once and each partition
    # ("pk0" .. "pk9") holds exactly one element with value 1.
    rows = list(range(10))
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: 1)

    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    output = engine.aggregate(col=rows,
                              params=params,
                              data_extractors=extractors)
    accountant.compute_budgets()
    output = list(output)
    partition_keys = [entry[0] for entry in output]

    # Assert
    # Of the input partitions only the public one ("pk0") is kept, and the
    # public partitions missing from the data ("pk10", "pk11") are added.
    self.assertEqual(["pk0", "pk10", "pk11"], partition_keys)
    pk0_metrics = output[0][1]
    pk10_metrics = output[1][1]
    self.assertAlmostEqual(1, pk0_metrics[0])  # "pk0" COUNT ≈ 1
    self.assertAlmostEqual(1, pk0_metrics[1])  # "pk0" SUM ≈ 1
    self.assertAlmostEqual(1, pk0_metrics[2])  # "pk0" PRIVACY_ID_COUNT ≈ 1
    self.assertAlmostEqual(0, pk10_metrics[0])  # "pk10" COUNT ≈ 0
    self.assertAlmostEqual(0, pk10_metrics[1])  # "pk10" SUM ≈ 0
    self.assertAlmostEqual(0, pk10_metrics[2])  # "pk10" PRIVACY_ID_COUNT ≈ 0
def variance(self,
             variance_params: aggregate_params.VarianceParams,
             public_partitions=None) -> RDD:
    """Computes a DP variance over the wrapped RDD.

    Elements of the RDD are read as `(privacy_id, record)` pairs: index 0 is
    the privacy id, index 1 is passed to the extractors from
    `variance_params`.

    Args:
        variance_params: parameters for the calculation.
        public_partitions: A collection of partition keys that will be
            present in the result. Optional. If not provided, partitions
            will be selected in a DP manner.

    Returns:
        The aggregation output with each value replaced by its `variance`
        field, i.e. `(partition_key, dp_variance)`.
    """
    spark_backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    engine = pipeline_dp.DPEngine(self._budget_accountant, spark_backend)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: variance_params.partition_extractor(
            x[1]),
        value_extractor=lambda x: variance_params.value_extractor(x[1]))

    engine_params = pipeline_dp.AggregateParams(
        noise_kind=variance_params.noise_kind,
        metrics=[pipeline_dp.Metrics.VARIANCE],
        min_value=variance_params.min_value,
        max_value=variance_params.max_value,
        max_partitions_contributed=(
            variance_params.max_partitions_contributed),
        max_contributions_per_partition=(
            variance_params.max_contributions_per_partition),
        budget_weight=variance_params.budget_weight)

    # aggregated : (partition_key, (variance=dp_variance))
    aggregated = engine.aggregate(self._rdd, engine_params, extractors,
                                  public_partitions)

    # Only one metric was requested; pull it out of the per-partition
    # metrics object.
    # result : (partition_key, dp_variance)
    return spark_backend.map_values(aggregated, lambda v: v.variance,
                                    "Extract variance")
def test_aggregate_public_partitions_drop_non_public(self):
    # Arrange
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[
            agg.Metrics.COUNT,
            agg.Metrics.SUM,
            agg.Metrics.PRIVACY_ID_COUNT,
        ],
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        min_value=0,
        max_value=1,
        public_partitions=["pk0", "pk1", "pk10"])

    # The budget is arbitrary: only the partition keys are checked, not the
    # DP outputs.
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)

    # 10 input elements: each privacy id contributes once and each partition
    # ("pk0" .. "pk9") holds exactly one element.
    rows = list(range(10))
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)

    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    output = engine.aggregate(col=rows,
                              params=params,
                              data_extractors=extractors)
    accountant.compute_budgets()
    partition_keys = [entry[0] for entry in list(output)]

    # Assert
    # Exactly the public partitions ("pk0", "pk1", "pk10") should be in the
    # output; all other input partitions should be dropped.
    self.assertEqual(["pk0", "pk1", "pk10"], partition_keys)
def privacy_id_count(
        self,
        privacy_id_count_params: aggregate_params.PrivacyIdCountParams,
        public_partitions=None) -> RDD:
    """Computes a DP Privacy ID count over the wrapped RDD.

    Elements of the RDD are read as `(privacy_id, record)` pairs: index 0 is
    the privacy id, index 1 is passed to the partition extractor from
    `privacy_id_count_params`.

    Args:
        privacy_id_count_params: parameters for the calculation.
        public_partitions: A collection of partition keys that will be
            present in the result. Optional. If not provided, partitions
            will be selected in a DP manner.

    Returns:
        The aggregation output with each value replaced by its
        `privacy_id_count` field, i.e. `(partition_key, dp_privacy_id_count)`.
    """
    spark_backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    engine = pipeline_dp.DPEngine(self._budget_accountant, spark_backend)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: privacy_id_count_params.
        partition_extractor(x[1]),
        # PrivacyIdCount ignores values.
        value_extractor=lambda x: None)

    engine_params = pipeline_dp.AggregateParams(
        noise_kind=privacy_id_count_params.noise_kind,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=(
            privacy_id_count_params.max_partitions_contributed),
        # The per-partition bound is fixed to 1 for this metric.
        max_contributions_per_partition=1)

    # aggregated : (partition_key, (privacy_id_count=dp_privacy_id_count))
    aggregated = engine.aggregate(self._rdd, engine_params, extractors,
                                  public_partitions)

    # Only one metric was requested; pull it out of the per-partition
    # metrics object.
    # result : (partition_key, dp_privacy_id_count)
    return spark_backend.map_values(aggregated,
                                    lambda v: v.privacy_id_count,
                                    "Extract privacy id count")
def test_aggregate_report(self, mock_create_accumulator_params_function):
    # Two aggregate() calls on one engine must register two report
    # generators.
    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: "pid" + str(x),
        partition_extractor=lambda x: "pk" + str(x),
        value_extractor=lambda x: x)

    first_params = pipeline_dp.AggregateParams(
        metrics=[
            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.MEAN,
        ],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        low=1,
        high=5,
    )
    second_params = pipeline_dp.AggregateParams(
        metrics=[
            pipeline_dp.Metrics.VAR,
            pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.MEAN,
        ],
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        low=2,
        high=10,
        public_partitions=list(range(1, 40)),
    )

    # Canned accumulator params returned by the mocked factory function.
    count_accumulator_params = pipeline_dp.accumulator.AccumulatorParams(
        pipeline_dp.accumulator.CountAccumulator, None)
    mock_create_accumulator_params_function.return_value = [
        count_accumulator_params
    ]

    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    engine = pipeline_dp.DPEngine(
        budget_accountant=accountant,
        ops=pipeline_dp.LocalPipelineOperations())

    engine.aggregate(rows, first_params, extractors)
    engine.aggregate(rows, second_params, extractors)

    self.assertEqual(len(engine._report_generators), 2)  # pylint: disable=protected-access
def test_aggregate_computation_graph_verification(
        self, mock_bound_contributions):
    # Verifies that aggregate() passes both contribution bounds from the
    # params to the (mocked) contribution-bounding step.

    # Arrange
    params = pipeline_dp.AggregateParams([agg.Metrics.COUNT], 5, 3)
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    factory = AccumulatorFactory(params=params,
                                 budget_accountant=accountant)
    factory.initialize()

    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: "pid" + str(x),
        partition_extractor=lambda x: "pk" + str(x),
        value_extractor=lambda x: x)

    # Canned output of the mocked bounding step.
    bounded_rows = [
        [("pid1", "pk1"), CountAccumulator(params=None, values=[1])],
        [("pid2", "pk2"), CountAccumulator(params=None, values=[1])],
        [("pid3", "pk3"), CountAccumulator(params=None, values=[2])],
    ]
    mock_bound_contributions.return_value = bounded_rows

    engine = pipeline_dp.DPEngine(
        budget_accountant=accountant,
        ops=pipeline_dp.LocalPipelineOperations())

    # Act
    _ = engine.aggregate(col=rows, params=params, data_extractors=extractors)

    # Assert
    mock_bound_contributions.assert_called_with(
        unittest.mock.ANY,
        params.max_partitions_contributed,
        params.max_contributions_per_partition,
        unittest.mock.ANY)
def select_partitions(
        self,
        select_partitions_params: aggregate_params.SelectPartitionsParams,
        partition_extractor: Callable) -> RDD:
    """Computes a collection of partition keys in a DP manner.

    Elements of the RDD are read as `(privacy_id, record)` pairs: index 0 is
    the privacy id, index 1 is passed to `partition_extractor`.

    Args:
        select_partitions_params: parameters for the calculation.
        partition_extractor: function for extracting the partition key from
            each input element.

    Returns:
        The result of `DPEngine.select_partitions`.
    """
    engine = pipeline_dp.DPEngine(
        self._budget_accountant,
        pipeline_dp.SparkRDDBackend(self._rdd.context))

    # Only max_partitions_contributed is carried over to the engine params.
    engine_params = pipeline_dp.SelectPartitionsParams(
        max_partitions_contributed=(
            select_partitions_params.max_partitions_contributed))

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: partition_extractor(x[1]))

    return engine.select_partitions(self._rdd, engine_params, extractors)
def test_aggregate_public_partition_applied(self):
    # Arrange
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-10)
    public_partitions = ["pk0", "pk1", "pk101"]

    # 100 input elements: each privacy id contributes once and each
    # partition ("pk0" .. "pk99") holds exactly one element.
    rows = list(range(100))
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: None)

    engine = dp_engine.UtilityAnalysisEngine(
        budget_accountant=accountant, backend=pipeline_dp.LocalBackend())

    # Act
    output = engine.aggregate(col=rows,
                              params=params,
                              data_extractors=extractors,
                              public_partitions=public_partitions)
    accountant.compute_budgets()
    output = list(output)

    # Assert that public partitions are applied: pk0 and pk1 are kept, and
    # pk101 (absent from the data) is added.
    self.assertEqual(len(output), 3)
    self.assertTrue(any(entry[0] == "pk101" for entry in output))
def privacy_id_count(
    self, privacy_id_count_params: aggregate_params.PrivacyIdCountParams
) -> RDD:
    """Computes a DP Privacy ID count over the wrapped RDD.

    Elements of the RDD are read as `(privacy_id, record)` pairs: index 0 is
    the privacy id, index 1 is passed to the partition extractor from
    `privacy_id_count_params`.

    Args:
        privacy_id_count_params: parameters for the calculation.

    Returns:
        The aggregation output with each value replaced by its first
        element, i.e. `(partition_key, dp_privacy_id_count)`.
    """
    spark_backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    engine = pipeline_dp.DPEngine(self._budget_accountant, spark_backend)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: privacy_id_count_params.
        partition_extractor(x[1]),
        # PrivacyIdCount ignores values.
        value_extractor=lambda x: None)

    engine_params = pipeline_dp.AggregateParams(
        noise_kind=privacy_id_count_params.noise_kind,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=(
            privacy_id_count_params.max_partitions_contributed),
        # The per-partition bound is fixed to 1 for this metric.
        max_contributions_per_partition=1,
        public_partitions=privacy_id_count_params.public_partitions)

    # aggregated : (partition_key, [dp_privacy_id_count])
    aggregated = engine.aggregate(self._rdd, engine_params, extractors)

    # Only one metric (privacy_id_count) was requested, so take the single
    # element out of the per-partition container.
    # result : (partition_key, dp_privacy_id_count)
    return spark_backend.map_values(aggregated, lambda v: v[0],
                                    "Unnest list")
def test_aggregate_private_partition_selection_drop_many(self):
    """Checks that private partition selection drops most partitions."""
    # Arrange
    # The input has 100 elements: each privacy id contributes exactly once
    # and each partition contains exactly one element.
    input_data = list(range(100))
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: None)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[agg.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # A small budget, so that most partition keys are dropped.
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=input_data,
                              params=params,
                              data_extractors=extractors)
    accountant.compute_budgets()
    kept_partitions = list(result)

    # Assert
    # Most partitions should be dropped by private partition selection.
    # This test is non-deterministic, but it should pass with probability
    # very close to 1.
    self.assertLess(len(kept_partitions), 5)
def test_aggregate_private_partition_selection_keep_everything(self):
    """Checks that all partitions are kept when the budget is very large."""
    # Arrange
    # Values 0..9 map to partition "pk0" (10 elements) and values 100..119
    # map to partition "pk1" (20 elements); every value is its own
    # privacy id.
    input_data = [*range(10), *range(100, 120)]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x//100}",
        value_extractor=lambda x: None)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[agg.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # A large budget gives small noise and keeps all partition keys.
    accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                       total_delta=1e-10)
    engine = pipeline_dp.DPEngine(budget_accountant=accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=input_data,
                              params=params,
                              data_extractors=extractors)
    accountant.compute_budgets()
    output = list(result)

    # Assert
    expected_counts = {"pk0": 10, "pk1": 20}
    self.assertEqual(2, len(output))  # all partition keys are kept.
    for partition_key, metrics in output:
        self.assertAlmostEqual(expected_counts[partition_key],
                               metrics.count,
                               delta=1e-3)
def main(unused_argv):
    """Computes DP count and sum of money spent per day and saves them.

    Reads the visits from FLAGS.input_file, aggregates them with PipelineDP
    on the local backend and writes the result to FLAGS.output_file.

    Args:
        unused_argv: not used.

    Returns:
        0.
    """
    # The local backend does not depend on any pipeline framework; it is
    # implemented in pure Python in PipelineDP. It keeps all data in memory
    # and is not optimized for large data. For datasets smaller than ~tens
    # of megabytes, local execution without any framework is faster than
    # local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # The privacy budget available for the whole computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load the input and rename its columns to identifier-friendly names.
    column_names = {
        'VisitorId': 'user_id',
        'Time entered': 'enter_time',
        'Time spent (minutes)': 'spent_minutes',
        'Money spent (euros)': 'spent_money',
        'Day': 'day'
    }
    visits = pd.read_csv(FLAGS.input_file)
    visits = visits.rename(columns=column_names)
    restaurant_visit_rows = [row for _, row in visits.iterrows()]

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    # Which DP metrics to compute and how contributions are bounded.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=0,
        max_value=60)

    # How to get the privacy_id, partition_key and value from a visit row.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    # Build the computational graph for the aggregation. All computations
    # are lazy: the result is iterable, but iterating it would fail until
    # the budget is computed (below). It's possible to call
    # DPEngine.aggregate multiple times with different metrics to compute.
    lazy_result = dp_engine.aggregate(restaurant_visit_rows,
                                      params,
                                      data_extractors,
                                      public_partitions=list(range(1, 8)))

    budget_accountant.compute_budgets()

    # Iterating the lazy result is what runs the computations and turns it
    # into actual results.
    results = list(lazy_result)

    # Save the results
    write_to_file(results, FLAGS.output_file)

    return 0