def main(unused_argv):
    """Computes a differentially-private sum over restaurant-visit data.

    Reads the CSV named by FLAGS.input_file, sums money spent per day under
    differential privacy, and writes the result to FLAGS.output_file.
    """
    # A local Beam runner is enough for this demo; point beam.Pipeline at a
    # cluster runner (e.g. on a cloud provider) for a truly distributed run.
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # The privacy budget shared by every DP aggregation in this pipeline.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Load the input CSV and normalize its column names.
        df = pd.read_csv(FLAGS.input_file)
        column_mapping = {
            'VisitorId': 'user_id',
            'Time entered': 'enter_time',
            'Time spent (minutes)': 'spent_minutes',
            'Money spent (euros)': 'spent_money',
            'Day': 'day'
        }
        df.rename(inplace=True, columns=column_mapping)
        visit_rows = [row for _, row in df.iterrows()]
        beam_data = pipeline | beam.Create(visit_rows)

        # Wrap Beam's PCollection into its privacy-aware counterpart.
        private_visits = beam_data | private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # DP sum of money spent, keyed by day, with per-user contribution
        # bounds and values clamped to [1, 100].
        sum_params = SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                               max_partitions_contributed=7,
                               max_contributions_per_partition=2,
                               min_value=1,
                               max_value=100,
                               budget_weight=1,
                               public_partitions=None,
                               partition_extractor=lambda row: row.day,
                               value_extractor=lambda row: row.spent_money)
        dp_result = private_visits | private_beam.Sum(sum_params)
        budget_accountant.compute_budgets()

        # Persist the DP results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def main(unused_argv):
    """Computes a differentially-private sum of movie ratings per movie.

    Reads raw text from FLAGS.input_file, parses it into movie-view records,
    sums ratings per movie under DP, and writes to FLAGS.output_file.
    """
    # A local Beam runner is enough for this demo; connect beam.Pipeline to a
    # cluster runner (e.g. on a cloud provider) for a distributed run.
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # The privacy budget shared by every DP aggregation in this pipeline.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Read and parse the input into movie-view records.
        movie_views_pcol = (pipeline
                            | beam.io.ReadFromText(FLAGS.input_file)
                            | beam.ParDo(ParseFile()))

        # Wrap Beam's PCollection into its privacy-aware counterpart.
        private_movie_views = (
            movie_views_pcol | 'Create private collection' >> MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=lambda mv: mv.user_id))

        # Per-user contribution limits for the DP sum: at most two movies
        # rated per user, at most one rating per movie, and ratings clamped
        # to [1, 5]. Partitions are movies; aggregated values are ratings.
        sum_params = SumParams(
            max_partitions_contributed=2,
            max_contributions_per_partition=1,
            min_value=1,
            max_value=5,
            partition_extractor=lambda mv: mv.movie_id,
            value_extractor=lambda mv: mv.rating)
        dp_result = (private_movie_views
                     | "Private Sum" >> private_beam.Sum(sum_params))
        budget_accountant.compute_budgets()

        # Persist the DP results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def test_sum_calls_aggregate_with_params(self, mock_aggregate):
    """Sum transform must forward its parameters to the mocked aggregate."""
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange: a private collection over the floats 1.0 .. 6.0.
        pcol = pipeline | 'Create produce' >> beam.Create(
            float(i) for i in range(1, 7))
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))
        sum_params = aggregate_params.SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1,
            max_value=5,
            budget_weight=1,
            public_partitions=[],
            partition_extractor=lambda x: f"pk:{x // 10}",
            value_extractor=lambda x: x)

        # Act: apply the Sum transform to the private collection.
        transformer = private_beam.Sum(sum_params=sum_params)
        private_collection | transformer

        # Assert: the transform picked up the budget accountant and invoked
        # aggregate exactly once with AggregateParams mirroring sum_params.
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        call_args = mock_aggregate.call_args[0]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=sum_params.max_partitions_contributed,
            max_contributions_per_partition=(
                sum_params.max_contributions_per_partition),
            min_value=sum_params.min_value,
            max_value=sum_params.max_value,
            public_partitions=sum_params.public_partitions)
        self.assertEqual(expected_params, call_args[1])
def test_sum_with_public_partitions_returns_sensible_result(self):
    """Health check: DP sum with public partitions lands near the true sum."""
    with TestPipeline() as pipeline:
        # Arrange: 30 users contribute +100.0 and 30 users contribute -100.0
        # to the public partition "pubK1"; 30 more users contribute to
        # "privK1", which is not in the public-partition list.
        col = [(f"{u}", "pubK1", 100.0) for u in range(30)]
        col += [(f"{u + 30}", "pubK1", -100.0) for u in range(30)]
        col += [(f"{u + 60}", "privK1", 100.0) for u in range(30)]
        pcol = pipeline | 'Create produce' >> beam.Create(col)

        # A very large epsilon and delta keep the added noise tiny, which
        # minimizes test flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=lambda x: x[0]))
        sum_params = aggregate_params.SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.55,
            max_value=2.7889,
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2],
            public_partitions=["pubK1", "pubK2"])

        # Act
        result = private_collection | private_beam.Sum(sum_params=sum_params)
        budget_accountant.compute_budgets()

        # Assert with a very generous tolerance: this only validates that the
        # result is sensible, not exact, to keep the test stable.
        beam_util.assert_that(
            result,
            beam_util.equal_to(
                [("pubK1", 130.167), ("pubK2", 0.0)],
                equals_fn=lambda e, a: PrivateBeamTest.
                value_per_key_within_tolerance(e, a, 10.0)))