Beispiel #1
0
def main(unused_argv):
    # Setup Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:

        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Load and parse input data
        df = pd.read_csv(FLAGS.input_file)
        df.rename(inplace=True,
                  columns={
                      'VisitorId': 'user_id',
                      'Time entered': 'enter_time',
                      'Time spent (minutes)': 'spent_minutes',
                      'Money spent (euros)': 'spent_money',
                      'Day': 'day'
                  })
        restaraunt_visits_rows = [index_row[1] for index_row in df.iterrows()]
        beam_data = pipeline | beam.Create(restaraunt_visits_rows)

        # Wrap Beam's PCollection into it's private version
        private_restaraunt_visits = beam_data | private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # Calculate the private sum
        dp_result = private_restaraunt_visits | private_beam.Sum(
            SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                      max_partitions_contributed=7,
                      max_contributions_per_partition=2,
                      min_value=1,
                      max_value=100,
                      budget_weight=1,
                      public_partitions=None,
                      partition_extractor=lambda row: row.day,
                      value_extractor=lambda row: row.spent_money))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
Beispiel #2
0
def main(unused_argv):
    # Setup Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:

        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Load and parse input data
        movie_views_pcol = pipeline | \
                           beam.io.ReadFromText(FLAGS.input_file) | \
                           beam.ParDo(ParseFile())

        # Wrap Beam's PCollection into it's private version
        private_movie_views = (movie_views_pcol
                               | 'Create private collection' >> MakePrivate(
                                   budget_accountant=budget_accountant,
                                   privacy_id_extractor=lambda mv: mv.user_id))

        # Calculate the private sum
        dp_result = private_movie_views | "Private Sum" >> private_beam.Sum(
            SumParams(
                # Limits to how much one user can contribute:
                # .. at most two movies rated per user
                max_partitions_contributed=2,
                # .. at most one rating for each movie
                max_contributions_per_partition=1,
                # .. with minimal rating of "1"
                min_value=1,
                # .. and maximum rating of "5"
                max_value=5,
                # The aggregation key: we're grouping data by movies
                partition_extractor=lambda mv: mv.movie_id,
                # The value we're aggregating: we're summing up ratings
                value_extractor=lambda mv: mv.rating))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
Beispiel #3
0
    def test_sum_calls_aggregate_with_params(self, mock_aggregate):
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol = pipeline | 'Create produce' >> beam.Create(
                float(i) for i in range(1, 7))
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            sum_params = aggregate_params.SumParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                min_value=1,
                max_value=5,
                budget_weight=1,
                public_partitions=[],
                partition_extractor=lambda x: f"pk:{x // 10}",
                value_extractor=lambda x: x)

            # Act
            transformer = private_beam.Sum(sum_params=sum_params)
            private_collection | transformer

            # Assert
            self.assertEqual(transformer._budget_accountant, budget_accountant)
            mock_aggregate.assert_called_once()

            args = mock_aggregate.call_args[0]

            params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.SUM],
                max_partitions_contributed=sum_params.
                max_partitions_contributed,
                max_contributions_per_partition=sum_params.
                max_contributions_per_partition,
                min_value=sum_params.min_value,
                max_value=sum_params.max_value,
                public_partitions=sum_params.public_partitions)
            self.assertEqual(params, args[1])
Beispiel #4
0
    def test_sum_with_public_partitions_returns_sensible_result(self):
        with TestPipeline() as pipeline:
            # Arrange
            col = [(f"{u}", "pubK1", 100.0) for u in range(30)]
            col += [(f"{u + 30}", "pubK1", -100.0) for u in range(30)]
            col += [(f"{u + 60}", "privK1", 100.0) for u in range(30)]
            pcol = pipeline | 'Create produce' >> beam.Create(col)
            # Use very high epsilon and delta to minimize noise and test
            # flakiness.
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=lambda x: x[0]))

            sum_params = aggregate_params.SumParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                min_value=1.55,
                max_value=2.7889,
                budget_weight=1,
                partition_extractor=lambda x: x[1],
                value_extractor=lambda x: x[2],
                public_partitions=["pubK1", "pubK2"])

            # Act
            result = private_collection | private_beam.Sum(
                sum_params=sum_params)
            budget_accountant.compute_budgets()

            # Assert
            # This is a health check to validate that the result is sensible.
            # Hence, we use a very large tolerance to reduce test flakiness.
            beam_util.assert_that(
                result,
                beam_util.equal_to([("pubK1", 130.167), ("pubK2", 0.0)],
                                   equals_fn=lambda e, a: PrivateBeamTest.
                                   value_per_key_within_tolerance(e, a, 10.0)))