Example #1
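Runs the pipeline locally end to end: parses the movie-views input file, computes the DP rating metrics over a set of public partitions using LocalPipelineOperations, and writes the result to the output file (input and output paths come from command-line FLAGS).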
def compute_on_local():
    public_partitions = get_public_partitions()
    movie_views = parse_file(FLAGS.input_file)
    pipeline_operations = pipeline_dp.LocalPipelineOperations()
    dp_result = list(
        calc_dp_rating_metrics(movie_views, pipeline_operations,
                               public_partitions))
    write_to_file(dp_result, FLAGS.output_file)
Example #2
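Checks that contribution bounding copes with an empty input: bounding an empty collection yields an empty result.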
    def test_contribution_bounding_empty_col(self):
        input_col = []
        max_partitions_contributed = 2
        max_contributions_per_partition = 2

        dp_engine = pipeline_dp.DPEngine(
            NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bound_result = list(
            dp_engine._bound_contributions(
                input_col,
                max_partitions_contributed=max_partitions_contributed,
                max_contributions_per_partition=max_contributions_per_partition,
                aggregator_fn=dp_engineTest.aggregator_fn))

        self.assertFalse(bound_result)
Example #3
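Exercises DPEngine's private partition selection on locally grouped data, checking which groups survive mocked selection thresholds of 0, 3, and 100.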
    def test_select_private_partitions(self):
        input_col = [("pid1", ('pk1', 1)), ("pid1", ('pk1', 2)),
                     ("pid1", ('pk2', 3)), ("pid1", ('pk2', 4)),
                     ("pid1", ('pk2', 5)), ("pid1", ('pk3', 6)),
                     ("pid1", ('pk4', 7)), ("pid2", ('pk4', 8))]
        max_partitions_contributed = 3
        engine = pipeline_dp.DPEngine(
            NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        groups = engine._ops.group_by_key(input_col, None)
        groups = engine._ops.map_values(groups,
                                        lambda group: _MockAccumulator(group))
        groups = list(groups)
        expected_data_filtered = [("pid1",
                                   _MockAccumulator([
                                       ('pk1', 1),
                                       ('pk1', 2),
                                       ('pk2', 3),
                                       ('pk2', 4),
                                       ('pk2', 5),
                                       ('pk3', 6),
                                       ('pk4', 7),
                                   ])),
                                  ("pid2", _MockAccumulator([('pk4', 8)]))]
        # Threshold 0: no partition is dropped.
        self._mock_and_assert_private_partitions(engine, groups, 0,
                                                 expected_data_filtered,
                                                 max_partitions_contributed)
        expected_data_filtered = [
            ("pid1",
             _MockAccumulator([
                 ('pk1', 1),
                 ('pk1', 2),
                 ('pk2', 3),
                 ('pk2', 4),
                 ('pk2', 5),
                 ('pk3', 6),
                 ('pk4', 7),
             ])),
        ]
        # Threshold 3: only "pid1" has enough contributions to be kept.
        self._mock_and_assert_private_partitions(engine, groups, 3,
                                                 expected_data_filtered,
                                                 max_partitions_contributed)
        # Threshold 100: every partition is filtered out.
        expected_data_filtered = []
        self._mock_and_assert_private_partitions(engine, groups, 100,
                                                 expected_data_filtered,
                                                 max_partitions_contributed)
Example #4
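Checks that input which already satisfies both contribution bounds passes through _bound_contributions with nothing dropped; the test aggregator returns (count, sum, sum of squares) per (privacy ID, partition) pair.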
    def test_contribution_bounding_bound_input_nothing_dropped(self):
        input_col = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2),
                     ("pid1", 'pk2', 3), ("pid1", 'pk2', 4)]
        max_partitions_contributed = 2
        max_contributions_per_partition = 2

        dp_engine = pipeline_dp.DPEngine(
            NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bound_result = list(
            dp_engine._bound_contributions(
                input_col,
                max_partitions_contributed=max_partitions_contributed,
                max_contributions_per_partition=max_contributions_per_partition,
                aggregator_fn=dp_engineTest.aggregator_fn))

        expected_result = [(('pid1', 'pk2'), (2, 7, 25)),
                           (('pid1', 'pk1'), (2, 3, 5))]
        self.assertEqual(set(expected_result), set(bound_result))
Example #5
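Verifies the computation graph built by DPEngine.aggregate: with _bound_contributions mocked out (presumably via a mock.patch decorator not shown in this excerpt), the test asserts that it is called with the contribution-bounding parameters from AggregateParams.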
    def test_aggregate_computation_graph_verification(
            self, mock_bound_contributions):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            [pipeline_dp.Metrics.COUNT], 5, 3)
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)
        accumulator_factory = AccumulatorFactory(
            params=aggregator_params, budget_accountant=budget_accountant)
        accumulator_factory.initialize()

        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: "pid" + str(x),
            partition_extractor=lambda x: "pk" + str(x),
            value_extractor=lambda x: x)

        mock_bound_contributions.return_value = [
            [("pid1", "pk1"),
             CountAccumulator(params=None, values=[1])],
            [("pid2", "pk2"),
             CountAccumulator(params=None, values=[1])],
            [("pid3", "pk3"),
             CountAccumulator(params=None, values=[2])],
        ]

        engine = pipeline_dp.DPEngine(
            budget_accountant=budget_accountant,
            ops=pipeline_dp.LocalPipelineOperations())
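
        # Act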
        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)

        # Assert
        mock_bound_contributions.assert_called_with(
            unittest.mock.ANY, aggregator_params.max_partitions_contributed,
            aggregator_params.max_contributions_per_partition,
            unittest.mock.ANY)
Example #6
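Checks that DPEngine keeps one report generator per aggregation: two aggregate calls with different parameters leave two entries in _report_generators (accumulator-params creation is mocked; the patch decorator is not shown in this excerpt).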
    def test_aggregate_report(self, mock_create_accumulator_params_function):
        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: "pid" + str(x),
            partition_extractor=lambda x: "pk" + str(x),
            value_extractor=lambda x: x)
        params1 = pipeline_dp.AggregateParams(
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            low=1,
            high=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            low=2,
            high=10,
            metrics=[
                pipeline_dp.Metrics.VAR, pipeline_dp.Metrics.SUM,
                pipeline_dp.Metrics.MEAN
            ],
            public_partitions=list(range(1, 40)),
        )
        mock_create_accumulator_params_function.return_value = [
            pipeline_dp.accumulator.AccumulatorParams(
                pipeline_dp.accumulator.CountAccumulator, None)
        ]
        engine = pipeline_dp.DPEngine(
            budget_accountant=NaiveBudgetAccountant(total_epsilon=1,
                                                    total_delta=1e-10),
            ops=pipeline_dp.LocalPipelineOperations())
        engine.aggregate(col, params1, data_extractor)
        engine.aggregate(col, params2, data_extractor)
        self.assertEqual(len(engine._report_generators), 2)  # pylint: disable=protected-access
Example #7
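Verifies cross-partition contribution bounding: each privacy ID contributes to at most max_partitions_contributed partitions, and every kept record also respects the per-partition cap.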
    def test_contribution_bounding_cross_partition_bounding_applied(self):
        input_col = [
            ("pid1", 'pk1', 1), ("pid1", 'pk1', 2), ("pid1", 'pk2', 3),
            ("pid1", 'pk2', 4), ("pid1", 'pk2', 5), ("pid1", 'pk3', 6),
            ("pid1", 'pk4', 7), ("pid2", 'pk4', 8)
        ]
        max_partitions_contributed = 3
        max_contributions_per_partition = 5

        dp_engine = pipeline_dp.DPEngine(
            NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bound_result = list(
            dp_engine._bound_contributions(
                input_col,
                max_partitions_contributed=max_partitions_contributed,
                max_contributions_per_partition=max_contributions_per_partition,
                aggregator_fn=dp_engineTest.aggregator_fn))

        self.assertEqual(len(bound_result), 4)
        # Check contributions per partition
        self.assertTrue(
            all(
                map(
                    lambda op_val: op_val[1][0] <=
                    max_contributions_per_partition, bound_result)))
        # Check cross-partition contributions
        dict_of_pid_to_pk = collections.defaultdict(lambda: [])
        for key, _ in bound_result:
            dict_of_pid_to_pk[key[0]].append(key[1])
        self.assertEqual(len(dict_of_pid_to_pk), 2)
        self.assertTrue(
            all(
                map(
                    lambda key: len(dict_of_pid_to_pk[key]) <=
                    max_partitions_contributed, dict_of_pid_to_pk)))
Example #8
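Verifies per-partition contribution bounding: contributions within each (privacy ID, partition) pair are capped at max_contributions_per_partition, so six input records collapse into three bounded results.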
    def test_contribution_bounding_per_partition_bounding_applied(self):
        input_col = [("pid1", 'pk1', 1), ("pid1", 'pk1', 2),
                     ("pid1", 'pk2', 3), ("pid1", 'pk2', 4),
                     ("pid1", 'pk2', 5), ("pid2", 'pk2', 6)]
        max_partitions_contributed = 5
        max_contributions_per_partition = 2

        dp_engine = pipeline_dp.DPEngine(
            NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
            pipeline_dp.LocalPipelineOperations())
        bound_result = list(
            dp_engine._bound_contributions(
                input_col,
                max_partitions_contributed=max_partitions_contributed,
                max_contributions_per_partition=max_contributions_per_partition,
                aggregator_fn=dp_engineTest.aggregator_fn))

        self.assertEqual(len(bound_result), 3)
        # Check contributions per partition
        self.assertTrue(
            all(
                map(
                    lambda op_val: op_val[1][0] <=
                    max_contributions_per_partition, bound_result)))