Beispiel #1
0
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._privacy_id_count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=self._privacy_id_count_params.
            max_partitions_contributed,
            max_contributions_per_partition=1,
            public_partitions=self._privacy_id_count_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._privacy_id_count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # PrivacyIdCount ignores values.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_privacy_id_count])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # Here is only one metric - privacy_id_count. Extract it from the list.
        dp_result = backend.map_values(dp_result, lambda v: v.privacy_id_count,
                                       "Extract privacy_id_count")
        # dp_result : (partition_key, dp_privacy_id_count)

        return dp_result
Beispiel #2
0
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._mean_params.noise_kind,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=self._mean_params.
            max_partitions_contributed,
            max_contributions_per_partition=self._mean_params.
            max_contributions_per_partition,
            min_value=self._mean_params.min_value,
            max_value=self._mean_params.max_value,
            public_partitions=self._mean_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._mean_params.partition_extractor(
                x[1]),
            privacy_id_extractor=lambda x: x[0],
            value_extractor=lambda x: self._mean_params.value_extractor(x[1]))

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_sum])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # Here is only one metric - mean. Extract it from the list.
        dp_result = backend.map_values(dp_result, lambda v: v.mean,
                                       "Extract mean")
        # dp_result : (partition_key, dp_sum)

        return dp_result
Beispiel #3
0
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=self._count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=self._count_params.
            max_partitions_contributed,
            max_contributions_per_partition=self._count_params.
            max_contributions_per_partition,
            public_partitions=self._count_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # Count calculates the number of elements per partition key and
            # doesn't use value extractor.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(pcol, params, data_extractors)
        # dp_result : (partition_key, [dp_count])

        # aggregate() returns a namedtuple of metrics for each partition key.
        # Here is only one metric - count. Extract it from the list.
        dp_result = backend.map_values(dp_result, lambda v: v.count,
                                       "Extract sum")
        # dp_result : (partition_key, dp_count)

        return dp_result
Beispiel #4
0
    def test_run_e2e_beam(self):
        with test_pipeline.TestPipeline() as p:
            input = p | "Create input" >> beam.Create(list(range(10)))

            output = self.run_e2e_private_partition_selection_large_budget(
                input, pipeline_dp.BeamBackend())

            beam_util.assert_that(output, beam_util.is_not_empty())
def compute_on_beam():
    runner = fn_api_runner.FnApiRunner()  # local runner
    with beam.Pipeline(runner=runner) as pipeline:
        movie_views = pipeline | beam.io.ReadFromText(
            FLAGS.input_file) | beam.ParDo(ParseFile())
        pipeline_backend = pipeline_dp.BeamBackend()
        dp_result = calculate_private_result(movie_views, pipeline_backend)
        dp_result | beam.io.WriteToText(FLAGS.output_file)
Beispiel #6
0
    def expand(self, pcol: pvalue.PCollection) -> pvalue.PCollection:
        backend = pipeline_dp.BeamBackend()
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: self._partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0])

        dp_result = dp_engine.select_partitions(pcol,
                                                self._select_partitions_params,
                                                data_extractors)

        return dp_result
Beispiel #7
0
 def _create_dp_engine(self):
     backend = pipeline_dp.BeamBackend()
     return backend, pipeline_dp.DPEngine(self._budget_accountant, backend)