Example 1
    def test_run_e2e_spark(self):
        import pyspark
        conf = pyspark.SparkConf()
        sc = pyspark.SparkContext.getOrCreate(conf=conf)
        input = sc.parallelize(list(range(10)))

        output = self.run_e2e_private_partition_selection_large_budget(
            input, pipeline_dp.SparkRDDBackend(sc))

        self.assertEqual(5, len(output.collect()))
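
The helper run_e2e_private_partition_selection_large_budget is defined elsewhere in the test suite. A minimal sketch of what such a helper could look like, assuming pipeline_dp's NaiveBudgetAccountant and DPEngine and a hypothetical partition extractor that spreads the ten input elements over five keys (matching the assertion above):

    def run_e2e_private_partition_selection_large_budget(self, col, backend):
        # Hypothetical sketch, not the actual helper from the test suite.
        # A very large budget makes both the noise and the partition-selection
        # threshold negligible, so all five partition keys produced by the
        # extractor below are expected to survive.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1000, total_delta=0.999)
        dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)
        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: x % 5,  # hypothetical: 5 partition keys
            privacy_id_extractor=lambda x: x,
            value_extractor=lambda x: x)

        dp_result = dp_engine.aggregate(col, params, data_extractors)
        budget_accountant.compute_budgets()
        return dp_result
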
Example 2
def compute_on_spark():
    master = "local[1]"  # run Spark locally with one worker thread to load the input file into 1 partition
    conf = pyspark.SparkConf().setMaster(master)
    sc = pyspark.SparkContext(conf=conf)
    movie_views = sc.textFile(FLAGS.input_file) \
        .mapPartitions(parse_partition)
    pipeline_backend = pipeline_dp.SparkRDDBackend(sc)
    dp_result = calculate_private_result(movie_views, pipeline_backend)

    delete_if_exists(FLAGS.output_file)
    dp_result.saveAsTextFile(FLAGS.output_file)
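
parse_partition, calculate_private_result, and delete_if_exists are defined elsewhere in the example. A hedged sketch of what calculate_private_result might look like with the pipeline_dp API; the movie_id, user_id, and rating record fields are assumptions about what parse_partition produces:

def calculate_private_result(movie_views, pipeline_backend):
    # Sketch under assumptions: each element of movie_views is a parsed record
    # with user_id (privacy unit), movie_id (partition key) and rating (value).
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)
    dp_engine = pipeline_dp.DPEngine(budget_accountant, pipeline_backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5)
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)
    # Budgets must be computed before the pipeline runs
    # (saveAsTextFile triggers the actual Spark computation).
    budget_accountant.compute_budgets()
    return dp_result
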
Example 3
    def variance(self,
                 variance_params: aggregate_params.VarianceParams,
                 public_partitions=None) -> RDD:
        """Computes a DP variance.

        Args:
            variance_params: parameters for calculation
            public_partitions: A collection of partition keys that will be
                present in the result. Optional. If not provided, partitions
                will be selected in a DP manner.
        """

        backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=variance_params.noise_kind,
            metrics=[pipeline_dp.Metrics.VARIANCE],
            max_partitions_contributed=variance_params.
            max_partitions_contributed,
            max_contributions_per_partition=variance_params.
            max_contributions_per_partition,
            min_value=variance_params.min_value,
            max_value=variance_params.max_value,
            budget_weight=variance_params.budget_weight)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: variance_params.partition_extractor(
                x[1]),
            privacy_id_extractor=lambda x: x[0],
            value_extractor=lambda x: variance_params.value_extractor(x[1]))

        dp_result = dp_engine.aggregate(self._rdd, params, data_extractors,
                                        public_partitions)
        # dp_result : (partition_key, (variance=dp_variance))

        # aggregate() returns a namedtuple of metrics for each partition key.
        # There is only one metric here - variance. Extract it from the namedtuple.
        dp_result = backend.map_values(dp_result, lambda v: v.variance,
                                       "Extract variance")
        # dp_result : (partition_key, dp_variance)

        return dp_result
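
A possible usage sketch for this method; make_private from pipeline_dp.private_spark and the movie_id / user_id / rating record fields are assumptions, not part of the snippet above:

from pipeline_dp import private_spark

budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-6)
# Wrap the RDD so that every element is keyed by its privacy ID.
private_rdd = private_spark.make_private(movie_views_rdd, budget_accountant,
                                         lambda mv: mv.user_id)

dp_variance = private_rdd.variance(
    aggregate_params.VarianceParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        partition_extractor=lambda mv: mv.movie_id,
        value_extractor=lambda mv: mv.rating))

budget_accountant.compute_budgets()
print(dp_variance.collect())
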
Example 4
    def privacy_id_count(
            self,
            privacy_id_count_params: aggregate_params.PrivacyIdCountParams,
            public_partitions=None) -> RDD:
        """Computes a DP Privacy ID count.

        Args:
            privacy_id_count_params: parameters for calculation
            public_partitions: A collection of partition keys that will be
                present in the result. Optional. If not provided, partitions
                will be selected in a DP manner.
        """

        backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=privacy_id_count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=privacy_id_count_params.
            max_partitions_contributed,
            max_contributions_per_partition=1)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: privacy_id_count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # PrivacyIdCount ignores values.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(self._rdd, params, data_extractors,
                                        public_partitions)
        # dp_result : (partition_key, (privacy_id_count=dp_privacy_id_count))

        # aggregate() returns a namedtuple of metrics for each partition key.
        # There is only one metric here - privacy_id_count. Extract it from the
        # namedtuple.
        dp_result = backend.map_values(dp_result, lambda v: v.privacy_id_count,
                                       "Extract privacy id count")
        # dp_result : (partition_key, dp_privacy_id_count)

        return dp_result
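
A short usage sketch showing how public partitions could be supplied; it reuses the private_rdd from the sketch above, and the movie IDs are made up for illustration:

# With public partitions the result contains exactly these keys and no
# private partition selection is performed.
dp_id_count = private_rdd.privacy_id_count(
    aggregate_params.PrivacyIdCountParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        partition_extractor=lambda mv: mv.movie_id),
    public_partitions=["movie_1", "movie_2", "movie_3"])
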
Example 5
    def select_partitions(
            self,
            select_partitions_params: aggregate_params.SelectPartitionsParams,
            partition_extractor: Callable) -> RDD:
        """Computes a collection of partition keys in a DP manner.

        Args:
            select_partitions_params: parameters for calculation
            partition_extractor: function for extracting partition key from each input element
        """

        backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.SelectPartitionsParams(
            max_partitions_contributed=select_partitions_params.
            max_partitions_contributed)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0])

        return dp_engine.select_partitions(self._rdd, params, data_extractors)
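
A possible usage sketch, again reusing the private_rdd from the earlier sketch:

# Release only the set of partition keys (movie IDs), selected in a DP manner.
dp_partitions = private_rdd.select_partitions(
    aggregate_params.SelectPartitionsParams(max_partitions_contributed=2),
    partition_extractor=lambda mv: mv.movie_id)
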
Example 6
    def privacy_id_count(
            self, privacy_id_count_params: aggregate_params.PrivacyIdCountParams
    ) -> RDD:
        """Computes a DP Privacy ID count.

        Args:
            privacy_id_count_params: parameters for calculation
        """

        backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

        params = pipeline_dp.AggregateParams(
            noise_kind=privacy_id_count_params.noise_kind,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=privacy_id_count_params.
            max_partitions_contributed,
            max_contributions_per_partition=1,
            public_partitions=privacy_id_count_params.public_partitions)

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: privacy_id_count_params.
            partition_extractor(x[1]),
            privacy_id_extractor=lambda x: x[0],
            # PrivacyIdCount ignores values.
            value_extractor=lambda x: None)

        dp_result = dp_engine.aggregate(self._rdd, params, data_extractors)
        # dp_result : (partition_key, [dp_privacy_id_count])

        # aggregate() returns a list of metrics for each partition key.
        # There is only one metric here - privacy_id_count. Unwrap it from the list.
        dp_result = backend.map_values(dp_result, lambda v: v[0], "Unnest list")
        # dp_result : (partition_key, dp_privacy_id_count)

        return dp_result