def test_run_e2e_spark(self):
    import pyspark
    conf = pyspark.SparkConf()
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    input = sc.parallelize(list(range(10)))
    output = self.run_e2e_private_partition_selection_large_budget(
        input, pipeline_dp.SparkRDDBackend(sc))
    self.assertEqual(5, len(output.collect()))
def compute_on_spark():
    # Run Spark locally with one worker thread to load the input file into 1 partition.
    master = "local[1]"
    conf = pyspark.SparkConf().setMaster(master)
    sc = pyspark.SparkContext(conf=conf)
    movie_views = sc.textFile(FLAGS.input_file) \
        .mapPartitions(parse_partition)
    pipeline_backend = pipeline_dp.SparkRDDBackend(sc)
    dp_result = calculate_private_result(movie_views, pipeline_backend)
    delete_if_exists(FLAGS.output_file)
    dp_result.saveAsTextFile(FLAGS.output_file)
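# A minimal sketch of what calculate_private_result() could look like: a DP count
# per movie computed with pipeline_dp.DPEngine on the given backend. This is an
# illustrative assumption, not the original example's implementation; the record
# fields (user_id, movie_id, rating) and the budget values are also assumptions.
def calculate_private_result(movie_views, pipeline_backend):
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)
    dp_engine = pipeline_dp.DPEngine(budget_accountant, pipeline_backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=2,
        max_contributions_per_partition=1)
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)
    # Budgets must be computed before the lazy Spark pipeline is materialized.
    budget_accountant.compute_budgets()
    return dp_result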
def variance(self,
             variance_params: aggregate_params.VarianceParams,
             public_partitions=None) -> RDD:
    """Computes a DP variance.

    Args:
        variance_params: parameters for calculation
        public_partitions: A collection of partition keys that will be present
            in the result. Optional. If not provided, partitions will be
            selected in a DP manner.
    """
    backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=variance_params.noise_kind,
        metrics=[pipeline_dp.Metrics.VARIANCE],
        max_partitions_contributed=variance_params.max_partitions_contributed,
        max_contributions_per_partition=variance_params.
        max_contributions_per_partition,
        min_value=variance_params.min_value,
        max_value=variance_params.max_value,
        budget_weight=variance_params.budget_weight)

    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: variance_params.partition_extractor(x[1]),
        privacy_id_extractor=lambda x: x[0],
        value_extractor=lambda x: variance_params.value_extractor(x[1]))

    dp_result = dp_engine.aggregate(self._rdd, params, data_extractors,
                                    public_partitions)
    # dp_result : (partition_key, (variance=dp_variance))

    # aggregate() returns a namedtuple of metrics for each partition key.
    # Here the only metric is variance; extract it from the namedtuple.
    dp_result = backend.map_values(dp_result, lambda v: v.variance,
                                   "Extract variance")
    # dp_result : (partition_key, dp_variance)

    return dp_result
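# A usage sketch for the method above, assuming it belongs to the PrivateRDD
# wrapper returned by pipeline_dp.private_spark.make_private(). The input RDD
# (movie_views_rdd) and its record fields are illustrative assumptions.
import pipeline_dp
from pipeline_dp import aggregate_params, private_spark

budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                      total_delta=1e-6)
private_rdd = private_spark.make_private(movie_views_rdd, budget_accountant,
                                         lambda mv: mv.user_id)
dp_variance = private_rdd.variance(
    aggregate_params.VarianceParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        max_partitions_contributed=2,
        max_contributions_per_partition=2,
        min_value=1,
        max_value=5,
        partition_extractor=lambda mv: mv.movie_id,
        value_extractor=lambda mv: mv.rating))
budget_accountant.compute_budgets()
dp_variance.collect()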
def privacy_id_count(
        self,
        privacy_id_count_params: aggregate_params.PrivacyIdCountParams,
        public_partitions=None) -> RDD:
    """Computes a DP Privacy ID count.

    Args:
        privacy_id_count_params: parameters for calculation
        public_partitions: A collection of partition keys that will be present
            in the result. Optional. If not provided, partitions will be
            selected in a DP manner.
    """
    backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=privacy_id_count_params.noise_kind,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=privacy_id_count_params.
        max_partitions_contributed,
        max_contributions_per_partition=1)

    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: privacy_id_count_params.
        partition_extractor(x[1]),
        privacy_id_extractor=lambda x: x[0],
        # PrivacyIdCount ignores values.
        value_extractor=lambda x: None)

    dp_result = dp_engine.aggregate(self._rdd, params, data_extractors,
                                    public_partitions)
    # dp_result : (partition_key, (privacy_id_count=dp_privacy_id_count))

    # aggregate() returns a namedtuple of metrics for each partition key.
    # Here the only metric is privacy_id_count; extract it from the namedtuple.
    dp_result = backend.map_values(dp_result, lambda v: v.privacy_id_count,
                                   "Extract privacy id count")
    # dp_result : (partition_key, dp_privacy_id_count)

    return dp_result
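# A possible call site for privacy_id_count() with explicitly provided public
# partitions, continuing from the private_rdd and budget_accountant set up in the
# variance sketch above; the partition keys listed here are illustrative.
dp_user_counts = private_rdd.privacy_id_count(
    aggregate_params.PrivacyIdCountParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        max_partitions_contributed=2,
        partition_extractor=lambda mv: mv.movie_id),
    public_partitions=["movie_1", "movie_2"])
budget_accountant.compute_budgets()
dp_user_counts.collect()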
def select_partitions(
        self,
        select_partitions_params: aggregate_params.SelectPartitionsParams,
        partition_extractor: Callable) -> RDD:
    """Computes a collection of partition keys in a DP manner.

    Args:
        select_partitions_params: parameters for calculation
        partition_extractor: function for extracting partition key from each
            input element
    """
    backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

    params = pipeline_dp.SelectPartitionsParams(
        max_partitions_contributed=select_partitions_params.
        max_partitions_contributed)

    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: partition_extractor(x[1]),
        privacy_id_extractor=lambda x: x[0])

    return dp_engine.select_partitions(self._rdd, params, data_extractors)
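# A usage sketch for select_partitions(): privately releasing the set of partition
# keys (e.g. movie ids) that have enough distinct users. It reuses the private_rdd
# and budget_accountant from the variance sketch above; the extractor is an
# illustrative assumption.
dp_partitions = private_rdd.select_partitions(
    aggregate_params.SelectPartitionsParams(max_partitions_contributed=2),
    partition_extractor=lambda mv: mv.movie_id)
budget_accountant.compute_budgets()
dp_partitions.collect()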
def privacy_id_count(
        self, privacy_id_count_params: aggregate_params.PrivacyIdCountParams
) -> RDD:
    """Computes a DP Privacy ID count.

    Args:
        privacy_id_count_params: parameters for calculation
    """
    backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
    dp_engine = pipeline_dp.DPEngine(self._budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=privacy_id_count_params.noise_kind,
        metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
        max_partitions_contributed=privacy_id_count_params.
        max_partitions_contributed,
        max_contributions_per_partition=1,
        public_partitions=privacy_id_count_params.public_partitions)

    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: privacy_id_count_params.
        partition_extractor(x[1]),
        privacy_id_extractor=lambda x: x[0],
        # PrivacyIdCount ignores values.
        value_extractor=lambda x: None)

    dp_result = dp_engine.aggregate(self._rdd, params, data_extractors)
    # dp_result : (partition_key, [dp_privacy_id_count])

    # aggregate() returns a list of metrics for each partition key.
    # Here the only metric is privacy_id_count; unwrap it from the list.
    dp_result = backend.map_values(dp_result, lambda v: v[0], "Unnest list")
    # dp_result : (partition_key, dp_privacy_id_count)

    return dp_result