def get_private_movies(movie_views, backend):
    """Returns a differentially private selection of movies.

    No metrics are computed here: the only output is the result of DP
    partition selection, where the partition key of a record is its movie id.

    Args:
        movie_views: collection of movie view records; every record must
          expose `movie_id` and `user_id` attributes.
        backend: pipeline backend handed over to the DPEngine.

    Returns:
        Whatever `DPEngine.select_partitions` returns for the given backend.
    """
    # Total privacy budget available to this computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=0.1,
                                                   total_delta=1e-6)
    engine = pipeline_dp.DPEngine(accountant, backend)

    # The movie is the partition key and the user is the privacy unit.
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda view: view.user_id,
        partition_extractor=lambda view: view.movie_id)
    selection_params = pipeline_dp.SelectPartitionsParams(
        max_partitions_contributed=2)

    private_movies = engine.select_partitions(movie_views,
                                              selection_params,
                                              data_extractors=extractors)

    # Budgets are computed only after the selection has been requested.
    accountant.compute_budgets()
    return private_movies
Beispiel #2
0
    def select_partitions(
            self,
            select_partitions_params: aggregate_params.SelectPartitionsParams,
            partition_extractor: Callable) -> RDD:
        """Selects partition keys from the wrapped RDD in a DP manner.

        Args:
            select_partitions_params: parameters of the selection; only its
              max_partitions_contributed field is forwarded to the DP engine.
            partition_extractor: callable applied to the second component of
              each RDD element to obtain its partition key.

        Returns:
            The result of DPEngine.select_partitions on the wrapped RDD.
        """
        spark_backend = pipeline_dp.SparkRDDBackend(self._rdd.context)
        engine = pipeline_dp.DPEngine(self._budget_accountant, spark_backend)

        max_partitions = select_partitions_params.max_partitions_contributed
        engine_params = pipeline_dp.SelectPartitionsParams(
            max_partitions_contributed=max_partitions)

        # Elements are indexed as row[0] -> privacy id, row[1] -> the value
        # the caller-supplied partition extractor is applied to.
        extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda row: row[0],
            partition_extractor=lambda row: partition_extractor(row[1]))

        return engine.select_partitions(self._rdd, engine_params, extractors)
Beispiel #3
0
    def test_check_select_partitions(self):
        """Tests validation of parameters for select_partitions().

        Every case supplies exactly one invalid argument (the collection, the
        params or the data extractors) and expects select_partitions() to
        raise.
        """
        default_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: x,
            value_extractor=lambda x: x,
        )

        def make_params(max_partitions_contributed=1):
            # A fresh params object per case so that cases share no state.
            return pipeline_dp.SelectPartitionsParams(
                max_partitions_contributed=max_partitions_contributed)

        # (description, collection, params, data extractors)
        test_cases = [
            ("None col", None, make_params(), default_extractor),
            ("empty col", [], make_params(), default_extractor),
            ("none params", [0], None, default_extractor),
            ("negative max_partitions_contributed", [0], make_params(-1),
             default_extractor),
            ("float max_partitions_contributed", [0], make_params(1.1),
             default_extractor),
            ("None data_extractor", [0], make_params(), None),
            ("Not a function data_extractor", [0], make_params(), 1),
        ]

        for desc, col, params, data_extractor in test_cases:
            # subTest reports every failing case instead of stopping at the
            # first one.
            with self.subTest(msg=desc):
                # Build the fixtures outside assertRaises: an exception raised
                # during setup must fail the test, not satisfy it.
                budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-10)
                engine = pipeline_dp.DPEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                with self.assertRaises(Exception, msg=desc):
                    engine.select_partitions(col, params, data_extractor)