Example #1
0
    def test_simulated_venn_diagram_reach_by_spend_without_active_pub(self):
        """All-zero spends must give no reach points and consume no privacy.

        Builds a three-publisher simulator, requests the Venn diagram reach
        for spends of 0 on every publisher, and asserts that the result is an
        empty list and that the privacy tracker recorded nothing.
        """
        publishers = [
            PublisherData(
                [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
            ),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
            PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
        ]
        simulator = HaloSimulator(
            DataSet(publishers, "test"),
            SystemParameters(
                [0.4, 0.5, 0.4],
                LiquidLegionsParameters(),
                FakeRandomGenerator(),
            ),
            PrivacyTracker(),
        )

        # Positional arguments: spends, budget, privacy budget split and
        # maximum frequency.
        reach_points = simulator.simulated_venn_diagram_reach_by_spend(
            [0, 0, 0], PrivacyBudget(0.2, 0.4), 0.5, 1
        )

        self.assertEqual([], reach_points)

        # Nothing may have been charged to, or logged by, the tracker.
        tracker = simulator.privacy_tracker
        self.assertEqual(tracker.privacy_consumption.epsilon, 0)
        self.assertEqual(tracker.privacy_consumption.delta, 0)
        self.assertEqual(len(tracker._noising_events), 0)
Example #2
0
 def test_sample_venn_diagram(self, regions, sample_size, expected):
     """Compare `_sample_venn_diagram(regions, sample_size)` to `expected`.

     The simulator is built on an empty data set with a
     FakeRandomGenerator; the three arguments are supplied by the caller
     (presumably a parameterized decorator outside this view).
     """
     system_params = SystemParameters(
         [0], LiquidLegionsParameters(), FakeRandomGenerator()
     )
     empty_data_set = DataSet([], "test")
     simulator = HaloSimulator(
         empty_data_set, system_params, PrivacyTracker()
     )

     sampled = simulator._sample_venn_diagram(regions, sample_size)

     self.assertEqual(sampled, expected)
Example #3
0
    def test_m3_strategy_with_ground_truth(self):
        """Fitted M3 surface should track the data set's own 1+ reach.

        Two publishers are generated, an M3Strategy is fitted with
        `use_ground_truth_for_reach_curves=True`, and the 1+ reach of the
        fitted surface is compared with `DataSet.reach_by_spend` at three
        spend vectors, each with its own absolute tolerance.
        """
        # (gamma_scale, fixed price) for each generated publisher.
        publishers = []
        for gamma_scale, price in ((2, 0.1), (3, 0.05)):
            impressions = HeterogeneousImpressionGenerator(
                1000, gamma_shape=1.0, gamma_scale=gamma_scale
            )()
            publishers.append(
                PublisherData(FixedPriceGenerator(price)(impressions))
            )
        dataset = DataSet(publishers, "dataset")

        params = SystemParameters(
            [100.0, 100.0],
            LiquidLegionsParameters(),
            np.random.default_rng(seed=1),
        )
        halo = HaloSimulator(dataset, params, PrivacyTracker())

        budget = PrivacyBudget(1.0, 1e-5)
        strategy = M3Strategy(
            GammaPoissonModel,
            {},
            RestrictedPairwiseUnionReachSurface,
            {},
            use_ground_truth_for_reach_curves=True,
        )
        surface = strategy.fit(halo, params, budget)

        # (spend vector, allowed absolute difference in 1+ reach)
        cases = (
            ((10.0, 0.0), 1),
            ((0.0, 10.0), 1),
            ((10.0, 10.0), 10),
        )
        for spend, tolerance in cases:
            # A fresh list is handed to each call, as in separate literals.
            fitted_reach = surface.by_spend(list(spend)).reach(1)
            true_reach = dataset.reach_by_spend(list(spend)).reach(1)
            self.assertAlmostEqual(fitted_reach, true_reach, delta=tolerance)
Example #4
0
    def fit(
        self,
        halo: HaloSimulator,
        params: SystemParameters,
        budget: PrivacyBudget,
    ) -> ReachSurface:
        """Fits the configured single publisher model to one simulated point.

        A single reach point is requested from `halo` at its campaign spends,
        using the whole of `budget` and a maximum frequency of
        MAX_MEASUREMENT_FREQUENCY.  The single publisher model is then built
        from that point and fitted.

        Args:
            halo: A Halo object for simulating the behavior of the Halo system.
              It must report exactly one publisher.
            params:  Simulation parameters.  Not used by this method.
            budget:  A PrivacyBudget object specifying how much privacy budget
              is to be consumed for this operation.
        Returns:
            The fitted single publisher model.
        Raises:
            ValueError: If `halo.publisher_count` is not 1.
        """
        publisher_count = halo.publisher_count
        if publisher_count != 1:
            raise ValueError(
                "SinglePublisherStrategy cannot be used with multiple publishers"
            )

        observed_point = halo.simulated_reach_by_spend(
            halo.campaign_spends,
            budget,
            max_frequency=MAX_MEASUREMENT_FREQUENCY,
        )

        reach_curve = self._single_pub_model(
            [observed_point], **self._single_pub_model_kwargs
        )
        reach_curve._fit()
        return reach_curve
Example #5
0
    def test_form_venn_diagram_regions_with_publishers_more_than_limit(self):
        """`_form_venn_diagram_regions` must raise ValueError past the limit.

        Uses MAX_ACTIVE_PUBLISHERS + 1 publishers, each with a spend of 0.01.
        """
        publisher_total = MAX_ACTIVE_PUBLISHERS + 1
        publishers = [
            PublisherData([(1, 0.01)], f"pdf{number}")
            for number in range(1, publisher_total + 1)
        ]
        simulator = HaloSimulator(
            DataSet(publishers, "test"),
            SystemParameters(
                [0.4] * publisher_total,
                LiquidLegionsParameters(),
                np.random.default_rng(1),
            ),
            PrivacyTracker(),
        )

        spends = [0.01] * publisher_total
        self.assertRaises(
            ValueError, simulator._form_venn_diagram_regions, spends
        )
Example #6
0
    def setUpClass(cls):
        """Create the class-level fixtures `params`, `privacy_tracker`, `halo`.

        The simulator is built on a two-publisher data set named "test" and a
        NumPy generator seeded with 1.
        """
        first_publisher = PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
        )
        second_publisher = PublisherData([(2, 0.03), (4, 0.06)], "pdf2")
        two_publisher_data = DataSet(
            [first_publisher, second_publisher], "test"
        )

        cls.params = SystemParameters(
            [0.4, 0.5],
            LiquidLegionsParameters(),
            np.random.default_rng(1),
        )
        cls.privacy_tracker = PrivacyTracker()
        cls.halo = HaloSimulator(
            two_publisher_data, cls.params, cls.privacy_tracker
        )
Example #7
0
    def test_scale_up_reach_in_primitive_regions(
        self,
        mock_geometric_estimate_noiser,
        regions,
        true_cardinality,
        std,
        budget,
        privacy_budget_split,
        fixed_noise,
        expected,
    ):
        """Check scaled regions and the single noising event that is logged.

        The mocked noiser factory returns a FakeNoiser built from
        `fixed_noise`.  After the call, the tracker must hold exactly one
        event charged with `budget` scaled by `privacy_budget_split`.
        """
        mock_geometric_estimate_noiser.return_value = FakeNoiser(fixed_noise)

        system_params = SystemParameters(
            [0], LiquidLegionsParameters(), FakeRandomGenerator()
        )
        simulator = HaloSimulator(
            DataSet([], "test"), system_params, PrivacyTracker()
        )

        scaled_regions = simulator._scale_up_reach_in_primitive_regions(
            regions, true_cardinality, std, budget, privacy_budget_split
        )
        self.assertEqual(scaled_regions, expected)

        # Share of the budget the operation is expected to have consumed.
        epsilon_share = budget.epsilon * privacy_budget_split
        delta_share = budget.delta * privacy_budget_split

        tracker = simulator.privacy_tracker
        self.assertEqual(tracker.privacy_consumption.epsilon, epsilon_share)
        self.assertEqual(tracker.privacy_consumption.delta, delta_share)
        self.assertEqual(len(tracker._noising_events), 1)

        # Only index into the event list once its length has been verified.
        event = tracker._noising_events[0]
        self.assertEqual(event.budget.epsilon, epsilon_share)
        self.assertEqual(event.budget.delta, delta_share)
        self.assertEqual(event.mechanism, DP_NOISE_MECHANISM_DISCRETE_LAPLACE)
        self.assertEqual(
            event.params, {"privacy_budget_split": privacy_budget_split}
        )
Example #8
0
    def test_form_venn_diagram_regions(
        self, num_publishers, spends, max_freq, expected
    ):
        """Compare `_form_venn_diagram_regions(spends, max_freq)` to `expected`.

        Only the first `num_publishers` of the three fixture publishers are
        placed in the data set.
        """
        all_publishers = [
            PublisherData(
                [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
            ),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
            PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
        ]
        simulator = HaloSimulator(
            DataSet(all_publishers[:num_publishers], "test"),
            SystemParameters(
                [0.4] * num_publishers,
                LiquidLegionsParameters(),
                np.random.default_rng(1),
            ),
            PrivacyTracker(),
        )

        actual_regions = simulator._form_venn_diagram_regions(
            spends, max_freq
        )

        self.assertEqual(expected, actual_regions)
Example #9
0
    def test_generate_reach_points_from_venn_diagram(
        self, num_publishers, spends, regions, expected
    ):
        """Check reach points generated from Venn diagram regions.

        The number of generated points must equal the number of expected
        points, and each pair must agree on impressions, 1+ reach and spends.
        """
        all_publishers = [
            PublisherData(
                [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
            ),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
            PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
        ]
        simulator = HaloSimulator(
            DataSet(all_publishers[:num_publishers], "test"),
            SystemParameters(
                [0.4] * num_publishers,
                LiquidLegionsParameters(),
                np.random.default_rng(1),
            ),
            PrivacyTracker(),
        )

        # The reach points produced from the Venn diagram carry 1+ reaches
        # only, hence the comparison of reach(1) below.
        generated = simulator._generate_reach_points_from_venn_diagram(
            spends, regions
        )

        self.assertEqual(len(generated), len(expected))

        for position, (actual_pt, wanted_pt) in enumerate(
            zip(generated, expected), start=1
        ):
            self.assertEqual(
                actual_pt.impressions,
                wanted_pt.impressions,
                msg=f"The impressions of No.{position} reach point is not correct",
            )
            self.assertEqual(
                actual_pt.reach(1),
                wanted_pt.reach(1),
                msg=f"The reach of No.{position} reach point is not correct",
            )
            self.assertEqual(
                actual_pt.spends,
                wanted_pt.spends,
                msg=f"The spends of No.{position} reach point is not correct",
            )
Example #10
0
    def test_add_dp_noise_to_primitive_regions(
        self,
        mock_geometric_estimate_noiser,
        regions,
        budget,
        privacy_budget_split,
        fixed_noise,
        expected_regions,
    ):
        """Check noised regions and the single noising event that is logged.

        The mocked noiser factory returns a FakeNoiser built from
        `fixed_noise`.  After the call, the tracker must hold exactly one
        event charged with `budget` scaled by `privacy_budget_split`.
        """
        mock_geometric_estimate_noiser.return_value = FakeNoiser(fixed_noise)

        simulator = HaloSimulator(
            DataSet([], "test"), SystemParameters(), PrivacyTracker()
        )

        noised_regions = simulator._add_dp_noise_to_primitive_regions(
            regions, budget, privacy_budget_split
        )
        self.assertEqual(noised_regions, expected_regions)

        # Share of the budget the operation is expected to have consumed.
        epsilon_share = budget.epsilon * privacy_budget_split
        delta_share = budget.delta * privacy_budget_split

        tracker = simulator.privacy_tracker
        self.assertEqual(tracker.privacy_consumption.epsilon, epsilon_share)
        self.assertEqual(tracker.privacy_consumption.delta, delta_share)
        self.assertEqual(len(tracker._noising_events), 1)

        # Only index into the event list once its length has been verified.
        event = tracker._noising_events[0]
        self.assertEqual(event.budget.epsilon, epsilon_share)
        self.assertEqual(event.budget.delta, delta_share)
        self.assertEqual(event.mechanism, DP_NOISE_MECHANISM_DISCRETE_LAPLACE)
        self.assertEqual(
            event.params, {"privacy_budget_split": privacy_budget_split}
        )
Example #11
0
 def test_class_setup_with_campaign_spend_fractions_generator(self):
     """Campaign spends come from the spend fractions generator.

     SystemParameters is created with a generator returning 0.2 for every
     publisher and then updated from the data set; the resulting simulator
     must expose campaign spends of about 0.01 and 0.012.
     """

     def equal_fractions(dataset):
         # One fraction of 0.2 per publisher in the data set.
         return [0.2] * dataset.publisher_count

     first_publisher = PublisherData(
         [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
     )
     second_publisher = PublisherData([(2, 0.03), (4, 0.06)], "pdf2")
     two_publisher_data = DataSet(
         [first_publisher, second_publisher], "test"
     )

     base_params = SystemParameters(
         liquid_legions=LiquidLegionsParameters(),
         generator=np.random.default_rng(1),
         campaign_spend_fractions_generator=equal_fractions,
     )
     resolved_params = base_params.update_from_dataset(two_publisher_data)
     simulator = HaloSimulator(
         two_publisher_data, resolved_params, PrivacyTracker()
     )

     # Compared to 7 decimal places rather than exactly, because of a
     # floating point rounding error.
     self.assertAlmostEqual(simulator._campaign_spends[0], 0.01, 7)
     self.assertAlmostEqual(simulator._campaign_spends[1], 0.012, 7)
    def evaluate(
        self, seed: int, filesystem: FsWrapperBase = FsPathlibWrapper()
    ) -> pd.DataFrame:
        """Runs this trial, or returns its cached result if one exists.

        Steps, in order:

        1. Seed a fresh NumPy generator and NumPy's global random state.
        2. If a results file already exists and can be read, return it.
           An unreadable results file is deleted and the trial is re-run.
        3. Write a "pending" marker file for this trial.
        4. Load the DataSet and instantiate the Halo Simulator and the
           Modeling Strategy.
        5. Fit the model, generate test points and compute metrics.  Any
           exception raised here is logged and turned into failure metrics.
        6. Construct the output DataFrame, save it to disk and remove the
           pending marker.

        NOTE(review): the default `filesystem` object is created once, when
        the function is defined, and is therefore shared by every call that
        omits the argument.

        NOTE(review): the pending marker is only removed at the very end; an
        exception escaping after it has been written leaves it behind.

        Args:
          seed:  A seed value that is used to initialize the random
            number generator.
          filesystem:  The filesystem object that manages all file operations.

        Returns:
          A single row DataFrame containing the results of the evaluation
          of this trial.
        """
        logging.vlog(2, f"Dataset {self._data_set_name}")
        logging.vlog(2, f"Trial   {self._trial_descriptor}")

        rng = np.random.default_rng(seed=seed)
        np.random.seed(seed)

        results_path = self._compute_trial_results_path()

        if results_path.startswith("gs://"):
            filesystem.set_default_client_to_gs_client()

        if filesystem.is_file(results_path):
            logging.vlog(2, "  --> Returning previously computed result")
            try:
                with filesystem.open(results_path) as cached_file:
                    return pd.read_csv(cached_file)
            except Exception as err:
                # Discard the unreadable file and fall through to recompute.
                filesystem.unlink(results_path)
                logging.vlog(
                    2, f"  --> {err}. Failed reading existing result. Re-evaluate."
                )

        # The pending directory contains one entry for each currently executing
        # experimental trial.  If a computation appears to hang, this can be
        # used to check which evaluations are still pending.
        pending_root = filesystem.parent(self._experiment_dir)
        path_digest = hashlib.md5(results_path.encode()).hexdigest()
        pending_path = f"{pending_root}/pending/{path_digest}"
        filesystem.mkdir(
            filesystem.parent(pending_path), parents=True, exist_ok=True
        )
        pending_note = (
            f"{datetime.now()}\n{self._data_set_name}\n{self._trial_descriptor}\n\n"
        )
        filesystem.write_text(pending_path, pending_note)

        dataset = self._data_design.by_name(self._data_set_name)
        privacy_tracker = PrivacyTracker()
        system_params = self._trial_descriptor.system_params
        halo = HaloSimulator(dataset, system_params, privacy_tracker)
        experiment_params = self._trial_descriptor.experiment_params
        privacy_budget = experiment_params.privacy_budget
        modeling_strategy = (
            self._trial_descriptor.modeling_strategy.instantiate_strategy()
        )
        # Stays empty unless the single publisher analysis is requested.
        single_publisher_dataframe = pd.DataFrame()
        max_frequency = experiment_params.max_frequency
        try:
            reach_surface = modeling_strategy.fit(
                halo, system_params, privacy_budget
            )
            test_points = list(
                experiment_params.generate_test_points(dataset, rng)
            )
            # All true reaches are computed before any fitted reach.
            true_reach = [
                halo.true_reach_by_spend(point, max_frequency)
                for point in test_points
            ]
            fitted_reach = [
                reach_surface.by_spend(point, max_frequency)
                for point in test_points
            ]
            metrics = aggregate(true_reach, fitted_reach)
            if self._analysis_type == SINGLE_PUB_ANALYSIS:
                single_publisher_dataframe = (
                    self._compute_single_publisher_fractions_dataframe(
                        halo, reach_surface, max_frequency
                    )
                )
        except Exception as inst:
            # At verbosity 2 the dataset and trial were already logged above.
            if not logging.vlog_is_on(2):
                logging.vlog(1, f"Dataset {self._data_set_name}")
                logging.vlog(1, f"Trial   {self._trial_descriptor}")
            logging.vlog(1, f"Modeling failure: {inst}")
            logging.vlog(2, traceback.format_exc())
            metrics = aggregate_on_exception(inst)
            if self._analysis_type == SINGLE_PUB_ANALYSIS:
                single_publisher_dataframe = (
                    self._single_publisher_fractions_dataframe_on_exception(
                        max_frequency
                    )
                )

        # Column-wise concatenation of all parts into the output frame.
        result_parts = [
            self._make_independent_vars_dataframe(),
            self._make_privacy_tracking_vars_dataframe(privacy_tracker),
            metrics,
            single_publisher_dataframe,
        ]
        result = pd.concat(result_parts, axis=1)

        filesystem.mkdir(
            filesystem.parent(results_path), parents=True, exist_ok=True
        )
        filesystem.write_text(results_path, result.to_csv(index=False))
        filesystem.unlink(pending_path, missing_ok=True)

        return result
    def fit(
        self,
        halo: HaloSimulator,
        params: SystemParameters,
        budget: PrivacyBudget,
    ) -> ReachSurface:
        """Returns the reach surface computed using the M3 proposal.

        With p publishers, `budget` is divided into 2p + 1 equal shares and
        one share is spent on each simulated reach request, in this order:

        1. one request at the full campaign spends;
        2. one request per publisher, with only that publisher spending
           (maximum frequency 10);
        3. only when p > 2, one request per publisher with that publisher's
           spend set to zero.

        A reach curve is then built per publisher, either from ground truth
        or by fitting the single publisher model to the point from step 2.
        For p == 1 that single curve is returned; otherwise the multi
        publisher model is fitted to the points from steps 3 and 1.

        NOTE(review): for p <= 2 step 3 is skipped, so fewer than 2p + 1
        shares are actually requested.  The requests of step 2 are issued
        even when ground truth curves are used.

        Args:
            halo: A Halo object for simulating the behavior of the Halo system.
            params:  Simulation parameters.  Not used by this method.
            budget:  A PrivacyBudget object specifying how much privacy budget
              is to be consumed for this operation.
        Returns:
            A differentially private ReachSurface model which can be queried
            for reach and frequency estimates for arbitrary spend allocations.
        """

        num_pubs = halo.publisher_count

        # TODO: Compute total budget usage with advanced composition or PLD's
        share_count = 2 * num_pubs + 1
        per_request_budget = PrivacyBudget(
            budget.epsilon / share_count, budget.delta / share_count
        )

        total_reach = halo.simulated_reach_by_spend(
            halo.campaign_spends, per_request_budget
        )

        # One measurement per publisher, with every other spend at zero.
        single_pub_points = []
        for pub_index in range(num_pubs):
            solo_spends = [0.0] * num_pubs
            solo_spends[pub_index] = halo.campaign_spends[pub_index]
            observed = halo.simulated_reach_by_spend(
                solo_spends, per_request_budget, max_frequency=10
            )
            kplus_reaches = [
                observed.reach(freq)
                for freq in range(1, observed.max_frequency + 1)
            ]
            # Keep only this publisher's impressions and spend.
            single_pub_points.append(
                ReachPoint(
                    [observed.impressions[pub_index]],
                    kplus_reaches,
                    [observed.spends[pub_index]],
                )
            )

        # One measurement per omitted publisher, only for three or more.
        omitted_indices = range(num_pubs) if num_pubs > 2 else ()
        all_but_one_points = []
        for omitted in omitted_indices:
            remaining_spends = list(halo.campaign_spends)
            remaining_spends[omitted] = 0.0
            all_but_one_points.append(
                halo.simulated_reach_by_spend(
                    remaining_spends, per_request_budget
                )
            )

        # One reach curve per publisher.
        single_pub_curves = []
        for pub_index, pub_point in enumerate(single_pub_points):
            if not self._use_ground_truth_for_reach_curves:
                pub_curve = self._single_pub_model(
                    [pub_point], **self._single_pub_model_kwargs
                )
                pub_curve._fit()
            else:
                pub_curve = GroundTruthReachCurveModel(
                    halo._data_set, pub_index
                )
            single_pub_curves.append(pub_curve)

        if num_pubs == 1:
            return single_pub_curves[0]

        surface = self._multi_pub_model(
            single_pub_curves,
            [*all_but_one_points, total_reach],
            **self._multi_pub_model_kwargs,
        )
        surface._fit()

        return surface
Example #14
0
    def test_simulated_venn_diagram_reach_by_spend(
        self,
        mock_geometric_estimate_noiser,
        mock_cardinality_estimate_variance,
        spends,
        budget,
        privacy_budget_split,
        fixed_noise,
        expected_reach_points,
    ):
        """Check reach points and privacy accounting of the Venn diagram path.

        The mocked noiser factory returns a FakeNoiser built from
        `fixed_noise`.  The reach points returned for `spends` must match
        `expected_reach_points` in number, impressions, 1+ reach and spends.
        The privacy tracker must hold exactly two noising events: the first
        charged with `privacy_budget_split` of `budget`, the second with the
        remaining `1 - privacy_budget_split`.

        `mock_cardinality_estimate_variance` is accepted but not configured
        here; it is presumably injected by a patch decorator outside this
        view.
        """
        mock_geometric_estimate_noiser.return_value = FakeNoiser(fixed_noise)

        pdfs = [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)],
                          "pdf1"),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
            PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
        ]
        data_set = DataSet(pdfs, "test")
        params = SystemParameters(
            [0.4, 0.5, 0.4],
            LiquidLegionsParameters(),
            FakeRandomGenerator(),
        )
        privacy_tracker = PrivacyTracker()
        halo = HaloSimulator(data_set, params, privacy_tracker)

        reach_points = halo.simulated_venn_diagram_reach_by_spend(
            spends, budget, privacy_budget_split)

        # Examine reach points.  zip() stops at the shorter input, so the
        # lengths are compared first; otherwise missing or surplus reach
        # points would pass unnoticed.
        self.assertEqual(len(reach_points), len(expected_reach_points))

        for i, (r_pt, expected_r_pt) in enumerate(
                zip(reach_points, expected_reach_points)):
            self.assertEqual(
                r_pt.impressions,
                expected_r_pt.impressions,
                msg=f"The impressions of No.{i + 1} reach point is not correct",
            )
            self.assertEqual(
                r_pt.reach(1),
                expected_r_pt.reach(1),
                msg=f"The reach of No.{i + 1} reach point is not correct",
            )
            self.assertEqual(
                r_pt.spends,
                expected_r_pt.spends,
                msg=f"The spends of No.{i + 1} reach point is not correct",
            )

        # Examine privacy tracker
        expected_noise_event_primitive_regions = NoisingEvent(
            PrivacyBudget(
                budget.epsilon * privacy_budget_split,
                budget.delta * privacy_budget_split,
            ),
            DP_NOISE_MECHANISM_DISCRETE_LAPLACE,
            {"privacy_budget_split": privacy_budget_split},
        )

        expected_noise_event_cardinality = NoisingEvent(
            PrivacyBudget(
                budget.epsilon * (1 - privacy_budget_split),
                budget.delta * (1 - privacy_budget_split),
            ),
            DP_NOISE_MECHANISM_DISCRETE_LAPLACE,
            {"privacy_budget_split": (1 - privacy_budget_split)},
        )

        # Expected order: the primitive regions event, then the cardinality
        # event.
        expected_noise_events = [
            expected_noise_event_primitive_regions,
            expected_noise_event_cardinality,
        ]

        # Total consumption is the sum of the two expected event budgets.
        self.assertEqual(
            halo.privacy_tracker.privacy_consumption.epsilon,
            expected_noise_event_primitive_regions.budget.epsilon +
            expected_noise_event_cardinality.budget.epsilon,
        )
        self.assertEqual(
            halo.privacy_tracker.privacy_consumption.delta,
            expected_noise_event_primitive_regions.budget.delta +
            expected_noise_event_cardinality.budget.delta,
        )
        # The length check makes the zip() over the events below exhaustive.
        self.assertEqual(len(halo.privacy_tracker._noising_events), 2)

        for noise_event, expected_noise_event in zip(
                halo.privacy_tracker._noising_events, expected_noise_events):
            self.assertEqual(
                noise_event.budget.epsilon,
                expected_noise_event.budget.epsilon,
            )
            self.assertEqual(
                noise_event.budget.delta,
                expected_noise_event.budget.delta,
            )
            self.assertEqual(
                noise_event.mechanism,
                expected_noise_event.mechanism,
            )
            self.assertEqual(
                noise_event.params,
                expected_noise_event.params,
            )