def test_simulated_venn_diagram_reach_by_spend_without_active_pub(self):
    """With zero spend on every publisher, no reach points are produced and no privacy budget is consumed."""
    publisher_data = [
        PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
        PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
    ]
    system_params = SystemParameters(
        [0.4, 0.5, 0.4],
        LiquidLegionsParameters(),
        FakeRandomGenerator(),
    )
    halo = HaloSimulator(
        DataSet(publisher_data, "test"), system_params, PrivacyTracker()
    )
    # All spends are zero, so no publisher is active.
    reach_points = halo.simulated_venn_diagram_reach_by_spend(
        [0, 0, 0], PrivacyBudget(0.2, 0.4), 0.5, 1
    )
    self.assertEqual([], reach_points)
    # No noising mechanism should have run, so the tracker stays untouched.
    self.assertEqual(halo.privacy_tracker.privacy_consumption.epsilon, 0)
    self.assertEqual(halo.privacy_tracker.privacy_consumption.delta, 0)
    self.assertEqual(len(halo.privacy_tracker._noising_events), 0)
def test_m3_strategy_with_ground_truth(self):
    """M3 strategy fitted with ground-truth reach curves should closely match the dataset's true reach."""
    impressions1 = HeterogeneousImpressionGenerator(
        1000, gamma_shape=1.0, gamma_scale=2
    )()
    impressions2 = HeterogeneousImpressionGenerator(
        1000, gamma_shape=1.0, gamma_scale=3
    )()
    dataset = DataSet(
        [
            PublisherData(FixedPriceGenerator(0.1)(impressions1)),
            PublisherData(FixedPriceGenerator(0.05)(impressions2)),
        ],
        "dataset",
    )
    params = SystemParameters(
        [100.0, 100.0], LiquidLegionsParameters(), np.random.default_rng(seed=1)
    )
    halo = HaloSimulator(dataset, params, PrivacyTracker())
    strategy = M3Strategy(
        GammaPoissonModel,
        {},
        RestrictedPairwiseUnionReachSurface,
        {},
        use_ground_truth_for_reach_curves=True,
    )
    surface = strategy.fit(halo, params, PrivacyBudget(1.0, 1e-5))
    # Single-publisher points should match almost exactly; the joint
    # point is allowed a looser tolerance.
    for spends, tolerance in [
        ([10.0, 0.0], 1),
        ([0.0, 10.0], 1),
        ([10.0, 10.0], 10),
    ]:
        self.assertAlmostEqual(
            surface.by_spend(spends).reach(1),
            dataset.reach_by_spend(spends).reach(1),
            delta=tolerance,
        )
def test_compute_trial_results_path(self):
    """The results path encodes dataset, strategy, system, and experiment parameters."""
    with TemporaryDirectory() as d:
        data_design = DataDesign(join(d, "data_design"))
        data_design.add(
            DataSet(
                [
                    PublisherData(
                        [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
                    ),
                    PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
                ],
                "dataset",
            )
        )
        trial_descriptor = TrialDescriptor(
            ModelingStrategyDescriptor(
                "strategy", {}, "single_pub_model", {}, "multi_pub_model", {}
            ),
            SystemParameters(
                [0.03, 0.05],
                LiquidLegionsParameters(13, 1e6, 1),
                np.random.default_rng(),
            ),
            ExperimentParameters(PrivacyBudget(1.0, 0.01), 3, 5, "tps"),
        )
        trial = ExperimentalTrial("edir", data_design, "dataset", trial_descriptor)
        expected = "{}/{}/{},{},{},{}".format(
            "edir",
            "dataset",
            "strategy,single_pub_model,multi_pub_model",
            "spends=(0.03,0.05),decay_rate=13,sketch_size=1000000.0",
            "epsilon=1.0,delta=0.01,replica_id=3,max_frequency=5",
            "test_point_strategy=tps.csv",
        )
        self.assertEqual(trial._compute_trial_results_path(), expected)
def test_spend_by_impressions(self):
    """spend_by_impressions returns the spend needed to buy the given impression count."""
    pdf = PublisherData([(1, 0.01), (2, 0.02), (1, 0.04)], "test")
    # (impressions, expected spend); counts beyond the log saturate at
    # the maximum spend.
    cases = [(0, 0), (1, 0.01), (2, 0.02), (3, 0.04), (4, 0.04)]
    for impressions, expected_spend in cases:
        self.assertEqual(pdf.spend_by_impressions(impressions), expected_spend)
def setUpClass(cls):
    """Build a shared two-publisher DataSet plus its ground-truth reach curves."""
    cls.data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        ],
        "test",
    )
    cls.curve1 = GroundTruthReachCurveModel(cls.data_set, 0)
    cls.curve2 = GroundTruthReachCurveModel(cls.data_set, 1)
def test_two_publishers(self):
    """A 2x2 grid over two publishers yields the four interior grid points."""
    data_set = DataSet(
        [PublisherData([(1, 3.0)], "pdf1"), PublisherData([(1, 6.0)], "pdf2")],
        "test",
    )
    generator = GridTestPointGenerator(
        data_set, np.random.default_rng(1), grid_size=2
    )
    points = [(int(p[0]), int(p[1])) for p in generator.test_points()]
    self.assertLen(points, 4)
    self.assertEqual(points, [(1, 2), (1, 4), (2, 2), (2, 4)])
def setUpClass(cls):
    """Create the two small DataSets shared by the test cases."""
    cls.data_set1 = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf11"),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf12"),
        ],
        "ds1",
    )
    cls.data_set2 = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (2, 0.04), (3, 0.05)], "pdf21"),
            PublisherData([(2, 0.03), (3, 0.06)], "pdf22"),
        ],
        "ds2",
    )
def setUpClass(cls):
    """Create a shared HaloSimulator over a two-publisher DataSet."""
    data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        ],
        "test",
    )
    cls.params = SystemParameters(
        [0.4, 0.5], LiquidLegionsParameters(), np.random.default_rng(1)
    )
    cls.privacy_tracker = PrivacyTracker()
    cls.halo = HaloSimulator(data_set, cls.params, cls.privacy_tracker)
def test_npoints_generator(self):
    """minimum_points_per_publisher scales the total number of generated points."""
    data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(1, 0.02), (2, 0.04), (1, 0.08), (3, 0.10)], "pdf2"),
        ],
        "test",
    )
    generator = LatinHypercubeRandomTestPointGenerator(
        data_set,
        np.random.default_rng(1),
        npublishers=2,
        minimum_points_per_publisher=200,
    )
    # Two publishers at 200 points each -> 400 total.
    self.assertLen(list(generator.test_points()), 400)
def test_read_and_write_publisher_data(self):
    """A PublisherData written to disk and read back preserves its summary statistics.

    Fix: the original opened and closed the files by hand, so a failing
    assertion or I/O error between open() and close() leaked the file
    handle. Context managers guarantee the files are closed.
    """
    pdf = PublisherData([(1, 0.01), (2, 0.02), (1, 0.04)], "test")
    with TemporaryDirectory() as d:
        filename = join(d, "pdf_data")
        with open(filename, "w") as pdf_file:
            pdf.write_publisher_data(pdf_file)
        with open(filename) as new_file:
            new_pdf = PublisherData.read_publisher_data(new_file)
        self.assertEqual(new_pdf.max_impressions, 3)
        self.assertEqual(new_pdf.max_spend, 0.04)
        self.assertEqual(new_pdf.max_reach, 2)
def _generate_data_set(self, params: DataSetParameters) -> DataSet:
    """Generates a synthetic DataSet from the given parameter bundle.

    Publisher sizes decay geometrically from largest_publisher_size so
    that the last publisher is smaller by a factor of
    largest_to_smallest_publisher_ratio.

    Args:
        params: DataSetParameters describing the number of publishers,
            their impression/pricing generators, and the overlap
            generator used to combine them.

    Returns:
        The generated DataSet, named after str(params).
    """
    if self._verbose:
        print(params)
    publishers = []
    publisher_size = params.largest_publisher_size
    # Per-step geometric decay factor; a single publisher gets factor 1,
    # which also avoids dividing by zero in the exponent below.
    publisher_size_decay_rate = (1 if params.num_publishers == 1 else params.largest_to_smallest_publisher_ratio **(1 / float(params.num_publishers - 1)))
    for publisher in range(params.num_publishers):
        publishers.append(
            PublisherData.generate_publisher_data(
                # The impression generator receives the current size and
                # the shared RNG, overridable by explicit params.
                params.impression_generator_params.generator(
                    **{
                        "n": publisher_size,
                        "random_generator": self._random_generator,
                        **params.impression_generator_params.params,
                    }),
                params.pricing_generator_params.generator(
                    **params.pricing_generator_params.params),
                # Publishers are named "1", "2", ... in generation order.
                str(publisher + 1),
            ))
        publisher_size = math.floor(publisher_size * publisher_size_decay_rate)
    # Copy so the caller's parameter dict is not mutated; inject the
    # shared RNG only if the overlap generator declares the slot.
    overlap_params = {**params.overlap_generator_params.params}
    if "random_generator" in overlap_params:
        overlap_params["random_generator"] = self._random_generator
    return params.overlap_generator_params.generator(publishers, name=str(params), **overlap_params)
def read_data_set(
        cls,
        dirpath: str,
        filesystem: FsWrapperBase = FsPathlibWrapper()) -> "DataSet":
    """Reads a DataSet from disk.

    A DataSet is given by a directory containing a collection of files,
    each of which represents a PublisherDataSet. The name associated to
    the DataSet object is the last component of the dirpath.

    Args:
      dirpath: Directory containing the PublisherDataSets that comprise
        this DataSet.
      filesystem: The filesystem object that manages all file operations.

    Returns:
      The DataSet object representing the contents of this directory.

    Raises:
      RuntimeError: If a publisher file fails to parse; the original
        ValueError or RuntimeError is chained as the cause so the
        offending file can be identified.
    """
    pdf_list = []
    # Sort for a deterministic publisher order regardless of the
    # filesystem's directory-listing order.
    for filepath in sorted(filesystem.glob(dirpath, "*")):
        if filesystem.is_file(filepath):
            with filesystem.open(filepath) as file:
                try:
                    pdf = PublisherData.read_publisher_data(file)
                    # Use the full path as the publisher name so data can
                    # be traced back to its source file.
                    pdf.name = str(filepath)
                    pdf_list.append(pdf)
                except (ValueError, RuntimeError) as e:
                    raise RuntimeError(
                        "In publisher file {}".format(filepath)) from e
    return cls(pdf_list, filesystem.name(dirpath))
def _label_ids(
    cls,
    labeled_set_ids_iter: Iterable[np.array],
    unlabeled_publisher_data_iter: Iterable[PublisherData],
):
    """Label the reached ids to reflect cross-pub overlap.

    Args:
        labeled_set_ids_iter: a list or generator of per-publisher reached
            ids. These ids are labeled, i.e., meaningful of cross-pub
            overlap.
        unlabeled_publisher_data_iter: a list or generator of PublisherData.
            The ids here are unlabeled, i.e., meaningless. For each
            PublisherData here, its i-th id will be labeled as the i-th id
            in the corresponding labeled_set_ids.

    Returns:
        A labeled list of PublisherData.
    """
    new_publisher_data_list = []
    # zip stops at the shorter iterable; the assert below catches a
    # per-publisher size mismatch but not a length mismatch between the
    # two iterables themselves.
    for set_ids, pub_data in zip(labeled_set_ids_iter, unlabeled_publisher_data_iter):
        assert (len(set_ids) == pub_data.max_reach
                ), "single-pub reach does not match."
        # NOTE(review): the old-id -> new-id mapping depends on the
        # iteration order of this set; this relies on CPython's
        # deterministic (hash-based) set ordering for small ints —
        # confirm this ordering is intended rather than incidental.
        original_ids = set([oid for oid, _ in pub_data._data])
        id_map = dict(zip(original_ids, set_ids))
        # Relabel every impression, keeping spends untouched.
        new_impression_log_data = [(id_map[oid], x) for oid, x in pub_data._data]
        new_publisher_data_list.append(
            PublisherData(new_impression_log_data, pub_data.name))
    return new_publisher_data_list
def setUpClass(cls):
    """Create a single Publisher backed by a small PublisherData fixture."""
    pdf = PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1")
    cls.params = SystemParameters(
        [1.0, 0.5, 3.0], LiquidLegionsParameters(), np.random.default_rng(1)
    )
    cls.privacy_tracker = PrivacyTracker()
    cls.publisher = Publisher(pdf, 1, cls.params, cls.privacy_tracker)
def test_class_setup_with_campaign_spend_fractions_generator(self):
    """The spend-fractions generator sets each campaign spend to the requested fraction of max spend."""
    data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        ],
        "test",
    )
    params = SystemParameters(
        liquid_legions=LiquidLegionsParameters(),
        generator=np.random.default_rng(1),
        campaign_spend_fractions_generator=(
            lambda dataset: [0.2] * dataset.publisher_count
        ),
    ).update_from_dataset(data_set)
    halo = HaloSimulator(data_set, params, PrivacyTracker())
    # assertAlmostEqual because 0.2 * max_spend is subject to floating
    # point rounding error.
    self.assertAlmostEqual(halo._campaign_spends[0], 0.01, 7)
    self.assertAlmostEqual(halo._campaign_spends[1], 0.012, 7)
def test_label_ids(self):
    """_label_ids relabels per-publisher ids and accepts both lists and iterators."""
    set_ids_list = [
        np.array([3, 4, 5]),
        np.array([4, 6, 8]),
        np.array([6, 8, 10]),
    ]
    pdf_list = [
        PublisherData([(2, 0.02), (1, 0.01), (1, 0.03), (3, 0.04)], "a"),
        PublisherData([(3, 0.04), (1, 0.02), (2, 0.01)], "b"),
        PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (1, 0.01), (3, 0.05)], "c"
        ),
    ]
    expected_data_list = [
        [(3, 0.01), (3, 0.03), (4, 0.02), (5, 0.04)],
        [(4, 0.02), (6, 0.01), (8, 0.04)],
        [(6, 0.01), (6, 0.01), (6, 0.04), (8, 0.02), (10, 0.05)],
    ]
    expected_name_list = ["a", "b", "c"]
    # Lists...
    res = OverlapDataSet._label_ids(set_ids_list, pdf_list)
    self.assert_equal_pub_data_list(res, 3, expected_data_list, expected_name_list)
    # ...and one-shot iterators must both be accepted.
    res = OverlapDataSet._label_ids(iter(set_ids_list), iter(pdf_list))
    self.assert_equal_pub_data_list(res, 3, expected_data_list, expected_name_list)
def test_one_publisher(self):
    """A grid of size 4 over one publisher splits its impressions into equal fifths."""
    data_set = DataSet([PublisherData([(1, 100.0)], "pdf")], "test")
    generator = GridTestPointGenerator(
        data_set, np.random.default_rng(1), grid_size=4
    )
    points = [int(p[0]) for p in generator.test_points()]
    self.assertLen(points, 4)
    self.assertEqual(points, [20, 40, 60, 80])
def test_evaluate(self):
    """generate_trials yields one trial per (system, experiment) parameter combination."""
    with TemporaryDirectory() as d:
        data_set = DataSet(
            [
                PublisherData(
                    [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
                ),
                PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
            ],
            "dataset",
        )
        data_design = DataDesign(join(d, "data_design"))
        data_design.add(data_set)
        experiment_dir = join(d, "experiments")
        # Register fakes so the trial machinery can resolve them by name.
        MODELING_STRATEGIES["fake"] = FakeModelingStrategy
        TEST_POINT_STRATEGIES["fake_tps"] = (
            lambda ds, rng: FakeTestPointGenerator().test_points()
        )
        msd = ModelingStrategyDescriptor(
            "fake", {"x": 1}, "goerg", {}, "pairwise_union", {}
        )
        system_params = [
            SystemParameters(
                spends,
                LiquidLegionsParameters(13, 1e6, 1),
                np.random.default_rng(),
            )
            for spends in ([0.03, 0.05], [0.05, 0.03])
        ]
        experiment_params = [
            ExperimentParameters(PrivacyBudget(1.0, 0.01), 1, 5, "fake_tps"),
            ExperimentParameters(PrivacyBudget(0.5, 0.001), 1, 5, "fake_tps"),
        ]
        # Full cross product: 2 system x 2 experiment = 4 descriptors.
        trial_descriptors = [
            TrialDescriptor(msd, sp, ep)
            for sp in system_params
            for ep in experiment_params
        ]
        exp = Experiment(experiment_dir, data_design, "dataset", trial_descriptors)
        self.assertLen(exp.generate_trials(), 4)
def test_impressions_by_spend(self):
    """impressions_by_spend counts all impressions purchasable at or below the given spend."""
    pdf = PublisherData([(1, 0.01), (2, 0.02), (3, 0.02), (1, 0.04)], "test")
    # (spend, expected impression count); spends past the log saturate.
    cases = [
        (0.005, 0),
        (0.01, 1),
        (0.015, 1),
        (0.02, 3),
        (0.04, 4),
        (0.05, 4),
    ]
    for spend, expected_impressions in cases:
        self.assertEqual(pdf.impressions_by_spend(spend), expected_impressions)
def test_form_venn_diagram_regions(self, num_publishers, spends, max_freq, expected):
    """_form_venn_diagram_regions partitions users into Venn regions for the given spends."""
    all_pdfs = [
        PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
        PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
    ]
    # Only the first num_publishers publishers participate in this case.
    halo = HaloSimulator(
        DataSet(all_pdfs[:num_publishers], "test"),
        SystemParameters(
            [0.4] * num_publishers,
            LiquidLegionsParameters(),
            np.random.default_rng(1),
        ),
        PrivacyTracker(),
    )
    self.assertEqual(expected, halo._form_venn_diagram_regions(spends, max_freq))
def test_make_independent_vars_dataframe(self):
    """The independent-vars dataframe captures dataset and trial-descriptor metadata."""
    with TemporaryDirectory() as d:
        data_design = DataDesign(join(d, "data_design"))
        data_design.add(
            DataSet(
                [
                    PublisherData(
                        [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
                    ),
                    PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
                ],
                "dataset",
            )
        )
        trial_descriptor = TrialDescriptor(
            ModelingStrategyDescriptor(
                "strategy", {}, "single_pub_model", {}, "multi_pub_model", {}
            ),
            SystemParameters(
                [0.03, 0.05],
                LiquidLegionsParameters(13, 1e6, 1),
                np.random.default_rng(),
            ),
            ExperimentParameters(
                PrivacyBudget(1.0, 0.01), 3, 5, "test_point_strategy"
            ),
        )
        trial = ExperimentalTrial("edir", data_design, "dataset", trial_descriptor)
        actual = trial._make_independent_vars_dataframe()
        expected_trial_name = (
            "strategy,single_pub_model,multi_pub_model,"
            "spends=(0.03,0.05),decay_rate=13,sketch_size=1000000.0,"
            "epsilon=1.0,delta=0.01,replica_id=3,max_frequency=5,"
            "test_point_strategy=test_point_strategy"
        )
        expected = pd.DataFrame(
            {
                "dataset": ["dataset"],
                "trial": [expected_trial_name],
                "replica_id": [3],
                "single_pub_model": ["single_pub_model"],
                "multi_pub_model": ["multi_pub_model"],
                "strategy": ["strategy"],
                "liquid_legions_sketch_size": [1e6],
                "liquid_legions_decay_rate": [13],
                "maximum_reach": [4],
                "ncampaigns": [2],
                "largest_pub_reach": [3],
                "max_frequency": [5],
                "average_spend_fraction": [0.04],
            }
        )
        pd.testing.assert_frame_equal(actual, expected)
def test_sequentially_correlated_publisher_data_generator(self):
    """Sequential correlation relabels ids with the configured shared proportion."""
    pdf_list = [
        PublisherData([(2, 0.02), (1, 0.01), (1, 0.03), (3, 0.04)], "a"),
        PublisherData([(3, 0.04), (1, 0.02), (2, 0.01)], "b"),
        PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (1, 0.01), (3, 0.05)], "c"
        ),
    ]
    res = SequentiallyCorrelatedOverlapDataSet(
        unlabeled_publisher_data_list=pdf_list,
        order=OrderOptions.original,
        correlated_sets=CorrelatedSetsOptions.one,
        shared_prop=0.5,
        random_generator=np.random.default_rng(seed=1),
    )
    # Expected relabeled impression logs for the fixed seed.
    expected_data_list = [
        [(2, 0.02), (4, 0.01), (4, 0.03), (6, 0.04)],
        [(0, 0.01), (6, 0.02), (5, 0.04)],
        [(5, 0.01), (1, 0.02), (3, 0.05), (5, 0.04)],
    ]
    self.assert_equal_pub_data_list(
        res._data, 3, expected_data_list, ["a", "b", "c"]
    )
def test_independent_overlap_data_set(self):
    """Independent overlap draws each publisher's ids independently from the universe."""
    pdf_list = [
        PublisherData([(2, 0.02), (1, 0.01), (1, 0.03), (3, 0.04)], "a"),
        PublisherData([(3, 0.04), (1, 0.02), (2, 0.01)], "b"),
        PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (1, 0.01), (3, 0.05)], "c"
        ),
    ]
    res = IndependentOverlapDataSet(
        unlabeled_publisher_data_list=pdf_list,
        universe_size=5,
        random_generator=np.random.default_rng(1),
    )
    # Expected relabeled impression logs for the fixed seed.
    expected_data_list = [
        [(0, 0.01), (0, 0.03), (2, 0.02), (3, 0.04)],
        [(0, 0.02), (1, 0.01), (2, 0.04)],
        [(2, 0.02), (1, 0.04), (3, 0.05), (1, 0.01)],
    ]
    self.assert_equal_pub_data_list(
        res._data, 3, expected_data_list, ["a", "b", "c"]
    )
def test_fifteen_publishers(self):
    """The generator honors the requested npoints even with many publishers."""
    pdf_list = [
        PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf{}".format(i)
        )
        for i in range(15)
    ]
    generator = LatinHypercubeRandomTestPointGenerator(
        DataSet(pdf_list, "test"), np.random.default_rng(1), npoints=225
    )
    self.assertLen(list(generator.test_points()), 225)
def test_two_publishers(self):
    """Every generated point stays within each publisher's spend range."""
    data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(1, 0.02), (2, 0.04), (1, 0.08), (3, 0.10)], "pdf2"),
        ],
        "test",
    )
    generator = LatinHypercubeRandomTestPointGenerator(
        data_set, np.random.default_rng(1), npoints=100
    )
    points = list(generator.test_points())
    self.assertLen(points, 100)
    # Each coordinate must lie in [0, max_spend) for its publisher.
    for i, point in enumerate(points):
        self.assertLen(point, 2)
        self.assertTrue(point[0] >= 0.0, "Item {} is negative: {}".format(i, point))
        self.assertTrue(point[0] < 0.05, "Item {} is too large: {}".format(i, point))
        self.assertTrue(point[1] >= 0.0, "Item {} is negative: {}".format(i, point))
        self.assertTrue(point[1] < 0.10, "Item {} is too large: {}".format(i, point))
def test_generate_reach_points_from_venn_diagram(self, num_publishers, spends, regions, expected):
    """Reach points derived from Venn regions match in impressions, reach, and spends."""
    all_pdfs = [
        PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
        PublisherData([(2, 0.03), (4, 0.06)], "pdf2"),
        PublisherData([(2, 0.01), (3, 0.03), (4, 0.05)], "pdf3"),
    ]
    halo = HaloSimulator(
        DataSet(all_pdfs[:num_publishers], "test"),
        SystemParameters(
            [0.4] * num_publishers,
            LiquidLegionsParameters(),
            np.random.default_rng(1),
        ),
        PrivacyTracker(),
    )
    # Note that the reach points generated from the Venn diagram only
    # contain 1+ reaches.
    reach_points = halo._generate_reach_points_from_venn_diagram(spends, regions)
    self.assertEqual(len(reach_points), len(expected))
    for i, (actual_pt, expected_pt) in enumerate(zip(reach_points, expected)):
        self.assertEqual(
            actual_pt.impressions,
            expected_pt.impressions,
            msg=f"The impressions of No.{i + 1} reach point is not correct",
        )
        self.assertEqual(
            actual_pt.reach(1),
            expected_pt.reach(1),
            msg=f"The reach of No.{i + 1} reach point is not correct",
        )
        self.assertEqual(
            actual_pt.spends,
            expected_pt.spends,
            msg=f"The spends of No.{i + 1} reach point is not correct",
        )
def test_disjoint_overlap_data_set(self):
    """DisjointSetGenerator assigns non-overlapping id ranges to each publisher."""
    pdf_list = [
        PublisherData([(2, 0.02), (1, 0.01), (1, 0.03), (3, 0.04)], "a"),
        PublisherData([(3, 0.04), (1, 0.02), (2, 0.01)], "b"),
        PublisherData(
            [(1, 0.01), (2, 0.02), (1, 0.04), (1, 0.01), (3, 0.05)], "c"
        ),
    ]
    res = OverlapDataSet(
        unlabeled_publisher_data_list=pdf_list,
        overlap_generator=DisjointSetGenerator,
        name="disjoint",
    )
    self.assertEqual(res.name, "disjoint")
    # Ids 0-2 go to "a", 3-5 to "b", 6-8 to "c" — no cross-pub overlap.
    expected_data_list = [
        [(0, 0.01), (0, 0.03), (1, 0.02), (2, 0.04)],
        [(3, 0.02), (4, 0.01), (5, 0.04)],
        [(6, 0.01), (6, 0.01), (6, 0.04), (7, 0.02), (8, 0.05)],
    ]
    self.assert_equal_pub_data_list(
        res._data, 3, expected_data_list, ["a", "b", "c"]
    )
def test_user_counts_by_spend(self):
    """user_counts_by_spend maps each reached user id to its frequency at the given spend."""
    pdf = PublisherData([(1, 0.01), (1, 0.04), (2, 0.02)])
    cases = [
        (0, {}),
        (0.01, {1: 1}),
        (0.015, {1: 1}),
        (0.03, {1: 1, 2: 1}),
        (0.07, {1: 2, 2: 1}),
    ]
    for spend, expected_counts in cases:
        self.assertEqual(pdf.user_counts_by_spend(spend), expected_counts)
def test_latin_hypercube_definition(self):
    """Check that the points satisfy the definition of a Latin Hypercube.

    Along each dimension, the 100 points scaled to that publisher's
    spend range should occupy all 100 equally spaced cells exactly once.
    """
    data_set = DataSet(
        [
            PublisherData([(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"),
            PublisherData([(1, 0.02), (2, 0.04), (1, 0.08), (3, 0.10)], "pdf2"),
            PublisherData([(1, 0.02), (2, 0.04), (1, 0.01), (3, 0.06)], "pdf3"),
        ],
        "test",
    )
    generator = LatinHypercubeRandomTestPointGenerator(
        data_set, np.random.default_rng(1), npoints=100
    )
    design = np.stack(list(generator.test_points()))
    equally_spaced = set(range(100))
    # Scale each coordinate by (100 / max_spend) and truncate: every one
    # of the 100 cells must contain exactly one point per dimension.
    for column, max_spend in [(0, 0.05), (1, 0.10), (2, 0.06)]:
        self.assertEqual(
            set((design[:, column] / max_spend * 100).astype("int32")),
            equally_spaced,
        )
def test_evaluate(self):
    """End-to-end evaluation of a single trial produces a one-row dataframe."""
    with TemporaryDirectory() as d:
        data_set = DataSet(
            [
                PublisherData(
                    [(1, 0.01), (2, 0.02), (1, 0.04), (3, 0.05)], "pdf1"
                ),
                PublisherData([(2, 0.02), (2, 0.03), (4, 0.06)], "pdf2"),
            ],
            "dataset",
        )
        data_design = DataDesign(join(d, "data_design"))
        data_design.add(data_set)
        experiment_dir = join(d, "experiments")
        # Register fakes so the trial can look them up by name.
        MODELING_STRATEGIES["fake"] = FakeModelingStrategy
        TEST_POINT_STRATEGIES["fake_tps"] = FakeTestPointGenerator
        trial_descriptor = TrialDescriptor(
            ModelingStrategyDescriptor(
                "fake", {"x": 1}, "goerg", {}, "pairwise_union", {}
            ),
            SystemParameters(
                [0.9, 0.9],
                LiquidLegionsParameters(13, 1e6, 1),
                np.random.default_rng(),
            ),
            ExperimentParameters(PrivacyBudget(1.0, 0.01), 3, 5, "fake_tps"),
        )
        trial = ExperimentalTrial(
            experiment_dir, data_design, "dataset", trial_descriptor
        )
        result = trial.evaluate(seed=1)
        # We don't check each column in the resulting dataframe, because
        # these have been checked by the preceding unit tests. However,
        # we make a few strategic probes.
        self.assertEqual(result.shape[0], 1)
        self.assertEqual(result["dataset"][0], "dataset")
        self.assertEqual(result["replica_id"][0], 3)
        self.assertEqual(result["privacy_budget_epsilon"][0], 1.0)
        self.assertEqual(result["npoints"][0], 1)
        self.assertEqual(result["model_succeeded"][0], 1)
        self.assertEqual(result["model_exception"][0], "")