def test_samples_high_weight_elements_priority(self):
    """Verifies priority sampling always keeps sufficiently heavy elements.

    With threshold t, priority sampling deterministically includes any
    element whose weight is at least 1/t, so both elements below must
    appear in the sample and this test never fails by chance.
    """
    sample = private_sampling.ThresholdSample(
        0.5, private_sampling.PrioritySamplingMethod)
    for key, weight in [("a", 2.0), ("b", 3.0)]:
        sample.process(key, weight)
    self.assertCountEqual(["a", "b"], sample.elements.keys())
def test_does_not_sample_negligible_weight_priority(self):
    """Verifies that a negligible-weight element is left out (priority).

    With the fixed threshold 1.0, an element of weight w is kept with
    probability min{w, 1}. Processing a single element of weight
    1/10000000 therefore leaves the sample empty except with
    probability 1/10000000.
    """
    sample = private_sampling.ThresholdSample(
        1.0, private_sampling.PrioritySamplingMethod)
    sample.process("a", 1.0 / FAILURE_PROBABILITY_INVERSE)
    self.assertEmpty(sample.elements)
def test_samples_high_weight_elements_ppswor(self):
    """Verifies that a heavy element is kept under PPSWOR sampling.

    With the fixed threshold 1.0, an element of weight w is kept with
    probability 1 - exp(-w). Using weight ln(10000000) makes the miss
    probability (i.e. the chance this test fails) exactly 1/10000000.
    """
    sample = private_sampling.ThresholdSample(
        1.0, private_sampling.PpsworSamplingMethod)
    heavy_weight = math.log(FAILURE_PROBABILITY_INVERSE, math.e)
    sample.process("a", heavy_weight)
    self.assertCountEqual(["a"], sample.elements.keys())
def test_does_not_sample_twice_ppswor(self):
    """Checks that an exception is raised when processing the same key twice.

    Processing a key that is already present in the sample must raise
    (the data is assumed to be pre-aggregated, so a duplicate indicates a
    caller bug). We first add an element heavy enough to be sampled with
    probability 1 - 1/10000000 and then process the same key again. As in
    test_samples_high_weight_elements_ppswor, the test fails spuriously
    with probability 1/10000000 (when the first element is not sampled).
    """
    s = private_sampling.ThresholdSample(
        1.0, private_sampling.PpsworSamplingMethod)
    s.process("a", math.log(FAILURE_PROBABILITY_INVERSE, math.e))
    # Only the duplicate insertion belongs inside assertRaises: keeping
    # construction and the first process() outside ensures the test cannot
    # pass because setup unexpectedly raised ValueError.
    with self.assertRaises(ValueError):
        s.process("a", 1)
def test_estimate_full_statistics_priority(self):
    """Checks the full-statistics estimate under priority sampling.

    Every element here is sampled with probability 1.0 (for threshold t,
    any element of weight at least 1/t is always kept, as in
    test_samples_high_weight_elements_priority), so the estimate must
    equal the true total weight exactly.
    """
    sample = private_sampling.ThresholdSample(
        0.5, private_sampling.PrioritySamplingMethod)
    for key, weight in [("a", 2.0), ("b", 3.0)]:
        sample.process(key, weight)
    self.assertEqual(sample.estimate_full_statistics(), 5.0)
def test_does_not_sample_negligible_weight_ppswor(self):
    """Verifies that a negligible-weight element is left out (PPSWOR).

    With the fixed threshold 1.0, an element of weight w is kept with
    probability 1 - exp(-w). The weight ln(10000000 / (10000000 - 1))
    makes the inclusion probability exactly 1/10000000, so the sample
    should be empty except with that probability.
    """
    sample = private_sampling.ThresholdSample(
        1.0, private_sampling.PpsworSamplingMethod)
    tiny_weight = math.log(
        FAILURE_PROBABILITY_INVERSE / (FAILURE_PROBABILITY_INVERSE - 1),
        math.e)
    sample.process("a", tiny_weight)
    self.assertEmpty(sample.elements)
def test_does_not_sample_twice_priority(self):
    """Checks that an exception is raised when processing the same key twice.

    Processing a key that is already present in the sample must raise
    (the data is assumed to be pre-aggregated, so a duplicate indicates a
    caller bug). The first element has weight 2.0 and is always sampled
    for priority sampling with threshold 0.5 (see
    test_samples_high_weight_elements_priority), so the second call is
    guaranteed to hit an existing key.
    """
    s = private_sampling.ThresholdSample(
        0.5, private_sampling.PrioritySamplingMethod)
    s.process("a", 2.0)
    # Only the duplicate insertion belongs inside assertRaises: keeping
    # construction and the first process() outside ensures the test cannot
    # pass because setup unexpectedly raised ValueError.
    with self.assertRaises(ValueError):
        s.process("a", 0.1)
def test_estimate_full_statistics_ppswor(self):
    """Checks the full-statistics estimate under PPSWOR sampling.

    The dataset holds a single element that is sampled with probability
    1 - 1/10000000 (as in test_samples_high_weight_elements_ppswor). We
    compare estimate_full_statistics against the value expected when the
    element is sampled, so the test fails with probability 1/10000000
    (when the element is missed).
    """
    sample = private_sampling.ThresholdSample(
        1.0, private_sampling.PpsworSamplingMethod)
    element_weight = math.log(FAILURE_PROBABILITY_INVERSE, math.e)
    sample.process("a", element_weight)
    inclusion_probability = (
        (FAILURE_PROBABILITY_INVERSE - 1) / FAILURE_PROBABILITY_INVERSE)
    self.assertEqual(sample.estimate_full_statistics(),
                     element_weight / inclusion_probability)
def test_high_delta_sample_stays_the_same(self, sampling_class, sampling_method):
    """Makes a non-private sample private, and checks it is the same (delta=1).

    Exercises the constructors that derive a private sample from an
    existing non-private threshold sample. With delta = 1.0 privacy adds
    no constraints, so the private sample must contain exactly the same
    elements as the non-private one.

    Args:
      sampling_class: The private sampling class to be tested
      sampling_method: The underlying sampling method
    """
    nonprivate_sample = private_sampling.ThresholdSample(0.5, sampling_method)
    for element_key in range(2000):
        nonprivate_sample.process(element_key, 1)
    private_sample = sampling_class.from_non_private(
        nonprivate_sample, eps=0.1, delta=1.0)
    self.assertCountEqual(
        nonprivate_sample.elements.keys(), private_sample.elements)
def test_samples_close_to_inclusion_probability_priority(self):
    """Confirms the empirical inclusion rate is close to 0.5 (priority).

    Processes n elements, each sampled with probability 0.5, into an
    empty sample, then checks that the number of kept elements lies in
    [0.49n, 0.51n]. The value of n is chosen via Chernoff bounds so that
    the check fails with probability at most 1/10000000.
    """
    # Allowed deviation around 0.5n.
    distance_from_half = 0.01
    # Number of elements needed, per the Chernoff bound.
    n = int((6.0 / (distance_from_half**2)) *
            math.log(2 * FAILURE_PROBABILITY_INVERSE, math.e) + 1)
    sample = private_sampling.ThresholdSample(
        0.5, private_sampling.PrioritySamplingMethod)
    for element_key in range(n):
        sample.process(element_key, 1.0)
    sampled_count = len(sample.elements)
    self.assertGreaterEqual(sampled_count, (0.5 - distance_from_half) * n)
    self.assertLessEqual(sampled_count, (0.5 + distance_from_half) * n)