def by_impressions(self, impressions: List[int], max_frequency: int = 1) -> ReachPoint:
    """Estimates reach for a hypothetical number of impressions.

    Args:
      impressions: Single-element list holding the hypothetical number of
        impressions that are shown.
      max_frequency: Number of frequency levels for which reach is reported.

    Returns:
      A ReachPoint with one k+ reach value per reported frequency.  A spend
      of `impressions[0] * self._cpi` is attached when `self._cpi` is truthy.

    Raises:
      ValueError: If `impressions` does not contain exactly one element.
    """
    if len(impressions) != 1:
        raise ValueError("Impressions vector must have a length of 1.")

    impression_count = impressions[0]
    # The model is (re)fit with one more impression than was requested.
    self._fit(impression_count + 1)
    # Fraction of the model's maximum impressions that is delivered, capped at 1.
    delivered_fraction = min(impression_count / self._max_impressions, 1.0)

    lower_levels = np.arange(1, max_frequency)
    lower_reach = self._max_reach * self._dist.kreach(lower_levels, delivered_fraction)
    top_reach = self._max_reach * self._dist.kplusreach(max_frequency, delivered_fraction)

    # Reverse cumulative sum: entry i becomes the sum of entries i..end.
    histogram = list(lower_reach) + [top_reach]
    kplus_reaches = np.cumsum(histogram[::-1])[::-1]

    if not self._cpi:
        return ReachPoint(impressions, kplus_reaches)
    return ReachPoint(impressions, kplus_reaches, [impression_count * self._cpi])
def by_impressions(
    self, impressions: List[int], max_frequency: int = 1
) -> ReachPoint:
    """Estimates reach for a hypothetical number of impressions.

    Args:
      impressions: Single-element list holding the hypothetical number of
        impressions that are shown.
      max_frequency: Number of frequency levels for which reach is reported.

    Returns:
      A ReachPoint holding the first `max_frequency` k+ reach values.  A
      spend of `impressions[0] * self._cpi` is attached when `self._cpi` is
      truthy.

    Raises:
      ValueError: If `impressions` does not contain exactly one element.
    """
    if len(impressions) != 1:
        raise ValueError("Impressions vector must have a length of 1.")

    # The histogram is evaluated at no more than `self._max_impressions - 1`.
    capped_impressions = min(impressions[0], self._max_impressions - 1)
    histogram = self._expected_histogram(
        capped_impressions,
        self._max_impressions,
        self.max_reach,
        self._alpha,
        self._beta,
        MAXIMUM_COMPUTATIONAL_FREQUENCY,
    )
    reported = self._kplus_reaches_from_frequencies(histogram)[:max_frequency]

    if not self._cpi:
        return ReachPoint(impressions, reported)
    return ReachPoint(impressions, reported, [impressions[0] * self._cpi])
def _shuffle_distance(xpoint: ReachPoint, ypoint: ReachPoint, k=5) -> float:
    """Computes the shuffle distance over the first k frequency buckets.

    Args:
      xpoint: First point; must expose `max_frequency` and `frequency(i)`.
      ypoint: Second point, with the same interface.
      k: Number of leading frequency buckets (1..k) that are compared.

    Returns:
      1.0 if either point has `max_frequency <= k`; 0.5 if either point's
      first k buckets sum to zero; otherwise half the sum of absolute
      differences between the two normalized bucket vectors.
    """
    both_deep_enough = xpoint.max_frequency > k and ypoint.max_frequency > k
    if not both_deep_enough:
        return 1.0

    levels = range(1, k + 1)
    x_buckets = np.array([xpoint.frequency(level) for level in levels])
    y_buckets = np.array([ypoint.frequency(level) for level in levels])

    x_total = sum(x_buckets)
    y_total = sum(y_buckets)
    if x_total == 0 or y_total == 0:
        return 0.5

    gap = np.abs(x_buckets / x_total - y_buckets / y_total)
    return 0.5 * np.sum(gap)
def true_reach_by_spend(self, spend: float, max_frequency: int = 1) -> ReachPoint:
    """Returns the true reach obtained for a given spend.

    Args:
      spend: The hypothetical amount spent.
      max_frequency: The maximum frequency for which to report reach.

    Returns:
      A ReachPoint whose impressions entry is the sum of the per-user counts
      returned by the publisher data for this spend, whose k+ reaches are
      derived from those same counts, and whose spend is `[spend]`.
    """
    counts_per_user = self._publisher_data.user_counts_by_spend(spend)
    total_impressions = sum(counts_per_user.values())
    kplus_reaches = ReachPoint.user_counts_to_kplus_reaches(
        counts_per_user, max_frequency
    )
    return ReachPoint([total_impressions], kplus_reaches, [spend])
def test_fit_exponential_poisson_model(self):
    """Checks `_fit_exponential_poisson_model` on two reach points.

    The first point carries only a 1+ reach; the second carries a ten-level
    k+ reach histogram.  Each point is fit by the model that was constructed
    from it.
    """
    p1 = ReachPoint([20000], [10000])
    kgpm1 = KInflatedGammaPoissonModel([p1])
    N1, dist1 = kgpm1._fit_exponential_poisson_model(p1)
    self.assertAlmostEqual(N1, 13333.33, delta=1)
    self.assertAlmostEqual(dist1._alpha, 1.0)
    self.assertAlmostEqual(dist1._beta, 2.0, delta=0.1)

    p2 = ReachPoint([19971], [7993, 4815, 2914, 1759, 1011, 604, 355, 214, 122, 75])
    kgpm2 = KInflatedGammaPoissonModel([p2])
    # Fit with the model built from p2 (not kgpm1) so that the second
    # scenario does not depend on the first one's model instance.
    N2, dist2 = kgpm2._fit_exponential_poisson_model(p2)
    self.assertAlmostEqual(N2, 8564, delta=1)
    self.assertAlmostEqual(dist2._alpha, 1.0)
    self.assertAlmostEqual(dist2._beta, 1.8, delta=0.1)
def reach_by_spend(self, spends: Iterable[float], max_frequency: int = 10) -> ReachPoint:
    """Number of people reached for a given spend.

    Args:
      spends: A list of spend amounts.  The length of the list must equal
        the value of publisher_count.  Specifies the amount spent with each
        publisher.
      max_frequency: int, The maximum frequency that should be counted.  All
        frequencies above this amount will be grouped into a single bucket.

    Returns:
      A ReachPoint object representing the k+ reach for each frequency in
      the range 1..max_frequency.

    Raises:
      ValueError: If `len(spends)` differs from `self.publisher_count`.
    """
    if len(spends) != self.publisher_count:
        message = "Invalid spends vector length. Got {}, expected {}".format(
            len(spends), self.publisher_count
        )
        raise ValueError(message)

    # Per-user impression totals accumulated across all publishers.
    combined_counts = defaultdict(int)
    impressions = []
    for publisher_index, publisher_spend in enumerate(spends):
        publisher_counts = self._data[publisher_index].user_counts_by_spend(
            publisher_spend
        )
        impressions.append(sum(publisher_counts.values()))
        for user_id, frequency in publisher_counts.items():
            combined_counts[user_id] += frequency

    return ReachPoint(
        impressions,
        self._counts_to_histogram(combined_counts, max_frequency),
        spends,
    )
def test_expected_histogram(self):
    """Checks the length and the three values returned by `_expected_histogram`."""
    model = GammaPoissonModel([ReachPoint([20], [10])])
    histogram = model._expected_histogram(4, 12, 16, 1, 1, max_freq=3)
    self.assertLen(histogram, 3)
    self.assertAlmostEqual(histogram[0], 6)
    for position, expected in ((1, 3 / 2), (2, 3 / 8)):
        self.assertAlmostEqual(histogram[position], expected)
def test_logpmf(self):
    """Compares `_logpmf` with a numerically integrated Gamma-Poisson pmf."""
    # The coded implementation of the Gamma-Poisson makes use of the fact
    # that a Gamma-Poisson with parameters (alpha, beta) is equivalent
    # to a negative binomial with parameters (p, r) =
    # (beta / (1 + beta), alpha).  In this test, we compute the
    # Gamma-Poisson directly through numerical integration and compare
    # it to the values computed via the negative binomial.
    def gamma_poisson_integrand(k, mu, alpha, beta):
        return scipy.stats.poisson.pmf(k, mu) * scipy.stats.gamma.pdf(
            mu, alpha, scale=1.0 / beta
        )

    def gamma_poisson_pmf(k, alpha, beta):
        # `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
        return scipy.integrate.quad(
            lambda x: gamma_poisson_integrand(k, x, alpha, beta), 0.0, np.inf
        )[0]

    gpm = GammaPoissonModel([ReachPoint([20], [10])])
    # In every case the first argument of `_logpmf` is one greater than the
    # count handed to the reference pmf.
    self.assertAlmostEqual(
        gpm._logpmf(1, 1.0, 1.0), np.log(gamma_poisson_pmf(0, 1.0, 1.0))
    )
    self.assertAlmostEqual(
        gpm._logpmf(2, 1.0, 1.0), np.log(gamma_poisson_pmf(1, 1.0, 1.0))
    )
    self.assertAlmostEqual(
        gpm._logpmf(2, 3.0, 1.0), np.log(gamma_poisson_pmf(1, 3.0, 1.0))
    )
    self.assertAlmostEqual(
        gpm._logpmf(2, 1.0, 4.0), np.log(gamma_poisson_pmf(1, 1.0, 4.0))
    )
def test_by_spend(self):
    """Checks `by_spend` reach values for several spend vectors and frequencies."""
    surface = FakeReachSurface([ReachPoint([1000, 2000], [1000])])
    surface._fit()

    # Each row: (positional args for by_spend, positional args for reach, expected).
    cases = [
        (([1, 2],), (), 300),
        (([1, 2], 3), (2,), 150),
        (([1, 2], 3), (3,), 100),
        (([0.5, 1], 3), (3,), 50),
    ]
    for by_spend_args, reach_args, expected in cases:
        point = surface.by_spend(*by_spend_args)
        self.assertEqual(point.reach(*reach_args), expected)
def by_impressions(self, impressions: [int], max_frequency: int = 1) -> ReachPoint:
    """Returns the estimated reach for a given impression vector.

    The k+ reach reported for frequency k is the total number of impressions,
    capped at `self.max_reach`, floor-divided by k.

    Args:
      impressions: Impression counts; they are summed across entries.
      max_frequency: Number of frequency levels to report.

    Returns:
      A ReachPoint built from `impressions` and the k+ values for
      k = 1..max_frequency.
    """
    capped_total = min(sum(impressions), self.max_reach)
    kplus_values = []
    for level in range(1, max_frequency + 1):
        kplus_values.append(capped_total // level)
    return ReachPoint(impressions, kplus_values)
def test_exponential_poisson_N(self):
    """Checks `_exponential_poisson_N` for two (impressions, reach) pairs."""
    model = KInflatedGammaPoissonModel([ReachPoint([100], [100])])
    # Each row: (first argument, second argument, expected value).
    cases = [
        (19971, 7992, 9769),
        (20000, 10000, 13333.33),
    ]
    for first, second, expected in cases:
        self.assertAlmostEqual(
            model._exponential_poisson_N(first, second), expected, delta=1
        )
def test_fit_variable_N(self, mock_gamma_poisson_model):
    """Checks that `_fit` stores the max reach and alpha supplied by the mock."""
    mock_gamma_poisson_model.return_value = (30000, 10000, 1.0, 2.0)

    observed_kplus = [2853, 813, 230, 64, 17, 4, 1, 0, 0, 0]
    model = GammaPoissonModel([ReachPoint([4000], observed_kplus)])
    model._fit()

    self.assertAlmostEqual(model._max_reach, 10000, delta=1)
    self.assertAlmostEqual(model._alpha, 1.0, delta=0.01)
def test_kreach(self):
    """Checks individual entries of `_kreach` for several argument sets."""
    model = GammaPoissonModel([ReachPoint([20], [10])])
    # Each row: (first argument, third argument, result index, expected value).
    cases = [
        ([0], 2, 0, 1 / 3),
        ([0], 3, 0, 1 / 2),
        ([1], 3, 0, 3 / 8),
        ([0, 1, 2], 3, 0, 1 / 2),
        ([0, 1, 2], 3, 1, 3 / 8),
        ([0, 1, 2], 3, 2, 3 / 32),
    ]
    for first_arg, third_arg, position, expected in cases:
        result = model._kreach(first_arg, 1, third_arg, 1, 1)
        self.assertAlmostEqual(result[position], expected)
def test_exponential_poisson_reach(self):
    """Checks `_exponential_poisson_reach` for two parameter triples."""
    model = KInflatedGammaPoissonModel([ReachPoint([100], [100])])

    exact_case = model._exponential_poisson_reach(20000, 10000, 3.0)
    self.assertAlmostEqual(exact_case, 8000.0)

    loose_case = model._exponential_poisson_reach(19971, 12560, 26.19)
    self.assertAlmostEqual(loose_case, 7888.78, delta=1)
def test_exponential_poisson_N_from_beta(self):
    """Checks `_exponential_poisson_N_from_beta` for two parameter triples."""
    model = KInflatedGammaPoissonModel([ReachPoint([100], [100])])

    loose_case = model._exponential_poisson_N_from_beta(19971, 7992, 2.41)
    self.assertAlmostEqual(loose_case, 9416.0, delta=1)

    exact_case = model._exponential_poisson_N_from_beta(30000, 10000, 2)
    self.assertAlmostEqual(exact_case, 10000.0)
def by_spend(self, spend: [float], max_frequency: int = 1) -> ReachPoint:
    """Returns the estimated reach for a given spend vector.

    Every spend entry is converted to impressions by multiplying it by 100.
    The k+ reach reported for frequency k is the total of those impressions,
    capped at `self.max_reach`, floor-divided by k.

    Args:
      spend: Spend amounts, one entry per publisher.
      max_frequency: Number of frequency levels to report.

    Returns:
      A ReachPoint built from the derived impressions, the k+ values for
      k = 1..max_frequency, and `spend`.
    """
    impressions = [100 * amount for amount in spend]
    capped_total = min(sum(impressions), self.max_reach)
    kplus_values = []
    for level in range(1, max_frequency + 1):
        kplus_values.append(capped_total // level)
    return ReachPoint(impressions, kplus_values, spend)
def test_by_impressions(self, mock_gamma_poisson_model):
    """Checks the k+ reaches and spend that `by_impressions` predicts.

    The fitting routine is mocked, so the model parameters are fixed by the
    mock's return value.  Each predicted k+ reach must agree with its
    expected value to within a squared error of 0.1 relative to the expected
    value.
    """
    mock_gamma_poisson_model.return_value = (25000, 5.0, 2.0)
    # Imax = 25000, N = 10000, alpha = 5, beta = 2
    h_training = [8124, 5464, 3191, 1679, 815, 371, 159, 64, 23, 6, 0]
    training_point = ReachPoint([20000], h_training, [200.0])
    gpm = GammaPoissonModel([training_point], max_reach=10000)
    gpm._fit()

    rp = gpm.by_impressions([10000], max_frequency=5)
    h_expected = np.array([9682, 8765, 7353, 5750, 4233])
    h_actual = np.array([int(rp.reach(i)) for i in range(1, 6)])

    self.assertAlmostEqual(rp.spends[0], 100.0)
    for i, (actual, expected) in enumerate(zip(h_actual, h_expected)):
        # Normalize by the expected value, which is a non-zero constant;
        # dividing by the model output would fail to evaluate cleanly if the
        # model ever returned 0.
        self.assertLess(
            (actual - expected) ** 2 / expected,
            0.1,
            f"Discrepancy found at position {i}. "
            f"Got {actual} Expected {expected}",
        )
def generate_sample_reach_curves(self, num_publishers, decay_rate, universe_size):
    """Builds one fitted LinearCappedReachCurve per publisher.

    Publisher number n (starting at 0) gets a maximum reach of
    `universe_size * decay_rate ** n`.  Its curve is constructed from a
    single ReachPoint whose impressions and reach both equal that maximum.

    Args:
      num_publishers: Number of curves to generate.
      decay_rate: Multiplicative factor applied per publisher index.
      universe_size: Maximum reach of the first publisher.

    Returns:
      The list of fitted curves, in publisher order.
    """
    reach_curves = []
    for pub_num in range(num_publishers):
        max_reach = universe_size * (decay_rate ** pub_num)
        anchor_point = ReachPoint([max_reach], (max_reach,))
        curve = LinearCappedReachCurve([anchor_point])
        curve._fit()
        reach_curves.append(curve)
    return reach_curves
def generate_true_reach_independent_two_pubs(self, universe_size, reach_curves, impressions):
    """Combines single-publisher reaches using `sum - product / universe_size`.

    With exactly two publishers this is r1 + r2 - r1 * r2 / universe_size.
    NOTE(review): as the name indicates, the formula is only the
    independent-union reach for two curves; the code does not check the
    number of curves.

    Args:
      universe_size: Divisor applied to the product of the reaches.
      reach_curves: Reach curves, paired positionally with `impressions`.
      impressions: Impressions delivered by each publisher.

    Returns:
      A ReachPoint holding `impressions` and the combined 1+ reach.
    """
    reach_vector = [
        reach_curve.by_impressions([impression]).reach()
        for reach_curve, impression in zip(reach_curves, impressions)
    ]
    overlap = reduce(operator.mul, reach_vector) / universe_size
    reach = sum(reach_vector) - overlap
    return ReachPoint(impressions, [reach])
def test_overspend(self, mock_gamma_poisson_model):
    """Checks that impressions or spend beyond the training point give reach 10000."""
    # Imax = 25000, N = 10000, alpha = 5, beta = 2
    h_training = [8124, 5464, 3191, 1679, 815, 371, 159, 64, 23, 6, 0]
    training_point = ReachPoint([20000], h_training, [200.0])
    model = GammaPoissonModel([training_point], max_reach=10000)
    model._fit()

    reach_from_impressions = model.by_impressions([30000]).reach(1)
    self.assertAlmostEqual(reach_from_impressions, 10000.0, delta=0.1)

    reach_from_spend = model.by_spend([300]).reach(1)
    self.assertAlmostEqual(reach_from_spend, 10000.0, delta=0.1)
def test_user_counts_to_frequencies(self):
    """Checks `ReachPoint.user_counts_to_frequencies` with max frequency 3."""
    # Each row: (user id -> impression count, expected result).
    cases = [
        ({}, [0, 0, 0]),
        ({3: 1}, [1, 0, 0]),
        ({3: 1, 2: 1}, [2, 0, 0]),
        ({3: 1, 2: 1, 1: 2, 4: 3, 5: 4}, [2, 1, 2]),
    ]
    for user_counts, expected in cases:
        self.assertEqual(
            ReachPoint.user_counts_to_frequencies(user_counts, 3), expected
        )
def test_by_impressions(self, mock_fit_point):
    """Checks the k+ reaches and spend that `by_impressions` predicts.

    The point-fitting routine is mocked, so the fitted distribution is fixed
    by the mock's return value.  Each predicted k+ reach must agree with its
    expected value to within a squared error of 0.1 relative to the expected
    value.
    """
    mock_fit_point.return_value = (
        10000,
        KInflatedGammaPoissonDistribution(5.0, 2.0, []),
    )
    # Imax = 25000, N = 10000, alpha = 5, beta = 2
    h_training = [7412, 4233, 2014, 842, 320, 112, 37, 11, 2]
    training_point = ReachPoint([15000], h_training, [200.0])
    kgpm = KInflatedGammaPoissonModel([training_point])
    kgpm._fit()

    rp = kgpm.by_impressions([10000], max_frequency=5)
    h_expected = np.array([6056, 2629, 925, 283, 78])
    h_actual = np.array([int(rp.reach(i)) for i in range(1, 6)])

    self.assertAlmostEqual(rp.spends[0], 133.0, delta=1)
    for i, (actual, expected) in enumerate(zip(h_actual, h_expected)):
        # Normalize by the expected value, which is a non-zero constant;
        # dividing by the model output would fail to evaluate cleanly if the
        # model ever returned 0.
        self.assertLess(
            (actual - expected) ** 2 / expected,
            0.1,
            f"Discrepancy found at position {i}. "
            f"Got {actual} Expected {expected}",
        )
def generate_true_reach(self, a, reach_curves, impressions, spends=None):
    """Combines single-publisher reaches with pairwise correction terms.

    Starting from the sum of the single-publisher reaches, every ordered
    pair (i, j) of curves subtracts
    `a[i * p + j] * reach_i * reach_j / (2 * max(max_reach_i, max_reach_j))`,
    where p is the number of curves.

    Args:
      a: Flat sequence of pair coefficients, indexed as `i * p + j`.
      reach_curves: Reach curves, paired positionally with `impressions`.
      impressions: Impressions delivered by each publisher.
      spends: Optional spends, passed through to the ReachPoint unchanged.

    Returns:
      A ReachPoint holding `impressions`, the combined reach and `spends`.
    """
    curve_count = len(reach_curves)
    single_reaches = [
        curve.by_impressions([delivered]).reach()
        for curve, delivered in zip(reach_curves, impressions)
    ]

    combined = sum(single_reaches)
    for i, curve_i in enumerate(reach_curves):
        for j, curve_j in enumerate(reach_curves):
            numerator = a[i * curve_count + j] * single_reaches[i] * single_reaches[j]
            denominator = max(curve_i.max_reach, curve_j.max_reach) * 2
            combined -= numerator / denominator
    return ReachPoint(impressions, [combined], spends)
def test_knreach(self):
    """Checks `_knreach` with scalar and array-valued second arguments."""
    model = GammaPoissonModel([ReachPoint([20], [10])])
    # Each row: (positional args, index into the result or None, expected).
    cases = [
        ((1, 1, 1, 2, 1.0, 1.0), None, 0.25),
        ((2, 1, 1, 2, 1.0, 1.0), None, 0),
        ((1, 1, 1, 2, 1.0, 2.0), None, 1.0 / 6.0),
        ((1, np.array([1]), 1, 2, 1.0, 1.0), 0, 1 / 4),
        ((1, np.array([1, 2]), 1, 2, 1.0, 1.0), 0, 1 / 4),
        ((1, np.array([1, 2]), 1, 2, 1.0, 1.0), 1, 1 / 8),
        ((1, 3, 1, 3, 1.0, 1.0), None, 3 * (1 / 3) * (2 / 3) ** 2 * (1 / 8)),
    ]
    for args, position, expected in cases:
        result = model._knreach(*args)
        if position is not None:
            result = result[position]
        self.assertAlmostEqual(result, expected)
def test_user_counts_to_kplus_reaches(self):
    """Checks `ReachPoint.user_counts_to_kplus_reaches` with max frequency 3."""
    # Each row: (user id -> impression count, expected k+ reaches).
    cases = [
        ({}, [0, 0, 0]),
        ({3: 1}, [1, 0, 0]),
        ({3: 2}, [1, 1, 0]),
        ({3: 1, 2: 1}, [2, 0, 0]),
        ({3: 1, 2: 1, 1: 2, 4: 3, 5: 4}, [5, 3, 2]),
    ]
    for user_counts, expected in cases:
        self.assertEqual(
            ReachPoint.user_counts_to_kplus_reaches(user_counts, 3), expected
        )
def fit(self, halo: HaloSimulator, params: SystemParameters, budget: PrivacyBudget) -> ReachSurface:
    """Returns a GoergModel fit to one fixed reach point.

    None of the arguments is read; the model is always built from a
    ReachPoint with impressions [2], reach [2] and spend [2.0].
    """
    fixed_point = ReachPoint([2], [2], [2.0])
    model = GoergModel([fixed_point])
    model._fit()
    return model
def by_impressions(self, impressions: Iterable[int], max_frequency: int = 1) -> ReachPoint:
    """Returns the combined 1+ reach for an impression vector.

    With r = `self.get_reach_vector(impressions)`, the reach is
    `N * sum_j (1 - prod_i (1 - a[j + i * c] * r[i] / N))`, where j runs over
    `range(self._c)`, i over `range(self._p)` and N is `self._N`.

    Args:
      impressions: Impressions per publisher.
      max_frequency: Accepted for interface compatibility; not read here.

    Returns:
      A ReachPoint holding `impressions` and the single combined reach.
    """
    per_pub_reach = self.get_reach_vector(impressions)
    total = 0
    for component in range(self._c):
        product = 1
        for pub in range(self._p):
            coefficient = self.a[component + pub * self._c]
            product *= 1 - ((coefficient * per_pub_reach[pub]) / self._N)
        total += 1 - product
    total *= self._N
    return ReachPoint(impressions, [total])
def simulated_reach_by_spend(
    self,
    spends: List[float],
    budget: PrivacyBudget,
    privacy_budget_split: float = 0.5,
    max_frequency: int = 1,
) -> ReachPoint:
    """Adds the reach of the first two spends evaluated separately on `self.curve`.

    Args:
      spends: Spend vector; only `spends[0]` and `spends[1]` are evaluated.
      budget: Accepted for interface compatibility; not read here.
      privacy_budget_split: Accepted for interface compatibility; not read
        here.
      max_frequency: Number of frequency levels to report.

    Returns:
      A ReachPoint whose impressions are those of the two single-spend
      points, whose k+ reaches are the element-wise sums of the two points'
      reaches for k = 1..max_frequency, and whose spends are `spends`.
    """
    rp1 = self.curve.by_spend([spends[0]], max_frequency=max_frequency)
    rp2 = self.curve.by_spend([spends[1]], max_frequency=max_frequency)
    kplus_reaches = [
        rp1.reach(k) + rp2.reach(k) for k in range(1, max_frequency + 1)
    ]
    return ReachPoint(
        [rp1.impressions[0], rp2.impressions[0]], kplus_reaches, spends
    )
def _generate_reach_points_from_venn_diagram(
    self, spends: List[float], primitive_regions: Dict[int, int]
) -> List[ReachPoint]:
    """Return the reach points of the powerset of active publishers.

    A publisher is active when its spend is truthy.  For every non-empty
    subset of active publishers, one reach point is produced whose reach is
    aggregated from the primitive Venn diagram regions for that subset.
    The generated reach points contain 1+ reaches only.

    Args:
      spends: The hypothetical spend vector, equal in length to the number
        of publishers.  spends[i] is the amount that is spent with
        publisher i.
      primitive_regions: A dictionary in which each key is the binary
        representation of a primitive region of the Venn diagram, and each
        value is the reach in the corresponding region.  The set bits of
        the key identify the publishers forming that region; for example,
        key = 5 = bin('101') is the region which belongs to pub_id-0 and
        id-2.

    Returns:
      A list of ReachPoint.  Each reach point maps the spends of a subset of
      active publishers to the number of people reached by that subset;
      impressions and spends of publishers outside the subset are zeroed.
    """
    publisher_count = len(spends)
    active_publishers = [
        pub_id for pub_id in range(publisher_count) if spends[pub_id]
    ]
    subset_sizes = range(1, len(active_publishers) + 1)
    active_subsets = chain.from_iterable(
        combinations(active_publishers, size) for size in subset_sizes
    )

    impressions = self._data_set.impressions_by_spend(spends)

    reach_points = []
    for subset_ids in active_subsets:
        subset_reach = self._aggregate_reach_in_primitive_venn_diagram_regions(
            subset_ids, primitive_regions
        )
        members = set(subset_ids)
        # 0/1 indicator used to zero out publishers outside the subset.
        indicator = np.array(
            [int(pub_id in members) for pub_id in range(publisher_count)]
        )
        subset_impressions = (np.array(impressions) * indicator).tolist()
        subset_spends = (np.array(spends) * indicator).tolist()
        reach_points.append(
            ReachPoint(subset_impressions, [subset_reach], subset_spends)
        )
    return reach_points
def by_impressions(self, impressions: [int], max_frequency: int = 1) -> ReachPoint:
    """Returns the estimated reach as a function of impressions.

    The k+ reach for frequency k is
    `self._rho * (I / (I + self._beta)) ** k`, with I = `impressions[0]`.

    Args:
      impressions: list of ints of length 1, specifying the hypothetical
        number of impressions that are shown.
      max_frequency: int, specifies the number of frequencies for which
        reach will be reported.

    Returns:
      A ReachPoint specifying the estimated reach for this number of
      impressions.  A spend of `impressions[0] * self._cpi` is attached when
      `self._cpi` is truthy.

    Raises:
      ValueError: If `impressions` does not contain exactly one element.
    """
    if len(impressions) != 1:
        raise ValueError("Impressions vector must have a length of 1.")

    shown = impressions[0]
    kplus_reach_list = [
        self._rho * (shown / (shown + self._beta)) ** level
        for level in range(1, max_frequency + 1)
    ]

    if not self._cpi:
        return ReachPoint(impressions, kplus_reach_list)
    return ReachPoint(impressions, kplus_reach_list, [shown * self._cpi])