def find_itemset(self, singletons, singleton_freq, single_test_user, multi_test_user):
    # step 1: build the itemset candidate set from the identified singletons
    set_cand_dict, set_cand_dict_inv = self.get_set_cand_thres_prod(singletons, singleton_freq)

    # step 2: estimate the itemset size distribution and pick a length limit
    # that covers 90% of the reporting users
    length_percentile = 0.9
    percentile_test_user = single_test_user + int(0.2 * (multi_test_user - single_test_user))
    length_distribution_set = self.test_length_itemset(
        single_test_user + 1, percentile_test_user, len(set_cand_dict), set_cand_dict)
    length_limit = self.find_percentile_set(length_distribution_set, length_percentile)

    # step 3: estimate itemset frequencies over the confined candidate set
    true_itemset_dist = self.data.test_cand_limit(
        percentile_test_user + 1, multi_test_user, set_cand_dict, length_limit)
    # pick GRR or OLH for this candidate domain and length limit
    use_grr, eps = self.set_grr(true_itemset_dist, length_limit)
    if use_grr:
        set_freq = fo.rr(true_itemset_dist, eps)[:-1]
    else:
        set_freq = fo.lh(true_itemset_dist, eps)[:-1]
    # rescale the group estimate and correct for the length-limit truncation
    set_freq *= single_test_user / length_percentile / (multi_test_user - percentile_test_user)
    self.update_tail_with_reporting_set(length_limit, length_distribution_set, set_cand_dict, set_freq)
    return self.build_set_result(singletons, singleton_freq, set_freq, set_cand_dict_inv)
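# --- Illustrative sketch (not part of the original module) ------------------
# find_itemset() above picks between two frequency oracles via self.set_grr().
# fo.rr() is read here as a generalized-randomized-response (GRR) oracle that
# takes a histogram of true reports plus a budget eps and returns unbiased
# per-candidate estimates. The helper below is a minimal, hypothetical GRR
# simulation along those lines; the name grr_estimate and its signature are
# assumptions, not the project's actual fo module.
import math
import numpy as np


def grr_estimate(true_counts, eps, rng=None):
    """Simulate GRR over users whose true values follow `true_counts` and
    return unbiased frequency estimates."""
    rng = rng or np.random.default_rng(0)
    d = len(true_counts)
    n = int(sum(true_counts))
    p = math.exp(eps) / (math.exp(eps) + d - 1)   # keep the true value w.p. p
    q = 1.0 / (math.exp(eps) + d - 1)             # report each other value w.p. q

    observed = np.zeros(d)
    for value, count in enumerate(true_counts):
        for _ in range(int(count)):
            if rng.random() < p:
                observed[value] += 1
            else:
                other = int(rng.integers(d - 1))  # one of the d - 1 other values
                observed[other if other < value else other + 1] += 1
    # unbias: E[observed_v] = n_v * p + (n - n_v) * q
    return (observed - n * q) / (p - q)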
def find(self):
    domain_len = self.args.n_bits
    next_gram_n = self.step_bit_n
    first_gram_n = int(math.log2(self.k)) + 1
    num_iterations = 1 + int(math.ceil((domain_len - first_gram_n) / next_gram_n))
    total_user = len(self.data.data)
    # split the users evenly across the iterations
    group_users = np.linspace(0, total_user, num_iterations + 1)

    # seed the candidate set with the k initial values
    cand_dict = {i: i for i in range(self.k)}
    cand_num = 1 << next_gram_n

    for i in range(num_iterations):
        shift_num = first_gram_n + i * next_gram_n
        prev_shift_num = first_gram_n + (i - 1) * next_gram_n

        # extend every surviving candidate with all possible next-gram suffixes
        new_cand_dict = {}
        count = 0
        for cand in cand_dict:
            for suffix in range(cand_num):
                new_cand_dict[(suffix << prev_shift_num) + cand] = count
                count += 1

        true_counts = self.data.suffix_tally(group_users[i], group_users[i + 1], new_cand_dict, shift_num)
        # append the users in this group whose value matches no candidate
        true_counts = np.append(true_counts, [int(group_users[1] - sum(true_counts))])
        est_counts = fo.lh(true_counts, self.args.eps)[:-1]
        est_counts = fo.filter_top(est_counts, self.k)

        # keep only the candidates that survive the top-k filter; scale the
        # per-group count up to the full population
        cand_dict = {}
        for cand, index in new_cand_dict.items():
            if est_counts[index] >= 0:
                cand_dict[cand] = est_counts[index] * num_iterations

    # candidates surviving the final round, with their estimated counts
    return cand_dict
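# --- Illustrative worked example (assumed parameters, not from the code) ----
# find() extends candidate prefixes by step_bit_n bits per round and splits
# the users evenly across rounds. With hypothetical settings n_bits = 32,
# k = 32 and step_bit_n = 4 the schedule works out as follows:
import math

n_bits, k, step_bit_n = 32, 32, 4
first_gram_n = int(math.log2(k)) + 1                                # 6-bit starting prefixes
num_iterations = 1 + int(math.ceil((n_bits - first_gram_n) / step_bit_n))
print(first_gram_n, num_iterations)                                 # -> 6 8
# Each round extends every surviving candidate with 2 ** step_bit_n = 16
# suffixes, so at most len(cand_dict) * 16 candidates are reported per round
# (512 here if the top-k filter keeps k = 32 candidates alive).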
def find_singleton(self, single_test_user):
    phase1_user = int(single_test_user * 0.4)
    phase2_user = phase1_user + int(0.1 * single_test_user)
    phase3_user = single_test_user

    # step 1: find the singleton candidate set
    true_singleton_dist = self.data.test_single(0, phase1_user)
    est_singleton_dist = fo.lh(true_singleton_dist, self.epsilon)
    top_singleton = 2 * self.top_k
    singleton_list, value_result = self.build_result(
        est_singleton_dist, range(len(est_singleton_dist)), top_singleton)

    # step 2: find an appropriate length limit for reporting
    key_result = {(item,): i for i, item in enumerate(singleton_list)}
    length_percentile = 0.9
    length_distribution = self.test_length_singleton(
        phase1_user + 1, phase2_user, len(singleton_list), singleton_list)
    length_limit = self.find_percentile_set(length_distribution, length_percentile)

    # step 3: estimate frequencies within the confined candidate set
    use_grr, eps = self.set_grr(key_result, length_limit)
    true_singleton_dist = self.data.test_singleton_cand_limit(
        phase2_user + 1, phase3_user, key_result, set(singleton_list), length_limit)
    if use_grr:
        value_estimates = fo.rr(true_singleton_dist, eps)[:-1]
    else:
        value_estimates = fo.lh(true_singleton_dist, eps)[:-1]
    # rescale the phase-3 estimates and correct for the length-limit truncation
    value_estimates *= single_test_user / length_percentile / (phase3_user - phase2_user)

    top_singleton = self.top_k
    key_list, est_freq = self.build_result(value_estimates, singleton_list, top_singleton)
    return key_list, est_freq
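# --- Illustrative sketch (not part of the original module) ------------------
# Step 2 above feeds a privately estimated length histogram into
# self.find_percentile_set() to pick the smallest reporting length that covers
# 90% of the users. The helper below is one plausible, hypothetical way to do
# that; the name percentile_length and the 1-based length convention are
# assumptions, not the project's actual implementation.
import numpy as np


def percentile_length(est_length_dist, percentile):
    """Return the smallest length whose cumulative estimated share of users
    reaches `percentile` (index i is taken to mean reporting length i + 1)."""
    dist = np.clip(np.asarray(est_length_dist, dtype=float), 0, None)
    cumulative = np.cumsum(dist) / dist.sum()
    return int(np.searchsorted(cumulative, percentile)) + 1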
def test_length_itemset(self, user_start, user_end, length_limit, cand_dict):
    true_length_dist = self.data.test_length_itemset(user_start, user_end, cand_dict, length_limit)
    est_length_dist = fo.lh(true_length_dist, self.epsilon)
    return est_length_dist
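# --- Illustrative sketch (not part of the original module) ------------------
# fo.lh() is used throughout as a local-hashing (OLH-style) frequency oracle:
# it takes a histogram of true reports plus a budget eps and returns unbiased
# per-value estimates. The helper below is a minimal, hypothetical OLH
# simulation along those lines; the name olh_estimate, the keyed-hash choice
# and the signature are assumptions, not the project's actual fo module.
import math
import random
import numpy as np


def olh_estimate(true_counts, eps, rng=None):
    """Simulate OLH over users whose true values follow `true_counts` and
    return unbiased frequency estimates."""
    rng = rng or random.Random(0)
    d = len(true_counts)
    n = int(sum(true_counts))
    g = int(round(math.exp(eps))) + 1              # hash range g = e^eps + 1
    p = math.exp(eps) / (math.exp(eps) + g - 1)    # keep the true bucket w.p. p

    def bucket(seed, value):
        # cheap keyed hash into [0, g); good enough for an illustration
        return random.Random(seed * 1_000_003 + value).randrange(g)

    reports = []
    for value, count in enumerate(true_counts):
        for _ in range(int(count)):
            seed = rng.randrange(1 << 30)
            y = bucket(seed, value)
            if rng.random() >= p:                  # perturb to one of the other buckets
                y = (y + rng.randrange(1, g)) % g
            reports.append((seed, y))

    # a report (seed, y) "supports" value v when v hashes to the reported bucket
    supports = np.zeros(d)
    for seed, y in reports:
        for value in range(d):
            if bucket(seed, value) == y:
                supports[value] += 1
    # unbias: E[supports_v] = n_v * p + (n - n_v) / g
    return (supports - n / g) / (p - 1.0 / g)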