Example #1
    def find_itemset(self, singletons, singleton_freq, single_test_user, multi_test_user):

        # step 1: build the itemset candidate set from the reported singletons
        set_cand_dict, set_cand_dict_inv = self.get_set_cand_thres_prod(singletons, singleton_freq)

        # step 2: estimate the itemset size distribution on 20% of the
        # remaining users and cap reports at the 90th-percentile length
        length_percentile = 0.9
        percentile_test_user = single_test_user + int(0.2 * (multi_test_user - single_test_user))
        length_distribution_set = self.test_length_itemset(single_test_user + 1, percentile_test_user,
                                                           len(set_cand_dict), set_cand_dict)
        length_limit = self.find_percentile_set(length_distribution_set, length_percentile)

        # step 3: estimate itemset frequencies over the confined candidate set
        true_itemset_dist = self.data.test_cand_limit(percentile_test_user + 1, multi_test_user, set_cand_dict,
                                                      length_limit)
        use_grr, eps = self.set_grr(true_itemset_dist, length_limit)

        # report with GRR when the candidate domain is small, OLH otherwise;
        # the trailing catch-all bucket is dropped before rescaling
        if use_grr:
            set_freq = fo.rr(true_itemset_dist, eps)[:-1]
        else:
            set_freq = fo.lh(true_itemset_dist, eps)[:-1]
        # rescale the group-level estimates and undo the length-percentile truncation
        set_freq *= single_test_user / length_percentile / (multi_test_user - percentile_test_user)

        self.update_tail_with_reporting_set(length_limit, length_distribution_set, set_cand_dict, set_freq)

        return self.build_set_result(singletons, singleton_freq, set_freq, set_cand_dict_inv)
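
Both Example #1 and Example #3 call find_percentile_set to turn an estimated length distribution into a reporting cap. Its body is not shown on this page; below is a minimal sketch, written as a free function, assuming the helper simply walks the cumulative estimated mass until it covers the target percentile (the bucket-to-length indexing is a guess):

    def find_percentile_set(length_distribution, percentile):
        # walk the cumulative estimated mass and stop at the first length
        # that covers the requested share of all reports
        total = sum(length_distribution)
        cumulative = 0.0
        for i, freq in enumerate(length_distribution):
            cumulative += freq
            if cumulative >= percentile * total:
                return i + 1  # assumes bucket i holds reports of length i + 1
        return len(length_distribution)

Note that frequency-oracle estimates can be negative, so the cumulative sum is not strictly monotone; the real helper may handle that case explicitly.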
Example #2
    def find(self):
        domain_len = self.args.n_bits
        next_gram_n = self.step_bit_n
        first_gram_n = int(math.log2(self.k)) + 1
        num_iterations = 1 + int(math.ceil((domain_len - first_gram_n) / next_gram_n))

        # partition the users into equal-sized groups, one per iteration
        total_user = len(self.data.data)
        group_users = np.linspace(0, total_user, num_iterations + 1)

        # seed candidates: the k initial grams 0..k-1 map to themselves
        cand_dict = {i: i for i in range(self.k)}

        cand_num = 1 << next_gram_n
        for i in range(num_iterations):
            shift_num = first_gram_n + i * next_gram_n
            prev_shift_num = first_gram_n + (i - 1) * next_gram_n

            # extend every surviving prefix with each possible next_gram_n-bit suffix
            new_cand_dict = {}
            count = 0
            for cand, freq in cand_dict.items():
                for suffix in range(cand_num):
                    new_cand_dict[(suffix << prev_shift_num) + cand] = count
                    count += 1

            true_counts = self.data.suffix_tally(group_users[i], group_users[i + 1], new_cand_dict, shift_num)

            # append a catch-all bucket for users whose prefix matches no candidate
            # (the groups are equal-sized, so group_users[1] is the group size)
            true_counts = np.append(true_counts, [int(group_users[1] - sum(true_counts))])
            est_counts = fo.lh(true_counts, self.args.eps)[:-1]
            est_counts = fo.filter_top(est_counts, self.k)

            # keep the prefixes that survive the top-k filter, scaling the
            # group-level estimates up to the whole population
            cand_dict = {}
            for cand, index in new_cand_dict.items():
                if est_counts[index] >= 0:
                    cand_dict[cand] = est_counts[index] * num_iterations
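
The core of Example #2 is the prefix-extension arithmetic: each surviving prefix is combined with every possible next_gram_n-bit suffix by shifting the suffix above the prefix's existing bits. A tiny standalone illustration with made-up bit widths (the shift here plays the role of prev_shift_num above):

    prefix_bits, suffix_bits = 3, 2        # illustrative widths only
    cands = [0b101, 0b010]                 # two surviving 3-bit prefixes
    cand_num = 1 << suffix_bits            # 4 possible 2-bit suffixes
    extended = [(suffix << prefix_bits) + cand
                for cand in cands
                for suffix in range(cand_num)]
    print([bin(c) for c in extended])      # 8 five-bit candidates

Each iteration thus multiplies the candidate pool by 2**next_gram_n before the top-k filter prunes it back down.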
Example #3
    def find_singleton(self, single_test_user):
        # split the reporting users into three phases: 40% for candidate
        # selection, 10% for length estimation, and the rest for frequency
        # estimation
        phase1_user = int(single_test_user * 0.4)
        phase2_user = phase1_user + int(0.1 * single_test_user)
        phase3_user = single_test_user

        # step 1: find singleton candidate set
        true_singleton_dist = self.data.test_single(0, phase1_user)
        est_singleton_dist = fo.lh(true_singleton_dist, self.epsilon)
        # keep 2k candidates at this stage; the final top-k cut happens after
        # the refined estimates in step 3
        top_singleton = 2 * self.top_k
        singleton_list, value_result = self.build_result(
            est_singleton_dist, range(len(est_singleton_dist)), top_singleton)

        # step 2: find an appropriate length
        # index the candidates by singleton 1-tuple keys
        key_result = {(s,): i for i, s in enumerate(singleton_list)}

        length_percentile = 0.9
        length_distribution = self.test_length_singleton(
            phase1_user + 1, phase2_user, len(singleton_list), singleton_list)
        length_limit = self.find_percentile_set(length_distribution,
                                                length_percentile)

        # step 3: test with the confined set
        use_grr, eps = self.set_grr(key_result, length_limit)
        true_singleton_dist = self.data.test_singleton_cand_limit(
            phase2_user + 1, phase3_user, key_result, set(singleton_list),
            length_limit)

        # report with GRR when the candidate domain is small, OLH otherwise;
        # the trailing catch-all bucket is dropped before rescaling
        if use_grr:
            value_estimates = fo.rr(true_singleton_dist, eps)[:-1]
        else:
            value_estimates = fo.lh(true_singleton_dist, eps)[:-1]
        # scale the phase-3 group counts up to all single_test_user users and
        # compensate for the fraction of reports cut by the length limit
        value_estimates *= single_test_user / length_percentile / (
            phase3_user - phase2_user)

        top_singleton = self.top_k
        key_list, est_freq = self.build_result(value_estimates, singleton_list,
                                               top_singleton)
        return key_list, est_freq
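
build_result appears twice in Example #3 but its body is not part of this snippet. A plausible sketch, written as a free function, assuming it simply pairs the top_k largest estimates with their keys (the real helper may additionally drop negative estimates):

    import numpy as np

    def build_result(estimates, keys, top_k):
        # rank keys by estimated frequency, largest first, and keep top_k
        keys = list(keys)
        order = np.argsort(estimates)[::-1][:top_k]
        return [keys[j] for j in order], np.asarray(estimates)[order]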
Example #4
    def test_length_itemset(self, user_start, user_end, length_limit, cand_dict):
        true_length_dist = self.data.test_length_itemset(user_start, user_end, cand_dict, length_limit)
        est_length_dist = fo.lh(true_length_dist, self.epsilon)
        return est_length_dist
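
Examples #1 and #3 both branch on use_grr, returned by set_grr. The standard rule for choosing between generalized randomized response (GRR) and optimized local hashing (OLH) compares their variances: GRR wins only when the reporting domain is small relative to e^eps. A sketch of that rule (how set_grr derives the domain size from the candidate set and length_limit is an assumption here):

    import math

    def choose_grr(domain_size, eps):
        # GRR's variance beats OLH's only when domain_size < 3 * e^eps + 2
        return domain_size < 3 * math.exp(eps) + 2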