def run_server_side(self, word_frequency):
    priv_count_sketch = PrivateCountSketch(self.l, self.w, self.epsilon)

    # Simulate the clients: every occurrence of a word is privatised and added to the sketch
    for index, row in word_frequency.iterrows():
        for i in range(row['trueFrequency']):
            priv_count_sketch.set_sketch_element(row['word'])

    # Server-side: query the frequency oracle for every word in the domain
    priv_frequency = [0] * len(word_frequency)
    priv_error = [0] * len(word_frequency)
    for i, word in enumerate(word_frequency['word']):
        priv_frequency[i] = int(priv_count_sketch.freq_oracle(word))
        priv_error[i] = int(priv_frequency[i] - word_frequency['trueFrequency'][i])

    word_frequency['privateFreq_run_freq'] = priv_frequency
    word_frequency['privateFreq_run_error'] = priv_error
    print(word_frequency)
    return word_frequency
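# Illustrative usage sketch (not part of the module): the class name and constructor
# arguments below are assumptions inferred from the attributes used above
# (self.l, self.w, self.epsilon); substitute the real class name and parameters.
#
#   import pandas as pd
#   word_frequency = pd.DataFrame({"word": ["the", "cat", "dog"],
#                                  "trueFrequency": [500, 120, 45]})
#   sim = TreeHistogramSimulation(l=250, w=2048, epsilon=3)          # hypothetical
#   estimates = sim.run_server_side(word_frequency)
#   # estimates gains 'privateFreq_run_freq' and 'privateFreq_run_error' columns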
def run(self, data, domain):
    # -------------------- Simulating the client-side process --------------------
    priv_count_sketch = PrivateCountSketch(self.l, self.w, self.epsilon, use_median=self.use_median)
    for item in data:
        priv_count_sketch.set_sketch_element(str(item))

    # -------------------- Simulating the server-side process --------------------
    ldp_freq = np.empty(len(domain))
    ldp_plot_data = np.array([])  # start empty; np.empty(len(domain)) would prepend uninitialised values

    # Generate both frequency data from the oracle and plot data to be graphed
    for i, item in enumerate(domain):
        ldp_freq[i] = priv_count_sketch.freq_oracle(str(item))  # Frequency oracle estimate
        ldp_plot_data = np.append(ldp_plot_data, [item] * int(round(ldp_freq[i])))  # Generate estimated dataset

    return ldp_plot_data
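# Illustrative usage sketch (hypothetical names, not part of the module): `run`
# returns a synthetic dataset in which each domain item is repeated according to
# its estimated frequency, so it can be histogrammed alongside the original data.
#
#   data = ["a"] * 500 + ["b"] * 300 + ["c"] * 50
#   domain = ["a", "b", "c"]
#   sim = TreeHistogramSimulation(l=250, w=2048, epsilon=3)          # hypothetical
#   ldp_plot_data = sim.run(data, domain)
#   # e.g. plot histograms of data and ldp_plot_data to compare true vs estimated counts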
def run_server_side_word_discovery(self, data, freq_oracle=None, freq_oracle_params=None):
    word_length = self.num_n_grams * self.gram_length
    word_frequency = pd.DataFrame(list(dict(Counter(data)).items()),
                                  columns=["word", "trueFrequency"])
    pres_rec_df = pd.DataFrame(['NumOfWords', 'Precision', 'Recall'], columns=['Measure'])

    # -------------------- Simulating the client-side process --------------------
    # PrivateCountSketch is the standard frequency oracle for TreeHistogram,
    # but other frequency oracles can be supplied for simulation results.
    if freq_oracle is None or freq_oracle_params is None:
        fragment_estimator = PrivateCountSketch(self.l, self.w, self.epsilon, use_median=True)
        word_estimator = PrivateCountSketch(self.l, self.w, self.epsilon)

        for index, row in word_frequency.iterrows():
            for i in range(row['trueFrequency']):
                current_word = row['word']
                word_estimator.set_sketch_element(current_word)

                # Pad (or truncate) the word to the fixed word length
                if len(current_word) <= word_length:
                    current_word += self.empty_char * (word_length - len(current_word))
                else:
                    current_word = current_word[:word_length]

                # Each client also reports a randomly chosen n-gram prefix of its word
                word_to_send = self.__choose_random_n_gram_prefix(current_word, self.gram_length)
                fragment_estimator.set_sketch_element(word_to_send)
    else:
        params = freq_oracle_params
        freq_oracles = {
            "priv_count_sketch": lambda dataset: PrivateCountSketch(**params, data=dataset),
            "priv_count_sketch_median": lambda dataset: PrivateCountSketch(**params, data=dataset, use_median=True),
            "hashtogram": lambda dataset: Hashtogram(dataset, **params),
            "hashtogram_median": lambda dataset: Hashtogram(dataset, **params, use_median=True),
            "cms": lambda dataset: ServerCMS(dataset, **params, is_raw_data=True),
            "hcms": lambda dataset: ServerCMS(dataset, **params, is_hadamard=True, is_raw_data=True),
        }

        word_estimator = freq_oracles.get(freq_oracle)(data)
        fragments = list(map(lambda word: self.__choose_random_n_gram_prefix(word, self.gram_length), data))
        fragment_estimator = freq_oracles.get(freq_oracle)(fragments)

    # ------------------ Server-side frequency estimation section ------------------
    scaling_factor = self.num_n_grams
    n_gram_set = self.__gen_english_n_grams(self.alphabet)
    list_n_grams = [s + self.empty_char * (word_length - len(s)) for s in n_gram_set]

    word_queue = deque(list_n_grams)
    noisy_frequencies = {}

    # Breadth-first search over the prefix tree: a prefix is extended only if its
    # estimated frequency reaches the threshold
    while len(word_queue) != 0:
        current_prefix = word_queue.popleft()
        current_prefix_after_stripping_empty = current_prefix.replace(self.empty_char, '')
        freq_for_current_prefix = int(fragment_estimator.freq_oracle(current_prefix) * scaling_factor)

        if freq_for_current_prefix < self.threshold:
            continue

        if len(current_prefix_after_stripping_empty) == word_length:
            noisy_frequencies[current_prefix_after_stripping_empty] = freq_for_current_prefix
            continue

        for gram in n_gram_set:
            to_add = current_prefix_after_stripping_empty + gram + self.empty_char * (
                word_length - (len(current_prefix_after_stripping_empty) + self.gram_length))
            word_queue.append(to_add)

    # Precision/recall of the discovered word set against the true heavy hitters
    TP = 0.0
    FN = 0.0
    priv_frequency = [0] * len(word_frequency)
    for index, row in word_frequency.iterrows():
        word = row['word']
        true_frequency = row['trueFrequency']
        if word not in noisy_frequencies:
            priv_frequency[index] = 0
            if true_frequency > self.threshold:
                FN += 1
        else:
            priv_frequency[index] = noisy_frequencies[word]
            if true_frequency > self.threshold:
                TP += 1

    FP = len(noisy_frequencies) - TP
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    word_frequency['privateFreq_run'] = priv_frequency
    pres_rec_df['Run'] = [len(word_frequency), precision, recall]
    # Re-approximate frequencies based on the minimum of the fragment-frequency
    # estimate and the whole-dataset frequency estimate
    heavy_hitters = {}
    for key, value in noisy_frequencies.items():
        freq = word_estimator.freq_oracle(key)
        if min(value, freq) >= math.sqrt(len(data)):
            heavy_hitters[key] = max(value, freq)

    return list(heavy_hitters.items())
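# Illustrative usage sketch (hypothetical names and parameters, not part of the
# module): words are padded/truncated to num_n_grams * gram_length characters, and
# the method returns (word, estimated frequency) pairs for the discovered heavy hitters.
#
#   words = ["apple"] * 400 + ["banana"] * 300 + ["kiwi"] * 10
#   sim = TreeHistogramSimulation(l=250, w=2048, epsilon=3,          # hypothetical
#                                 num_n_grams=3, gram_length=2)
#   heavy_hitters = sim.run_server_side_word_discovery(words)
#   # Only words whose minimum estimate is at least sqrt(len(words)) are returned.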