def dp_mechanism_count(self, df, colname):
    """Return ``self.repeat_count`` noisy releases of a column count.

    The exact value is ``df[colname].count()``.  Noise comes from a Laplace
    mechanism built with ``self.epsilon``, unless ``self.mechanism`` is the
    string "Gaussian", in which case a Gaussian mechanism built with the
    same epsilon is used instead.

    Args:
        df: Table indexed by column name (presumably a pandas DataFrame --
            TODO confirm against callers).
        colname: Name of the column to count.

    Returns:
        A numpy array with one released value per repetition.
    """
    true_count = df[colname].count()

    # Laplace is the default and is always constructed first; it is only
    # replaced when the Gaussian mechanism was explicitly requested.
    noise_source = Laplace(self.epsilon)
    if self.mechanism == "Gaussian":
        noise_source = Gaussian(self.epsilon)

    samples = []
    for _ in range(self.repeat_count):
        # release() takes a list of values; keep the single released entry.
        samples.append(noise_source.release([true_count])[0])
    return np.array(samples)
def test_bounds1b_lap(self):
    """Analytic and bootstrap bounds are both strictly ordered for a small epsilon."""
    # epsilon of 0.05, very wide bounds
    mech = Laplace(0.05)

    analytic_lo, analytic_hi = mech.bounds(0.95, False)   # analytic bounds
    bootstrap_lo, bootstrap_hi = mech.bounds(0.95, True)  # bootstrap bounds

    assert analytic_lo < analytic_hi
    assert bootstrap_lo < bootstrap_hi
def dp_mechanism_sum(self, df, colname):
    """Return ``self.repeat_count`` noisy releases of a column sum.

    The exact value is ``df[colname].sum()``.  The Laplace mechanism is built
    with ``self.epsilon`` and a sensitivity equal to the absolute difference
    between the column's maximum and minimum.  When ``self.mechanism`` is the
    string "Gaussian", a Gaussian mechanism built with ``self.epsilon`` only
    replaces it.

    Args:
        df: Table indexed by column name (presumably a pandas DataFrame --
            TODO confirm against callers).
        colname: Name of the column to sum.

    Returns:
        A numpy array with one released value per repetition.
    """
    true_sum = df[colname].sum()

    # Sensitivity is the observed range of the column.
    spread = max(df[colname]) - min(df[colname])
    sensitivity = float(abs(spread))

    noise_source = Laplace(self.epsilon, sensitivity=sensitivity)
    if self.mechanism == "Gaussian":
        # NOTE(review): the Gaussian mechanism does not receive the computed
        # sensitivity here, unlike the Laplace one -- confirm this is intended.
        noise_source = Gaussian(self.epsilon)

    samples = []
    for _ in range(self.repeat_count):
        # release() takes a list of values; keep the single released entry.
        samples.append(noise_source.release([true_sum])[0])
    return np.array(samples)
def test_bounds1c_lap(self):
    """Analytic and bootstrap bounds keep their order at a very small width.

    A width of 0.1 is used so that a lower/upper swap would be caught; equal
    bounds are accepted.
    """
    # epsilon of 1.0
    mech = Laplace(1.0)

    analytic_lo, analytic_hi = mech.bounds(0.1, False)   # analytic bounds
    bootstrap_lo, bootstrap_hi = mech.bounds(0.1, True)  # bootstrap bounds

    assert analytic_lo <= analytic_hi
    assert bootstrap_lo <= bootstrap_hi
def _apply_noise(self, subquery, query, syms, types, sens, srs, pct=0.95):
    """Add Laplace noise to subquery results and evaluate the outer query.

    Steps, in order:
      1. If the subquery exposes a key-count symbol not named "keycount",
         copy that column into ``srs["keycount"]``.
      2. Keep only rows whose "keycount" exceeds ``self.tau ** 2``.
      3. For every numeric symbol of the subquery, record the mechanism
         bounds for ``pct`` and replace the column with a noisy release.
         Columns with sensitivity 1 are additionally clamped at 0 and the
         rowset is filtered to rows where that column exceeds ``self.tau``.
      4. Evaluate each outer select expression against the noisy columns.
      5. Apply the ORDER BY clause, if any.

    Args:
        subquery: Inner query; supplies keycount and numeric symbols.
        query: Outer query; supplies symbols, select expressions and order.
        syms, types, sens: Not read; the corresponding values are recomputed
            from ``query.all_symbols()`` inside this method.
        srs: Rowset holding the exact subquery results.
        pct: Width passed to ``mechanism.bounds``.

    Returns:
        Tuple ``(newrs.rows(), srs.bounds)``.

    Raises:
        ValueError: If an ORDER BY item is not a plain column.
    """
    # if user has selected keycount for outer query, use that instead
    user_keycounts = [
        entry for entry in subquery.keycount_symbols() if entry[0] != "keycount"
    ]
    if user_keycounts:
        srs["keycount"] = srs[user_keycounts[0][0].lower()]

    srs = srs.filter("keycount", ">", self.tau ** 2)

    # add noise to all columns that need noise
    for raw_name, sym in subquery.numeric_symbols():
        col = raw_name.lower()
        col_sens = sym.sensitivity()
        mechanism = Laplace(self.epsilon, col_sens, self.tau)
        srs.bounds[col] = mechanism.bounds(pct)

        noisy = mechanism.release(srs[col])
        if col_sens != 1:
            srs[col] = noisy
            continue

        # Sensitivity 1 is treated as a count: clamp negatives to zero, then
        # drop rows whose noisy value does not exceed the threshold.
        noisy[noisy < 0] = 0
        srs[col] = noisy
        srs = srs.filter(col, ">", self.tau)

    # Build the output rowset from the outer query's symbols.
    out_syms = query.all_symbols()
    out_types = [entry[1].type() for entry in out_syms]
    out_sens = [entry[1].sensitivity() for entry in out_syms]
    out_names = [entry[0] for entry in out_syms]
    newrs = TypedRowset([out_names], out_types, out_sens)

    # Outer expressions look columns up by lower-cased name.
    source_cols = srs.m_cols
    bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

    evaluated = [
        named.expression.evaluate(bindings)
        for named in query.select.namedExpressions
    ]
    for position, values in enumerate(evaluated):
        newrs[newrs.idxcol[position]] = values

    # Now sort, if it has order by clause
    if query.order is not None:
        sort_keys = []
        for item in query.order.sortItems:
            if type(item.expression) is not ast.Column:
                raise ValueError(
                    "We only know how to sort by column names right now")
            descending = item.order is not None and item.order.lower() == "desc"
            # A leading "-" marks a descending sort key.
            prefix = "-" if descending else ""
            sort_keys.append(prefix + item.expression.name.lower())
        newrs.sort(sort_keys)

    return (newrs.rows(), srs.bounds)
def release(self, dataset):
    """Release a Laplace-noised row count for ``dataset``.

    Args:
        dataset: Object exposing ``shape``; ``shape[0]`` is used as the
            number of observations.

    Returns:
        A ``CountResult`` built from the noisy count, ``self._column``, the
        accuracy computed from ``self._epsilon``, the epsilon itself, and a
        two-element interval ``[num_obs - bound, num_obs + bound]`` where
        ``bound = accuracy * num_obs``.
    """
    # get the row count
    num_obs = dataset.shape[0]

    # obfuscate the count
    # NOTE(review): the original also declared an unused ``sens = 2`` here.
    # ``tau`` is passed as the second positional argument, while other call
    # sites in this file pass a sensitivity in that position -- confirm that
    # ``Laplace(epsilon, sens, tau)`` was not the intended call.
    tau = 5
    count_release = Laplace(self._epsilon, tau).count([num_obs])[0]

    # calculate accuracy from epsilon
    accuracy = self._compute_accuracy(self._epsilon)
    accuracy_bound = accuracy * num_obs

    # NOTE(review): the interval is centred on the exact count, not on the
    # noisy release, so its midpoint equals ``num_obs`` -- confirm this is
    # acceptable for the privacy guarantee.
    mci = [num_obs - accuracy_bound, num_obs + accuracy_bound]

    return CountResult(count_release, self._column, accuracy, self._epsilon, mci)
def test_bounds2_lap(self):
    """Bounds at width 0.97 strictly enclose bounds at width 0.95.

    Checked for both the analytic (second argument False) and the bootstrap
    (second argument True) variants.
    """
    # epsilon of 4.0, tighter bounds
    mech = Laplace(4.0)

    inner_lo, inner_hi = mech.bounds(0.95, False)
    inner_lo_boot, inner_hi_boot = mech.bounds(0.95, True)
    outer_lo, outer_hi = mech.bounds(0.97, False)
    outer_lo_boot, outer_hi_boot = mech.bounds(0.97, True)

    # check that outer bounds enclose inner bounds
    assert outer_lo < inner_lo
    assert outer_hi > inner_hi
    assert outer_lo_boot < inner_lo_boot
    assert outer_hi_boot > inner_hi_boot
def test_simple_lap(self):
    """Noisy counts sum to the same value as the inputs after coarse rounding.

    Both sums are divided by 10E+6 and rounded before comparison, so only a
    very large amount of noise would make the test fail.
    """
    # epsilon of 0.1
    mech = Laplace(0.1)
    exact_values = range(10000)
    noisy_values = mech.count(exact_values)

    coarse = 10E+6
    exact_bucket = round(np.sum(exact_values) / coarse)
    noisy_bucket = round(np.sum(noisy_values) / coarse)
    assert exact_bucket == noisy_bucket
def _postprocess(self, subquery, query, syms, types, sens, srs, pct=0.95):
    """Add noise to subquery results, threshold, evaluate and sort.

    Steps, in order:
      1. If the subquery exposes a key-count symbol not named "keycount",
         copy that column into ``srs["keycount"]``.
      2. For every numeric symbol of the subquery: replace ``None`` entries
         with 0.0, record the mechanism bounds for ``pct``, and replace the
         column with a noisy Laplace release.  Columns with sensitivity 1
         are additionally clamped at 0.
      3. If the subquery has an aggregation, keep only rows whose "keycount"
         exceeds ``self.tau ** 2``.
      4. Evaluate each outer select expression against the noisy columns.
      5. Apply the ORDER BY clause, if any.

    NOTE(review): the original outline comment also listed "set SUM = NULL if
    count = 0" and "set AVG = NULL if count = 0"; neither is visible in this
    body -- presumably handled by the expression evaluation or not yet
    implemented.

    Args:
        subquery: Inner query; supplies keycount and numeric symbols and
            the ``agg`` attribute.
        query: Outer query; supplies symbols, select expressions and order.
        syms, types, sens: Not read; the corresponding values are recomputed
            from ``query.all_symbols()`` inside this method.
        srs: Rowset holding the exact subquery results.
        pct: Width passed to ``mechanism.bounds``.

    Returns:
        The rowset built for the outer query.

    Raises:
        ValueError: If an ORDER BY item is not a plain column.
    """
    # if user has selected keycount for outer query, use that instead
    user_keycounts = [
        entry for entry in subquery.keycount_symbols() if entry[0] != "keycount"
    ]
    if user_keycounts:
        srs["keycount"] = srs[user_keycounts[0][0].lower()]

    # add noise to all columns that need noise
    for raw_name, sym in subquery.numeric_symbols():
        col = raw_name.lower()
        col_sens = sym.sensitivity()

        # treat null as 0 before adding noise
        srs[col] = np.array(
            [0.0 if cell is None else cell for cell in srs[col]])

        mechanism = Laplace(self.epsilon, col_sens, self.tau)
        srs.bounds[col] = mechanism.bounds(pct)
        srs[col] = mechanism.release(srs[col])

        # BUGBUG: Things other than counts can have sensitivity of 1
        if col_sens == 1:
            clamped = srs[col]
            clamped[clamped < 0] = 0
            srs[col] = clamped

    if subquery.agg is not None:
        srs = srs.filter("keycount", ">", self.tau ** 2)

    # Build the output rowset from the outer query's symbols.
    out_syms = query.all_symbols()
    out_types = [entry[1].type() for entry in out_syms]
    out_sens = [entry[1].sensitivity() for entry in out_syms]
    out_names = [entry[0] for entry in out_syms]
    newrs = TypedRowset([out_names], out_types, out_sens)

    # Outer expressions look columns up by lower-cased name.
    source_cols = srs.m_cols
    bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

    evaluated = [
        named.expression.evaluate(bindings)
        for named in query.select.namedExpressions
    ]
    for position, values in enumerate(evaluated):
        newrs[newrs.idxcol[position]] = values

    # Now sort, if it has order by clause
    if query.order is not None:
        sort_keys = []
        for item in query.order.sortItems:
            if type(item.expression) is not ast.Column:
                raise ValueError(
                    "We only know how to sort by column names right now")
            descending = item.order is not None and item.order.lower() == "desc"
            # A leading "-" marks a descending sort key.
            prefix = "-" if descending else ""
            sort_keys.append(prefix + item.expression.name.lower())
        newrs.sort(sort_keys)

    return newrs