def test_check_thresholds_gauss(self):
    """Check the reader's tau against a closed-form Gaussian threshold.

    For every combination of epsilon, max contribution and delta, a
    ``PrivateReader`` is built with the threshold statistic mapped to the
    Gaussian mechanism, a grouped COUNT query is executed, and the
    resulting ``private_reader.tau`` is compared (within a tolerance) to a
    value computed directly from the privacy parameters.
    """
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]  # numerically 1e-4 and 1e-14
    sql = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"

    reader = PandasReader(df, schema)
    parser = QueryParser(schema)
    parsed_query = parser.query(sql)

    # Same iteration order as three nested loops: epsilon outermost,
    # delta innermost.
    combinations = [
        (eps, d, delta)
        for eps in epsilons
        for d in max_contribs
        for delta in deltas
    ]

    for eps, d, delta in combinations:
        privacy = Privacy(epsilon=eps, delta=delta)
        privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian

        # using slightly different formulations of same formula from
        # different papers; make sure private_reader round-trips
        root_contrib = math.sqrt(d)
        scale_log_term = math.sqrt(2 * math.log(1.25 / delta))
        gaus_scale = root_contrib * scale_log_term / eps
        rho_log_term = math.sqrt(
            2 * math.log(d / math.sqrt(2 * math.pi * delta))
        )
        gaus_rho = 1 + gaus_scale * rho_log_term

        # NOTE(review): copy.copy is a shallow copy, so the max_ids
        # assignment below may also be visible through the shared
        # `schema` object -- confirm whether that is intended.
        schema_c = copy.copy(schema)
        schema_c["PUMS.PUMS"].max_ids = d

        private_reader = PrivateReader(
            reader, metadata=schema_c, privacy=privacy
        )
        assert private_reader._options.max_contrib == d

        # The result rows are not inspected; the query is run before tau
        # is read (presumably execution is what populates tau -- confirm).
        private_reader._execute_ast(parsed_query)
        assert math.isclose(
            private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2
        )
def preprocess_df_from_query(schema, df, query_string):
    """Build a two-column frame: the table's key column and the grouping values.

    The query is parsed against ``schema`` to find the names of its
    grouping expressions and the table it reads from. The first key column
    of that table is copied from ``df``, and a ``"group_cols"`` column is
    added holding the values of the grouping columns for each row.

    Args:
        schema: Metadata used to parse the query and look up the table's
            key columns.
        df: Input dataframe containing the key column and the grouping
            columns.
        query_string: SQL text; only the first parsed query is used for
            the grouping expressions.

    Returns:
        A new ``pd.DataFrame`` with the key column and ``"group_cols"``.
    """
    parser = QueryParser(schema)
    single_query = parser.query(query_string)
    first_ast = parser.queries(query_string)[0]

    # Names of the columns referenced by the grouping expressions.
    grouping_names = []
    for grouping_expr in first_ast.agg.groupingExpressions:
        grouping_names.append(grouping_expr.expression.name)

    source_table = single_query.source.find_node(Table)
    table_meta = schema[source_table.name]
    id_column = table_meta.key_cols()[0].name

    result = pd.DataFrame()
    result[id_column] = df[id_column]

    # NOTE(review): tolist() yields one list per row and the outer tuple()
    # wraps the sequence of rows, not each row -- confirm the individual
    # cells are meant to be lists rather than tuples.
    row_values = df[grouping_names].values.tolist()
    result["group_cols"] = tuple(row_values)

    return result