def test_check_thresholds_gauss(self):
    """Check that PrivateReader.tau tracks the analytic Gaussian threshold.

    Sweeps epsilon, max contributions per user, and delta; recomputes the
    threshold with a slightly different formulation of the same formula and
    requires the reader's tau to be close to it.
    """
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(schema, df)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                # using slightly different formulations of same formula from different papers
                # make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                private_reader = PrivateReader(schema, reader, eps, delta)
                q.max_ids = d  # hijack the AST
                # fix: drop the unused binding `r`; execute only for the
                # side effect of computing private_reader.tau
                private_reader.execute_ast(q)
                # wide tolerances: the two formulations differ slightly
                assert math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2)
def test_execute_without_dpsu(self):
    """With use_dpsu disabled, the private reader must delegate to its plain reader."""
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0)
    parsed = QueryParser(schema).queries("SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
    private_reader.options.use_dpsu = False
    # _get_reader should hand back the exact underlying reader object
    assert private_reader._get_reader(parsed) is private_reader.reader
def test_empty_result_count_typed_notau_prepost(self):
    """A COUNT over an impossible predicate still yields one typed result row, repeatedly."""
    reader = PandasReader(schema, df)
    query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
    private_reader = PrivateReader(schema, reader, 1.0)
    # warm-up execution, then check stability across repeated runs
    private_reader._execute_ast(query, True)
    for _ in range(3):
        result = private_reader._execute_ast(query, True)
        assert len(result) == 1
def preprocess_df_from_query(schema, df, query_string):
    """
    Returns a dataframe with user_id | tuple based on query grouping keys.
    """
    parser = QueryParser(schema)
    # NOTE(review): the string is parsed twice (query() and queries()) —
    # presumably both yield equivalent ASTs; verify before consolidating.
    parsed = parser.query(query_string)
    query_ast = parser.queries(query_string)[0]
    # names of the GROUP BY columns in the parsed query
    group_cols = [
        ge.expression.name for ge in query_ast.agg.groupingExpressions
    ]
    table_name = parsed.source.find_node(Table).name
    # first declared key column of the source table identifies the user
    key_col = schema[table_name].key_cols()[0].name
    result = pd.DataFrame()
    result[key_col] = df[key_col]
    result["group_cols"] = tuple(df[group_cols].values.tolist())
    return result
def runRewrite(self):
    """Rewrite every query in the batch and check that symbol types survive the rewrite."""
    parsed_batch = QueryParser(metadata).queries(self.queryBatch)
    for original in parsed_batch:
        print(original)
        rewritten = Rewriter(metadata).query(original)
        assert original.has_symbols()
        assert rewritten.has_symbols()
        # each (name, symbol) pair must keep the same symbol type
        for old_sym, new_sym in zip(original.m_symbols, rewritten.m_symbols):
            assert old_sym[1].type() == new_sym[1].type()
def test_count_no_rows_exact_typed(self):
    """Exact (non-private) COUNT over an impossible predicate is exactly zero."""
    reader = PandasReader(schema, df)
    parsed = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
    result = reader.execute_ast_typed(parsed)
    assert result['c'][0] == 0
def test_sum_noisy(self):
    """Exact SUM of ages over the full table should be comfortably above 1000."""
    reader = PandasReader(schema, df)
    parsed = QueryParser(schema).queries("SELECT SUM(age) as age_total FROM PUMS.PUMS")[0]
    result = reader.execute_ast_typed(parsed)
    assert result['age_total'][0] > 1000
def runValidate(self):
    """Each query in self.queries must fail with ValueError during parse or validation."""
    for query_string in self.queries:
        print(query_string)
        # either the parser or the validator may raise; both paths are inside the context
        with pytest.raises(ValueError):
            parsed = QueryParser(metadata).query(query_string)
            self.validateSingle(parsed)
def runValidate(self):
    """Parse each query and run it through the full Validate pass against the metadata."""
    for query_string in self.queries:
        print(query_string)
        parsed = QueryParser(metadata).query(query_string)
        Validate().validateQuery(parsed, metadata)
def runValidate(self):
    """Parse each query and validate it via the instance's single-query validator."""
    for query_string in self.queries:
        print(query_string)
        parsed = QueryParser(metadata).query(query_string)
        self.validateSingle(parsed)
def qp(query_string):
    """Convenience helper: parse a SQL string with a default (schema-less) QueryParser."""
    parser = QueryParser()
    return parser.query(query_string)