def test_ok1(self): s = copy.copy(schema) s["PUMS.PUMS"]["income"].upper = None reader = PandasReader(df, s) private_reader = PrivateReader(reader, s, privacy=Privacy(epsilon=4.0)) rs = private_reader.execute_df( "SELECT income FROM PUMS.PUMS GROUP BY income")
def test_check_thresholds_gauss(self): # check tau for various privacy parameters epsilons = [0.1, 2.0] max_contribs = [1, 3] deltas = [10E-5, 10E-15] query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married" reader = PandasReader(df, schema) qp = QueryParser(schema) q = qp.query(query) for eps in epsilons: for d in max_contribs: for delta in deltas: privacy = Privacy(epsilon=eps, delta=delta) privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian # using slightly different formulations of same formula from different papers # make sure private_reader round-trips gaus_scale = math.sqrt(d) * math.sqrt( 2 * math.log(1.25 / delta)) / eps gaus_rho = 1 + gaus_scale * math.sqrt( 2 * math.log(d / math.sqrt(2 * math.pi * delta))) schema_c = copy.copy(schema) schema_c["PUMS.PUMS"].max_ids = d private_reader = PrivateReader(reader, metadata=schema_c, privacy=privacy) assert (private_reader._options.max_contrib == d) r = private_reader._execute_ast(q) assert (math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2))
def test_err1(self): s = copy.copy(schema) s["PUMS.PUMS"]["income"].upper = None reader = PandasReader(df, s) private_reader = PrivateReader(reader, s, privacy=Privacy(epsilon=4.0)) with pytest.raises(ValueError): rs = private_reader.execute_df("SELECT SUM(income) FROM PUMS.PUMS")
def setup_class(cls): meta = Metadata.from_file(meta_path) meta["PUMS.PUMS"].censor_dims = False df = pd.read_csv(csv_path) reader = PandasReader(df, meta) private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=0.1)) cls.reader = private_reader
def test_execute_with_dpsu(self): schema_dpsu = copy.copy(schema) schema_dpsu["PUMS.PUMS"].use_dpsu = True reader = PandasReader(df, schema_dpsu) private_reader = PrivateReader(reader, schema_dpsu, 1.0) assert (private_reader._options.use_dpsu == True) query = QueryParser(schema_dpsu).queries( "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0] assert (private_reader._get_reader(query) is not private_reader.reader)
def test_viz_child_nodes(self): query = "SELECT AVG(age) AS my_sum FROM PUMS.PUMS GROUP BY age" reader = PandasReader(df, schema) private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0)) inner, outer = private_reader._rewrite(query) aggfuncs = outer.find_nodes(AggFunction) for aggfunc in aggfuncs: graph = aggfunc.visualize(n_trunc=30) assert (isinstance(graph, Digraph))
def test_pandas(self): engine = "pandas" meta = Metadata.from_file(meta_path) df = pd.read_csv(csv_path) reader = PandasReader(df, meta) assert(isinstance(reader, SqlReader)) assert(isinstance(reader, PandasReader)) assert(isinstance(reader.compare, NameCompare)) assert(isinstance(reader.compare, PandasNameCompare)) assert(isinstance(reader.serializer, Serializer)) assert(isinstance(reader.serializer, PandasSerializer))
def test_execute_without_dpsu(self): schema_no_dpsu = copy.copy(schema) schema_no_dpsu["PUMS.PUMS"].use_dpsu = False reader = PandasReader(df, schema_no_dpsu) private_reader = PrivateReader(reader, schema_no_dpsu, privacy=Privacy(epsilon=1.0)) assert (private_reader._options.use_dpsu == False) query = QueryParser(schema_no_dpsu).queries( "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0] assert (private_reader._get_reader(query) is private_reader.reader)
def setup_class(self): meta = Metadata.from_file(meta_path) meta["PUMS.PUMS"].censor_dims = False meta["PUMS.PUMS"]["sex"].type = "int" meta["PUMS.PUMS"]["educ"].type = "int" meta["PUMS.PUMS"]["married"].type = "bool" df = pd.read_csv(csv_path) reader = PandasReader(df, meta) private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=10e-3)) self.reader = private_reader
def test_viz_query_rewritten(self): query = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age" parsed_query = QueryParser(schema).query(query) reader = PandasReader(df, schema) private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0)) inner, outer = private_reader._rewrite_ast(parsed_query) graph = outer.visualize(n_trunc=30) assert (isinstance(graph, Digraph)) #graph.render('ast_digraph', view=True, cleanup=True) graph = inner.visualize(n_trunc=30) assert (isinstance(graph, Digraph))
def test_with_censor_dims(self): meta = Metadata.from_file(meta_path) df = pd.read_csv(csv_path) reader = PandasReader(df, meta) private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=3.0)) query = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS" q = QueryParser(meta).query(query) inner, outer = private_reader._rewrite_ast(q) ne = outer.select.namedExpressions assert (ne[0].expression.expression.name != 'keycount') assert (ne[1].expression.expression.name == 'keycount')
def test_reuse_expression(self): meta = Metadata.from_file(meta_path) df = pd.read_csv(csv_path) reader = PandasReader(df, meta) private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=3.0)) query = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS' q = QueryParser(meta).query(query) inner, outer = private_reader._rewrite(query) names = unique( [f.name for f in outer.select.namedExpressions.find_nodes(Column)]) assert (len(names) == 2) assert ('count_age' in names) assert ('sum_age' in names)
def test_empty_result_count_typed_notau_prepost(self): schema_all = copy.deepcopy(schema) schema_all['PUMS.PUMS'].censor_dims = False reader = PandasReader(df, schema) query = QueryParser(schema).queries( "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0] private_reader = PrivateReader(reader, schema_all, privacy=Privacy(epsilon=1.0)) private_reader._execute_ast(query, True) for i in range(3): print(private_reader._options) trs = private_reader._execute_ast(query, True) print("empty query") print(trs) assert (len(trs) == 2)
def test_dpsu_vs_korolova(self): query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc" reader = PandasReader(df, schema) private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=3.0)) private_reader.options.max_contrib = 10 result = private_reader.execute_df(query) private_reader_korolova = PrivateReader(reader, schema, privacy=Privacy(epsilon=3.0)) private_reader_korolova.options.dpsu = False private_reader_korolova.options.max_contrib = 10 korolova_result = private_reader_korolova.execute_df(query) assert len(result['n']) > len(korolova_result['n']) assert len(final_df) < len(df)
def test_group_by_noisy_typed_order_desc(self): reader = PandasReader(df, schema) private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=4.0)) rs = private_reader.execute_df("SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC") assert(rs['c'][0] > rs['c'][1])
def test_count_exact(self): reader = PandasReader(df, schema) rs = reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS") assert(rs[1][0] == 1000)
def test_empty_result(self): reader = PandasReader(df, schema) rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100") assert(len(rs) == 1)
def test_empty_result_typed(self): reader = PandasReader(df, schema) rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100") trs = reader._to_df(rs) assert(len(trs) == 0)
def test_group_by_exact_order_expr_desc(self): reader = PandasReader(df, schema) rs = reader.execute("SELECT COUNT(*) * 5 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC") assert(rs[1][0] == 549 * 5) assert(rs[2][0] == 451 * 5)
def test_sum_noisy(self): reader = PandasReader(df, schema) query = QueryParser(schema).queries("SELECT SUM(age) as age_total FROM PUMS.PUMS")[0] trs = reader._execute_ast_df(query) assert(trs['age_total'][0] > 1000)
def test_sum_noisy_postprocess(self): reader = PandasReader(df, schema) private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0)) trs = private_reader.execute_df("SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS") assert(trs['age_total'][0] > 1000 ** 2)
def test_sum_no_rows_exact_typed(self): reader = PandasReader(df, schema) query = QueryParser(schema).queries("SELECT SUM(age) as c FROM PUMS.PUMS WHERE age > 100")[0] trs = reader._execute_ast_df(query) assert(trs['c'][0] == None)
def test_load_from_private_reader_path(self): reader = PandasReader(pums, meta_path) priv = PrivateReader(reader, meta_path, privacy=Privacy(epsilon=1.0)) res = priv.execute("SELECT COUNT(age) FROM PUMS.PUMS GROUP BY sex") assert (len(res) == 3)