def test_ok1(self):
     s = copy.copy(schema)
     s["PUMS.PUMS"]["income"].upper = None
     reader = PandasReader(df, s)
     private_reader = PrivateReader(reader, s, privacy=Privacy(epsilon=4.0))
     rs = private_reader.execute_df(
         "SELECT income FROM PUMS.PUMS GROUP BY income")
Example #2
0
 def test_check_thresholds_gauss(self):
     # check tau for various privacy parameters
     epsilons = [0.1, 2.0]
     max_contribs = [1, 3]
     deltas = [10E-5, 10E-15]
     query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(df, schema)
     qp = QueryParser(schema)
     q = qp.query(query)
     for eps in epsilons:
         for d in max_contribs:
             for delta in deltas:
                 privacy = Privacy(epsilon=eps, delta=delta)
                 privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                 # using slightly different formulations of same formula from different papers
                 # make sure private_reader round-trips
                 gaus_scale = math.sqrt(d) * math.sqrt(
                     2 * math.log(1.25 / delta)) / eps
                 gaus_rho = 1 + gaus_scale * math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 schema_c = copy.copy(schema)
                 schema_c["PUMS.PUMS"].max_ids = d
                 private_reader = PrivateReader(reader,
                                                metadata=schema_c,
                                                privacy=privacy)
                 assert (private_reader._options.max_contrib == d)
                 r = private_reader._execute_ast(q)
                 assert (math.isclose(private_reader.tau,
                                      gaus_rho,
                                      rel_tol=0.03,
                                      abs_tol=2))
 def test_err1(self):
     s = copy.copy(schema)
     s["PUMS.PUMS"]["income"].upper = None
     reader = PandasReader(df, s)
     private_reader = PrivateReader(reader, s, privacy=Privacy(epsilon=4.0))
     with pytest.raises(ValueError):
         rs = private_reader.execute_df("SELECT SUM(income) FROM PUMS.PUMS")
Example #4
0
 def setup_class(cls):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=0.1))
     cls.reader = private_reader
Example #5
0
 def test_execute_with_dpsu(self):
     schema_dpsu = copy.copy(schema)
     schema_dpsu["PUMS.PUMS"].use_dpsu = True
     reader = PandasReader(df, schema_dpsu)
     private_reader = PrivateReader(reader, schema_dpsu, 1.0)
     assert (private_reader._options.use_dpsu == True)
     query = QueryParser(schema_dpsu).queries(
         "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
     assert (private_reader._get_reader(query) is not private_reader.reader)
Example #6
0
 def test_viz_child_nodes(self):
     query = "SELECT AVG(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader,
                                    schema,
                                    privacy=Privacy(epsilon=1.0))
     inner, outer = private_reader._rewrite(query)
     aggfuncs = outer.find_nodes(AggFunction)
     for aggfunc in aggfuncs:
         graph = aggfunc.visualize(n_trunc=30)
         assert (isinstance(graph, Digraph))
 def test_pandas(self):
     engine = "pandas"
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     assert(isinstance(reader, SqlReader))
     assert(isinstance(reader, PandasReader))
     assert(isinstance(reader.compare, NameCompare))
     assert(isinstance(reader.compare, PandasNameCompare))
     assert(isinstance(reader.serializer, Serializer))
     assert(isinstance(reader.serializer, PandasSerializer))
Example #8
0
 def test_execute_without_dpsu(self):
     schema_no_dpsu = copy.copy(schema)
     schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
     reader = PandasReader(df, schema_no_dpsu)
     private_reader = PrivateReader(reader,
                                    schema_no_dpsu,
                                    privacy=Privacy(epsilon=1.0))
     assert (private_reader._options.use_dpsu == False)
     query = QueryParser(schema_no_dpsu).queries(
         "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
     assert (private_reader._get_reader(query) is private_reader.reader)
Example #9
0
 def setup_class(self):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     meta["PUMS.PUMS"]["sex"].type = "int"
     meta["PUMS.PUMS"]["educ"].type = "int"
     meta["PUMS.PUMS"]["married"].type = "bool"
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=10.0,
                                                    delta=10e-3))
     self.reader = private_reader
Example #10
0
 def test_viz_query_rewritten(self):
     query = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     parsed_query = QueryParser(schema).query(query)
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader,
                                    schema,
                                    privacy=Privacy(epsilon=1.0))
     inner, outer = private_reader._rewrite_ast(parsed_query)
     graph = outer.visualize(n_trunc=30)
     assert (isinstance(graph, Digraph))
     #graph.render('ast_digraph', view=True, cleanup=True)
     graph = inner.visualize(n_trunc=30)
     assert (isinstance(graph, Digraph))
Example #11
0
 def test_with_censor_dims(self):
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=3.0))
     query = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS"
     q = QueryParser(meta).query(query)
     inner, outer = private_reader._rewrite_ast(q)
     ne = outer.select.namedExpressions
     assert (ne[0].expression.expression.name != 'keycount')
     assert (ne[1].expression.expression.name == 'keycount')
Example #12
0
 def test_reuse_expression(self):
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=3.0))
     query = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS'
     q = QueryParser(meta).query(query)
     inner, outer = private_reader._rewrite(query)
     names = unique(
         [f.name for f in outer.select.namedExpressions.find_nodes(Column)])
     assert (len(names) == 2)
     assert ('count_age' in names)
     assert ('sum_age' in names)
Example #13
0
 def test_empty_result_count_typed_notau_prepost(self):
     schema_all = copy.deepcopy(schema)
     schema_all['PUMS.PUMS'].censor_dims = False
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries(
         "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
     private_reader = PrivateReader(reader,
                                    schema_all,
                                    privacy=Privacy(epsilon=1.0))
     private_reader._execute_ast(query, True)
     for i in range(3):
         print(private_reader._options)
         trs = private_reader._execute_ast(query, True)
         print("empty query")
         print(trs)
         assert (len(trs) == 2)
    def test_dpsu_vs_korolova(self):
        query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc"
        reader = PandasReader(df, schema)
        private_reader = PrivateReader(reader,
                                       schema,
                                       privacy=Privacy(epsilon=3.0))
        private_reader.options.max_contrib = 10
        result = private_reader.execute_df(query)

        private_reader_korolova = PrivateReader(reader,
                                                schema,
                                                privacy=Privacy(epsilon=3.0))
        private_reader_korolova.options.dpsu = False
        private_reader_korolova.options.max_contrib = 10
        korolova_result = private_reader_korolova.execute_df(query)

        assert len(result['n']) > len(korolova_result['n'])
        assert len(final_df) < len(df)
Example #15
0
 def test_group_by_noisy_typed_order_desc(self):
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=4.0))
     rs = private_reader.execute_df("SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC")
     assert(rs['c'][0] > rs['c'][1])
Example #16
0
 def test_count_exact(self):
     reader = PandasReader(df, schema)
     rs = reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS")
     assert(rs[1][0] == 1000)
Example #17
0
 def test_empty_result(self):
     reader = PandasReader(df, schema)
     rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
     assert(len(rs) == 1)
Example #18
0
 def test_empty_result_typed(self):
     reader = PandasReader(df, schema)
     rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
     trs = reader._to_df(rs)
     assert(len(trs) == 0)
Example #19
0
 def test_group_by_exact_order_expr_desc(self):
     reader = PandasReader(df, schema)
     rs = reader.execute("SELECT COUNT(*) * 5 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC")
     assert(rs[1][0] == 549 * 5)
     assert(rs[2][0] == 451 * 5)
Example #20
0
 def test_sum_noisy(self):
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries("SELECT SUM(age) as age_total FROM PUMS.PUMS")[0]
     trs = reader._execute_ast_df(query)
     assert(trs['age_total'][0] > 1000)
Example #21
0
 def test_sum_noisy_postprocess(self):
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0))
     trs = private_reader.execute_df("SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS")
     assert(trs['age_total'][0] > 1000 ** 2)
Example #22
0
 def test_sum_no_rows_exact_typed(self):
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries("SELECT SUM(age) as c FROM PUMS.PUMS WHERE age > 100")[0]
     trs = reader._execute_ast_df(query)
     assert(trs['c'][0] == None)
Example #23
0
 def test_load_from_private_reader_path(self):
     reader = PandasReader(pums, meta_path)
     priv = PrivateReader(reader, meta_path, privacy=Privacy(epsilon=1.0))
     res = priv.execute("SELECT COUNT(age) FROM PUMS.PUMS GROUP BY sex")
     assert (len(res) == 3)