Example #1
    def test_interface_count(self):
        test_logger = logging.getLogger()
        test_logger.setLevel(logging.DEBUG)
        # Initialize params and algorithm to benchmark
        pa = DPSingletonQuery()
        pp = PrivacyParams(epsilon=1.0)
        ev = EvaluatorParams(repeat_count=100)
        dd = DatasetParams(dataset_size=500)
        query = "SELECT COUNT(UserId) AS UserCount FROM dataset.dataset"

        # Preparing neighboring datasets
        df, metadata = self.create_simulated_dataset(dd.dataset_size, "dataset")
        d1_dataset, d2_dataset, d1_metadata, d2_metadata = self.generate_neighbors(df, metadata)
        d1 = PandasReader(d1_dataset, d1_metadata)
        d2 = PandasReader(d2_dataset, d2_metadata)

        # Call evaluate
        eval = DPEvaluator()
        key_metrics = eval.evaluate([d1_metadata, d1], [d2_metadata, d2], pa, query, pp, ev)
        # After evaluation, dp_res should be True and the distance metrics should be non-zero
        for key, metrics in key_metrics.items():
            assert(metrics.dp_res == True)
            test_logger.debug("Wasserstein Distance:" + str(metrics.wasserstein_distance))
            test_logger.debug("Jensen Shannon Divergence:" + str(metrics.jensen_shannon_divergence))
            test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
            test_logger.debug("MSE:" + str(metrics.mse))
            test_logger.debug("Standard Deviation:" + str(metrics.std))
            test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
            assert(metrics.wasserstein_distance > 0.0)
            assert(metrics.jensen_shannon_divergence > 0.0)
            assert(metrics.kl_divergence != 0.0)
            assert(metrics.mse > 0.0)
            assert(metrics.std != 0.0)
            assert(metrics.msd != 0.0)
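For reference (restating a standard definition, not part of the snippet): the Jensen-Shannon divergence asserted to be positive above is the symmetrized, smoothed KL divergence between the output histograms P and Q obtained on the two neighboring datasets,

$$\mathrm{JSD}(P \parallel Q) = \tfrac{1}{2}\,\mathrm{KL}(P \parallel M) + \tfrac{1}{2}\,\mathrm{KL}(Q \parallel M), \qquad M = \tfrac{1}{2}(P + Q),$$

which is always finite and non-negative, and strictly positive whenever the repeated-run distributions differ.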
Example #2
 def test_group_by_exact_order_expr_desc(self):
     reader = PandasReader(df, schema)
     rs = reader.execute(
         "SELECT COUNT(*) * 5 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
     )
     assert (rs[1][0] == 549 * 5)
     assert (rs[2][0] == 451 * 5)
Example #3
 def learn(self, querypool, export_as_csv=False):
     output = []
     for i in range(len(querypool)):
         df, metadata = create_simulated_dataset(self.dd.dataset_size, "dataset")
         d1_dataset, d2_dataset, d1_metadata, d2_metadata = generate_neighbors(df, metadata)
         d1 = PandasReader(d1_dataset, d1_metadata)
         d2 = PandasReader(d2_dataset, d2_metadata)
         eval = DPEvaluator()
         pa = DPSingletonQuery()
         key_metrics = eval.evaluate([d1_metadata, d1], [d2_metadata, d2], pa, querypool[i], self.pp, self.ev)
         if key_metrics['__key__'].dp_res is None:
             dp_res = key_metrics['__key__'].dp_res
             error =  key_metrics['__key__'].error
             output.append({"query":querypool[i], "dpresult": dp_res, "jensen_shannon_divergence":None, "error": error})
         else:
             res_list = []
             for key, metrics in key_metrics.items():
                 dp_res = metrics.dp_res
                 js_res = metrics.jensen_shannon_divergence
                 res_list.append([dp_res, js_res])
             dp_res = np.all(np.array([res[0] for res in res_list]))
             js_res = (np.array([res[1] for res in res_list])).max()
             output.append({"query":querypool[i], "dpresult": dp_res,"jensen_shannon_divergence": js_res, "error":None})
     if export_as_csv:
         write_to_csv('Bandit.csv', output, flag='bandit')
     else:
         return output
Example #4
    def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
        """
        Run the query using the private reader and the input query,
        and return the query response.
        """
        reader = PandasReader(df, metadata)
        actual = 0.0
        # VAR is not supported by PandasReader, so the exact value does not need to be fetched for every aggregation
        if (get_exact):
            actual = reader.execute_typed(query).rows()[1:][0][0]
        private_reader = PrivateReader(reader, metadata, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        srs_orig = private_reader.reader.execute_ast_typed(query_ast)

        noisy_values = []
        low_bounds = []
        high_bounds = []
        for idx in range(self.repeat_count):
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            res = private_reader._execute_ast(query_ast, True)
            # Disabled because confidence interval not available in report
            #interval = res.report[res.colnames[0]].intervals[confidence]
            #low_bounds.append(interval[0].low)
            #high_bounds.append(interval[0].high)
            noisy_values.append(res.rows()[1:][0][0])
        return np.array(noisy_values), actual, low_bounds, high_bounds
Example #5
 def test_legacy_params_pandas_reader(self):
     # params swapped
     with pytest.warns(Warning):
         reader = PandasReader(schema, df)
     assert("metadata.collection.CollectionMetadata" in str(type(reader.metadata)))
     # doubled up params of wrong type should fail
     with pytest.raises(Exception):
         reader = PandasReader(schema, schema)
     with pytest.raises(Exception):
         reader = PandasReader(df, df)
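The back-compat behavior exercised above means the current constructor order is PandasReader(df, metadata), while the legacy order PandasReader(metadata, df) still works but emits a Warning. A minimal sketch (the PUMS file names are assumed from the other examples on this page):

import pandas as pd
from opendp.smartnoise.sql import PandasReader
from opendp.smartnoise.metadata import CollectionMetadata

df = pd.read_csv('PUMS.csv')
schema = CollectionMetadata.from_file('PUMS.yaml')

reader = PandasReader(df, schema)   # current order: data first, then metadata
legacy = PandasReader(schema, df)   # legacy order: accepted, but emits a Warning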
Example #6
 def test_group_by_noisy_typed_order_desc(self):
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, 4.0)
     rs = private_reader.execute_typed(
         "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
     )
     assert (rs['c'][0] > rs['c'][1])
Example #7
 def test_check_thresholds_gauss(self):
     # check tau for various privacy parameters
     epsilons = [0.1, 2.0]
     max_contribs = [1, 3]
     deltas = [10E-5, 10E-15]
     query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(df, schema)
     qp = QueryParser(schema)
     q = qp.query(query)
     for eps in epsilons:
         for d in max_contribs:
             for delta in deltas:
                 # using slightly different formulations of same formula from different papers
                 # make sure private_reader round-trips
                 gaus_scale = math.sqrt(d) * math.sqrt(
                     2 * math.log(1.25 / delta)) / eps
                 gaus_rho = 1 + gaus_scale * math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 schema_c = copy.copy(schema)
                 schema_c["PUMS.PUMS"].max_ids = d
                 private_reader = PrivateReader(reader, schema_c, eps,
                                                delta)
                 assert (private_reader._options.max_contrib == d)
                 r = private_reader.execute_ast(q)
                 assert (math.isclose(private_reader.tau,
                                      gaus_rho,
                                      rel_tol=0.03,
                                      abs_tol=2))
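For reference, the two quantities the loop computes (restated from the code above, with d the per-user contribution bound): the Gaussian noise scale

$$\sigma = \frac{\sqrt{d}\,\sqrt{2\ln(1.25/\delta)}}{\varepsilon}$$

and the censoring threshold

$$\rho = 1 + \sigma\sqrt{2\ln\!\left(\frac{d}{\sqrt{2\pi\delta}}\right)},$$

against which private_reader.tau is asserted to agree within the stated tolerances.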
Example #8
 def test_no_tau_noisy(self):
     # should never drop rows
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, 0.01)
     for i in range(10):
         rs = private_reader.execute_df("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'")
         assert(len(rs['c']) == 1)
Example #9
 def test_ok1(self):
     s = copy.copy(schema)
     s["PUMS.PUMS"]["income"].maxval = None
     reader = PandasReader(df, s)
     private_reader = PrivateReader(reader, s, 4.0)
     rs = private_reader.execute_df(
         "SELECT income FROM PUMS.PUMS GROUP BY income")
Example #10
 def test_err1(self):
     s = copy.copy(schema)
     s["PUMS.PUMS"]["income"].maxval = None
     reader = PandasReader(df, s)
     private_reader = PrivateReader(reader, s, 4.0)
     with pytest.raises(ValueError):
         rs = private_reader.execute_df("SELECT SUM(income) FROM PUMS.PUMS")
Example #11
 def setup_class(cls):
     meta = CollectionMetadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader, meta, 10.0, 10E-3)
     cls.reader = private_reader
Example #12
 def test_empty_result_count_typed_notau_prepost(self):
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
     private_reader = PrivateReader(reader, schema, 1.0)
     private_reader._execute_ast(query, True)
     for i in range(3):
         trs = private_reader._execute_ast(query, True)
         assert(len(trs) == 2)
Example #13
 def test_execute_without_dpsu(self):
     schema_no_dpsu = copy.copy(schema)
     schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
     reader = PandasReader(df, schema_no_dpsu)
     private_reader = PrivateReader(reader, schema_no_dpsu, 1.0)
     assert(private_reader._options.use_dpsu == False)
     query = QueryParser(schema_no_dpsu).queries("SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
     assert(private_reader._get_reader(query) is private_reader.reader)
Example #14
 def test_viz_child_nodes(self):
     query = "SELECT AVG(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, 1.0)
     inner, outer = private_reader.rewrite(query)
     aggfuncs = outer.find_nodes(AggFunction)
     for aggfunc in aggfuncs:
         graph = aggfunc.visualize(n_trunc=30)
         assert (isinstance(graph, Digraph))
Example #15
 def QuerytoAST(self, query, meta, data):
     reader = PandasReader(data, meta)
     private_reader = PrivateReader(reader, meta, self.pp.epsilon)
     # query =  'SELECT Usage AS Usage, SUM(Usage) + 3 AS u FROM dataset.dataset GROUP BY Role'
     try:
         ast = private_reader.parse_query_string(query)
     except Exception:
         return
     return ast
Example #16
 def test_yes_tau(self):
     # should usually drop some rows
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, 1.0, 1/10000)
     lengths = []
     for i in range(10):
         rs = private_reader.execute_df("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 80 GROUP BY educ")
         lengths.append(len(rs['c']))
     l = lengths[0]
     assert(any([l != ll for ll in lengths]))
Example #17
 def setup_class(self):
     meta = CollectionMetadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     meta["PUMS.PUMS"]["sex"].type = "int"
     meta["PUMS.PUMS"]["educ"].type = "int"
     meta["PUMS.PUMS"]["married"].type = "bool"
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader, meta, 10.0, 10E-3)
     self.reader = private_reader
Example #18
 def _compute_reward(self, query):
     ast_transform = self.observe(query)
     d1_query = query
     d2_query = query.replace("d1.d1", "d2.d2")
     d1_dataset, d2_dataset, d1_metadata, d2_metadata = generate_neighbors(
         self.df, self.metadata)
     d1 = PandasReader(d1_dataset, d1_metadata)
     d2 = PandasReader(d2_dataset, d2_metadata)
     eval = DPEvaluator()
     pa = DPSingletonQuery()
     key_metrics = eval.evaluate([d1_metadata, d1], [d2_metadata, d2], pa,
                                 query, self.pp, self.ev)
     dpresult = None  # fallback so the return below never references an unbound name
     message = None
     if key_metrics["__key__"].dp_res is None:
         dpresult = "DP_BUG"
         self.reward = 1
         message = key_metrics["__key__"].error
     elif key_metrics["__key__"].dp_res == False:
         self._game_ended = True
         dpresult = "DP_FAIL"
         self.reward = 20
         message = "dp_res_False"
     elif (key_metrics["__key__"].dp_res == True and
           key_metrics["__key__"].jensen_shannon_divergence == math.inf):
         self._game_ended = True
         dpresult = "DP_BUG"
         self.reward = 20
         message = "jsdistance_is_inf"
     else:
         res_list = []
         for key, metrics in key_metrics.items():
             dp_res = metrics.dp_res
             js_res = metrics.jensen_shannon_divergence
             # ws_res = metrics.wasserstein_distance
             res_list.append([dp_res, js_res])
         dp_res = np.all(np.array([res[0] for res in res_list]))
         js_res = (np.array([res[1] for res in res_list])).max()
         # ws_res = (np.array([res[2] for res in res_list])).max()
         if dp_res == True:
             dpresult = "DP_PASS"
             self.reward = js_res
     return dpresult, self.reward, message, d1, d2
Example #19
 def test_legacy_params_private_reader(self):
     reader = PandasReader(df, schema)
     # params swapped
     with pytest.warns(Warning):
         private_reader = PrivateReader(schema, reader, 1.0)
     assert(isinstance(private_reader.reader, PandasReader))
     # doubled up params of wrong type should fail
     with pytest.raises(Exception):
         private_reader = PrivateReader(schema, schema, 1.0)
     with pytest.raises(Exception):
         private_reader = PrivateReader(reader, reader, 1.0)
Example #20
 def test_viz_query_rewritten(self):
     query = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     parsed_query = QueryParser(schema).query(query)
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, 1.0)
     inner, outer = private_reader.rewrite_ast(parsed_query)
     graph = outer.visualize(n_trunc=30)
     assert (isinstance(graph, Digraph))
     #graph.render('ast_digraph', view=True, cleanup=True)
     graph = inner.visualize(n_trunc=30)
     assert (isinstance(graph, Digraph))
Example #21
    def test_dpsu_vs_korolova(self):
        query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc"
        reader = PandasReader(df, schema)
        private_reader = PrivateReader(reader, schema, 3.0)
        private_reader.options.max_contrib = 10
        result = private_reader.execute_typed(query)

        private_reader_korolova = PrivateReader(reader, schema, 3.0)
        private_reader_korolova.options.dpsu = False
        private_reader_korolova.options.max_contrib = 10
        korolova_result = private_reader_korolova.execute_typed(query)

        assert len(result['n']) > len(korolova_result['n'])
Example #22
    def test_calculate_multiplier(self):
        pums_meta_path = os.path.join(
            git_root_dir, os.path.join("service", "datasets", "PUMS.yaml"))
        pums_csv_path = os.path.join(
            git_root_dir, os.path.join("service", "datasets", "PUMS.csv"))
        pums_schema = CollectionMetadata.from_file(pums_meta_path)
        pums_df = pd.read_csv(pums_csv_path)
        pums_reader = PandasReader(pums_df, pums_schema)
        query = "SELECT COUNT(*) FROM PUMS.PUMS"
        cost = PrivateReader.get_budget_multiplier(pums_schema, pums_reader,
                                                   query)

        query = "SELECT AVG(age) FROM PUMS.PUMS"
        cost_avg = PrivateReader.get_budget_multiplier(pums_schema,
                                                       pums_reader, query)
        assert 1 + cost == cost_avg
Example #23
def test_sklearn_query():
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data,
                              columns=sklearn_dataset.feature_names)

    iris = Table("dbo", "iris", [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ], 150)
    schema = CollectionMetadata([iris], "csv")

    reader = PandasReader(sklearn_df, schema)
    rowset = execute_private_query(
        schema, reader, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
    df = pd.DataFrame(rowset[1:], columns=rowset[0])
    assert df is not None
    assert len(df) == 1
Example #24
def test_sklearn_query():
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data,
                              columns=sklearn_dataset.feature_names)

    iris = Table("dbo", "iris", [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ], 150)
    schema = CollectionMetadata([iris], "csv")

    reader = PandasReader(sklearn_df, schema)
    # Call with both argument orders as a back-compat check
    for params in ([reader, schema], [schema, reader]):
        df = execute_private_query(
            *params, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
        assert df is not None
        assert len(df) == 1
Example #25
 def _load_reader(dataset_document):
     return PandasReader(
         LocalCSVAdapter.load_df(dataset_document),
         LocalCSVAdapter.load_metadata(dataset_document),
     )
Example #26
 def test_empty_result_typed(self):
     reader = PandasReader(df, schema)
     rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
     trs = reader._to_df(rs)
     assert(len(trs) == 0)
Example #27
 def test_count_no_rows_exact_typed(self):
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
     trs = reader._execute_ast_df(query)
     assert(trs['c'][0] == 0)
Example #28
import pandas as pd
from opendp.smartnoise.sql import PandasReader, PrivateReader
from opendp.smartnoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

# alternative query, overridden by the assignment below:
# query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'

query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'

reader = PandasReader(pums, meta)
private_reader = PrivateReader(reader, meta, 4.0)
private_reader.options.censor_dims = True
private_reader.options.clamp_counts = True

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
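The two options set in this script steer post-processing of the noisy result: censor_dims censors GROUP BY keys too rare to release, and clamp_counts clamps negative noisy counts up to zero. A minimal sketch of toggling them off on the same reader (effects described per the option names; exact behavior may vary by library version):

private_reader.options.censor_dims = False   # keep all GROUP BY keys, even rare ones
private_reader.options.clamp_counts = False  # allow negative noisy counts
print(private_reader.execute_typed(query))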
Example #29
    def run_agg_query_df(self,
                         df,
                         metadata,
                         query,
                         confidence,
                         file_name="d1"):
        """
        Run the query using the private reader and the input query,
        and return the response for multiple dimensions and aggregations.
        """
        # Getting exact result
        reader = PandasReader(df, metadata)
        exact = reader.execute_typed(query).rows()[1:]
        exact_res = []
        for row in exact:
            exact_res.append(row)

        private_reader = PrivateReader(reader, metadata, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        # Distinguishing dimension and measure columns
        srs_orig = private_reader.reader.execute_ast_typed(query_ast)
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))

        sample_res = private_reader._execute_ast(query_ast, True)
        headers = sample_res.colnames

        dim_cols = []
        num_cols = []

        for col in headers:
            if (sample_res.types[col] == "string"):
                dim_cols.append(col)
            else:
                num_cols.append(col)

        # Repeat the query and store results along with intervals
        res = []
        for idx in range(self.repeat_count):
            dim_rows = []
            num_rows = []
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            singleres = private_reader._execute_ast(query_ast, True)
            for col in dim_cols:
                dim_rows.append(singleres[col])
            for col in num_cols:
                values = singleres[col]
                #low = singleres.report[col].intervals[confidence].low
                #high = singleres.report[col].intervals[confidence].high
                #num_rows.append(list(zip(values, low, high)))
                num_rows.append(list(zip(values)))

            res.extend(list(zip(*dim_rows, *num_rows)))

        exact_df = pd.DataFrame(exact_res, columns=headers)
        noisy_df = pd.DataFrame(res, columns=headers)

        # Add a dummy dimension column for cases where no dimensions are available for merging D1 and D2
        if (len(dim_cols) == 0):
            dim_cols.append("__dim__")

        if (dim_cols[0] == "__dim__"):
            exact_df[dim_cols[0]] = ["key"] * len(exact_df)
            noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

        return noisy_df, exact_df, dim_cols, num_cols
Example #30
 def create(self):
     return PandasReader(self.df, self.meta)