def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
    """
    Run the query using the private reader and input query.
    Get the query response back.
    """
    reader = PandasReader(metadata, df)
    actual = 0.0
    # VAR is not supported by PandasReader, so only fetch the exact value
    # when the aggregation supports it
    if get_exact:
        actual = reader.execute_typed(query).rows()[1:][0][0]

    private_reader = PrivateReader(metadata, reader, self.epsilon)
    query_ast = private_reader.parse_query_string(query)
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)

    noisy_values = []
    low_bounds = []
    high_bounds = []
    for idx in range(self.repeat_count):
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        res = private_reader._execute_ast(query_ast, True)
        # Disabled because confidence intervals are not available in the report
        # interval = res.report[res.colnames[0]].intervals[confidence]
        # low_bounds.append(interval[0].low)
        # high_bounds.append(interval[0].high)
        noisy_values.append(res.rows()[1:][0][0])
    return np.array(noisy_values), actual, low_bounds, high_bounds
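# Usage sketch (illustrative, not part of the repo): run_agg_query reads
# self.epsilon and self.repeat_count, so bind it to a minimal harness class.
# The PUMS file names follow the example scripts further below.
import pandas as pd
from opendp.whitenoise.metadata import CollectionMetadata

class _Harness:
    epsilon = 1.0
    repeat_count = 100

_Harness.run_agg_query = run_agg_query  # attach the function above as a method

df = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')
noisy, actual, lows, highs = _Harness().run_agg_query(
    df, meta, "SELECT AVG(age) FROM PUMS.PUMS", confidence=0.95)
print(noisy.mean(), actual)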
def test_no_tau_noisy(self):
    # should never drop rows
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 0.01)
    for i in range(10):
        rs = private_reader.execute_typed(
            "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'"
        )
        assert len(rs['c']) == 1
def test_group_by_noisy_typed_order_desc(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 4.0)
    rs = private_reader.execute_typed(
        "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
    )
    assert rs['c'][0] > rs['c'][1]
def test_group_by_noisy_order(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 4.0)
    rs = private_reader.execute(
        "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
    )
    assert rs[1][0] < rs[2][0]
def test_check_thresholds_gauss(self):
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(schema, df)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                # using slightly different formulations of the same formula
                # from different papers; make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                private_reader = PrivateReader(schema, reader, eps, delta)
                q.max_ids = d  # hijack the AST
                r = private_reader.execute_ast(q)
                assert math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2)
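# For reference, the threshold reconstructed in the test above (a reading of
# the test's arithmetic, not the library's own code):
#   sigma = sqrt(d) * sqrt(2 * ln(1.25 / delta)) / eps
#   tau  ~= 1 + sigma * sqrt(2 * ln(d / sqrt(2 * pi * delta)))
# where d is the maximum number of rows one user can contribute.
import math

def expected_gaussian_tau(eps, delta, max_contrib):
    scale = math.sqrt(max_contrib) * math.sqrt(2 * math.log(1.25 / delta)) / eps
    return 1 + scale * math.sqrt(2 * math.log(max_contrib / math.sqrt(2 * math.pi * delta)))

print(expected_gaussian_tau(0.1, 1e-5, 3))  # the threshold grows as eps shrinks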
def test_execute_without_dpsu(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0)
    query = QueryParser(schema).queries(
        "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married"
    )[0]
    private_reader.options.use_dpsu = False
    assert private_reader._get_reader(query) is private_reader.reader
def test_empty_result_count_typed_notau_prepost(self):
    reader = PandasReader(schema, df)
    query = QueryParser(schema).queries(
        "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
    )[0]
    private_reader = PrivateReader(schema, reader, 1.0)
    private_reader._execute_ast(query, True)
    for i in range(3):
        trs = private_reader._execute_ast(query, True)
        assert len(trs) == 1
def test_yes_tau(self):
    # should usually drop some rows
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0, 1/10000)
    lengths = []
    for i in range(10):
        rs = private_reader.execute_typed(
            "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 80 GROUP BY educ"
        )
        lengths.append(len(rs['c']))
    l = lengths[0]
    assert any([l != ll for ll in lengths])
def test_group_by_noisy_typed_order_inter_constant(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0)
    rs = private_reader.execute_typed(
        "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
    )
    rs2 = private_reader.execute_typed(
        "SELECT COUNT(*) * 2 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
    )
    assert len(rs.report['c'].intervals[0.95]) == 2
    assert len(rs2.report['c'].intervals[0.95]) == 2
    assert all(a.low < b.low for a, b in zip(
        rs.report['c'].intervals[0.95], rs2.report['c'].intervals[0.95]))
    assert all(a.low < b.low for a, b in zip(
        rs.report['c'].intervals[0.985], rs2.report['c'].intervals[0.985]))
def test_group_by_noisy_typed_order_inter(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0)
    rs = private_reader.execute_typed(
        "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
    )
    assert rs['c'][0] < rs['c'][1]
    assert len(rs.report['c'].intervals[0.95]) == 2
    print(rs.report['c'].intervals[0.95])
    assert all(ival.low < ival.high for ival in rs.report['c'].intervals[0.95])
    assert all(ival.low < ival.high for ival in rs.report['c'].intervals[0.985])
    # the wider 98.5% interval should contain the 95% interval
    assert all(outer.low < inner.low for inner, outer in zip(
        rs.report['c'].intervals[0.95], rs.report['c'].intervals[0.985]))
    assert all(outer.high > inner.high for inner, outer in zip(
        rs.report['c'].intervals[0.95], rs.report['c'].intervals[0.985]))
def release(self, dataset: object, actual=False) -> Report:
    """
    Dataset is a collection of [Dataset Metadata, PandasReader].
    Releases a response to the SQL query, repeated the number of times
    requested by eval_params, if actual is set to False. The actual
    (exact) response is only returned once.
    """
    if not actual:
        private_reader = PrivateReader(dataset[0], dataset[1], self.privacy_params.epsilon)
        query_ast = private_reader.parse_query_string(self.algorithm)
        srs_orig = private_reader.reader.execute_ast_typed(query_ast)
        noisy_values = []
        for idx in range(self.eval_params.repeat_count):
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            res = private_reader._execute_ast(query_ast, True)
            noisy_values.append(res.rows()[1:][0][0])
        return Report({"__key__": noisy_values})
    else:
        reader = dataset[1]
        exact = reader.execute_typed(self.algorithm).rows()[1:][0][0]
        return Report({"__key__": exact})
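# Call sketch (illustrative, not part of the repo): release reads
# self.privacy_params.epsilon, self.eval_params.repeat_count, and the SQL
# string in self.algorithm, so a SimpleNamespace stands in for the evaluator.
import pandas as pd
from types import SimpleNamespace
from opendp.whitenoise.sql import PandasReader
from opendp.whitenoise.metadata import CollectionMetadata

meta = CollectionMetadata.from_file('PUMS.yaml')
df = pd.read_csv('PUMS.csv')
reader = PandasReader(meta, df)

evaluator = SimpleNamespace(
    privacy_params=SimpleNamespace(epsilon=1.0),
    eval_params=SimpleNamespace(repeat_count=100),
    algorithm="SELECT COUNT(*) FROM PUMS.PUMS",
)
evaluator.release = release.__get__(evaluator)  # bind the function above

noisy_report = evaluator.release([meta, reader])               # repeated noisy values
exact_report = evaluator.release([meta, reader], actual=True)  # single exact value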
def test_dpsu_vs_korolova(self):
    query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc"
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 3.0)
    private_reader.options.max_contrib = 10
    result = private_reader.execute_typed(query)
    private_reader_korolova = PrivateReader(schema, reader, 3.0)
    private_reader_korolova.options.dpsu = False
    private_reader_korolova.options.max_contrib = 10
    korolova_result = private_reader_korolova.execute_typed(query)
    # DPSU should retain more groups than the Korolova-style baseline
    assert len(result['n']) > len(korolova_result['n'])
import json
import sys

import mlflow
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata, load_dataset
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        dataset = load_dataset(dataset_document)
        reader = load_reader(dataset_document)
        schema = load_metadata(dataset_document)

        private_reader = PrivateReader(schema, reader, budget)
        rowset = private_reader.execute(query)

        # first row of the rowset is the header
        df = pd.DataFrame(rowset[1:], columns=rowset[0])
        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
import pandas as pd

from opendp.whitenoise.sql import PostgresReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

meta = CollectionMetadata.from_file('PUMS_large.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS_large GROUP BY married'
query = 'SELECT AVG(age) FROM PUMS.PUMS_large'  # overrides the query above

reader = PostgresReader('127.0.0.1', 'PUMS', 'postgres')
private_reader = PrivateReader(meta, reader, 1.0)  # metadata first, as in the other examples

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
def test_sum_noisy_postprocess(self):
    reader = PandasReader(schema, df)
    private_reader = PrivateReader(schema, reader, 1.0)
    trs = private_reader.execute_typed(
        "SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS"
    )
    assert trs['age_total'][0] > 1000 ** 2
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    # Get the exact result
    reader = PandasReader(metadata, df)
    exact = reader.execute_typed(query).rows()[1:]
    exact_res = []
    for row in exact:
        exact_res.append(row)

    private_reader = PrivateReader(metadata, reader, self.epsilon)
    query_ast = private_reader.parse_query_string(query)

    # Distinguish dimension and measure columns
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)
    srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res.colnames

    dim_cols = []
    num_cols = []
    for col in headers:
        if sample_res.types[col] == "string":
            dim_cols.append(col)
        else:
            num_cols.append(col)

    # Repeat the query and store results along with intervals
    res = []
    for idx in range(self.repeat_count):
        dim_rows = []
        num_rows = []
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        singleres = private_reader._execute_ast(query_ast, True)
        for col in dim_cols:
            dim_rows.append(singleres[col])
        for col in num_cols:
            values = singleres[col]
            # Disabled because confidence intervals are not available in the report
            # low = singleres.report[col].intervals[confidence].low
            # high = singleres.report[col].intervals[confidence].high
            # num_rows.append(list(zip(values, low, high)))
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))

    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)

    # Add a dummy dimension column, for merging D1 and D2, in cases
    # where no dimensions are available
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")
    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

    return noisy_df, exact_df, dim_cols, num_cols
import json
import sys

import mlflow
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        # split the overall budget across the query's noisy columns
        budget_per_column = budget / PrivateReader.get_budget_multiplier(metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)
        rowset = private_reader.execute(query)

        # first row of the rowset is the header
        df = pd.DataFrame(rowset[1:], columns=rowset[0])
        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
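# Intent of the division above, as a rough sketch: if get_budget_multiplier
# reports how many noisy columns the query spends budget on (an assumption
# about its semantics, not verified against the library), dividing keeps the
# total spend at the requested budget. The value 3 is purely illustrative.
budget = 3.0
multiplier = 3                            # illustrative value only
budget_per_column = budget / multiplier   # 1.0 per noisy column
assert budget_per_column * multiplier == budget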
import pandas as pd

from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'

reader = PandasReader(meta, pums)
private_reader = PrivateReader(meta, reader)

result = private_reader.execute_typed(query)
print(result)
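# The same example with an explicit epsilon and the untyped execute() path,
# mirroring the mlflow scripts above: the first row of the rowset is the header.
private_reader = PrivateReader(meta, reader, 1.0)
rowset = private_reader.execute(query)
df = pd.DataFrame(rowset[1:], columns=rowset[0])
print(df)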
import pandas as pd

from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'
query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'  # overrides the query above

reader = PandasReader(meta, pums)
private_reader = PrivateReader(meta, reader, 4.0)
private_reader.options.censor_dims = True
private_reader.options.clamp_counts = True

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)